aboutsummaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/acl.c4
-rw-r--r--fs/9p/cache.c9
-rw-r--r--fs/9p/v9fs.c11
-rw-r--r--fs/9p/v9fs.h1
-rw-r--r--fs/9p/v9fs_vfs.h2
-rw-r--r--fs/9p/vfs_addr.c12
-rw-r--r--fs/9p/vfs_dentry.c19
-rw-r--r--fs/9p/vfs_dir.c1
-rw-r--r--fs/9p/vfs_file.c163
-rw-r--r--fs/9p/vfs_inode.c36
-rw-r--r--fs/9p/vfs_inode_dotl.c24
-rw-r--r--fs/9p/vfs_super.c8
-rw-r--r--fs/9p/xattr.c10
-rw-r--r--fs/Kconfig7
-rw-r--r--fs/Makefile7
-rw-r--r--fs/adfs/adfs.h9
-rw-r--r--fs/adfs/file.c8
-rw-r--r--fs/adfs/super.c6
-rw-r--r--fs/affs/Changes2
-rw-r--r--fs/affs/affs.h36
-rw-r--r--fs/affs/amigaffs.c38
-rw-r--r--fs/affs/bitmap.c21
-rw-r--r--fs/affs/dir.c30
-rw-r--r--fs/affs/file.c54
-rw-r--r--fs/affs/inode.c16
-rw-r--r--fs/affs/namei.c71
-rw-r--r--fs/affs/super.c111
-rw-r--r--fs/affs/symlink.c2
-rw-r--r--fs/afs/cell.c2
-rw-r--r--fs/afs/cmservice.c19
-rw-r--r--fs/afs/file.c8
-rw-r--r--fs/afs/flock.c4
-rw-r--r--fs/afs/inode.c4
-rw-r--r--fs/afs/internal.h5
-rw-r--r--fs/afs/main.c4
-rw-r--r--fs/afs/proc.c122
-rw-r--r--fs/afs/rxrpc.c84
-rw-r--r--fs/afs/vlocation.c3
-rw-r--r--fs/afs/volume.c2
-rw-r--r--fs/afs/write.c11
-rw-r--r--fs/aio.c487
-rw-r--r--fs/anon_inodes.c146
-rw-r--r--fs/attr.c38
-rw-r--r--fs/autofs4/autofs_i.h7
-rw-r--r--fs/autofs4/dev-ioctl.c27
-rw-r--r--fs/autofs4/expire.c14
-rw-r--r--fs/autofs4/inode.c60
-rw-r--r--fs/autofs4/root.c10
-rw-r--r--fs/autofs4/symlink.c4
-rw-r--r--fs/autofs4/waitq.c16
-rw-r--r--fs/befs/Makefile2
-rw-r--r--fs/befs/befs.h3
-rw-r--r--fs/befs/btree.c104
-rw-r--r--fs/befs/datastream.c89
-rw-r--r--fs/befs/debug.c74
-rw-r--r--fs/befs/inode.c10
-rw-r--r--fs/befs/io.c24
-rw-r--r--fs/befs/linuxvfs.c144
-rw-r--r--fs/bfs/file.c8
-rw-r--r--fs/bfs/inode.c4
-rw-r--r--fs/binfmt_aout.c13
-rw-r--r--fs/binfmt_elf.c155
-rw-r--r--fs/binfmt_elf_fdpic.c152
-rw-r--r--fs/binfmt_em86.c2
-rw-r--r--fs/binfmt_flat.c2
-rw-r--r--fs/binfmt_misc.c1
-rw-r--r--fs/bio-integrity.c759
-rw-r--r--fs/bio.c2029
-rw-r--r--fs/block_dev.c109
-rw-r--r--fs/btrfs/Kconfig21
-rw-r--r--fs/btrfs/Makefile6
-rw-r--r--fs/btrfs/acl.c149
-rw-r--r--fs/btrfs/async-thread.c849
-rw-r--r--fs/btrfs/async-thread.h121
-rw-r--r--fs/btrfs/backref.c281
-rw-r--r--fs/btrfs/backref.h8
-rw-r--r--fs/btrfs/btrfs_inode.h42
-rw-r--r--fs/btrfs/check-integrity.c104
-rw-r--r--fs/btrfs/check-integrity.h2
-rw-r--r--fs/btrfs/compat.h7
-rw-r--r--fs/btrfs/compression.c54
-rw-r--r--fs/btrfs/ctree.c834
-rw-r--r--fs/btrfs/ctree.h434
-rw-r--r--fs/btrfs/delayed-inode.c240
-rw-r--r--fs/btrfs/delayed-inode.h8
-rw-r--r--fs/btrfs/delayed-ref.c350
-rw-r--r--fs/btrfs/delayed-ref.h50
-rw-r--r--fs/btrfs/dev-replace.c166
-rw-r--r--fs/btrfs/dir-item.c14
-rw-r--r--fs/btrfs/disk-io.c971
-rw-r--r--fs/btrfs/disk-io.h18
-rw-r--r--fs/btrfs/export.c1
-rw-r--r--fs/btrfs/extent-tree.c1475
-rw-r--r--fs/btrfs/extent_io.c901
-rw-r--r--fs/btrfs/extent_io.h24
-rw-r--r--fs/btrfs/extent_map.c128
-rw-r--r--fs/btrfs/extent_map.h19
-rw-r--r--fs/btrfs/file-item.c110
-rw-r--r--fs/btrfs/file.c687
-rw-r--r--fs/btrfs/free-space-cache.c454
-rw-r--r--fs/btrfs/free-space-cache.h4
-rw-r--r--fs/btrfs/hash.c50
-rw-r--r--fs/btrfs/hash.h11
-rw-r--r--fs/btrfs/inode-item.c67
-rw-r--r--fs/btrfs/inode-map.c51
-rw-r--r--fs/btrfs/inode.c1222
-rw-r--r--fs/btrfs/ioctl.c1287
-rw-r--r--fs/btrfs/locking.c80
-rw-r--r--fs/btrfs/lzo.c20
-rw-r--r--fs/btrfs/ordered-data.c151
-rw-r--r--fs/btrfs/ordered-data.h12
-rw-r--r--fs/btrfs/orphan.c20
-rw-r--r--fs/btrfs/print-tree.c15
-rw-r--r--fs/btrfs/props.c427
-rw-r--r--fs/btrfs/props.h42
-rw-r--r--fs/btrfs/qgroup.c999
-rw-r--r--fs/btrfs/qgroup.h107
-rw-r--r--fs/btrfs/raid56.c49
-rw-r--r--fs/btrfs/reada.c22
-rw-r--r--fs/btrfs/relocation.c322
-rw-r--r--fs/btrfs/root-tree.c32
-rw-r--r--fs/btrfs/scrub.c444
-rw-r--r--fs/btrfs/send.c1908
-rw-r--r--fs/btrfs/super.c352
-rw-r--r--fs/btrfs/sysfs.c710
-rw-r--r--fs/btrfs/sysfs.h73
-rw-r--r--fs/btrfs/tests/btrfs-tests.c171
-rw-r--r--fs/btrfs/tests/btrfs-tests.h36
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c229
-rw-r--r--fs/btrfs/tests/extent-io-tests.c276
-rw-r--r--fs/btrfs/tests/free-space-tests.c4
-rw-r--r--fs/btrfs/tests/inode-tests.c928
-rw-r--r--fs/btrfs/tests/qgroup-tests.c470
-rw-r--r--fs/btrfs/transaction.c298
-rw-r--r--fs/btrfs/transaction.h9
-rw-r--r--fs/btrfs/tree-defrag.c5
-rw-r--r--fs/btrfs/tree-log.c615
-rw-r--r--fs/btrfs/tree-log.h34
-rw-r--r--fs/btrfs/ulist.c117
-rw-r--r--fs/btrfs/ulist.h39
-rw-r--r--fs/btrfs/uuid-tree.c19
-rw-r--r--fs/btrfs/volumes.c389
-rw-r--r--fs/btrfs/volumes.h29
-rw-r--r--fs/btrfs/xattr.c17
-rw-r--r--fs/btrfs/xattr.h2
-rw-r--r--fs/btrfs/zlib.c34
-rw-r--r--fs/buffer.c93
-rw-r--r--fs/cachefiles/bind.c18
-rw-r--r--fs/cachefiles/daemon.c31
-rw-r--r--fs/cachefiles/interface.c9
-rw-r--r--fs/cachefiles/internal.h30
-rw-r--r--fs/cachefiles/main.c9
-rw-r--r--fs/cachefiles/namei.c62
-rw-r--r--fs/cachefiles/rdwr.c33
-rw-r--r--fs/cachefiles/security.c10
-rw-r--r--fs/cachefiles/xattr.c10
-rw-r--r--fs/ceph/Kconfig13
-rw-r--r--fs/ceph/Makefile1
-rw-r--r--fs/ceph/acl.c194
-rw-r--r--fs/ceph/addr.c110
-rw-r--r--fs/ceph/cache.c8
-rw-r--r--fs/ceph/cache.h23
-rw-r--r--fs/ceph/caps.c554
-rw-r--r--fs/ceph/debugfs.c11
-rw-r--r--fs/ceph/dir.c131
-rw-r--r--fs/ceph/export.c267
-rw-r--r--fs/ceph/file.c456
-rw-r--r--fs/ceph/inode.c527
-rw-r--r--fs/ceph/ioctl.c19
-rw-r--r--fs/ceph/locks.c93
-rw-r--r--fs/ceph/mds_client.c305
-rw-r--r--fs/ceph/mds_client.h8
-rw-r--r--fs/ceph/mdsmap.c2
-rw-r--r--fs/ceph/strings.c3
-rw-r--r--fs/ceph/super.c36
-rw-r--r--fs/ceph/super.h72
-rw-r--r--fs/ceph/xattr.c163
-rw-r--r--fs/char_dev.c3
-rw-r--r--fs/cifs/cifs_fs_sb.h1
-rw-r--r--fs/cifs/cifs_unicode.c7
-rw-r--r--fs/cifs/cifsacl.c101
-rw-r--r--fs/cifs/cifsencrypt.c40
-rw-r--r--fs/cifs/cifsfs.c182
-rw-r--r--fs/cifs/cifsfs.h34
-rw-r--r--fs/cifs/cifsglob.h79
-rw-r--r--fs/cifs/cifspdu.h73
-rw-r--r--fs/cifs/cifsproto.h41
-rw-r--r--fs/cifs/cifssmb.c284
-rw-r--r--fs/cifs/connect.c40
-rw-r--r--fs/cifs/dir.c74
-rw-r--r--fs/cifs/file.c319
-rw-r--r--fs/cifs/fscache.c8
-rw-r--r--fs/cifs/inode.c288
-rw-r--r--fs/cifs/ioctl.c170
-rw-r--r--fs/cifs/link.c339
-rw-r--r--fs/cifs/misc.c96
-rw-r--r--fs/cifs/netmisc.c15
-rw-r--r--fs/cifs/readdir.c42
-rw-r--r--fs/cifs/sess.c4
-rw-r--r--fs/cifs/smb1ops.c167
-rw-r--r--fs/cifs/smb2glob.h3
-rw-r--r--fs/cifs/smb2inode.c16
-rw-r--r--fs/cifs/smb2maperror.c2
-rw-r--r--fs/cifs/smb2misc.c18
-rw-r--r--fs/cifs/smb2ops.c299
-rw-r--r--fs/cifs/smb2pdu.c249
-rw-r--r--fs/cifs/smb2pdu.h81
-rw-r--r--fs/cifs/smb2proto.h10
-rw-r--r--fs/cifs/smb2transport.c12
-rw-r--r--fs/cifs/smbfsctl.h16
-rw-r--r--fs/cifs/transport.c51
-rw-r--r--fs/cifs/xattr.c64
-rw-r--r--fs/coda/cnode.c4
-rw-r--r--fs/coda/coda_int.h2
-rw-r--r--fs/coda/coda_linux.h10
-rw-r--r--fs/coda/dir.c24
-rw-r--r--fs/coda/file.c6
-rw-r--r--fs/coda/inode.c36
-rw-r--r--fs/coda/psdev.c39
-rw-r--r--fs/coda/sysctl.c4
-rw-r--r--fs/coda/upcall.c14
-rw-r--r--fs/compat.c162
-rw-r--r--fs/compat_binfmt_elf.c5
-rw-r--r--fs/compat_ioctl.c12
-rw-r--r--fs/configfs/configfs_internal.h6
-rw-r--r--fs/configfs/dir.c36
-rw-r--r--fs/configfs/inode.c5
-rw-r--r--fs/configfs/item.c58
-rw-r--r--fs/configfs/mount.c4
-rw-r--r--fs/coredump.c81
-rw-r--r--fs/coredump.h6
-rw-r--r--fs/cramfs/Kconfig5
-rw-r--r--fs/cramfs/inode.c54
-rw-r--r--fs/cramfs/internal.h4
-rw-r--r--fs/cramfs/uncompress.c2
-rw-r--r--fs/dcache.c846
-rw-r--r--fs/dcookies.c2
-rw-r--r--fs/debugfs/inode.c10
-rw-r--r--fs/devpts/inode.c28
-rw-r--r--fs/direct-io.c192
-rw-r--r--fs/dlm/ast.c3
-rw-r--r--fs/dlm/config.c26
-rw-r--r--fs/dlm/debug_fs.c34
-rw-r--r--fs/dlm/dir.c4
-rw-r--r--fs/dlm/dlm_internal.h2
-rw-r--r--fs/dlm/lock.c7
-rw-r--r--fs/dlm/lockspace.c33
-rw-r--r--fs/dlm/lowcomms.c19
-rw-r--r--fs/dlm/member.c27
-rw-r--r--fs/dlm/netlink.c10
-rw-r--r--fs/dlm/recover.c10
-rw-r--r--fs/dlm/recoverd.c34
-rw-r--r--fs/drop_caches.c18
-rw-r--r--fs/ecryptfs/crypto.c4
-rw-r--r--fs/ecryptfs/file.c29
-rw-r--r--fs/ecryptfs/inode.c42
-rw-r--r--fs/ecryptfs/keystore.c3
-rw-r--r--fs/ecryptfs/super.c2
-rw-r--r--fs/efivarfs/file.c13
-rw-r--r--fs/efivarfs/super.c13
-rw-r--r--fs/efs/dir.c18
-rw-r--r--fs/efs/efs.h6
-rw-r--r--fs/efs/file.c14
-rw-r--r--fs/efs/inode.c42
-rw-r--r--fs/efs/namei.c8
-rw-r--r--fs/efs/super.c84
-rw-r--r--fs/eventfd.c13
-rw-r--r--fs/eventpoll.c154
-rw-r--r--fs/exec.c245
-rw-r--r--fs/exofs/Kconfig.ore2
-rw-r--r--fs/exofs/file.c10
-rw-r--r--fs/exofs/inode.c33
-rw-r--r--fs/exofs/ore.c145
-rw-r--r--fs/exofs/ore_raid.c60
-rw-r--r--fs/exofs/ore_raid.h21
-rw-r--r--fs/exofs/super.c2
-rw-r--r--fs/exportfs/expfs.c271
-rw-r--r--fs/ext2/acl.c189
-rw-r--r--fs/ext2/acl.h8
-rw-r--r--fs/ext2/file.c11
-rw-r--r--fs/ext2/ialloc.c2
-rw-r--r--fs/ext2/inode.c16
-rw-r--r--fs/ext2/namei.c2
-rw-r--r--fs/ext2/super.c4
-rw-r--r--fs/ext2/xattr.c8
-rw-r--r--fs/ext2/xattr.h2
-rw-r--r--fs/ext2/xattr_security.c4
-rw-r--r--fs/ext2/xip.c1
-rw-r--r--fs/ext3/acl.c223
-rw-r--r--fs/ext3/acl.h9
-rw-r--r--fs/ext3/balloc.c5
-rw-r--r--fs/ext3/dir.c46
-rw-r--r--fs/ext3/file.c11
-rw-r--r--fs/ext3/ialloc.c2
-rw-r--r--fs/ext3/inode.c138
-rw-r--r--fs/ext3/namei.c7
-rw-r--r--fs/ext3/super.c8
-rw-r--r--fs/ext3/xattr.c8
-rw-r--r--fs/ext3/xattr.h2
-rw-r--r--fs/ext3/xattr_security.c5
-rw-r--r--fs/ext4/acl.c223
-rw-r--r--fs/ext4/acl.h9
-rw-r--r--fs/ext4/balloc.c97
-rw-r--r--fs/ext4/block_validity.c33
-rw-r--r--fs/ext4/dir.c38
-rw-r--r--fs/ext4/ext4.h121
-rw-r--r--fs/ext4/ext4_extents.h22
-rw-r--r--fs/ext4/ext4_jbd2.c26
-rw-r--r--fs/ext4/ext4_jbd2.h4
-rw-r--r--fs/ext4/extents.c1130
-rw-r--r--fs/ext4/extents_status.c42
-rw-r--r--fs/ext4/extents_status.h9
-rw-r--r--fs/ext4/file.c180
-rw-r--r--fs/ext4/ialloc.c39
-rw-r--r--fs/ext4/indirect.c38
-rw-r--r--fs/ext4/inline.c44
-rw-r--r--fs/ext4/inode.c385
-rw-r--r--fs/ext4/ioctl.c37
-rw-r--r--fs/ext4/mballoc.c80
-rw-r--r--fs/ext4/mballoc.h4
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/mmp.c6
-rw-r--r--fs/ext4/move_extent.c84
-rw-r--r--fs/ext4/namei.c623
-rw-r--r--fs/ext4/page-io.c50
-rw-r--r--fs/ext4/resize.c49
-rw-r--r--fs/ext4/super.c269
-rw-r--r--fs/ext4/xattr.c102
-rw-r--r--fs/ext4/xattr.h8
-rw-r--r--fs/f2fs/Kconfig8
-rw-r--r--fs/f2fs/Makefile2
-rw-r--r--fs/f2fs/acl.c182
-rw-r--r--fs/f2fs/acl.h14
-rw-r--r--fs/f2fs/checkpoint.c519
-rw-r--r--fs/f2fs/data.c750
-rw-r--r--fs/f2fs/debug.c63
-rw-r--r--fs/f2fs/dir.c146
-rw-r--r--fs/f2fs/f2fs.h448
-rw-r--r--fs/f2fs/file.c301
-rw-r--r--fs/f2fs/gc.c69
-rw-r--r--fs/f2fs/gc.h2
-rw-r--r--fs/f2fs/inline.c250
-rw-r--r--fs/f2fs/inode.c121
-rw-r--r--fs/f2fs/namei.c80
-rw-r--r--fs/f2fs/node.c712
-rw-r--r--fs/f2fs/node.h44
-rw-r--r--fs/f2fs/recovery.c164
-rw-r--r--fs/f2fs/segment.c846
-rw-r--r--fs/f2fs/segment.h190
-rw-r--r--fs/f2fs/super.c326
-rw-r--r--fs/f2fs/xattr.c140
-rw-r--r--fs/f2fs/xattr.h10
-rw-r--r--fs/fat/fat.h4
-rw-r--r--fs/fat/file.c8
-rw-r--r--fs/fat/inode.c380
-rw-r--r--fs/fcntl.c42
-rw-r--r--fs/file.c147
-rw-r--r--fs/file_table.c189
-rw-r--r--fs/filesystems.c2
-rw-r--r--fs/freevxfs/vxfs_inode.c2
-rw-r--r--fs/freevxfs/vxfs_lookup.c2
-rw-r--r--fs/freevxfs/vxfs_super.c1
-rw-r--r--fs/fs-writeback.c49
-rw-r--r--fs/fs_struct.c2
-rw-r--r--fs/fscache/cache.c13
-rw-r--r--fs/fscache/cookie.c193
-rw-r--r--fs/fscache/fsdef.c1
-rw-r--r--fs/fscache/histogram.c6
-rw-r--r--fs/fscache/internal.h26
-rw-r--r--fs/fscache/main.c11
-rw-r--r--fs/fscache/netfs.c8
-rw-r--r--fs/fscache/object-list.c13
-rw-r--r--fs/fscache/object.c12
-rw-r--r--fs/fscache/operation.c3
-rw-r--r--fs/fscache/page.c65
-rw-r--r--fs/fuse/control.c2
-rw-r--r--fs/fuse/cuse.c20
-rw-r--r--fs/fuse/dev.c78
-rw-r--r--fs/fuse/dir.c260
-rw-r--r--fs/fuse/file.c884
-rw-r--r--fs/fuse/fuse_i.h40
-rw-r--r--fs/fuse/inode.c69
-rw-r--r--fs/generic_acl.c184
-rw-r--r--fs/gfs2/acl.c251
-rw-r--r--fs/gfs2/acl.h6
-rw-r--r--fs/gfs2/aops.c197
-rw-r--r--fs/gfs2/bmap.c128
-rw-r--r--fs/gfs2/bmap.h2
-rw-r--r--fs/gfs2/dir.c113
-rw-r--r--fs/gfs2/dir.h19
-rw-r--r--fs/gfs2/file.c70
-rw-r--r--fs/gfs2/glock.c165
-rw-r--r--fs/gfs2/glock.h4
-rw-r--r--fs/gfs2/glops.c97
-rw-r--r--fs/gfs2/glops.h2
-rw-r--r--fs/gfs2/incore.h115
-rw-r--r--fs/gfs2/inode.c308
-rw-r--r--fs/gfs2/lock_dlm.c26
-rw-r--r--fs/gfs2/log.c197
-rw-r--r--fs/gfs2/log.h11
-rw-r--r--fs/gfs2/lops.c94
-rw-r--r--fs/gfs2/lops.h5
-rw-r--r--fs/gfs2/main.c24
-rw-r--r--fs/gfs2/meta_io.c26
-rw-r--r--fs/gfs2/meta_io.h3
-rw-r--r--fs/gfs2/ops_fstype.c186
-rw-r--r--fs/gfs2/quota.c603
-rw-r--r--fs/gfs2/quota.h10
-rw-r--r--fs/gfs2/recovery.c40
-rw-r--r--fs/gfs2/recovery.h6
-rw-r--r--fs/gfs2/rgrp.c359
-rw-r--r--fs/gfs2/rgrp.h6
-rw-r--r--fs/gfs2/super.c153
-rw-r--r--fs/gfs2/sys.c20
-rw-r--r--fs/gfs2/trans.c73
-rw-r--r--fs/gfs2/util.c121
-rw-r--r--fs/gfs2/util.h33
-rw-r--r--fs/gfs2/xattr.c7
-rw-r--r--fs/hfs/btree.h5
-rw-r--r--fs/hfs/inode.c18
-rw-r--r--fs/hfs/super.c1
-rw-r--r--fs/hfsplus/acl.h9
-rw-r--r--fs/hfsplus/attributes.c38
-rw-r--r--fs/hfsplus/bnode.c49
-rw-r--r--fs/hfsplus/btree.c114
-rw-r--r--fs/hfsplus/catalog.c41
-rw-r--r--fs/hfsplus/dir.c34
-rw-r--r--fs/hfsplus/extents.c33
-rw-r--r--fs/hfsplus/hfsplus_fs.h216
-rw-r--r--fs/hfsplus/hfsplus_raw.h18
-rw-r--r--fs/hfsplus/inode.c88
-rw-r--r--fs/hfsplus/options.c11
-rw-r--r--fs/hfsplus/posix_acl.c168
-rw-r--r--fs/hfsplus/super.c8
-rw-r--r--fs/hfsplus/wrapper.c29
-rw-r--r--fs/hfsplus/xattr.c365
-rw-r--r--fs/hfsplus/xattr.h4
-rw-r--r--fs/hfsplus/xattr_security.c49
-rw-r--r--fs/hfsplus/xattr_trusted.c32
-rw-r--r--fs/hfsplus/xattr_user.c32
-rw-r--r--fs/hostfs/hostfs_kern.c74
-rw-r--r--fs/hpfs/alloc.c68
-rw-r--r--fs/hpfs/buffer.c108
-rw-r--r--fs/hpfs/dir.c6
-rw-r--r--fs/hpfs/dnode.c44
-rw-r--r--fs/hpfs/ea.c6
-rw-r--r--fs/hpfs/file.c8
-rw-r--r--fs/hpfs/hpfs_fn.h8
-rw-r--r--fs/hpfs/inode.c5
-rw-r--r--fs/hpfs/map.c17
-rw-r--r--fs/hpfs/name.c11
-rw-r--r--fs/hpfs/namei.c4
-rw-r--r--fs/hpfs/super.c114
-rw-r--r--fs/hugetlbfs/inode.c45
-rw-r--r--fs/inode.c108
-rw-r--r--fs/internal.h7
-rw-r--r--fs/ioctl.c4
-rw-r--r--fs/ioprio.c241
-rw-r--r--fs/isofs/inode.c15
-rw-r--r--fs/jbd/journal.c8
-rw-r--r--fs/jbd/revoke.c12
-rw-r--r--fs/jbd/transaction.c12
-rw-r--r--fs/jbd2/commit.c83
-rw-r--r--fs/jbd2/journal.c28
-rw-r--r--fs/jbd2/recovery.c2
-rw-r--r--fs/jbd2/transaction.c69
-rw-r--r--fs/jffs2/acl.c141
-rw-r--r--fs/jffs2/acl.h7
-rw-r--r--fs/jffs2/background.c12
-rw-r--r--fs/jffs2/compr_rtime.c4
-rw-r--r--fs/jffs2/dir.c1
-rw-r--r--fs/jffs2/file.c9
-rw-r--r--fs/jffs2/fs.c24
-rw-r--r--fs/jffs2/malloc.c4
-rw-r--r--fs/jffs2/nodelist.c28
-rw-r--r--fs/jffs2/nodelist.h2
-rw-r--r--fs/jffs2/nodemgmt.c14
-rw-r--r--fs/jffs2/readinode.c26
-rw-r--r--fs/jffs2/super.c1
-rw-r--r--fs/jffs2/symlink.c1
-rw-r--r--fs/jffs2/xattr.c9
-rw-r--r--fs/jfs/acl.c109
-rw-r--r--fs/jfs/file.c14
-rw-r--r--fs/jfs/inode.c12
-rw-r--r--fs/jfs/jfs_acl.h7
-rw-r--r--fs/jfs/jfs_dmap.c9
-rw-r--r--fs/jfs/jfs_inode.c19
-rw-r--r--fs/jfs/jfs_logmgr.c14
-rw-r--r--fs/jfs/jfs_metapage.c9
-rw-r--r--fs/jfs/jfs_xattr.h2
-rw-r--r--fs/jfs/namei.c1
-rw-r--r--fs/jfs/super.c80
-rw-r--r--fs/jfs/xattr.c123
-rw-r--r--fs/kernfs/Kconfig7
-rw-r--r--fs/kernfs/Makefile5
-rw-r--r--fs/kernfs/dir.c1432
-rw-r--r--fs/kernfs/file.c952
-rw-r--r--fs/kernfs/inode.c383
-rw-r--r--fs/kernfs/kernfs-internal.h120
-rw-r--r--fs/kernfs/mount.c250
-rw-r--r--fs/kernfs/symlink.c147
-rw-r--r--fs/libfs.c89
-rw-r--r--fs/lockd/clnt4xdr.c2
-rw-r--r--fs/lockd/clntxdr.c2
-rw-r--r--fs/lockd/svc.c9
-rw-r--r--fs/lockd/svclock.c8
-rw-r--r--fs/lockd/svcsubs.c3
-rw-r--r--fs/lockd/xdr.c2
-rw-r--r--fs/locks.c507
-rw-r--r--fs/logfs/dev_bdev.c53
-rw-r--r--fs/logfs/file.c8
-rw-r--r--fs/logfs/readwrite.c2
-rw-r--r--fs/logfs/segment.c3
-rw-r--r--fs/mbcache.c541
-rw-r--r--fs/minix/Kconfig2
-rw-r--r--fs/minix/file.c8
-rw-r--r--fs/minix/inode.c5
-rw-r--r--fs/mount.h31
-rw-r--r--fs/mpage.c99
-rw-r--r--fs/namei.c725
-rw-r--r--fs/namespace.c575
-rw-r--r--fs/ncpfs/dir.c69
-rw-r--r--fs/ncpfs/file.c24
-rw-r--r--fs/ncpfs/getopt.c23
-rw-r--r--fs/ncpfs/inode.c104
-rw-r--r--fs/ncpfs/ioctl.c17
-rw-r--r--fs/ncpfs/mmap.c2
-rw-r--r--fs/ncpfs/ncp_fs.h30
-rw-r--r--fs/ncpfs/ncp_fs_sb.h8
-rw-r--r--fs/ncpfs/ncplib_kernel.c28
-rw-r--r--fs/ncpfs/sock.c53
-rw-r--r--fs/ncpfs/symlink.c2
-rw-r--r--fs/nfs/Kconfig17
-rw-r--r--fs/nfs/Makefile4
-rw-r--r--fs/nfs/blocklayout/blocklayout.c83
-rw-r--r--fs/nfs/blocklayout/blocklayout.h1
-rw-r--r--fs/nfs/blocklayout/extents.c2
-rw-r--r--fs/nfs/callback.c3
-rw-r--r--fs/nfs/callback_proc.c19
-rw-r--r--fs/nfs/client.c10
-rw-r--r--fs/nfs/delegation.c11
-rw-r--r--fs/nfs/dir.c120
-rw-r--r--fs/nfs/direct.c694
-rw-r--r--fs/nfs/dns_resolve.c2
-rw-r--r--fs/nfs/file.c76
-rw-r--r--fs/nfs/filelayout/Makefile5
-rw-r--r--fs/nfs/filelayout/filelayout.c (renamed from fs/nfs/nfs4filelayout.c)233
-rw-r--r--fs/nfs/filelayout/filelayout.h (renamed from fs/nfs/nfs4filelayout.h)2
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c (renamed from fs/nfs/nfs4filelayoutdev.c)12
-rw-r--r--fs/nfs/fscache.c202
-rw-r--r--fs/nfs/fscache.h18
-rw-r--r--fs/nfs/getroot.c3
-rw-r--r--fs/nfs/inode.c273
-rw-r--r--fs/nfs/internal.h81
-rw-r--r--fs/nfs/nfs2xdr.c14
-rw-r--r--fs/nfs/nfs3acl.c330
-rw-r--r--fs/nfs/nfs3proc.c130
-rw-r--r--fs/nfs/nfs3super.c3
-rw-r--r--fs/nfs/nfs3xdr.c16
-rw-r--r--fs/nfs/nfs4_fs.h43
-rw-r--r--fs/nfs/nfs4client.c168
-rw-r--r--fs/nfs/nfs4file.c15
-rw-r--r--fs/nfs/nfs4namespace.c206
-rw-r--r--fs/nfs/nfs4proc.c857
-rw-r--r--fs/nfs/nfs4session.c25
-rw-r--r--fs/nfs/nfs4session.h2
-rw-r--r--fs/nfs/nfs4state.c312
-rw-r--r--fs/nfs/nfs4super.c28
-rw-r--r--fs/nfs/nfs4sysctl.c6
-rw-r--r--fs/nfs/nfs4trace.h8
-rw-r--r--fs/nfs/nfs4xdr.c209
-rw-r--r--fs/nfs/nfstrace.h1
-rw-r--r--fs/nfs/objlayout/objio_osd.c24
-rw-r--r--fs/nfs/objlayout/objlayout.c24
-rw-r--r--fs/nfs/objlayout/objlayout.h8
-rw-r--r--fs/nfs/pagelist.c649
-rw-r--r--fs/nfs/pnfs.c250
-rw-r--r--fs/nfs/pnfs.h48
-rw-r--r--fs/nfs/proc.c46
-rw-r--r--fs/nfs/read.c422
-rw-r--r--fs/nfs/super.c229
-rw-r--r--fs/nfs/sysctl.c6
-rw-r--r--fs/nfs/unlink.c38
-rw-r--r--fs/nfs/write.c945
-rw-r--r--fs/nfsd/Kconfig2
-rw-r--r--fs/nfsd/acl.h28
-rw-r--r--fs/nfsd/auth.c10
-rw-r--r--fs/nfsd/cache.h8
-rw-r--r--fs/nfsd/export.c112
-rw-r--r--fs/nfsd/export.h110
-rw-r--r--fs/nfsd/fault_inject.c15
-rw-r--r--fs/nfsd/idmap.h4
-rw-r--r--fs/nfsd/netns.h1
-rw-r--r--fs/nfsd/nfs2acl.c84
-rw-r--r--fs/nfsd/nfs3acl.c68
-rw-r--r--fs/nfsd/nfs3xdr.c41
-rw-r--r--fs/nfsd/nfs4acl.c169
-rw-r--r--fs/nfsd/nfs4callback.c23
-rw-r--r--fs/nfsd/nfs4idmap.c52
-rw-r--r--fs/nfsd/nfs4proc.c280
-rw-r--r--fs/nfsd/nfs4state.c479
-rw-r--r--fs/nfsd/nfs4xdr.c2198
-rw-r--r--fs/nfsd/nfscache.c46
-rw-r--r--fs/nfsd/nfsctl.c6
-rw-r--r--fs/nfsd/nfsd.h19
-rw-r--r--fs/nfsd/nfsfh.c33
-rw-r--r--fs/nfsd/nfsfh.h73
-rw-r--r--fs/nfsd/nfssvc.c36
-rw-r--r--fs/nfsd/nfsxdr.c15
-rw-r--r--fs/nfsd/state.h5
-rw-r--r--fs/nfsd/stats.c1
-rw-r--r--fs/nfsd/stats.h43
-rw-r--r--fs/nfsd/vfs.c587
-rw-r--r--fs/nfsd/vfs.h20
-rw-r--r--fs/nfsd/xdr3.h3
-rw-r--r--fs/nfsd/xdr4.h29
-rw-r--r--fs/nilfs2/cpfile.c12
-rw-r--r--fs/nilfs2/dat.c12
-rw-r--r--fs/nilfs2/file.c9
-rw-r--r--fs/nilfs2/inode.c15
-rw-r--r--fs/nilfs2/ioctl.c508
-rw-r--r--fs/nilfs2/segbuf.c3
-rw-r--r--fs/nilfs2/segment.c10
-rw-r--r--fs/nilfs2/sufile.c295
-rw-r--r--fs/nilfs2/sufile.h2
-rw-r--r--fs/nilfs2/super.c1
-rw-r--r--fs/nilfs2/the_nilfs.c10
-rw-r--r--fs/nls/mac-celtic.c1
-rw-r--r--fs/nls/mac-centeuro.c1
-rw-r--r--fs/nls/mac-croatian.c1
-rw-r--r--fs/nls/mac-cyrillic.c1
-rw-r--r--fs/nls/mac-gaelic.c1
-rw-r--r--fs/nls/mac-greek.c1
-rw-r--r--fs/nls/mac-iceland.c1
-rw-r--r--fs/nls/mac-inuit.c1
-rw-r--r--fs/nls/mac-roman.c1
-rw-r--r--fs/nls/mac-romanian.c1
-rw-r--r--fs/nls/mac-turkish.c1
-rw-r--r--fs/nls/nls_ascii.c1
-rw-r--r--fs/nls/nls_base.c5
-rw-r--r--fs/nls/nls_cp1250.c1
-rw-r--r--fs/nls/nls_cp1251.c1
-rw-r--r--fs/nls/nls_cp1255.c1
-rw-r--r--fs/nls/nls_cp437.c1
-rw-r--r--fs/nls/nls_cp737.c1
-rw-r--r--fs/nls/nls_cp775.c1
-rw-r--r--fs/nls/nls_cp850.c1
-rw-r--r--fs/nls/nls_cp852.c1
-rw-r--r--fs/nls/nls_cp855.c1
-rw-r--r--fs/nls/nls_cp857.c1
-rw-r--r--fs/nls/nls_cp860.c1
-rw-r--r--fs/nls/nls_cp861.c1
-rw-r--r--fs/nls/nls_cp862.c1
-rw-r--r--fs/nls/nls_cp863.c1
-rw-r--r--fs/nls/nls_cp864.c1
-rw-r--r--fs/nls/nls_cp865.c1
-rw-r--r--fs/nls/nls_cp866.c1
-rw-r--r--fs/nls/nls_cp869.c1
-rw-r--r--fs/nls/nls_cp874.c1
-rw-r--r--fs/nls/nls_cp932.c1
-rw-r--r--fs/nls/nls_cp936.c1
-rw-r--r--fs/nls/nls_cp949.c1
-rw-r--r--fs/nls/nls_cp950.c1
-rw-r--r--fs/nls/nls_euc-jp.c1
-rw-r--r--fs/nls/nls_iso8859-1.c1
-rw-r--r--fs/nls/nls_iso8859-13.c1
-rw-r--r--fs/nls/nls_iso8859-14.c1
-rw-r--r--fs/nls/nls_iso8859-15.c1
-rw-r--r--fs/nls/nls_iso8859-2.c1
-rw-r--r--fs/nls/nls_iso8859-3.c1
-rw-r--r--fs/nls/nls_iso8859-4.c1
-rw-r--r--fs/nls/nls_iso8859-5.c1
-rw-r--r--fs/nls/nls_iso8859-6.c1
-rw-r--r--fs/nls/nls_iso8859-7.c1
-rw-r--r--fs/nls/nls_iso8859-9.c1
-rw-r--r--fs/nls/nls_koi8-r.c1
-rw-r--r--fs/nls/nls_koi8-ru.c1
-rw-r--r--fs/nls/nls_koi8-u.c1
-rw-r--r--fs/nls/nls_utf8.c1
-rw-r--r--fs/notify/dnotify/dnotify.c34
-rw-r--r--fs/notify/fanotify/fanotify.c263
-rw-r--r--fs/notify/fanotify/fanotify.h50
-rw-r--r--fs/notify/fanotify/fanotify_user.c275
-rw-r--r--fs/notify/fsnotify.c42
-rw-r--r--fs/notify/group.c7
-rw-r--r--fs/notify/inotify/inotify.h21
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c161
-rw-r--r--fs/notify/inotify/inotify_user.c133
-rw-r--r--fs/notify/mark.c2
-rw-r--r--fs/notify/notification.c358
-rw-r--r--fs/ntfs/attrib.c1
-rw-r--r--fs/ntfs/compress.c2
-rw-r--r--fs/ntfs/debug.c58
-rw-r--r--fs/ntfs/debug.h7
-rw-r--r--fs/ntfs/file.c12
-rw-r--r--fs/ntfs/inode.c6
-rw-r--r--fs/ntfs/super.c34
-rw-r--r--fs/ntfs/sysctl.c6
-rw-r--r--fs/ocfs2/Makefile1
-rw-r--r--fs/ocfs2/acl.c235
-rw-r--r--fs/ocfs2/acl.h13
-rw-r--r--fs/ocfs2/alloc.c61
-rw-r--r--fs/ocfs2/aops.c46
-rw-r--r--fs/ocfs2/aops.h5
-rw-r--r--fs/ocfs2/buffer_head_io.c6
-rw-r--r--fs/ocfs2/cluster/Makefile2
-rw-r--r--fs/ocfs2/cluster/heartbeat.c44
-rw-r--r--fs/ocfs2/cluster/masklog.h3
-rw-r--r--fs/ocfs2/cluster/nodemanager.c4
-rw-r--r--fs/ocfs2/cluster/sys.c2
-rw-r--r--fs/ocfs2/cluster/tcp.c123
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h2
-rw-r--r--fs/ocfs2/cluster/ver.c42
-rw-r--r--fs/ocfs2/cluster/ver.h31
-rw-r--r--fs/ocfs2/dcache.c61
-rw-r--r--fs/ocfs2/dcache.h12
-rw-r--r--fs/ocfs2/dir.c18
-rw-r--r--fs/ocfs2/dlm/Makefile2
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h5
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c2
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c46
-rw-r--r--fs/ocfs2/dlm/dlmlock.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c79
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c49
-rw-r--r--fs/ocfs2/dlm/dlmthread.c13
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c18
-rw-r--r--fs/ocfs2/dlm/dlmver.c42
-rw-r--r--fs/ocfs2/dlm/dlmver.h31
-rw-r--r--fs/ocfs2/dlmfs/Makefile2
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c4
-rw-r--r--fs/ocfs2/dlmfs/dlmfsver.c42
-rw-r--r--fs/ocfs2/dlmfs/dlmfsver.h31
-rw-r--r--fs/ocfs2/dlmglue.c57
-rw-r--r--fs/ocfs2/dlmglue.h3
-rw-r--r--fs/ocfs2/file.c280
-rw-r--r--fs/ocfs2/inode.c71
-rw-r--r--fs/ocfs2/inode.h17
-rw-r--r--fs/ocfs2/ioctl.c93
-rw-r--r--fs/ocfs2/journal.c23
-rw-r--r--fs/ocfs2/journal.h14
-rw-r--r--fs/ocfs2/localalloc.c42
-rw-r--r--fs/ocfs2/localalloc.h6
-rw-r--r--fs/ocfs2/locks.c2
-rw-r--r--fs/ocfs2/move_extents.c95
-rw-r--r--fs/ocfs2/namei.c199
-rw-r--r--fs/ocfs2/ocfs2.h35
-rw-r--r--fs/ocfs2/ocfs2_trace.h2
-rw-r--r--fs/ocfs2/quota.h2
-rw-r--r--fs/ocfs2/quota_global.c62
-rw-r--r--fs/ocfs2/quota_local.c4
-rw-r--r--fs/ocfs2/refcounttree.c54
-rw-r--r--fs/ocfs2/resize.c22
-rw-r--r--fs/ocfs2/stack_o2cb.c3
-rw-r--r--fs/ocfs2/stack_user.c308
-rw-r--r--fs/ocfs2/stackglue.c50
-rw-r--r--fs/ocfs2/stackglue.h15
-rw-r--r--fs/ocfs2/suballoc.c45
-rw-r--r--fs/ocfs2/suballoc.h16
-rw-r--r--fs/ocfs2/super.c79
-rw-r--r--fs/ocfs2/sysfile.c3
-rw-r--r--fs/ocfs2/uptodate.c2
-rw-r--r--fs/ocfs2/ver.c43
-rw-r--r--fs/ocfs2/ver.h31
-rw-r--r--fs/ocfs2/xattr.c84
-rw-r--r--fs/ocfs2/xattr.h6
-rw-r--r--fs/omfs/file.c8
-rw-r--r--fs/omfs/inode.c2
-rw-r--r--fs/open.c135
-rw-r--r--fs/openpromfs/inode.c1
-rw-r--r--fs/pipe.c296
-rw-r--r--fs/pnode.c215
-rw-r--r--fs/pnode.h7
-rw-r--r--fs/posix_acl.c533
-rw-r--r--fs/proc/Kconfig4
-rw-r--r--fs/proc/Makefile1
-rw-r--r--fs/proc/array.c24
-rw-r--r--fs/proc/base.c139
-rw-r--r--fs/proc/cmdline.c2
-rw-r--r--fs/proc/consoles.c12
-rw-r--r--fs/proc/cpuinfo.c2
-rw-r--r--fs/proc/devices.c2
-rw-r--r--fs/proc/fd.c6
-rw-r--r--fs/proc/generic.c21
-rw-r--r--fs/proc/inode.c28
-rw-r--r--fs/proc/internal.h7
-rw-r--r--fs/proc/interrupts.c2
-rw-r--r--fs/proc/kcore.c5
-rw-r--r--fs/proc/kmsg.c2
-rw-r--r--fs/proc/loadavg.c2
-rw-r--r--fs/proc/meminfo.c46
-rw-r--r--fs/proc/namespaces.c22
-rw-r--r--fs/proc/nommu.c14
-rw-r--r--fs/proc/page.c9
-rw-r--r--fs/proc/proc_devtree.c243
-rw-r--r--fs/proc/root.c5
-rw-r--r--fs/proc/self.c2
-rw-r--r--fs/proc/softirqs.c2
-rw-r--r--fs/proc/stat.c26
-rw-r--r--fs/proc/task_mmu.c90
-rw-r--r--fs/proc/task_nommu.c19
-rw-r--r--fs/proc/uptime.c4
-rw-r--r--fs/proc/version.c2
-rw-r--r--fs/proc/vmcore.c29
-rw-r--r--fs/proc_namespace.c16
-rw-r--r--fs/pstore/inode.c1
-rw-r--r--fs/pstore/platform.c27
-rw-r--r--fs/pstore/ram.c19
-rw-r--r--fs/pstore/ram_core.c40
-rw-r--r--fs/qnx4/inode.c64
-rw-r--r--fs/qnx4/namei.c4
-rw-r--r--fs/qnx4/qnx4.h2
-rw-r--r--fs/qnx6/inode.c1
-rw-r--r--fs/quota/Kconfig7
-rw-r--r--fs/quota/dquot.c20
-rw-r--r--fs/quota/netlink.c16
-rw-r--r--fs/quota/quota.c15
-rw-r--r--fs/ramfs/file-mmu.c17
-rw-r--r--fs/ramfs/file-nommu.c27
-rw-r--r--fs/ramfs/inode.c9
-rw-r--r--fs/ramfs/internal.h1
-rw-r--r--fs/read_write.c219
-rw-r--r--fs/readdir.c4
-rw-r--r--fs/reiserfs/acl.h4
-rw-r--r--fs/reiserfs/bitmap.c272
-rw-r--r--fs/reiserfs/dir.c162
-rw-r--r--fs/reiserfs/do_balan.c2954
-rw-r--r--fs/reiserfs/file.c101
-rw-r--r--fs/reiserfs/fix_node.c1008
-rw-r--r--fs/reiserfs/hashes.c15
-rw-r--r--fs/reiserfs/ibalance.c271
-rw-r--r--fs/reiserfs/inode.c1218
-rw-r--r--fs/reiserfs/ioctl.c27
-rw-r--r--fs/reiserfs/item_ops.c108
-rw-r--r--fs/reiserfs/journal.c1339
-rw-r--r--fs/reiserfs/lbalance.c501
-rw-r--r--fs/reiserfs/namei.c517
-rw-r--r--fs/reiserfs/objectid.c101
-rw-r--r--fs/reiserfs/prints.c176
-rw-r--r--fs/reiserfs/procfs.c4
-rw-r--r--fs/reiserfs/reiserfs.h1932
-rw-r--r--fs/reiserfs/resize.c75
-rw-r--r--fs/reiserfs/stree.c892
-rw-r--r--fs/reiserfs/super.c564
-rw-r--r--fs/reiserfs/tail_conversion.c161
-rw-r--r--fs/reiserfs/xattr.c75
-rw-r--r--fs/reiserfs/xattr.h3
-rw-r--r--fs/reiserfs/xattr_acl.c226
-rw-r--r--fs/romfs/mmap-nommu.c4
-rw-r--r--fs/romfs/super.c7
-rw-r--r--fs/select.c7
-rw-r--r--fs/seq_file.c50
-rw-r--r--fs/splice.c337
-rw-r--r--fs/squashfs/Kconfig72
-rw-r--r--fs/squashfs/Makefile5
-rw-r--r--fs/squashfs/block.c36
-rw-r--r--fs/squashfs/cache.c28
-rw-r--r--fs/squashfs/decompressor.c59
-rw-r--r--fs/squashfs/decompressor.h24
-rw-r--r--fs/squashfs/decompressor_multi.c198
-rw-r--r--fs/squashfs/decompressor_multi_percpu.c97
-rw-r--r--fs/squashfs/decompressor_single.c85
-rw-r--r--fs/squashfs/file.c142
-rw-r--r--fs/squashfs/file_cache.c38
-rw-r--r--fs/squashfs/file_direct.c176
-rw-r--r--fs/squashfs/lzo_wrapper.c47
-rw-r--r--fs/squashfs/page_actor.c100
-rw-r--r--fs/squashfs/page_actor.h81
-rw-r--r--fs/squashfs/squashfs.h22
-rw-r--r--fs/squashfs/squashfs_fs_sb.h4
-rw-r--r--fs/squashfs/super.c11
-rw-r--r--fs/squashfs/xz_wrapper.c105
-rw-r--r--fs/squashfs/zlib_wrapper.c64
-rw-r--r--fs/stat.c31
-rw-r--r--fs/statfs.c2
-rw-r--r--fs/super.c224
-rw-r--r--fs/sync.c19
-rw-r--r--fs/sysfs/Kconfig1
-rw-r--r--fs/sysfs/Makefile3
-rw-r--r--fs/sysfs/bin.c502
-rw-r--r--fs/sysfs/dir.c1043
-rw-r--r--fs/sysfs/file.c875
-rw-r--r--fs/sysfs/group.c114
-rw-r--r--fs/sysfs/inode.c357
-rw-r--r--fs/sysfs/mount.c188
-rw-r--r--fs/sysfs/symlink.c251
-rw-r--r--fs/sysfs/sysfs.h231
-rw-r--r--fs/sysv/file.c8
-rw-r--r--fs/sysv/inode.c3
-rw-r--r--fs/timerfd.c1
-rw-r--r--fs/ubifs/budget.c1
-rw-r--r--fs/ubifs/debug.c32
-rw-r--r--fs/ubifs/file.c32
-rw-r--r--fs/ubifs/gc.c3
-rw-r--r--fs/ubifs/io.c18
-rw-r--r--fs/ubifs/log.c21
-rw-r--r--fs/ubifs/lpt_commit.c4
-rw-r--r--fs/ubifs/orphan.c21
-rw-r--r--fs/ubifs/recovery.c21
-rw-r--r--fs/ubifs/shrinker.c1
-rw-r--r--fs/ubifs/super.c42
-rw-r--r--fs/ubifs/tnc.c27
-rw-r--r--fs/ubifs/tnc_commit.c4
-rw-r--r--fs/ubifs/ubifs.h11
-rw-r--r--fs/udf/file.c31
-rw-r--r--fs/udf/inode.c15
-rw-r--r--fs/udf/namei.c2
-rw-r--r--fs/udf/super.c54
-rw-r--r--fs/ufs/balloc.c48
-rw-r--r--fs/ufs/file.c8
-rw-r--r--fs/ufs/ialloc.c21
-rw-r--r--fs/ufs/inode.c2
-rw-r--r--fs/ufs/super.c37
-rw-r--r--fs/ufs/ufs.h1
-rw-r--r--fs/utimes.c9
-rw-r--r--fs/xattr.c2
-rw-r--r--fs/xattr_acl.c180
-rw-r--r--fs/xfs/Makefile8
-rw-r--r--fs/xfs/kmem.c43
-rw-r--r--fs/xfs/kmem.h21
-rw-r--r--fs/xfs/xfs_acl.c161
-rw-r--r--fs/xfs/xfs_acl.h9
-rw-r--r--fs/xfs/xfs_ag.h46
-rw-r--r--fs/xfs/xfs_alloc.c83
-rw-r--r--fs/xfs/xfs_alloc.h3
-rw-r--r--fs/xfs/xfs_alloc_btree.c31
-rw-r--r--fs/xfs/xfs_alloc_btree.h35
-rw-r--r--fs/xfs/xfs_aops.c227
-rw-r--r--fs/xfs/xfs_attr.c376
-rw-r--r--fs/xfs/xfs_attr_inactive.c21
-rw-r--r--fs/xfs/xfs_attr_leaf.c251
-rw-r--r--fs/xfs/xfs_attr_leaf.h235
-rw-r--r--fs/xfs/xfs_attr_list.c42
-rw-r--r--fs/xfs/xfs_attr_remote.c97
-rw-r--r--fs/xfs/xfs_attr_remote.h29
-rw-r--r--fs/xfs/xfs_bit.c4
-rw-r--r--fs/xfs/xfs_bit.h7
-rw-r--r--fs/xfs/xfs_bmap.c512
-rw-r--r--fs/xfs/xfs_bmap.h23
-rw-r--r--fs/xfs/xfs_bmap_btree.c38
-rw-r--r--fs/xfs/xfs_bmap_btree.h107
-rw-r--r--fs/xfs/xfs_bmap_util.c440
-rw-r--r--fs/xfs/xfs_bmap_util.h24
-rw-r--r--fs/xfs/xfs_btree.c164
-rw-r--r--fs/xfs/xfs_btree.h84
-rw-r--r--fs/xfs/xfs_buf.c132
-rw-r--r--fs/xfs/xfs_buf.h52
-rw-r--r--fs/xfs/xfs_buf_item.c157
-rw-r--r--fs/xfs/xfs_buf_item.h4
-rw-r--r--fs/xfs/xfs_da_btree.c397
-rw-r--r--fs/xfs/xfs_da_btree.h169
-rw-r--r--fs/xfs/xfs_da_format.c911
-rw-r--r--fs/xfs/xfs_da_format.h (renamed from fs/xfs/xfs_dir2_format.h)813
-rw-r--r--fs/xfs/xfs_dinode.h2
-rw-r--r--fs/xfs/xfs_dir2.c478
-rw-r--r--fs/xfs/xfs_dir2.h120
-rw-r--r--fs/xfs/xfs_dir2_block.c187
-rw-r--r--fs/xfs/xfs_dir2_data.c238
-rw-r--r--fs/xfs/xfs_dir2_leaf.c410
-rw-r--r--fs/xfs/xfs_dir2_node.c494
-rw-r--r--fs/xfs/xfs_dir2_priv.h158
-rw-r--r--fs/xfs/xfs_dir2_readdir.c199
-rw-r--r--fs/xfs/xfs_dir2_sf.c309
-rw-r--r--fs/xfs/xfs_discard.c16
-rw-r--r--fs/xfs/xfs_dquot.c201
-rw-r--r--fs/xfs/xfs_dquot.h4
-rw-r--r--fs/xfs/xfs_dquot_buf.c290
-rw-r--r--fs/xfs/xfs_dquot_item.c81
-rw-r--r--fs/xfs/xfs_dquot_item.h3
-rw-r--r--fs/xfs/xfs_error.c36
-rw-r--r--fs/xfs/xfs_error.h1
-rw-r--r--fs/xfs/xfs_export.c14
-rw-r--r--fs/xfs/xfs_extent_busy.c11
-rw-r--r--fs/xfs/xfs_extent_busy.h4
-rw-r--r--fs/xfs/xfs_extfree_item.c29
-rw-r--r--fs/xfs/xfs_file.c268
-rw-r--r--fs/xfs/xfs_filestream.c696
-rw-r--r--fs/xfs/xfs_filestream.h34
-rw-r--r--fs/xfs/xfs_format.h275
-rw-r--r--fs/xfs/xfs_fs.h5
-rw-r--r--fs/xfs/xfs_fsops.c98
-rw-r--r--fs/xfs/xfs_ialloc.c813
-rw-r--r--fs/xfs/xfs_ialloc.h28
-rw-r--r--fs/xfs/xfs_ialloc_btree.c98
-rw-r--r--fs/xfs/xfs_ialloc_btree.h54
-rw-r--r--fs/xfs/xfs_icache.c32
-rw-r--r--fs/xfs/xfs_icache.h6
-rw-r--r--fs/xfs/xfs_icreate_item.c17
-rw-r--r--fs/xfs/xfs_inode.c719
-rw-r--r--fs/xfs/xfs_inode.h27
-rw-r--r--fs/xfs/xfs_inode_buf.c34
-rw-r--r--fs/xfs/xfs_inode_buf.h3
-rw-r--r--fs/xfs/xfs_inode_fork.c60
-rw-r--r--fs/xfs/xfs_inode_fork.h4
-rw-r--r--fs/xfs/xfs_inode_item.c388
-rw-r--r--fs/xfs/xfs_inode_item.h5
-rw-r--r--fs/xfs/xfs_ioctl.c199
-rw-r--r--fs/xfs/xfs_ioctl32.c15
-rw-r--r--fs/xfs/xfs_iomap.c38
-rw-r--r--fs/xfs/xfs_iomap.h8
-rw-r--r--fs/xfs/xfs_iops.c303
-rw-r--r--fs/xfs/xfs_iops.h8
-rw-r--r--fs/xfs/xfs_itable.c43
-rw-r--r--fs/xfs/xfs_linux.h2
-rw-r--r--fs/xfs/xfs_log.c149
-rw-r--r--fs/xfs/xfs_log.h67
-rw-r--r--fs/xfs/xfs_log_cil.c211
-rw-r--r--fs/xfs/xfs_log_format.h177
-rw-r--r--fs/xfs/xfs_log_priv.h17
-rw-r--r--fs/xfs/xfs_log_recover.c228
-rw-r--r--fs/xfs/xfs_log_rlimit.c11
-rw-r--r--fs/xfs/xfs_message.c5
-rw-r--r--fs/xfs/xfs_mount.c102
-rw-r--r--fs/xfs/xfs_mount.h17
-rw-r--r--fs/xfs/xfs_mru_cache.c151
-rw-r--r--fs/xfs/xfs_mru_cache.h31
-rw-r--r--fs/xfs/xfs_qm.c262
-rw-r--r--fs/xfs/xfs_qm.h20
-rw-r--r--fs/xfs/xfs_qm_bhv.c12
-rw-r--r--fs/xfs/xfs_qm_syscalls.c50
-rw-r--r--fs/xfs/xfs_quota.h4
-rw-r--r--fs/xfs/xfs_quota_defs.h4
-rw-r--r--fs/xfs/xfs_quota_priv.h42
-rw-r--r--fs/xfs/xfs_quotaops.c34
-rw-r--r--fs/xfs/xfs_rtalloc.c1552
-rw-r--r--fs/xfs/xfs_rtalloc.h24
-rw-r--r--fs/xfs/xfs_rtbitmap.c973
-rw-r--r--fs/xfs/xfs_sb.c100
-rw-r--r--fs/xfs/xfs_sb.h240
-rw-r--r--fs/xfs/xfs_shared.h246
-rw-r--r--fs/xfs/xfs_stats.c1
-rw-r--r--fs/xfs/xfs_stats.h18
-rw-r--r--fs/xfs/xfs_super.c67
-rw-r--r--fs/xfs/xfs_symlink.c112
-rw-r--r--fs/xfs/xfs_symlink.h2
-rw-r--r--fs/xfs/xfs_symlink_remote.c23
-rw-r--r--fs/xfs/xfs_trace.c17
-rw-r--r--fs/xfs/xfs_trace.h145
-rw-r--r--fs/xfs/xfs_trans.c37
-rw-r--r--fs/xfs/xfs_trans.h22
-rw-r--r--fs/xfs/xfs_trans_ail.c15
-rw-r--r--fs/xfs/xfs_trans_buf.c36
-rw-r--r--fs/xfs/xfs_trans_dquot.c19
-rw-r--r--fs/xfs/xfs_trans_extfree.c7
-rw-r--r--fs/xfs/xfs_trans_inode.c21
-rw-r--r--fs/xfs/xfs_trans_priv.h4
-rw-r--r--fs/xfs/xfs_trans_resv.c163
-rw-r--r--fs/xfs/xfs_trans_resv.h3
-rw-r--r--fs/xfs/xfs_trans_space.h12
-rw-r--r--fs/xfs/xfs_types.h2
-rw-r--r--fs/xfs/xfs_vnode.h17
-rw-r--r--fs/xfs/xfs_xattr.c12
1051 files changed, 68222 insertions, 50233 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 7af425f53be..8482f2d1160 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -156,7 +156,7 @@ int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid)
return -EOPNOTSUPP;
acl = v9fs_get_cached_acl(inode, ACL_TYPE_ACCESS);
if (acl) {
- retval = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+ retval = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
if (retval)
return retval;
set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
@@ -200,7 +200,7 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep,
if (acl) {
if (S_ISDIR(mode))
*dpacl = posix_acl_dup(acl);
- retval = posix_acl_create(&acl, GFP_NOFS, &mode);
+ retval = __posix_acl_create(&acl, GFP_NOFS, &mode);
if (retval < 0)
return retval;
if (retval > 0)
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index a9ea73d6dcf..a69260f2755 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -90,7 +90,7 @@ void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index,
&v9fs_cache_session_index_def,
- v9ses);
+ v9ses, true);
p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n",
v9ses, v9ses->fscache);
}
@@ -204,7 +204,7 @@ void v9fs_cache_inode_get_cookie(struct inode *inode)
v9ses = v9fs_inode2v9ses(inode);
v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
&v9fs_cache_inode_index_def,
- v9inode);
+ v9inode, true);
p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
inode, v9inode->fscache);
@@ -239,13 +239,12 @@ void v9fs_cache_inode_flush_cookie(struct inode *inode)
void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
{
struct v9fs_inode *v9inode = V9FS_I(inode);
- struct p9_fid *fid;
if (!v9inode->fscache)
return;
spin_lock(&v9inode->fscache_lock);
- fid = filp->private_data;
+
if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
v9fs_cache_inode_flush_cookie(inode);
else
@@ -271,7 +270,7 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode)
v9ses = v9fs_inode2v9ses(inode);
v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
&v9fs_cache_inode_index_def,
- v9inode);
+ v9inode, true);
p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n",
inode, old, v9inode->fscache);
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 08f2e1e9a7e..6894b085f0e 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -56,7 +56,7 @@ enum {
/* Options that take no arguments */
Opt_nodevmap,
/* Cache options */
- Opt_cache_loose, Opt_fscache,
+ Opt_cache_loose, Opt_fscache, Opt_mmap,
/* Access options */
Opt_access, Opt_posixacl,
/* Error token */
@@ -74,6 +74,7 @@ static const match_table_t tokens = {
{Opt_cache, "cache=%s"},
{Opt_cache_loose, "loose"},
{Opt_fscache, "fscache"},
+ {Opt_mmap, "mmap"},
{Opt_cachetag, "cachetag=%s"},
{Opt_access, "access=%s"},
{Opt_posixacl, "posixacl"},
@@ -91,6 +92,9 @@ static int get_cache_mode(char *s)
} else if (!strcmp(s, "fscache")) {
version = CACHE_FSCACHE;
p9_debug(P9_DEBUG_9P, "Cache mode: fscache\n");
+ } else if (!strcmp(s, "mmap")) {
+ version = CACHE_MMAP;
+ p9_debug(P9_DEBUG_9P, "Cache mode: mmap\n");
} else if (!strcmp(s, "none")) {
version = CACHE_NONE;
p9_debug(P9_DEBUG_9P, "Cache mode: none\n");
@@ -220,6 +224,9 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
case Opt_fscache:
v9ses->cache = CACHE_FSCACHE;
break;
+ case Opt_mmap:
+ v9ses->cache = CACHE_MMAP;
+ break;
case Opt_cachetag:
#ifdef CONFIG_9P_FSCACHE
v9ses->cachetag = match_strdup(&args[0]);
@@ -530,7 +537,7 @@ static struct attribute_group v9fs_attr_group = {
*
*/
-static int v9fs_sysfs_init(void)
+static int __init v9fs_sysfs_init(void)
{
v9fs_kobj = kobject_create_and_add("9p", fs_kobj);
if (!v9fs_kobj)
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index a8e127c8962..099c7712631 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -64,6 +64,7 @@ enum p9_session_flags {
enum p9_cache_modes {
CACHE_NONE,
+ CACHE_MMAP,
CACHE_LOOSE,
CACHE_FSCACHE,
};
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index dc95a252523..b83ebfbf3fd 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -50,6 +50,8 @@ extern const struct dentry_operations v9fs_dentry_operations;
extern const struct dentry_operations v9fs_cached_dentry_operations;
extern const struct file_operations v9fs_cached_file_operations;
extern const struct file_operations v9fs_cached_file_operations_dotl;
+extern const struct file_operations v9fs_mmap_file_operations;
+extern const struct file_operations v9fs_mmap_file_operations_dotl;
extern struct kmem_cache *v9fs_inode_cache;
struct inode *v9fs_alloc_inode(struct super_block *sb);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 9ff073f4090..cc1cfae726b 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -202,6 +202,8 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
{
int retval;
+ p9_debug(P9_DEBUG_VFS, "page %p\n", page);
+
retval = v9fs_vfs_writepage_locked(page);
if (retval < 0) {
if (retval == -EAGAIN) {
@@ -257,8 +259,7 @@ static int v9fs_launder_page(struct page *page)
*
*/
static ssize_t
-v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
- loff_t pos, unsigned long nr_segs)
+v9fs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
{
/*
* FIXME
@@ -267,7 +268,7 @@ v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
*/
p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) off/no(%lld/%lu) EINVAL\n",
iocb->ki_filp->f_path.dentry->d_name.name,
- (long long)pos, nr_segs);
+ (long long)pos, iter->nr_segs);
return -EINVAL;
}
@@ -282,6 +283,9 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
struct inode *inode = mapping->host;
+
+ p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
+
v9inode = V9FS_I(inode);
start:
page = grab_cache_page_write_begin(mapping, index, flags);
@@ -312,6 +316,8 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
loff_t last_pos = pos + copied;
struct inode *inode = page->mapping->host;
+ p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
+
if (unlikely(copied < len)) {
/*
* zero out the rest of the area
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index f039b104a98..b03dd23feda 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -43,23 +43,6 @@
#include "fid.h"
/**
- * v9fs_dentry_delete - called when dentry refcount equals 0
- * @dentry: dentry in question
- *
- * By returning 1 here we should remove cacheing of unused
- * dentry components.
- *
- */
-
-static int v9fs_dentry_delete(const struct dentry *dentry)
-{
- p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
- dentry->d_name.name, dentry);
-
- return 1;
-}
-
-/**
* v9fs_cached_dentry_delete - called when dentry refcount equals 0
* @dentry: dentry in question
*
@@ -134,6 +117,6 @@ const struct dentry_operations v9fs_cached_dentry_operations = {
};
const struct dentry_operations v9fs_dentry_operations = {
- .d_delete = v9fs_dentry_delete,
+ .d_delete = always_delete_dentry,
.d_release = v9fs_dentry_release,
};
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 4d0c2e0be7e..0b3bfa303dd 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -42,7 +42,6 @@
/**
* struct p9_rdir - readdir accounting
- * @mutex: mutex protecting readdir
* @head: start offset of current dirread buffer
* @tail: end offset of current dirread buffer
* @buf: dirread buffer
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index a0df3e73c2b..520c11c2dcc 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -45,6 +45,7 @@
#include "cache.h"
static const struct vm_operations_struct v9fs_file_vm_ops;
+static const struct vm_operations_struct v9fs_mmap_file_vm_ops;
/**
* v9fs_file_open - open a file (or directory)
@@ -87,7 +88,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
file->private_data = fid;
mutex_lock(&v9inode->v_mutex);
- if (v9ses->cache && !v9inode->writeback_fid &&
+ if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
+ !v9inode->writeback_fid &&
((file->f_flags & O_ACCMODE) != O_RDONLY)) {
/*
* clone a fid and add it to writeback_fid
@@ -105,7 +107,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
v9inode->writeback_fid = (void *) fid;
}
mutex_unlock(&v9inode->v_mutex);
- if (v9ses->cache)
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
v9fs_cache_inode_set_cookie(inode, file);
return 0;
out_error:
@@ -350,9 +352,6 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd,
invalidate_mapping_pages(&inode->i_data, 0, -1);
}
/* Convert flock to posix lock */
- fl->fl_owner = (fl_owner_t)filp;
- fl->fl_start = 0;
- fl->fl_end = OFFSET_MAX;
fl->fl_flags |= FL_POSIX;
fl->fl_flags ^= FL_FLOCK;
@@ -461,14 +460,12 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
int n;
loff_t i_size;
size_t total = 0;
- struct p9_client *clnt;
loff_t origin = *offset;
unsigned long pg_start, pg_end;
p9_debug(P9_DEBUG_VFS, "data %p count %d offset %x\n",
data, (int)count, (int)*offset);
- clnt = fid->clnt;
do {
n = p9_client_write(fid, NULL, data+total, origin+total, count);
if (n <= 0)
@@ -581,11 +578,12 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
}
static int
-v9fs_file_mmap(struct file *file, struct vm_area_struct *vma)
+v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
int retval;
- retval = generic_file_mmap(file, vma);
+
+ retval = generic_file_mmap(filp, vma);
if (!retval)
vma->vm_ops = &v9fs_file_vm_ops;
@@ -593,6 +591,43 @@ v9fs_file_mmap(struct file *file, struct vm_area_struct *vma)
}
static int
+v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ int retval;
+ struct inode *inode;
+ struct v9fs_inode *v9inode;
+ struct p9_fid *fid;
+
+ inode = file_inode(filp);
+ v9inode = V9FS_I(inode);
+ mutex_lock(&v9inode->v_mutex);
+ if (!v9inode->writeback_fid &&
+ (vma->vm_flags & VM_WRITE)) {
+ /*
+ * clone a fid and add it to writeback_fid
+ * we do it during mmap instead of
+ * page dirty time via write_begin/page_mkwrite
+ * because we want write after unlink usecase
+ * to work.
+ */
+ fid = v9fs_writeback_fid(filp->f_path.dentry);
+ if (IS_ERR(fid)) {
+ retval = PTR_ERR(fid);
+ mutex_unlock(&v9inode->v_mutex);
+ return retval;
+ }
+ v9inode->writeback_fid = (void *) fid;
+ }
+ mutex_unlock(&v9inode->v_mutex);
+
+ retval = generic_file_mmap(filp, vma);
+ if (!retval)
+ vma->vm_ops = &v9fs_mmap_file_vm_ops;
+
+ return retval;
+}
+
+static int
v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct v9fs_inode *v9inode;
@@ -646,7 +681,7 @@ v9fs_direct_read(struct file *filp, char __user *udata, size_t count,
/**
* v9fs_cached_file_read - read from a file
* @filp: file pointer to read
- * @udata: user data buffer to read data into
+ * @data: user data buffer to read data into
* @count: size of buffer
* @offset: offset at which to read data
*
@@ -657,7 +692,23 @@ v9fs_cached_file_read(struct file *filp, char __user *data, size_t count,
{
if (filp->f_flags & O_DIRECT)
return v9fs_direct_read(filp, data, count, offset);
- return do_sync_read(filp, data, count, offset);
+ return new_sync_read(filp, data, count, offset);
+}
+
+/**
+ * v9fs_mmap_file_read - read from a file
+ * @filp: file pointer to read
+ * @data: user data buffer to read data into
+ * @count: size of buffer
+ * @offset: offset at which to read data
+ *
+ */
+static ssize_t
+v9fs_mmap_file_read(struct file *filp, char __user *data, size_t count,
+ loff_t *offset)
+{
+ /* TODO: Check if there are dirty pages */
+ return v9fs_file_read(filp, data, count, offset);
}
static ssize_t
@@ -709,7 +760,7 @@ err_out:
buff_write:
mutex_unlock(&inode->i_mutex);
- return do_sync_write(filp, data, count, offsetp);
+ return new_sync_write(filp, data, count, offsetp);
}
/**
@@ -727,11 +778,66 @@ v9fs_cached_file_write(struct file *filp, const char __user * data,
if (filp->f_flags & O_DIRECT)
return v9fs_direct_write(filp, data, count, offset);
- return do_sync_write(filp, data, count, offset);
+ return new_sync_write(filp, data, count, offset);
+}
+
+
+/**
+ * v9fs_mmap_file_write - write to a file
+ * @filp: file pointer to write
+ * @data: data buffer to write data from
+ * @count: size of buffer
+ * @offset: offset at which to write data
+ *
+ */
+static ssize_t
+v9fs_mmap_file_write(struct file *filp, const char __user *data,
+ size_t count, loff_t *offset)
+{
+ /*
+ * TODO: invalidate mmaps on filp's inode between
+ * offset and offset+count
+ */
+ return v9fs_file_write(filp, data, count, offset);
+}
+
+static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
+{
+ struct inode *inode;
+
+ struct writeback_control wbc = {
+ .nr_to_write = LONG_MAX,
+ .sync_mode = WB_SYNC_ALL,
+ .range_start = vma->vm_pgoff * PAGE_SIZE,
+ /* absolute end, byte at end included */
+ .range_end = vma->vm_pgoff * PAGE_SIZE +
+ (vma->vm_end - vma->vm_start - 1),
+ };
+
+
+ p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma);
+
+ inode = file_inode(vma->vm_file);
+
+ if (!mapping_cap_writeback_dirty(inode->i_mapping))
+ wbc.nr_to_write = 0;
+
+ might_sleep();
+ sync_inode(inode, &wbc);
}
+
static const struct vm_operations_struct v9fs_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = v9fs_vm_page_mkwrite,
+ .remap_pages = generic_file_remap_pages,
+};
+
+static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
+ .close = v9fs_mmap_vm_close,
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = v9fs_vm_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
@@ -741,8 +847,8 @@ const struct file_operations v9fs_cached_file_operations = {
.llseek = generic_file_llseek,
.read = v9fs_cached_file_read,
.write = v9fs_cached_file_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.open = v9fs_file_open,
.release = v9fs_dir_release,
.lock = v9fs_file_lock,
@@ -754,8 +860,8 @@ const struct file_operations v9fs_cached_file_operations_dotl = {
.llseek = generic_file_llseek,
.read = v9fs_cached_file_read,
.write = v9fs_cached_file_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.open = v9fs_file_open,
.release = v9fs_dir_release,
.lock = v9fs_file_lock_dotl,
@@ -786,3 +892,26 @@ const struct file_operations v9fs_file_operations_dotl = {
.mmap = generic_file_readonly_mmap,
.fsync = v9fs_file_fsync_dotl,
};
+
+const struct file_operations v9fs_mmap_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = v9fs_mmap_file_read,
+ .write = v9fs_mmap_file_write,
+ .open = v9fs_file_open,
+ .release = v9fs_dir_release,
+ .lock = v9fs_file_lock,
+ .mmap = v9fs_mmap_file_mmap,
+ .fsync = v9fs_file_fsync,
+};
+
+const struct file_operations v9fs_mmap_file_operations_dotl = {
+ .llseek = generic_file_llseek,
+ .read = v9fs_mmap_file_read,
+ .write = v9fs_mmap_file_write,
+ .open = v9fs_file_open,
+ .release = v9fs_dir_release,
+ .lock = v9fs_file_lock_dotl,
+ .flock = v9fs_file_flock_dotl,
+ .mmap = v9fs_mmap_file_mmap,
+ .fsync = v9fs_file_fsync_dotl,
+};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 4e65aa90334..7fa4f7a7653 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -147,7 +147,7 @@ static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses,
int major = -1, minor = -1;
strlcpy(ext, stat->extension, sizeof(ext));
- sscanf(ext, "%c %u %u", &type, &major, &minor);
+ sscanf(ext, "%c %i %i", &type, &major, &minor);
switch (type) {
case 'c':
res |= S_IFCHR;
@@ -299,15 +299,22 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
case S_IFREG:
if (v9fs_proto_dotl(v9ses)) {
inode->i_op = &v9fs_file_inode_operations_dotl;
- if (v9ses->cache)
+ if (v9ses->cache == CACHE_LOOSE ||
+ v9ses->cache == CACHE_FSCACHE)
inode->i_fop =
&v9fs_cached_file_operations_dotl;
+ else if (v9ses->cache == CACHE_MMAP)
+ inode->i_fop = &v9fs_mmap_file_operations_dotl;
else
inode->i_fop = &v9fs_file_operations_dotl;
} else {
inode->i_op = &v9fs_file_inode_operations;
- if (v9ses->cache)
- inode->i_fop = &v9fs_cached_file_operations;
+ if (v9ses->cache == CACHE_LOOSE ||
+ v9ses->cache == CACHE_FSCACHE)
+ inode->i_fop =
+ &v9fs_cached_file_operations;
+ else if (v9ses->cache == CACHE_MMAP)
+ inode->i_fop = &v9fs_mmap_file_operations;
else
inode->i_fop = &v9fs_file_operations;
}
@@ -444,7 +451,7 @@ void v9fs_evict_inode(struct inode *inode)
{
struct v9fs_inode *v9inode = V9FS_I(inode);
- truncate_inode_pages(inode->i_mapping, 0);
+ truncate_inode_pages_final(inode->i_mapping);
clear_inode(inode);
filemap_fdatawrite(inode->i_mapping);
@@ -573,7 +580,7 @@ static int v9fs_at_to_dotl_flags(int flags)
* v9fs_remove - helper function to remove files and directories
* @dir: directory inode that is being deleted
* @dentry: dentry that is being deleted
- * @rmdir: removing a directory
+ * @flags: removing a directory
*
*/
@@ -771,7 +778,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
* v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode
* @dir: inode that is being walked from
* @dentry: dentry that is being walked to?
- * @nameidata: path data
+ * @flags: lookup flags (unused)
*
*/
@@ -779,7 +786,6 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
struct dentry *res;
- struct super_block *sb;
struct v9fs_session_info *v9ses;
struct p9_fid *dfid, *fid;
struct inode *inode;
@@ -791,7 +797,6 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
- sb = dir->i_sb;
v9ses = v9fs_inode2v9ses(dir);
/* We can walk d_parent because we hold the dir->i_mutex */
dfid = v9fs_fid_lookup(dentry->d_parent);
@@ -812,7 +817,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
* unlink. For cached mode create calls request for new
* inode. But with cache disabled, lookup should do this.
*/
- if (v9ses->cache)
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
else
inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
@@ -863,7 +868,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
return finish_no_open(file, res);
err = 0;
- fid = NULL;
+
v9ses = v9fs_inode2v9ses(dir);
perm = unixmode2p9mode(v9ses, mode);
fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
@@ -878,7 +883,8 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
v9fs_invalidate_inode_attr(dir);
v9inode = V9FS_I(dentry->d_inode);
mutex_lock(&v9inode->v_mutex);
- if (v9ses->cache && !v9inode->writeback_fid &&
+ if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
+ !v9inode->writeback_fid &&
((flags & O_ACCMODE) != O_RDONLY)) {
/*
* clone a fid and add it to writeback_fid
@@ -901,7 +907,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry,
goto error;
file->private_data = fid;
- if (v9ses->cache)
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
v9fs_cache_inode_set_cookie(dentry->d_inode, file);
*opened |= FILE_CREATED;
@@ -1318,7 +1324,7 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
* v9fs_vfs_mkspecial - create a special file
* @dir: inode to create special file in
* @dentry: dentry to create
- * @mode: mode to create special file
+ * @perm: mode to create special file
* @extension: 9p2000.u format extension string representing special file
*
*/
@@ -1479,7 +1485,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
*/
i_size = inode->i_size;
v9fs_stat2inode(st, inode, inode->i_sb);
- if (v9ses->cache)
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
inode->i_size = i_size;
spin_unlock(&inode->i_lock);
out:
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 4c10edec26a..1fa85aae24d 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -226,7 +226,7 @@ int v9fs_open_to_dotl_flags(int flags)
* v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol.
* @dir: directory inode that is being created
* @dentry: dentry that is being deleted
- * @mode: create permissions
+ * @omode: create permissions
*
*/
@@ -330,7 +330,8 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
v9inode = V9FS_I(inode);
mutex_lock(&v9inode->v_mutex);
- if (v9ses->cache && !v9inode->writeback_fid &&
+ if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) &&
+ !v9inode->writeback_fid &&
((flags & O_ACCMODE) != O_RDONLY)) {
/*
* clone a fid and add it to writeback_fid
@@ -353,7 +354,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
if (err)
goto err_clunk_old_fid;
file->private_data = ofid;
- if (v9ses->cache)
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
v9fs_cache_inode_set_cookie(inode, file);
*opened |= FILE_CREATED;
out:
@@ -374,7 +375,7 @@ err_clunk_old_fid:
* v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory
* @dir: inode that is being unlinked
* @dentry: dentry that is being unlinked
- * @mode: mode for new directory
+ * @omode: mode for new directory
*
*/
@@ -473,13 +474,11 @@ static int
v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
- int err;
struct v9fs_session_info *v9ses;
struct p9_fid *fid;
struct p9_stat_dotl *st;
p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry);
- err = -EPERM;
v9ses = v9fs_dentry2v9ses(dentry);
if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
generic_fillattr(dentry->d_inode, stat);
@@ -556,7 +555,6 @@ static int v9fs_mapped_iattr_valid(int iattr_valid)
int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
{
int retval;
- struct v9fs_session_info *v9ses;
struct p9_fid *fid;
struct p9_iattr_dotl p9attr;
struct inode *inode = dentry->d_inode;
@@ -577,8 +575,6 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
p9attr.mtime_sec = iattr->ia_mtime.tv_sec;
p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
- retval = -EPERM;
- v9ses = v9fs_dentry2v9ses(dentry);
fid = v9fs_fid_lookup(dentry);
if (IS_ERR(fid))
return PTR_ERR(fid);
@@ -611,7 +607,6 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
* v9fs_stat2inode_dotl - populate an inode structure with stat info
* @stat: stat structure
* @inode: inode to populate
- * @sb: superblock of filesystem
*
*/
@@ -715,7 +710,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
}
v9fs_invalidate_inode_attr(dir);
- if (v9ses->cache) {
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
/* Now walk from the parent so we can get an unopened fid. */
fid = p9_client_walk(dfid, 1, &name, 1);
if (IS_ERR(fid)) {
@@ -768,7 +763,6 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
int err;
- char *name;
struct dentry *dir_dentry;
struct p9_fid *dfid, *oldfid;
struct v9fs_session_info *v9ses;
@@ -786,8 +780,6 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
if (IS_ERR(oldfid))
return PTR_ERR(oldfid);
- name = (char *) dentry->d_name.name;
-
err = p9_client_link(dfid, oldfid, (char *)dentry->d_name.name);
if (err < 0) {
@@ -815,7 +807,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
* v9fs_vfs_mknod_dotl - create a special file
* @dir: inode destination for new link
* @dentry: dentry for file
- * @mode: mode for creation
+ * @omode: mode for creation
* @rdev: device associated with special file
*
*/
@@ -973,7 +965,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
*/
i_size = inode->i_size;
v9fs_stat2inode_dotl(st, inode);
- if (v9ses->cache)
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
inode->i_size = i_size;
spin_unlock(&inode->i_lock);
out:
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 2756dcd5de6..0afd0382822 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -144,7 +144,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
}
v9fs_fill_super(sb, v9ses, flags, data);
- if (v9ses->cache)
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
sb->s_d_op = &v9fs_cached_dentry_operations;
else
sb->s_d_op = &v9fs_dentry_operations;
@@ -282,7 +282,7 @@ static int v9fs_drop_inode(struct inode *inode)
{
struct v9fs_session_info *v9ses;
v9ses = v9fs_inode2v9ses(inode);
- if (v9ses->cache)
+ if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE)
return generic_drop_inode(inode);
/*
* in case of non cached mode always drop the
@@ -325,10 +325,12 @@ static int v9fs_write_inode_dotl(struct inode *inode,
* send an fsync request to server irrespective of
* wbc->sync_mode.
*/
- p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
v9inode = V9FS_I(inode);
+ p9_debug(P9_DEBUG_VFS, "%s: inode %p, writeback_fid %p\n",
+ __func__, inode, v9inode->writeback_fid);
if (!v9inode->writeback_fid)
return 0;
+
ret = p9_client_fsync(v9inode->writeback_fid, 0);
if (ret < 0) {
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 3c28cdfb8c4..f95e01e058e 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -138,8 +138,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
if (retval < 0) {
p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n",
retval);
- p9_client_clunk(fid);
- return retval;
+ goto err;
}
msize = fid->clnt->msize;
while (value_len) {
@@ -152,12 +151,15 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name,
if (write_count < 0) {
/* error in xattr write */
retval = write_count;
- break;
+ goto err;
}
offset += write_count;
value_len -= write_count;
}
- return p9_client_clunk(fid);
+ retval = 0;
+err:
+ p9_client_clunk(fid);
+ return retval;
}
ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
diff --git a/fs/Kconfig b/fs/Kconfig
index c229f828eb0..312393f3294 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -68,10 +68,6 @@ source "fs/quota/Kconfig"
source "fs/autofs4/Kconfig"
source "fs/fuse/Kconfig"
-config GENERIC_ACL
- bool
- select FS_POSIX_ACL
-
menu "Caches"
source "fs/fscache/Kconfig"
@@ -100,6 +96,7 @@ endif # BLOCK
menu "Pseudo filesystems"
source "fs/proc/Kconfig"
+source "fs/kernfs/Kconfig"
source "fs/sysfs/Kconfig"
config TMPFS
@@ -119,7 +116,7 @@ config TMPFS_POSIX_ACL
bool "Tmpfs POSIX Access Control Lists"
depends on TMPFS
select TMPFS_XATTR
- select GENERIC_ACL
+ select FS_POSIX_ACL
help
POSIX Access Control Lists (ACLs) support additional access rights
for users and groups beyond the standard owner/group/world scheme,
diff --git a/fs/Makefile b/fs/Makefile
index 4fe6df3ec28..4030cbfbc9a 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -14,14 +14,13 @@ obj-y := open.o read_write.o file_table.o super.o \
stack.o fs_struct.o statfs.o
ifeq ($(CONFIG_BLOCK),y)
-obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
+obj-y += buffer.o block_dev.o direct-io.o mpage.o
else
obj-y += no-block.o
endif
obj-$(CONFIG_PROC_FS) += proc_namespace.o
-obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
obj-y += notify/
obj-$(CONFIG_EPOLL) += eventpoll.o
obj-$(CONFIG_ANON_INODES) += anon_inodes.o
@@ -42,9 +41,8 @@ obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o
obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o
obj-$(CONFIG_FS_MBCACHE) += mbcache.o
-obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o
+obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o
obj-$(CONFIG_NFS_COMMON) += nfs_common/
-obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
obj-$(CONFIG_COREDUMP) += coredump.o
obj-$(CONFIG_SYSCTL) += drop_caches.o
@@ -53,6 +51,7 @@ obj-$(CONFIG_FHANDLE) += fhandle.o
obj-y += quota/
obj-$(CONFIG_PROC_FS) += proc/
+obj-$(CONFIG_KERNFS) += kernfs/
obj-$(CONFIG_SYSFS) += sysfs/
obj-$(CONFIG_CONFIGFS_FS) += configfs/
obj-y += devpts/
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 585adafb0cc..c770337c4b4 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -43,9 +43,12 @@ struct adfs_dir_ops;
* ADFS file system superblock data in memory
*/
struct adfs_sb_info {
- struct adfs_discmap *s_map; /* bh list containing map */
- struct adfs_dir_ops *s_dir; /* directory operations */
-
+ union { struct {
+ struct adfs_discmap *s_map; /* bh list containing map */
+ struct adfs_dir_ops *s_dir; /* directory operations */
+ };
+ struct rcu_head rcu; /* used only at shutdown time */
+ };
kuid_t s_uid; /* owner uid */
kgid_t s_gid; /* owner gid */
umode_t s_owner_mask; /* ADFS owner perm -> unix perm */
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index a36da5382b4..07c9edce5aa 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -23,12 +23,12 @@
const struct file_operations adfs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
.mmap = generic_file_mmap,
.fsync = generic_file_fsync,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.splice_read = generic_file_splice_read,
};
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 0ff4bae2c2a..9852bdf34d7 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -123,8 +123,7 @@ static void adfs_put_super(struct super_block *sb)
for (i = 0; i < asb->s_map_size; i++)
brelse(asb->s_map[i].dm_bh);
kfree(asb->s_map);
- kfree(asb);
- sb->s_fs_info = NULL;
+ kfree_rcu(asb, rcu);
}
static int adfs_show_options(struct seq_file *seq, struct dentry *root)
@@ -213,6 +212,7 @@ static int parse_options(struct super_block *sb, char *options)
static int adfs_remount(struct super_block *sb, int *flags, char *data)
{
+ sync_filesystem(sb);
*flags |= MS_NODIRATIME;
return parse_options(sb, data);
}
@@ -266,7 +266,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
adfs_inode_cachep = kmem_cache_create("adfs_inode_cache",
sizeof(struct adfs_inode_info),
diff --git a/fs/affs/Changes b/fs/affs/Changes
index a29409c1ffe..b41c2c9792f 100644
--- a/fs/affs/Changes
+++ b/fs/affs/Changes
@@ -91,7 +91,7 @@ more 2.4 fixes: [Roman Zippel]
Version 3.11
------------
-- Converted to use 2.3.x page cache [Dave Jones <dave@powertweak.com>]
+- Converted to use 2.3.x page cache [Dave Jones]
- Corruption in truncate() bugfix [Ken Tyler <kent@werple.net.au>]
Version 3.10
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 3952121f2f2..9bca8815972 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -1,3 +1,9 @@
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
@@ -5,14 +11,6 @@
#include <linux/mutex.h>
#include <linux/workqueue.h>
-/* AmigaOS allows file names with up to 30 characters length.
- * Names longer than that will be silently truncated. If you
- * want to disallow this, comment out the following #define.
- * Creating filesystem objects with longer names will then
- * result in an error (ENAMETOOLONG).
- */
-/*#define AFFS_NO_TRUNCATE */
-
/* Ugly macros make the code more pretty. */
#define GET_END_PTR(st,p,sz) ((st *)((char *)(p)+((sz)-sizeof(st))))
@@ -28,7 +26,6 @@
#define AFFS_CACHE_SIZE PAGE_SIZE
-#define AFFS_MAX_PREALLOC 32
#define AFFS_LC_SIZE (AFFS_CACHE_SIZE/sizeof(u32)/2)
#define AFFS_AC_SIZE (AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2)
#define AFFS_AC_MASK (AFFS_AC_SIZE-1)
@@ -118,6 +115,7 @@ struct affs_sb_info {
#define SF_OFS 0x0200 /* Old filesystem */
#define SF_PREFIX 0x0400 /* Buffer for prefix is allocated */
#define SF_VERBOSE 0x0800 /* Talk about fs when mounting */
+#define SF_NO_TRUNCATE 0x1000 /* Don't truncate filenames */
/* short cut to get to the affs specific sb data */
static inline struct affs_sb_info *AFFS_SB(struct super_block *sb)
@@ -137,9 +135,13 @@ extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh);
extern void secs_to_datestamp(time_t secs, struct affs_date *ds);
extern umode_t prot_to_mode(u32 prot);
extern void mode_to_prot(struct inode *inode);
-extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...);
-extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...);
-extern int affs_check_name(const unsigned char *name, int len);
+extern void affs_error(struct super_block *sb, const char *function,
+ const char *fmt, ...);
+extern void affs_warning(struct super_block *sb, const char *function,
+ const char *fmt, ...);
+extern bool affs_nofilenametruncate(const struct dentry *dentry);
+extern int affs_check_name(const unsigned char *name, int len,
+ bool notruncate);
extern int affs_copy_name(unsigned char *bstr, struct dentry *dentry);
/* bitmap. c */
@@ -210,7 +212,7 @@ affs_set_blocksize(struct super_block *sb, int size)
static inline struct buffer_head *
affs_bread(struct super_block *sb, int block)
{
- pr_debug("affs_bread: %d\n", block);
+ pr_debug("%s: %d\n", __func__, block);
if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size)
return sb_bread(sb, block);
return NULL;
@@ -218,7 +220,7 @@ affs_bread(struct super_block *sb, int block)
static inline struct buffer_head *
affs_getblk(struct super_block *sb, int block)
{
- pr_debug("affs_getblk: %d\n", block);
+ pr_debug("%s: %d\n", __func__, block);
if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size)
return sb_getblk(sb, block);
return NULL;
@@ -227,7 +229,7 @@ static inline struct buffer_head *
affs_getzeroblk(struct super_block *sb, int block)
{
struct buffer_head *bh;
- pr_debug("affs_getzeroblk: %d\n", block);
+ pr_debug("%s: %d\n", __func__, block);
if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) {
bh = sb_getblk(sb, block);
lock_buffer(bh);
@@ -242,7 +244,7 @@ static inline struct buffer_head *
affs_getemptyblk(struct super_block *sb, int block)
{
struct buffer_head *bh;
- pr_debug("affs_getemptyblk: %d\n", block);
+ pr_debug("%s: %d\n", __func__, block);
if (block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size) {
bh = sb_getblk(sb, block);
wait_on_buffer(bh);
@@ -255,7 +257,7 @@ static inline void
affs_brelse(struct buffer_head *bh)
{
if (bh)
- pr_debug("affs_brelse: %lld\n", (long long) bh->b_blocknr);
+ pr_debug("%s: %lld\n", __func__, (long long) bh->b_blocknr);
brelse(bh);
}
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index d9a43674cb9..406b29836b1 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -34,7 +34,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh)
ino = bh->b_blocknr;
offset = affs_hash_name(sb, AFFS_TAIL(sb, bh)->name + 1, AFFS_TAIL(sb, bh)->name[0]);
- pr_debug("AFFS: insert_hash(dir=%u, ino=%d)\n", (u32)dir->i_ino, ino);
+ pr_debug("%s(dir=%u, ino=%d)\n", __func__, (u32)dir->i_ino, ino);
dir_bh = affs_bread(sb, dir->i_ino);
if (!dir_bh)
@@ -84,7 +84,8 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh)
sb = dir->i_sb;
rem_ino = rem_bh->b_blocknr;
offset = affs_hash_name(sb, AFFS_TAIL(sb, rem_bh)->name+1, AFFS_TAIL(sb, rem_bh)->name[0]);
- pr_debug("AFFS: remove_hash(dir=%d, ino=%d, hashval=%d)\n", (u32)dir->i_ino, rem_ino, offset);
+ pr_debug("%s(dir=%d, ino=%d, hashval=%d)\n",
+ __func__, (u32)dir->i_ino, rem_ino, offset);
bh = affs_bread(sb, dir->i_ino);
if (!bh)
@@ -147,7 +148,7 @@ affs_remove_link(struct dentry *dentry)
u32 link_ino, ino;
int retval;
- pr_debug("AFFS: remove_link(key=%ld)\n", inode->i_ino);
+ pr_debug("%s(key=%ld)\n", __func__, inode->i_ino);
retval = -EIO;
bh = affs_bread(sb, inode->i_ino);
if (!bh)
@@ -279,7 +280,7 @@ affs_remove_header(struct dentry *dentry)
if (!inode)
goto done;
- pr_debug("AFFS: remove_header(key=%ld)\n", inode->i_ino);
+ pr_debug("%s(key=%ld)\n", __func__, inode->i_ino);
retval = -EIO;
bh = affs_bread(sb, (u32)(long)dentry->d_fsdata);
if (!bh)
@@ -451,10 +452,10 @@ affs_error(struct super_block *sb, const char *function, const char *fmt, ...)
vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args);
va_end(args);
- printk(KERN_CRIT "AFFS error (device %s): %s(): %s\n", sb->s_id,
+ pr_crit("error (device %s): %s(): %s\n", sb->s_id,
function,ErrorBuffer);
if (!(sb->s_flags & MS_RDONLY))
- printk(KERN_WARNING "AFFS: Remounting filesystem read-only\n");
+ pr_warn("Remounting filesystem read-only\n");
sb->s_flags |= MS_RDONLY;
}
@@ -467,24 +468,31 @@ affs_warning(struct super_block *sb, const char *function, const char *fmt, ...)
vsnprintf(ErrorBuffer,sizeof(ErrorBuffer),fmt,args);
va_end(args);
- printk(KERN_WARNING "AFFS warning (device %s): %s(): %s\n", sb->s_id,
+ pr_warn("(device %s): %s(): %s\n", sb->s_id,
function,ErrorBuffer);
}
+bool
+affs_nofilenametruncate(const struct dentry *dentry)
+{
+ struct inode *inode = dentry->d_inode;
+ return AFFS_SB(inode->i_sb)->s_flags & SF_NO_TRUNCATE;
+
+}
+
/* Check if the name is valid for a affs object. */
int
-affs_check_name(const unsigned char *name, int len)
+affs_check_name(const unsigned char *name, int len, bool notruncate)
{
int i;
- if (len > 30)
-#ifdef AFFS_NO_TRUNCATE
- return -ENAMETOOLONG;
-#else
- len = 30;
-#endif
-
+ if (len > 30) {
+ if (notruncate)
+ return -ENAMETOOLONG;
+ else
+ len = 30;
+ }
for (i = 0; i < len; i++) {
if (name[i] < ' ' || name[i] == ':'
|| (name[i] > 0x7e && name[i] < 0xa0))
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index a32246b8359..c8de51185c2 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -17,7 +17,7 @@ affs_count_free_blocks(struct super_block *sb)
u32 free;
int i;
- pr_debug("AFFS: count_free_blocks()\n");
+ pr_debug("%s()\n", __func__);
if (sb->s_flags & MS_RDONLY)
return 0;
@@ -43,7 +43,7 @@ affs_free_block(struct super_block *sb, u32 block)
u32 blk, bmap, bit, mask, tmp;
__be32 *data;
- pr_debug("AFFS: free_block(%u)\n", block);
+ pr_debug("%s(%u)\n", __func__, block);
if (block > sbi->s_partition_size)
goto err_range;
@@ -125,7 +125,7 @@ affs_alloc_block(struct inode *inode, u32 goal)
sb = inode->i_sb;
sbi = AFFS_SB(sb);
- pr_debug("AFFS: balloc(inode=%lu,goal=%u): ", inode->i_ino, goal);
+ pr_debug("balloc(inode=%lu,goal=%u): ", inode->i_ino, goal);
if (AFFS_I(inode)->i_pa_cnt) {
pr_debug("%d\n", AFFS_I(inode)->i_lastalloc+1);
@@ -254,8 +254,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags)
return 0;
if (!AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag) {
- printk(KERN_NOTICE "AFFS: Bitmap invalid - mounting %s read only\n",
- sb->s_id);
+ pr_notice("Bitmap invalid - mounting %s read only\n", sb->s_id);
*flags |= MS_RDONLY;
return 0;
}
@@ -268,7 +267,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags)
size = sbi->s_bmap_count * sizeof(*bm);
bm = sbi->s_bitmap = kzalloc(size, GFP_KERNEL);
if (!sbi->s_bitmap) {
- printk(KERN_ERR "AFFS: Bitmap allocation failed\n");
+ pr_err("Bitmap allocation failed\n");
return -ENOMEM;
}
@@ -282,17 +281,17 @@ int affs_init_bitmap(struct super_block *sb, int *flags)
bm->bm_key = be32_to_cpu(bmap_blk[blk]);
bh = affs_bread(sb, bm->bm_key);
if (!bh) {
- printk(KERN_ERR "AFFS: Cannot read bitmap\n");
+ pr_err("Cannot read bitmap\n");
res = -EIO;
goto out;
}
if (affs_checksum_block(sb, bh)) {
- printk(KERN_WARNING "AFFS: Bitmap %u invalid - mounting %s read only.\n",
- bm->bm_key, sb->s_id);
+ pr_warn("Bitmap %u invalid - mounting %s read only.\n",
+ bm->bm_key, sb->s_id);
*flags |= MS_RDONLY;
goto out;
}
- pr_debug("AFFS: read bitmap block %d: %d\n", blk, bm->bm_key);
+ pr_debug("read bitmap block %d: %d\n", blk, bm->bm_key);
bm->bm_free = memweight(bh->b_data + 4, sb->s_blocksize - 4);
/* Don't try read the extension if this is the last block,
@@ -304,7 +303,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags)
affs_brelse(bmap_bh);
bmap_bh = affs_bread(sb, be32_to_cpu(bmap_blk[blk]));
if (!bmap_bh) {
- printk(KERN_ERR "AFFS: Cannot read bitmap extension\n");
+ pr_err("Cannot read bitmap extension\n");
res = -EIO;
goto out;
}
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index f1eba8c3644..59f07bec92a 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -52,8 +52,10 @@ affs_readdir(struct file *file, struct dir_context *ctx)
int hash_pos;
int chain_pos;
u32 ino;
+ int error = 0;
- pr_debug("AFFS: readdir(ino=%lu,f_pos=%lx)\n",inode->i_ino,(unsigned long)ctx->pos);
+ pr_debug("%s(ino=%lu,f_pos=%lx)\n",
+ __func__, inode->i_ino, (unsigned long)ctx->pos);
if (ctx->pos < 2) {
file->private_data = (void *)0;
@@ -72,14 +74,14 @@ affs_readdir(struct file *file, struct dir_context *ctx)
}
dir_bh = affs_bread(sb, inode->i_ino);
if (!dir_bh)
- goto readdir_out;
+ goto out_unlock_dir;
/* If the directory hasn't changed since the last call to readdir(),
* we can jump directly to where we left off.
*/
ino = (u32)(long)file->private_data;
if (ino && file->f_version == inode->i_version) {
- pr_debug("AFFS: readdir() left off=%d\n", ino);
+ pr_debug("readdir() left off=%d\n", ino);
goto inside;
}
@@ -88,7 +90,8 @@ affs_readdir(struct file *file, struct dir_context *ctx)
fh_bh = affs_bread(sb, ino);
if (!fh_bh) {
affs_error(sb, "readdir","Cannot read block %d", i);
- return -EIO;
+ error = -EIO;
+ goto out_brelse_dir;
}
ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
affs_brelse(fh_bh);
@@ -107,29 +110,34 @@ inside:
do {
fh_bh = affs_bread(sb, ino);
if (!fh_bh) {
- affs_error(sb, "readdir","Cannot read block %d", ino);
+ affs_error(sb, "readdir",
+ "Cannot read block %d", ino);
break;
}
namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30);
name = AFFS_TAIL(sb, fh_bh)->name + 1;
- pr_debug("AFFS: readdir(): filldir(\"%.*s\", ino=%u), hash=%d, f_pos=%x\n",
+ pr_debug("readdir(): dir_emit(\"%.*s\", "
+ "ino=%u), hash=%d, f_pos=%x\n",
namelen, name, ino, hash_pos, (u32)ctx->pos);
+
if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN))
- goto readdir_done;
+ goto done;
ctx->pos++;
ino = be32_to_cpu(AFFS_TAIL(sb, fh_bh)->hash_chain);
affs_brelse(fh_bh);
fh_bh = NULL;
} while (ino);
}
-readdir_done:
+done:
file->f_version = inode->i_version;
file->private_data = (void *)(long)ino;
+ affs_brelse(fh_bh);
-readdir_out:
+out_brelse_dir:
affs_brelse(dir_bh);
- affs_brelse(fh_bh);
+
+out_unlock_dir:
affs_unlock_dir(inode);
- return 0;
+ return error;
}
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 8669b6ecdde..a7fe57d2cd9 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -27,10 +27,10 @@ static int affs_file_release(struct inode *inode, struct file *filp);
const struct file_operations affs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.open = affs_file_open,
.release = affs_file_release,
@@ -45,7 +45,7 @@ const struct inode_operations affs_file_inode_operations = {
static int
affs_file_open(struct inode *inode, struct file *filp)
{
- pr_debug("AFFS: open(%lu,%d)\n",
+ pr_debug("open(%lu,%d)\n",
inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
atomic_inc(&AFFS_I(inode)->i_opencnt);
return 0;
@@ -54,7 +54,7 @@ affs_file_open(struct inode *inode, struct file *filp)
static int
affs_file_release(struct inode *inode, struct file *filp)
{
- pr_debug("AFFS: release(%lu, %d)\n",
+ pr_debug("release(%lu, %d)\n",
inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) {
@@ -324,7 +324,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
struct buffer_head *ext_bh;
u32 ext;
- pr_debug("AFFS: get_block(%u, %lu)\n", (u32)inode->i_ino, (unsigned long)block);
+ pr_debug("%s(%u, %lu)\n",
+ __func__, (u32)inode->i_ino, (unsigned long)block);
BUG_ON(block > (sector_t)0x7fffffffUL);
@@ -498,34 +499,36 @@ affs_getemptyblk_ino(struct inode *inode, int block)
}
static int
-affs_do_readpage_ofs(struct file *file, struct page *page, unsigned from, unsigned to)
+affs_do_readpage_ofs(struct page *page, unsigned to)
{
struct inode *inode = page->mapping->host;
struct super_block *sb = inode->i_sb;
struct buffer_head *bh;
char *data;
+ unsigned pos = 0;
u32 bidx, boff, bsize;
u32 tmp;
- pr_debug("AFFS: read_page(%u, %ld, %d, %d)\n", (u32)inode->i_ino, page->index, from, to);
- BUG_ON(from > to || to > PAGE_CACHE_SIZE);
+ pr_debug("%s(%u, %ld, 0, %d)\n", __func__, (u32)inode->i_ino,
+ page->index, to);
+ BUG_ON(to > PAGE_CACHE_SIZE);
kmap(page);
data = page_address(page);
bsize = AFFS_SB(sb)->s_data_blksize;
- tmp = (page->index << PAGE_CACHE_SHIFT) + from;
+ tmp = page->index << PAGE_CACHE_SHIFT;
bidx = tmp / bsize;
boff = tmp % bsize;
- while (from < to) {
+ while (pos < to) {
bh = affs_bread_ino(inode, bidx, 0);
if (IS_ERR(bh))
return PTR_ERR(bh);
- tmp = min(bsize - boff, to - from);
- BUG_ON(from + tmp > to || tmp > bsize);
- memcpy(data + from, AFFS_DATA(bh) + boff, tmp);
+ tmp = min(bsize - boff, to - pos);
+ BUG_ON(pos + tmp > to || tmp > bsize);
+ memcpy(data + pos, AFFS_DATA(bh) + boff, tmp);
affs_brelse(bh);
bidx++;
- from += tmp;
+ pos += tmp;
boff = 0;
}
flush_dcache_page(page);
@@ -542,7 +545,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize)
u32 size, bsize;
u32 tmp;
- pr_debug("AFFS: extent_file(%u, %d)\n", (u32)inode->i_ino, newsize);
+ pr_debug("%s(%u, %d)\n", __func__, (u32)inode->i_ino, newsize);
bsize = AFFS_SB(sb)->s_data_blksize;
bh = NULL;
size = AFFS_I(inode)->mmu_private;
@@ -608,14 +611,14 @@ affs_readpage_ofs(struct file *file, struct page *page)
u32 to;
int err;
- pr_debug("AFFS: read_page(%u, %ld)\n", (u32)inode->i_ino, page->index);
+ pr_debug("%s(%u, %ld)\n", __func__, (u32)inode->i_ino, page->index);
to = PAGE_CACHE_SIZE;
if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) {
to = inode->i_size & ~PAGE_CACHE_MASK;
memset(page_address(page) + to, 0, PAGE_CACHE_SIZE - to);
}
- err = affs_do_readpage_ofs(file, page, 0, to);
+ err = affs_do_readpage_ofs(page, to);
if (!err)
SetPageUptodate(page);
unlock_page(page);
@@ -631,7 +634,8 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
pgoff_t index;
int err = 0;
- pr_debug("AFFS: write_begin(%u, %llu, %llu)\n", (u32)inode->i_ino, (unsigned long long)pos, (unsigned long long)pos + len);
+ pr_debug("%s(%u, %llu, %llu)\n", __func__, (u32)inode->i_ino,
+ (unsigned long long)pos, (unsigned long long)pos + len);
if (pos > AFFS_I(inode)->mmu_private) {
/* XXX: this probably leaves a too-big i_size in case of
* failure. Should really be updating i_size at write_end time
@@ -651,7 +655,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
return 0;
/* XXX: inefficient but safe in the face of short writes */
- err = affs_do_readpage_ofs(file, page, 0, PAGE_CACHE_SIZE);
+ err = affs_do_readpage_ofs(page, PAGE_CACHE_SIZE);
if (err) {
unlock_page(page);
page_cache_release(page);
@@ -680,7 +684,9 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
* due to write_begin.
*/
- pr_debug("AFFS: write_begin(%u, %llu, %llu)\n", (u32)inode->i_ino, (unsigned long long)pos, (unsigned long long)pos + len);
+ pr_debug("%s(%u, %llu, %llu)\n",
+ __func__, (u32)inode->i_ino, (unsigned long long)pos,
+ (unsigned long long)pos + len);
bsize = AFFS_SB(sb)->s_data_blksize;
data = page_address(page);
@@ -802,7 +808,7 @@ affs_free_prealloc(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
- pr_debug("AFFS: free_prealloc(ino=%lu)\n", inode->i_ino);
+ pr_debug("free_prealloc(ino=%lu)\n", inode->i_ino);
while (AFFS_I(inode)->i_pa_cnt) {
AFFS_I(inode)->i_pa_cnt--;
@@ -822,7 +828,7 @@ affs_truncate(struct inode *inode)
struct buffer_head *ext_bh;
int i;
- pr_debug("AFFS: truncate(inode=%d, oldsize=%u, newsize=%u)\n",
+ pr_debug("truncate(inode=%d, oldsize=%u, newsize=%u)\n",
(u32)inode->i_ino, (u32)AFFS_I(inode)->mmu_private, (u32)inode->i_size);
last_blk = 0;
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 0e092d08680..bec2d1a0c91 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -34,7 +34,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
if (!(inode->i_state & I_NEW))
return inode;
- pr_debug("AFFS: affs_iget(%lu)\n", inode->i_ino);
+ pr_debug("affs_iget(%lu)\n", inode->i_ino);
block = inode->i_ino;
bh = affs_bread(sb, block);
@@ -175,7 +175,7 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc)
uid_t uid;
gid_t gid;
- pr_debug("AFFS: write_inode(%lu)\n",inode->i_ino);
+ pr_debug("write_inode(%lu)\n", inode->i_ino);
if (!inode->i_nlink)
// possibly free block
@@ -220,7 +220,7 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr)
struct inode *inode = dentry->d_inode;
int error;
- pr_debug("AFFS: notify_change(%lu,0x%x)\n",inode->i_ino,attr->ia_valid);
+ pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid);
error = inode_change_ok(inode,attr);
if (error)
@@ -258,8 +258,9 @@ void
affs_evict_inode(struct inode *inode)
{
unsigned long cache_page;
- pr_debug("AFFS: evict_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
- truncate_inode_pages(&inode->i_data, 0);
+ pr_debug("evict_inode(ino=%lu, nlink=%u)\n",
+ inode->i_ino, inode->i_nlink);
+ truncate_inode_pages_final(&inode->i_data);
if (!inode->i_nlink) {
inode->i_size = 0;
@@ -271,7 +272,7 @@ affs_evict_inode(struct inode *inode)
affs_free_prealloc(inode);
cache_page = (unsigned long)AFFS_I(inode)->i_lc;
if (cache_page) {
- pr_debug("AFFS: freeing ext cache\n");
+ pr_debug("freeing ext cache\n");
AFFS_I(inode)->i_lc = NULL;
AFFS_I(inode)->i_ac = NULL;
free_page(cache_page);
@@ -350,7 +351,8 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
u32 block = 0;
int retval;
- pr_debug("AFFS: add_entry(dir=%u, inode=%u, \"%*s\", type=%d)\n", (u32)dir->i_ino,
+ pr_debug("%s(dir=%u, inode=%u, \"%*s\", type=%d)\n",
+ __func__, (u32)dir->i_ino,
(u32)inode->i_ino, (int)dentry->d_name.len, dentry->d_name.name, type);
retval = -EIO;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index c36cbb4537a..035bd31556f 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -60,13 +60,13 @@ affs_get_toupper(struct super_block *sb)
* Note: the dentry argument is the parent dentry.
*/
static inline int
-__affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
+__affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate)
{
const u8 *name = qstr->name;
unsigned long hash;
int i;
- i = affs_check_name(qstr->name, qstr->len);
+ i = affs_check_name(qstr->name, qstr->len, notruncate);
if (i)
return i;
@@ -82,16 +82,22 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper)
static int
affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
{
- return __affs_hash_dentry(qstr, affs_toupper);
+ return __affs_hash_dentry(qstr, affs_toupper,
+ affs_nofilenametruncate(dentry));
+
}
+
static int
affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
{
- return __affs_hash_dentry(qstr, affs_intl_toupper);
+ return __affs_hash_dentry(qstr, affs_intl_toupper,
+ affs_nofilenametruncate(dentry));
+
}
static inline int __affs_compare_dentry(unsigned int len,
- const char *str, const struct qstr *name, toupper_t toupper)
+ const char *str, const struct qstr *name, toupper_t toupper,
+ bool notruncate)
{
const u8 *aname = str;
const u8 *bname = name->name;
@@ -101,7 +107,7 @@ static inline int __affs_compare_dentry(unsigned int len,
* must be valid. 'name' must be validated first.
*/
- if (affs_check_name(name->name, name->len))
+ if (affs_check_name(name->name, name->len, notruncate))
return 1;
/*
@@ -126,13 +132,18 @@ static int
affs_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
- return __affs_compare_dentry(len, str, name, affs_toupper);
+
+ return __affs_compare_dentry(len, str, name, affs_toupper,
+ affs_nofilenametruncate(parent));
}
+
static int
affs_intl_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
unsigned int len, const char *str, const struct qstr *name)
{
- return __affs_compare_dentry(len, str, name, affs_intl_toupper);
+ return __affs_compare_dentry(len, str, name, affs_intl_toupper,
+ affs_nofilenametruncate(parent));
+
}
/*
@@ -179,7 +190,8 @@ affs_find_entry(struct inode *dir, struct dentry *dentry)
toupper_t toupper = affs_get_toupper(sb);
u32 key;
- pr_debug("AFFS: find_entry(\"%.*s\")\n", (int)dentry->d_name.len, dentry->d_name.name);
+ pr_debug("%s(\"%.*s\")\n",
+ __func__, (int)dentry->d_name.len, dentry->d_name.name);
bh = affs_bread(sb, dir->i_ino);
if (!bh)
@@ -207,7 +219,8 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
struct buffer_head *bh;
struct inode *inode = NULL;
- pr_debug("AFFS: lookup(\"%.*s\")\n",(int)dentry->d_name.len,dentry->d_name.name);
+ pr_debug("%s(\"%.*s\")\n",
+ __func__, (int)dentry->d_name.len, dentry->d_name.name);
affs_lock_dir(dir);
bh = affs_find_entry(dir, dentry);
@@ -237,9 +250,9 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
int
affs_unlink(struct inode *dir, struct dentry *dentry)
{
- pr_debug("AFFS: unlink(dir=%d, %lu \"%.*s\")\n", (u32)dir->i_ino,
- dentry->d_inode->i_ino,
- (int)dentry->d_name.len, dentry->d_name.name);
+ pr_debug("%s(dir=%d, %lu \"%.*s\")\n",
+ __func__, (u32)dir->i_ino, dentry->d_inode->i_ino,
+ (int)dentry->d_name.len, dentry->d_name.name);
return affs_remove_header(dentry);
}
@@ -251,7 +264,8 @@ affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
struct inode *inode;
int error;
- pr_debug("AFFS: create(%lu,\"%.*s\",0%ho)\n",dir->i_ino,(int)dentry->d_name.len,
+ pr_debug("%s(%lu,\"%.*s\",0%ho)\n",
+ __func__, dir->i_ino, (int)dentry->d_name.len,
dentry->d_name.name,mode);
inode = affs_new_inode(dir);
@@ -280,8 +294,9 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct inode *inode;
int error;
- pr_debug("AFFS: mkdir(%lu,\"%.*s\",0%ho)\n",dir->i_ino,
- (int)dentry->d_name.len,dentry->d_name.name,mode);
+ pr_debug("%s(%lu,\"%.*s\",0%ho)\n",
+ __func__, dir->i_ino, (int)dentry->d_name.len,
+ dentry->d_name.name, mode);
inode = affs_new_inode(dir);
if (!inode)
@@ -306,8 +321,8 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
int
affs_rmdir(struct inode *dir, struct dentry *dentry)
{
- pr_debug("AFFS: rmdir(dir=%u, %lu \"%.*s\")\n", (u32)dir->i_ino,
- dentry->d_inode->i_ino,
+ pr_debug("%s(dir=%u, %lu \"%.*s\")\n",
+ __func__, (u32)dir->i_ino, dentry->d_inode->i_ino,
(int)dentry->d_name.len, dentry->d_name.name);
return affs_remove_header(dentry);
@@ -323,8 +338,9 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
int i, maxlen, error;
char c, lc;
- pr_debug("AFFS: symlink(%lu,\"%.*s\" -> \"%s\")\n",dir->i_ino,
- (int)dentry->d_name.len,dentry->d_name.name,symname);
+ pr_debug("%s(%lu,\"%.*s\" -> \"%s\")\n",
+ __func__, dir->i_ino, (int)dentry->d_name.len,
+ dentry->d_name.name, symname);
maxlen = AFFS_SB(sb)->s_hashsize * sizeof(u32) - 1;
inode = affs_new_inode(dir);
@@ -393,7 +409,8 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
struct inode *inode = old_dentry->d_inode;
- pr_debug("AFFS: link(%u, %u, \"%.*s\")\n", (u32)inode->i_ino, (u32)dir->i_ino,
+ pr_debug("%s(%u, %u, \"%.*s\")\n",
+ __func__, (u32)inode->i_ino, (u32)dir->i_ino,
(int)dentry->d_name.len,dentry->d_name.name);
return affs_add_entry(dir, inode, dentry, ST_LINKFILE);
@@ -407,11 +424,15 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct buffer_head *bh = NULL;
int retval;
- pr_debug("AFFS: rename(old=%u,\"%*s\" to new=%u,\"%*s\")\n",
- (u32)old_dir->i_ino, (int)old_dentry->d_name.len, old_dentry->d_name.name,
- (u32)new_dir->i_ino, (int)new_dentry->d_name.len, new_dentry->d_name.name);
+ pr_debug("%s(old=%u,\"%*s\" to new=%u,\"%*s\")\n",
+ __func__, (u32)old_dir->i_ino, (int)old_dentry->d_name.len,
+ old_dentry->d_name.name, (u32)new_dir->i_ino,
+ (int)new_dentry->d_name.len, new_dentry->d_name.name);
+
+ retval = affs_check_name(new_dentry->d_name.name,
+ new_dentry->d_name.len,
+ affs_nofilenametruncate(old_dentry));
- retval = affs_check_name(new_dentry->d_name.name,new_dentry->d_name.len);
if (retval)
return retval;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 45161a832bb..51f1a95bff7 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -46,14 +46,9 @@ static void
affs_put_super(struct super_block *sb)
{
struct affs_sb_info *sbi = AFFS_SB(sb);
- pr_debug("AFFS: put_super()\n");
+ pr_debug("%s()\n", __func__);
cancel_delayed_work_sync(&sbi->sb_work);
- kfree(sbi->s_prefix);
- affs_free_bitmap(sb);
- affs_brelse(sbi->s_root_bh);
- kfree(sbi);
- sb->s_fs_info = NULL;
}
static int
@@ -133,7 +128,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
affs_inode_cachep = kmem_cache_create("affs_inode_cache",
sizeof(struct affs_inode_info),
@@ -168,7 +163,7 @@ static const struct super_operations affs_sops = {
};
enum {
- Opt_bs, Opt_mode, Opt_mufs, Opt_prefix, Opt_protect,
+ Opt_bs, Opt_mode, Opt_mufs, Opt_notruncate, Opt_prefix, Opt_protect,
Opt_reserved, Opt_root, Opt_setgid, Opt_setuid,
Opt_verbose, Opt_volume, Opt_ignore, Opt_err,
};
@@ -177,6 +172,7 @@ static const match_table_t tokens = {
{Opt_bs, "bs=%u"},
{Opt_mode, "mode=%o"},
{Opt_mufs, "mufs"},
+ {Opt_notruncate, "nofilenametruncate"},
{Opt_prefix, "prefix=%s"},
{Opt_protect, "protect"},
{Opt_reserved, "reserved=%u"},
@@ -224,7 +220,7 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
return 0;
if (n != 512 && n != 1024 && n != 2048
&& n != 4096) {
- printk ("AFFS: Invalid blocksize (512, 1024, 2048, 4096 allowed)\n");
+ pr_warn("Invalid blocksize (512, 1024, 2048, 4096 allowed)\n");
return 0;
}
*blocksize = n;
@@ -238,6 +234,9 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
case Opt_mufs:
*mount_opts |= SF_MUFS;
break;
+ case Opt_notruncate:
+ *mount_opts |= SF_NO_TRUNCATE;
+ break;
case Opt_prefix:
*prefix = match_strdup(&args[0]);
if (!*prefix)
@@ -286,8 +285,8 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved,
/* Silently ignore the quota options */
break;
default:
- printk("AFFS: Unrecognized mount option \"%s\" "
- "or missing value\n", p);
+ pr_warn("Unrecognized mount option \"%s\" or missing value\n",
+ p);
return 0;
}
}
@@ -316,11 +315,11 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
unsigned long mount_flags;
int tmp_flags; /* fix remount prototype... */
u8 sig[4];
- int ret = -EINVAL;
+ int ret;
save_mount_options(sb, data);
- pr_debug("AFFS: read_super(%s)\n",data ? (const char *)data : "no options");
+ pr_debug("read_super(%s)\n", data ? (const char *)data : "no options");
sb->s_magic = AFFS_SUPER_MAGIC;
sb->s_op = &affs_sops;
@@ -340,9 +339,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block,
&blocksize,&sbi->s_prefix,
sbi->s_volume, &mount_flags)) {
- printk(KERN_ERR "AFFS: Error parsing options\n");
- kfree(sbi->s_prefix);
- kfree(sbi);
+ pr_err("Error parsing options\n");
return -EINVAL;
}
/* N.B. after this point s_prefix must be released */
@@ -359,7 +356,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
*/
size = sb->s_bdev->bd_inode->i_size >> 9;
- pr_debug("AFFS: initial blocksize=%d, #blocks=%d\n", 512, size);
+ pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size);
affs_set_blocksize(sb, PAGE_SIZE);
/* Try to find root block. Its location depends on the block size. */
@@ -374,7 +371,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_root_block = root_block;
if (root_block < 0)
sbi->s_root_block = (reserved + size - 1) / 2;
- pr_debug("AFFS: setting blocksize to %d\n", blocksize);
+ pr_debug("setting blocksize to %d\n", blocksize);
affs_set_blocksize(sb, blocksize);
sbi->s_partition_size = size;
@@ -389,7 +386,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
* block behind the calculated one. So we check this one, too.
*/
for (num_bm = 0; num_bm < 2; num_bm++) {
- pr_debug("AFFS: Dev %s, trying root=%u, bs=%d, "
+ pr_debug("Dev %s, trying root=%u, bs=%d, "
"size=%d, reserved=%d\n",
sb->s_id,
sbi->s_root_block + num_bm,
@@ -410,19 +407,20 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
}
}
if (!silent)
- printk(KERN_ERR "AFFS: No valid root block on device %s\n",
- sb->s_id);
- goto out_error;
+ pr_err("No valid root block on device %s\n", sb->s_id);
+ return -EINVAL;
/* N.B. after this point bh must be released */
got_root:
+ /* Keep super block in cache */
+ sbi->s_root_bh = root_bh;
root_block = sbi->s_root_block;
/* Find out which kind of FS we have */
boot_bh = sb_bread(sb, 0);
if (!boot_bh) {
- printk(KERN_ERR "AFFS: Cannot read boot block\n");
- goto out_error;
+ pr_err("Cannot read boot block\n");
+ return -EINVAL;
}
memcpy(sig, boot_bh->b_data, 4);
brelse(boot_bh);
@@ -434,8 +432,7 @@ got_root:
*/
if ((chksum == FS_DCFFS || chksum == MUFS_DCFFS || chksum == FS_DCOFS
|| chksum == MUFS_DCOFS) && !(sb->s_flags & MS_RDONLY)) {
- printk(KERN_NOTICE "AFFS: Dircache FS - mounting %s read only\n",
- sb->s_id);
+ pr_notice("Dircache FS - mounting %s read only\n", sb->s_id);
sb->s_flags |= MS_RDONLY;
}
switch (chksum) {
@@ -469,14 +466,14 @@ got_root:
sb->s_flags |= MS_NOEXEC;
break;
default:
- printk(KERN_ERR "AFFS: Unknown filesystem on device %s: %08X\n",
- sb->s_id, chksum);
- goto out_error;
+ pr_err("Unknown filesystem on device %s: %08X\n",
+ sb->s_id, chksum);
+ return -EINVAL;
}
if (mount_flags & SF_VERBOSE) {
u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0];
- printk(KERN_NOTICE "AFFS: Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n",
+ pr_notice("Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n",
len > 31 ? 31 : len,
AFFS_ROOT_TAIL(sb, root_bh)->disk_name + 1,
sig, sig[3] + '0', blocksize);
@@ -488,22 +485,17 @@ got_root:
if (sbi->s_flags & SF_OFS)
sbi->s_data_blksize -= 24;
- /* Keep super block in cache */
- sbi->s_root_bh = root_bh;
- /* N.B. after this point s_root_bh must be released */
-
tmp_flags = sb->s_flags;
- if (affs_init_bitmap(sb, &tmp_flags))
- goto out_error;
+ ret = affs_init_bitmap(sb, &tmp_flags);
+ if (ret)
+ return ret;
sb->s_flags = tmp_flags;
/* set up enough so that it can read an inode */
root_inode = affs_iget(sb, root_block);
- if (IS_ERR(root_inode)) {
- ret = PTR_ERR(root_inode);
- goto out_error;
- }
+ if (IS_ERR(root_inode))
+ return PTR_ERR(root_inode);
if (AFFS_SB(sb)->s_flags & SF_INTL)
sb->s_d_op = &affs_intl_dentry_operations;
@@ -512,23 +504,12 @@ got_root:
sb->s_root = d_make_root(root_inode);
if (!sb->s_root) {
- printk(KERN_ERR "AFFS: Get root inode failed\n");
- goto out_error;
+ pr_err("AFFS: Get root inode failed\n");
+ return -ENOMEM;
}
- pr_debug("AFFS: s_flags=%lX\n",sb->s_flags);
+ pr_debug("s_flags=%lX\n", sb->s_flags);
return 0;
-
- /*
- * Begin the cascaded cleanup ...
- */
-out_error:
- kfree(sbi->s_bitmap);
- affs_brelse(root_bh);
- kfree(sbi->s_prefix);
- kfree(sbi);
- sb->s_fs_info = NULL;
- return ret;
}
static int
@@ -547,8 +528,9 @@ affs_remount(struct super_block *sb, int *flags, char *data)
char volume[32];
char *prefix = NULL;
- pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data);
+ pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data);
+ sync_filesystem(sb);
*flags |= MS_NODIRATIME;
memcpy(volume, sbi->s_volume, 32);
@@ -594,8 +576,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
int free;
u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
- pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size,
- AFFS_SB(sb)->s_reserved);
+ pr_debug("%s() partsize=%d, reserved=%d\n",
+ __func__, AFFS_SB(sb)->s_partition_size,
+ AFFS_SB(sb)->s_reserved);
free = affs_count_free_blocks(sb);
buf->f_type = AFFS_SUPER_MAGIC;
@@ -615,11 +598,23 @@ static struct dentry *affs_mount(struct file_system_type *fs_type,
return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super);
}
+static void affs_kill_sb(struct super_block *sb)
+{
+ struct affs_sb_info *sbi = AFFS_SB(sb);
+ kill_block_super(sb);
+ if (sbi) {
+ affs_free_bitmap(sb);
+ affs_brelse(sbi->s_root_bh);
+ kfree(sbi->s_prefix);
+ kfree(sbi);
+ }
+}
+
static struct file_system_type affs_fs_type = {
.owner = THIS_MODULE,
.name = "affs",
.mount = affs_mount,
- .kill_sb = kill_block_super,
+ .kill_sb = affs_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("affs");
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index ee00f08c4f5..f39b71c3981 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -21,7 +21,7 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
char c;
char lc;
- pr_debug("AFFS: follow_link(ino=%lu)\n",inode->i_ino);
+ pr_debug("follow_link(ino=%lu)\n", inode->i_ino);
err = -EIO;
bh = affs_bread(inode->i_sb, inode->i_ino);
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 3c090b7555e..ca0a3cf9379 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -179,7 +179,7 @@ struct afs_cell *afs_cell_create(const char *name, unsigned namesz,
/* put it up for caching (this never returns an error) */
cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index,
&afs_cell_cache_index_def,
- cell);
+ cell, true);
#endif
/* add to the cell lists */
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 1c8c6cc6de3..4b0eff6da67 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -130,6 +130,15 @@ static void afs_cm_destructor(struct afs_call *call)
{
_enter("");
+ /* Break the callbacks here so that we do it after the final ACK is
+ * received. The step number here must match the final number in
+ * afs_deliver_cb_callback().
+ */
+ if (call->unmarshall == 6) {
+ ASSERT(call->server && call->count && call->request);
+ afs_break_callbacks(call->server, call->count, call->request);
+ }
+
afs_put_server(call->server);
call->server = NULL;
kfree(call->buffer);
@@ -272,6 +281,16 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
_debug("trailer");
if (skb->len != 0)
return -EBADMSG;
+
+ /* Record that the message was unmarshalled successfully so
+ * that the call destructor can know do the callback breaking
+ * work, even if the final ACK isn't received.
+ *
+ * If the step number changes, then afs_cm_destructor() must be
+ * updated also.
+ */
+ call->unmarshall++;
+ case 6:
break;
}
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 66d50fe2ee4..932ce07948b 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -31,10 +31,10 @@ const struct file_operations afs_file_operations = {
.open = afs_open,
.release = afs_release,
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = afs_file_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = afs_file_write,
.mmap = generic_file_readonly_mmap,
.splice_read = generic_file_splice_read,
.fsync = afs_fsync,
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index a8cf2cff836..4baf1d2b39e 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -555,10 +555,6 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl)
return -ENOLCK;
/* we're simulating flock() locks using posix locks on the server */
- fl->fl_owner = (fl_owner_t) file;
- fl->fl_start = 0;
- fl->fl_end = OFFSET_MAX;
-
if (fl->fl_type == F_UNLCK)
return afs_do_unlk(file, fl);
return afs_do_setlk(file, fl);
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 789bc253b5f..29467128844 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -259,7 +259,7 @@ struct inode *afs_iget(struct super_block *sb, struct key *key,
#ifdef CONFIG_AFS_FSCACHE
vnode->cache = fscache_acquire_cookie(vnode->volume->cache,
&afs_vnode_cache_index_def,
- vnode);
+ vnode, true);
#endif
ret = afs_inode_map_status(vnode, key);
@@ -422,7 +422,7 @@ void afs_evict_inode(struct inode *inode)
ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
afs_give_up_callback(vnode);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a306bb6d88d..71d5982312f 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -75,6 +75,7 @@ struct afs_call {
const struct afs_call_type *type; /* type of call */
const struct afs_wait_mode *wait_mode; /* completion wait mode */
wait_queue_head_t waitq; /* processes awaiting completion */
+ void (*async_workfn)(struct afs_call *call); /* asynchronous work function */
struct work_struct async_work; /* asynchronous work processor */
struct work_struct work; /* actual work processor */
struct sk_buff_head rx_queue; /* received packets */
@@ -195,7 +196,6 @@ struct afs_cell {
struct list_head link; /* main cell list link */
struct key *anonymous_key; /* anonymous user key for this cell */
struct list_head proc_link; /* /proc cell list link */
- struct proc_dir_entry *proc_dir; /* /proc dir for this cell */
#ifdef CONFIG_AFS_FSCACHE
struct fscache_cookie *cache; /* caching cookie */
#endif
@@ -747,8 +747,7 @@ extern int afs_write_end(struct file *file, struct address_space *mapping,
extern int afs_writepage(struct page *, struct writeback_control *);
extern int afs_writepages(struct address_space *, struct writeback_control *);
extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
-extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
- unsigned long, loff_t);
+extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *);
extern int afs_writeback_all(struct afs_vnode *);
extern int afs_fsync(struct file *, loff_t, loff_t, int);
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 42dd2e499ed..35de0c04729 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -55,13 +55,13 @@ static int __init afs_get_client_UUID(void)
afs_uuid.time_low = uuidtime;
afs_uuid.time_mid = uuidtime >> 32;
afs_uuid.time_hi_and_version = (uuidtime >> 48) & AFS_UUID_TIMEHI_MASK;
- afs_uuid.time_hi_and_version = AFS_UUID_VERSION_TIME;
+ afs_uuid.time_hi_and_version |= AFS_UUID_VERSION_TIME;
get_random_bytes(&clockseq, 2);
afs_uuid.clock_seq_low = clockseq;
afs_uuid.clock_seq_hi_and_reserved =
(clockseq >> 8) & AFS_UUID_CLOCKHI_MASK;
- afs_uuid.clock_seq_hi_and_reserved = AFS_UUID_VARIANT_STD;
+ afs_uuid.clock_seq_hi_and_reserved |= AFS_UUID_VARIANT_STD;
_debug("AFS UUID: %08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
afs_uuid.time_low,
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 526e4bbbde5..24a905b076f 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -41,11 +41,8 @@ static const struct file_operations afs_proc_cells_fops = {
.write = afs_proc_cells_write,
.llseek = seq_lseek,
.release = seq_release,
- .owner = THIS_MODULE,
};
-static int afs_proc_rootcell_open(struct inode *inode, struct file *file);
-static int afs_proc_rootcell_release(struct inode *inode, struct file *file);
static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf,
size_t size, loff_t *_pos);
static ssize_t afs_proc_rootcell_write(struct file *file,
@@ -53,17 +50,12 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
size_t size, loff_t *_pos);
static const struct file_operations afs_proc_rootcell_fops = {
- .open = afs_proc_rootcell_open,
.read = afs_proc_rootcell_read,
.write = afs_proc_rootcell_write,
.llseek = no_llseek,
- .release = afs_proc_rootcell_release,
- .owner = THIS_MODULE,
};
static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file);
-static int afs_proc_cell_volumes_release(struct inode *inode,
- struct file *file);
static void *afs_proc_cell_volumes_start(struct seq_file *p, loff_t *pos);
static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
loff_t *pos);
@@ -81,14 +73,11 @@ static const struct file_operations afs_proc_cell_volumes_fops = {
.open = afs_proc_cell_volumes_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = afs_proc_cell_volumes_release,
- .owner = THIS_MODULE,
+ .release = seq_release,
};
static int afs_proc_cell_vlservers_open(struct inode *inode,
struct file *file);
-static int afs_proc_cell_vlservers_release(struct inode *inode,
- struct file *file);
static void *afs_proc_cell_vlservers_start(struct seq_file *p, loff_t *pos);
static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
loff_t *pos);
@@ -106,13 +95,10 @@ static const struct file_operations afs_proc_cell_vlservers_fops = {
.open = afs_proc_cell_vlservers_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = afs_proc_cell_vlservers_release,
- .owner = THIS_MODULE,
+ .release = seq_release,
};
static int afs_proc_cell_servers_open(struct inode *inode, struct file *file);
-static int afs_proc_cell_servers_release(struct inode *inode,
- struct file *file);
static void *afs_proc_cell_servers_start(struct seq_file *p, loff_t *pos);
static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
loff_t *pos);
@@ -130,8 +116,7 @@ static const struct file_operations afs_proc_cell_servers_fops = {
.open = afs_proc_cell_servers_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = afs_proc_cell_servers_release,
- .owner = THIS_MODULE,
+ .release = seq_release,
};
/*
@@ -139,29 +124,21 @@ static const struct file_operations afs_proc_cell_servers_fops = {
*/
int afs_proc_init(void)
{
- struct proc_dir_entry *p;
-
_enter("");
proc_afs = proc_mkdir("fs/afs", NULL);
if (!proc_afs)
goto error_dir;
- p = proc_create("cells", 0, proc_afs, &afs_proc_cells_fops);
- if (!p)
- goto error_cells;
-
- p = proc_create("rootcell", 0, proc_afs, &afs_proc_rootcell_fops);
- if (!p)
- goto error_rootcell;
+ if (!proc_create("cells", 0644, proc_afs, &afs_proc_cells_fops) ||
+ !proc_create("rootcell", 0644, proc_afs, &afs_proc_rootcell_fops))
+ goto error_tree;
_leave(" = 0");
return 0;
-error_rootcell:
- remove_proc_entry("cells", proc_afs);
-error_cells:
- remove_proc_entry("fs/afs", NULL);
+error_tree:
+ remove_proc_subtree("fs/afs", NULL);
error_dir:
_leave(" = -ENOMEM");
return -ENOMEM;
@@ -172,9 +149,7 @@ error_dir:
*/
void afs_proc_cleanup(void)
{
- remove_proc_entry("rootcell", proc_afs);
- remove_proc_entry("cells", proc_afs);
- remove_proc_entry("fs/afs", NULL);
+ remove_proc_subtree("fs/afs", NULL);
}
/*
@@ -319,19 +294,6 @@ inval:
goto done;
}
-/*
- * Stubs for /proc/fs/afs/rootcell
- */
-static int afs_proc_rootcell_open(struct inode *inode, struct file *file)
-{
- return 0;
-}
-
-static int afs_proc_rootcell_release(struct inode *inode, struct file *file)
-{
- return 0;
-}
-
static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf,
size_t size, loff_t *_pos)
{
@@ -387,38 +349,27 @@ nomem:
*/
int afs_proc_cell_setup(struct afs_cell *cell)
{
- struct proc_dir_entry *p;
+ struct proc_dir_entry *dir;
_enter("%p{%s}", cell, cell->name);
- cell->proc_dir = proc_mkdir(cell->name, proc_afs);
- if (!cell->proc_dir)
+ dir = proc_mkdir(cell->name, proc_afs);
+ if (!dir)
goto error_dir;
- p = proc_create_data("servers", 0, cell->proc_dir,
- &afs_proc_cell_servers_fops, cell);
- if (!p)
- goto error_servers;
-
- p = proc_create_data("vlservers", 0, cell->proc_dir,
- &afs_proc_cell_vlservers_fops, cell);
- if (!p)
- goto error_vlservers;
-
- p = proc_create_data("volumes", 0, cell->proc_dir,
- &afs_proc_cell_volumes_fops, cell);
- if (!p)
- goto error_volumes;
+ if (!proc_create_data("servers", 0, dir,
+ &afs_proc_cell_servers_fops, cell) ||
+ !proc_create_data("vlservers", 0, dir,
+ &afs_proc_cell_vlservers_fops, cell) ||
+ !proc_create_data("volumes", 0, dir,
+ &afs_proc_cell_volumes_fops, cell))
+ goto error_tree;
_leave(" = 0");
return 0;
-error_volumes:
- remove_proc_entry("vlservers", cell->proc_dir);
-error_vlservers:
- remove_proc_entry("servers", cell->proc_dir);
-error_servers:
- remove_proc_entry(cell->name, proc_afs);
+error_tree:
+ remove_proc_subtree(cell->name, proc_afs);
error_dir:
_leave(" = -ENOMEM");
return -ENOMEM;
@@ -431,10 +382,7 @@ void afs_proc_cell_remove(struct afs_cell *cell)
{
_enter("");
- remove_proc_entry("volumes", cell->proc_dir);
- remove_proc_entry("vlservers", cell->proc_dir);
- remove_proc_entry("servers", cell->proc_dir);
- remove_proc_entry(cell->name, proc_afs);
+ remove_proc_subtree(cell->name, proc_afs);
_leave("");
}
@@ -463,14 +411,6 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file)
}
/*
- * close the file and release the ref to the cell
- */
-static int afs_proc_cell_volumes_release(struct inode *inode, struct file *file)
-{
- return seq_release(inode, file);
-}
-
-/*
* set up the iterator to start reading from the cells list and return the
* first item
*/
@@ -569,15 +509,6 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
}
/*
- * close the file and release the ref to the cell
- */
-static int afs_proc_cell_vlservers_release(struct inode *inode,
- struct file *file)
-{
- return seq_release(inode, file);
-}
-
-/*
* set up the iterator to start reading from the cells list and return the
* first item
*/
@@ -673,15 +604,6 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file)
}
/*
- * close the file and release the ref to the cell
- */
-static int afs_proc_cell_servers_release(struct inode *inode,
- struct file *file)
-{
- return seq_release(inode, file);
-}
-
-/*
* set up the iterator to start reading from the cells list and return the
* first item
*/
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 8ad8c2a0703..03a3beb1700 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -25,7 +25,7 @@ static void afs_wake_up_call_waiter(struct afs_call *);
static int afs_wait_for_call_to_complete(struct afs_call *);
static void afs_wake_up_async_call(struct afs_call *);
static int afs_dont_wait_for_call_to_complete(struct afs_call *);
-static void afs_process_async_call(struct work_struct *);
+static void afs_process_async_call(struct afs_call *);
static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *);
static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool);
@@ -58,6 +58,13 @@ static void afs_collect_incoming_call(struct work_struct *);
static struct sk_buff_head afs_incoming_calls;
static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call);
+static void afs_async_workfn(struct work_struct *work)
+{
+ struct afs_call *call = container_of(work, struct afs_call, async_work);
+
+ call->async_workfn(call);
+}
+
/*
* open an RxRPC socket and bind it to be a server for callback notifications
* - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT
@@ -184,6 +191,28 @@ static void afs_free_call(struct afs_call *call)
}
/*
+ * End a call but do not free it
+ */
+static void afs_end_call_nofree(struct afs_call *call)
+{
+ if (call->rxcall) {
+ rxrpc_kernel_end_call(call->rxcall);
+ call->rxcall = NULL;
+ }
+ if (call->type->destructor)
+ call->type->destructor(call);
+}
+
+/*
+ * End a call and free it
+ */
+static void afs_end_call(struct afs_call *call)
+{
+ afs_end_call_nofree(call);
+ afs_free_call(call);
+}
+
+/*
* allocate a call with flat request and reply buffers
*/
struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
@@ -326,7 +355,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
atomic_read(&afs_outstanding_calls));
call->wait_mode = wait_mode;
- INIT_WORK(&call->async_work, afs_process_async_call);
+ call->async_workfn = afs_process_async_call;
+ INIT_WORK(&call->async_work, afs_async_workfn);
memset(&srx, 0, sizeof(srx));
srx.srx_family = AF_RXRPC;
@@ -383,11 +413,8 @@ error_do_abort:
rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT);
while ((skb = skb_dequeue(&call->rx_queue)))
afs_free_skb(skb);
- rxrpc_kernel_end_call(rxcall);
- call->rxcall = NULL;
error_kill_call:
- call->type->destructor(call);
- afs_free_call(call);
+ afs_end_call(call);
_leave(" = %d", ret);
return ret;
}
@@ -509,12 +536,8 @@ static void afs_deliver_to_call(struct afs_call *call)
if (call->state >= AFS_CALL_COMPLETE) {
while ((skb = skb_dequeue(&call->rx_queue)))
afs_free_skb(skb);
- if (call->incoming) {
- rxrpc_kernel_end_call(call->rxcall);
- call->rxcall = NULL;
- call->type->destructor(call);
- afs_free_call(call);
- }
+ if (call->incoming)
+ afs_end_call(call);
}
_leave("");
@@ -564,10 +587,7 @@ static int afs_wait_for_call_to_complete(struct afs_call *call)
}
_debug("call complete");
- rxrpc_kernel_end_call(call->rxcall);
- call->rxcall = NULL;
- call->type->destructor(call);
- afs_free_call(call);
+ afs_end_call(call);
_leave(" = %d", ret);
return ret;
}
@@ -603,11 +623,8 @@ static int afs_dont_wait_for_call_to_complete(struct afs_call *call)
/*
* delete an asynchronous call
*/
-static void afs_delete_async_call(struct work_struct *work)
+static void afs_delete_async_call(struct afs_call *call)
{
- struct afs_call *call =
- container_of(work, struct afs_call, async_work);
-
_enter("");
afs_free_call(call);
@@ -620,11 +637,8 @@ static void afs_delete_async_call(struct work_struct *work)
* - on a multiple-thread workqueue this work item may try to run on several
* CPUs at the same time
*/
-static void afs_process_async_call(struct work_struct *work)
+static void afs_process_async_call(struct afs_call *call)
{
- struct afs_call *call =
- container_of(work, struct afs_call, async_work);
-
_enter("");
if (!skb_queue_empty(&call->rx_queue))
@@ -637,14 +651,11 @@ static void afs_process_async_call(struct work_struct *work)
call->reply = NULL;
/* kill the call */
- rxrpc_kernel_end_call(call->rxcall);
- call->rxcall = NULL;
- if (call->type->destructor)
- call->type->destructor(call);
+ afs_end_call_nofree(call);
/* we can't just delete the call because the work item may be
* queued */
- PREPARE_WORK(&call->async_work, afs_delete_async_call);
+ call->async_workfn = afs_delete_async_call;
queue_work(afs_async_calls, &call->async_work);
}
@@ -685,7 +696,8 @@ static void afs_collect_incoming_call(struct work_struct *work)
return;
}
- INIT_WORK(&call->async_work, afs_process_async_call);
+ call->async_workfn = afs_process_async_call;
+ INIT_WORK(&call->async_work, afs_async_workfn);
call->wait_mode = &afs_async_incoming_call;
call->type = &afs_RXCMxxxx;
init_waitqueue_head(&call->waitq);
@@ -782,10 +794,7 @@ void afs_send_empty_reply(struct afs_call *call)
_debug("oom");
rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
default:
- rxrpc_kernel_end_call(call->rxcall);
- call->rxcall = NULL;
- call->type->destructor(call);
- afs_free_call(call);
+ afs_end_call(call);
_leave(" [error]");
return;
}
@@ -815,17 +824,16 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
call->state = AFS_CALL_AWAIT_ACK;
n = rxrpc_kernel_send_data(call->rxcall, &msg, len);
if (n >= 0) {
+ /* Success */
_leave(" [replied]");
return;
}
+
if (n == -ENOMEM) {
_debug("oom");
rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
}
- rxrpc_kernel_end_call(call->rxcall);
- call->rxcall = NULL;
- call->type->destructor(call);
- afs_free_call(call);
+ afs_end_call(call);
_leave(" [error]");
}
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 57bcb159653..b6df2e83809 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -308,7 +308,8 @@ static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
/* see if we have an in-cache copy (will set vl->valid if there is) */
#ifdef CONFIG_AFS_FSCACHE
vl->cache = fscache_acquire_cookie(vl->cell->cache,
- &afs_vlocation_cache_index_def, vl);
+ &afs_vlocation_cache_index_def, vl,
+ true);
#endif
if (vl->valid) {
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 401eeb21869..2b607257820 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -131,7 +131,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
#ifdef CONFIG_AFS_FSCACHE
volume->cache = fscache_acquire_cookie(vlocation->cache,
&afs_volume_cache_index_def,
- volume);
+ volume, true);
#endif
afs_get_vlocation(vlocation);
volume->vlocation = vlocation;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index a890db4b989..ab6adfd5251 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -625,15 +625,14 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call)
/*
* write to an AFS file
*/
-ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp));
ssize_t result;
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(from);
- _enter("{%x.%u},{%zu},%lu,",
- vnode->fid.vid, vnode->fid.vnode, count, nr_segs);
+ _enter("{%x.%u},{%zu},",
+ vnode->fid.vid, vnode->fid.vnode, count);
if (IS_SWAPFILE(&vnode->vfs_inode)) {
printk(KERN_INFO
@@ -644,7 +643,7 @@ ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov,
if (!count)
return 0;
- result = generic_file_aio_write(iocb, iov, nr_segs, pos);
+ result = generic_file_write_iter(iocb, from);
if (IS_ERR_VALUE(result)) {
_leave(" = %zd", result);
return result;
diff --git a/fs/aio.c b/fs/aio.c
index 067e3d340c3..1c9c5f0a9e2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -36,10 +36,10 @@
#include <linux/eventfd.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
-#include <linux/anon_inodes.h>
#include <linux/migrate.h>
#include <linux/ramfs.h>
#include <linux/percpu-refcount.h>
+#include <linux/mount.h>
#include <asm/kmap_types.h>
#include <asm/uaccess.h>
@@ -52,7 +52,8 @@
struct aio_ring {
unsigned id; /* kernel internal index number */
unsigned nr; /* number of io_events */
- unsigned head;
+ unsigned head; /* Written to by userland or under ring_lock
+ * mutex by aio_read_events_ring(). */
unsigned tail;
unsigned magic;
@@ -80,6 +81,8 @@ struct kioctx {
struct percpu_ref users;
atomic_t dead;
+ struct percpu_ref reqs;
+
unsigned long user_id;
struct __percpu kioctx_cpu *cpu;
@@ -107,9 +110,13 @@ struct kioctx {
struct page **ring_pages;
long nr_pages;
- struct rcu_head rcu_head;
struct work_struct free_work;
+ /*
+ * signals when all in-flight requests are done
+ */
+ struct completion *requests_done;
+
struct {
/*
* This counts the number of available slots in the ringbuffer,
@@ -152,12 +159,67 @@ unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio request
static struct kmem_cache *kiocb_cachep;
static struct kmem_cache *kioctx_cachep;
+static struct vfsmount *aio_mnt;
+
+static const struct file_operations aio_ring_fops;
+static const struct address_space_operations aio_ctx_aops;
+
+static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
+{
+ struct qstr this = QSTR_INIT("[aio]", 5);
+ struct file *file;
+ struct path path;
+ struct inode *inode = alloc_anon_inode(aio_mnt->mnt_sb);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ inode->i_mapping->a_ops = &aio_ctx_aops;
+ inode->i_mapping->private_data = ctx;
+ inode->i_size = PAGE_SIZE * nr_pages;
+
+ path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this);
+ if (!path.dentry) {
+ iput(inode);
+ return ERR_PTR(-ENOMEM);
+ }
+ path.mnt = mntget(aio_mnt);
+
+ d_instantiate(path.dentry, inode);
+ file = alloc_file(&path, FMODE_READ | FMODE_WRITE, &aio_ring_fops);
+ if (IS_ERR(file)) {
+ path_put(&path);
+ return file;
+ }
+
+ file->f_flags = O_RDWR;
+ file->private_data = ctx;
+ return file;
+}
+
+static struct dentry *aio_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ static const struct dentry_operations ops = {
+ .d_dname = simple_dname,
+ };
+ return mount_pseudo(fs_type, "aio:", NULL, &ops, 0xa10a10a1);
+}
+
/* aio_setup
* Creates the slab caches used by the aio routines, panic on
* failure as this is done early during the boot sequence.
*/
static int __init aio_setup(void)
{
+ static struct file_system_type aio_fs = {
+ .name = "aio",
+ .mount = aio_mount,
+ .kill_sb = kill_anon_super,
+ };
+ aio_mnt = kern_mount(&aio_fs);
+ if (IS_ERR(aio_mnt))
+ panic("Failed to create aio fs mount.");
+
kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
@@ -187,16 +249,26 @@ static void aio_free_ring(struct kioctx *ctx)
{
int i;
+ /* Disconnect the kiotx from the ring file. This prevents future
+ * accesses to the kioctx from page migration.
+ */
+ put_aio_ring_file(ctx);
+
for (i = 0; i < ctx->nr_pages; i++) {
+ struct page *page;
pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
page_count(ctx->ring_pages[i]));
- put_page(ctx->ring_pages[i]);
+ page = ctx->ring_pages[i];
+ if (!page)
+ continue;
+ ctx->ring_pages[i] = NULL;
+ put_page(page);
}
- put_aio_ring_file(ctx);
-
- if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
+ if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
kfree(ctx->ring_pages);
+ ctx->ring_pages = NULL;
+ }
}
static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
@@ -220,38 +292,66 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
{
struct kioctx *ctx;
unsigned long flags;
+ pgoff_t idx;
int rc;
+ rc = 0;
+
+ /* mapping->private_lock here protects against the kioctx teardown. */
+ spin_lock(&mapping->private_lock);
+ ctx = mapping->private_data;
+ if (!ctx) {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ /* The ring_lock mutex. The prevents aio_read_events() from writing
+ * to the ring's head, and prevents page migration from mucking in
+ * a partially initialized kiotx.
+ */
+ if (!mutex_trylock(&ctx->ring_lock)) {
+ rc = -EAGAIN;
+ goto out;
+ }
+
+ idx = old->index;
+ if (idx < (pgoff_t)ctx->nr_pages) {
+ /* Make sure the old page hasn't already been changed */
+ if (ctx->ring_pages[idx] != old)
+ rc = -EAGAIN;
+ } else
+ rc = -EINVAL;
+
+ if (rc != 0)
+ goto out_unlock;
+
/* Writeback must be complete */
BUG_ON(PageWriteback(old));
- put_page(old);
+ get_page(new);
- rc = migrate_page_move_mapping(mapping, new, old, NULL, mode);
+ rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
if (rc != MIGRATEPAGE_SUCCESS) {
- get_page(old);
- return rc;
+ put_page(new);
+ goto out_unlock;
}
- get_page(new);
-
- /* We can potentially race against kioctx teardown here. Use the
- * address_space's private data lock to protect the mapping's
- * private_data.
+ /* Take completion_lock to prevent other writes to the ring buffer
+ * while the old page is copied to the new. This prevents new
+ * events from being lost.
*/
- spin_lock(&mapping->private_lock);
- ctx = mapping->private_data;
- if (ctx) {
- pgoff_t idx;
- spin_lock_irqsave(&ctx->completion_lock, flags);
- migrate_page_copy(new, old);
- idx = old->index;
- if (idx < (pgoff_t)ctx->nr_pages)
- ctx->ring_pages[idx] = new;
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
- } else
- rc = -EBUSY;
- spin_unlock(&mapping->private_lock);
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ migrate_page_copy(new, old);
+ BUG_ON(ctx->ring_pages[idx] != old);
+ ctx->ring_pages[idx] = new;
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ /* The old page is no longer accessible. */
+ put_page(old);
+
+out_unlock:
+ mutex_unlock(&ctx->ring_lock);
+out:
+ spin_unlock(&mapping->private_lock);
return rc;
}
#endif
@@ -268,7 +368,7 @@ static int aio_setup_ring(struct kioctx *ctx)
struct aio_ring *ring;
unsigned nr_events = ctx->max_reqs;
struct mm_struct *mm = current->mm;
- unsigned long size, populate;
+ unsigned long size, unused;
int nr_pages;
int i;
struct file *file;
@@ -283,15 +383,25 @@ static int aio_setup_ring(struct kioctx *ctx)
if (nr_pages < 0)
return -EINVAL;
- file = anon_inode_getfile_private("[aio]", &aio_ring_fops, ctx, O_RDWR);
+ file = aio_private_file(ctx, nr_pages);
if (IS_ERR(file)) {
ctx->aio_ring_file = NULL;
- return -EAGAIN;
+ return -ENOMEM;
}
- file->f_inode->i_mapping->a_ops = &aio_ctx_aops;
- file->f_inode->i_mapping->private_data = ctx;
- file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages;
+ ctx->aio_ring_file = file;
+ nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
+ / sizeof(struct io_event);
+
+ ctx->ring_pages = ctx->internal_pages;
+ if (nr_pages > AIO_RING_PAGES) {
+ ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
+ GFP_KERNEL);
+ if (!ctx->ring_pages) {
+ put_aio_ring_file(ctx);
+ return -ENOMEM;
+ }
+ }
for (i = 0; i < nr_pages; i++) {
struct page *page;
@@ -304,17 +414,14 @@ static int aio_setup_ring(struct kioctx *ctx)
SetPageUptodate(page);
SetPageDirty(page);
unlock_page(page);
+
+ ctx->ring_pages[i] = page;
}
- ctx->aio_ring_file = file;
- nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
- / sizeof(struct io_event);
+ ctx->nr_pages = i;
- ctx->ring_pages = ctx->internal_pages;
- if (nr_pages > AIO_RING_PAGES) {
- ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
- GFP_KERNEL);
- if (!ctx->ring_pages)
- return -ENOMEM;
+ if (unlikely(i != nr_pages)) {
+ aio_free_ring(ctx);
+ return -ENOMEM;
}
ctx->mmap_size = nr_pages * PAGE_SIZE;
@@ -323,37 +430,16 @@ static int aio_setup_ring(struct kioctx *ctx)
down_write(&mm->mmap_sem);
ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_POPULATE, 0, &populate);
+ MAP_SHARED, 0, &unused);
+ up_write(&mm->mmap_sem);
if (IS_ERR((void *)ctx->mmap_base)) {
- up_write(&mm->mmap_sem);
ctx->mmap_size = 0;
aio_free_ring(ctx);
- return -EAGAIN;
+ return -ENOMEM;
}
pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
- /* We must do this while still holding mmap_sem for write, as we
- * need to be protected against userspace attempting to mremap()
- * or munmap() the ring buffer.
- */
- ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
- 1, 0, ctx->ring_pages, NULL);
-
- /* Dropping the reference here is safe as the page cache will hold
- * onto the pages for us. It is also required so that page migration
- * can unmap the pages and get the right reference count.
- */
- for (i = 0; i < ctx->nr_pages; i++)
- put_page(ctx->ring_pages[i]);
-
- up_write(&mm->mmap_sem);
-
- if (unlikely(ctx->nr_pages != nr_pages)) {
- aio_free_ring(ctx);
- return -EAGAIN;
- }
-
ctx->user_id = ctx->mmap_base;
ctx->nr_events = nr_events; /* trusted copy */
@@ -391,7 +477,7 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
}
EXPORT_SYMBOL(kiocb_set_cancel_fn);
-static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb)
+static int kiocb_cancel(struct kiocb *kiocb)
{
kiocb_cancel_fn *old, *cancel;
@@ -412,26 +498,38 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb)
return cancel(kiocb);
}
-static void free_ioctx_rcu(struct rcu_head *head)
+static void free_ioctx(struct work_struct *work)
{
- struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
+ struct kioctx *ctx = container_of(work, struct kioctx, free_work);
+ pr_debug("freeing %p\n", ctx);
+
+ aio_free_ring(ctx);
free_percpu(ctx->cpu);
kmem_cache_free(kioctx_cachep, ctx);
}
+static void free_ioctx_reqs(struct percpu_ref *ref)
+{
+ struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
+
+ /* At this point we know that there are no any in-flight requests */
+ if (ctx->requests_done)
+ complete(ctx->requests_done);
+
+ INIT_WORK(&ctx->free_work, free_ioctx);
+ schedule_work(&ctx->free_work);
+}
+
/*
* When this function runs, the kioctx has been removed from the "hash table"
* and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
* now it's safe to cancel any that need to be.
*/
-static void free_ioctx(struct work_struct *work)
+static void free_ioctx_users(struct percpu_ref *ref)
{
- struct kioctx *ctx = container_of(work, struct kioctx, free_work);
- struct aio_ring *ring;
+ struct kioctx *ctx = container_of(ref, struct kioctx, users);
struct kiocb *req;
- unsigned cpu, avail;
- DEFINE_WAIT(wait);
spin_lock_irq(&ctx->ctx_lock);
@@ -440,59 +538,13 @@ static void free_ioctx(struct work_struct *work)
struct kiocb, ki_list);
list_del_init(&req->ki_list);
- kiocb_cancel(ctx, req);
+ kiocb_cancel(req);
}
spin_unlock_irq(&ctx->ctx_lock);
- for_each_possible_cpu(cpu) {
- struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu);
-
- atomic_add(kcpu->reqs_available, &ctx->reqs_available);
- kcpu->reqs_available = 0;
- }
-
- while (1) {
- prepare_to_wait(&ctx->wait, &wait, TASK_UNINTERRUPTIBLE);
-
- ring = kmap_atomic(ctx->ring_pages[0]);
- avail = (ring->head <= ring->tail)
- ? ring->tail - ring->head
- : ctx->nr_events - ring->head + ring->tail;
-
- atomic_add(avail, &ctx->reqs_available);
- ring->head = ring->tail;
- kunmap_atomic(ring);
-
- if (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1)
- break;
-
- schedule();
- }
- finish_wait(&ctx->wait, &wait);
-
- WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1);
-
- aio_free_ring(ctx);
-
- pr_debug("freeing %p\n", ctx);
-
- /*
- * Here the call_rcu() is between the wait_event() for reqs_active to
- * hit 0, and freeing the ioctx.
- *
- * aio_complete() decrements reqs_active, but it has to touch the ioctx
- * after to issue a wakeup so we use rcu.
- */
- call_rcu(&ctx->rcu_head, free_ioctx_rcu);
-}
-
-static void free_ioctx_ref(struct percpu_ref *ref)
-{
- struct kioctx *ctx = container_of(ref, struct kioctx, users);
-
- INIT_WORK(&ctx->free_work, free_ioctx);
- schedule_work(&ctx->free_work);
+ percpu_ref_kill(&ctx->reqs);
+ percpu_ref_put(&ctx->reqs);
}
static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
@@ -514,6 +566,10 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
rcu_read_unlock();
spin_unlock(&mm->ioctx_lock);
+ /* While kioctx setup is in progress,
+ * we are protected from page migration
+ * changes ring_pages by ->ring_lock.
+ */
ring = kmap_atomic(ctx->ring_pages[0]);
ring->id = ctx->id;
kunmap_atomic(ring);
@@ -551,6 +607,16 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
}
}
+static void aio_nr_sub(unsigned nr)
+{
+ spin_lock(&aio_nr_lock);
+ if (WARN_ON(aio_nr - nr > aio_nr))
+ aio_nr = 0;
+ else
+ aio_nr -= nr;
+ spin_unlock(&aio_nr_lock);
+}
+
/* ioctx_alloc
* Allocates and initializes an ioctx. Returns an ERR_PTR if it failed.
*/
@@ -588,22 +654,29 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
ctx->max_reqs = nr_events;
- if (percpu_ref_init(&ctx->users, free_ioctx_ref))
- goto out_freectx;
-
spin_lock_init(&ctx->ctx_lock);
spin_lock_init(&ctx->completion_lock);
mutex_init(&ctx->ring_lock);
+ /* Protect against page migration throughout kiotx setup by keeping
+ * the ring_lock mutex held until setup is complete. */
+ mutex_lock(&ctx->ring_lock);
init_waitqueue_head(&ctx->wait);
INIT_LIST_HEAD(&ctx->active_reqs);
+ if (percpu_ref_init(&ctx->users, free_ioctx_users))
+ goto err;
+
+ if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
+ goto err;
+
ctx->cpu = alloc_percpu(struct kioctx_cpu);
if (!ctx->cpu)
- goto out_freeref;
+ goto err;
- if (aio_setup_ring(ctx) < 0)
- goto out_freepcpu;
+ err = aio_setup_ring(ctx);
+ if (err < 0)
+ goto err;
atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
@@ -615,32 +688,35 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
aio_nr + nr_events < aio_nr) {
spin_unlock(&aio_nr_lock);
- goto out_cleanup;
+ err = -EAGAIN;
+ goto err_ctx;
}
aio_nr += ctx->max_reqs;
spin_unlock(&aio_nr_lock);
- percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */
+ percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */
+ percpu_ref_get(&ctx->reqs); /* free_ioctx_users() will drop this */
err = ioctx_add_table(ctx, mm);
if (err)
- goto out_cleanup_put;
+ goto err_cleanup;
+
+ /* Release the ring_lock mutex now that all setup is complete. */
+ mutex_unlock(&ctx->ring_lock);
pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
ctx, ctx->user_id, mm, ctx->nr_events);
return ctx;
-out_cleanup_put:
- percpu_ref_put(&ctx->users);
-out_cleanup:
- err = -EAGAIN;
+err_cleanup:
+ aio_nr_sub(ctx->max_reqs);
+err_ctx:
aio_free_ring(ctx);
-out_freepcpu:
+err:
+ mutex_unlock(&ctx->ring_lock);
free_percpu(ctx->cpu);
-out_freeref:
+ free_percpu(ctx->reqs.pcpu_count);
free_percpu(ctx->users.pcpu_count);
-out_freectx:
- put_aio_ring_file(ctx);
kmem_cache_free(kioctx_cachep, ctx);
pr_debug("error allocating ioctx %d\n", err);
return ERR_PTR(err);
@@ -651,40 +727,42 @@ out_freectx:
* when the processes owning a context have all exited to encourage
* the rapid destruction of the kioctx.
*/
-static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
+static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
+ struct completion *requests_done)
{
- if (!atomic_xchg(&ctx->dead, 1)) {
- struct kioctx_table *table;
+ struct kioctx_table *table;
- spin_lock(&mm->ioctx_lock);
- rcu_read_lock();
- table = rcu_dereference(mm->ioctx_table);
+ if (atomic_xchg(&ctx->dead, 1))
+ return -EINVAL;
- WARN_ON(ctx != table->table[ctx->id]);
- table->table[ctx->id] = NULL;
- rcu_read_unlock();
- spin_unlock(&mm->ioctx_lock);
- /* percpu_ref_kill() will do the necessary call_rcu() */
- wake_up_all(&ctx->wait);
+ spin_lock(&mm->ioctx_lock);
+ rcu_read_lock();
+ table = rcu_dereference(mm->ioctx_table);
+
+ WARN_ON(ctx != table->table[ctx->id]);
+ table->table[ctx->id] = NULL;
+ rcu_read_unlock();
+ spin_unlock(&mm->ioctx_lock);
- /*
- * It'd be more correct to do this in free_ioctx(), after all
- * the outstanding kiocbs have finished - but by then io_destroy
- * has already returned, so io_setup() could potentially return
- * -EAGAIN with no ioctxs actually in use (as far as userspace
- * could tell).
- */
- spin_lock(&aio_nr_lock);
- BUG_ON(aio_nr - ctx->max_reqs > aio_nr);
- aio_nr -= ctx->max_reqs;
- spin_unlock(&aio_nr_lock);
+ /* percpu_ref_kill() will do the necessary call_rcu() */
+ wake_up_all(&ctx->wait);
+
+ /*
+ * It'd be more correct to do this in free_ioctx(), after all
+ * the outstanding kiocbs have finished - but by then io_destroy
+ * has already returned, so io_setup() could potentially return
+ * -EAGAIN with no ioctxs actually in use (as far as userspace
+ * could tell).
+ */
+ aio_nr_sub(ctx->max_reqs);
- if (ctx->mmap_size)
- vm_munmap(ctx->mmap_base, ctx->mmap_size);
+ if (ctx->mmap_size)
+ vm_munmap(ctx->mmap_base, ctx->mmap_size);
- percpu_ref_kill(&ctx->users);
- }
+ ctx->requests_done = requests_done;
+ percpu_ref_kill(&ctx->users);
+ return 0;
}
/* wait_on_sync_kiocb:
@@ -745,23 +823,27 @@ void exit_aio(struct mm_struct *mm)
*/
ctx->mmap_size = 0;
- kill_ioctx(mm, ctx);
+ kill_ioctx(mm, ctx, NULL);
}
}
static void put_reqs_available(struct kioctx *ctx, unsigned nr)
{
struct kioctx_cpu *kcpu;
+ unsigned long flags;
preempt_disable();
kcpu = this_cpu_ptr(ctx->cpu);
+ local_irq_save(flags);
kcpu->reqs_available += nr;
+
while (kcpu->reqs_available >= ctx->req_batch * 2) {
kcpu->reqs_available -= ctx->req_batch;
atomic_add(ctx->req_batch, &ctx->reqs_available);
}
+ local_irq_restore(flags);
preempt_enable();
}
@@ -769,10 +851,12 @@ static bool get_reqs_available(struct kioctx *ctx)
{
struct kioctx_cpu *kcpu;
bool ret = false;
+ unsigned long flags;
preempt_disable();
kcpu = this_cpu_ptr(ctx->cpu);
+ local_irq_save(flags);
if (!kcpu->reqs_available) {
int old, avail = atomic_read(&ctx->reqs_available);
@@ -791,6 +875,7 @@ static bool get_reqs_available(struct kioctx *ctx)
ret = true;
kcpu->reqs_available--;
out:
+ local_irq_restore(flags);
preempt_enable();
return ret;
}
@@ -810,6 +895,8 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx)
if (unlikely(!req))
goto out_put;
+ percpu_ref_get(&ctx->reqs);
+
req->ki_ctx = ctx;
return req;
out_put:
@@ -879,12 +966,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
return;
}
- /*
- * Take rcu_read_lock() in case the kioctx is being destroyed, as we
- * need to issue a wakeup after incrementing reqs_available.
- */
- rcu_read_lock();
-
if (iocb->ki_list.next) {
unsigned long flags;
@@ -947,6 +1028,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
/* everything turned out well, dispose of the aiocb. */
kiocb_free(iocb);
+ put_reqs_available(ctx, 1);
/*
* We have to order our ring_info tail store above and test
@@ -959,7 +1041,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
if (waitqueue_active(&ctx->wait))
wake_up(&ctx->wait);
- rcu_read_unlock();
+ percpu_ref_put(&ctx->reqs);
}
EXPORT_SYMBOL(aio_complete);
@@ -977,6 +1059,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
mutex_lock(&ctx->ring_lock);
+ /* Access to ->ring_pages here is protected by ctx->ring_lock. */
ring = kmap_atomic(ctx->ring_pages[0]);
head = ring->head;
tail = ring->tail;
@@ -987,6 +1070,9 @@ static long aio_read_events_ring(struct kioctx *ctx,
if (head == tail)
goto out;
+ head %= ctx->nr_events;
+ tail %= ctx->nr_events;
+
while (ret < nr) {
long avail;
struct io_event *ev;
@@ -1025,8 +1111,6 @@ static long aio_read_events_ring(struct kioctx *ctx,
flush_dcache_page(ctx->ring_pages[0]);
pr_debug("%li h%u t%u\n", ret, head, tail);
-
- put_reqs_available(ctx, ret);
out:
mutex_unlock(&ctx->ring_lock);
@@ -1124,7 +1208,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
if (!IS_ERR(ioctx)) {
ret = put_user(ioctx->user_id, ctxp);
if (ret)
- kill_ioctx(current->mm, ioctx);
+ kill_ioctx(current->mm, ioctx, NULL);
percpu_ref_put(&ioctx->users);
}
@@ -1142,9 +1226,25 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
{
struct kioctx *ioctx = lookup_ioctx(ctx);
if (likely(NULL != ioctx)) {
- kill_ioctx(current->mm, ioctx);
+ struct completion requests_done =
+ COMPLETION_INITIALIZER_ONSTACK(requests_done);
+ int ret;
+
+ /* Pass requests_done to kill_ioctx() where it can be set
+ * in a thread-safe way. If we try to set it here then we have
+ * a race condition if two io_destroy() called simultaneously.
+ */
+ ret = kill_ioctx(current->mm, ioctx, &requests_done);
percpu_ref_put(&ioctx->users);
- return 0;
+
+ /* Wait until all IO for the context are done. Otherwise kernel
+ * keep using user-space buffers even if user thinks the context
+ * is destroyed.
+ */
+ if (!ret)
+ wait_for_completion(&requests_done);
+
+ return ret;
}
pr_debug("EINVAL: io_destroy: invalid context id\n");
return -EINVAL;
@@ -1152,6 +1252,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
unsigned long, loff_t);
+typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *);
static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
int rw, char __user *buf,
@@ -1209,7 +1310,9 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
int rw;
fmode_t mode;
aio_rw_op *rw_op;
+ rw_iter_op *iter_op;
struct iovec inline_vec, *iovec = &inline_vec;
+ struct iov_iter iter;
switch (opcode) {
case IOCB_CMD_PREAD:
@@ -1217,6 +1320,7 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
mode = FMODE_READ;
rw = READ;
rw_op = file->f_op->aio_read;
+ iter_op = file->f_op->read_iter;
goto rw_common;
case IOCB_CMD_PWRITE:
@@ -1224,12 +1328,13 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
mode = FMODE_WRITE;
rw = WRITE;
rw_op = file->f_op->aio_write;
+ iter_op = file->f_op->write_iter;
goto rw_common;
rw_common:
if (unlikely(!(file->f_mode & mode)))
return -EBADF;
- if (!rw_op)
+ if (!rw_op && !iter_op)
return -EINVAL;
ret = (opcode == IOCB_CMD_PREADV ||
@@ -1238,10 +1343,8 @@ rw_common:
&iovec, compat)
: aio_setup_single_vector(req, rw, buf, &nr_segs,
iovec);
- if (ret)
- return ret;
-
- ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
+ if (!ret)
+ ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
if (ret < 0) {
if (iovec != &inline_vec)
kfree(iovec);
@@ -1260,7 +1363,12 @@ rw_common:
if (rw == WRITE)
file_start_write(file);
- ret = rw_op(req, iovec, nr_segs, req->ki_pos);
+ if (iter_op) {
+ iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes);
+ ret = iter_op(req, &iter);
+ } else {
+ ret = rw_op(req, iovec, nr_segs, req->ki_pos);
+ }
if (rw == WRITE)
file_end_write(file);
@@ -1370,6 +1478,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
return 0;
out_put_req:
put_reqs_available(ctx, 1);
+ percpu_ref_put(&ctx->reqs);
kiocb_free(req);
return ret;
}
@@ -1497,7 +1606,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
kiocb = lookup_kiocb(ctx, iocb, key);
if (kiocb)
- ret = kiocb_cancel(ctx, kiocb);
+ ret = kiocb_cancel(kiocb);
else
ret = -EINVAL;
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 85c96184995..80ef38c73e5 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -24,7 +24,6 @@
static struct vfsmount *anon_inode_mnt __read_mostly;
static struct inode *anon_inode_inode;
-static const struct file_operations anon_inode_fops;
/*
* anon_inodefs_dname() is called from d_path().
@@ -39,67 +38,11 @@ static const struct dentry_operations anon_inodefs_dentry_operations = {
.d_dname = anon_inodefs_dname,
};
-/*
- * nop .set_page_dirty method so that people can use .page_mkwrite on
- * anon inodes.
- */
-static int anon_set_page_dirty(struct page *page)
-{
- return 0;
-};
-
-static const struct address_space_operations anon_aops = {
- .set_page_dirty = anon_set_page_dirty,
-};
-
-/*
- * A single inode exists for all anon_inode files. Contrary to pipes,
- * anon_inode inodes have no associated per-instance data, so we need
- * only allocate one of them.
- */
-static struct inode *anon_inode_mkinode(struct super_block *s)
-{
- struct inode *inode = new_inode_pseudo(s);
-
- if (!inode)
- return ERR_PTR(-ENOMEM);
-
- inode->i_ino = get_next_ino();
- inode->i_fop = &anon_inode_fops;
-
- inode->i_mapping->a_ops = &anon_aops;
-
- /*
- * Mark the inode dirty from the very beginning,
- * that way it will never be moved to the dirty
- * list because mark_inode_dirty() will think
- * that it already _is_ on the dirty list.
- */
- inode->i_state = I_DIRTY;
- inode->i_mode = S_IRUSR | S_IWUSR;
- inode->i_uid = current_fsuid();
- inode->i_gid = current_fsgid();
- inode->i_flags |= S_PRIVATE;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- return inode;
-}
-
static struct dentry *anon_inodefs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- struct dentry *root;
- root = mount_pseudo(fs_type, "anon_inode:", NULL,
+ return mount_pseudo(fs_type, "anon_inode:", NULL,
&anon_inodefs_dentry_operations, ANON_INODE_FS_MAGIC);
- if (!IS_ERR(root)) {
- struct super_block *s = root->d_sb;
- anon_inode_inode = anon_inode_mkinode(s);
- if (IS_ERR(anon_inode_inode)) {
- dput(root);
- deactivate_locked_super(s);
- root = ERR_CAST(anon_inode_inode);
- }
- }
- return root;
}
static struct file_system_type anon_inode_fs_type = {
@@ -109,72 +52,6 @@ static struct file_system_type anon_inode_fs_type = {
};
/**
- * anon_inode_getfile_private - creates a new file instance by hooking it up to an
- * anonymous inode, and a dentry that describe the "class"
- * of the file
- *
- * @name: [in] name of the "class" of the new file
- * @fops: [in] file operations for the new file
- * @priv: [in] private data for the new file (will be file's private_data)
- * @flags: [in] flags
- *
- *
- * Similar to anon_inode_getfile, but each file holds a single inode.
- *
- */
-struct file *anon_inode_getfile_private(const char *name,
- const struct file_operations *fops,
- void *priv, int flags)
-{
- struct qstr this;
- struct path path;
- struct file *file;
- struct inode *inode;
-
- if (fops->owner && !try_module_get(fops->owner))
- return ERR_PTR(-ENOENT);
-
- inode = anon_inode_mkinode(anon_inode_mnt->mnt_sb);
- if (IS_ERR(inode)) {
- file = ERR_PTR(-ENOMEM);
- goto err_module;
- }
-
- /*
- * Link the inode to a directory entry by creating a unique name
- * using the inode sequence number.
- */
- file = ERR_PTR(-ENOMEM);
- this.name = name;
- this.len = strlen(name);
- this.hash = 0;
- path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this);
- if (!path.dentry)
- goto err_module;
-
- path.mnt = mntget(anon_inode_mnt);
-
- d_instantiate(path.dentry, inode);
-
- file = alloc_file(&path, OPEN_FMODE(flags), fops);
- if (IS_ERR(file))
- goto err_dput;
-
- file->f_mapping = inode->i_mapping;
- file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
- file->private_data = priv;
-
- return file;
-
-err_dput:
- path_put(&path);
-err_module:
- module_put(fops->owner);
- return file;
-}
-EXPORT_SYMBOL_GPL(anon_inode_getfile_private);
-
-/**
* anon_inode_getfile - creates a new file instance by hooking it up to an
* anonymous inode, and a dentry that describe the "class"
* of the file
@@ -287,22 +164,15 @@ EXPORT_SYMBOL_GPL(anon_inode_getfd);
static int __init anon_inode_init(void)
{
- int error;
-
- error = register_filesystem(&anon_inode_fs_type);
- if (error)
- goto err_exit;
anon_inode_mnt = kern_mount(&anon_inode_fs_type);
- if (IS_ERR(anon_inode_mnt)) {
- error = PTR_ERR(anon_inode_mnt);
- goto err_unregister_filesystem;
- }
- return 0;
+ if (IS_ERR(anon_inode_mnt))
+ panic("anon_inode_init() kernel mount failed (%ld)\n", PTR_ERR(anon_inode_mnt));
-err_unregister_filesystem:
- unregister_filesystem(&anon_inode_fs_type);
-err_exit:
- panic(KERN_ERR "anon_inode_init() failed (%d)\n", error);
+ anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
+ if (IS_ERR(anon_inode_inode))
+ panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode));
+
+ return 0;
}
fs_initcall(anon_inode_init);
diff --git a/fs/attr.c b/fs/attr.c
index 1449adb14ef..6530ced1969 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -50,14 +50,14 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
if ((ia_valid & ATTR_UID) &&
(!uid_eq(current_fsuid(), inode->i_uid) ||
!uid_eq(attr->ia_uid, inode->i_uid)) &&
- !inode_capable(inode, CAP_CHOWN))
+ !capable_wrt_inode_uidgid(inode, CAP_CHOWN))
return -EPERM;
/* Make sure caller can chgrp. */
if ((ia_valid & ATTR_GID) &&
(!uid_eq(current_fsuid(), inode->i_uid) ||
(!in_group_p(attr->ia_gid) && !gid_eq(attr->ia_gid, inode->i_gid))) &&
- !inode_capable(inode, CAP_CHOWN))
+ !capable_wrt_inode_uidgid(inode, CAP_CHOWN))
return -EPERM;
/* Make sure a caller can chmod. */
@@ -67,7 +67,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
/* Also check the setgid bit! */
if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
inode->i_gid) &&
- !inode_capable(inode, CAP_FSETID))
+ !capable_wrt_inode_uidgid(inode, CAP_FSETID))
attr->ia_mode &= ~S_ISGID;
}
@@ -160,14 +160,34 @@ void setattr_copy(struct inode *inode, const struct iattr *attr)
umode_t mode = attr->ia_mode;
if (!in_group_p(inode->i_gid) &&
- !inode_capable(inode, CAP_FSETID))
+ !capable_wrt_inode_uidgid(inode, CAP_FSETID))
mode &= ~S_ISGID;
inode->i_mode = mode;
}
}
EXPORT_SYMBOL(setattr_copy);
-int notify_change(struct dentry * dentry, struct iattr * attr)
+/**
+ * notify_change - modify attributes of a filesytem object
+ * @dentry: object affected
+ * @iattr: new attributes
+ * @delegated_inode: returns inode, if the inode is delegated
+ *
+ * The caller must hold the i_mutex on the affected object.
+ *
+ * If notify_change discovers a delegation in need of breaking,
+ * it will return -EWOULDBLOCK and return a reference to the inode in
+ * delegated_inode. The caller should then break the delegation and
+ * retry. Because breaking a delegation may take a long time, the
+ * caller should drop the i_mutex before doing so.
+ *
+ * Alternatively, a caller may pass NULL for delegated_inode. This may
+ * be appropriate for callers that expect the underlying filesystem not
+ * to be NFS exported. Also, passing NULL is fine for callers holding
+ * the file open for write, as there can be no conflicting delegation in
+ * that case.
+ */
+int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode)
{
struct inode *inode = dentry->d_inode;
umode_t mode = inode->i_mode;
@@ -182,11 +202,6 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
return -EPERM;
}
- if ((ia_valid & ATTR_SIZE) && IS_I_VERSION(inode)) {
- if (attr->ia_size != inode->i_size)
- inode_inc_iversion(inode);
- }
-
if ((ia_valid & ATTR_MODE)) {
umode_t amode = attr->ia_mode;
/* Flag setting protected by i_mutex */
@@ -243,6 +258,9 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
error = security_inode_setattr(dentry, attr);
if (error)
return error;
+ error = try_break_deleg(inode, delegated_inode);
+ if (error)
+ return error;
if (inode->i_op->setattr)
error = inode->i_op->setattr(dentry, attr);
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 3f1128b37e4..acf32054edd 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -104,7 +104,7 @@ struct autofs_sb_info {
u32 magic;
int pipefd;
struct file *pipe;
- pid_t oz_pgrp;
+ struct pid *oz_pgrp;
int catatonic;
int version;
int sub_version;
@@ -122,6 +122,7 @@ struct autofs_sb_info {
spinlock_t lookup_lock;
struct list_head active_list;
struct list_head expiring_list;
+ struct rcu_head rcu;
};
static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb)
@@ -139,7 +140,7 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry)
filesystem without "magic".) */
static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
- return sbi->catatonic || task_pgrp_nr(current) == sbi->oz_pgrp;
+ return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
}
/* Does a dentry have some pending activity? */
@@ -271,7 +272,7 @@ void autofs4_clean_ino(struct autofs_info *);
static inline int autofs_prepare_pipe(struct file *pipe)
{
- if (!pipe->f_op || !pipe->f_op->write)
+ if (!pipe->f_op->write)
return -EINVAL;
if (!S_ISFIFO(file_inode(pipe)->i_mode))
return -EINVAL;
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 0f00da329e7..5b570b6efa2 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -103,6 +103,9 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
if (tmp.size < sizeof(tmp))
return ERR_PTR(-EINVAL);
+ if (tmp.size > (PATH_MAX + sizeof(tmp)))
+ return ERR_PTR(-ENAMETOOLONG);
+
return memdup_user(in, tmp.size);
}
@@ -346,6 +349,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
{
int pipefd;
int err = 0;
+ struct pid *new_pid = NULL;
if (param->setpipefd.pipefd == -1)
return -EINVAL;
@@ -357,7 +361,17 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
mutex_unlock(&sbi->wq_mutex);
return -EBUSY;
} else {
- struct file *pipe = fget(pipefd);
+ struct file *pipe;
+
+ new_pid = get_task_pid(current, PIDTYPE_PGID);
+
+ if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) {
+ AUTOFS_WARN("Not allowed to change PID namespace");
+ err = -EINVAL;
+ goto out;
+ }
+
+ pipe = fget(pipefd);
if (!pipe) {
err = -EBADF;
goto out;
@@ -367,12 +381,13 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
fput(pipe);
goto out;
}
- sbi->oz_pgrp = task_pgrp_nr(current);
+ swap(sbi->oz_pgrp, new_pid);
sbi->pipefd = pipefd;
sbi->pipe = pipe;
sbi->catatonic = 0;
}
out:
+ put_pid(new_pid);
mutex_unlock(&sbi->wq_mutex);
return err;
}
@@ -658,12 +673,6 @@ static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __use
goto out;
}
- if (!fp->f_op) {
- err = -ENOTTY;
- fput(fp);
- goto out;
- }
-
sbi = autofs_dev_ioctl_sbi(fp);
if (!sbi || sbi->magic != AUTOFS_SBI_MAGIC) {
err = -EINVAL;
@@ -728,7 +737,7 @@ MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
MODULE_ALIAS("devname:autofs");
/* Register/deregister misc character device */
-int autofs_dev_ioctl_init(void)
+int __init autofs_dev_ioctl_init(void)
{
int r;
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 3d9d3f5d5dd..394e90b02c5 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -402,6 +402,20 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
goto next;
}
+ if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
+ DPRINTK("checking symlink %p %.*s",
+ dentry, (int)dentry->d_name.len, dentry->d_name.name);
+ /*
+ * A symlink can't be "busy" in the usual sense so
+ * just check last used for expire timeout.
+ */
+ if (autofs4_can_expire(dentry, timeout, do_now)) {
+ expired = dentry;
+ goto found;
+ }
+ goto next;
+ }
+
if (simple_empty(dentry))
goto next;
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index b104726e2d0..1c55388ae63 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -56,18 +56,16 @@ void autofs4_kill_sb(struct super_block *sb)
* just call kill_anon_super when we are called from
* deactivate_super.
*/
- if (!sbi)
- goto out_kill_sb;
-
- /* Free wait queues, close pipe */
- autofs4_catatonic_mode(sbi);
-
- sb->s_fs_info = NULL;
- kfree(sbi);
+ if (sbi) {
+ /* Free wait queues, close pipe */
+ autofs4_catatonic_mode(sbi);
+ put_pid(sbi->oz_pgrp);
+ }
-out_kill_sb:
DPRINTK("shutting down");
kill_litter_super(sb);
+ if (sbi)
+ kfree_rcu(sbi, rcu);
}
static int autofs4_show_options(struct seq_file *m, struct dentry *root)
@@ -85,7 +83,7 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID))
seq_printf(m, ",gid=%u",
from_kgid_munged(&init_user_ns, root_inode->i_gid));
- seq_printf(m, ",pgrp=%d", sbi->oz_pgrp);
+ seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp));
seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ);
seq_printf(m, ",minproto=%d", sbi->min_proto);
seq_printf(m, ",maxproto=%d", sbi->max_proto);
@@ -129,7 +127,8 @@ static const match_table_t tokens = {
};
static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
- pid_t *pgrp, unsigned int *type, int *minproto, int *maxproto)
+ int *pgrp, bool *pgrp_set, unsigned int *type,
+ int *minproto, int *maxproto)
{
char *p;
substring_t args[MAX_OPT_ARGS];
@@ -137,7 +136,6 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
*uid = current_uid();
*gid = current_gid();
- *pgrp = task_pgrp_nr(current);
*minproto = AUTOFS_MIN_PROTO_VERSION;
*maxproto = AUTOFS_MAX_PROTO_VERSION;
@@ -176,6 +174,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
if (match_int(args, &option))
return 1;
*pgrp = option;
+ *pgrp_set = true;
break;
case Opt_minproto:
if (match_int(args, &option))
@@ -211,10 +210,13 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
int pipefd;
struct autofs_sb_info *sbi;
struct autofs_info *ino;
+ int pgrp = 0;
+ bool pgrp_set = false;
+ int ret = -EINVAL;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
- goto fail_unlock;
+ return -ENOMEM;
DPRINTK("starting up, sbi = %p",sbi);
s->s_fs_info = sbi;
@@ -223,7 +225,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi->pipe = NULL;
sbi->catatonic = 1;
sbi->exp_timeout = 0;
- sbi->oz_pgrp = task_pgrp_nr(current);
+ sbi->oz_pgrp = NULL;
sbi->sb = s;
sbi->version = 0;
sbi->sub_version = 0;
@@ -248,8 +250,10 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
* Get the root inode and dentry, but defer checking for errors.
*/
ino = autofs4_new_ino(sbi);
- if (!ino)
+ if (!ino) {
+ ret = -ENOMEM;
goto fail_free;
+ }
root_inode = autofs4_get_inode(s, S_IFDIR | 0755);
root = d_make_root(root_inode);
if (!root)
@@ -260,12 +264,23 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
/* Can this call block? */
if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid,
- &sbi->oz_pgrp, &sbi->type, &sbi->min_proto,
- &sbi->max_proto)) {
+ &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto,
+ &sbi->max_proto)) {
printk("autofs: called with bogus options\n");
goto fail_dput;
}
+ if (pgrp_set) {
+ sbi->oz_pgrp = find_get_pid(pgrp);
+ if (!sbi->oz_pgrp) {
+ pr_warn("autofs: could not find process group %d\n",
+ pgrp);
+ goto fail_dput;
+ }
+ } else {
+ sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID);
+ }
+
if (autofs_type_trigger(sbi->type))
__managed_dentry_set_managed(root);
@@ -289,14 +304,15 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
sbi->version = sbi->max_proto;
sbi->sub_version = AUTOFS_PROTO_SUBVERSION;
- DPRINTK("pipe fd = %d, pgrp = %u", pipefd, sbi->oz_pgrp);
+ DPRINTK("pipe fd = %d, pgrp = %u", pipefd, pid_nr(sbi->oz_pgrp));
pipe = fget(pipefd);
-
+
if (!pipe) {
printk("autofs: could not open pipe file descriptor\n");
goto fail_dput;
}
- if (autofs_prepare_pipe(pipe) < 0)
+ ret = autofs_prepare_pipe(pipe);
+ if (ret < 0)
goto fail_fput;
sbi->pipe = pipe;
sbi->pipefd = pipefd;
@@ -321,10 +337,10 @@ fail_dput:
fail_ino:
kfree(ino);
fail_free:
+ put_pid(sbi->oz_pgrp);
kfree(sbi);
s->s_fs_info = NULL;
-fail_unlock:
- return -EINVAL;
+ return ret;
}
struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode)
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 92ef341ba0c..cc87c1abac9 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
spin_lock(&active->d_lock);
/* Already gone? */
- if (!d_count(active))
+ if ((int) d_count(active) <= 0)
goto next;
qstr = &active->d_name;
@@ -230,7 +230,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
spin_lock(&expiring->d_lock);
- /* Bad luck, we've already been dentry_iput */
+ /* We've already been dentry_iput or unlinked */
if (!expiring->d_inode)
goto next;
@@ -558,7 +558,7 @@ static int autofs4_dir_symlink(struct inode *dir,
dget(dentry);
atomic_inc(&ino->count);
p_ino = autofs4_dentry_ino(dentry->d_parent);
- if (p_ino && dentry->d_parent != dentry)
+ if (p_ino && !IS_ROOT(dentry))
atomic_inc(&p_ino->count);
dir->i_mtime = CURRENT_TIME;
@@ -593,7 +593,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
if (atomic_dec_and_test(&ino->count)) {
p_ino = autofs4_dentry_ino(dentry->d_parent);
- if (p_ino && dentry->d_parent != dentry)
+ if (p_ino && !IS_ROOT(dentry))
atomic_dec(&p_ino->count);
}
dput(ino->dentry);
@@ -732,7 +732,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m
dget(dentry);
atomic_inc(&ino->count);
p_ino = autofs4_dentry_ino(dentry->d_parent);
- if (p_ino && dentry->d_parent != dentry)
+ if (p_ino && !IS_ROOT(dentry))
atomic_inc(&p_ino->count);
inc_nlink(dir);
dir->i_mtime = CURRENT_TIME;
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index f27c094a191..1e8ea192be2 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -14,6 +14,10 @@
static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
{
+ struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+ struct autofs_info *ino = autofs4_dentry_ino(dentry);
+ if (ino && !autofs4_oz_mode(sbi))
+ ino->last_used = jiffies;
nd_set_link(nd, dentry->d_inode->i_private);
return NULL;
}
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 689e40d983a..116fd38ee47 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -347,11 +347,23 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
struct qstr qstr;
char *name;
int status, ret, type;
+ pid_t pid;
+ pid_t tgid;
/* In catatonic mode, we don't wait for nobody */
if (sbi->catatonic)
return -ENOENT;
+ /*
+ * Try translating pids to the namespace of the daemon.
+ *
+ * Zero means failure: we are in an unrelated pid namespace.
+ */
+ pid = task_pid_nr_ns(current, ns_of_pid(sbi->oz_pgrp));
+ tgid = task_tgid_nr_ns(current, ns_of_pid(sbi->oz_pgrp));
+ if (pid == 0 || tgid == 0)
+ return -ENOENT;
+
if (!dentry->d_inode) {
/*
* A wait for a negative dentry is invalid for certain
@@ -417,8 +429,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
wq->ino = autofs4_get_ino(sbi);
wq->uid = current_uid();
wq->gid = current_gid();
- wq->pid = current->pid;
- wq->tgid = current->tgid;
+ wq->pid = pid;
+ wq->tgid = tgid;
wq->status = -EINTR; /* Status return if interrupted */
wq->wait_ctr = 2;
diff --git a/fs/befs/Makefile b/fs/befs/Makefile
index 2f370bd7a50..8b9f66642a8 100644
--- a/fs/befs/Makefile
+++ b/fs/befs/Makefile
@@ -3,5 +3,5 @@
#
obj-$(CONFIG_BEFS_FS) += befs.o
-
+ccflags-$(CONFIG_BEFS_DEBUG) += -DDEBUG
befs-objs := datastream.o btree.o super.o inode.o debug.o io.o linuxvfs.o
diff --git a/fs/befs/befs.h b/fs/befs/befs.h
index b2664283915..3a7813ab8c9 100644
--- a/fs/befs/befs.h
+++ b/fs/befs/befs.h
@@ -88,8 +88,11 @@ enum befs_err {
/****************************/
/* debug.c */
+__printf(2, 3)
void befs_error(const struct super_block *sb, const char *fmt, ...);
+__printf(2, 3)
void befs_warning(const struct super_block *sb, const char *fmt, ...);
+__printf(2, 3)
void befs_debug(const struct super_block *sb, const char *fmt, ...);
void befs_dump_super_block(const struct super_block *sb, befs_super_block *);
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 74e397db0b8..9c7faa8a928 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -137,7 +137,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
struct buffer_head *bh = NULL;
befs_disk_btree_super *od_sup = NULL;
- befs_debug(sb, "---> befs_btree_read_super()");
+ befs_debug(sb, "---> %s", __func__);
bh = befs_read_datastream(sb, ds, 0, NULL);
@@ -162,11 +162,11 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
goto error;
}
- befs_debug(sb, "<--- befs_btree_read_super()");
+ befs_debug(sb, "<--- %s", __func__);
return BEFS_OK;
error:
- befs_debug(sb, "<--- befs_btree_read_super() ERROR");
+ befs_debug(sb, "<--- %s ERROR", __func__);
return BEFS_ERR;
}
@@ -195,16 +195,16 @@ befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
{
uint off = 0;
- befs_debug(sb, "---> befs_bt_read_node()");
+ befs_debug(sb, "---> %s", __func__);
if (node->bh)
brelse(node->bh);
node->bh = befs_read_datastream(sb, ds, node_off, &off);
if (!node->bh) {
- befs_error(sb, "befs_bt_read_node() failed to read "
- "node at %Lu", node_off);
- befs_debug(sb, "<--- befs_bt_read_node() ERROR");
+ befs_error(sb, "%s failed to read "
+ "node at %llu", __func__, node_off);
+ befs_debug(sb, "<--- %s ERROR", __func__);
return BEFS_ERR;
}
@@ -221,7 +221,7 @@ befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
node->head.all_key_length =
fs16_to_cpu(sb, node->od_node->all_key_length);
- befs_debug(sb, "<--- befs_btree_read_node()");
+ befs_debug(sb, "<--- %s", __func__);
return BEFS_OK;
}
@@ -252,7 +252,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
befs_off_t node_off;
int res;
- befs_debug(sb, "---> befs_btree_find() Key: %s", key);
+ befs_debug(sb, "---> %s Key: %s", __func__, key);
if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) {
befs_error(sb,
@@ -263,7 +263,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
this_node = kmalloc(sizeof (befs_btree_node),
GFP_NOFS);
if (!this_node) {
- befs_error(sb, "befs_btree_find() failed to allocate %u "
+ befs_error(sb, "befs_btree_find() failed to allocate %zu "
"bytes of memory", sizeof (befs_btree_node));
goto error;
}
@@ -274,7 +274,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
node_off = bt_super.root_node_ptr;
if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
befs_error(sb, "befs_btree_find() failed to read "
- "node at %Lu", node_off);
+ "node at %llu", node_off);
goto error_alloc;
}
@@ -285,7 +285,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
/* if no match, go to overflow node */
if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
befs_error(sb, "befs_btree_find() failed to read "
- "node at %Lu", node_off);
+ "node at %llu", node_off);
goto error_alloc;
}
}
@@ -298,11 +298,11 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
kfree(this_node);
if (res != BEFS_BT_MATCH) {
- befs_debug(sb, "<--- befs_btree_find() Key %s not found", key);
+ befs_debug(sb, "<--- %s Key %s not found", __func__, key);
*value = 0;
return BEFS_BT_NOT_FOUND;
}
- befs_debug(sb, "<--- befs_btree_find() Found key %s, value %Lu",
+ befs_debug(sb, "<--- %s Found key %s, value %llu", __func__,
key, *value);
return BEFS_OK;
@@ -310,7 +310,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
kfree(this_node);
error:
*value = 0;
- befs_debug(sb, "<--- befs_btree_find() ERROR");
+ befs_debug(sb, "<--- %s ERROR", __func__);
return BEFS_ERR;
}
@@ -318,7 +318,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
* befs_find_key - Search for a key within a node
* @sb: Filesystem superblock
* @node: Node to find the key within
- * @key: Keystring to search for
+ * @findkey: Keystring to search for
* @value: If key is found, the value stored with the key is put here
*
* finds exact match if one exists, and returns BEFS_BT_MATCH
@@ -343,7 +343,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
char *thiskey;
fs64 *valarray;
- befs_debug(sb, "---> befs_find_key() %s", findkey);
+ befs_debug(sb, "---> %s %s", __func__, findkey);
*value = 0;
@@ -355,7 +355,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
eq = befs_compare_strings(thiskey, keylen, findkey, findkey_len);
if (eq < 0) {
- befs_debug(sb, "<--- befs_find_key() %s not found", findkey);
+ befs_debug(sb, "<--- %s %s not found", __func__, findkey);
return BEFS_BT_NOT_FOUND;
}
@@ -373,8 +373,8 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
findkey_len);
if (eq == 0) {
- befs_debug(sb, "<--- befs_find_key() found %s at %d",
- thiskey, mid);
+ befs_debug(sb, "<--- %s found %s at %d",
+ __func__, thiskey, mid);
*value = fs64_to_cpu(sb, valarray[mid]);
return BEFS_BT_MATCH;
@@ -388,7 +388,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
*value = fs64_to_cpu(sb, valarray[mid + 1]);
else
*value = fs64_to_cpu(sb, valarray[mid]);
- befs_debug(sb, "<--- befs_find_key() found %s at %d", thiskey, mid);
+ befs_debug(sb, "<--- %s found %s at %d", __func__, thiskey, mid);
return BEFS_BT_PARMATCH;
}
@@ -405,7 +405,7 @@ befs_find_key(struct super_block *sb, befs_btree_node * node,
* Heres how it works: Key_no is the index of the key/value pair to
* return in keybuf/value.
* Bufsize is the size of keybuf (BEFS_NAME_LEN+1 is a good size). Keysize is
- * the number of charecters in the key (just a convenience).
+ * the number of characters in the key (just a convenience).
*
* Algorithm:
* Get the first leafnode of the tree. See if the requested key is in that
@@ -428,7 +428,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
uint key_sum = 0;
- befs_debug(sb, "---> befs_btree_read()");
+ befs_debug(sb, "---> %s", __func__);
if (befs_bt_read_super(sb, ds, &bt_super) != BEFS_OK) {
befs_error(sb,
@@ -437,7 +437,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
}
if ((this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) {
- befs_error(sb, "befs_btree_read() failed to allocate %u "
+ befs_error(sb, "befs_btree_read() failed to allocate %zu "
"bytes of memory", sizeof (befs_btree_node));
goto error;
}
@@ -452,7 +452,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
kfree(this_node);
*value = 0;
*keysize = 0;
- befs_debug(sb, "<--- befs_btree_read() Tree is EMPTY");
+ befs_debug(sb, "<--- %s Tree is EMPTY", __func__);
return BEFS_BT_EMPTY;
} else if (res == BEFS_ERR) {
goto error_alloc;
@@ -467,7 +467,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
*keysize = 0;
*value = 0;
befs_debug(sb,
- "<--- befs_btree_read() END of keys at %Lu",
+ "<--- %s END of keys at %llu", __func__,
+ (unsigned long long)
key_sum + this_node->head.all_key_count);
brelse(this_node->bh);
kfree(this_node);
@@ -478,8 +479,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
node_off = this_node->head.right;
if (befs_bt_read_node(sb, ds, this_node, node_off) != BEFS_OK) {
- befs_error(sb, "befs_btree_read() failed to read "
- "node at %Lu", node_off);
+ befs_error(sb, "%s failed to read node at %llu",
+ __func__, (unsigned long long)node_off);
goto error_alloc;
}
}
@@ -492,27 +493,28 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
keystart = befs_bt_get_key(sb, this_node, cur_key, &keylen);
- befs_debug(sb, "Read [%Lu,%d]: keysize %d", node_off, cur_key, keylen);
+ befs_debug(sb, "Read [%llu,%d]: keysize %d",
+ (long long unsigned int)node_off, (int)cur_key,
+ (int)keylen);
if (bufsize < keylen + 1) {
- befs_error(sb, "befs_btree_read() keybuf too small (%u) "
- "for key of size %d", bufsize, keylen);
+ befs_error(sb, "%s keybuf too small (%zu) "
+ "for key of size %d", __func__, bufsize, keylen);
brelse(this_node->bh);
goto error_alloc;
- };
+ }
- strncpy(keybuf, keystart, keylen);
+ strlcpy(keybuf, keystart, keylen + 1);
*value = fs64_to_cpu(sb, valarray[cur_key]);
*keysize = keylen;
- keybuf[keylen] = '\0';
- befs_debug(sb, "Read [%Lu,%d]: Key \"%.*s\", Value %Lu", node_off,
+ befs_debug(sb, "Read [%llu,%d]: Key \"%.*s\", Value %llu", node_off,
cur_key, keylen, keybuf, *value);
brelse(this_node->bh);
kfree(this_node);
- befs_debug(sb, "<--- befs_btree_read()");
+ befs_debug(sb, "<--- %s", __func__);
return BEFS_OK;
@@ -522,7 +524,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
error:
*keysize = 0;
*value = 0;
- befs_debug(sb, "<--- befs_btree_read() ERROR");
+ befs_debug(sb, "<--- %s ERROR", __func__);
return BEFS_ERR;
}
@@ -547,26 +549,26 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
befs_off_t * node_off)
{
- befs_debug(sb, "---> befs_btree_seekleaf()");
+ befs_debug(sb, "---> %s", __func__);
if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) {
- befs_error(sb, "befs_btree_seekleaf() failed to read "
- "node at %Lu", *node_off);
+ befs_error(sb, "%s failed to read "
+ "node at %llu", __func__, *node_off);
goto error;
}
- befs_debug(sb, "Seekleaf to root node %Lu", *node_off);
+ befs_debug(sb, "Seekleaf to root node %llu", *node_off);
if (this_node->head.all_key_count == 0 && befs_leafnode(this_node)) {
- befs_debug(sb, "<--- befs_btree_seekleaf() Tree is EMPTY");
+ befs_debug(sb, "<--- %s Tree is EMPTY", __func__);
return BEFS_BT_EMPTY;
}
while (!befs_leafnode(this_node)) {
if (this_node->head.all_key_count == 0) {
- befs_debug(sb, "befs_btree_seekleaf() encountered "
- "an empty interior node: %Lu. Using Overflow "
- "node: %Lu", *node_off,
+ befs_debug(sb, "%s encountered "
+ "an empty interior node: %llu. Using Overflow "
+ "node: %llu", __func__, *node_off,
this_node->head.overflow);
*node_off = this_node->head.overflow;
} else {
@@ -574,19 +576,19 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
*node_off = fs64_to_cpu(sb, valarray[0]);
}
if (befs_bt_read_node(sb, ds, this_node, *node_off) != BEFS_OK) {
- befs_error(sb, "befs_btree_seekleaf() failed to read "
- "node at %Lu", *node_off);
+ befs_error(sb, "%s failed to read "
+ "node at %llu", __func__, *node_off);
goto error;
}
- befs_debug(sb, "Seekleaf to child node %Lu", *node_off);
+ befs_debug(sb, "Seekleaf to child node %llu", *node_off);
}
- befs_debug(sb, "Node %Lu is a leaf node", *node_off);
+ befs_debug(sb, "Node %llu is a leaf node", *node_off);
return BEFS_OK;
error:
- befs_debug(sb, "<--- befs_btree_seekleaf() ERROR");
+ befs_debug(sb, "<--- %s ERROR", __func__);
return BEFS_ERR;
}
@@ -704,7 +706,7 @@ befs_bt_get_key(struct super_block *sb, befs_btree_node * node,
* @key1: pointer to the first key to be compared
* @keylen1: length in bytes of key1
* @key2: pointer to the second key to be compared
- * @kelen2: length in bytes of key2
+ * @keylen2: length in bytes of key2
*
* Returns 0 if @key1 and @key2 are equal.
* Returns >0 if @key1 is greater.
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index 59096b5e0fc..1e8e0b8d883 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -52,26 +52,25 @@ befs_read_datastream(struct super_block *sb, befs_data_stream * ds,
befs_block_run run;
befs_blocknr_t block; /* block coresponding to pos */
- befs_debug(sb, "---> befs_read_datastream() %Lu", pos);
+ befs_debug(sb, "---> %s %llu", __func__, pos);
block = pos >> BEFS_SB(sb)->block_shift;
if (off)
*off = pos - (block << BEFS_SB(sb)->block_shift);
if (befs_fblock2brun(sb, ds, block, &run) != BEFS_OK) {
befs_error(sb, "BeFS: Error finding disk addr of block %lu",
- block);
- befs_debug(sb, "<--- befs_read_datastream() ERROR");
+ (unsigned long)block);
+ befs_debug(sb, "<--- %s ERROR", __func__);
return NULL;
}
bh = befs_bread_iaddr(sb, run);
if (!bh) {
befs_error(sb, "BeFS: Error reading block %lu from datastream",
- block);
+ (unsigned long)block);
return NULL;
}
- befs_debug(sb, "<--- befs_read_datastream() read data, starting at %Lu",
- pos);
+ befs_debug(sb, "<--- %s read data, starting at %llu", __func__, pos);
return bh;
}
@@ -106,7 +105,8 @@ befs_fblock2brun(struct super_block *sb, befs_data_stream * data,
} else {
befs_error(sb,
"befs_fblock2brun() was asked to find block %lu, "
- "which is not mapped by the datastream\n", fblock);
+ "which is not mapped by the datastream\n",
+ (unsigned long)fblock);
err = BEFS_ERR;
}
return err;
@@ -116,7 +116,7 @@ befs_fblock2brun(struct super_block *sb, befs_data_stream * data,
* befs_read_lsmylink - read long symlink from datastream.
* @sb: Filesystem superblock
* @ds: Datastrem to read from
- * @buf: Buffer in which to place long symlink data
+ * @buff: Buffer in which to place long symlink data
* @len: Length of the long symlink in bytes
*
* Returns the number of bytes read
@@ -128,14 +128,14 @@ befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff,
befs_off_t bytes_read = 0; /* bytes readed */
u16 plen;
struct buffer_head *bh = NULL;
- befs_debug(sb, "---> befs_read_lsymlink() length: %Lu", len);
+ befs_debug(sb, "---> %s length: %llu", __func__, len);
while (bytes_read < len) {
bh = befs_read_datastream(sb, ds, bytes_read, NULL);
if (!bh) {
befs_error(sb, "BeFS: Error reading datastream block "
- "starting from %Lu", bytes_read);
- befs_debug(sb, "<--- befs_read_lsymlink() ERROR");
+ "starting from %llu", bytes_read);
+ befs_debug(sb, "<--- %s ERROR", __func__);
return bytes_read;
}
@@ -146,7 +146,8 @@ befs_read_lsymlink(struct super_block * sb, befs_data_stream * ds, void *buff,
bytes_read += plen;
}
- befs_debug(sb, "<--- befs_read_lsymlink() read %u bytes", bytes_read);
+ befs_debug(sb, "<--- %s read %u bytes", __func__, (unsigned int)
+ bytes_read);
return bytes_read;
}
@@ -169,7 +170,7 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds)
befs_blocknr_t metablocks; /* FS metadata blocks */
befs_sb_info *befs_sb = BEFS_SB(sb);
- befs_debug(sb, "---> befs_count_blocks()");
+ befs_debug(sb, "---> %s", __func__);
datablocks = ds->size >> befs_sb->block_shift;
if (ds->size & (befs_sb->block_size - 1))
@@ -206,7 +207,7 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds)
}
blocks = datablocks + metablocks;
- befs_debug(sb, "<--- befs_count_blocks() %u blocks", blocks);
+ befs_debug(sb, "<--- %s %u blocks", __func__, (unsigned int)blocks);
return blocks;
}
@@ -251,11 +252,11 @@ befs_find_brun_direct(struct super_block *sb, befs_data_stream * data,
befs_blocknr_t max_block =
data->max_direct_range >> BEFS_SB(sb)->block_shift;
- befs_debug(sb, "---> befs_find_brun_direct(), find %lu", blockno);
+ befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno);
if (blockno > max_block) {
- befs_error(sb, "befs_find_brun_direct() passed block outside of"
- "direct region");
+ befs_error(sb, "%s passed block outside of direct region",
+ __func__);
return BEFS_ERR;
}
@@ -267,13 +268,14 @@ befs_find_brun_direct(struct super_block *sb, befs_data_stream * data,
run->start = array[i].start + offset;
run->len = array[i].len - offset;
- befs_debug(sb, "---> befs_find_brun_direct(), "
- "found %lu at direct[%d]", blockno, i);
+ befs_debug(sb, "---> %s, "
+ "found %lu at direct[%d]", __func__,
+ (unsigned long)blockno, i);
return BEFS_OK;
}
}
- befs_debug(sb, "---> befs_find_brun_direct() ERROR");
+ befs_debug(sb, "---> %s ERROR", __func__);
return BEFS_ERR;
}
@@ -316,7 +318,7 @@ befs_find_brun_indirect(struct super_block *sb,
befs_blocknr_t indirblockno = iaddr2blockno(sb, &indirect);
int arraylen = befs_iaddrs_per_block(sb);
- befs_debug(sb, "---> befs_find_brun_indirect(), find %lu", blockno);
+ befs_debug(sb, "---> %s, find %lu", __func__, (unsigned long)blockno);
indir_start_blk = data->max_direct_range >> BEFS_SB(sb)->block_shift;
search_blk = blockno - indir_start_blk;
@@ -325,10 +327,9 @@ befs_find_brun_indirect(struct super_block *sb,
for (i = 0; i < indirect.len; i++) {
indirblock = befs_bread(sb, indirblockno + i);
if (indirblock == NULL) {
- befs_debug(sb,
- "---> befs_find_brun_indirect() failed to "
- "read disk block %lu from the indirect brun",
- indirblockno + i);
+ befs_debug(sb, "---> %s failed to read "
+ "disk block %lu from the indirect brun",
+ __func__, (unsigned long)indirblockno + i);
return BEFS_ERR;
}
@@ -348,9 +349,10 @@ befs_find_brun_indirect(struct super_block *sb,
brelse(indirblock);
befs_debug(sb,
- "<--- befs_find_brun_indirect() found "
- "file block %lu at indirect[%d]",
- blockno, j + (i * arraylen));
+ "<--- %s found file block "
+ "%lu at indirect[%d]", __func__,
+ (unsigned long)blockno,
+ j + (i * arraylen));
return BEFS_OK;
}
sum += len;
@@ -360,10 +362,10 @@ befs_find_brun_indirect(struct super_block *sb,
}
/* Only fallthrough is an error */
- befs_error(sb, "BeFS: befs_find_brun_indirect() failed to find "
- "file block %lu", blockno);
+ befs_error(sb, "BeFS: %s failed to find "
+ "file block %lu", __func__, (unsigned long)blockno);
- befs_debug(sb, "<--- befs_find_brun_indirect() ERROR");
+ befs_debug(sb, "<--- %s ERROR", __func__);
return BEFS_ERR;
}
@@ -444,7 +446,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
size_t diblklen = iblklen * befs_iaddrs_per_block(sb)
* BEFS_DBLINDIR_BRUN_LEN;
- befs_debug(sb, "---> befs_find_brun_dblindirect() find %lu", blockno);
+ befs_debug(sb, "---> %s find %lu", __func__, (unsigned long)blockno);
/* First, discover which of the double_indir->indir blocks
* contains pos. Then figure out how much of pos that
@@ -460,8 +462,9 @@ befs_find_brun_dblindirect(struct super_block *sb,
dbl_which_block = dblindir_indx / befs_iaddrs_per_block(sb);
if (dbl_which_block > data->double_indirect.len) {
befs_error(sb, "The double-indirect index calculated by "
- "befs_read_brun_dblindirect(), %d, is outside the range "
- "of the double-indirect block", dblindir_indx);
+ "%s, %d, is outside the range "
+ "of the double-indirect block", __func__,
+ dblindir_indx);
return BEFS_ERR;
}
@@ -469,10 +472,10 @@ befs_find_brun_dblindirect(struct super_block *sb,
befs_bread(sb, iaddr2blockno(sb, &data->double_indirect) +
dbl_which_block);
if (dbl_indir_block == NULL) {
- befs_error(sb, "befs_read_brun_dblindirect() couldn't read the "
- "double-indirect block at blockno %lu",
- iaddr2blockno(sb,
- &data->double_indirect) +
+ befs_error(sb, "%s couldn't read the "
+ "double-indirect block at blockno %lu", __func__,
+ (unsigned long)
+ iaddr2blockno(sb, &data->double_indirect) +
dbl_which_block);
brelse(dbl_indir_block);
return BEFS_ERR;
@@ -489,16 +492,16 @@ befs_find_brun_dblindirect(struct super_block *sb,
which_block = indir_indx / befs_iaddrs_per_block(sb);
if (which_block > indir_run.len) {
befs_error(sb, "The indirect index calculated by "
- "befs_read_brun_dblindirect(), %d, is outside the range "
- "of the indirect block", indir_indx);
+ "%s, %d, is outside the range "
+ "of the indirect block", __func__, indir_indx);
return BEFS_ERR;
}
indir_block =
befs_bread(sb, iaddr2blockno(sb, &indir_run) + which_block);
if (indir_block == NULL) {
- befs_error(sb, "befs_read_brun_dblindirect() couldn't read the "
- "indirect block at blockno %lu",
+ befs_error(sb, "%s couldn't read the indirect block "
+ "at blockno %lu", __func__, (unsigned long)
iaddr2blockno(sb, &indir_run) + which_block);
brelse(indir_block);
return BEFS_ERR;
@@ -519,7 +522,7 @@ befs_find_brun_dblindirect(struct super_block *sb,
run->len -= offset;
befs_debug(sb, "Found file block %lu in double_indirect[%d][%d],"
- " double_indirect_leftover = %lu",
+ " double_indirect_leftover = %lu", (unsigned long)
blockno, dblindir_indx, indir_indx, dblindir_leftover);
return BEFS_OK;
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index 622e73775c8..4de7cffcd66 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -10,6 +10,7 @@
* debug functions
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#ifdef __KERNEL__
#include <stdarg.h>
@@ -23,43 +24,30 @@
#include "befs.h"
-#define ERRBUFSIZE 1024
-
void
befs_error(const struct super_block *sb, const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
- char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL);
- if (err_buf == NULL) {
- printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE);
- return;
- }
va_start(args, fmt);
- vsnprintf(err_buf, ERRBUFSIZE, fmt, args);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ pr_err("(%s): %pV\n", sb->s_id, &vaf);
va_end(args);
-
- printk(KERN_ERR "BeFS(%s): %s\n", sb->s_id, err_buf);
- kfree(err_buf);
}
void
befs_warning(const struct super_block *sb, const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
- char *err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL);
- if (err_buf == NULL) {
- printk(KERN_ERR "could not allocate %d bytes\n", ERRBUFSIZE);
- return;
- }
va_start(args, fmt);
- vsnprintf(err_buf, ERRBUFSIZE, fmt, args);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ pr_warn("(%s): %pV\n", sb->s_id, &vaf);
va_end(args);
-
- printk(KERN_WARNING "BeFS(%s): %s\n", sb->s_id, err_buf);
-
- kfree(err_buf);
}
void
@@ -67,25 +55,13 @@ befs_debug(const struct super_block *sb, const char *fmt, ...)
{
#ifdef CONFIG_BEFS_DEBUG
+ struct va_format vaf;
va_list args;
- char *err_buf = NULL;
-
- if (BEFS_SB(sb)->mount_opts.debug) {
- err_buf = kmalloc(ERRBUFSIZE, GFP_KERNEL);
- if (err_buf == NULL) {
- printk(KERN_ERR "could not allocate %d bytes\n",
- ERRBUFSIZE);
- return;
- }
-
- va_start(args, fmt);
- vsnprintf(err_buf, ERRBUFSIZE, fmt, args);
- va_end(args);
-
- printk(KERN_DEBUG "BeFS(%s): %s\n", sb->s_id, err_buf);
-
- kfree(err_buf);
- }
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ pr_debug("(%s): %pV\n", sb->s_id, &vaf);
+ va_end(args);
#endif //CONFIG_BEFS_DEBUG
}
@@ -109,9 +85,9 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
befs_debug(sb, " gid %u", fs32_to_cpu(sb, inode->gid));
befs_debug(sb, " mode %08x", fs32_to_cpu(sb, inode->mode));
befs_debug(sb, " flags %08x", fs32_to_cpu(sb, inode->flags));
- befs_debug(sb, " create_time %Lu",
+ befs_debug(sb, " create_time %llu",
fs64_to_cpu(sb, inode->create_time));
- befs_debug(sb, " last_modified_time %Lu",
+ befs_debug(sb, " last_modified_time %llu",
fs64_to_cpu(sb, inode->last_modified_time));
tmp_run = fsrun_to_cpu(sb, inode->parent);
@@ -137,7 +113,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
tmp_run.allocation_group, tmp_run.start,
tmp_run.len);
}
- befs_debug(sb, " max_direct_range %Lu",
+ befs_debug(sb, " max_direct_range %llu",
fs64_to_cpu(sb,
inode->data.datastream.
max_direct_range));
@@ -147,7 +123,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
tmp_run.allocation_group,
tmp_run.start, tmp_run.len);
- befs_debug(sb, " max_indirect_range %Lu",
+ befs_debug(sb, " max_indirect_range %llu",
fs64_to_cpu(sb,
inode->data.datastream.
max_indirect_range));
@@ -158,12 +134,12 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
tmp_run.allocation_group, tmp_run.start,
tmp_run.len);
- befs_debug(sb, " max_double_indirect_range %Lu",
+ befs_debug(sb, " max_double_indirect_range %llu",
fs64_to_cpu(sb,
inode->data.datastream.
max_double_indirect_range));
- befs_debug(sb, " size %Lu",
+ befs_debug(sb, " size %llu",
fs64_to_cpu(sb, inode->data.datastream.size));
}
@@ -191,8 +167,8 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup)
befs_debug(sb, " block_size %u", fs32_to_cpu(sb, sup->block_size));
befs_debug(sb, " block_shift %u", fs32_to_cpu(sb, sup->block_shift));
- befs_debug(sb, " num_blocks %Lu", fs64_to_cpu(sb, sup->num_blocks));
- befs_debug(sb, " used_blocks %Lu", fs64_to_cpu(sb, sup->used_blocks));
+ befs_debug(sb, " num_blocks %llu", fs64_to_cpu(sb, sup->num_blocks));
+ befs_debug(sb, " used_blocks %llu", fs64_to_cpu(sb, sup->used_blocks));
befs_debug(sb, " magic2 %08x", fs32_to_cpu(sb, sup->magic2));
befs_debug(sb, " blocks_per_ag %u",
@@ -206,8 +182,8 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup)
befs_debug(sb, " log_blocks %u, %hu, %hu",
tmp_run.allocation_group, tmp_run.start, tmp_run.len);
- befs_debug(sb, " log_start %Ld", fs64_to_cpu(sb, sup->log_start));
- befs_debug(sb, " log_end %Ld", fs64_to_cpu(sb, sup->log_end));
+ befs_debug(sb, " log_start %lld", fs64_to_cpu(sb, sup->log_start));
+ befs_debug(sb, " log_end %lld", fs64_to_cpu(sb, sup->log_end));
befs_debug(sb, " magic3 %08x", fs32_to_cpu(sb, sup->magic3));
diff --git a/fs/befs/inode.c b/fs/befs/inode.c
index 94c17f9a957..fa4b718de59 100644
--- a/fs/befs/inode.c
+++ b/fs/befs/inode.c
@@ -25,7 +25,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
/* check magic header. */
if (magic1 != BEFS_INODE_MAGIC1) {
befs_error(sb,
- "Inode has a bad magic header - inode = %lu", inode);
+ "Inode has a bad magic header - inode = %lu",
+ (unsigned long)inode);
return BEFS_BAD_INODE;
}
@@ -34,8 +35,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
*/
if (inode != iaddr2blockno(sb, &ino_num)) {
befs_error(sb, "inode blocknr field disagrees with vfs "
- "VFS: %lu, Inode %lu",
- inode, iaddr2blockno(sb, &ino_num));
+ "VFS: %lu, Inode %lu", (unsigned long)
+ inode, (unsigned long)iaddr2blockno(sb, &ino_num));
return BEFS_BAD_INODE;
}
@@ -44,7 +45,8 @@ befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
*/
if (!(flags & BEFS_INODE_IN_USE)) {
- befs_error(sb, "inode is not used - inode = %lu", inode);
+ befs_error(sb, "inode is not used - inode = %lu",
+ (unsigned long)inode);
return BEFS_BAD_INODE;
}
diff --git a/fs/befs/io.c b/fs/befs/io.c
index ddef98aa255..0408a3d601d 100644
--- a/fs/befs/io.c
+++ b/fs/befs/io.c
@@ -30,9 +30,9 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
befs_blocknr_t block = 0;
befs_sb_info *befs_sb = BEFS_SB(sb);
- befs_debug(sb, "---> Enter befs_read_iaddr() "
- "[%u, %hu, %hu]",
- iaddr.allocation_group, iaddr.start, iaddr.len);
+ befs_debug(sb, "---> Enter %s "
+ "[%u, %hu, %hu]", __func__, iaddr.allocation_group,
+ iaddr.start, iaddr.len);
if (iaddr.allocation_group > befs_sb->num_ags) {
befs_error(sb, "BEFS: Invalid allocation group %u, max is %u",
@@ -42,20 +42,21 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
block = iaddr2blockno(sb, &iaddr);
- befs_debug(sb, "befs_read_iaddr: offset = %lu", block);
+ befs_debug(sb, "%s: offset = %lu", __func__, (unsigned long)block);
bh = sb_bread(sb, block);
if (bh == NULL) {
- befs_error(sb, "Failed to read block %lu", block);
+ befs_error(sb, "Failed to read block %lu",
+ (unsigned long)block);
goto error;
}
- befs_debug(sb, "<--- befs_read_iaddr()");
+ befs_debug(sb, "<--- %s", __func__);
return bh;
error:
- befs_debug(sb, "<--- befs_read_iaddr() ERROR");
+ befs_debug(sb, "<--- %s ERROR", __func__);
return NULL;
}
@@ -64,20 +65,21 @@ befs_bread(struct super_block *sb, befs_blocknr_t block)
{
struct buffer_head *bh = NULL;
- befs_debug(sb, "---> Enter befs_read() %Lu", block);
+ befs_debug(sb, "---> Enter %s %lu", __func__, (unsigned long)block);
bh = sb_bread(sb, block);
if (bh == NULL) {
- befs_error(sb, "Failed to read block %lu", block);
+ befs_error(sb, "Failed to read block %lu",
+ (unsigned long)block);
goto error;
}
- befs_debug(sb, "<--- befs_read()");
+ befs_debug(sb, "<--- %s", __func__);
return bh;
error:
- befs_debug(sb, "<--- befs_read() ERROR");
+ befs_debug(sb, "<--- %s ERROR", __func__);
return NULL;
}
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index daa15d6ba45..a16fbd4e824 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -5,6 +5,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/fs.h>
@@ -39,7 +41,6 @@ static struct dentry *befs_lookup(struct inode *, struct dentry *, unsigned int)
static struct inode *befs_iget(struct super_block *, unsigned long);
static struct inode *befs_alloc_inode(struct super_block *sb);
static void befs_destroy_inode(struct inode *inode);
-static int befs_init_inodecache(void);
static void befs_destroy_inodecache(void);
static void *befs_follow_link(struct dentry *, struct nameidata *);
static void *befs_fast_follow_link(struct dentry *, struct nameidata *);
@@ -131,26 +132,20 @@ befs_get_block(struct inode *inode, sector_t block,
ulong disk_off;
befs_debug(sb, "---> befs_get_block() for inode %lu, block %ld",
- inode->i_ino, block);
-
- if (block < 0) {
- befs_error(sb, "befs_get_block() was asked for a block "
- "number less than zero: block %ld in inode %lu",
- block, inode->i_ino);
- return -EIO;
- }
-
+ (unsigned long)inode->i_ino, (long)block);
if (create) {
befs_error(sb, "befs_get_block() was asked to write to "
- "block %ld in inode %lu", block, inode->i_ino);
+ "block %ld in inode %lu", (long)block,
+ (unsigned long)inode->i_ino);
return -EPERM;
}
res = befs_fblock2brun(sb, ds, block, &run);
if (res != BEFS_OK) {
befs_error(sb,
- "<--- befs_get_block() for inode %lu, block "
- "%ld ERROR", inode->i_ino, block);
+ "<--- %s for inode %lu, block %ld ERROR",
+ __func__, (unsigned long)inode->i_ino,
+ (long)block);
return -EFBIG;
}
@@ -158,8 +153,9 @@ befs_get_block(struct inode *inode, sector_t block,
map_bh(bh_result, inode->i_sb, disk_off);
- befs_debug(sb, "<--- befs_get_block() for inode %lu, block %ld, "
- "disk address %lu", inode->i_ino, block, disk_off);
+ befs_debug(sb, "<--- %s for inode %lu, block %ld, disk address %lu",
+ __func__, (unsigned long)inode->i_ino, (long)block,
+ (unsigned long)disk_off);
return 0;
}
@@ -176,15 +172,15 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
char *utfname;
const char *name = dentry->d_name.name;
- befs_debug(sb, "---> befs_lookup() "
- "name %s inode %ld", dentry->d_name.name, dir->i_ino);
+ befs_debug(sb, "---> %s name %s inode %ld", __func__,
+ dentry->d_name.name, dir->i_ino);
/* Convert to UTF-8 */
if (BEFS_SB(sb)->nls) {
ret =
befs_nls2utf(sb, name, strlen(name), &utfname, &utfnamelen);
if (ret < 0) {
- befs_debug(sb, "<--- befs_lookup() ERROR");
+ befs_debug(sb, "<--- %s ERROR", __func__);
return ERR_PTR(ret);
}
ret = befs_btree_find(sb, ds, utfname, &offset);
@@ -195,12 +191,12 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
}
if (ret == BEFS_BT_NOT_FOUND) {
- befs_debug(sb, "<--- befs_lookup() %s not found",
+ befs_debug(sb, "<--- %s %s not found", __func__,
dentry->d_name.name);
return ERR_PTR(-ENOENT);
} else if (ret != BEFS_OK || offset == 0) {
- befs_warning(sb, "<--- befs_lookup() Error");
+ befs_warning(sb, "<--- %s Error", __func__);
return ERR_PTR(-ENODATA);
}
@@ -210,7 +206,7 @@ befs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
d_add(dentry, inode);
- befs_debug(sb, "<--- befs_lookup()");
+ befs_debug(sb, "<--- %s", __func__);
return NULL;
}
@@ -228,26 +224,25 @@ befs_readdir(struct file *file, struct dir_context *ctx)
char keybuf[BEFS_NAME_LEN + 1];
const char *dirname = file->f_path.dentry->d_name.name;
- befs_debug(sb, "---> befs_readdir() "
- "name %s, inode %ld, ctx->pos %Ld",
- dirname, inode->i_ino, ctx->pos);
+ befs_debug(sb, "---> %s name %s, inode %ld, ctx->pos %lld",
+ __func__, dirname, inode->i_ino, ctx->pos);
more:
result = befs_btree_read(sb, ds, ctx->pos, BEFS_NAME_LEN + 1,
keybuf, &keysize, &value);
if (result == BEFS_ERR) {
- befs_debug(sb, "<--- befs_readdir() ERROR");
+ befs_debug(sb, "<--- %s ERROR", __func__);
befs_error(sb, "IO error reading %s (inode %lu)",
dirname, inode->i_ino);
return -EIO;
} else if (result == BEFS_BT_END) {
- befs_debug(sb, "<--- befs_readdir() END");
+ befs_debug(sb, "<--- %s END", __func__);
return 0;
} else if (result == BEFS_BT_EMPTY) {
- befs_debug(sb, "<--- befs_readdir() Empty directory");
+ befs_debug(sb, "<--- %s Empty directory", __func__);
return 0;
}
@@ -260,7 +255,7 @@ more:
result =
befs_utf2nls(sb, keybuf, keysize, &nlsname, &nlsnamelen);
if (result < 0) {
- befs_debug(sb, "<--- befs_readdir() ERROR");
+ befs_debug(sb, "<--- %s ERROR", __func__);
return result;
}
if (!dir_emit(ctx, nlsname, nlsnamelen,
@@ -277,7 +272,7 @@ more:
ctx->pos++;
goto more;
- befs_debug(sb, "<--- befs_readdir() pos %Ld", ctx->pos);
+ befs_debug(sb, "<--- %s pos %lld", __func__, ctx->pos);
return 0;
}
@@ -321,11 +316,11 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
struct inode *inode;
long ret = -EIO;
- befs_debug(sb, "---> befs_read_inode() " "inode = %lu", ino);
+ befs_debug(sb, "---> %s inode = %lu", __func__, ino);
inode = iget_locked(sb, ino);
- if (IS_ERR(inode))
- return inode;
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW))
return inode;
@@ -393,9 +388,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
if (S_ISLNK(inode->i_mode) && !(befs_ino->i_flags & BEFS_LONG_SYMLINK)){
inode->i_size = 0;
inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE;
- strncpy(befs_ino->i_data.symlink, raw_inode->data.symlink,
- BEFS_SYMLINK_LEN - 1);
- befs_ino->i_data.symlink[BEFS_SYMLINK_LEN - 1] = '\0';
+ strlcpy(befs_ino->i_data.symlink, raw_inode->data.symlink,
+ BEFS_SYMLINK_LEN);
} else {
int num_blks;
@@ -428,7 +422,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
}
brelse(bh);
- befs_debug(sb, "<--- befs_read_inode()");
+ befs_debug(sb, "<--- %s", __func__);
unlock_new_inode(inode);
return inode;
@@ -437,7 +431,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
unacquire_none:
iget_failed(inode);
- befs_debug(sb, "<--- befs_read_inode() - Bad inode");
+ befs_debug(sb, "<--- %s - Bad inode", __func__);
return ERR_PTR(ret);
}
@@ -445,7 +439,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
*
* Taken from NFS implementation by Al Viro.
*/
-static int
+static int __init
befs_init_inodecache(void)
{
befs_inode_cachep = kmem_cache_create("befs_inode_cache",
@@ -454,11 +448,9 @@ befs_init_inodecache(void)
SLAB_MEM_SPREAD),
init_once);
if (befs_inode_cachep == NULL) {
- printk(KERN_ERR "befs_init_inodecache: "
- "Couldn't initialize inode slabcache\n");
+ pr_err("%s: Couldn't initialize inode slabcache\n", __func__);
return -ENOMEM;
}
-
return 0;
}
@@ -544,16 +536,16 @@ befs_utf2nls(struct super_block *sb, const char *in,
*/
int maxlen = in_len + 1;
- befs_debug(sb, "---> utf2nls()");
+ befs_debug(sb, "---> %s", __func__);
if (!nls) {
- befs_error(sb, "befs_utf2nls called with no NLS table loaded");
+ befs_error(sb, "%s called with no NLS table loaded", __func__);
return -EINVAL;
}
*out = result = kmalloc(maxlen, GFP_NOFS);
if (!*out) {
- befs_error(sb, "befs_utf2nls() cannot allocate memory");
+ befs_error(sb, "%s cannot allocate memory", __func__);
*out_len = 0;
return -ENOMEM;
}
@@ -575,14 +567,14 @@ befs_utf2nls(struct super_block *sb, const char *in,
result[o] = '\0';
*out_len = o;
- befs_debug(sb, "<--- utf2nls()");
+ befs_debug(sb, "<--- %s", __func__);
return o;
conv_err:
befs_error(sb, "Name using character set %s contains a character that "
"cannot be converted to unicode.", nls->charset);
- befs_debug(sb, "<--- utf2nls()");
+ befs_debug(sb, "<--- %s", __func__);
kfree(result);
return -EILSEQ;
}
@@ -590,21 +582,21 @@ befs_utf2nls(struct super_block *sb, const char *in,
/**
* befs_nls2utf - Convert NLS string to utf8 encodeing
* @sb: Superblock
- * @src: Input string buffer in NLS format
- * @srclen: Length of input string in bytes
- * @dest: The output string in UTF-8 format
- * @destlen: Length of the output buffer
+ * @in: Input string buffer in NLS format
+ * @in_len: Length of input string in bytes
+ * @out: The output string in UTF-8 format
+ * @out_len: Length of the output buffer
*
- * Converts input string @src, which is in the format of the loaded NLS map,
+ * Converts input string @in, which is in the format of the loaded NLS map,
* into a utf8 string.
*
- * The destination string @dest is allocated by this function and the caller is
+ * The destination string @out is allocated by this function and the caller is
* responsible for freeing it with kfree()
*
- * On return, *@destlen is the length of @dest in bytes.
+ * On return, *@out_len is the length of @out in bytes.
*
* On success, the return value is the number of utf8 characters written to
- * the output buffer @dest.
+ * the output buffer @out.
*
* On Failure, a negative number coresponding to the error code is returned.
*/
@@ -623,16 +615,17 @@ befs_nls2utf(struct super_block *sb, const char *in,
* in special cases */
int maxlen = (3 * in_len) + 1;
- befs_debug(sb, "---> nls2utf()\n");
+ befs_debug(sb, "---> %s\n", __func__);
if (!nls) {
- befs_error(sb, "befs_nls2utf called with no NLS table loaded.");
+ befs_error(sb, "%s called with no NLS table loaded.",
+ __func__);
return -EINVAL;
}
*out = result = kmalloc(maxlen, GFP_NOFS);
if (!*out) {
- befs_error(sb, "befs_nls2utf() cannot allocate memory");
+ befs_error(sb, "%s cannot allocate memory", __func__);
*out_len = 0;
return -ENOMEM;
}
@@ -653,14 +646,14 @@ befs_nls2utf(struct super_block *sb, const char *in,
result[o] = '\0';
*out_len = o;
- befs_debug(sb, "<--- nls2utf()");
+ befs_debug(sb, "<--- %s", __func__);
return i;
conv_err:
befs_error(sb, "Name using charecter set %s contains a charecter that "
"cannot be converted to unicode.", nls->charset);
- befs_debug(sb, "<--- nls2utf()");
+ befs_debug(sb, "<--- %s", __func__);
kfree(result);
return -EILSEQ;
}
@@ -715,8 +708,8 @@ parse_options(char *options, befs_mount_options * opts)
if (option >= 0)
uid = make_kuid(current_user_ns(), option);
if (!uid_valid(uid)) {
- printk(KERN_ERR "BeFS: Invalid uid %d, "
- "using default\n", option);
+ pr_err("Invalid uid %d, "
+ "using default\n", option);
break;
}
opts->uid = uid;
@@ -729,8 +722,8 @@ parse_options(char *options, befs_mount_options * opts)
if (option >= 0)
gid = make_kgid(current_user_ns(), option);
if (!gid_valid(gid)) {
- printk(KERN_ERR "BeFS: Invalid gid %d, "
- "using default\n", option);
+ pr_err("Invalid gid %d, "
+ "using default\n", option);
break;
}
opts->gid = gid;
@@ -740,8 +733,8 @@ parse_options(char *options, befs_mount_options * opts)
kfree(opts->iocharset);
opts->iocharset = match_strdup(&args[0]);
if (!opts->iocharset) {
- printk(KERN_ERR "BeFS: allocation failure for "
- "iocharset string\n");
+ pr_err("allocation failure for "
+ "iocharset string\n");
return 0;
}
break;
@@ -749,8 +742,8 @@ parse_options(char *options, befs_mount_options * opts)
opts->debug = 1;
break;
default:
- printk(KERN_ERR "BeFS: Unrecognized mount option \"%s\" "
- "or missing value\n", p);
+ pr_err("Unrecognized mount option \"%s\" "
+ "or missing value\n", p);
return 0;
}
}
@@ -791,22 +784,20 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
save_mount_options(sb, data);
- sb->s_fs_info = kmalloc(sizeof (*befs_sb), GFP_KERNEL);
+ sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL);
if (sb->s_fs_info == NULL) {
- printk(KERN_ERR
- "BeFS(%s): Unable to allocate memory for private "
+ pr_err("(%s): Unable to allocate memory for private "
"portion of superblock. Bailing.\n", sb->s_id);
goto unacquire_none;
}
befs_sb = BEFS_SB(sb);
- memset(befs_sb, 0, sizeof(befs_sb_info));
if (!parse_options((char *) data, &befs_sb->mount_opts)) {
befs_error(sb, "cannot parse mount options");
goto unacquire_priv_sbp;
}
- befs_debug(sb, "---> befs_fill_super()");
+ befs_debug(sb, "---> %s", __func__);
#ifndef CONFIG_BEFS_RW
if (!(sb->s_flags & MS_RDONLY)) {
@@ -854,7 +845,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
goto unacquire_priv_sbp;
if( befs_sb->num_blocks > ~((sector_t)0) ) {
- befs_error(sb, "blocks count: %Lu "
+ befs_error(sb, "blocks count: %llu "
"is larger than the host can use",
befs_sb->num_blocks);
goto unacquire_priv_sbp;
@@ -913,6 +904,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
static int
befs_remount(struct super_block *sb, int *flags, char *data)
{
+ sync_filesystem(sb);
if (!(*flags & MS_RDONLY))
return -EINVAL;
return 0;
@@ -924,7 +916,7 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct super_block *sb = dentry->d_sb;
u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
- befs_debug(sb, "---> befs_statfs()");
+ befs_debug(sb, "---> %s", __func__);
buf->f_type = BEFS_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
@@ -937,7 +929,7 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_fsid.val[1] = (u32)(id >> 32);
buf->f_namelen = BEFS_NAME_LEN;
- befs_debug(sb, "<--- befs_statfs()");
+ befs_debug(sb, "<--- %s", __func__);
return 0;
}
@@ -963,7 +955,7 @@ init_befs_fs(void)
{
int err;
- printk(KERN_INFO "BeFS version: %s\n", BEFS_VERSION);
+ pr_info("version: %s\n", BEFS_VERSION);
err = befs_init_inodecache();
if (err)
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index ae289221833..e7f88ace1a2 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -23,10 +23,10 @@
const struct file_operations bfs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.splice_read = generic_file_splice_read,
};
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 8defc6b3f9a..7041ac35ace 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -172,7 +172,7 @@ static void bfs_evict_inode(struct inode *inode)
dprintf("ino=%08lx\n", ino);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode);
clear_inode(inode);
@@ -266,7 +266,7 @@ static void init_once(void *foo)
inode_init_once(&bi->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
bfs_inode_cachep = kmem_cache_create("bfs_inode_cache",
sizeof(struct bfs_inode_info),
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 89dec7f789a..ca0ba15a730 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -45,7 +45,6 @@ static int load_aout_library(struct file*);
*/
static int aout_core_dump(struct coredump_params *cprm)
{
- struct file *file = cprm->file;
mm_segment_t fs;
int has_dumped = 0;
void __user *dump_start;
@@ -85,10 +84,10 @@ static int aout_core_dump(struct coredump_params *cprm)
set_fs(KERNEL_DS);
/* struct user */
- if (!dump_write(file, &dump, sizeof(dump)))
+ if (!dump_emit(cprm, &dump, sizeof(dump)))
goto end_coredump;
/* Now dump all of the user data. Include malloced stuff as well */
- if (!dump_seek(cprm->file, PAGE_SIZE - sizeof(dump)))
+ if (!dump_skip(cprm, PAGE_SIZE - sizeof(dump)))
goto end_coredump;
/* now we start writing out the user space info */
set_fs(USER_DS);
@@ -96,14 +95,14 @@ static int aout_core_dump(struct coredump_params *cprm)
if (dump.u_dsize != 0) {
dump_start = START_DATA(dump);
dump_size = dump.u_dsize << PAGE_SHIFT;
- if (!dump_write(file, dump_start, dump_size))
+ if (!dump_emit(cprm, dump_start, dump_size))
goto end_coredump;
}
/* Now prepare to dump the stack area */
if (dump.u_ssize != 0) {
dump_start = START_STACK(dump);
dump_size = dump.u_ssize << PAGE_SHIFT;
- if (!dump_write(file, dump_start, dump_size))
+ if (!dump_emit(cprm, dump_start, dump_size))
goto end_coredump;
}
end_coredump:
@@ -221,7 +220,7 @@ static int load_aout_binary(struct linux_binprm * bprm)
* Requires a mmap handler. This prevents people from using a.out
* as part of an exploit attack against /proc-related vulnerabilities.
*/
- if (!bprm->file->f_op || !bprm->file->f_op->mmap)
+ if (!bprm->file->f_op->mmap)
return -ENOEXEC;
fd_offset = N_TXTOFF(ex);
@@ -374,7 +373,7 @@ static int load_aout_library(struct file *file)
* Requires a mmap handler. This prevents people from using a.out
* as part of an exploit attack against /proc-related vulnerabilities.
*/
- if (!file->f_op || !file->f_op->mmap)
+ if (!file->f_op->mmap)
goto out;
if (N_FLAGS(ex))
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 4c94a79991b..3892c1a2324 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -46,10 +46,15 @@
#endif
static int load_elf_binary(struct linux_binprm *bprm);
-static int load_elf_library(struct file *);
static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
int, int, unsigned long);
+#ifdef CONFIG_USELIB
+static int load_elf_library(struct file *);
+#else
+#define load_elf_library NULL
+#endif
+
/*
* If we don't support core dumping, then supply a NULL so we
* don't even try.
@@ -406,7 +411,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
goto out;
if (!elf_check_arch(interp_elf_ex))
goto out;
- if (!interpreter->f_op || !interpreter->f_op->mmap)
+ if (!interpreter->f_op->mmap)
goto out;
/*
@@ -543,9 +548,6 @@ out:
* libraries. There is no binary dependent code anywhere else.
*/
-#define INTERPRETER_NONE 0
-#define INTERPRETER_ELF 2
-
#ifndef STACK_RND_MASK
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
#endif
@@ -582,7 +584,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
unsigned long start_code, end_code, start_data, end_data;
unsigned long reloc_func_desc __maybe_unused = 0;
int executable_stack = EXSTACK_DEFAULT;
- unsigned long def_flags = 0;
struct pt_regs *regs = current_pt_regs();
struct {
struct elfhdr elf_ex;
@@ -607,7 +608,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
goto out;
if (!elf_check_arch(&loc->elf_ex))
goto out;
- if (!bprm->file->f_op || !bprm->file->f_op->mmap)
+ if (!bprm->file->f_op->mmap)
goto out;
/* Now read in all of the header information */
@@ -722,9 +723,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
if (retval)
goto out_free_dentry;
- /* OK, This is the point of no return */
- current->mm->def_flags = def_flags;
-
/* Do this immediately, since STACK_TOP as used in setup_arg_pages
may depend on the personality. */
SET_PERSONALITY(loc->elf_ex);
@@ -1008,6 +1006,7 @@ out_free_ph:
goto out;
}
+#ifdef CONFIG_USELIB
/* This is really simpleminded and specialized - we are loading an
a.out library that is given an ELF header. */
static int load_elf_library(struct file *file)
@@ -1028,7 +1027,7 @@ static int load_elf_library(struct file *file)
/* First of all, some simple consistency checks */
if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 ||
- !elf_check_arch(&elf_ex) || !file->f_op || !file->f_op->mmap)
+ !elf_check_arch(&elf_ex) || !file->f_op->mmap)
goto out;
/* Now read in all of the header information */
@@ -1086,6 +1085,7 @@ out_free_ph:
out:
return error;
}
+#endif /* #ifdef CONFIG_USELIB */
#ifdef CONFIG_ELF_CORE
/*
@@ -1108,6 +1108,14 @@ static bool always_dump_vma(struct vm_area_struct *vma)
/* Any vsyscall mappings? */
if (vma == get_gate_vma(vma->vm_mm))
return true;
+
+ /*
+ * Assume that all vmas with a .name op should always be dumped.
+ * If this changes, a new vm_ops field can easily be added.
+ */
+ if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma))
+ return true;
+
/*
* arch_vma_name() returns non-NULL for special architecture mappings,
* such as vDSO sections.
@@ -1225,35 +1233,17 @@ static int notesize(struct memelfnote *en)
return sz;
}
-#define DUMP_WRITE(addr, nr, foffset) \
- do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0)
-
-static int alignfile(struct file *file, loff_t *foffset)
-{
- static const char buf[4] = { 0, };
- DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset);
- return 1;
-}
-
-static int writenote(struct memelfnote *men, struct file *file,
- loff_t *foffset)
+static int writenote(struct memelfnote *men, struct coredump_params *cprm)
{
struct elf_note en;
en.n_namesz = strlen(men->name) + 1;
en.n_descsz = men->datasz;
en.n_type = men->type;
- DUMP_WRITE(&en, sizeof(en), foffset);
- DUMP_WRITE(men->name, en.n_namesz, foffset);
- if (!alignfile(file, foffset))
- return 0;
- DUMP_WRITE(men->data, men->datasz, foffset);
- if (!alignfile(file, foffset))
- return 0;
-
- return 1;
+ return dump_emit(cprm, &en, sizeof(en)) &&
+ dump_emit(cprm, men->name, en.n_namesz) && dump_align(cprm, 4) &&
+ dump_emit(cprm, men->data, men->datasz) && dump_align(cprm, 4);
}
-#undef DUMP_WRITE
static void fill_elf_header(struct elfhdr *elf, int segs,
u16 machine, u32 flags)
@@ -1392,7 +1382,7 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
}
static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
- siginfo_t *siginfo)
+ const siginfo_t *siginfo)
{
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
@@ -1599,7 +1589,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- siginfo_t *siginfo, struct pt_regs *regs)
+ const siginfo_t *siginfo, struct pt_regs *regs)
{
struct task_struct *dump_task = current;
const struct user_regset_view *view = task_user_regset_view(dump_task);
@@ -1702,33 +1692,33 @@ static size_t get_note_info_size(struct elf_note_info *info)
* process-wide notes are interleaved after the first thread-specific note.
*/
static int write_note_info(struct elf_note_info *info,
- struct file *file, loff_t *foffset)
+ struct coredump_params *cprm)
{
- bool first = 1;
+ bool first = true;
struct elf_thread_core_info *t = info->thread;
do {
int i;
- if (!writenote(&t->notes[0], file, foffset))
+ if (!writenote(&t->notes[0], cprm))
return 0;
- if (first && !writenote(&info->psinfo, file, foffset))
+ if (first && !writenote(&info->psinfo, cprm))
return 0;
- if (first && !writenote(&info->signote, file, foffset))
+ if (first && !writenote(&info->signote, cprm))
return 0;
- if (first && !writenote(&info->auxv, file, foffset))
+ if (first && !writenote(&info->auxv, cprm))
return 0;
if (first && info->files.data &&
- !writenote(&info->files, file, foffset))
+ !writenote(&info->files, cprm))
return 0;
for (i = 1; i < info->thread_notes; ++i)
if (t->notes[i].data &&
- !writenote(&t->notes[i], file, foffset))
+ !writenote(&t->notes[i], cprm))
return 0;
- first = 0;
+ first = false;
t = t->next;
} while (t);
@@ -1848,34 +1838,31 @@ static int elf_note_info_init(struct elf_note_info *info)
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- siginfo_t *siginfo, struct pt_regs *regs)
+ const siginfo_t *siginfo, struct pt_regs *regs)
{
struct list_head *t;
+ struct core_thread *ct;
+ struct elf_thread_status *ets;
if (!elf_note_info_init(info))
return 0;
- if (siginfo->si_signo) {
- struct core_thread *ct;
- struct elf_thread_status *ets;
-
- for (ct = current->mm->core_state->dumper.next;
- ct; ct = ct->next) {
- ets = kzalloc(sizeof(*ets), GFP_KERNEL);
- if (!ets)
- return 0;
+ for (ct = current->mm->core_state->dumper.next;
+ ct; ct = ct->next) {
+ ets = kzalloc(sizeof(*ets), GFP_KERNEL);
+ if (!ets)
+ return 0;
- ets->thread = ct->task;
- list_add(&ets->list, &info->thread_list);
- }
+ ets->thread = ct->task;
+ list_add(&ets->list, &info->thread_list);
+ }
- list_for_each(t, &info->thread_list) {
- int sz;
+ list_for_each(t, &info->thread_list) {
+ int sz;
- ets = list_entry(t, struct elf_thread_status, list);
- sz = elf_dump_thread_status(siginfo->si_signo, ets);
- info->thread_status_size += sz;
- }
+ ets = list_entry(t, struct elf_thread_status, list);
+ sz = elf_dump_thread_status(siginfo->si_signo, ets);
+ info->thread_status_size += sz;
}
/* now collect the dump for the current */
memset(info->prstatus, 0, sizeof(*info->prstatus));
@@ -1935,13 +1922,13 @@ static size_t get_note_info_size(struct elf_note_info *info)
}
static int write_note_info(struct elf_note_info *info,
- struct file *file, loff_t *foffset)
+ struct coredump_params *cprm)
{
int i;
struct list_head *t;
for (i = 0; i < info->numnote; i++)
- if (!writenote(info->notes + i, file, foffset))
+ if (!writenote(info->notes + i, cprm))
return 0;
/* write out the thread status notes section */
@@ -1950,7 +1937,7 @@ static int write_note_info(struct elf_note_info *info,
list_entry(t, struct elf_thread_status, list);
for (i = 0; i < tmp->num_notes; i++)
- if (!writenote(&tmp->notes[i], file, foffset))
+ if (!writenote(&tmp->notes[i], cprm))
return 0;
}
@@ -2046,10 +2033,9 @@ static int elf_core_dump(struct coredump_params *cprm)
int has_dumped = 0;
mm_segment_t fs;
int segs;
- size_t size = 0;
struct vm_area_struct *vma, *gate_vma;
struct elfhdr *elf = NULL;
- loff_t offset = 0, dataoff, foffset;
+ loff_t offset = 0, dataoff;
struct elf_note_info info = { };
struct elf_phdr *phdr4note = NULL;
struct elf_shdr *shdr4extnum = NULL;
@@ -2105,7 +2091,6 @@ static int elf_core_dump(struct coredump_params *cprm)
offset += sizeof(*elf); /* Elf header */
offset += segs * sizeof(struct elf_phdr); /* Program headers */
- foffset = offset;
/* Write notes phdr entry */
{
@@ -2136,13 +2121,10 @@ static int elf_core_dump(struct coredump_params *cprm)
offset = dataoff;
- size += sizeof(*elf);
- if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf)))
+ if (!dump_emit(cprm, elf, sizeof(*elf)))
goto end_coredump;
- size += sizeof(*phdr4note);
- if (size > cprm->limit
- || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note)))
+ if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note)))
goto end_coredump;
/* Write program headers for segments dump */
@@ -2164,24 +2146,22 @@ static int elf_core_dump(struct coredump_params *cprm)
phdr.p_flags |= PF_X;
phdr.p_align = ELF_EXEC_PAGESIZE;
- size += sizeof(phdr);
- if (size > cprm->limit
- || !dump_write(cprm->file, &phdr, sizeof(phdr)))
+ if (!dump_emit(cprm, &phdr, sizeof(phdr)))
goto end_coredump;
}
- if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit))
+ if (!elf_core_write_extra_phdrs(cprm, offset))
goto end_coredump;
/* write out the notes section */
- if (!write_note_info(&info, cprm->file, &foffset))
+ if (!write_note_info(&info, cprm))
goto end_coredump;
- if (elf_coredump_extra_notes_write(cprm->file, &foffset))
+ if (elf_coredump_extra_notes_write(cprm))
goto end_coredump;
/* Align to page */
- if (!dump_seek(cprm->file, dataoff - foffset))
+ if (!dump_skip(cprm, dataoff - cprm->written))
goto end_coredump;
for (vma = first_vma(current, gate_vma); vma != NULL;
@@ -2198,26 +2178,21 @@ static int elf_core_dump(struct coredump_params *cprm)
page = get_dump_page(addr);
if (page) {
void *kaddr = kmap(page);
- stop = ((size += PAGE_SIZE) > cprm->limit) ||
- !dump_write(cprm->file, kaddr,
- PAGE_SIZE);
+ stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
kunmap(page);
page_cache_release(page);
} else
- stop = !dump_seek(cprm->file, PAGE_SIZE);
+ stop = !dump_skip(cprm, PAGE_SIZE);
if (stop)
goto end_coredump;
}
}
- if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit))
+ if (!elf_core_write_extra_data(cprm))
goto end_coredump;
if (e_phnum == PN_XNUM) {
- size += sizeof(*shdr4extnum);
- if (size > cprm->limit
- || !dump_write(cprm->file, shdr4extnum,
- sizeof(*shdr4extnum)))
+ if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum)))
goto end_coredump;
}
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index c166f325a18..fe2a643ee00 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -111,7 +111,7 @@ static int is_elf_fdpic(struct elfhdr *hdr, struct file *file)
return 0;
if (!elf_check_arch(hdr) || !elf_check_fdpic(hdr))
return 0;
- if (!file->f_op || !file->f_op->mmap)
+ if (!file->f_op->mmap)
return 0;
return 1;
}
@@ -1267,35 +1267,17 @@ static int notesize(struct memelfnote *en)
/* #define DEBUG */
-#define DUMP_WRITE(addr, nr, foffset) \
- do { if (!dump_write(file, (addr), (nr))) return 0; *foffset += (nr); } while(0)
-
-static int alignfile(struct file *file, loff_t *foffset)
-{
- static const char buf[4] = { 0, };
- DUMP_WRITE(buf, roundup(*foffset, 4) - *foffset, foffset);
- return 1;
-}
-
-static int writenote(struct memelfnote *men, struct file *file,
- loff_t *foffset)
+static int writenote(struct memelfnote *men, struct coredump_params *cprm)
{
struct elf_note en;
en.n_namesz = strlen(men->name) + 1;
en.n_descsz = men->datasz;
en.n_type = men->type;
- DUMP_WRITE(&en, sizeof(en), foffset);
- DUMP_WRITE(men->name, en.n_namesz, foffset);
- if (!alignfile(file, foffset))
- return 0;
- DUMP_WRITE(men->data, men->datasz, foffset);
- if (!alignfile(file, foffset))
- return 0;
-
- return 1;
+ return dump_emit(cprm, &en, sizeof(en)) &&
+ dump_emit(cprm, men->name, en.n_namesz) && dump_align(cprm, 4) &&
+ dump_emit(cprm, men->data, men->datasz) && dump_align(cprm, 4);
}
-#undef DUMP_WRITE
static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs)
{
@@ -1500,66 +1482,40 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum,
/*
* dump the segments for an MMU process
*/
-#ifdef CONFIG_MMU
-static int elf_fdpic_dump_segments(struct file *file, size_t *size,
- unsigned long *limit, unsigned long mm_flags)
+static bool elf_fdpic_dump_segments(struct coredump_params *cprm)
{
struct vm_area_struct *vma;
- int err = 0;
for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
unsigned long addr;
- if (!maydump(vma, mm_flags))
+ if (!maydump(vma, cprm->mm_flags))
continue;
+#ifdef CONFIG_MMU
for (addr = vma->vm_start; addr < vma->vm_end;
addr += PAGE_SIZE) {
+ bool res;
struct page *page = get_dump_page(addr);
if (page) {
void *kaddr = kmap(page);
- *size += PAGE_SIZE;
- if (*size > *limit)
- err = -EFBIG;
- else if (!dump_write(file, kaddr, PAGE_SIZE))
- err = -EIO;
+ res = dump_emit(cprm, kaddr, PAGE_SIZE);
kunmap(page);
page_cache_release(page);
- } else if (!dump_seek(file, PAGE_SIZE))
- err = -EFBIG;
- if (err)
- goto out;
+ } else {
+ res = dump_skip(cprm, PAGE_SIZE);
+ }
+ if (!res)
+ return false;
}
- }
-out:
- return err;
-}
-#endif
-
-/*
- * dump the segments for a NOMMU process
- */
-#ifndef CONFIG_MMU
-static int elf_fdpic_dump_segments(struct file *file, size_t *size,
- unsigned long *limit, unsigned long mm_flags)
-{
- struct vm_area_struct *vma;
-
- for (vma = current->mm->mmap; vma; vma = vma->vm_next) {
- if (!maydump(vma, mm_flags))
- continue;
-
- if ((*size += PAGE_SIZE) > *limit)
- return -EFBIG;
-
- if (!dump_write(file, (void *) vma->vm_start,
+#else
+ if (!dump_emit(cprm, (void *) vma->vm_start,
vma->vm_end - vma->vm_start))
- return -EIO;
+ return false;
+#endif
}
-
- return 0;
+ return true;
}
-#endif
static size_t elf_core_vma_data_size(unsigned long mm_flags)
{
@@ -1585,11 +1541,10 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
int has_dumped = 0;
mm_segment_t fs;
int segs;
- size_t size = 0;
int i;
struct vm_area_struct *vma;
struct elfhdr *elf = NULL;
- loff_t offset = 0, dataoff, foffset;
+ loff_t offset = 0, dataoff;
int numnote;
struct memelfnote *notes = NULL;
struct elf_prstatus *prstatus = NULL; /* NT_PRSTATUS */
@@ -1606,6 +1561,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
struct elf_shdr *shdr4extnum = NULL;
Elf_Half e_phnum;
elf_addr_t e_shoff;
+ struct core_thread *ct;
+ struct elf_thread_status *tmp;
/*
* We no longer stop all VM operations.
@@ -1641,28 +1598,23 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
goto cleanup;
#endif
- if (cprm->siginfo->si_signo) {
- struct core_thread *ct;
- struct elf_thread_status *tmp;
-
- for (ct = current->mm->core_state->dumper.next;
- ct; ct = ct->next) {
- tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
- if (!tmp)
- goto cleanup;
+ for (ct = current->mm->core_state->dumper.next;
+ ct; ct = ct->next) {
+ tmp = kzalloc(sizeof(*tmp), GFP_KERNEL);
+ if (!tmp)
+ goto cleanup;
- tmp->thread = ct->task;
- list_add(&tmp->list, &thread_list);
- }
+ tmp->thread = ct->task;
+ list_add(&tmp->list, &thread_list);
+ }
- list_for_each(t, &thread_list) {
- struct elf_thread_status *tmp;
- int sz;
+ list_for_each(t, &thread_list) {
+ struct elf_thread_status *tmp;
+ int sz;
- tmp = list_entry(t, struct elf_thread_status, list);
- sz = elf_dump_thread_status(cprm->siginfo->si_signo, tmp);
- thread_status_size += sz;
- }
+ tmp = list_entry(t, struct elf_thread_status, list);
+ sz = elf_dump_thread_status(cprm->siginfo->si_signo, tmp);
+ thread_status_size += sz;
}
/* now collect the dump for the current */
@@ -1720,7 +1672,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
offset += sizeof(*elf); /* Elf header */
offset += segs * sizeof(struct elf_phdr); /* Program headers */
- foffset = offset;
/* Write notes phdr entry */
{
@@ -1755,13 +1706,10 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
offset = dataoff;
- size += sizeof(*elf);
- if (size > cprm->limit || !dump_write(cprm->file, elf, sizeof(*elf)))
+ if (!dump_emit(cprm, elf, sizeof(*elf)))
goto end_coredump;
- size += sizeof(*phdr4note);
- if (size > cprm->limit
- || !dump_write(cprm->file, phdr4note, sizeof(*phdr4note)))
+ if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note)))
goto end_coredump;
/* write program headers for segments dump */
@@ -1785,18 +1733,16 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
phdr.p_flags |= PF_X;
phdr.p_align = ELF_EXEC_PAGESIZE;
- size += sizeof(phdr);
- if (size > cprm->limit
- || !dump_write(cprm->file, &phdr, sizeof(phdr)))
+ if (!dump_emit(cprm, &phdr, sizeof(phdr)))
goto end_coredump;
}
- if (!elf_core_write_extra_phdrs(cprm->file, offset, &size, cprm->limit))
+ if (!elf_core_write_extra_phdrs(cprm, offset))
goto end_coredump;
/* write out the notes section */
for (i = 0; i < numnote; i++)
- if (!writenote(notes + i, cprm->file, &foffset))
+ if (!writenote(notes + i, cprm))
goto end_coredump;
/* write out the thread status notes section */
@@ -1805,25 +1751,21 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
list_entry(t, struct elf_thread_status, list);
for (i = 0; i < tmp->num_notes; i++)
- if (!writenote(&tmp->notes[i], cprm->file, &foffset))
+ if (!writenote(&tmp->notes[i], cprm))
goto end_coredump;
}
- if (!dump_seek(cprm->file, dataoff - foffset))
+ if (!dump_skip(cprm, dataoff - cprm->written))
goto end_coredump;
- if (elf_fdpic_dump_segments(cprm->file, &size, &cprm->limit,
- cprm->mm_flags) < 0)
+ if (!elf_fdpic_dump_segments(cprm))
goto end_coredump;
- if (!elf_core_write_extra_data(cprm->file, &size, cprm->limit))
+ if (!elf_core_write_extra_data(cprm))
goto end_coredump;
if (e_phnum == PN_XNUM) {
- size += sizeof(*shdr4extnum);
- if (size > cprm->limit
- || !dump_write(cprm->file, shdr4extnum,
- sizeof(*shdr4extnum)))
+ if (!dump_emit(cprm, shdr4extnum, sizeof(*shdr4extnum)))
goto end_coredump;
}
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 037a3e2b045..f37b08cea1f 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -38,7 +38,7 @@ static int load_em86(struct linux_binprm *bprm)
/* First of all, some simple consistency checks */
if ((elf_ex.e_type != ET_EXEC && elf_ex.e_type != ET_DYN) ||
(!((elf_ex.e_machine == EM_386) || (elf_ex.e_machine == EM_486))) ||
- (!bprm->file->f_op || !bprm->file->f_op->mmap)) {
+ !bprm->file->f_op->mmap) {
return -ENOEXEC;
}
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index d50bbe59da1..f723cd3a455 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -380,7 +380,7 @@ failed:
/****************************************************************************/
-void old_reloc(unsigned long rl)
+static void old_reloc(unsigned long rl)
{
#ifdef DEBUG
char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" };
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 1c740e152f3..b60500300dd 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -656,6 +656,7 @@ static ssize_t bm_status_write(struct file * file, const char __user * buffer,
mutex_unlock(&root->d_inode->i_mutex);
dput(root);
+ break;
default: return res;
}
return count;
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
deleted file mode 100644
index fc60b31453e..00000000000
--- a/fs/bio-integrity.c
+++ /dev/null
@@ -1,759 +0,0 @@
-/*
- * bio-integrity.c - bio data integrity extensions
- *
- * Copyright (C) 2007, 2008, 2009 Oracle Corporation
- * Written by: Martin K. Petersen <martin.petersen@oracle.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
- * USA.
- *
- */
-
-#include <linux/blkdev.h>
-#include <linux/mempool.h>
-#include <linux/export.h>
-#include <linux/bio.h>
-#include <linux/workqueue.h>
-#include <linux/slab.h>
-
-#define BIP_INLINE_VECS 4
-
-static struct kmem_cache *bip_slab;
-static struct workqueue_struct *kintegrityd_wq;
-
-/**
- * bio_integrity_alloc - Allocate integrity payload and attach it to bio
- * @bio: bio to attach integrity metadata to
- * @gfp_mask: Memory allocation mask
- * @nr_vecs: Number of integrity metadata scatter-gather elements
- *
- * Description: This function prepares a bio for attaching integrity
- * metadata. nr_vecs specifies the maximum number of pages containing
- * integrity metadata that can be attached.
- */
-struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
- gfp_t gfp_mask,
- unsigned int nr_vecs)
-{
- struct bio_integrity_payload *bip;
- struct bio_set *bs = bio->bi_pool;
- unsigned long idx = BIO_POOL_NONE;
- unsigned inline_vecs;
-
- if (!bs) {
- bip = kmalloc(sizeof(struct bio_integrity_payload) +
- sizeof(struct bio_vec) * nr_vecs, gfp_mask);
- inline_vecs = nr_vecs;
- } else {
- bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
- inline_vecs = BIP_INLINE_VECS;
- }
-
- if (unlikely(!bip))
- return NULL;
-
- memset(bip, 0, sizeof(*bip));
-
- if (nr_vecs > inline_vecs) {
- bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
- bs->bvec_integrity_pool);
- if (!bip->bip_vec)
- goto err;
- } else {
- bip->bip_vec = bip->bip_inline_vecs;
- }
-
- bip->bip_slab = idx;
- bip->bip_bio = bio;
- bio->bi_integrity = bip;
-
- return bip;
-err:
- mempool_free(bip, bs->bio_integrity_pool);
- return NULL;
-}
-EXPORT_SYMBOL(bio_integrity_alloc);
-
-/**
- * bio_integrity_free - Free bio integrity payload
- * @bio: bio containing bip to be freed
- *
- * Description: Used to free the integrity portion of a bio. Usually
- * called from bio_free().
- */
-void bio_integrity_free(struct bio *bio)
-{
- struct bio_integrity_payload *bip = bio->bi_integrity;
- struct bio_set *bs = bio->bi_pool;
-
- if (bip->bip_owns_buf)
- kfree(bip->bip_buf);
-
- if (bs) {
- if (bip->bip_slab != BIO_POOL_NONE)
- bvec_free(bs->bvec_integrity_pool, bip->bip_vec,
- bip->bip_slab);
-
- mempool_free(bip, bs->bio_integrity_pool);
- } else {
- kfree(bip);
- }
-
- bio->bi_integrity = NULL;
-}
-EXPORT_SYMBOL(bio_integrity_free);
-
-/**
- * bio_integrity_add_page - Attach integrity metadata
- * @bio: bio to update
- * @page: page containing integrity metadata
- * @len: number of bytes of integrity metadata in page
- * @offset: start offset within page
- *
- * Description: Attach a page containing integrity metadata to bio.
- */
-int bio_integrity_add_page(struct bio *bio, struct page *page,
- unsigned int len, unsigned int offset)
-{
- struct bio_integrity_payload *bip = bio->bi_integrity;
- struct bio_vec *iv;
-
- if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_slab)) {
- printk(KERN_ERR "%s: bip_vec full\n", __func__);
- return 0;
- }
-
- iv = bip_vec_idx(bip, bip->bip_vcnt);
- BUG_ON(iv == NULL);
-
- iv->bv_page = page;
- iv->bv_len = len;
- iv->bv_offset = offset;
- bip->bip_vcnt++;
-
- return len;
-}
-EXPORT_SYMBOL(bio_integrity_add_page);
-
-static int bdev_integrity_enabled(struct block_device *bdev, int rw)
-{
- struct blk_integrity *bi = bdev_get_integrity(bdev);
-
- if (bi == NULL)
- return 0;
-
- if (rw == READ && bi->verify_fn != NULL &&
- (bi->flags & INTEGRITY_FLAG_READ))
- return 1;
-
- if (rw == WRITE && bi->generate_fn != NULL &&
- (bi->flags & INTEGRITY_FLAG_WRITE))
- return 1;
-
- return 0;
-}
-
-/**
- * bio_integrity_enabled - Check whether integrity can be passed
- * @bio: bio to check
- *
- * Description: Determines whether bio_integrity_prep() can be called
- * on this bio or not. bio data direction and target device must be
- * set prior to calling. The functions honors the write_generate and
- * read_verify flags in sysfs.
- */
-int bio_integrity_enabled(struct bio *bio)
-{
- /* Already protected? */
- if (bio_integrity(bio))
- return 0;
-
- return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio));
-}
-EXPORT_SYMBOL(bio_integrity_enabled);
-
-/**
- * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto
- * @bi: blk_integrity profile for device
- * @sectors: Number of 512 sectors to convert
- *
- * Description: The block layer calculates everything in 512 byte
- * sectors but integrity metadata is done in terms of the hardware
- * sector size of the storage device. Convert the block layer sectors
- * to physical sectors.
- */
-static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi,
- unsigned int sectors)
-{
- /* At this point there are only 512b or 4096b DIF/EPP devices */
- if (bi->sector_size == 4096)
- return sectors >>= 3;
-
- return sectors;
-}
-
-/**
- * bio_integrity_tag_size - Retrieve integrity tag space
- * @bio: bio to inspect
- *
- * Description: Returns the maximum number of tag bytes that can be
- * attached to this bio. Filesystems can use this to determine how
- * much metadata to attach to an I/O.
- */
-unsigned int bio_integrity_tag_size(struct bio *bio)
-{
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
-
- BUG_ON(bio->bi_size == 0);
-
- return bi->tag_size * (bio->bi_size / bi->sector_size);
-}
-EXPORT_SYMBOL(bio_integrity_tag_size);
-
-int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set)
-{
- struct bio_integrity_payload *bip = bio->bi_integrity;
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
- unsigned int nr_sectors;
-
- BUG_ON(bip->bip_buf == NULL);
-
- if (bi->tag_size == 0)
- return -1;
-
- nr_sectors = bio_integrity_hw_sectors(bi,
- DIV_ROUND_UP(len, bi->tag_size));
-
- if (nr_sectors * bi->tuple_size > bip->bip_size) {
- printk(KERN_ERR "%s: tag too big for bio: %u > %u\n",
- __func__, nr_sectors * bi->tuple_size, bip->bip_size);
- return -1;
- }
-
- if (set)
- bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
- else
- bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
-
- return 0;
-}
-
-/**
- * bio_integrity_set_tag - Attach a tag buffer to a bio
- * @bio: bio to attach buffer to
- * @tag_buf: Pointer to a buffer containing tag data
- * @len: Length of the included buffer
- *
- * Description: Use this function to tag a bio by leveraging the extra
- * space provided by devices formatted with integrity protection. The
- * size of the integrity buffer must be <= to the size reported by
- * bio_integrity_tag_size().
- */
-int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len)
-{
- BUG_ON(bio_data_dir(bio) != WRITE);
-
- return bio_integrity_tag(bio, tag_buf, len, 1);
-}
-EXPORT_SYMBOL(bio_integrity_set_tag);
-
-/**
- * bio_integrity_get_tag - Retrieve a tag buffer from a bio
- * @bio: bio to retrieve buffer from
- * @tag_buf: Pointer to a buffer for the tag data
- * @len: Length of the target buffer
- *
- * Description: Use this function to retrieve the tag buffer from a
- * completed I/O. The size of the integrity buffer must be <= to the
- * size reported by bio_integrity_tag_size().
- */
-int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len)
-{
- BUG_ON(bio_data_dir(bio) != READ);
-
- return bio_integrity_tag(bio, tag_buf, len, 0);
-}
-EXPORT_SYMBOL(bio_integrity_get_tag);
-
-/**
- * bio_integrity_generate - Generate integrity metadata for a bio
- * @bio: bio to generate integrity metadata for
- *
- * Description: Generates integrity metadata for a bio by calling the
- * block device's generation callback function. The bio must have a
- * bip attached with enough room to accommodate the generated
- * integrity metadata.
- */
-static void bio_integrity_generate(struct bio *bio)
-{
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
- struct blk_integrity_exchg bix;
- struct bio_vec *bv;
- sector_t sector = bio->bi_sector;
- unsigned int i, sectors, total;
- void *prot_buf = bio->bi_integrity->bip_buf;
-
- total = 0;
- bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
- bix.sector_size = bi->sector_size;
-
- bio_for_each_segment(bv, bio, i) {
- void *kaddr = kmap_atomic(bv->bv_page);
- bix.data_buf = kaddr + bv->bv_offset;
- bix.data_size = bv->bv_len;
- bix.prot_buf = prot_buf;
- bix.sector = sector;
-
- bi->generate_fn(&bix);
-
- sectors = bv->bv_len / bi->sector_size;
- sector += sectors;
- prot_buf += sectors * bi->tuple_size;
- total += sectors * bi->tuple_size;
- BUG_ON(total > bio->bi_integrity->bip_size);
-
- kunmap_atomic(kaddr);
- }
-}
-
-static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
-{
- if (bi)
- return bi->tuple_size;
-
- return 0;
-}
-
-/**
- * bio_integrity_prep - Prepare bio for integrity I/O
- * @bio: bio to prepare
- *
- * Description: Allocates a buffer for integrity metadata, maps the
- * pages and attaches them to a bio. The bio must have data
- * direction, target device and start sector set priot to calling. In
- * the WRITE case, integrity metadata will be generated using the
- * block device's integrity function. In the READ case, the buffer
- * will be prepared for DMA and a suitable end_io handler set up.
- */
-int bio_integrity_prep(struct bio *bio)
-{
- struct bio_integrity_payload *bip;
- struct blk_integrity *bi;
- struct request_queue *q;
- void *buf;
- unsigned long start, end;
- unsigned int len, nr_pages;
- unsigned int bytes, offset, i;
- unsigned int sectors;
-
- bi = bdev_get_integrity(bio->bi_bdev);
- q = bdev_get_queue(bio->bi_bdev);
- BUG_ON(bi == NULL);
- BUG_ON(bio_integrity(bio));
-
- sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio));
-
- /* Allocate kernel buffer for protection data */
- len = sectors * blk_integrity_tuple_size(bi);
- buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
- if (unlikely(buf == NULL)) {
- printk(KERN_ERR "could not allocate integrity buffer\n");
- return -ENOMEM;
- }
-
- end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- start = ((unsigned long) buf) >> PAGE_SHIFT;
- nr_pages = end - start;
-
- /* Allocate bio integrity payload and integrity vectors */
- bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
- if (unlikely(bip == NULL)) {
- printk(KERN_ERR "could not allocate data integrity bioset\n");
- kfree(buf);
- return -EIO;
- }
-
- bip->bip_owns_buf = 1;
- bip->bip_buf = buf;
- bip->bip_size = len;
- bip->bip_sector = bio->bi_sector;
-
- /* Map it */
- offset = offset_in_page(buf);
- for (i = 0 ; i < nr_pages ; i++) {
- int ret;
- bytes = PAGE_SIZE - offset;
-
- if (len <= 0)
- break;
-
- if (bytes > len)
- bytes = len;
-
- ret = bio_integrity_add_page(bio, virt_to_page(buf),
- bytes, offset);
-
- if (ret == 0)
- return 0;
-
- if (ret < bytes)
- break;
-
- buf += bytes;
- len -= bytes;
- offset = 0;
- }
-
- /* Install custom I/O completion handler if read verify is enabled */
- if (bio_data_dir(bio) == READ) {
- bip->bip_end_io = bio->bi_end_io;
- bio->bi_end_io = bio_integrity_endio;
- }
-
- /* Auto-generate integrity metadata if this is a write */
- if (bio_data_dir(bio) == WRITE)
- bio_integrity_generate(bio);
-
- return 0;
-}
-EXPORT_SYMBOL(bio_integrity_prep);
-
-/**
- * bio_integrity_verify - Verify integrity metadata for a bio
- * @bio: bio to verify
- *
- * Description: This function is called to verify the integrity of a
- * bio. The data in the bio io_vec is compared to the integrity
- * metadata returned by the HBA.
- */
-static int bio_integrity_verify(struct bio *bio)
-{
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
- struct blk_integrity_exchg bix;
- struct bio_vec *bv;
- sector_t sector = bio->bi_integrity->bip_sector;
- unsigned int i, sectors, total, ret;
- void *prot_buf = bio->bi_integrity->bip_buf;
-
- ret = total = 0;
- bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
- bix.sector_size = bi->sector_size;
-
- bio_for_each_segment(bv, bio, i) {
- void *kaddr = kmap_atomic(bv->bv_page);
- bix.data_buf = kaddr + bv->bv_offset;
- bix.data_size = bv->bv_len;
- bix.prot_buf = prot_buf;
- bix.sector = sector;
-
- ret = bi->verify_fn(&bix);
-
- if (ret) {
- kunmap_atomic(kaddr);
- return ret;
- }
-
- sectors = bv->bv_len / bi->sector_size;
- sector += sectors;
- prot_buf += sectors * bi->tuple_size;
- total += sectors * bi->tuple_size;
- BUG_ON(total > bio->bi_integrity->bip_size);
-
- kunmap_atomic(kaddr);
- }
-
- return ret;
-}
-
-/**
- * bio_integrity_verify_fn - Integrity I/O completion worker
- * @work: Work struct stored in bio to be verified
- *
- * Description: This workqueue function is called to complete a READ
- * request. The function verifies the transferred integrity metadata
- * and then calls the original bio end_io function.
- */
-static void bio_integrity_verify_fn(struct work_struct *work)
-{
- struct bio_integrity_payload *bip =
- container_of(work, struct bio_integrity_payload, bip_work);
- struct bio *bio = bip->bip_bio;
- int error;
-
- error = bio_integrity_verify(bio);
-
- /* Restore original bio completion handler */
- bio->bi_end_io = bip->bip_end_io;
- bio_endio(bio, error);
-}
-
-/**
- * bio_integrity_endio - Integrity I/O completion function
- * @bio: Protected bio
- * @error: Pointer to errno
- *
- * Description: Completion for integrity I/O
- *
- * Normally I/O completion is done in interrupt context. However,
- * verifying I/O integrity is a time-consuming task which must be run
- * in process context. This function postpones completion
- * accordingly.
- */
-void bio_integrity_endio(struct bio *bio, int error)
-{
- struct bio_integrity_payload *bip = bio->bi_integrity;
-
- BUG_ON(bip->bip_bio != bio);
-
- /* In case of an I/O error there is no point in verifying the
- * integrity metadata. Restore original bio end_io handler
- * and run it.
- */
- if (error) {
- bio->bi_end_io = bip->bip_end_io;
- bio_endio(bio, error);
-
- return;
- }
-
- INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
- queue_work(kintegrityd_wq, &bip->bip_work);
-}
-EXPORT_SYMBOL(bio_integrity_endio);
-
-/**
- * bio_integrity_mark_head - Advance bip_vec skip bytes
- * @bip: Integrity vector to advance
- * @skip: Number of bytes to advance it
- */
-void bio_integrity_mark_head(struct bio_integrity_payload *bip,
- unsigned int skip)
-{
- struct bio_vec *iv;
- unsigned int i;
-
- bip_for_each_vec(iv, bip, i) {
- if (skip == 0) {
- bip->bip_idx = i;
- return;
- } else if (skip >= iv->bv_len) {
- skip -= iv->bv_len;
- } else { /* skip < iv->bv_len) */
- iv->bv_offset += skip;
- iv->bv_len -= skip;
- bip->bip_idx = i;
- return;
- }
- }
-}
-
-/**
- * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long
- * @bip: Integrity vector to truncate
- * @len: New length of integrity vector
- */
-void bio_integrity_mark_tail(struct bio_integrity_payload *bip,
- unsigned int len)
-{
- struct bio_vec *iv;
- unsigned int i;
-
- bip_for_each_vec(iv, bip, i) {
- if (len == 0) {
- bip->bip_vcnt = i;
- return;
- } else if (len >= iv->bv_len) {
- len -= iv->bv_len;
- } else { /* len < iv->bv_len) */
- iv->bv_len = len;
- len = 0;
- }
- }
-}
-
-/**
- * bio_integrity_advance - Advance integrity vector
- * @bio: bio whose integrity vector to update
- * @bytes_done: number of data bytes that have been completed
- *
- * Description: This function calculates how many integrity bytes the
- * number of completed data bytes correspond to and advances the
- * integrity vector accordingly.
- */
-void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
-{
- struct bio_integrity_payload *bip = bio->bi_integrity;
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
- unsigned int nr_sectors;
-
- BUG_ON(bip == NULL);
- BUG_ON(bi == NULL);
-
- nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9);
- bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size);
-}
-EXPORT_SYMBOL(bio_integrity_advance);
-
-/**
- * bio_integrity_trim - Trim integrity vector
- * @bio: bio whose integrity vector to update
- * @offset: offset to first data sector
- * @sectors: number of data sectors
- *
- * Description: Used to trim the integrity vector in a cloned bio.
- * The ivec will be advanced corresponding to 'offset' data sectors
- * and the length will be truncated corresponding to 'len' data
- * sectors.
- */
-void bio_integrity_trim(struct bio *bio, unsigned int offset,
- unsigned int sectors)
-{
- struct bio_integrity_payload *bip = bio->bi_integrity;
- struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
- unsigned int nr_sectors;
-
- BUG_ON(bip == NULL);
- BUG_ON(bi == NULL);
- BUG_ON(!bio_flagged(bio, BIO_CLONED));
-
- nr_sectors = bio_integrity_hw_sectors(bi, sectors);
- bip->bip_sector = bip->bip_sector + offset;
- bio_integrity_mark_head(bip, offset * bi->tuple_size);
- bio_integrity_mark_tail(bip, sectors * bi->tuple_size);
-}
-EXPORT_SYMBOL(bio_integrity_trim);
-
-/**
- * bio_integrity_split - Split integrity metadata
- * @bio: Protected bio
- * @bp: Resulting bio_pair
- * @sectors: Offset
- *
- * Description: Splits an integrity page into a bio_pair.
- */
-void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
-{
- struct blk_integrity *bi;
- struct bio_integrity_payload *bip = bio->bi_integrity;
- unsigned int nr_sectors;
-
- if (bio_integrity(bio) == 0)
- return;
-
- bi = bdev_get_integrity(bio->bi_bdev);
- BUG_ON(bi == NULL);
- BUG_ON(bip->bip_vcnt != 1);
-
- nr_sectors = bio_integrity_hw_sectors(bi, sectors);
-
- bp->bio1.bi_integrity = &bp->bip1;
- bp->bio2.bi_integrity = &bp->bip2;
-
- bp->iv1 = bip->bip_vec[bip->bip_idx];
- bp->iv2 = bip->bip_vec[bip->bip_idx];
-
- bp->bip1.bip_vec = &bp->iv1;
- bp->bip2.bip_vec = &bp->iv2;
-
- bp->iv1.bv_len = sectors * bi->tuple_size;
- bp->iv2.bv_offset += sectors * bi->tuple_size;
- bp->iv2.bv_len -= sectors * bi->tuple_size;
-
- bp->bip1.bip_sector = bio->bi_integrity->bip_sector;
- bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors;
-
- bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1;
- bp->bip1.bip_idx = bp->bip2.bip_idx = 0;
-}
-EXPORT_SYMBOL(bio_integrity_split);
-
-/**
- * bio_integrity_clone - Callback for cloning bios with integrity metadata
- * @bio: New bio
- * @bio_src: Original bio
- * @gfp_mask: Memory allocation mask
- *
- * Description: Called to allocate a bip when cloning a bio
- */
-int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
- gfp_t gfp_mask)
-{
- struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
- struct bio_integrity_payload *bip;
-
- BUG_ON(bip_src == NULL);
-
- bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
-
- if (bip == NULL)
- return -EIO;
-
- memcpy(bip->bip_vec, bip_src->bip_vec,
- bip_src->bip_vcnt * sizeof(struct bio_vec));
-
- bip->bip_sector = bip_src->bip_sector;
- bip->bip_vcnt = bip_src->bip_vcnt;
- bip->bip_idx = bip_src->bip_idx;
-
- return 0;
-}
-EXPORT_SYMBOL(bio_integrity_clone);
-
-int bioset_integrity_create(struct bio_set *bs, int pool_size)
-{
- if (bs->bio_integrity_pool)
- return 0;
-
- bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab);
- if (!bs->bio_integrity_pool)
- return -1;
-
- bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size);
- if (!bs->bvec_integrity_pool) {
- mempool_destroy(bs->bio_integrity_pool);
- return -1;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(bioset_integrity_create);
-
-void bioset_integrity_free(struct bio_set *bs)
-{
- if (bs->bio_integrity_pool)
- mempool_destroy(bs->bio_integrity_pool);
-
- if (bs->bvec_integrity_pool)
- mempool_destroy(bs->bvec_integrity_pool);
-}
-EXPORT_SYMBOL(bioset_integrity_free);
-
-void __init bio_integrity_init(void)
-{
- /*
- * kintegrityd won't block much but may burn a lot of CPU cycles.
- * Make it highpri CPU intensive wq with max concurrency of 1.
- */
- kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
- WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
- if (!kintegrityd_wq)
- panic("Failed to create kintegrityd\n");
-
- bip_slab = kmem_cache_create("bio_integrity_payload",
- sizeof(struct bio_integrity_payload) +
- sizeof(struct bio_vec) * BIP_INLINE_VECS,
- 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
- if (!bip_slab)
- panic("Failed to create slab\n");
-}
diff --git a/fs/bio.c b/fs/bio.c
deleted file mode 100644
index ea5035da4d9..00000000000
--- a/fs/bio.c
+++ /dev/null
@@ -1,2029 +0,0 @@
-/*
- * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public Licens
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
- *
- */
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/uio.h>
-#include <linux/iocontext.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/mempool.h>
-#include <linux/workqueue.h>
-#include <linux/cgroup.h>
-#include <scsi/sg.h> /* for struct sg_iovec */
-
-#include <trace/events/block.h>
-
-/*
- * Test patch to inline a certain number of bi_io_vec's inside the bio
- * itself, to shrink a bio data allocation from two mempool calls to one
- */
-#define BIO_INLINE_VECS 4
-
-static mempool_t *bio_split_pool __read_mostly;
-
-/*
- * if you change this list, also change bvec_alloc or things will
- * break badly! cannot be bigger than what you can fit into an
- * unsigned short
- */
-#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
-static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
- BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
-};
-#undef BV
-
-/*
- * fs_bio_set is the bio_set containing bio and iovec memory pools used by
- * IO code that does not need private memory pools.
- */
-struct bio_set *fs_bio_set;
-EXPORT_SYMBOL(fs_bio_set);
-
-/*
- * Our slab pool management
- */
-struct bio_slab {
- struct kmem_cache *slab;
- unsigned int slab_ref;
- unsigned int slab_size;
- char name[8];
-};
-static DEFINE_MUTEX(bio_slab_lock);
-static struct bio_slab *bio_slabs;
-static unsigned int bio_slab_nr, bio_slab_max;
-
-static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
-{
- unsigned int sz = sizeof(struct bio) + extra_size;
- struct kmem_cache *slab = NULL;
- struct bio_slab *bslab, *new_bio_slabs;
- unsigned int new_bio_slab_max;
- unsigned int i, entry = -1;
-
- mutex_lock(&bio_slab_lock);
-
- i = 0;
- while (i < bio_slab_nr) {
- bslab = &bio_slabs[i];
-
- if (!bslab->slab && entry == -1)
- entry = i;
- else if (bslab->slab_size == sz) {
- slab = bslab->slab;
- bslab->slab_ref++;
- break;
- }
- i++;
- }
-
- if (slab)
- goto out_unlock;
-
- if (bio_slab_nr == bio_slab_max && entry == -1) {
- new_bio_slab_max = bio_slab_max << 1;
- new_bio_slabs = krealloc(bio_slabs,
- new_bio_slab_max * sizeof(struct bio_slab),
- GFP_KERNEL);
- if (!new_bio_slabs)
- goto out_unlock;
- bio_slab_max = new_bio_slab_max;
- bio_slabs = new_bio_slabs;
- }
- if (entry == -1)
- entry = bio_slab_nr++;
-
- bslab = &bio_slabs[entry];
-
- snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
- slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
- if (!slab)
- goto out_unlock;
-
- printk(KERN_INFO "bio: create slab <%s> at %d\n", bslab->name, entry);
- bslab->slab = slab;
- bslab->slab_ref = 1;
- bslab->slab_size = sz;
-out_unlock:
- mutex_unlock(&bio_slab_lock);
- return slab;
-}
-
-static void bio_put_slab(struct bio_set *bs)
-{
- struct bio_slab *bslab = NULL;
- unsigned int i;
-
- mutex_lock(&bio_slab_lock);
-
- for (i = 0; i < bio_slab_nr; i++) {
- if (bs->bio_slab == bio_slabs[i].slab) {
- bslab = &bio_slabs[i];
- break;
- }
- }
-
- if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
- goto out;
-
- WARN_ON(!bslab->slab_ref);
-
- if (--bslab->slab_ref)
- goto out;
-
- kmem_cache_destroy(bslab->slab);
- bslab->slab = NULL;
-
-out:
- mutex_unlock(&bio_slab_lock);
-}
-
-unsigned int bvec_nr_vecs(unsigned short idx)
-{
- return bvec_slabs[idx].nr_vecs;
-}
-
-void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
-{
- BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
-
- if (idx == BIOVEC_MAX_IDX)
- mempool_free(bv, pool);
- else {
- struct biovec_slab *bvs = bvec_slabs + idx;
-
- kmem_cache_free(bvs->slab, bv);
- }
-}
-
-struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
- mempool_t *pool)
-{
- struct bio_vec *bvl;
-
- /*
- * see comment near bvec_array define!
- */
- switch (nr) {
- case 1:
- *idx = 0;
- break;
- case 2 ... 4:
- *idx = 1;
- break;
- case 5 ... 16:
- *idx = 2;
- break;
- case 17 ... 64:
- *idx = 3;
- break;
- case 65 ... 128:
- *idx = 4;
- break;
- case 129 ... BIO_MAX_PAGES:
- *idx = 5;
- break;
- default:
- return NULL;
- }
-
- /*
- * idx now points to the pool we want to allocate from. only the
- * 1-vec entry pool is mempool backed.
- */
- if (*idx == BIOVEC_MAX_IDX) {
-fallback:
- bvl = mempool_alloc(pool, gfp_mask);
- } else {
- struct biovec_slab *bvs = bvec_slabs + *idx;
- gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
-
- /*
- * Make this allocation restricted and don't dump info on
- * allocation failures, since we'll fallback to the mempool
- * in case of failure.
- */
- __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
-
- /*
- * Try a slab allocation. If this fails and __GFP_WAIT
- * is set, retry with the 1-entry mempool
- */
- bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
- if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) {
- *idx = BIOVEC_MAX_IDX;
- goto fallback;
- }
- }
-
- return bvl;
-}
-
-static void __bio_free(struct bio *bio)
-{
- bio_disassociate_task(bio);
-
- if (bio_integrity(bio))
- bio_integrity_free(bio);
-}
-
-static void bio_free(struct bio *bio)
-{
- struct bio_set *bs = bio->bi_pool;
- void *p;
-
- __bio_free(bio);
-
- if (bs) {
- if (bio_flagged(bio, BIO_OWNS_VEC))
- bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio));
-
- /*
- * If we have front padding, adjust the bio pointer before freeing
- */
- p = bio;
- p -= bs->front_pad;
-
- mempool_free(p, bs->bio_pool);
- } else {
- /* Bio was allocated by bio_kmalloc() */
- kfree(bio);
- }
-}
-
-void bio_init(struct bio *bio)
-{
- memset(bio, 0, sizeof(*bio));
- bio->bi_flags = 1 << BIO_UPTODATE;
- atomic_set(&bio->bi_cnt, 1);
-}
-EXPORT_SYMBOL(bio_init);
-
-/**
- * bio_reset - reinitialize a bio
- * @bio: bio to reset
- *
- * Description:
- * After calling bio_reset(), @bio will be in the same state as a freshly
- * allocated bio returned bio bio_alloc_bioset() - the only fields that are
- * preserved are the ones that are initialized by bio_alloc_bioset(). See
- * comment in struct bio.
- */
-void bio_reset(struct bio *bio)
-{
- unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
-
- __bio_free(bio);
-
- memset(bio, 0, BIO_RESET_BYTES);
- bio->bi_flags = flags|(1 << BIO_UPTODATE);
-}
-EXPORT_SYMBOL(bio_reset);
-
-static void bio_alloc_rescue(struct work_struct *work)
-{
- struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
- struct bio *bio;
-
- while (1) {
- spin_lock(&bs->rescue_lock);
- bio = bio_list_pop(&bs->rescue_list);
- spin_unlock(&bs->rescue_lock);
-
- if (!bio)
- break;
-
- generic_make_request(bio);
- }
-}
-
-static void punt_bios_to_rescuer(struct bio_set *bs)
-{
- struct bio_list punt, nopunt;
- struct bio *bio;
-
- /*
- * In order to guarantee forward progress we must punt only bios that
- * were allocated from this bio_set; otherwise, if there was a bio on
- * there for a stacking driver higher up in the stack, processing it
- * could require allocating bios from this bio_set, and doing that from
- * our own rescuer would be bad.
- *
- * Since bio lists are singly linked, pop them all instead of trying to
- * remove from the middle of the list:
- */
-
- bio_list_init(&punt);
- bio_list_init(&nopunt);
-
- while ((bio = bio_list_pop(current->bio_list)))
- bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
-
- *current->bio_list = nopunt;
-
- spin_lock(&bs->rescue_lock);
- bio_list_merge(&bs->rescue_list, &punt);
- spin_unlock(&bs->rescue_lock);
-
- queue_work(bs->rescue_workqueue, &bs->rescue_work);
-}
-
-/**
- * bio_alloc_bioset - allocate a bio for I/O
- * @gfp_mask: the GFP_ mask given to the slab allocator
- * @nr_iovecs: number of iovecs to pre-allocate
- * @bs: the bio_set to allocate from.
- *
- * Description:
- * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
- * backed by the @bs's mempool.
- *
- * When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be
- * able to allocate a bio. This is due to the mempool guarantees. To make this
- * work, callers must never allocate more than 1 bio at a time from this pool.
- * Callers that need to allocate more than 1 bio must always submit the
- * previously allocated bio for IO before attempting to allocate a new one.
- * Failure to do so can cause deadlocks under memory pressure.
- *
- * Note that when running under generic_make_request() (i.e. any block
- * driver), bios are not submitted until after you return - see the code in
- * generic_make_request() that converts recursion into iteration, to prevent
- * stack overflows.
- *
- * This would normally mean allocating multiple bios under
- * generic_make_request() would be susceptible to deadlocks, but we have
- * deadlock avoidance code that resubmits any blocked bios from a rescuer
- * thread.
- *
- * However, we do not guarantee forward progress for allocations from other
- * mempools. Doing multiple allocations from the same mempool under
- * generic_make_request() should be avoided - instead, use bio_set's front_pad
- * for per bio allocations.
- *
- * RETURNS:
- * Pointer to new bio on success, NULL on failure.
- */
-struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
-{
- gfp_t saved_gfp = gfp_mask;
- unsigned front_pad;
- unsigned inline_vecs;
- unsigned long idx = BIO_POOL_NONE;
- struct bio_vec *bvl = NULL;
- struct bio *bio;
- void *p;
-
- if (!bs) {
- if (nr_iovecs > UIO_MAXIOV)
- return NULL;
-
- p = kmalloc(sizeof(struct bio) +
- nr_iovecs * sizeof(struct bio_vec),
- gfp_mask);
- front_pad = 0;
- inline_vecs = nr_iovecs;
- } else {
- /*
- * generic_make_request() converts recursion to iteration; this
- * means if we're running beneath it, any bios we allocate and
- * submit will not be submitted (and thus freed) until after we
- * return.
- *
- * This exposes us to a potential deadlock if we allocate
- * multiple bios from the same bio_set() while running
- * underneath generic_make_request(). If we were to allocate
- * multiple bios (say a stacking block driver that was splitting
- * bios), we would deadlock if we exhausted the mempool's
- * reserve.
- *
- * We solve this, and guarantee forward progress, with a rescuer
- * workqueue per bio_set. If we go to allocate and there are
- * bios on current->bio_list, we first try the allocation
- * without __GFP_WAIT; if that fails, we punt those bios we
- * would be blocking to the rescuer workqueue before we retry
- * with the original gfp_flags.
- */
-
- if (current->bio_list && !bio_list_empty(current->bio_list))
- gfp_mask &= ~__GFP_WAIT;
-
- p = mempool_alloc(bs->bio_pool, gfp_mask);
- if (!p && gfp_mask != saved_gfp) {
- punt_bios_to_rescuer(bs);
- gfp_mask = saved_gfp;
- p = mempool_alloc(bs->bio_pool, gfp_mask);
- }
-
- front_pad = bs->front_pad;
- inline_vecs = BIO_INLINE_VECS;
- }
-
- if (unlikely(!p))
- return NULL;
-
- bio = p + front_pad;
- bio_init(bio);
-
- if (nr_iovecs > inline_vecs) {
- bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
- if (!bvl && gfp_mask != saved_gfp) {
- punt_bios_to_rescuer(bs);
- gfp_mask = saved_gfp;
- bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
- }
-
- if (unlikely(!bvl))
- goto err_free;
-
- bio->bi_flags |= 1 << BIO_OWNS_VEC;
- } else if (nr_iovecs) {
- bvl = bio->bi_inline_vecs;
- }
-
- bio->bi_pool = bs;
- bio->bi_flags |= idx << BIO_POOL_OFFSET;
- bio->bi_max_vecs = nr_iovecs;
- bio->bi_io_vec = bvl;
- return bio;
-
-err_free:
- mempool_free(p, bs->bio_pool);
- return NULL;
-}
-EXPORT_SYMBOL(bio_alloc_bioset);
-
-void zero_fill_bio(struct bio *bio)
-{
- unsigned long flags;
- struct bio_vec *bv;
- int i;
-
- bio_for_each_segment(bv, bio, i) {
- char *data = bvec_kmap_irq(bv, &flags);
- memset(data, 0, bv->bv_len);
- flush_dcache_page(bv->bv_page);
- bvec_kunmap_irq(data, &flags);
- }
-}
-EXPORT_SYMBOL(zero_fill_bio);
-
-/**
- * bio_put - release a reference to a bio
- * @bio: bio to release reference to
- *
- * Description:
- * Put a reference to a &struct bio, either one you have gotten with
- * bio_alloc, bio_get or bio_clone. The last put of a bio will free it.
- **/
-void bio_put(struct bio *bio)
-{
- BIO_BUG_ON(!atomic_read(&bio->bi_cnt));
-
- /*
- * last put frees it
- */
- if (atomic_dec_and_test(&bio->bi_cnt))
- bio_free(bio);
-}
-EXPORT_SYMBOL(bio_put);
-
-inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
-{
- if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
- blk_recount_segments(q, bio);
-
- return bio->bi_phys_segments;
-}
-EXPORT_SYMBOL(bio_phys_segments);
-
-/**
- * __bio_clone - clone a bio
- * @bio: destination bio
- * @bio_src: bio to clone
- *
- * Clone a &bio. Caller will own the returned bio, but not
- * the actual data it points to. Reference count of returned
- * bio will be one.
- */
-void __bio_clone(struct bio *bio, struct bio *bio_src)
-{
- memcpy(bio->bi_io_vec, bio_src->bi_io_vec,
- bio_src->bi_max_vecs * sizeof(struct bio_vec));
-
- /*
- * most users will be overriding ->bi_bdev with a new target,
- * so we don't set nor calculate new physical/hw segment counts here
- */
- bio->bi_sector = bio_src->bi_sector;
- bio->bi_bdev = bio_src->bi_bdev;
- bio->bi_flags |= 1 << BIO_CLONED;
- bio->bi_rw = bio_src->bi_rw;
- bio->bi_vcnt = bio_src->bi_vcnt;
- bio->bi_size = bio_src->bi_size;
- bio->bi_idx = bio_src->bi_idx;
-}
-EXPORT_SYMBOL(__bio_clone);
-
-/**
- * bio_clone_bioset - clone a bio
- * @bio: bio to clone
- * @gfp_mask: allocation priority
- * @bs: bio_set to allocate from
- *
- * Like __bio_clone, only also allocates the returned bio
- */
-struct bio *bio_clone_bioset(struct bio *bio, gfp_t gfp_mask,
- struct bio_set *bs)
-{
- struct bio *b;
-
- b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, bs);
- if (!b)
- return NULL;
-
- __bio_clone(b, bio);
-
- if (bio_integrity(bio)) {
- int ret;
-
- ret = bio_integrity_clone(b, bio, gfp_mask);
-
- if (ret < 0) {
- bio_put(b);
- return NULL;
- }
- }
-
- return b;
-}
-EXPORT_SYMBOL(bio_clone_bioset);
-
-/**
- * bio_get_nr_vecs - return approx number of vecs
- * @bdev: I/O target
- *
- * Return the approximate number of pages we can send to this target.
- * There's no guarantee that you will be able to fit this number of pages
- * into a bio, it does not account for dynamic restrictions that vary
- * on offset.
- */
-int bio_get_nr_vecs(struct block_device *bdev)
-{
- struct request_queue *q = bdev_get_queue(bdev);
- int nr_pages;
-
- nr_pages = min_t(unsigned,
- queue_max_segments(q),
- queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1);
-
- return min_t(unsigned, nr_pages, BIO_MAX_PAGES);
-
-}
-EXPORT_SYMBOL(bio_get_nr_vecs);
-
-static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
- *page, unsigned int len, unsigned int offset,
- unsigned short max_sectors)
-{
- int retried_segments = 0;
- struct bio_vec *bvec;
-
- /*
- * cloned bio must not modify vec list
- */
- if (unlikely(bio_flagged(bio, BIO_CLONED)))
- return 0;
-
- if (((bio->bi_size + len) >> 9) > max_sectors)
- return 0;
-
- /*
- * For filesystems with a blocksize smaller than the pagesize
- * we will often be called with the same page as last time and
- * a consecutive offset. Optimize this special case.
- */
- if (bio->bi_vcnt > 0) {
- struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
-
- if (page == prev->bv_page &&
- offset == prev->bv_offset + prev->bv_len) {
- unsigned int prev_bv_len = prev->bv_len;
- prev->bv_len += len;
-
- if (q->merge_bvec_fn) {
- struct bvec_merge_data bvm = {
- /* prev_bvec is already charged in
- bi_size, discharge it in order to
- simulate merging updated prev_bvec
- as new bvec. */
- .bi_bdev = bio->bi_bdev,
- .bi_sector = bio->bi_sector,
- .bi_size = bio->bi_size - prev_bv_len,
- .bi_rw = bio->bi_rw,
- };
-
- if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) {
- prev->bv_len -= len;
- return 0;
- }
- }
-
- goto done;
- }
- }
-
- if (bio->bi_vcnt >= bio->bi_max_vecs)
- return 0;
-
- /*
- * we might lose a segment or two here, but rather that than
- * make this too complex.
- */
-
- while (bio->bi_phys_segments >= queue_max_segments(q)) {
-
- if (retried_segments)
- return 0;
-
- retried_segments = 1;
- blk_recount_segments(q, bio);
- }
-
- /*
- * setup the new entry, we might clear it again later if we
- * cannot add the page
- */
- bvec = &bio->bi_io_vec[bio->bi_vcnt];
- bvec->bv_page = page;
- bvec->bv_len = len;
- bvec->bv_offset = offset;
-
- /*
- * if queue has other restrictions (eg varying max sector size
- * depending on offset), it can specify a merge_bvec_fn in the
- * queue to get further control
- */
- if (q->merge_bvec_fn) {
- struct bvec_merge_data bvm = {
- .bi_bdev = bio->bi_bdev,
- .bi_sector = bio->bi_sector,
- .bi_size = bio->bi_size,
- .bi_rw = bio->bi_rw,
- };
-
- /*
- * merge_bvec_fn() returns number of bytes it can accept
- * at this offset
- */
- if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
- bvec->bv_page = NULL;
- bvec->bv_len = 0;
- bvec->bv_offset = 0;
- return 0;
- }
- }
-
- /* If we may be able to merge these biovecs, force a recount */
- if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
- bio->bi_flags &= ~(1 << BIO_SEG_VALID);
-
- bio->bi_vcnt++;
- bio->bi_phys_segments++;
- done:
- bio->bi_size += len;
- return len;
-}
-
-/**
- * bio_add_pc_page - attempt to add page to bio
- * @q: the target queue
- * @bio: destination bio
- * @page: page to add
- * @len: vec entry length
- * @offset: vec entry offset
- *
- * Attempt to add a page to the bio_vec maplist. This can fail for a
- * number of reasons, such as the bio being full or target block device
- * limitations. The target block device must allow bio's up to PAGE_SIZE,
- * so it is always possible to add a single page to an empty bio.
- *
- * This should only be used by REQ_PC bios.
- */
-int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
- unsigned int len, unsigned int offset)
-{
- return __bio_add_page(q, bio, page, len, offset,
- queue_max_hw_sectors(q));
-}
-EXPORT_SYMBOL(bio_add_pc_page);
-
-/**
- * bio_add_page - attempt to add page to bio
- * @bio: destination bio
- * @page: page to add
- * @len: vec entry length
- * @offset: vec entry offset
- *
- * Attempt to add a page to the bio_vec maplist. This can fail for a
- * number of reasons, such as the bio being full or target block device
- * limitations. The target block device must allow bio's up to PAGE_SIZE,
- * so it is always possible to add a single page to an empty bio.
- */
-int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
- unsigned int offset)
-{
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
-}
-EXPORT_SYMBOL(bio_add_page);
-
-struct submit_bio_ret {
- struct completion event;
- int error;
-};
-
-static void submit_bio_wait_endio(struct bio *bio, int error)
-{
- struct submit_bio_ret *ret = bio->bi_private;
-
- ret->error = error;
- complete(&ret->event);
-}
-
-/**
- * submit_bio_wait - submit a bio, and wait until it completes
- * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
- * @bio: The &struct bio which describes the I/O
- *
- * Simple wrapper around submit_bio(). Returns 0 on success, or the error from
- * bio_endio() on failure.
- */
-int submit_bio_wait(int rw, struct bio *bio)
-{
- struct submit_bio_ret ret;
-
- rw |= REQ_SYNC;
- init_completion(&ret.event);
- bio->bi_private = &ret;
- bio->bi_end_io = submit_bio_wait_endio;
- submit_bio(rw, bio);
- wait_for_completion(&ret.event);
-
- return ret.error;
-}
-EXPORT_SYMBOL(submit_bio_wait);
-
-/**
- * bio_advance - increment/complete a bio by some number of bytes
- * @bio: bio to advance
- * @bytes: number of bytes to complete
- *
- * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
- * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
- * be updated on the last bvec as well.
- *
- * @bio will then represent the remaining, uncompleted portion of the io.
- */
-void bio_advance(struct bio *bio, unsigned bytes)
-{
- if (bio_integrity(bio))
- bio_integrity_advance(bio, bytes);
-
- bio->bi_sector += bytes >> 9;
- bio->bi_size -= bytes;
-
- if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK)
- return;
-
- while (bytes) {
- if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
- WARN_ONCE(1, "bio idx %d >= vcnt %d\n",
- bio->bi_idx, bio->bi_vcnt);
- break;
- }
-
- if (bytes >= bio_iovec(bio)->bv_len) {
- bytes -= bio_iovec(bio)->bv_len;
- bio->bi_idx++;
- } else {
- bio_iovec(bio)->bv_len -= bytes;
- bio_iovec(bio)->bv_offset += bytes;
- bytes = 0;
- }
- }
-}
-EXPORT_SYMBOL(bio_advance);
-
-/**
- * bio_alloc_pages - allocates a single page for each bvec in a bio
- * @bio: bio to allocate pages for
- * @gfp_mask: flags for allocation
- *
- * Allocates pages up to @bio->bi_vcnt.
- *
- * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
- * freed.
- */
-int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
-{
- int i;
- struct bio_vec *bv;
-
- bio_for_each_segment_all(bv, bio, i) {
- bv->bv_page = alloc_page(gfp_mask);
- if (!bv->bv_page) {
- while (--bv >= bio->bi_io_vec)
- __free_page(bv->bv_page);
- return -ENOMEM;
- }
- }
-
- return 0;
-}
-EXPORT_SYMBOL(bio_alloc_pages);
-
-/**
- * bio_copy_data - copy contents of data buffers from one chain of bios to
- * another
- * @src: source bio list
- * @dst: destination bio list
- *
- * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
- * @src and @dst as linked lists of bios.
- *
- * Stops when it reaches the end of either @src or @dst - that is, copies
- * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
- */
-void bio_copy_data(struct bio *dst, struct bio *src)
-{
- struct bio_vec *src_bv, *dst_bv;
- unsigned src_offset, dst_offset, bytes;
- void *src_p, *dst_p;
-
- src_bv = bio_iovec(src);
- dst_bv = bio_iovec(dst);
-
- src_offset = src_bv->bv_offset;
- dst_offset = dst_bv->bv_offset;
-
- while (1) {
- if (src_offset == src_bv->bv_offset + src_bv->bv_len) {
- src_bv++;
- if (src_bv == bio_iovec_idx(src, src->bi_vcnt)) {
- src = src->bi_next;
- if (!src)
- break;
-
- src_bv = bio_iovec(src);
- }
-
- src_offset = src_bv->bv_offset;
- }
-
- if (dst_offset == dst_bv->bv_offset + dst_bv->bv_len) {
- dst_bv++;
- if (dst_bv == bio_iovec_idx(dst, dst->bi_vcnt)) {
- dst = dst->bi_next;
- if (!dst)
- break;
-
- dst_bv = bio_iovec(dst);
- }
-
- dst_offset = dst_bv->bv_offset;
- }
-
- bytes = min(dst_bv->bv_offset + dst_bv->bv_len - dst_offset,
- src_bv->bv_offset + src_bv->bv_len - src_offset);
-
- src_p = kmap_atomic(src_bv->bv_page);
- dst_p = kmap_atomic(dst_bv->bv_page);
-
- memcpy(dst_p + dst_offset,
- src_p + src_offset,
- bytes);
-
- kunmap_atomic(dst_p);
- kunmap_atomic(src_p);
-
- src_offset += bytes;
- dst_offset += bytes;
- }
-}
-EXPORT_SYMBOL(bio_copy_data);
-
-struct bio_map_data {
- struct bio_vec *iovecs;
- struct sg_iovec *sgvecs;
- int nr_sgvecs;
- int is_our_pages;
-};
-
-static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
- struct sg_iovec *iov, int iov_count,
- int is_our_pages)
-{
- memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
- memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
- bmd->nr_sgvecs = iov_count;
- bmd->is_our_pages = is_our_pages;
- bio->bi_private = bmd;
-}
-
-static void bio_free_map_data(struct bio_map_data *bmd)
-{
- kfree(bmd->iovecs);
- kfree(bmd->sgvecs);
- kfree(bmd);
-}
-
-static struct bio_map_data *bio_alloc_map_data(int nr_segs,
- unsigned int iov_count,
- gfp_t gfp_mask)
-{
- struct bio_map_data *bmd;
-
- if (iov_count > UIO_MAXIOV)
- return NULL;
-
- bmd = kmalloc(sizeof(*bmd), gfp_mask);
- if (!bmd)
- return NULL;
-
- bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask);
- if (!bmd->iovecs) {
- kfree(bmd);
- return NULL;
- }
-
- bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask);
- if (bmd->sgvecs)
- return bmd;
-
- kfree(bmd->iovecs);
- kfree(bmd);
- return NULL;
-}
-
-static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
- struct sg_iovec *iov, int iov_count,
- int to_user, int from_user, int do_free_page)
-{
- int ret = 0, i;
- struct bio_vec *bvec;
- int iov_idx = 0;
- unsigned int iov_off = 0;
-
- bio_for_each_segment_all(bvec, bio, i) {
- char *bv_addr = page_address(bvec->bv_page);
- unsigned int bv_len = iovecs[i].bv_len;
-
- while (bv_len && iov_idx < iov_count) {
- unsigned int bytes;
- char __user *iov_addr;
-
- bytes = min_t(unsigned int,
- iov[iov_idx].iov_len - iov_off, bv_len);
- iov_addr = iov[iov_idx].iov_base + iov_off;
-
- if (!ret) {
- if (to_user)
- ret = copy_to_user(iov_addr, bv_addr,
- bytes);
-
- if (from_user)
- ret = copy_from_user(bv_addr, iov_addr,
- bytes);
-
- if (ret)
- ret = -EFAULT;
- }
-
- bv_len -= bytes;
- bv_addr += bytes;
- iov_addr += bytes;
- iov_off += bytes;
-
- if (iov[iov_idx].iov_len == iov_off) {
- iov_idx++;
- iov_off = 0;
- }
- }
-
- if (do_free_page)
- __free_page(bvec->bv_page);
- }
-
- return ret;
-}
-
-/**
- * bio_uncopy_user - finish previously mapped bio
- * @bio: bio being terminated
- *
- * Free pages allocated from bio_copy_user() and write back data
- * to user space in case of a read.
- */
-int bio_uncopy_user(struct bio *bio)
-{
- struct bio_map_data *bmd = bio->bi_private;
- struct bio_vec *bvec;
- int ret = 0, i;
-
- if (!bio_flagged(bio, BIO_NULL_MAPPED)) {
- /*
- * if we're in a workqueue, the request is orphaned, so
- * don't copy into a random user address space, just free.
- */
- if (current->mm)
- ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs,
- bmd->nr_sgvecs, bio_data_dir(bio) == READ,
- 0, bmd->is_our_pages);
- else if (bmd->is_our_pages)
- bio_for_each_segment_all(bvec, bio, i)
- __free_page(bvec->bv_page);
- }
- bio_free_map_data(bmd);
- bio_put(bio);
- return ret;
-}
-EXPORT_SYMBOL(bio_uncopy_user);
-
-/**
- * bio_copy_user_iov - copy user data to bio
- * @q: destination block queue
- * @map_data: pointer to the rq_map_data holding pages (if necessary)
- * @iov: the iovec.
- * @iov_count: number of elements in the iovec
- * @write_to_vm: bool indicating writing to pages or not
- * @gfp_mask: memory allocation flags
- *
- * Prepares and returns a bio for indirect user io, bouncing data
- * to/from kernel pages as necessary. Must be paired with
- * call bio_uncopy_user() on io completion.
- */
-struct bio *bio_copy_user_iov(struct request_queue *q,
- struct rq_map_data *map_data,
- struct sg_iovec *iov, int iov_count,
- int write_to_vm, gfp_t gfp_mask)
-{
- struct bio_map_data *bmd;
- struct bio_vec *bvec;
- struct page *page;
- struct bio *bio;
- int i, ret;
- int nr_pages = 0;
- unsigned int len = 0;
- unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;
-
- for (i = 0; i < iov_count; i++) {
- unsigned long uaddr;
- unsigned long end;
- unsigned long start;
-
- uaddr = (unsigned long)iov[i].iov_base;
- end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- start = uaddr >> PAGE_SHIFT;
-
- /*
- * Overflow, abort
- */
- if (end < start)
- return ERR_PTR(-EINVAL);
-
- nr_pages += end - start;
- len += iov[i].iov_len;
- }
-
- if (offset)
- nr_pages++;
-
- bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
- if (!bmd)
- return ERR_PTR(-ENOMEM);
-
- ret = -ENOMEM;
- bio = bio_kmalloc(gfp_mask, nr_pages);
- if (!bio)
- goto out_bmd;
-
- if (!write_to_vm)
- bio->bi_rw |= REQ_WRITE;
-
- ret = 0;
-
- if (map_data) {
- nr_pages = 1 << map_data->page_order;
- i = map_data->offset / PAGE_SIZE;
- }
- while (len) {
- unsigned int bytes = PAGE_SIZE;
-
- bytes -= offset;
-
- if (bytes > len)
- bytes = len;
-
- if (map_data) {
- if (i == map_data->nr_entries * nr_pages) {
- ret = -ENOMEM;
- break;
- }
-
- page = map_data->pages[i / nr_pages];
- page += (i % nr_pages);
-
- i++;
- } else {
- page = alloc_page(q->bounce_gfp | gfp_mask);
- if (!page) {
- ret = -ENOMEM;
- break;
- }
- }
-
- if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
- break;
-
- len -= bytes;
- offset = 0;
- }
-
- if (ret)
- goto cleanup;
-
- /*
- * success
- */
- if ((!write_to_vm && (!map_data || !map_data->null_mapped)) ||
- (map_data && map_data->from_user)) {
- ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 1, 0);
- if (ret)
- goto cleanup;
- }
-
- bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
- return bio;
-cleanup:
- if (!map_data)
- bio_for_each_segment_all(bvec, bio, i)
- __free_page(bvec->bv_page);
-
- bio_put(bio);
-out_bmd:
- bio_free_map_data(bmd);
- return ERR_PTR(ret);
-}
-
-/**
- * bio_copy_user - copy user data to bio
- * @q: destination block queue
- * @map_data: pointer to the rq_map_data holding pages (if necessary)
- * @uaddr: start of user address
- * @len: length in bytes
- * @write_to_vm: bool indicating writing to pages or not
- * @gfp_mask: memory allocation flags
- *
- * Prepares and returns a bio for indirect user io, bouncing data
- * to/from kernel pages as necessary. Must be paired with
- * call bio_uncopy_user() on io completion.
- */
-struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
- unsigned long uaddr, unsigned int len,
- int write_to_vm, gfp_t gfp_mask)
-{
- struct sg_iovec iov;
-
- iov.iov_base = (void __user *)uaddr;
- iov.iov_len = len;
-
- return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
-}
-EXPORT_SYMBOL(bio_copy_user);
-
-static struct bio *__bio_map_user_iov(struct request_queue *q,
- struct block_device *bdev,
- struct sg_iovec *iov, int iov_count,
- int write_to_vm, gfp_t gfp_mask)
-{
- int i, j;
- int nr_pages = 0;
- struct page **pages;
- struct bio *bio;
- int cur_page = 0;
- int ret, offset;
-
- for (i = 0; i < iov_count; i++) {
- unsigned long uaddr = (unsigned long)iov[i].iov_base;
- unsigned long len = iov[i].iov_len;
- unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long start = uaddr >> PAGE_SHIFT;
-
- /*
- * Overflow, abort
- */
- if (end < start)
- return ERR_PTR(-EINVAL);
-
- nr_pages += end - start;
- /*
- * buffer must be aligned to at least hardsector size for now
- */
- if (uaddr & queue_dma_alignment(q))
- return ERR_PTR(-EINVAL);
- }
-
- if (!nr_pages)
- return ERR_PTR(-EINVAL);
-
- bio = bio_kmalloc(gfp_mask, nr_pages);
- if (!bio)
- return ERR_PTR(-ENOMEM);
-
- ret = -ENOMEM;
- pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
- if (!pages)
- goto out;
-
- for (i = 0; i < iov_count; i++) {
- unsigned long uaddr = (unsigned long)iov[i].iov_base;
- unsigned long len = iov[i].iov_len;
- unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long start = uaddr >> PAGE_SHIFT;
- const int local_nr_pages = end - start;
- const int page_limit = cur_page + local_nr_pages;
-
- ret = get_user_pages_fast(uaddr, local_nr_pages,
- write_to_vm, &pages[cur_page]);
- if (ret < local_nr_pages) {
- ret = -EFAULT;
- goto out_unmap;
- }
-
- offset = uaddr & ~PAGE_MASK;
- for (j = cur_page; j < page_limit; j++) {
- unsigned int bytes = PAGE_SIZE - offset;
-
- if (len <= 0)
- break;
-
- if (bytes > len)
- bytes = len;
-
- /*
- * sorry...
- */
- if (bio_add_pc_page(q, bio, pages[j], bytes, offset) <
- bytes)
- break;
-
- len -= bytes;
- offset = 0;
- }
-
- cur_page = j;
- /*
- * release the pages we didn't map into the bio, if any
- */
- while (j < page_limit)
- page_cache_release(pages[j++]);
- }
-
- kfree(pages);
-
- /*
- * set data direction, and check if mapped pages need bouncing
- */
- if (!write_to_vm)
- bio->bi_rw |= REQ_WRITE;
-
- bio->bi_bdev = bdev;
- bio->bi_flags |= (1 << BIO_USER_MAPPED);
- return bio;
-
- out_unmap:
- for (i = 0; i < nr_pages; i++) {
- if(!pages[i])
- break;
- page_cache_release(pages[i]);
- }
- out:
- kfree(pages);
- bio_put(bio);
- return ERR_PTR(ret);
-}
-
-/**
- * bio_map_user - map user address into bio
- * @q: the struct request_queue for the bio
- * @bdev: destination block device
- * @uaddr: start of user address
- * @len: length in bytes
- * @write_to_vm: bool indicating writing to pages or not
- * @gfp_mask: memory allocation flags
- *
- * Map the user space address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
- unsigned long uaddr, unsigned int len, int write_to_vm,
- gfp_t gfp_mask)
-{
- struct sg_iovec iov;
-
- iov.iov_base = (void __user *)uaddr;
- iov.iov_len = len;
-
- return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
-}
-EXPORT_SYMBOL(bio_map_user);
-
-/**
- * bio_map_user_iov - map user sg_iovec table into bio
- * @q: the struct request_queue for the bio
- * @bdev: destination block device
- * @iov: the iovec.
- * @iov_count: number of elements in the iovec
- * @write_to_vm: bool indicating writing to pages or not
- * @gfp_mask: memory allocation flags
- *
- * Map the user space address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
- struct sg_iovec *iov, int iov_count,
- int write_to_vm, gfp_t gfp_mask)
-{
- struct bio *bio;
-
- bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
- gfp_mask);
- if (IS_ERR(bio))
- return bio;
-
- /*
- * subtle -- if __bio_map_user() ended up bouncing a bio,
- * it would normally disappear when its bi_end_io is run.
- * however, we need it for the unmap, so grab an extra
- * reference to it
- */
- bio_get(bio);
-
- return bio;
-}
-
-static void __bio_unmap_user(struct bio *bio)
-{
- struct bio_vec *bvec;
- int i;
-
- /*
- * make sure we dirty pages we wrote to
- */
- bio_for_each_segment_all(bvec, bio, i) {
- if (bio_data_dir(bio) == READ)
- set_page_dirty_lock(bvec->bv_page);
-
- page_cache_release(bvec->bv_page);
- }
-
- bio_put(bio);
-}
-
-/**
- * bio_unmap_user - unmap a bio
- * @bio: the bio being unmapped
- *
- * Unmap a bio previously mapped by bio_map_user(). Must be called with
- * a process context.
- *
- * bio_unmap_user() may sleep.
- */
-void bio_unmap_user(struct bio *bio)
-{
- __bio_unmap_user(bio);
- bio_put(bio);
-}
-EXPORT_SYMBOL(bio_unmap_user);
-
-static void bio_map_kern_endio(struct bio *bio, int err)
-{
- bio_put(bio);
-}
-
-static struct bio *__bio_map_kern(struct request_queue *q, void *data,
- unsigned int len, gfp_t gfp_mask)
-{
- unsigned long kaddr = (unsigned long)data;
- unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long start = kaddr >> PAGE_SHIFT;
- const int nr_pages = end - start;
- int offset, i;
- struct bio *bio;
-
- bio = bio_kmalloc(gfp_mask, nr_pages);
- if (!bio)
- return ERR_PTR(-ENOMEM);
-
- offset = offset_in_page(kaddr);
- for (i = 0; i < nr_pages; i++) {
- unsigned int bytes = PAGE_SIZE - offset;
-
- if (len <= 0)
- break;
-
- if (bytes > len)
- bytes = len;
-
- if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
- offset) < bytes)
- break;
-
- data += bytes;
- len -= bytes;
- offset = 0;
- }
-
- bio->bi_end_io = bio_map_kern_endio;
- return bio;
-}
-
-/**
- * bio_map_kern - map kernel address into bio
- * @q: the struct request_queue for the bio
- * @data: pointer to buffer to map
- * @len: length in bytes
- * @gfp_mask: allocation flags for bio allocation
- *
- * Map the kernel address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
- gfp_t gfp_mask)
-{
- struct bio *bio;
-
- bio = __bio_map_kern(q, data, len, gfp_mask);
- if (IS_ERR(bio))
- return bio;
-
- if (bio->bi_size == len)
- return bio;
-
- /*
- * Don't support partial mappings.
- */
- bio_put(bio);
- return ERR_PTR(-EINVAL);
-}
-EXPORT_SYMBOL(bio_map_kern);
-
-static void bio_copy_kern_endio(struct bio *bio, int err)
-{
- struct bio_vec *bvec;
- const int read = bio_data_dir(bio) == READ;
- struct bio_map_data *bmd = bio->bi_private;
- int i;
- char *p = bmd->sgvecs[0].iov_base;
-
- bio_for_each_segment_all(bvec, bio, i) {
- char *addr = page_address(bvec->bv_page);
- int len = bmd->iovecs[i].bv_len;
-
- if (read)
- memcpy(p, addr, len);
-
- __free_page(bvec->bv_page);
- p += len;
- }
-
- bio_free_map_data(bmd);
- bio_put(bio);
-}
-
-/**
- * bio_copy_kern - copy kernel address into bio
- * @q: the struct request_queue for the bio
- * @data: pointer to buffer to copy
- * @len: length in bytes
- * @gfp_mask: allocation flags for bio and page allocation
- * @reading: data direction is READ
- *
- * copy the kernel address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
- gfp_t gfp_mask, int reading)
-{
- struct bio *bio;
- struct bio_vec *bvec;
- int i;
-
- bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask);
- if (IS_ERR(bio))
- return bio;
-
- if (!reading) {
- void *p = data;
-
- bio_for_each_segment_all(bvec, bio, i) {
- char *addr = page_address(bvec->bv_page);
-
- memcpy(addr, p, bvec->bv_len);
- p += bvec->bv_len;
- }
- }
-
- bio->bi_end_io = bio_copy_kern_endio;
-
- return bio;
-}
-EXPORT_SYMBOL(bio_copy_kern);
-
-/*
- * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
- * for performing direct-IO in BIOs.
- *
- * The problem is that we cannot run set_page_dirty() from interrupt context
- * because the required locks are not interrupt-safe. So what we can do is to
- * mark the pages dirty _before_ performing IO. And in interrupt context,
- * check that the pages are still dirty. If so, fine. If not, redirty them
- * in process context.
- *
- * We special-case compound pages here: normally this means reads into hugetlb
- * pages. The logic in here doesn't really work right for compound pages
- * because the VM does not uniformly chase down the head page in all cases.
- * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
- * handle them at all. So we skip compound pages here at an early stage.
- *
- * Note that this code is very hard to test under normal circumstances because
- * direct-io pins the pages with get_user_pages(). This makes
- * is_page_cache_freeable return false, and the VM will not clean the pages.
- * But other code (eg, flusher threads) could clean the pages if they are mapped
- * pagecache.
- *
- * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
- * deferred bio dirtying paths.
- */
-
-/*
- * bio_set_pages_dirty() will mark all the bio's pages as dirty.
- */
-void bio_set_pages_dirty(struct bio *bio)
-{
- struct bio_vec *bvec;
- int i;
-
- bio_for_each_segment_all(bvec, bio, i) {
- struct page *page = bvec->bv_page;
-
- if (page && !PageCompound(page))
- set_page_dirty_lock(page);
- }
-}
-
-static void bio_release_pages(struct bio *bio)
-{
- struct bio_vec *bvec;
- int i;
-
- bio_for_each_segment_all(bvec, bio, i) {
- struct page *page = bvec->bv_page;
-
- if (page)
- put_page(page);
- }
-}
-
-/*
- * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
- * If they are, then fine. If, however, some pages are clean then they must
- * have been written out during the direct-IO read. So we take another ref on
- * the BIO and the offending pages and re-dirty the pages in process context.
- *
- * It is expected that bio_check_pages_dirty() will wholly own the BIO from
- * here on. It will run one page_cache_release() against each page and will
- * run one bio_put() against the BIO.
- */
-
-static void bio_dirty_fn(struct work_struct *work);
-
-static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
-static DEFINE_SPINLOCK(bio_dirty_lock);
-static struct bio *bio_dirty_list;
-
-/*
- * This runs in process context
- */
-static void bio_dirty_fn(struct work_struct *work)
-{
- unsigned long flags;
- struct bio *bio;
-
- spin_lock_irqsave(&bio_dirty_lock, flags);
- bio = bio_dirty_list;
- bio_dirty_list = NULL;
- spin_unlock_irqrestore(&bio_dirty_lock, flags);
-
- while (bio) {
- struct bio *next = bio->bi_private;
-
- bio_set_pages_dirty(bio);
- bio_release_pages(bio);
- bio_put(bio);
- bio = next;
- }
-}
-
-void bio_check_pages_dirty(struct bio *bio)
-{
- struct bio_vec *bvec;
- int nr_clean_pages = 0;
- int i;
-
- bio_for_each_segment_all(bvec, bio, i) {
- struct page *page = bvec->bv_page;
-
- if (PageDirty(page) || PageCompound(page)) {
- page_cache_release(page);
- bvec->bv_page = NULL;
- } else {
- nr_clean_pages++;
- }
- }
-
- if (nr_clean_pages) {
- unsigned long flags;
-
- spin_lock_irqsave(&bio_dirty_lock, flags);
- bio->bi_private = bio_dirty_list;
- bio_dirty_list = bio;
- spin_unlock_irqrestore(&bio_dirty_lock, flags);
- schedule_work(&bio_dirty_work);
- } else {
- bio_put(bio);
- }
-}
-
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-void bio_flush_dcache_pages(struct bio *bi)
-{
- int i;
- struct bio_vec *bvec;
-
- bio_for_each_segment(bvec, bi, i)
- flush_dcache_page(bvec->bv_page);
-}
-EXPORT_SYMBOL(bio_flush_dcache_pages);
-#endif
-
-/**
- * bio_endio - end I/O on a bio
- * @bio: bio
- * @error: error, if any
- *
- * Description:
- * bio_endio() will end I/O on the whole bio. bio_endio() is the
- * preferred way to end I/O on a bio, it takes care of clearing
- * BIO_UPTODATE on error. @error is 0 on success, and and one of the
- * established -Exxxx (-EIO, for instance) error values in case
- * something went wrong. No one should call bi_end_io() directly on a
- * bio unless they own it and thus know that it has an end_io
- * function.
- **/
-void bio_endio(struct bio *bio, int error)
-{
- if (error)
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
- else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- error = -EIO;
-
- if (bio->bi_end_io)
- bio->bi_end_io(bio, error);
-}
-EXPORT_SYMBOL(bio_endio);
-
-void bio_pair_release(struct bio_pair *bp)
-{
- if (atomic_dec_and_test(&bp->cnt)) {
- struct bio *master = bp->bio1.bi_private;
-
- bio_endio(master, bp->error);
- mempool_free(bp, bp->bio2.bi_private);
- }
-}
-EXPORT_SYMBOL(bio_pair_release);
-
-static void bio_pair_end_1(struct bio *bi, int err)
-{
- struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);
-
- if (err)
- bp->error = err;
-
- bio_pair_release(bp);
-}
-
-static void bio_pair_end_2(struct bio *bi, int err)
-{
- struct bio_pair *bp = container_of(bi, struct bio_pair, bio2);
-
- if (err)
- bp->error = err;
-
- bio_pair_release(bp);
-}
-
-/*
- * split a bio - only worry about a bio with a single page in its iovec
- */
-struct bio_pair *bio_split(struct bio *bi, int first_sectors)
-{
- struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO);
-
- if (!bp)
- return bp;
-
- trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
- bi->bi_sector + first_sectors);
-
- BUG_ON(bio_segments(bi) > 1);
- atomic_set(&bp->cnt, 3);
- bp->error = 0;
- bp->bio1 = *bi;
- bp->bio2 = *bi;
- bp->bio2.bi_sector += first_sectors;
- bp->bio2.bi_size -= first_sectors << 9;
- bp->bio1.bi_size = first_sectors << 9;
-
- if (bi->bi_vcnt != 0) {
- bp->bv1 = *bio_iovec(bi);
- bp->bv2 = *bio_iovec(bi);
-
- if (bio_is_rw(bi)) {
- bp->bv2.bv_offset += first_sectors << 9;
- bp->bv2.bv_len -= first_sectors << 9;
- bp->bv1.bv_len = first_sectors << 9;
- }
-
- bp->bio1.bi_io_vec = &bp->bv1;
- bp->bio2.bi_io_vec = &bp->bv2;
-
- bp->bio1.bi_max_vecs = 1;
- bp->bio2.bi_max_vecs = 1;
- }
-
- bp->bio1.bi_end_io = bio_pair_end_1;
- bp->bio2.bi_end_io = bio_pair_end_2;
-
- bp->bio1.bi_private = bi;
- bp->bio2.bi_private = bio_split_pool;
-
- if (bio_integrity(bi))
- bio_integrity_split(bi, bp, first_sectors);
-
- return bp;
-}
-EXPORT_SYMBOL(bio_split);
-
-/**
- * bio_sector_offset - Find hardware sector offset in bio
- * @bio: bio to inspect
- * @index: bio_vec index
- * @offset: offset in bv_page
- *
- * Return the number of hardware sectors between beginning of bio
- * and an end point indicated by a bio_vec index and an offset
- * within that vector's page.
- */
-sector_t bio_sector_offset(struct bio *bio, unsigned short index,
- unsigned int offset)
-{
- unsigned int sector_sz;
- struct bio_vec *bv;
- sector_t sectors;
- int i;
-
- sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue);
- sectors = 0;
-
- if (index >= bio->bi_idx)
- index = bio->bi_vcnt - 1;
-
- bio_for_each_segment_all(bv, bio, i) {
- if (i == index) {
- if (offset > bv->bv_offset)
- sectors += (offset - bv->bv_offset) / sector_sz;
- break;
- }
-
- sectors += bv->bv_len / sector_sz;
- }
-
- return sectors;
-}
-EXPORT_SYMBOL(bio_sector_offset);
-
-/*
- * create memory pools for biovec's in a bio_set.
- * use the global biovec slabs created for general use.
- */
-mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries)
-{
- struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
-
- return mempool_create_slab_pool(pool_entries, bp->slab);
-}
-
-void bioset_free(struct bio_set *bs)
-{
- if (bs->rescue_workqueue)
- destroy_workqueue(bs->rescue_workqueue);
-
- if (bs->bio_pool)
- mempool_destroy(bs->bio_pool);
-
- if (bs->bvec_pool)
- mempool_destroy(bs->bvec_pool);
-
- bioset_integrity_free(bs);
- bio_put_slab(bs);
-
- kfree(bs);
-}
-EXPORT_SYMBOL(bioset_free);
-
-/**
- * bioset_create - Create a bio_set
- * @pool_size: Number of bio and bio_vecs to cache in the mempool
- * @front_pad: Number of bytes to allocate in front of the returned bio
- *
- * Description:
- * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
- * to ask for a number of bytes to be allocated in front of the bio.
- * Front pad allocation is useful for embedding the bio inside
- * another structure, to avoid allocating extra data to go with the bio.
- * Note that the bio must be embedded at the END of that structure always,
- * or things will break badly.
- */
-struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
-{
- unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
- struct bio_set *bs;
-
- bs = kzalloc(sizeof(*bs), GFP_KERNEL);
- if (!bs)
- return NULL;
-
- bs->front_pad = front_pad;
-
- spin_lock_init(&bs->rescue_lock);
- bio_list_init(&bs->rescue_list);
- INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
-
- bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
- if (!bs->bio_slab) {
- kfree(bs);
- return NULL;
- }
-
- bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab);
- if (!bs->bio_pool)
- goto bad;
-
- bs->bvec_pool = biovec_create_pool(bs, pool_size);
- if (!bs->bvec_pool)
- goto bad;
-
- bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
- if (!bs->rescue_workqueue)
- goto bad;
-
- return bs;
-bad:
- bioset_free(bs);
- return NULL;
-}
-EXPORT_SYMBOL(bioset_create);
-
-#ifdef CONFIG_BLK_CGROUP
-/**
- * bio_associate_current - associate a bio with %current
- * @bio: target bio
- *
- * Associate @bio with %current if it hasn't been associated yet. Block
- * layer will treat @bio as if it were issued by %current no matter which
- * task actually issues it.
- *
- * This function takes an extra reference of @task's io_context and blkcg
- * which will be put when @bio is released. The caller must own @bio,
- * ensure %current->io_context exists, and is responsible for synchronizing
- * calls to this function.
- */
-int bio_associate_current(struct bio *bio)
-{
- struct io_context *ioc;
- struct cgroup_subsys_state *css;
-
- if (bio->bi_ioc)
- return -EBUSY;
-
- ioc = current->io_context;
- if (!ioc)
- return -ENOENT;
-
- /* acquire active ref on @ioc and associate */
- get_io_context_active(ioc);
- bio->bi_ioc = ioc;
-
- /* associate blkcg if exists */
- rcu_read_lock();
- css = task_css(current, blkio_subsys_id);
- if (css && css_tryget(css))
- bio->bi_css = css;
- rcu_read_unlock();
-
- return 0;
-}
-
-/**
- * bio_disassociate_task - undo bio_associate_current()
- * @bio: target bio
- */
-void bio_disassociate_task(struct bio *bio)
-{
- if (bio->bi_ioc) {
- put_io_context(bio->bi_ioc);
- bio->bi_ioc = NULL;
- }
- if (bio->bi_css) {
- css_put(bio->bi_css);
- bio->bi_css = NULL;
- }
-}
-
-#endif /* CONFIG_BLK_CGROUP */
-
-static void __init biovec_init_slabs(void)
-{
- int i;
-
- for (i = 0; i < BIOVEC_NR_POOLS; i++) {
- int size;
- struct biovec_slab *bvs = bvec_slabs + i;
-
- if (bvs->nr_vecs <= BIO_INLINE_VECS) {
- bvs->slab = NULL;
- continue;
- }
-
- size = bvs->nr_vecs * sizeof(struct bio_vec);
- bvs->slab = kmem_cache_create(bvs->name, size, 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
- }
-}
-
-static int __init init_bio(void)
-{
- bio_slab_max = 2;
- bio_slab_nr = 0;
- bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL);
- if (!bio_slabs)
- panic("bio: can't allocate bios\n");
-
- bio_integrity_init();
- biovec_init_slabs();
-
- fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
- if (!fs_bio_set)
- panic("bio: can't allocate bios\n");
-
- if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
- panic("bio: can't create integrity pool\n");
-
- bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES,
- sizeof(struct bio_pair));
- if (!bio_split_pool)
- panic("bio: can't create split pool\n");
-
- return 0;
-}
-subsys_initcall(init_bio);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1e86823a9cb..6d7274619bf 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -83,7 +83,7 @@ void kill_bdev(struct block_device *bdev)
{
struct address_space *mapping = bdev->bd_inode->i_mapping;
- if (mapping->nrpages == 0)
+ if (mapping->nrpages == 0 && mapping->nrshadows == 0)
return;
invalidate_bh_lrus();
@@ -165,14 +165,15 @@ blkdev_get_block(struct inode *inode, sector_t iblock,
}
static ssize_t
-blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+blkdev_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
- nr_segs, blkdev_get_block, NULL, NULL, 0);
+ return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iter,
+ offset, blkdev_get_block,
+ NULL, NULL, 0);
}
int __sync_blockdev(struct block_device *bdev, int wait)
@@ -363,6 +364,69 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
}
EXPORT_SYMBOL(blkdev_fsync);
+/**
+ * bdev_read_page() - Start reading a page from a block device
+ * @bdev: The device to read the page from
+ * @sector: The offset on the device to read the page to (need not be aligned)
+ * @page: The page to read
+ *
+ * On entry, the page should be locked. It will be unlocked when the page
+ * has been read. If the block driver implements rw_page synchronously,
+ * that will be true on exit from this function, but it need not be.
+ *
+ * Errors returned by this function are usually "soft", eg out of memory, or
+ * queue full; callers should try a different route to read this page rather
+ * than propagate an error back up the stack.
+ *
+ * Return: negative errno if an error occurs, 0 if submission was successful.
+ */
+int bdev_read_page(struct block_device *bdev, sector_t sector,
+ struct page *page)
+{
+ const struct block_device_operations *ops = bdev->bd_disk->fops;
+ if (!ops->rw_page)
+ return -EOPNOTSUPP;
+ return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
+}
+EXPORT_SYMBOL_GPL(bdev_read_page);
+
+/**
+ * bdev_write_page() - Start writing a page to a block device
+ * @bdev: The device to write the page to
+ * @sector: The offset on the device to write the page to (need not be aligned)
+ * @page: The page to write
+ * @wbc: The writeback_control for the write
+ *
+ * On entry, the page should be locked and not currently under writeback.
+ * On exit, if the write started successfully, the page will be unlocked and
+ * under writeback. If the write failed already (eg the driver failed to
+ * queue the page to the device), the page will still be locked. If the
+ * caller is a ->writepage implementation, it will need to unlock the page.
+ *
+ * Errors returned by this function are usually "soft", eg out of memory, or
+ * queue full; callers should try a different route to write this page rather
+ * than propagate an error back up the stack.
+ *
+ * Return: negative errno if an error occurs, 0 if submission was successful.
+ */
+int bdev_write_page(struct block_device *bdev, sector_t sector,
+ struct page *page, struct writeback_control *wbc)
+{
+ int result;
+ int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
+ const struct block_device_operations *ops = bdev->bd_disk->fops;
+ if (!ops->rw_page)
+ return -EOPNOTSUPP;
+ set_page_writeback(page);
+ result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
+ if (result)
+ end_page_writeback(page);
+ else
+ unlock_page(page);
+ return result;
+}
+EXPORT_SYMBOL_GPL(bdev_write_page);
+
/*
* pseudo-fs
*/
@@ -419,7 +483,7 @@ static void bdev_evict_inode(struct inode *inode)
{
struct block_device *bdev = &BDEV_I(inode)->bdev;
struct list_head *p;
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode); /* is it needed here? */
clear_inode(inode);
spin_lock(&bdev_lock);
@@ -1508,43 +1572,38 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
* Does not take i_mutex for the write and thus is not for general purpose
* use.
*/
-ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct blk_plug plug;
ssize_t ret;
- BUG_ON(iocb->ki_pos != pos);
-
blk_start_plug(&plug);
- ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+ ret = __generic_file_write_iter(iocb, from);
if (ret > 0) {
ssize_t err;
-
- err = generic_write_sync(file, pos, ret);
- if (err < 0 && ret > 0)
+ err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+ if (err < 0)
ret = err;
}
blk_finish_plug(&plug);
return ret;
}
-EXPORT_SYMBOL_GPL(blkdev_aio_write);
+EXPORT_SYMBOL_GPL(blkdev_write_iter);
-static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
struct inode *bd_inode = file->f_mapping->host;
loff_t size = i_size_read(bd_inode);
+ loff_t pos = iocb->ki_pos;
if (pos >= size)
return 0;
size -= pos;
- if (size < iocb->ki_nbytes)
- nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
- return generic_file_aio_read(iocb, iov, nr_segs, pos);
+ iov_iter_truncate(to, size);
+ return generic_file_read_iter(iocb, to);
}
/*
@@ -1576,10 +1635,10 @@ const struct file_operations def_blk_fops = {
.open = blkdev_open,
.release = blkdev_close,
.llseek = block_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = blkdev_aio_read,
- .aio_write = blkdev_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = blkdev_read_iter,
+ .write_iter = blkdev_write_iter,
.mmap = generic_file_mmap,
.fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
@@ -1587,7 +1646,7 @@ const struct file_operations def_blk_fops = {
.compat_ioctl = compat_blkdev_ioctl,
#endif
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
};
int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 398cbd517be..a66768ebc8d 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -1,6 +1,7 @@
config BTRFS_FS
tristate "Btrfs filesystem support"
- select LIBCRC32C
+ select CRYPTO
+ select CRYPTO_CRC32C
select ZLIB_INFLATE
select ZLIB_DEFLATE
select LZO_COMPRESS
@@ -9,12 +10,17 @@ config BTRFS_FS
select XOR_BLOCKS
help
- Btrfs is a new filesystem with extents, writable snapshotting,
- support for multiple devices and many more features.
+ Btrfs is a general purpose copy-on-write filesystem with extents,
+ writable snapshotting, support for multiple devices and many more
+ features focused on fault tolerance, repair and easy administration.
- Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET
- FINALIZED. You should say N here unless you are interested in
- testing Btrfs with non-critical data.
+ The filesystem disk format is no longer unstable, and it's not
+ expected to change unless there are strong reasons to do so. If there
+ is a format change, file systems with a unchanged format will
+ continue to be mountable and usable by newer kernels.
+
+ For more information, please see the web pages at
+ http://btrfs.wiki.kernel.org.
To compile this file system support as a module, choose M here. The
module will be called btrfs.
@@ -59,7 +65,8 @@ config BTRFS_FS_RUN_SANITY_TESTS
help
This will run some basic sanity tests on the free space cache
code to make sure it is acting as it should. These are mostly
- regression tests and are only really interesting to btrfs devlopers.
+ regression tests and are only really interesting to btrfs
+ developers.
If unsure, say N.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index a91a6a355cc..6d1d0b93b1a 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -9,9 +9,11 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
- uuid-tree.o
+ uuid-tree.o props.o hash.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
-btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o
+btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
+ tests/extent-buffer-tests.o tests/btrfs-tests.o \
+ tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index e15d2b0d8d3..9a0124a9585 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -35,13 +35,6 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
char *value = NULL;
struct posix_acl *acl;
- if (!IS_POSIXACL(inode))
- return NULL;
-
- acl = get_cached_acl(inode, type);
- if (acl != ACL_NOT_CACHED)
- return acl;
-
switch (type) {
case ACL_TYPE_ACCESS:
name = POSIX_ACL_XATTR_ACCESS;
@@ -76,44 +69,16 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
return acl;
}
-static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
- void *value, size_t size, int type)
-{
- struct posix_acl *acl;
- int ret = 0;
-
- if (!IS_POSIXACL(dentry->d_inode))
- return -EOPNOTSUPP;
-
- acl = btrfs_get_acl(dentry->d_inode, type);
-
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
- ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
- posix_acl_release(acl);
-
- return ret;
-}
-
/*
* Needs to be called with fs_mutex held
*/
-static int btrfs_set_acl(struct btrfs_trans_handle *trans,
+static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
struct inode *inode, struct posix_acl *acl, int type)
{
int ret, size = 0;
const char *name;
char *value = NULL;
- if (acl) {
- ret = posix_acl_valid(acl);
- if (ret < 0)
- return ret;
- ret = 0;
- }
-
switch (type) {
case ACL_TYPE_ACCESS:
name = POSIX_ACL_XATTR_ACCESS;
@@ -158,35 +123,9 @@ out:
return ret;
}
-static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- int ret;
- struct posix_acl *acl = NULL;
-
- if (!inode_owner_or_capable(dentry->d_inode))
- return -EPERM;
-
- if (!IS_POSIXACL(dentry->d_inode))
- return -EOPNOTSUPP;
-
- if (value) {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
-
- if (acl) {
- ret = posix_acl_valid(acl);
- if (ret)
- goto out;
- }
- }
-
- ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type);
-out:
- posix_acl_release(acl);
-
- return ret;
+ return __btrfs_set_acl(NULL, inode, acl, type);
}
/*
@@ -197,83 +136,31 @@ out:
int btrfs_init_acl(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir)
{
- struct posix_acl *acl = NULL;
+ struct posix_acl *default_acl, *acl;
int ret = 0;
/* this happens with subvols */
if (!dir)
return 0;
- if (!S_ISLNK(inode->i_mode)) {
- if (IS_POSIXACL(dir)) {
- acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- }
+ ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (ret)
+ return ret;
- if (!acl)
- inode->i_mode &= ~current_umask();
+ if (default_acl) {
+ ret = __btrfs_set_acl(trans, inode, default_acl,
+ ACL_TYPE_DEFAULT);
+ posix_acl_release(default_acl);
}
- if (IS_POSIXACL(dir) && acl) {
- if (S_ISDIR(inode->i_mode)) {
- ret = btrfs_set_acl(trans, inode, acl,
- ACL_TYPE_DEFAULT);
- if (ret)
- goto failed;
- }
- ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
- if (ret < 0)
- return ret;
-
- if (ret > 0) {
- /* we need an acl */
- ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
- } else {
- cache_no_acl(inode);
- }
- } else {
- cache_no_acl(inode);
+ if (acl) {
+ if (!ret)
+ ret = __btrfs_set_acl(trans, inode, acl,
+ ACL_TYPE_ACCESS);
+ posix_acl_release(acl);
}
-failed:
- posix_acl_release(acl);
-
- return ret;
-}
-int btrfs_acl_chmod(struct inode *inode)
-{
- struct posix_acl *acl;
- int ret = 0;
-
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
- if (!IS_POSIXACL(inode))
- return 0;
-
- acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR_OR_NULL(acl))
- return PTR_ERR(acl);
-
- ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
- if (ret)
- return ret;
- ret = btrfs_set_acl(NULL, inode, acl, ACL_TYPE_ACCESS);
- posix_acl_release(acl);
+ if (!default_acl && !acl)
+ cache_no_acl(inode);
return ret;
}
-
-const struct xattr_handler btrfs_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .get = btrfs_xattr_acl_get,
- .set = btrfs_xattr_acl_set,
-};
-
-const struct xattr_handler btrfs_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .get = btrfs_xattr_acl_get,
- .set = btrfs_xattr_acl_set,
-};
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 08cc08f037a..5a201d81049 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2007 Oracle. All rights reserved.
+ * Copyright (C) 2014 Fujitsu. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -21,707 +22,315 @@
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/freezer.h>
+#include <linux/workqueue.h>
#include "async-thread.h"
+#include "ctree.h"
+
+#define WORK_DONE_BIT 0
+#define WORK_ORDER_DONE_BIT 1
+#define WORK_HIGH_PRIO_BIT 2
+
+#define NO_THRESHOLD (-1)
+#define DFT_THRESHOLD (32)
+
+struct __btrfs_workqueue {
+ struct workqueue_struct *normal_wq;
+ /* List head pointing to ordered work list */
+ struct list_head ordered_list;
+
+ /* Spinlock for ordered_list */
+ spinlock_t list_lock;
+
+ /* Thresholding related variants */
+ atomic_t pending;
+ int max_active;
+ int current_max;
+ int thresh;
+ unsigned int count;
+ spinlock_t thres_lock;
+};
-#define WORK_QUEUED_BIT 0
-#define WORK_DONE_BIT 1
-#define WORK_ORDER_DONE_BIT 2
-#define WORK_HIGH_PRIO_BIT 3
-
-/*
- * container for the kthread task pointer and the list of pending work
- * One of these is allocated per thread.
- */
-struct btrfs_worker_thread {
- /* pool we belong to */
- struct btrfs_workers *workers;
-
- /* list of struct btrfs_work that are waiting for service */
- struct list_head pending;
- struct list_head prio_pending;
-
- /* list of worker threads from struct btrfs_workers */
- struct list_head worker_list;
-
- /* kthread */
- struct task_struct *task;
+struct btrfs_workqueue {
+ struct __btrfs_workqueue *normal;
+ struct __btrfs_workqueue *high;
+};
- /* number of things on the pending list */
- atomic_t num_pending;
+static inline struct __btrfs_workqueue
+*__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
+ int thresh)
+{
+ struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
- /* reference counter for this struct */
- atomic_t refs;
+ if (unlikely(!ret))
+ return NULL;
- unsigned long sequence;
+ ret->max_active = max_active;
+ atomic_set(&ret->pending, 0);
+ if (thresh == 0)
+ thresh = DFT_THRESHOLD;
+ /* For low threshold, disabling threshold is a better choice */
+ if (thresh < DFT_THRESHOLD) {
+ ret->current_max = max_active;
+ ret->thresh = NO_THRESHOLD;
+ } else {
+ ret->current_max = 1;
+ ret->thresh = thresh;
+ }
- /* protects the pending list. */
- spinlock_t lock;
+ if (flags & WQ_HIGHPRI)
+ ret->normal_wq = alloc_workqueue("%s-%s-high", flags,
+ ret->max_active,
+ "btrfs", name);
+ else
+ ret->normal_wq = alloc_workqueue("%s-%s", flags,
+ ret->max_active, "btrfs",
+ name);
+ if (unlikely(!ret->normal_wq)) {
+ kfree(ret);
+ return NULL;
+ }
- /* set to non-zero when this thread is already awake and kicking */
- int working;
+ INIT_LIST_HEAD(&ret->ordered_list);
+ spin_lock_init(&ret->list_lock);
+ spin_lock_init(&ret->thres_lock);
+ trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
+ return ret;
+}
- /* are we currently idle */
- int idle;
-};
+static inline void
+__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
-static int __btrfs_start_workers(struct btrfs_workers *workers);
+struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+ int flags,
+ int max_active,
+ int thresh)
+{
+ struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
-/*
- * btrfs_start_workers uses kthread_run, which can block waiting for memory
- * for a very long time. It will actually throttle on page writeback,
- * and so it may not make progress until after our btrfs worker threads
- * process all of the pending work structs in their queue
- *
- * This means we can't use btrfs_start_workers from inside a btrfs worker
- * thread that is used as part of cleaning dirty memory, which pretty much
- * involves all of the worker threads.
- *
- * Instead we have a helper queue who never has more than one thread
- * where we scheduler thread start operations. This worker_start struct
- * is used to contain the work and hold a pointer to the queue that needs
- * another worker.
- */
-struct worker_start {
- struct btrfs_work work;
- struct btrfs_workers *queue;
-};
+ if (unlikely(!ret))
+ return NULL;
-static void start_new_worker_func(struct btrfs_work *work)
-{
- struct worker_start *start;
- start = container_of(work, struct worker_start, work);
- __btrfs_start_workers(start->queue);
- kfree(start);
-}
+ ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
+ max_active, thresh);
+ if (unlikely(!ret->normal)) {
+ kfree(ret);
+ return NULL;
+ }
-/*
- * helper function to move a thread onto the idle list after it
- * has finished some requests.
- */
-static void check_idle_worker(struct btrfs_worker_thread *worker)
-{
- if (!worker->idle && atomic_read(&worker->num_pending) <
- worker->workers->idle_thresh / 2) {
- unsigned long flags;
- spin_lock_irqsave(&worker->workers->lock, flags);
- worker->idle = 1;
-
- /* the list may be empty if the worker is just starting */
- if (!list_empty(&worker->worker_list) &&
- !worker->workers->stopping) {
- list_move(&worker->worker_list,
- &worker->workers->idle_list);
+ if (flags & WQ_HIGHPRI) {
+ ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
+ thresh);
+ if (unlikely(!ret->high)) {
+ __btrfs_destroy_workqueue(ret->normal);
+ kfree(ret);
+ return NULL;
}
- spin_unlock_irqrestore(&worker->workers->lock, flags);
}
+ return ret;
}
/*
- * helper function to move a thread off the idle list after new
- * pending work is added.
+ * Hook for threshold which will be called in btrfs_queue_work.
+ * This hook WILL be called in IRQ handler context,
+ * so workqueue_set_max_active MUST NOT be called in this hook
*/
-static void check_busy_worker(struct btrfs_worker_thread *worker)
+static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
{
- if (worker->idle && atomic_read(&worker->num_pending) >=
- worker->workers->idle_thresh) {
- unsigned long flags;
- spin_lock_irqsave(&worker->workers->lock, flags);
- worker->idle = 0;
-
- if (!list_empty(&worker->worker_list) &&
- !worker->workers->stopping) {
- list_move_tail(&worker->worker_list,
- &worker->workers->worker_list);
- }
- spin_unlock_irqrestore(&worker->workers->lock, flags);
- }
+ if (wq->thresh == NO_THRESHOLD)
+ return;
+ atomic_inc(&wq->pending);
}
-static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
+/*
+ * Hook for threshold which will be called before executing the work,
+ * This hook is called in kthread content.
+ * So workqueue_set_max_active is called here.
+ */
+static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
{
- struct btrfs_workers *workers = worker->workers;
- struct worker_start *start;
- unsigned long flags;
-
- rmb();
- if (!workers->atomic_start_pending)
- return;
+ int new_max_active;
+ long pending;
+ int need_change = 0;
- start = kzalloc(sizeof(*start), GFP_NOFS);
- if (!start)
+ if (wq->thresh == NO_THRESHOLD)
return;
- start->work.func = start_new_worker_func;
- start->queue = workers;
-
- spin_lock_irqsave(&workers->lock, flags);
- if (!workers->atomic_start_pending)
- goto out;
-
- workers->atomic_start_pending = 0;
- if (workers->num_workers + workers->num_workers_starting >=
- workers->max_workers)
- goto out;
-
- workers->num_workers_starting += 1;
- spin_unlock_irqrestore(&workers->lock, flags);
- btrfs_queue_worker(workers->atomic_worker_start, &start->work);
- return;
+ atomic_dec(&wq->pending);
+ spin_lock(&wq->thres_lock);
+ /*
+ * Use wq->count to limit the calling frequency of
+ * workqueue_set_max_active.
+ */
+ wq->count++;
+ wq->count %= (wq->thresh / 4);
+ if (!wq->count)
+ goto out;
+ new_max_active = wq->current_max;
+ /*
+ * pending may be changed later, but it's OK since we really
+ * don't need it so accurate to calculate new_max_active.
+ */
+ pending = atomic_read(&wq->pending);
+ if (pending > wq->thresh)
+ new_max_active++;
+ if (pending < wq->thresh / 2)
+ new_max_active--;
+ new_max_active = clamp_val(new_max_active, 1, wq->max_active);
+ if (new_max_active != wq->current_max) {
+ need_change = 1;
+ wq->current_max = new_max_active;
+ }
out:
- kfree(start);
- spin_unlock_irqrestore(&workers->lock, flags);
+ spin_unlock(&wq->thres_lock);
+
+ if (need_change) {
+ workqueue_set_max_active(wq->normal_wq, wq->current_max);
+ }
}
-static noinline void run_ordered_completions(struct btrfs_workers *workers,
- struct btrfs_work *work)
+static void run_ordered_work(struct __btrfs_workqueue *wq)
{
- if (!workers->ordered)
- return;
-
- set_bit(WORK_DONE_BIT, &work->flags);
-
- spin_lock(&workers->order_lock);
+ struct list_head *list = &wq->ordered_list;
+ struct btrfs_work *work;
+ spinlock_t *lock = &wq->list_lock;
+ unsigned long flags;
while (1) {
- if (!list_empty(&workers->prio_order_list)) {
- work = list_entry(workers->prio_order_list.next,
- struct btrfs_work, order_list);
- } else if (!list_empty(&workers->order_list)) {
- work = list_entry(workers->order_list.next,
- struct btrfs_work, order_list);
- } else {
+ spin_lock_irqsave(lock, flags);
+ if (list_empty(list))
break;
- }
+ work = list_entry(list->next, struct btrfs_work,
+ ordered_list);
if (!test_bit(WORK_DONE_BIT, &work->flags))
break;
- /* we are going to call the ordered done function, but
+ /*
+ * we are going to call the ordered done function, but
* we leave the work item on the list as a barrier so
* that later work items that are done don't have their
* functions called before this one returns
*/
if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
break;
-
- spin_unlock(&workers->order_lock);
-
+ trace_btrfs_ordered_sched(work);
+ spin_unlock_irqrestore(lock, flags);
work->ordered_func(work);
/* now take the lock again and drop our item from the list */
- spin_lock(&workers->order_lock);
- list_del(&work->order_list);
- spin_unlock(&workers->order_lock);
+ spin_lock_irqsave(lock, flags);
+ list_del(&work->ordered_list);
+ spin_unlock_irqrestore(lock, flags);
/*
* we don't want to call the ordered free functions
* with the lock held though
*/
work->ordered_free(work);
- spin_lock(&workers->order_lock);
- }
-
- spin_unlock(&workers->order_lock);
-}
-
-static void put_worker(struct btrfs_worker_thread *worker)
-{
- if (atomic_dec_and_test(&worker->refs))
- kfree(worker);
-}
-
-static int try_worker_shutdown(struct btrfs_worker_thread *worker)
-{
- int freeit = 0;
-
- spin_lock_irq(&worker->lock);
- spin_lock(&worker->workers->lock);
- if (worker->workers->num_workers > 1 &&
- worker->idle &&
- !worker->working &&
- !list_empty(&worker->worker_list) &&
- list_empty(&worker->prio_pending) &&
- list_empty(&worker->pending) &&
- atomic_read(&worker->num_pending) == 0) {
- freeit = 1;
- list_del_init(&worker->worker_list);
- worker->workers->num_workers--;
+ trace_btrfs_all_work_done(work);
}
- spin_unlock(&worker->workers->lock);
- spin_unlock_irq(&worker->lock);
-
- if (freeit)
- put_worker(worker);
- return freeit;
+ spin_unlock_irqrestore(lock, flags);
}
-static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
- struct list_head *prio_head,
- struct list_head *head)
+static void normal_work_helper(struct work_struct *arg)
{
- struct btrfs_work *work = NULL;
- struct list_head *cur = NULL;
-
- if(!list_empty(prio_head))
- cur = prio_head->next;
-
- smp_mb();
- if (!list_empty(&worker->prio_pending))
- goto refill;
-
- if (!list_empty(head))
- cur = head->next;
-
- if (cur)
- goto out;
-
-refill:
- spin_lock_irq(&worker->lock);
- list_splice_tail_init(&worker->prio_pending, prio_head);
- list_splice_tail_init(&worker->pending, head);
-
- if (!list_empty(prio_head))
- cur = prio_head->next;
- else if (!list_empty(head))
- cur = head->next;
- spin_unlock_irq(&worker->lock);
-
- if (!cur)
- goto out_fail;
-
-out:
- work = list_entry(cur, struct btrfs_work, list);
-
-out_fail:
- return work;
-}
-
-/*
- * main loop for servicing work items
- */
-static int worker_loop(void *arg)
-{
- struct btrfs_worker_thread *worker = arg;
- struct list_head head;
- struct list_head prio_head;
struct btrfs_work *work;
+ struct __btrfs_workqueue *wq;
+ int need_order = 0;
- INIT_LIST_HEAD(&head);
- INIT_LIST_HEAD(&prio_head);
-
- do {
-again:
- while (1) {
-
-
- work = get_next_work(worker, &prio_head, &head);
- if (!work)
- break;
-
- list_del(&work->list);
- clear_bit(WORK_QUEUED_BIT, &work->flags);
-
- work->worker = worker;
-
- work->func(work);
-
- atomic_dec(&worker->num_pending);
- /*
- * unless this is an ordered work queue,
- * 'work' was probably freed by func above.
- */
- run_ordered_completions(worker->workers, work);
-
- check_pending_worker_creates(worker);
- cond_resched();
- }
-
- spin_lock_irq(&worker->lock);
- check_idle_worker(worker);
-
- if (freezing(current)) {
- worker->working = 0;
- spin_unlock_irq(&worker->lock);
- try_to_freeze();
- } else {
- spin_unlock_irq(&worker->lock);
- if (!kthread_should_stop()) {
- cpu_relax();
- /*
- * we've dropped the lock, did someone else
- * jump_in?
- */
- smp_mb();
- if (!list_empty(&worker->pending) ||
- !list_empty(&worker->prio_pending))
- continue;
-
- /*
- * this short schedule allows more work to
- * come in without the queue functions
- * needing to go through wake_up_process()
- *
- * worker->working is still 1, so nobody
- * is going to try and wake us up
- */
- schedule_timeout(1);
- smp_mb();
- if (!list_empty(&worker->pending) ||
- !list_empty(&worker->prio_pending))
- continue;
-
- if (kthread_should_stop())
- break;
-
- /* still no more work?, sleep for real */
- spin_lock_irq(&worker->lock);
- set_current_state(TASK_INTERRUPTIBLE);
- if (!list_empty(&worker->pending) ||
- !list_empty(&worker->prio_pending)) {
- spin_unlock_irq(&worker->lock);
- set_current_state(TASK_RUNNING);
- goto again;
- }
-
- /*
- * this makes sure we get a wakeup when someone
- * adds something new to the queue
- */
- worker->working = 0;
- spin_unlock_irq(&worker->lock);
-
- if (!kthread_should_stop()) {
- schedule_timeout(HZ * 120);
- if (!worker->working &&
- try_worker_shutdown(worker)) {
- return 0;
- }
- }
- }
- __set_current_state(TASK_RUNNING);
- }
- } while (!kthread_should_stop());
- return 0;
-}
-
-/*
- * this will wait for all the worker threads to shutdown
- */
-void btrfs_stop_workers(struct btrfs_workers *workers)
-{
- struct list_head *cur;
- struct btrfs_worker_thread *worker;
- int can_stop;
-
- spin_lock_irq(&workers->lock);
- workers->stopping = 1;
- list_splice_init(&workers->idle_list, &workers->worker_list);
- while (!list_empty(&workers->worker_list)) {
- cur = workers->worker_list.next;
- worker = list_entry(cur, struct btrfs_worker_thread,
- worker_list);
-
- atomic_inc(&worker->refs);
- workers->num_workers -= 1;
- if (!list_empty(&worker->worker_list)) {
- list_del_init(&worker->worker_list);
- put_worker(worker);
- can_stop = 1;
- } else
- can_stop = 0;
- spin_unlock_irq(&workers->lock);
- if (can_stop)
- kthread_stop(worker->task);
- spin_lock_irq(&workers->lock);
- put_worker(worker);
+ work = container_of(arg, struct btrfs_work, normal_work);
+ /*
+ * We should not touch things inside work in the following cases:
+ * 1) after work->func() if it has no ordered_free
+ * Since the struct is freed in work->func().
+ * 2) after setting WORK_DONE_BIT
+ * The work may be freed in other threads almost instantly.
+ * So we save the needed things here.
+ */
+ if (work->ordered_func)
+ need_order = 1;
+ wq = work->wq;
+
+ trace_btrfs_work_sched(work);
+ thresh_exec_hook(wq);
+ work->func(work);
+ if (need_order) {
+ set_bit(WORK_DONE_BIT, &work->flags);
+ run_ordered_work(wq);
}
- spin_unlock_irq(&workers->lock);
+ if (!need_order)
+ trace_btrfs_all_work_done(work);
}
-/*
- * simple init on struct btrfs_workers
- */
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
- struct btrfs_workers *async_helper)
+void btrfs_init_work(struct btrfs_work *work,
+ btrfs_func_t func,
+ btrfs_func_t ordered_func,
+ btrfs_func_t ordered_free)
{
- workers->num_workers = 0;
- workers->num_workers_starting = 0;
- INIT_LIST_HEAD(&workers->worker_list);
- INIT_LIST_HEAD(&workers->idle_list);
- INIT_LIST_HEAD(&workers->order_list);
- INIT_LIST_HEAD(&workers->prio_order_list);
- spin_lock_init(&workers->lock);
- spin_lock_init(&workers->order_lock);
- workers->max_workers = max;
- workers->idle_thresh = 32;
- workers->name = name;
- workers->ordered = 0;
- workers->atomic_start_pending = 0;
- workers->atomic_worker_start = async_helper;
- workers->stopping = 0;
+ work->func = func;
+ work->ordered_func = ordered_func;
+ work->ordered_free = ordered_free;
+ INIT_WORK(&work->normal_work, normal_work_helper);
+ INIT_LIST_HEAD(&work->ordered_list);
+ work->flags = 0;
}
-/*
- * starts new worker threads. This does not enforce the max worker
- * count in case you need to temporarily go past it.
- */
-static int __btrfs_start_workers(struct btrfs_workers *workers)
+static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
+ struct btrfs_work *work)
{
- struct btrfs_worker_thread *worker;
- int ret = 0;
-
- worker = kzalloc(sizeof(*worker), GFP_NOFS);
- if (!worker) {
- ret = -ENOMEM;
- goto fail;
- }
-
- INIT_LIST_HEAD(&worker->pending);
- INIT_LIST_HEAD(&worker->prio_pending);
- INIT_LIST_HEAD(&worker->worker_list);
- spin_lock_init(&worker->lock);
-
- atomic_set(&worker->num_pending, 0);
- atomic_set(&worker->refs, 1);
- worker->workers = workers;
- worker->task = kthread_create(worker_loop, worker,
- "btrfs-%s-%d", workers->name,
- workers->num_workers + 1);
- if (IS_ERR(worker->task)) {
- ret = PTR_ERR(worker->task);
- goto fail;
- }
+ unsigned long flags;
- spin_lock_irq(&workers->lock);
- if (workers->stopping) {
- spin_unlock_irq(&workers->lock);
- goto fail_kthread;
+ work->wq = wq;
+ thresh_queue_hook(wq);
+ if (work->ordered_func) {
+ spin_lock_irqsave(&wq->list_lock, flags);
+ list_add_tail(&work->ordered_list, &wq->ordered_list);
+ spin_unlock_irqrestore(&wq->list_lock, flags);
}
- list_add_tail(&worker->worker_list, &workers->idle_list);
- worker->idle = 1;
- workers->num_workers++;
- workers->num_workers_starting--;
- WARN_ON(workers->num_workers_starting < 0);
- spin_unlock_irq(&workers->lock);
-
- wake_up_process(worker->task);
- return 0;
-
-fail_kthread:
- kthread_stop(worker->task);
-fail:
- kfree(worker);
- spin_lock_irq(&workers->lock);
- workers->num_workers_starting--;
- spin_unlock_irq(&workers->lock);
- return ret;
+ queue_work(wq->normal_wq, &work->normal_work);
+ trace_btrfs_work_queued(work);
}
-int btrfs_start_workers(struct btrfs_workers *workers)
-{
- spin_lock_irq(&workers->lock);
- workers->num_workers_starting++;
- spin_unlock_irq(&workers->lock);
- return __btrfs_start_workers(workers);
-}
-
-/*
- * run through the list and find a worker thread that doesn't have a lot
- * to do right now. This can return null if we aren't yet at the thread
- * count limit and all of the threads are busy.
- */
-static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
+void btrfs_queue_work(struct btrfs_workqueue *wq,
+ struct btrfs_work *work)
{
- struct btrfs_worker_thread *worker;
- struct list_head *next;
- int enforce_min;
+ struct __btrfs_workqueue *dest_wq;
- enforce_min = (workers->num_workers + workers->num_workers_starting) <
- workers->max_workers;
-
- /*
- * if we find an idle thread, don't move it to the end of the
- * idle list. This improves the chance that the next submission
- * will reuse the same thread, and maybe catch it while it is still
- * working
- */
- if (!list_empty(&workers->idle_list)) {
- next = workers->idle_list.next;
- worker = list_entry(next, struct btrfs_worker_thread,
- worker_list);
- return worker;
- }
- if (enforce_min || list_empty(&workers->worker_list))
- return NULL;
-
- /*
- * if we pick a busy task, move the task to the end of the list.
- * hopefully this will keep things somewhat evenly balanced.
- * Do the move in batches based on the sequence number. This groups
- * requests submitted at roughly the same time onto the same worker.
- */
- next = workers->worker_list.next;
- worker = list_entry(next, struct btrfs_worker_thread, worker_list);
- worker->sequence++;
-
- if (worker->sequence % workers->idle_thresh == 0)
- list_move_tail(next, &workers->worker_list);
- return worker;
+ if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
+ dest_wq = wq->high;
+ else
+ dest_wq = wq->normal;
+ __btrfs_queue_work(dest_wq, work);
}
-/*
- * selects a worker thread to take the next job. This will either find
- * an idle worker, start a new worker up to the max count, or just return
- * one of the existing busy workers.
- */
-static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
+static inline void
+__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
{
- struct btrfs_worker_thread *worker;
- unsigned long flags;
- struct list_head *fallback;
- int ret;
-
- spin_lock_irqsave(&workers->lock, flags);
-again:
- worker = next_worker(workers);
-
- if (!worker) {
- if (workers->num_workers + workers->num_workers_starting >=
- workers->max_workers) {
- goto fallback;
- } else if (workers->atomic_worker_start) {
- workers->atomic_start_pending = 1;
- goto fallback;
- } else {
- workers->num_workers_starting++;
- spin_unlock_irqrestore(&workers->lock, flags);
- /* we're below the limit, start another worker */
- ret = __btrfs_start_workers(workers);
- spin_lock_irqsave(&workers->lock, flags);
- if (ret)
- goto fallback;
- goto again;
- }
- }
- goto found;
-
-fallback:
- fallback = NULL;
- /*
- * we have failed to find any workers, just
- * return the first one we can find.
- */
- if (!list_empty(&workers->worker_list))
- fallback = workers->worker_list.next;
- if (!list_empty(&workers->idle_list))
- fallback = workers->idle_list.next;
- BUG_ON(!fallback);
- worker = list_entry(fallback,
- struct btrfs_worker_thread, worker_list);
-found:
- /*
- * this makes sure the worker doesn't exit before it is placed
- * onto a busy/idle list
- */
- atomic_inc(&worker->num_pending);
- spin_unlock_irqrestore(&workers->lock, flags);
- return worker;
+ destroy_workqueue(wq->normal_wq);
+ trace_btrfs_workqueue_destroy(wq);
+ kfree(wq);
}
-/*
- * btrfs_requeue_work just puts the work item back on the tail of the list
- * it was taken from. It is intended for use with long running work functions
- * that make some progress and want to give the cpu up for others.
- */
-void btrfs_requeue_work(struct btrfs_work *work)
+void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
{
- struct btrfs_worker_thread *worker = work->worker;
- unsigned long flags;
- int wake = 0;
-
- if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
+ if (!wq)
return;
-
- spin_lock_irqsave(&worker->lock, flags);
- if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
- list_add_tail(&work->list, &worker->prio_pending);
- else
- list_add_tail(&work->list, &worker->pending);
- atomic_inc(&worker->num_pending);
-
- /* by definition we're busy, take ourselves off the idle
- * list
- */
- if (worker->idle) {
- spin_lock(&worker->workers->lock);
- worker->idle = 0;
- list_move_tail(&worker->worker_list,
- &worker->workers->worker_list);
- spin_unlock(&worker->workers->lock);
- }
- if (!worker->working) {
- wake = 1;
- worker->working = 1;
- }
-
- if (wake)
- wake_up_process(worker->task);
- spin_unlock_irqrestore(&worker->lock, flags);
+ if (wq->high)
+ __btrfs_destroy_workqueue(wq->high);
+ __btrfs_destroy_workqueue(wq->normal);
+ kfree(wq);
}
-void btrfs_set_work_high_prio(struct btrfs_work *work)
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max)
{
- set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+ if (!wq)
+ return;
+ wq->normal->max_active = max;
+ if (wq->high)
+ wq->high->max_active = max;
}
-/*
- * places a struct btrfs_work into the pending queue of one of the kthreads
- */
-void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
+void btrfs_set_work_high_priority(struct btrfs_work *work)
{
- struct btrfs_worker_thread *worker;
- unsigned long flags;
- int wake = 0;
-
- /* don't requeue something already on a list */
- if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
- return;
-
- worker = find_worker(workers);
- if (workers->ordered) {
- /*
- * you're not allowed to do ordered queues from an
- * interrupt handler
- */
- spin_lock(&workers->order_lock);
- if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
- list_add_tail(&work->order_list,
- &workers->prio_order_list);
- } else {
- list_add_tail(&work->order_list, &workers->order_list);
- }
- spin_unlock(&workers->order_lock);
- } else {
- INIT_LIST_HEAD(&work->order_list);
- }
-
- spin_lock_irqsave(&worker->lock, flags);
-
- if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
- list_add_tail(&work->list, &worker->prio_pending);
- else
- list_add_tail(&work->list, &worker->pending);
- check_busy_worker(worker);
-
- /*
- * avoid calling into wake_up_process if this thread has already
- * been kicked
- */
- if (!worker->working)
- wake = 1;
- worker->working = 1;
-
- if (wake)
- wake_up_process(worker->task);
- spin_unlock_irqrestore(&worker->lock, flags);
+ set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
}
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 1f26792683e..9c6b66d15fb 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2007 Oracle. All rights reserved.
+ * Copyright (C) 2014 Fujitsu. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
@@ -19,103 +20,35 @@
#ifndef __BTRFS_ASYNC_THREAD_
#define __BTRFS_ASYNC_THREAD_
-struct btrfs_worker_thread;
+struct btrfs_workqueue;
+/* Internal use only */
+struct __btrfs_workqueue;
+struct btrfs_work;
+typedef void (*btrfs_func_t)(struct btrfs_work *arg);
-/*
- * This is similar to a workqueue, but it is meant to spread the operations
- * across all available cpus instead of just the CPU that was used to
- * queue the work. There is also some batching introduced to try and
- * cut down on context switches.
- *
- * By default threads are added on demand up to 2 * the number of cpus.
- * Changing struct btrfs_workers->max_workers is one way to prevent
- * demand creation of kthreads.
- *
- * the basic model of these worker threads is to embed a btrfs_work
- * structure in your own data struct, and use container_of in a
- * work function to get back to your data struct.
- */
struct btrfs_work {
- /*
- * func should be set to the function you want called
- * your work struct is passed as the only arg
- *
- * ordered_func must be set for work sent to an ordered work queue,
- * and it is called to complete a given work item in the same
- * order they were sent to the queue.
- */
- void (*func)(struct btrfs_work *work);
- void (*ordered_func)(struct btrfs_work *work);
- void (*ordered_free)(struct btrfs_work *work);
-
- /*
- * flags should be set to zero. It is used to make sure the
- * struct is only inserted once into the list.
- */
+ btrfs_func_t func;
+ btrfs_func_t ordered_func;
+ btrfs_func_t ordered_free;
+
+ /* Don't touch things below */
+ struct work_struct normal_work;
+ struct list_head ordered_list;
+ struct __btrfs_workqueue *wq;
unsigned long flags;
-
- /* don't touch these */
- struct btrfs_worker_thread *worker;
- struct list_head list;
- struct list_head order_list;
-};
-
-struct btrfs_workers {
- /* current number of running workers */
- int num_workers;
-
- int num_workers_starting;
-
- /* max number of workers allowed. changed by btrfs_start_workers */
- int max_workers;
-
- /* once a worker has this many requests or fewer, it is idle */
- int idle_thresh;
-
- /* force completions in the order they were queued */
- int ordered;
-
- /* more workers required, but in an interrupt handler */
- int atomic_start_pending;
-
- /*
- * are we allowed to sleep while starting workers or are we required
- * to start them at a later time? If we can't sleep, this indicates
- * which queue we need to use to schedule thread creation.
- */
- struct btrfs_workers *atomic_worker_start;
-
- /* list with all the work threads. The workers on the idle thread
- * may be actively servicing jobs, but they haven't yet hit the
- * idle thresh limit above.
- */
- struct list_head worker_list;
- struct list_head idle_list;
-
- /*
- * when operating in ordered mode, this maintains the list
- * of work items waiting for completion
- */
- struct list_head order_list;
- struct list_head prio_order_list;
-
- /* lock for finding the next worker thread to queue on */
- spinlock_t lock;
-
- /* lock for the ordered lists */
- spinlock_t order_lock;
-
- /* extra name for this worker, used for current->name */
- char *name;
-
- int stopping;
};
-void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
-int btrfs_start_workers(struct btrfs_workers *workers);
-void btrfs_stop_workers(struct btrfs_workers *workers);
-void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
- struct btrfs_workers *async_starter);
-void btrfs_requeue_work(struct btrfs_work *work);
-void btrfs_set_work_high_prio(struct btrfs_work *work);
+struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+ int flags,
+ int max_active,
+ int thresh);
+void btrfs_init_work(struct btrfs_work *work,
+ btrfs_func_t func,
+ btrfs_func_t ordered_func,
+ btrfs_func_t ordered_free);
+void btrfs_queue_work(struct btrfs_workqueue *wq,
+ struct btrfs_work *work);
+void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
+void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
+void btrfs_set_work_high_priority(struct btrfs_work *work);
#endif
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 0552a599b28..e25564bfcb4 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -66,6 +66,16 @@ static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
return 0;
}
+static void free_inode_elem_list(struct extent_inode_elem *eie)
+{
+ struct extent_inode_elem *eie_next;
+
+ for (; eie; eie = eie_next) {
+ eie_next = eie->next;
+ kfree(eie);
+ }
+}
+
static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte,
u64 extent_item_pos,
struct extent_inode_elem **eie)
@@ -185,6 +195,9 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
{
struct __prelim_ref *ref;
+ if (root_id == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ return 0;
+
ref = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask);
if (!ref)
return -ENOMEM;
@@ -206,18 +219,20 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
}
static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
- struct ulist *parents, int level,
- struct btrfs_key *key_for_search, u64 time_seq,
- u64 wanted_disk_byte,
- const u64 *extent_item_pos)
+ struct ulist *parents, struct __prelim_ref *ref,
+ int level, u64 time_seq, const u64 *extent_item_pos,
+ u64 total_refs)
{
int ret = 0;
int slot;
struct extent_buffer *eb;
struct btrfs_key key;
+ struct btrfs_key *key_for_search = &ref->key_for_search;
struct btrfs_file_extent_item *fi;
struct extent_inode_elem *eie = NULL, *old = NULL;
u64 disk_byte;
+ u64 wanted_disk_byte = ref->wanted_disk_byte;
+ u64 count = 0;
if (level != 0) {
eb = path->nodes[level];
@@ -235,7 +250,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
ret = btrfs_next_old_leaf(root, path, time_seq);
- while (!ret) {
+ while (!ret && count < total_refs) {
eb = path->nodes[0];
slot = path->slots[0];
@@ -251,6 +266,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
if (disk_byte == wanted_disk_byte) {
eie = NULL;
old = NULL;
+ count++;
if (extent_item_pos) {
ret = check_extent_in_eb(&key, eb, fi,
*extent_item_pos,
@@ -270,6 +286,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
old = old->next;
old->next = eie;
}
+ eie = NULL;
}
next:
ret = btrfs_next_old_item(root, path, time_seq);
@@ -277,6 +294,8 @@ next:
if (ret > 0)
ret = 0;
+ else if (ret < 0)
+ free_inode_elem_list(eie);
return ret;
}
@@ -288,7 +307,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 time_seq,
struct __prelim_ref *ref,
struct ulist *parents,
- const u64 *extent_item_pos)
+ const u64 *extent_item_pos, u64 total_refs)
{
struct btrfs_root *root;
struct btrfs_key root_key;
@@ -296,23 +315,37 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
int ret = 0;
int root_level;
int level = ref->level;
+ int index;
root_key.objectid = ref->root_id;
root_key.type = BTRFS_ROOT_ITEM_KEY;
root_key.offset = (u64)-1;
+
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
root = btrfs_read_fs_root_no_name(fs_info, &root_key);
if (IS_ERR(root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = PTR_ERR(root);
goto out;
}
- root_level = btrfs_old_root_level(root, time_seq);
+ if (path->search_commit_root)
+ root_level = btrfs_header_level(root->commit_root);
+ else
+ root_level = btrfs_old_root_level(root, time_seq);
- if (root_level + 1 == level)
+ if (root_level + 1 == level) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
goto out;
+ }
path->lowest_level = level;
ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
+
+ /* root node has been locked, we can release @subvol_srcu safely here */
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+
pr_debug("search slot in root %llu (level %d, ref count %d) returned "
"%d for key (%llu %u %llu)\n",
ref->root_id, level, ref->count, ret,
@@ -323,8 +356,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
eb = path->nodes[level];
while (!eb) {
- if (!level) {
- WARN_ON(1);
+ if (WARN_ON(!level)) {
ret = 1;
goto out;
}
@@ -332,9 +364,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
eb = path->nodes[level];
}
- ret = add_all_parents(root, path, parents, level, &ref->key_for_search,
- time_seq, ref->wanted_disk_byte,
- extent_item_pos);
+ ret = add_all_parents(root, path, parents, ref, level, time_seq,
+ extent_item_pos, total_refs);
out:
path->lowest_level = 0;
btrfs_release_path(path);
@@ -347,7 +378,7 @@ out:
static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 time_seq,
struct list_head *head,
- const u64 *extent_item_pos)
+ const u64 *extent_item_pos, u64 total_refs)
{
int err;
int ret = 0;
@@ -373,11 +404,18 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
if (ref->count == 0)
continue;
err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
- parents, extent_item_pos);
- if (err == -ENOMEM)
- goto out;
- if (err)
+ parents, extent_item_pos,
+ total_refs);
+ /*
+ * we can only tolerate ENOENT,otherwise,we should catch error
+ * and return directly.
+ */
+ if (err == -ENOENT) {
continue;
+ } else if (err) {
+ ret = err;
+ goto out;
+ }
/* we put the first parent into the ref at hand */
ULIST_ITER_INIT(&uiter);
@@ -524,7 +562,7 @@ static void __merge_refs(struct list_head *head, int mode)
* smaller or equal that seq to the list
*/
static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
- struct list_head *prefs)
+ struct list_head *prefs, u64 *total_refs)
{
struct btrfs_delayed_extent_op *extent_op = head->extent_op;
struct rb_node *n = &head->node.rb_node;
@@ -536,14 +574,13 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
if (extent_op && extent_op->update_key)
btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
- while ((n = rb_prev(n))) {
+ spin_lock(&head->lock);
+ n = rb_first(&head->ref_root);
+ while (n) {
struct btrfs_delayed_ref_node *node;
node = rb_entry(n, struct btrfs_delayed_ref_node,
rb_node);
- if (node->bytenr != head->node.bytenr)
- break;
- WARN_ON(node->is_head);
-
+ n = rb_next(n);
if (node->seq > seq)
continue;
@@ -561,6 +598,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
default:
BUG_ON(1);
}
+ *total_refs += (node->ref_mod * sgn);
switch (node->type) {
case BTRFS_TREE_BLOCK_REF_KEY: {
struct btrfs_delayed_tree_ref *ref;
@@ -610,10 +648,10 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
WARN_ON(1);
}
if (ret)
- return ret;
+ break;
}
-
- return 0;
+ spin_unlock(&head->lock);
+ return ret;
}
/*
@@ -621,7 +659,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
*/
static int __add_inline_refs(struct btrfs_fs_info *fs_info,
struct btrfs_path *path, u64 bytenr,
- int *info_level, struct list_head *prefs)
+ int *info_level, struct list_head *prefs,
+ u64 *total_refs)
{
int ret = 0;
int slot;
@@ -645,6 +684,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
flags = btrfs_extent_flags(leaf, ei);
+ *total_refs += btrfs_extent_refs(leaf, ei);
btrfs_item_key_to_cpu(leaf, &found_key, slot);
ptr = (unsigned long)(ei + 1);
@@ -826,6 +866,8 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
struct list_head prefs_delayed;
struct list_head prefs;
struct __prelim_ref *ref;
+ struct extent_inode_elem *eie = NULL;
+ u64 total_refs = 0;
INIT_LIST_HEAD(&prefs);
INIT_LIST_HEAD(&prefs_delayed);
@@ -840,8 +882,10 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- if (!trans)
+ if (!trans) {
path->search_commit_root = 1;
+ path->skip_locking = 1;
+ }
/*
* grab both a lock on the path and a lock on the delayed ref head.
@@ -856,7 +900,11 @@ again:
goto out;
BUG_ON(ret == 0);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (trans && likely(trans->type != __TRANS_DUMMY)) {
+#else
if (trans) {
+#endif
/*
* look if there are updates for this ref queued and lock the
* head
@@ -880,15 +928,15 @@ again:
btrfs_put_delayed_ref(&head->node);
goto again;
}
+ spin_unlock(&delayed_refs->lock);
ret = __add_delayed_refs(head, time_seq,
- &prefs_delayed);
+ &prefs_delayed, &total_refs);
mutex_unlock(&head->mutex);
- if (ret) {
- spin_unlock(&delayed_refs->lock);
+ if (ret)
goto out;
- }
+ } else {
+ spin_unlock(&delayed_refs->lock);
}
- spin_unlock(&delayed_refs->lock);
}
if (path->slots[0]) {
@@ -903,7 +951,8 @@ again:
(key.type == BTRFS_EXTENT_ITEM_KEY ||
key.type == BTRFS_METADATA_ITEM_KEY)) {
ret = __add_inline_refs(fs_info, path, bytenr,
- &info_level, &prefs);
+ &info_level, &prefs,
+ &total_refs);
if (ret)
goto out;
ret = __add_keyed_refs(fs_info, path, bytenr,
@@ -923,7 +972,7 @@ again:
__merge_refs(&prefs, 1);
ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
- extent_item_pos);
+ extent_item_pos, total_refs);
if (ret)
goto out;
@@ -932,19 +981,19 @@ again:
while (!list_empty(&prefs)) {
ref = list_first_entry(&prefs, struct __prelim_ref, list);
WARN_ON(ref->count < 0);
- if (ref->count && ref->root_id && ref->parent == 0) {
+ if (roots && ref->count && ref->root_id && ref->parent == 0) {
/* no parent == root of tree */
ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
if (ret < 0)
goto out;
}
if (ref->count && ref->parent) {
- struct extent_inode_elem *eie = NULL;
- if (extent_item_pos && !ref->inode_list) {
+ if (extent_item_pos && !ref->inode_list &&
+ ref->level == 0) {
u32 bsz;
struct extent_buffer *eb;
bsz = btrfs_level_size(fs_info->extent_root,
- info_level);
+ ref->level);
eb = read_tree_block(fs_info->extent_root,
ref->parent, bsz, 0);
if (!eb || !extent_buffer_uptodate(eb)) {
@@ -974,6 +1023,7 @@ again:
eie = eie->next;
eie->next = ref->inode_list;
}
+ eie = NULL;
}
list_del(&ref->list);
kmem_cache_free(btrfs_prelim_ref_cache, ref);
@@ -992,7 +1042,8 @@ out:
list_del(&ref->list);
kmem_cache_free(btrfs_prelim_ref_cache, ref);
}
-
+ if (ret < 0)
+ free_inode_elem_list(eie);
return ret;
}
@@ -1000,7 +1051,6 @@ static void free_leaf_list(struct ulist *blocks)
{
struct ulist_node *node = NULL;
struct extent_inode_elem *eie;
- struct extent_inode_elem *eie_next;
struct ulist_iterator uiter;
ULIST_ITER_INIT(&uiter);
@@ -1008,10 +1058,7 @@ static void free_leaf_list(struct ulist *blocks)
if (!node->aux)
continue;
eie = (struct extent_inode_elem *)(uintptr_t)node->aux;
- for (; eie; eie = eie_next) {
- eie_next = eie->next;
- kfree(eie);
- }
+ free_inode_elem_list(eie);
node->aux = 0;
}
@@ -1031,22 +1078,14 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
u64 time_seq, struct ulist **leafs,
const u64 *extent_item_pos)
{
- struct ulist *tmp;
int ret;
- tmp = ulist_alloc(GFP_NOFS);
- if (!tmp)
- return -ENOMEM;
*leafs = ulist_alloc(GFP_NOFS);
- if (!*leafs) {
- ulist_free(tmp);
+ if (!*leafs)
return -ENOMEM;
- }
ret = find_parent_nodes(trans, fs_info, bytenr,
- time_seq, *leafs, tmp, extent_item_pos);
- ulist_free(tmp);
-
+ time_seq, *leafs, NULL, extent_item_pos);
if (ret < 0 && ret != -ENOENT) {
free_leaf_list(*leafs);
return ret;
@@ -1068,9 +1107,9 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
*
* returns 0 on success, < 0 on error.
*/
-int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **roots)
+static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **roots)
{
struct ulist *tmp;
struct ulist_node *node = NULL;
@@ -1099,42 +1138,25 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
if (!node)
break;
bytenr = node->val;
+ cond_resched();
}
ulist_free(tmp);
return 0;
}
-
-static int __inode_info(u64 inum, u64 ioff, u8 key_type,
- struct btrfs_root *fs_root, struct btrfs_path *path,
- struct btrfs_key *found_key)
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **roots)
{
int ret;
- struct btrfs_key key;
- struct extent_buffer *eb;
-
- key.type = key_type;
- key.objectid = inum;
- key.offset = ioff;
- ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
- if (ret < 0)
- return ret;
-
- eb = path->nodes[0];
- if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
- ret = btrfs_next_leaf(fs_root, path);
- if (ret)
- return ret;
- eb = path->nodes[0];
- }
-
- btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
- if (found_key->type != key.type || found_key->objectid != key.objectid)
- return 1;
-
- return 0;
+ if (!trans)
+ down_read(&fs_info->commit_root_sem);
+ ret = __btrfs_find_all_roots(trans, fs_info, bytenr, time_seq, roots);
+ if (!trans)
+ up_read(&fs_info->commit_root_sem);
+ return ret;
}
/*
@@ -1144,16 +1166,16 @@ int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
struct btrfs_path *path)
{
struct btrfs_key key;
- return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path,
- &key);
+ return btrfs_find_item(fs_root, path, inum, ioff,
+ BTRFS_INODE_ITEM_KEY, &key);
}
static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
struct btrfs_path *path,
struct btrfs_key *found_key)
{
- return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path,
- found_key);
+ return btrfs_find_item(fs_root, path, inum, ioff,
+ BTRFS_INODE_REF_KEY, found_key);
}
int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
@@ -1333,20 +1355,20 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
if (ret < 0)
return ret;
- ret = btrfs_previous_item(fs_info->extent_root, path,
- 0, BTRFS_EXTENT_ITEM_KEY);
- if (ret < 0)
- return ret;
+ ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ return ret;
+ }
btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
if (found_key->type == BTRFS_METADATA_ITEM_KEY)
size = fs_info->extent_root->leafsize;
else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
size = found_key->offset;
- if ((found_key->type != BTRFS_EXTENT_ITEM_KEY &&
- found_key->type != BTRFS_METADATA_ITEM_KEY) ||
- found_key->objectid > logical ||
+ if (found_key->objectid > logical ||
found_key->objectid + size <= logical) {
pr_debug("logical %llu is not within any extent\n", logical);
return -ENOENT;
@@ -1387,9 +1409,10 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
* returns <0 on error
*/
static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
- struct btrfs_extent_item *ei, u32 item_size,
- struct btrfs_extent_inline_ref **out_eiref,
- int *out_type)
+ struct btrfs_key *key,
+ struct btrfs_extent_item *ei, u32 item_size,
+ struct btrfs_extent_inline_ref **out_eiref,
+ int *out_type)
{
unsigned long end;
u64 flags;
@@ -1399,19 +1422,26 @@ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
/* first call */
flags = btrfs_extent_flags(eb, ei);
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
- info = (struct btrfs_tree_block_info *)(ei + 1);
- *out_eiref =
- (struct btrfs_extent_inline_ref *)(info + 1);
+ if (key->type == BTRFS_METADATA_ITEM_KEY) {
+ /* a skinny metadata extent */
+ *out_eiref =
+ (struct btrfs_extent_inline_ref *)(ei + 1);
+ } else {
+ WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY);
+ info = (struct btrfs_tree_block_info *)(ei + 1);
+ *out_eiref =
+ (struct btrfs_extent_inline_ref *)(info + 1);
+ }
} else {
*out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
}
*ptr = (unsigned long)*out_eiref;
- if ((void *)*ptr >= (void *)ei + item_size)
+ if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size)
return -ENOENT;
}
end = (unsigned long)ei + item_size;
- *out_eiref = (struct btrfs_extent_inline_ref *)*ptr;
+ *out_eiref = (struct btrfs_extent_inline_ref *)(*ptr);
*out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
*ptr += btrfs_extent_inline_ref_size(*out_type);
@@ -1430,8 +1460,8 @@ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
* <0 on error.
*/
int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
- struct btrfs_extent_item *ei, u32 item_size,
- u64 *out_root, u8 *out_level)
+ struct btrfs_key *key, struct btrfs_extent_item *ei,
+ u32 item_size, u64 *out_root, u8 *out_level)
{
int ret;
int type;
@@ -1442,8 +1472,8 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
return 1;
while (1) {
- ret = __get_extent_inline_ref(ptr, eb, ei, item_size,
- &eiref, &type);
+ ret = __get_extent_inline_ref(ptr, eb, key, ei, item_size,
+ &eiref, &type);
if (ret < 0)
return ret;
@@ -1516,6 +1546,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
if (IS_ERR(trans))
return PTR_ERR(trans);
btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ } else {
+ down_read(&fs_info->commit_root_sem);
}
ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
@@ -1526,8 +1558,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
ULIST_ITER_INIT(&ref_uiter);
while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
- ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
- tree_mod_seq_elem.seq, &roots);
+ ret = __btrfs_find_all_roots(trans, fs_info, ref_node->val,
+ tree_mod_seq_elem.seq, &roots);
if (ret)
break;
ULIST_ITER_INIT(&root_uiter);
@@ -1549,6 +1581,8 @@ out:
if (!search_commit_root) {
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
btrfs_end_transaction(trans, fs_info->extent_root);
+ } else {
+ up_read(&fs_info->commit_root_sem);
}
return ret;
@@ -1599,7 +1633,6 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
struct btrfs_key found_key;
while (!ret) {
- path->leave_spinning = 1;
ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
&found_key);
if (ret < 0)
@@ -1612,14 +1645,17 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
parent = found_key.offset;
slot = path->slots[0];
- eb = path->nodes[0];
- /* make sure we can use eb after releasing the path */
- atomic_inc(&eb->refs);
+ eb = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!eb) {
+ ret = -ENOMEM;
+ break;
+ }
+ extent_buffer_get(eb);
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
btrfs_release_path(path);
- item = btrfs_item_nr(eb, slot);
+ item = btrfs_item_nr(slot);
iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
@@ -1672,17 +1708,20 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
++found;
slot = path->slots[0];
- eb = path->nodes[0];
- /* make sure we can use eb after releasing the path */
- atomic_inc(&eb->refs);
+ eb = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!eb) {
+ ret = -ENOMEM;
+ break;
+ }
+ extent_buffer_get(eb);
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
btrfs_release_path(path);
leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, path->slots[0]);
- ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ item_size = btrfs_item_size_nr(leaf, slot);
+ ptr = btrfs_item_ptr_offset(leaf, slot);
cur_offset = 0;
while (cur_offset < item_size) {
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index a910b27a8ad..86fc20fec28 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -40,8 +40,8 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
u64 *flags);
int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
- struct btrfs_extent_item *ei, u32 item_size,
- u64 *out_root, u8 *out_level);
+ struct btrfs_key *key, struct btrfs_extent_item *ei,
+ u32 item_size, u64 *out_root, u8 *out_level);
int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
u64 extent_item_objectid,
@@ -55,8 +55,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 time_seq, struct ulist **roots);
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **roots);
char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
u32 name_len, unsigned long name_off,
struct extent_buffer *eb_in, u64 parent,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 71f074e1870..4794923c410 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -19,6 +19,7 @@
#ifndef __BTRFS_I__
#define __BTRFS_I__
+#include <linux/hash.h>
#include "extent_map.h"
#include "extent_io.h"
#include "ordered-data.h"
@@ -42,6 +43,7 @@
#define BTRFS_INODE_COPY_EVERYTHING 8
#define BTRFS_INODE_IN_DELALLOC_LIST 9
#define BTRFS_INODE_READDIO_NEED_LOCK 10
+#define BTRFS_INODE_HAS_PROPS 11
/* in memory btrfs inode */
struct btrfs_inode {
@@ -107,14 +109,17 @@ struct btrfs_inode {
u64 last_trans;
/*
- * log transid when this inode was last modified
+ * transid that last logged this inode
*/
- u64 last_sub_trans;
+ u64 logged_trans;
/*
- * transid that last logged this inode
+ * log transid when this inode was last modified
*/
- u64 logged_trans;
+ int last_sub_trans;
+
+ /* a local copy of root's last_log_commit */
+ int last_log_commit;
/* total number of bytes pending delalloc, used by stat to calc the
* real block usage of the file
@@ -134,6 +139,9 @@ struct btrfs_inode {
*/
u64 index_cnt;
+ /* Cache the directory index number to speed the dir/file remove */
+ u64 dir_index;
+
/* the fsync log has some corner cases that mean we have to check
* directories to see if any unlinks have been done before
* the directory was logged. See tree-log.c for all the
@@ -150,9 +158,6 @@ struct btrfs_inode {
/* flags field from the on disk inode */
u32 flags;
- /* a local copy of root's last_log_commit */
- unsigned long last_log_commit;
-
/*
* Counters to keep track of the number of extent item's we may use due
* to delalloc and such. outstanding_extents is the number of extent
@@ -179,6 +184,25 @@ static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
return container_of(inode, struct btrfs_inode, vfs_inode);
}
+static inline unsigned long btrfs_inode_hash(u64 objectid,
+ const struct btrfs_root *root)
+{
+ u64 h = objectid ^ (root->objectid * GOLDEN_RATIO_PRIME);
+
+#if BITS_PER_LONG == 32
+ h = (h >> 32) ^ (h & 0xffffffff);
+#endif
+
+ return (unsigned long)h;
+}
+
+static inline void btrfs_insert_inode_hash(struct inode *inode)
+{
+ unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root);
+
+ __insert_inode_hash(inode, h);
+}
+
static inline u64 btrfs_ino(struct inode *inode)
{
u64 ino = BTRFS_I(inode)->location.objectid;
@@ -255,9 +279,11 @@ static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
{
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
&BTRFS_I(inode)->runtime_flags);
}
+bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end);
+
#endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 1c47be18724..ce92ae30250 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -77,17 +77,26 @@
* the integrity of (super)-block write requests, do not
* enable the config option BTRFS_FS_CHECK_INTEGRITY to
* include and compile the integrity check tool.
+ *
+ * Expect millions of lines of information in the kernel log with an
+ * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the
+ * kernel config to at least 26 (which is 64MB). Usually the value is
+ * limited to 21 (which is 2MB) in init/Kconfig. The file needs to be
+ * changed like this before LOG_BUF_SHIFT can be set to a high value:
+ * config LOG_BUF_SHIFT
+ * int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
+ * range 12 30
*/
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/mutex.h>
-#include <linux/crc32c.h>
#include <linux/genhd.h>
#include <linux/blkdev.h>
#include "ctree.h"
#include "disk-io.h"
+#include "hash.h"
#include "transaction.h"
#include "extent_io.h"
#include "volumes.h"
@@ -124,6 +133,7 @@
#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400
#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800
#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000
+#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE 0x00002000
struct btrfsic_dev_state;
struct btrfsic_state;
@@ -323,7 +333,6 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
static int btrfsic_read_block(struct btrfsic_state *state,
struct btrfsic_block_data_ctx *block_ctx);
static void btrfsic_dump_database(struct btrfsic_state *state);
-static void btrfsic_complete_bio_end_io(struct bio *bio, int err);
static int btrfsic_test_for_metadata(struct btrfsic_state *state,
char **datav, unsigned int num_pages);
static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
@@ -1038,7 +1047,7 @@ leaf_item_out_of_bounce_error:
disk_item_offset,
sizeof(struct btrfs_item));
item_offset = btrfs_stack_item_offset(&disk_item);
- item_size = btrfs_stack_item_offset(&disk_item);
+ item_size = btrfs_stack_item_size(&disk_item);
disk_key = &disk_item.key;
type = btrfs_disk_key_type(disk_key);
@@ -1084,6 +1093,7 @@ leaf_item_out_of_bounce_error:
next_stack =
btrfsic_stack_frame_alloc();
if (NULL == next_stack) {
+ sf->error = -1;
btrfsic_release_block_ctx(
&sf->
next_block_ctx);
@@ -1181,8 +1191,10 @@ continue_with_current_node_stack_frame:
sf->next_block_ctx.datav[0];
next_stack = btrfsic_stack_frame_alloc();
- if (NULL == next_stack)
+ if (NULL == next_stack) {
+ sf->error = -1;
goto one_stack_frame_backwards;
+ }
next_stack->i = -1;
next_stack->block = sf->next_block;
@@ -1447,10 +1459,14 @@ static int btrfsic_handle_extent_data(
btrfsic_read_from_block_data(block_ctx, &file_extent_item,
file_extent_item_offset,
sizeof(struct btrfs_file_extent_item));
- next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item) +
- btrfs_stack_file_extent_offset(&file_extent_item);
- generation = btrfs_stack_file_extent_generation(&file_extent_item);
- num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item);
+ next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item);
+ if (btrfs_stack_file_extent_compression(&file_extent_item) ==
+ BTRFS_COMPRESS_NONE) {
+ next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item);
+ num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item);
+ } else {
+ num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item);
+ }
generation = btrfs_stack_file_extent_generation(&file_extent_item);
if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
@@ -1677,7 +1693,6 @@ static int btrfsic_read_block(struct btrfsic_state *state,
for (i = 0; i < num_pages;) {
struct bio *bio;
unsigned int j;
- DECLARE_COMPLETION_ONSTACK(complete);
bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i);
if (!bio) {
@@ -1687,9 +1702,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
return -1;
}
bio->bi_bdev = block_ctx->dev->bdev;
- bio->bi_sector = dev_bytenr >> 9;
- bio->bi_end_io = btrfsic_complete_bio_end_io;
- bio->bi_private = &complete;
+ bio->bi_iter.bi_sector = dev_bytenr >> 9;
for (j = i; j < num_pages; j++) {
ret = bio_add_page(bio, block_ctx->pagev[j],
@@ -1702,12 +1715,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
"btrfsic: error, failed to add a single page!\n");
return -1;
}
- submit_bio(READ, bio);
-
- /* this will also unplug the queue */
- wait_for_completion(&complete);
-
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ if (submit_bio_wait(READ, bio)) {
printk(KERN_INFO
"btrfsic: read error at logical %llu dev %s!\n",
block_ctx->start, block_ctx->dev->name);
@@ -1730,11 +1738,6 @@ static int btrfsic_read_block(struct btrfsic_state *state,
return block_ctx->len;
}
-static void btrfsic_complete_bio_end_io(struct bio *bio, int err)
-{
- complete((struct completion *)bio->bi_private);
-}
-
static void btrfsic_dump_database(struct btrfsic_state *state)
{
struct list_head *elem_all;
@@ -1823,7 +1826,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state,
size_t sublen = i ? PAGE_CACHE_SIZE :
(PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE);
- crc = crc32c(crc, data, sublen);
+ crc = btrfs_crc32c(crc, data, sublen);
}
btrfs_csum_final(crc, csum);
if (memcmp(csum, h->csum, state->csum_size))
@@ -1900,7 +1903,9 @@ again:
dev_state,
dev_bytenr);
}
- if (block->logical_bytenr != bytenr) {
+ if (block->logical_bytenr != bytenr &&
+ !(!block->is_metadata &&
+ block->logical_bytenr == 0))
printk(KERN_INFO
"Written block @%llu (%s/%llu/%d)"
" found in hash table, %c,"
@@ -1910,15 +1915,14 @@ again:
block->mirror_num,
btrfsic_get_block_type(state, block),
block->logical_bytenr);
- block->logical_bytenr = bytenr;
- } else if (state->print_mask &
- BTRFSIC_PRINT_MASK_VERBOSE)
+ else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
printk(KERN_INFO
"Written block @%llu (%s/%llu/%d)"
" found in hash table, %c.\n",
bytenr, dev_state->name, dev_bytenr,
block->mirror_num,
btrfsic_get_block_type(state, block));
+ block->logical_bytenr = bytenr;
} else {
if (num_pages * PAGE_CACHE_SIZE <
state->datablock_size) {
@@ -2463,10 +2467,8 @@ static int btrfsic_process_written_superblock(
}
}
- if (-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)) {
- WARN_ON(1);
+ if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0)))
btrfsic_dump_tree(state);
- }
return 0;
}
@@ -2906,7 +2908,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
btrfsic_release_block_ctx(&block_ctx);
}
- if (!match) {
+ if (WARN_ON(!match)) {
printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio,"
" buffer->log_bytenr=%llu, submit_bio(bdev=%s,"
" phys_bytenr=%llu)!\n",
@@ -2923,7 +2925,6 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
bytenr, block_ctx.dev->name,
block_ctx.dev_bytenr, mirror_num);
}
- WARN_ON(1);
}
}
@@ -3000,14 +3001,12 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)
return submit_bh(rw, bh);
}
-void btrfsic_submit_bio(int rw, struct bio *bio)
+static void __btrfsic_submit_bio(int rw, struct bio *bio)
{
struct btrfsic_dev_state *dev_state;
- if (!btrfsic_is_initialized) {
- submit_bio(rw, bio);
+ if (!btrfsic_is_initialized)
return;
- }
mutex_lock(&btrfsic_mutex);
/* since btrfsic_submit_bio() is also called before
@@ -3017,10 +3016,11 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
(rw & WRITE) && NULL != bio->bi_io_vec) {
unsigned int i;
u64 dev_bytenr;
+ u64 cur_bytenr;
int bio_is_patched;
char **mapped_datav;
- dev_bytenr = 512 * bio->bi_sector;
+ dev_bytenr = 512 * bio->bi_iter.bi_sector;
bio_is_patched = 0;
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
@@ -3028,13 +3028,14 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
"submit_bio(rw=0x%x, bi_vcnt=%u,"
" bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
rw, bio->bi_vcnt,
- (unsigned long long)bio->bi_sector, dev_bytenr,
- bio->bi_bdev);
+ (unsigned long long)bio->bi_iter.bi_sector,
+ dev_bytenr, bio->bi_bdev);
mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt,
GFP_NOFS);
if (!mapped_datav)
goto leave;
+ cur_bytenr = dev_bytenr;
for (i = 0; i < bio->bi_vcnt; i++) {
BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE);
mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page);
@@ -3046,16 +3047,13 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
kfree(mapped_datav);
goto leave;
}
- if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
- BTRFSIC_PRINT_MASK_VERBOSE) ==
- (dev_state->state->print_mask &
- (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
- BTRFSIC_PRINT_MASK_VERBOSE)))
+ if (dev_state->state->print_mask &
+ BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
printk(KERN_INFO
- "#%u: page=%p, len=%u, offset=%u\n",
- i, bio->bi_io_vec[i].bv_page,
- bio->bi_io_vec[i].bv_len,
+ "#%u: bytenr=%llu, len=%u, offset=%u\n",
+ i, cur_bytenr, bio->bi_io_vec[i].bv_len,
bio->bi_io_vec[i].bv_offset);
+ cur_bytenr += bio->bi_io_vec[i].bv_len;
}
btrfsic_process_written_block(dev_state, dev_bytenr,
mapped_datav, bio->bi_vcnt,
@@ -3099,10 +3097,20 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
}
leave:
mutex_unlock(&btrfsic_mutex);
+}
+void btrfsic_submit_bio(int rw, struct bio *bio)
+{
+ __btrfsic_submit_bio(rw, bio);
submit_bio(rw, bio);
}
+int btrfsic_submit_bio_wait(int rw, struct bio *bio)
+{
+ __btrfsic_submit_bio(rw, bio);
+ return submit_bio_wait(rw, bio);
+}
+
int btrfsic_mount(struct btrfs_root *root,
struct btrfs_fs_devices *fs_devices,
int including_extent_data, u32 print_mask)
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
index 8b59175cc50..13b8566c97a 100644
--- a/fs/btrfs/check-integrity.h
+++ b/fs/btrfs/check-integrity.h
@@ -22,9 +22,11 @@
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
int btrfsic_submit_bh(int rw, struct buffer_head *bh);
void btrfsic_submit_bio(int rw, struct bio *bio);
+int btrfsic_submit_bio_wait(int rw, struct bio *bio);
#else
#define btrfsic_submit_bh submit_bh
#define btrfsic_submit_bio submit_bio
+#define btrfsic_submit_bio_wait submit_bio_wait
#endif
int btrfsic_mount(struct btrfs_root *root,
diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h
deleted file mode 100644
index 7c4503ef6ef..00000000000
--- a/fs/btrfs/compat.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _COMPAT_H_
-#define _COMPAT_H_
-
-#define btrfs_drop_nlink(inode) drop_nlink(inode)
-#define btrfs_inc_nlink(inode) inc_nlink(inode)
-
-#endif /* _COMPAT_H_ */
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 6aad98cb343..1daea0b4718 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -32,7 +32,6 @@
#include <linux/writeback.h>
#include <linux/bit_spinlock.h>
#include <linux/slab.h>
-#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -129,11 +128,10 @@ static int check_compressed_csum(struct inode *inode,
kunmap_atomic(kaddr);
if (csum != *cb_sum) {
- printk(KERN_INFO "btrfs csum failed ino %llu "
- "extent %llu csum %u "
- "wanted %u mirror %d\n",
- btrfs_ino(inode), disk_start, csum, *cb_sum,
- cb->mirror_num);
+ btrfs_info(BTRFS_I(inode)->root->fs_info,
+ "csum failed ino %llu extent %llu csum %u wanted %u mirror %d",
+ btrfs_ino(inode), disk_start, csum, *cb_sum,
+ cb->mirror_num);
ret = -EIO;
goto fail;
}
@@ -173,7 +171,8 @@ static void end_compressed_bio_read(struct bio *bio, int err)
goto out;
inode = cb->inode;
- ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9);
+ ret = check_compressed_csum(inode, cb,
+ (u64)bio->bi_iter.bi_sector << 9);
if (ret)
goto csum_failed;
@@ -202,18 +201,16 @@ csum_failed:
if (cb->errors) {
bio_io_error(cb->orig_bio);
} else {
- int bio_index = 0;
- struct bio_vec *bvec = cb->orig_bio->bi_io_vec;
+ int i;
+ struct bio_vec *bvec;
/*
* we have verified the checksum already, set page
* checked so the end_io handlers know about it
*/
- while (bio_index < cb->orig_bio->bi_vcnt) {
+ bio_for_each_segment_all(bvec, cb->orig_bio, i)
SetPageChecked(bvec->bv_page);
- bvec++;
- bio_index++;
- }
+
bio_endio(cb->orig_bio, 0);
}
@@ -360,7 +357,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
- if(!bio) {
+ if (!bio) {
kfree(cb);
return -ENOMEM;
}
@@ -373,7 +370,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
page = compressed_pages[pg_index];
page->mapping = inode->i_mapping;
- if (bio->bi_size)
+ if (bio->bi_iter.bi_size)
ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
PAGE_CACHE_SIZE,
bio, 0);
@@ -413,7 +410,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
}
if (bytes_left < PAGE_CACHE_SIZE) {
- printk("bytes left %lu compress len %lu nr %lu\n",
+ btrfs_info(BTRFS_I(inode)->root->fs_info,
+ "bytes left %lu compress len %lu nr %lu",
bytes_left, cb->compressed_len, cb->nr_pages);
}
bytes_left -= PAGE_CACHE_SIZE;
@@ -474,7 +472,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
rcu_read_lock();
page = radix_tree_lookup(&mapping->page_tree, pg_index);
rcu_read_unlock();
- if (page) {
+ if (page && !radix_tree_exceptional_entry(page)) {
misses++;
if (misses > 4)
break;
@@ -507,7 +505,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (!em || last_offset < em->start ||
(last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
- (em->block_start >> 9) != cb->orig_bio->bi_sector) {
+ (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
unlock_extent(tree, last_offset, end);
unlock_page(page);
@@ -553,7 +551,7 @@ next:
* in it. We don't actually do IO on those pages but allocate new ones
* to hold the compressed pages on disk.
*
- * bio->bi_sector points to the compressed extent on disk
+ * bio->bi_iter.bi_sector points to the compressed extent on disk
* bio->bi_io_vec points to all of the inode pages
* bio->bi_vcnt is a count of pages
*
@@ -574,7 +572,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
struct page *page;
struct block_device *bdev;
struct bio *comp_bio;
- u64 cur_disk_byte = (u64)bio->bi_sector << 9;
+ u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9;
u64 em_len;
u64 em_start;
struct extent_map *em;
@@ -660,7 +658,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
page->mapping = inode->i_mapping;
page->index = em_start >> PAGE_CACHE_SHIFT;
- if (comp_bio->bi_size)
+ if (comp_bio->bi_iter.bi_size)
ret = tree->ops->merge_bio_hook(READ, page, 0,
PAGE_CACHE_SIZE,
comp_bio, 0);
@@ -688,8 +686,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
comp_bio, sums);
BUG_ON(ret); /* -ENOMEM */
}
- sums += (comp_bio->bi_size + root->sectorsize - 1) /
- root->sectorsize;
+ sums += (comp_bio->bi_iter.bi_size +
+ root->sectorsize - 1) / root->sectorsize;
ret = btrfs_map_bio(root, READ, comp_bio,
mirror_num, 0);
@@ -823,7 +821,7 @@ static void free_workspace(int type, struct list_head *workspace)
spin_lock(workspace_lock);
if (*num_workspace < num_online_cpus()) {
- list_add_tail(workspace, idle_workspace);
+ list_add(workspace, idle_workspace);
(*num_workspace)++;
spin_unlock(workspace_lock);
goto wake;
@@ -889,7 +887,7 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
workspace = find_workspace(type);
if (IS_ERR(workspace))
- return -1;
+ return PTR_ERR(workspace);
ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
start, len, pages,
@@ -925,7 +923,7 @@ static int btrfs_decompress_biovec(int type, struct page **pages_in,
workspace = find_workspace(type);
if (IS_ERR(workspace))
- return -ENOMEM;
+ return PTR_ERR(workspace);
ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
disk_start,
@@ -947,7 +945,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
workspace = find_workspace(type);
if (IS_ERR(workspace))
- return -ENOMEM;
+ return PTR_ERR(workspace);
ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
dest_page, start_byte,
@@ -1012,6 +1010,8 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
bytes = min(bytes, working_bytes);
kaddr = kmap_atomic(page_out);
memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
+ if (*pg_index == (vcnt - 1) && *pg_offset == 0)
+ memset(kaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
kunmap_atomic(kaddr);
flush_dcache_page(page_out);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 61b5bcd57b7..aeab453b8e2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -39,9 +39,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
int level, int slot);
-static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb);
-static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
struct btrfs_path *btrfs_alloc_path(void)
{
@@ -225,7 +224,8 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
static void add_root_to_dirty_list(struct btrfs_root *root)
{
spin_lock(&root->fs_info->trans_lock);
- if (root->track_dirty && list_empty(&root->dirty_list)) {
+ if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) &&
+ list_empty(&root->dirty_list)) {
list_add(&root->dirty_list,
&root->fs_info->dirty_cowonly_roots);
}
@@ -247,9 +247,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
int level;
struct btrfs_disk_key disk_key;
- WARN_ON(root->ref_cows && trans->transid !=
- root->fs_info->running_transaction->transid);
- WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+ WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ trans->transid != root->fs_info->running_transaction->transid);
+ WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ trans->transid != root->last_trans);
level = btrfs_header_level(buf);
if (level == 0)
@@ -274,7 +275,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
else
btrfs_set_header_owner(cow, new_root_objectid);
- write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow),
+ write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
BTRFS_FSID_SIZE);
WARN_ON(btrfs_header_generation(buf) > trans->transid);
@@ -355,44 +356,14 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
}
/*
- * Increment the upper half of tree_mod_seq, set lower half zero.
- *
- * Must be called with fs_info->tree_mod_seq_lock held.
+ * Pull a new tree mod seq number for our operation.
*/
-static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
-{
- u64 seq = atomic64_read(&fs_info->tree_mod_seq);
- seq &= 0xffffffff00000000ull;
- seq += 1ull << 32;
- atomic64_set(&fs_info->tree_mod_seq, seq);
- return seq;
-}
-
-/*
- * Increment the lower half of tree_mod_seq.
- *
- * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
- * are generated should not technically require a spin lock here. (Rationale:
- * incrementing the minor while incrementing the major seq number is between its
- * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it
- * just returns a unique sequence number as usual.) We have decided to leave
- * that requirement in here and rethink it once we notice it really imposes a
- * problem on some workload.
- */
-static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
+static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
return atomic64_inc_return(&fs_info->tree_mod_seq);
}
/*
- * return the last minor in the previous major tree_mod_seq number
- */
-u64 btrfs_tree_mod_seq_prev(u64 seq)
-{
- return (seq & 0xffffffff00000000ull) - 1ull;
-}
-
-/*
* This adds a new blocker to the tree mod log's blocker list if the @elem
* passed does not already have a sequence number set. So when a caller expects
* to record tree modifications, it should ensure to set elem->seq to zero
@@ -403,19 +374,16 @@ u64 btrfs_tree_mod_seq_prev(u64 seq)
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem)
{
- u64 seq;
-
tree_mod_log_write_lock(fs_info);
spin_lock(&fs_info->tree_mod_seq_lock);
if (!elem->seq) {
- elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
+ elem->seq = btrfs_inc_tree_mod_seq(fs_info);
list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
}
- seq = btrfs_inc_tree_mod_seq_minor(fs_info);
spin_unlock(&fs_info->tree_mod_seq_lock);
tree_mod_log_write_unlock(fs_info);
- return seq;
+ return elem->seq;
}
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
@@ -475,6 +443,8 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
* the index is the shifted logical of the *new* root node for root replace
* operations, or the shifted logical of the affected block for all other
* operations.
+ *
+ * Note: must be called with write lock (tree_mod_log_write_lock).
*/
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
@@ -483,27 +453,10 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
struct rb_node **new;
struct rb_node *parent = NULL;
struct tree_mod_elem *cur;
- int ret = 0;
BUG_ON(!tm);
- tree_mod_log_write_lock(fs_info);
- if (list_empty(&fs_info->tree_mod_seq_list)) {
- tree_mod_log_write_unlock(fs_info);
- /*
- * Ok we no longer care about logging modifications, free up tm
- * and return 0. Any callers shouldn't be using tm after
- * calling tree_mod_log_insert, but if they do we can just
- * change this to return a special error code to let the callers
- * do their own thing.
- */
- kfree(tm);
- return 0;
- }
-
- spin_lock(&fs_info->tree_mod_seq_lock);
- tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
- spin_unlock(&fs_info->tree_mod_seq_lock);
+ tm->seq = btrfs_inc_tree_mod_seq(fs_info);
tm_root = &fs_info->tree_mod_log;
new = &tm_root->rb_node;
@@ -518,18 +471,13 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
new = &((*new)->rb_left);
else if (cur->seq > tm->seq)
new = &((*new)->rb_right);
- else {
- ret = -EEXIST;
- kfree(tm);
- goto out;
- }
+ else
+ return -EEXIST;
}
rb_link_node(&tm->node, parent, new);
rb_insert_color(&tm->node, tm_root);
-out:
- tree_mod_log_write_unlock(fs_info);
- return ret;
+ return 0;
}
/*
@@ -545,19 +493,38 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
return 1;
if (eb && btrfs_header_level(eb) == 0)
return 1;
+
+ tree_mod_log_write_lock(fs_info);
+ if (list_empty(&(fs_info)->tree_mod_seq_list)) {
+ tree_mod_log_write_unlock(fs_info);
+ return 1;
+ }
+
return 0;
}
-static inline int
-__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb, int slot,
- enum mod_log_op op, gfp_t flags)
+/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
+static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb)
+{
+ smp_mb();
+ if (list_empty(&(fs_info)->tree_mod_seq_list))
+ return 0;
+ if (eb && btrfs_header_level(eb) == 0)
+ return 0;
+
+ return 1;
+}
+
+static struct tree_mod_elem *
+alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
+ enum mod_log_op op, gfp_t flags)
{
struct tree_mod_elem *tm;
tm = kzalloc(sizeof(*tm), flags);
if (!tm)
- return -ENOMEM;
+ return NULL;
tm->index = eb->start >> PAGE_CACHE_SHIFT;
if (op != MOD_LOG_KEY_ADD) {
@@ -567,8 +534,9 @@ __tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
tm->op = op;
tm->slot = slot;
tm->generation = btrfs_node_ptr_generation(eb, slot);
+ RB_CLEAR_NODE(&tm->node);
- return __tree_mod_log_insert(fs_info, tm);
+ return tm;
}
static noinline int
@@ -576,10 +544,27 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb, int slot,
enum mod_log_op op, gfp_t flags)
{
- if (tree_mod_dont_log(fs_info, eb))
+ struct tree_mod_elem *tm;
+ int ret;
+
+ if (!tree_mod_need_log(fs_info, eb))
return 0;
- return __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);
+ tm = alloc_tree_mod_elem(eb, slot, op, flags);
+ if (!tm)
+ return -ENOMEM;
+
+ if (tree_mod_dont_log(fs_info, eb)) {
+ kfree(tm);
+ return 0;
+ }
+
+ ret = __tree_mod_log_insert(fs_info, tm);
+ tree_mod_log_write_unlock(fs_info);
+ if (ret)
+ kfree(tm);
+
+ return ret;
}
static noinline int
@@ -587,53 +572,95 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
struct extent_buffer *eb, int dst_slot, int src_slot,
int nr_items, gfp_t flags)
{
- struct tree_mod_elem *tm;
- int ret;
+ struct tree_mod_elem *tm = NULL;
+ struct tree_mod_elem **tm_list = NULL;
+ int ret = 0;
int i;
+ int locked = 0;
- if (tree_mod_dont_log(fs_info, eb))
+ if (!tree_mod_need_log(fs_info, eb))
return 0;
+ tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags);
+ if (!tm_list)
+ return -ENOMEM;
+
+ tm = kzalloc(sizeof(*tm), flags);
+ if (!tm) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+
+ tm->index = eb->start >> PAGE_CACHE_SHIFT;
+ tm->slot = src_slot;
+ tm->move.dst_slot = dst_slot;
+ tm->move.nr_items = nr_items;
+ tm->op = MOD_LOG_MOVE_KEYS;
+
+ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+ tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
+ MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
+ if (!tm_list[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+
+ if (tree_mod_dont_log(fs_info, eb))
+ goto free_tms;
+ locked = 1;
+
/*
* When we override something during the move, we log these removals.
* This can only happen when we move towards the beginning of the
* buffer, i.e. dst_slot < src_slot.
*/
for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
- ret = __tree_mod_log_insert_key(fs_info, eb, i + dst_slot,
- MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
- BUG_ON(ret < 0);
+ ret = __tree_mod_log_insert(fs_info, tm_list[i]);
+ if (ret)
+ goto free_tms;
}
- tm = kzalloc(sizeof(*tm), flags);
- if (!tm)
- return -ENOMEM;
+ ret = __tree_mod_log_insert(fs_info, tm);
+ if (ret)
+ goto free_tms;
+ tree_mod_log_write_unlock(fs_info);
+ kfree(tm_list);
- tm->index = eb->start >> PAGE_CACHE_SHIFT;
- tm->slot = src_slot;
- tm->move.dst_slot = dst_slot;
- tm->move.nr_items = nr_items;
- tm->op = MOD_LOG_MOVE_KEYS;
+ return 0;
+free_tms:
+ for (i = 0; i < nr_items; i++) {
+ if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
+ rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
+ kfree(tm_list[i]);
+ }
+ if (locked)
+ tree_mod_log_write_unlock(fs_info);
+ kfree(tm_list);
+ kfree(tm);
- return __tree_mod_log_insert(fs_info, tm);
+ return ret;
}
-static inline void
-__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
+static inline int
+__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+ struct tree_mod_elem **tm_list,
+ int nritems)
{
- int i;
- u32 nritems;
+ int i, j;
int ret;
- if (btrfs_header_level(eb) == 0)
- return;
-
- nritems = btrfs_header_nritems(eb);
for (i = nritems - 1; i >= 0; i--) {
- ret = __tree_mod_log_insert_key(fs_info, eb, i,
- MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
- BUG_ON(ret < 0);
+ ret = __tree_mod_log_insert(fs_info, tm_list[i]);
+ if (ret) {
+ for (j = nritems - 1; j > i; j--)
+ rb_erase(&tm_list[j]->node,
+ &fs_info->tree_mod_log);
+ return ret;
+ }
}
+
+ return 0;
}
static noinline int
@@ -642,17 +669,38 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
struct extent_buffer *new_root, gfp_t flags,
int log_removal)
{
- struct tree_mod_elem *tm;
+ struct tree_mod_elem *tm = NULL;
+ struct tree_mod_elem **tm_list = NULL;
+ int nritems = 0;
+ int ret = 0;
+ int i;
- if (tree_mod_dont_log(fs_info, NULL))
+ if (!tree_mod_need_log(fs_info, NULL))
return 0;
- if (log_removal)
- __tree_mod_log_free_eb(fs_info, old_root);
+ if (log_removal && btrfs_header_level(old_root) > 0) {
+ nritems = btrfs_header_nritems(old_root);
+ tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
+ flags);
+ if (!tm_list) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ for (i = 0; i < nritems; i++) {
+ tm_list[i] = alloc_tree_mod_elem(old_root, i,
+ MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
+ if (!tm_list[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+ }
tm = kzalloc(sizeof(*tm), flags);
- if (!tm)
- return -ENOMEM;
+ if (!tm) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
tm->index = new_root->start >> PAGE_CACHE_SHIFT;
tm->old_root.logical = old_root->start;
@@ -660,7 +708,30 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
tm->generation = btrfs_header_generation(old_root);
tm->op = MOD_LOG_ROOT_REPLACE;
- return __tree_mod_log_insert(fs_info, tm);
+ if (tree_mod_dont_log(fs_info, NULL))
+ goto free_tms;
+
+ if (tm_list)
+ ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
+ if (!ret)
+ ret = __tree_mod_log_insert(fs_info, tm);
+
+ tree_mod_log_write_unlock(fs_info);
+ if (ret)
+ goto free_tms;
+ kfree(tm_list);
+
+ return ret;
+
+free_tms:
+ if (tm_list) {
+ for (i = 0; i < nritems; i++)
+ kfree(tm_list[i]);
+ kfree(tm_list);
+ }
+ kfree(tm);
+
+ return ret;
}
static struct tree_mod_elem *
@@ -729,31 +800,75 @@ tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
-static noinline void
+static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
struct extent_buffer *src, unsigned long dst_offset,
unsigned long src_offset, int nr_items)
{
- int ret;
+ int ret = 0;
+ struct tree_mod_elem **tm_list = NULL;
+ struct tree_mod_elem **tm_list_add, **tm_list_rem;
int i;
+ int locked = 0;
- if (tree_mod_dont_log(fs_info, NULL))
- return;
+ if (!tree_mod_need_log(fs_info, NULL))
+ return 0;
if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
- return;
+ return 0;
+ tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *),
+ GFP_NOFS);
+ if (!tm_list)
+ return -ENOMEM;
+
+ tm_list_add = tm_list;
+ tm_list_rem = tm_list + nr_items;
for (i = 0; i < nr_items; i++) {
- ret = __tree_mod_log_insert_key(fs_info, src,
- i + src_offset,
- MOD_LOG_KEY_REMOVE, GFP_NOFS);
- BUG_ON(ret < 0);
- ret = __tree_mod_log_insert_key(fs_info, dst,
- i + dst_offset,
- MOD_LOG_KEY_ADD,
- GFP_NOFS);
- BUG_ON(ret < 0);
+ tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
+ MOD_LOG_KEY_REMOVE, GFP_NOFS);
+ if (!tm_list_rem[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+
+ tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
+ MOD_LOG_KEY_ADD, GFP_NOFS);
+ if (!tm_list_add[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+
+ if (tree_mod_dont_log(fs_info, NULL))
+ goto free_tms;
+ locked = 1;
+
+ for (i = 0; i < nr_items; i++) {
+ ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
+ if (ret)
+ goto free_tms;
+ ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
+ if (ret)
+ goto free_tms;
+ }
+
+ tree_mod_log_write_unlock(fs_info);
+ kfree(tm_list);
+
+ return 0;
+
+free_tms:
+ for (i = 0; i < nr_items * 2; i++) {
+ if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
+ rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
+ kfree(tm_list[i]);
}
+ if (locked)
+ tree_mod_log_write_unlock(fs_info);
+ kfree(tm_list);
+
+ return ret;
}
static inline void
@@ -772,18 +887,58 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
{
int ret;
- ret = __tree_mod_log_insert_key(fs_info, eb, slot,
+ ret = tree_mod_log_insert_key(fs_info, eb, slot,
MOD_LOG_KEY_REPLACE,
atomic ? GFP_ATOMIC : GFP_NOFS);
BUG_ON(ret < 0);
}
-static noinline void
+static noinline int
tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
+ struct tree_mod_elem **tm_list = NULL;
+ int nritems = 0;
+ int i;
+ int ret = 0;
+
+ if (btrfs_header_level(eb) == 0)
+ return 0;
+
+ if (!tree_mod_need_log(fs_info, NULL))
+ return 0;
+
+ nritems = btrfs_header_nritems(eb);
+ tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *),
+ GFP_NOFS);
+ if (!tm_list)
+ return -ENOMEM;
+
+ for (i = 0; i < nritems; i++) {
+ tm_list[i] = alloc_tree_mod_elem(eb, i,
+ MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
+ if (!tm_list[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+
if (tree_mod_dont_log(fs_info, eb))
- return;
- __tree_mod_log_free_eb(fs_info, eb);
+ goto free_tms;
+
+ ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
+ tree_mod_log_write_unlock(fs_info);
+ if (ret)
+ goto free_tms;
+ kfree(tm_list);
+
+ return 0;
+
+free_tms:
+ for (i = 0; i < nritems; i++)
+ kfree(tm_list[i]);
+ kfree(tm_list);
+
+ return ret;
}
static noinline void
@@ -809,14 +964,14 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
* snapshot and the block was not allocated by tree relocation,
* we know the block is not shared.
*/
- if (root->ref_cows &&
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
buf != root->node && buf != root->commit_root &&
(btrfs_header_generation(buf) <=
btrfs_root_last_snapshot(&root->root_item) ||
btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
- if (root->ref_cows &&
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
return 1;
#endif
@@ -958,9 +1113,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_assert_tree_locked(buf);
- WARN_ON(root->ref_cows && trans->transid !=
- root->fs_info->running_transaction->transid);
- WARN_ON(root->ref_cows && trans->transid != root->last_trans);
+ WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ trans->transid != root->fs_info->running_transaction->transid);
+ WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ trans->transid != root->last_trans);
level = btrfs_header_level(buf);
@@ -996,7 +1152,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
else
btrfs_set_header_owner(cow, root->root_key.objectid);
- write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow),
+ write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
BTRFS_FSID_SIZE);
ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
@@ -1005,7 +1161,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
return ret;
}
- if (root->ref_cows) {
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
if (ret)
return ret;
@@ -1041,8 +1197,13 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_set_node_ptr_generation(parent, parent_slot,
trans->transid);
btrfs_mark_buffer_dirty(parent);
- if (last_ref)
- tree_mod_log_free_eb(root->fs_info, buf);
+ if (last_ref) {
+ ret = tree_mod_log_free_eb(root->fs_info, buf);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
+ }
btrfs_free_tree_block(trans, root, buf, parent_start,
last_ref);
}
@@ -1285,11 +1446,10 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
free_extent_buffer(eb_root);
blocksize = btrfs_level_size(root, old_root->level);
old = read_tree_block(root, logical, blocksize, 0);
- if (!old || !extent_buffer_uptodate(old)) {
+ if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
free_extent_buffer(old);
- pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
- logical);
- WARN_ON(1);
+ btrfs_warn(root->fs_info,
+ "failed to read tree block %llu from get_old_root", logical);
} else {
eb = btrfs_clone_extent_buffer(old);
free_extent_buffer(old);
@@ -1346,6 +1506,10 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
{
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return 0;
+#endif
/* ensure we can see the force_cow */
smp_rmb();
@@ -1364,7 +1528,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
!(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
- !root->force_cow)
+ !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
return 0;
return 1;
}
@@ -2463,6 +2627,49 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key,
return 0;
}
+int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path,
+ u64 iobjectid, u64 ioff, u8 key_type,
+ struct btrfs_key *found_key)
+{
+ int ret;
+ struct btrfs_key key;
+ struct extent_buffer *eb;
+ struct btrfs_path *path;
+
+ key.type = key_type;
+ key.objectid = iobjectid;
+ key.offset = ioff;
+
+ if (found_path == NULL) {
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+ } else
+ path = found_path;
+
+ ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
+ if ((ret < 0) || (found_key == NULL)) {
+ if (path != found_path)
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ eb = path->nodes[0];
+ if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(fs_root, path);
+ if (ret)
+ return ret;
+ eb = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
+ if (found_key->type != key.type ||
+ found_key->objectid != key.objectid)
+ return 1;
+
+ return 0;
+}
+
/*
* look for key in the tree. path is filled in with nodes along the way
* if key is found, we return zero and you can find the item in the leaf
@@ -2496,6 +2703,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
lowest_level = p->lowest_level;
WARN_ON(lowest_level && ins_len > 0);
WARN_ON(p->nodes[0] != NULL);
+ BUG_ON(!cow && ins_len);
if (ins_len < 0) {
lowest_unlock = 2;
@@ -2533,9 +2741,13 @@ again:
* the commit roots are read only
* so we always do read locks
*/
+ if (p->need_commit_sem)
+ down_read(&root->fs_info->commit_root_sem);
b = root->commit_root;
extent_buffer_get(b);
level = btrfs_header_level(b);
+ if (p->need_commit_sem)
+ up_read(&root->fs_info->commit_root_sem);
if (!p->skip_locking)
btrfs_tree_read_lock(b);
} else {
@@ -2604,8 +2816,6 @@ again:
}
}
cow_done:
- BUG_ON(!cow && ins_len);
-
p->nodes[level] = b;
btrfs_clear_path_blocking(p, NULL, 0);
@@ -2615,13 +2825,19 @@ cow_done:
* It is safe to drop the lock on our parent before we
* go through the expensive btree search on b.
*
- * If cow is true, then we might be changing slot zero,
- * which may require changing the parent. So, we can't
- * drop the lock until after we know which slot we're
- * operating on.
+ * If we're inserting or deleting (ins_len != 0), then we might
+ * be changing slot zero, which may require changing the parent.
+ * So, we can't drop the lock until after we know which slot
+ * we're operating on.
*/
- if (!cow)
- btrfs_unlock_up_safe(p, level + 1);
+ if (!ins_len && !p->keep_locks) {
+ int u = level + 1;
+
+ if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
+ btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
+ p->locks[u] = 0;
+ }
+ }
ret = key_search(b, key, level, &prev_cmp, &slot);
@@ -2649,7 +2865,7 @@ cow_done:
* which means we must have a write lock
* on the parent
*/
- if (slot == 0 && cow &&
+ if (slot == 0 && ins_len &&
write_lock_level < level + 1) {
write_lock_level = level + 1;
btrfs_release_path(p);
@@ -2758,7 +2974,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
int level;
int lowest_unlock = 1;
u8 lowest_level = 0;
- int prev_cmp;
+ int prev_cmp = -1;
lowest_level = p->lowest_level;
WARN_ON(p->nodes[0] != NULL);
@@ -2769,7 +2985,6 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
}
again:
- prev_cmp = -1;
b = get_old_root(root, time_seq);
level = btrfs_header_level(b);
p->locks[level] = BTRFS_READ_LOCK;
@@ -2787,6 +3002,11 @@ again:
*/
btrfs_unlock_up_safe(p, level + 1);
+ /*
+ * Since we can unwind eb's we want to do a real search every
+ * time.
+ */
+ prev_cmp = -1;
ret = key_search(b, key, level, &prev_cmp, &slot);
if (level != 0) {
@@ -2898,7 +3118,9 @@ again:
if (ret < 0)
return ret;
if (!ret) {
- p->slots[0] = btrfs_header_nritems(leaf) - 1;
+ leaf = p->nodes[0];
+ if (p->slots[0] == btrfs_header_nritems(leaf))
+ p->slots[0]--;
return 0;
}
if (!return_any)
@@ -3019,8 +3241,12 @@ static int push_node_left(struct btrfs_trans_handle *trans,
} else
push_items = min(src_nritems - 8, push_items);
- tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
- push_items);
+ ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
+ push_items);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
copy_extent_buffer(dst, src,
btrfs_node_key_ptr_offset(dst_nritems),
btrfs_node_key_ptr_offset(0),
@@ -3090,8 +3316,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
(dst_nritems) *
sizeof(struct btrfs_key_ptr));
- tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
- src_nritems - push_items, push_items);
+ ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
+ src_nritems - push_items, push_items);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
copy_extent_buffer(dst, src,
btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(src_nritems - push_items),
@@ -3148,7 +3378,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(c, root->root_key.objectid);
- write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(c),
+ write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),
BTRFS_FSID_SIZE);
write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
@@ -3287,12 +3517,17 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(split, root->root_key.objectid);
write_extent_buffer(split, root->fs_info->fsid,
- btrfs_header_fsid(split), BTRFS_FSID_SIZE);
+ btrfs_header_fsid(), BTRFS_FSID_SIZE);
write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
btrfs_header_chunk_tree_uuid(split),
BTRFS_UUID_SIZE);
- tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
+ ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
+ mid, c_nritems - mid);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
+ }
copy_extent_buffer(split, c,
btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(mid),
@@ -3337,8 +3572,8 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr)
if (!nr)
return 0;
btrfs_init_map_token(&token);
- start_item = btrfs_item_nr(l, start);
- end_item = btrfs_item_nr(l, end);
+ start_item = btrfs_item_nr(start);
+ end_item = btrfs_item_nr(end);
data_len = btrfs_token_item_offset(l, start_item, &token) +
btrfs_token_item_size(l, start_item, &token);
data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
@@ -3359,8 +3594,8 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
int ret;
ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
if (ret < 0) {
- printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, "
- "used %d nritems %d\n",
+ btrfs_crit(root->fs_info,
+ "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
leaf_space_used(leaf, 0, nritems), nritems);
}
@@ -3406,7 +3641,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
slot = path->slots[1];
i = left_nritems - 1;
while (i >= nr) {
- item = btrfs_item_nr(left, i);
+ item = btrfs_item_nr(i);
if (!empty && push_items > 0) {
if (path->slots[0] > i)
@@ -3470,7 +3705,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(root);
for (i = 0; i < right_nritems; i++) {
- item = btrfs_item_nr(right, i);
+ item = btrfs_item_nr(i);
push_space -= btrfs_token_item_size(right, item, &token);
btrfs_set_token_item_offset(right, item, push_space, &token);
}
@@ -3568,6 +3803,19 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (left_nritems == 0)
goto out_unlock;
+ if (path->slots[0] == left_nritems && !empty) {
+ /* Key greater than all keys in the leaf, right neighbor has
+ * enough room for it and we're not emptying our leaf to delete
+ * it, therefore use right neighbor to insert the new item and
+ * no need to touch/dirty our left leaft. */
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ path->nodes[0] = right;
+ path->slots[0] = 0;
+ path->slots[1]++;
+ return 0;
+ }
+
return __push_leaf_right(trans, root, path, min_data_size, empty,
right, free_space, left_nritems, min_slot);
out_unlock:
@@ -3612,7 +3860,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
nr = min(right_nritems - 1, max_slot);
for (i = 0; i < nr; i++) {
- item = btrfs_item_nr(right, i);
+ item = btrfs_item_nr(i);
if (!empty && push_items > 0) {
if (path->slots[0] < i)
@@ -3639,8 +3887,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
ret = 1;
goto out;
}
- if (!empty && push_items == btrfs_header_nritems(right))
- WARN_ON(1);
+ WARN_ON(!empty && push_items == btrfs_header_nritems(right));
/* push data from right to left */
copy_extent_buffer(left, right,
@@ -3663,7 +3910,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
u32 ioff;
- item = btrfs_item_nr(left, i);
+ item = btrfs_item_nr(i);
ioff = btrfs_token_item_offset(left, item, &token);
btrfs_set_token_item_offset(left, item,
@@ -3694,7 +3941,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(right, right_nritems);
push_space = BTRFS_LEAF_DATA_SIZE(root);
for (i = 0; i < right_nritems; i++) {
- item = btrfs_item_nr(right, i);
+ item = btrfs_item_nr(i);
push_space = push_space - btrfs_token_item_size(right,
item, &token);
@@ -3835,7 +4082,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
btrfs_item_end_nr(l, mid);
for (i = 0; i < nritems; i++) {
- struct btrfs_item *item = btrfs_item_nr(right, i);
+ struct btrfs_item *item = btrfs_item_nr(i);
u32 ioff;
ioff = btrfs_token_item_offset(right, item, &token);
@@ -3885,14 +4132,17 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
int progress = 0;
int slot;
u32 nritems;
+ int space_needed = data_size;
slot = path->slots[0];
+ if (slot < btrfs_header_nritems(path->nodes[0]))
+ space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);
/*
* try to push all the items after our slot into the
* right leaf
*/
- ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
+ ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
if (ret < 0)
return ret;
@@ -3912,7 +4162,7 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
/* try to push all the items before our slot into the next leaf */
slot = path->slots[0];
- ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
+ ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
if (ret < 0)
return ret;
@@ -3956,13 +4206,18 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
/* first try to make some room by pushing left and right */
if (data_size && path->nodes[1]) {
- wret = push_leaf_right(trans, root, path, data_size,
- data_size, 0, 0);
+ int space_needed = data_size;
+
+ if (slot < btrfs_header_nritems(l))
+ space_needed -= btrfs_leaf_free_space(root, l);
+
+ wret = push_leaf_right(trans, root, path, space_needed,
+ space_needed, 0, 0);
if (wret < 0)
return wret;
if (wret) {
- wret = push_leaf_left(trans, root, path, data_size,
- data_size, 0, (u32)-1);
+ wret = push_leaf_left(trans, root, path, space_needed,
+ space_needed, 0, (u32)-1);
if (wret < 0)
return wret;
}
@@ -4016,7 +4271,7 @@ again:
data_size > BTRFS_LEAF_DATA_SIZE(root)) {
if (data_size && !tried_avoid_double)
goto push_for_double;
- split = 2 ;
+ split = 2;
}
}
}
@@ -4042,7 +4297,7 @@ again:
btrfs_set_header_owner(right, root->root_key.objectid);
btrfs_set_header_level(right, 0);
write_extent_buffer(right, root->fs_info->fsid,
- btrfs_header_fsid(right), BTRFS_FSID_SIZE);
+ btrfs_header_fsid(), BTRFS_FSID_SIZE);
write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
btrfs_header_chunk_tree_uuid(right),
@@ -4177,7 +4432,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans,
btrfs_set_path_blocking(path);
- item = btrfs_item_nr(leaf, path->slots[0]);
+ item = btrfs_item_nr(path->slots[0]);
orig_offset = btrfs_item_offset(leaf, item);
item_size = btrfs_item_size(leaf, item);
@@ -4200,7 +4455,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans,
btrfs_cpu_key_to_disk(&disk_key, new_key);
btrfs_set_item_key(leaf, &disk_key, slot);
- new_item = btrfs_item_nr(leaf, slot);
+ new_item = btrfs_item_nr(slot);
btrfs_set_item_offset(leaf, new_item, orig_offset);
btrfs_set_item_size(leaf, new_item, item_size - split_offset);
@@ -4339,7 +4594,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
/* first correct the data pointers */
for (i = slot; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(leaf, i);
+ item = btrfs_item_nr(i);
ioff = btrfs_token_item_offset(leaf, item, &token);
btrfs_set_token_item_offset(leaf, item,
@@ -4387,7 +4642,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
fixup_low_keys(root, path, &disk_key, 1);
}
- item = btrfs_item_nr(leaf, slot);
+ item = btrfs_item_nr(slot);
btrfs_set_item_size(leaf, item, new_size);
btrfs_mark_buffer_dirty(leaf);
@@ -4430,7 +4685,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
BUG_ON(slot < 0);
if (slot >= nritems) {
btrfs_print_leaf(root, leaf);
- printk(KERN_CRIT "slot %d too large, nritems %d\n",
+ btrfs_crit(root->fs_info, "slot %d too large, nritems %d",
slot, nritems);
BUG_ON(1);
}
@@ -4441,7 +4696,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
/* first correct the data pointers */
for (i = slot; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(leaf, i);
+ item = btrfs_item_nr(i);
ioff = btrfs_token_item_offset(leaf, item, &token);
btrfs_set_token_item_offset(leaf, item,
@@ -4455,7 +4710,7 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
data_end = old_data;
old_size = btrfs_item_size_nr(leaf, slot);
- item = btrfs_item_nr(leaf, slot);
+ item = btrfs_item_nr(slot);
btrfs_set_item_size(leaf, item, old_size + data_size);
btrfs_mark_buffer_dirty(leaf);
@@ -4493,7 +4748,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
if (btrfs_leaf_free_space(root, leaf) < total_size) {
btrfs_print_leaf(root, leaf);
- printk(KERN_CRIT "not enough freespace need %u have %d\n",
+ btrfs_crit(root->fs_info, "not enough freespace need %u have %d",
total_size, btrfs_leaf_free_space(root, leaf));
BUG();
}
@@ -4503,7 +4758,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
if (old_data < data_end) {
btrfs_print_leaf(root, leaf);
- printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
+ btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d",
slot, old_data, data_end);
BUG_ON(1);
}
@@ -4514,7 +4769,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
for (i = slot; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(leaf, i);
+ item = btrfs_item_nr( i);
ioff = btrfs_token_item_offset(leaf, item, &token);
btrfs_set_token_item_offset(leaf, item,
ioff - total_data, &token);
@@ -4535,7 +4790,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
for (i = 0; i < nr; i++) {
btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
btrfs_set_item_key(leaf, &disk_key, slot + i);
- item = btrfs_item_nr(leaf, slot + i);
+ item = btrfs_item_nr(slot + i);
btrfs_set_token_item_offset(leaf, item,
data_end - data_size[i], &token);
data_end -= data_size[i];
@@ -4730,7 +4985,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
for (i = slot + nr; i < nritems; i++) {
u32 ioff;
- item = btrfs_item_nr(leaf, i);
+ item = btrfs_item_nr(i);
ioff = btrfs_token_item_offset(leaf, item, &token);
btrfs_set_token_item_offset(leaf, item,
ioff + dsize, &token);
@@ -4815,7 +5070,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
* This may release the path, and so you may lose any locks held at the
* time you call it.
*/
-static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
+int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
struct btrfs_key key;
struct btrfs_disk_key found_key;
@@ -4823,14 +5078,18 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
- if (key.offset > 0)
+ if (key.offset > 0) {
key.offset--;
- else if (key.type > 0)
+ } else if (key.type > 0) {
key.type--;
- else if (key.objectid > 0)
+ key.offset = (u64)-1;
+ } else if (key.objectid > 0) {
key.objectid--;
- else
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+ } else {
return 1;
+ }
btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -4838,7 +5097,17 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
return ret;
btrfs_item_key(path->nodes[0], &found_key, 0);
ret = comp_keys(&found_key, &key);
- if (ret < 0)
+ /*
+ * We might have had an item with the previous key in the tree right
+ * before we released our path. And after we released our path, that
+ * item might have been pushed to the first slot (0) of the leaf we
+ * were holding due to a tree balance. Alternatively, an item with the
+ * previous key can exist as the only element of a leaf (big fat item).
+ * Therefore account for these 2 cases, so that our callers (like
+ * btrfs_previous_item) don't miss an existing item with a key matching
+ * the previous key we computed above.
+ */
+ if (ret <= 0)
return 0;
return 1;
}
@@ -4866,7 +5135,6 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
* was nothing in the tree that matched the search criteria.
*/
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
- struct btrfs_key *max_key,
struct btrfs_path *path,
u64 min_trans)
{
@@ -4911,10 +5179,8 @@ again:
* If it is too old, old, skip to the next one.
*/
while (slot < nritems) {
- u64 blockptr;
u64 gen;
- blockptr = btrfs_node_blockptr(cur, slot);
gen = btrfs_node_ptr_generation(cur, slot);
if (gen < min_trans) {
slot++;
@@ -5080,7 +5346,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
{
int ret;
int cmp;
- struct btrfs_trans_handle *trans = NULL;
struct btrfs_path *left_path = NULL;
struct btrfs_path *right_path = NULL;
struct btrfs_key left_key;
@@ -5096,9 +5361,8 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
int advance_right;
u64 left_blockptr;
u64 right_blockptr;
- u64 left_start_ctransid;
- u64 right_start_ctransid;
- u64 ctransid;
+ u64 left_gen;
+ u64 right_gen;
left_path = btrfs_alloc_path();
if (!left_path) {
@@ -5122,21 +5386,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
right_path->search_commit_root = 1;
right_path->skip_locking = 1;
- spin_lock(&left_root->root_item_lock);
- left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
- spin_unlock(&left_root->root_item_lock);
-
- spin_lock(&right_root->root_item_lock);
- right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
- spin_unlock(&right_root->root_item_lock);
-
- trans = btrfs_join_transaction(left_root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- trans = NULL;
- goto out;
- }
-
/*
* Strategy: Go to the first items of both trees. Then do
*
@@ -5173,6 +5422,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
* the right if possible or go up and right.
*/
+ down_read(&left_root->fs_info->commit_root_sem);
left_level = btrfs_header_level(left_root->commit_root);
left_root_level = left_level;
left_path->nodes[left_level] = left_root->commit_root;
@@ -5182,6 +5432,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
right_root_level = right_level;
right_path->nodes[right_level] = right_root->commit_root;
extent_buffer_get(right_path->nodes[right_level]);
+ up_read(&left_root->fs_info->commit_root_sem);
if (left_level == 0)
btrfs_item_key_to_cpu(left_path->nodes[left_level],
@@ -5200,67 +5451,6 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
advance_left = advance_right = 0;
while (1) {
- /*
- * We need to make sure the transaction does not get committed
- * while we do anything on commit roots. This means, we need to
- * join and leave transactions for every item that we process.
- */
- if (trans && btrfs_should_end_transaction(trans, left_root)) {
- btrfs_release_path(left_path);
- btrfs_release_path(right_path);
-
- ret = btrfs_end_transaction(trans, left_root);
- trans = NULL;
- if (ret < 0)
- goto out;
- }
- /* now rejoin the transaction */
- if (!trans) {
- trans = btrfs_join_transaction(left_root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- trans = NULL;
- goto out;
- }
-
- spin_lock(&left_root->root_item_lock);
- ctransid = btrfs_root_ctransid(&left_root->root_item);
- spin_unlock(&left_root->root_item_lock);
- if (ctransid != left_start_ctransid)
- left_start_ctransid = 0;
-
- spin_lock(&right_root->root_item_lock);
- ctransid = btrfs_root_ctransid(&right_root->root_item);
- spin_unlock(&right_root->root_item_lock);
- if (ctransid != right_start_ctransid)
- right_start_ctransid = 0;
-
- if (!left_start_ctransid || !right_start_ctransid) {
- WARN(1, KERN_WARNING
- "btrfs: btrfs_compare_tree detected "
- "a change in one of the trees while "
- "iterating. This is probably a "
- "bug.\n");
- ret = -EIO;
- goto out;
- }
-
- /*
- * the commit root may have changed, so start again
- * where we stopped
- */
- left_path->lowest_level = left_level;
- right_path->lowest_level = right_level;
- ret = btrfs_search_slot(NULL, left_root,
- &left_key, left_path, 0, 0);
- if (ret < 0)
- goto out;
- ret = btrfs_search_slot(NULL, right_root,
- &right_key, right_path, 0, 0);
- if (ret < 0)
- goto out;
- }
-
if (advance_left && !left_end_reached) {
ret = tree_advance(left_root, left_path, &left_level,
left_root_level,
@@ -5360,7 +5550,14 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
right_blockptr = btrfs_node_blockptr(
right_path->nodes[right_level],
right_path->slots[right_level]);
- if (left_blockptr == right_blockptr) {
+ left_gen = btrfs_node_ptr_generation(
+ left_path->nodes[left_level],
+ left_path->slots[left_level]);
+ right_gen = btrfs_node_ptr_generation(
+ right_path->nodes[right_level],
+ right_path->slots[right_level]);
+ if (left_blockptr == right_blockptr &&
+ left_gen == right_gen) {
/*
* As we're on a shared block, don't
* allow to go deeper.
@@ -5383,14 +5580,6 @@ out:
btrfs_free_path(left_path);
btrfs_free_path(right_path);
kfree(tmp_buf);
-
- if (trans) {
- if (!ret)
- ret = btrfs_end_transaction(trans, left_root);
- else
- btrfs_end_transaction(trans, left_root);
- }
-
return ret;
}
@@ -5529,6 +5718,24 @@ again:
ret = 0;
goto done;
}
+ /*
+ * So the above check misses one case:
+ * - after releasing the path above, someone has removed the item that
+ * used to be at the very end of the block, and balance between leafs
+ * gets another one with bigger key.offset to replace it.
+ *
+ * This one should be returned as well, or we can get leaf corruption
+ * later(esp. in __btrfs_drop_extents()).
+ *
+ * And a bit more explanation about this check,
+ * with ret > 0, the key isn't found, the path points to the slot
+ * where it should be inserted, so the path->slots[0] item must be the
+ * bigger one.
+ */
+ if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
+ ret = 0;
+ goto done;
+ }
while (level < BTRFS_MAX_LEVEL) {
if (!path->nodes[level]) {
@@ -5677,3 +5884,46 @@ int btrfs_previous_item(struct btrfs_root *root,
}
return 1;
}
+
+/*
+ * search in extent tree to find a previous Metadata/Data extent item with
+ * min objecitd.
+ *
+ * returns 0 if something is found, 1 if nothing was found and < 0 on error
+ */
+int btrfs_previous_extent_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 min_objectid)
+{
+ struct btrfs_key found_key;
+ struct extent_buffer *leaf;
+ u32 nritems;
+ int ret;
+
+ while (1) {
+ if (path->slots[0] == 0) {
+ btrfs_set_path_blocking(path);
+ ret = btrfs_prev_leaf(root, path);
+ if (ret != 0)
+ return ret;
+ } else {
+ path->slots[0]--;
+ }
+ leaf = path->nodes[0];
+ nritems = btrfs_header_nritems(leaf);
+ if (nritems == 0)
+ return 1;
+ if (path->slots[0] == nritems)
+ path->slots[0]--;
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid < min_objectid)
+ break;
+ if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
+ found_key.type == BTRFS_METADATA_ITEM_KEY)
+ return 0;
+ if (found_key.objectid == min_objectid &&
+ found_key.type < BTRFS_EXTENT_ITEM_KEY)
+ break;
+ }
+ return 1;
+}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0506f40ede8..be91397f4e9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -33,6 +33,7 @@
#include <asm/kmap_types.h>
#include <linux/pagemap.h>
#include <linux/btrfs.h>
+#include <linux/workqueue.h>
#include "extent_io.h"
#include "extent_map.h"
#include "async-thread.h"
@@ -47,6 +48,12 @@ extern struct kmem_cache *btrfs_path_cachep;
extern struct kmem_cache *btrfs_free_space_cachep;
struct btrfs_ordered_sum;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+#define STATIC noinline
+#else
+#define STATIC static noinline
+#endif
+
#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
#define BTRFS_MAX_MIRRORS 3
@@ -345,6 +352,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
#define BTRFS_FS_STATE_ERROR 0
#define BTRFS_FS_STATE_REMOUNTING 1
#define BTRFS_FS_STATE_TRANS_ABORTED 2
+#define BTRFS_FS_STATE_DEV_REPLACING 3
/* Super block flags */
/* Errors detected */
@@ -515,9 +523,15 @@ struct btrfs_super_block {
#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7)
#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8)
+#define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9)
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
+#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL
+#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
+#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
+#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
+
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
@@ -526,7 +540,12 @@ struct btrfs_super_block {
BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
BTRFS_FEATURE_INCOMPAT_RAID56 | \
BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
- BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
+ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
+ BTRFS_FEATURE_INCOMPAT_NO_HOLES)
+
+#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
+ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
+#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL
/*
* A leaf is full of items. offset and size tell us where to find
@@ -591,6 +610,7 @@ struct btrfs_path {
unsigned int skip_locking:1;
unsigned int leave_spinning:1;
unsigned int search_commit_root:1;
+ unsigned int need_commit_sem:1;
};
/*
@@ -737,6 +757,12 @@ struct btrfs_dir_item {
#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0)
+/*
+ * Internal in-memory flag that a subvolume has been marked for deletion but
+ * still visible as a directory
+ */
+#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48)
+
struct btrfs_root_item {
struct btrfs_inode_item inode;
__le64 generation;
@@ -821,7 +847,10 @@ struct btrfs_disk_balance_args {
/* BTRFS_BALANCE_ARGS_* */
__le64 flags;
- __le64 unused[8];
+ /* BTRFS_BALANCE_ARGS_LIMIT value */
+ __le64 limit;
+
+ __le64 unused[7];
} __attribute__ ((__packed__));
/*
@@ -968,7 +997,8 @@ struct btrfs_dev_replace_item {
#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)
#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
-#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
+#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
+ BTRFS_SPACE_INFO_GLOBAL_RSV)
enum btrfs_raid_types {
BTRFS_RAID_RAID10,
@@ -1000,6 +1030,12 @@ enum btrfs_raid_types {
*/
#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
+/*
+ * A fake block group type that is used to communicate global block reserve
+ * size to userspace via the SPACE_INFO ioctl.
+ */
+#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49)
+
#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \
BTRFS_AVAIL_ALLOC_BIT_SINGLE)
@@ -1087,8 +1123,14 @@ struct btrfs_qgroup_limit_item {
__le64 rsv_excl;
} __attribute__ ((__packed__));
+/* For raid type sysfs entries */
+struct raid_kobject {
+ int raid_type;
+ struct kobject kobj;
+};
+
struct btrfs_space_info {
- u64 flags;
+ spinlock_t lock;
u64 total_bytes; /* total bytes in the space,
this doesn't take mirrors into account */
@@ -1098,14 +1140,25 @@ struct btrfs_space_info {
transaction finishes */
u64 bytes_reserved; /* total bytes the allocator has reserved for
current allocations */
- u64 bytes_readonly; /* total bytes that are read only */
-
u64 bytes_may_use; /* number of bytes that may be used for
delalloc/allocations */
+ u64 bytes_readonly; /* total bytes that are read only */
+
+ unsigned int full:1; /* indicates that we cannot allocate any more
+ chunks for this space */
+ unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
+
+ unsigned int flush:1; /* set if we are trying to make space */
+
+ unsigned int force_alloc; /* set if we need to force a chunk
+ alloc for this space */
+
u64 disk_used; /* total bytes used on disk */
u64 disk_total; /* total bytes on disk, takes mirrors into
account */
+ u64 flags;
+
/*
* bytes_pinned is kept in line with what is actually pinned, as in
* we've called update_block_group and dropped the bytes_used counter
@@ -1118,22 +1171,15 @@ struct btrfs_space_info {
*/
struct percpu_counter total_bytes_pinned;
- unsigned int full:1; /* indicates that we cannot allocate any more
- chunks for this space */
- unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
-
- unsigned int flush:1; /* set if we are trying to make space */
-
- unsigned int force_alloc; /* set if we need to force a chunk
- alloc for this space */
-
struct list_head list;
+ struct rw_semaphore groups_sem;
/* for block groups in our same type */
struct list_head block_groups[BTRFS_NR_RAID_TYPES];
- spinlock_t lock;
- struct rw_semaphore groups_sem;
wait_queue_head_t wait;
+
+ struct kobject kobj;
+ struct kobject *block_group_kobjs[BTRFS_NR_RAID_TYPES];
};
#define BTRFS_BLOCK_RSV_GLOBAL 1
@@ -1213,11 +1259,19 @@ struct btrfs_block_group_cache {
spinlock_t lock;
u64 pinned;
u64 reserved;
+ u64 delalloc_bytes;
u64 bytes_super;
u64 flags;
u64 sectorsize;
u64 cache_generation;
+ /*
+ * It is just used for the delayed data space allocation because
+ * only the data space allocation and the relative metadata update
+ * can be done cross the transaction.
+ */
+ struct rw_semaphore data_rwsem;
+
/* for raid56, this is a full stripe, without parity */
unsigned long full_stripe_len;
@@ -1283,6 +1337,8 @@ struct btrfs_stripe_hash_table {
#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+void btrfs_init_async_reclaim_work(struct work_struct *work);
+
/* fs_info */
struct reloc_control;
struct btrfs_device;
@@ -1340,6 +1396,7 @@ struct btrfs_fs_info {
u64 generation;
u64 last_trans_committed;
+ u64 avg_delayed_ref_runtime;
/*
* this is updated to the current trans every time a full commit
@@ -1417,7 +1474,7 @@ struct btrfs_fs_info {
*/
struct mutex ordered_extent_flush_mutex;
- struct rw_semaphore extent_commit_sem;
+ struct rw_semaphore commit_root_sem;
struct rw_semaphore cleanup_work_sem;
@@ -1442,7 +1499,6 @@ struct btrfs_fs_info {
spinlock_t tree_mod_seq_lock;
atomic64_t tree_mod_seq;
struct list_head tree_mod_seq_list;
- struct seq_list tree_mod_seq_elem;
/* this protects tree_mod_log */
rwlock_t tree_mod_log_lock;
@@ -1468,6 +1524,7 @@ struct btrfs_fs_info {
*/
struct list_head ordered_roots;
+ struct mutex delalloc_root_mutex;
spinlock_t delalloc_root_lock;
/* all fs/file tree roots that have delalloc inodes. */
struct list_head delalloc_roots;
@@ -1482,33 +1539,37 @@ struct btrfs_fs_info {
* A third pool does submit_bio to avoid deadlocking with the other
* two
*/
- struct btrfs_workers generic_worker;
- struct btrfs_workers workers;
- struct btrfs_workers delalloc_workers;
- struct btrfs_workers flush_workers;
- struct btrfs_workers endio_workers;
- struct btrfs_workers endio_meta_workers;
- struct btrfs_workers endio_raid56_workers;
- struct btrfs_workers rmw_workers;
- struct btrfs_workers endio_meta_write_workers;
- struct btrfs_workers endio_write_workers;
- struct btrfs_workers endio_freespace_worker;
- struct btrfs_workers submit_workers;
- struct btrfs_workers caching_workers;
- struct btrfs_workers readahead_workers;
+ struct btrfs_workqueue *workers;
+ struct btrfs_workqueue *delalloc_workers;
+ struct btrfs_workqueue *flush_workers;
+ struct btrfs_workqueue *endio_workers;
+ struct btrfs_workqueue *endio_meta_workers;
+ struct btrfs_workqueue *endio_raid56_workers;
+ struct btrfs_workqueue *rmw_workers;
+ struct btrfs_workqueue *endio_meta_write_workers;
+ struct btrfs_workqueue *endio_write_workers;
+ struct btrfs_workqueue *endio_freespace_worker;
+ struct btrfs_workqueue *submit_workers;
+ struct btrfs_workqueue *caching_workers;
+ struct btrfs_workqueue *readahead_workers;
/*
* fixup workers take dirty pages that didn't properly go through
* the cow mechanism and make them safe to write. It happens
* for the sys_munmap function call path
*/
- struct btrfs_workers fixup_workers;
- struct btrfs_workers delayed_workers;
+ struct btrfs_workqueue *fixup_workers;
+ struct btrfs_workqueue *delayed_workers;
+
+ /* the extent workers do delayed refs on the extent allocation tree */
+ struct btrfs_workqueue *extent_workers;
struct task_struct *transaction_kthread;
struct task_struct *cleaner_kthread;
int thread_pool_size;
struct kobject super_kobj;
+ struct kobject *space_info_kobj;
+ struct kobject *device_dir_kobj;
struct completion kobj_unregister;
int do_barriers;
int closing;
@@ -1580,11 +1641,10 @@ struct btrfs_fs_info {
atomic_t scrubs_paused;
atomic_t scrub_cancel_req;
wait_queue_head_t scrub_pause_wait;
- struct rw_semaphore scrub_super_lock;
int scrub_workers_refcnt;
- struct btrfs_workers scrub_workers;
- struct btrfs_workers scrub_wr_completion_workers;
- struct btrfs_workers scrub_nocow_workers;
+ struct btrfs_workqueue *scrub_workers;
+ struct btrfs_workqueue *scrub_wr_completion_workers;
+ struct btrfs_workqueue *scrub_nocow_workers;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
u32 check_integrity_print_mask;
@@ -1605,7 +1665,10 @@ struct btrfs_fs_info {
/* holds configuration and tracking. Protected by qgroup_lock */
struct rb_root qgroup_tree;
+ struct rb_root qgroup_op_tree;
spinlock_t qgroup_lock;
+ spinlock_t qgroup_op_lock;
+ atomic_t qgroup_op_seq;
/*
* used to avoid frequently calling ulist_alloc()/ulist_free()
@@ -1625,7 +1688,7 @@ struct btrfs_fs_info {
/* qgroup rescan items */
struct mutex qgroup_rescan_lock; /* protects the progress item */
struct btrfs_key qgroup_rescan_progress;
- struct btrfs_workers qgroup_rescan_workers;
+ struct btrfs_workqueue *qgroup_rescan_workers;
struct completion qgroup_rescan_completion;
struct btrfs_work qgroup_rescan_work;
@@ -1638,6 +1701,10 @@ struct btrfs_fs_info {
spinlock_t reada_lock;
struct radix_tree_root reada_tree;
+ /* Extent buffer radix tree */
+ spinlock_t buffer_lock;
+ struct radix_tree_root buffer_radix;
+
/* next backup root to be overwritten */
int backup_root_index;
@@ -1648,11 +1715,42 @@ struct btrfs_fs_info {
atomic_t mutually_exclusive_operation_running;
+ struct percpu_counter bio_counter;
+ wait_queue_head_t replace_wait;
+
struct semaphore uuid_tree_rescan_sem;
unsigned int update_uuid_tree_gen:1;
+
+ /* Used to reclaim the metadata space in the background. */
+ struct work_struct async_reclaim_work;
+};
+
+struct btrfs_subvolume_writers {
+ struct percpu_counter counter;
+ wait_queue_head_t wait;
};
/*
+ * The state of btrfs root
+ */
+/*
+ * btrfs_record_root_in_trans is a multi-step process,
+ * and it can race with the balancing code. But the
+ * race is very small, and only the first time the root
+ * is added to each transaction. So IN_TRANS_SETUP
+ * is used to tell us when more checks are required
+ */
+#define BTRFS_ROOT_IN_TRANS_SETUP 0
+#define BTRFS_ROOT_REF_COWS 1
+#define BTRFS_ROOT_TRACK_DIRTY 2
+#define BTRFS_ROOT_IN_RADIX 3
+#define BTRFS_ROOT_DUMMY_ROOT 4
+#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 5
+#define BTRFS_ROOT_DEFRAG_RUNNING 6
+#define BTRFS_ROOT_FORCE_COW 7
+#define BTRFS_ROOT_MULTI_LOG_TASKS 8
+
+/*
* in ram representation of the tree. extent_root is used for all allocations
* and for the extent tree extent_root root.
*/
@@ -1663,6 +1761,7 @@ struct btrfs_root {
struct btrfs_root *log_root;
struct btrfs_root *reloc_root;
+ unsigned long state;
struct btrfs_root_item root_item;
struct btrfs_key root_key;
struct btrfs_fs_info *fs_info;
@@ -1676,7 +1775,6 @@ struct btrfs_root {
struct btrfs_block_rsv *block_rsv;
/* free ino cache stuff */
- struct mutex fs_commit_mutex;
struct btrfs_free_space_ctl *free_ino_ctl;
enum btrfs_caching_type cached;
spinlock_t cache_lock;
@@ -1688,13 +1786,16 @@ struct btrfs_root {
struct mutex log_mutex;
wait_queue_head_t log_writer_wait;
wait_queue_head_t log_commit_wait[2];
+ struct list_head log_ctxs[2];
atomic_t log_writers;
atomic_t log_commit[2];
atomic_t log_batch;
- unsigned long log_transid;
- unsigned long last_log_commit;
+ int log_transid;
+ /* No matter the commit succeeds or not*/
+ int log_transid_committed;
+ /* Just be updated when the commit succeeds. */
+ int last_log_commit;
pid_t log_start_pid;
- bool log_multiple_pids;
u64 objectid;
u64 last_trans;
@@ -1714,21 +1815,13 @@ struct btrfs_root {
u64 highest_objectid;
- /* btrfs_record_root_in_trans is a multi-step process,
- * and it can race with the balancing code. But the
- * race is very small, and only the first time the root
- * is added to each transaction. So in_trans_setup
- * is used to tell us when more checks are required
- */
- unsigned long in_trans_setup;
- int ref_cows;
- int track_dirty;
- int in_radix;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ u64 alloc_bytenr;
+#endif
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
- int defrag_running;
char *name;
/* the dirty list is only used by non-reference counted roots */
@@ -1742,7 +1835,6 @@ struct btrfs_root {
spinlock_t orphan_lock;
atomic_t orphan_inodes;
struct btrfs_block_rsv *orphan_block_rsv;
- int orphan_item_inserted;
int orphan_cleanup_state;
spinlock_t inode_lock;
@@ -1760,11 +1852,10 @@ struct btrfs_root {
*/
dev_t anon_dev;
- int force_cow;
-
spinlock_t root_item_lock;
atomic_t refs;
+ struct mutex delalloc_mutex;
spinlock_t delalloc_lock;
/*
* all of the inodes that have delalloc bytes. It is possible for
@@ -1774,6 +1865,8 @@ struct btrfs_root {
struct list_head delalloc_inodes;
struct list_head delalloc_root;
u64 nr_delalloc_inodes;
+
+ struct mutex ordered_extent_mutex;
/*
* this is used by the balancing code to wait for all the pending
* ordered extents
@@ -1788,6 +1881,14 @@ struct btrfs_root {
struct list_head ordered_extents;
struct list_head ordered_root;
u64 nr_ordered_extents;
+
+ /*
+ * Number of currently running SEND ioctls to prevent
+ * manipulation with the read-only status via SUBVOL_SETFLAGS
+ */
+ int send_in_progress;
+ struct btrfs_subvolume_writers *subv_writers;
+ atomic_t will_be_snapshoted;
};
struct btrfs_ioctl_defrag_range_args {
@@ -1990,6 +2091,7 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22)
#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23)
+#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24)
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
@@ -1998,6 +2100,20 @@ struct btrfs_ioctl_defrag_range_args {
#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
BTRFS_MOUNT_##opt)
+#define btrfs_set_and_info(root, opt, fmt, args...) \
+{ \
+ if (!btrfs_test_opt(root, opt)) \
+ btrfs_info(root->fs_info, fmt, ##args); \
+ btrfs_set_opt(root->fs_info->mount_opt, opt); \
+}
+
+#define btrfs_clear_and_info(root, opt, fmt, args...) \
+{ \
+ if (btrfs_test_opt(root, opt)) \
+ btrfs_info(root->fs_info, fmt, ##args); \
+ btrfs_clear_opt(root->fs_info->mount_opt, opt); \
+}
+
/*
* Inode flags
*/
@@ -2461,8 +2577,7 @@ static inline unsigned long btrfs_item_nr_offset(int nr)
sizeof(struct btrfs_item) * nr;
}
-static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb,
- int nr)
+static inline struct btrfs_item *btrfs_item_nr(int nr)
{
return (struct btrfs_item *)btrfs_item_nr_offset(nr);
}
@@ -2475,30 +2590,30 @@ static inline u32 btrfs_item_end(struct extent_buffer *eb,
static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr)
{
- return btrfs_item_end(eb, btrfs_item_nr(eb, nr));
+ return btrfs_item_end(eb, btrfs_item_nr(nr));
}
static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr)
{
- return btrfs_item_offset(eb, btrfs_item_nr(eb, nr));
+ return btrfs_item_offset(eb, btrfs_item_nr(nr));
}
static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr)
{
- return btrfs_item_size(eb, btrfs_item_nr(eb, nr));
+ return btrfs_item_size(eb, btrfs_item_nr(nr));
}
static inline void btrfs_item_key(struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr)
{
- struct btrfs_item *item = btrfs_item_nr(eb, nr);
+ struct btrfs_item *item = btrfs_item_nr(nr);
read_eb_member(eb, item, struct btrfs_item, key, disk_key);
}
static inline void btrfs_set_item_key(struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr)
{
- struct btrfs_item *item = btrfs_item_nr(eb, nr);
+ struct btrfs_item *item = btrfs_item_nr(nr);
write_eb_member(eb, item, struct btrfs_item, key, disk_key);
}
@@ -2666,7 +2781,7 @@ static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
btrfs_set_header_flags(eb, flags);
}
-static inline unsigned long btrfs_header_fsid(struct extent_buffer *eb)
+static inline unsigned long btrfs_header_fsid(void)
{
return offsetof(struct btrfs_header, fsid);
}
@@ -2715,6 +2830,11 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)
return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
}
+static inline bool btrfs_root_dead(struct btrfs_root *root)
+{
+ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0;
+}
+
/* struct btrfs_root_backup */
BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
tree_root, 64);
@@ -2824,6 +2944,7 @@ btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
cpu->vend = le64_to_cpu(disk->vend);
cpu->target = le64_to_cpu(disk->target);
cpu->flags = le64_to_cpu(disk->flags);
+ cpu->limit = le64_to_cpu(disk->limit);
}
static inline void
@@ -2841,6 +2962,7 @@ btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
disk->vend = cpu_to_le64(cpu->vend);
disk->target = cpu_to_le64(cpu->target);
disk->flags = cpu_to_le64(cpu->flags);
+ disk->limit = cpu_to_le64(cpu->limit);
}
/* struct btrfs_super_block */
@@ -2919,6 +3041,10 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation,
struct btrfs_file_extent_item, generation, 64);
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes,
struct btrfs_file_extent_item, num_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes,
+ struct btrfs_file_extent_item, disk_num_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
+ struct btrfs_file_extent_item, compression, 8);
static inline unsigned long
btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
@@ -2952,15 +3078,6 @@ BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item,
BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
other_encoding, 16);
-/* this returns the number of file bytes represented by the inline item.
- * If an item is compressed, this is the uncompressed size
- */
-static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
- struct btrfs_file_extent_item *e)
-{
- return btrfs_file_extent_ram_bytes(eb, e);
-}
-
/*
* this returns the number of bytes used by the item on disk, minus the
* size of any extent headers. If a file is compressed on disk, this is
@@ -2974,6 +3091,32 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
return btrfs_item_size(eb, e) - offset;
}
+/* this returns the number of file bytes represented by the inline item.
+ * If an item is compressed, this is the uncompressed size
+ */
+static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb,
+ int slot,
+ struct btrfs_file_extent_item *fi)
+{
+ struct btrfs_map_token token;
+
+ btrfs_init_map_token(&token);
+ /*
+ * return the space used on disk if this item isn't
+ * compressed or encoded
+ */
+ if (btrfs_token_file_extent_compression(eb, fi, &token) == 0 &&
+ btrfs_token_file_extent_encryption(eb, fi, &token) == 0 &&
+ btrfs_token_file_extent_other_encoding(eb, fi, &token) == 0) {
+ return btrfs_file_extent_inline_item_len(eb,
+ btrfs_item_nr(slot));
+ }
+
+ /* otherwise use the ram bytes field */
+ return btrfs_token_file_extent_ram_bytes(eb, fi, &token);
+}
+
+
/* btrfs_dev_stats_item */
static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb,
struct btrfs_dev_stats_item *ptr,
@@ -3105,11 +3248,6 @@ static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
((unsigned long)(btrfs_leaf_data(leaf) + \
btrfs_item_offset_nr(leaf, slot)))
-static inline struct dentry *fdentry(struct file *file)
-{
- return file->f_path.dentry;
-}
-
static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
{
return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
@@ -3142,9 +3280,13 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
+int btrfs_async_run_delayed_refs(struct btrfs_root *root,
+ unsigned long count, int wait);
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
@@ -3162,6 +3304,7 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
struct btrfs_fs_info *info,
u64 bytenr);
void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
+int get_block_group_index(struct btrfs_block_group_cache *cache);
struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u32 blocksize,
u64 parent, u64 root_objectid,
@@ -3181,11 +3324,11 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_key *ins);
int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
u64 min_alloc_size, u64 empty_size, u64 hint_byte,
- struct btrfs_key *ins, int is_data);
+ struct btrfs_key *ins, int is_data, int delalloc);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int for_cow);
+ struct extent_buffer *buf, int full_backref, int no_quota);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int for_cow);
+ struct extent_buffer *buf, int full_backref, int no_quota);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 flags,
@@ -3193,9 +3336,10 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
int btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int for_cow);
+ u64 owner, u64 offset, int no_quota);
-int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
+ int delalloc);
int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
u64 start, u64 len);
void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
@@ -3205,7 +3349,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset, int for_cow);
+ u64 root_objectid, u64 owner, u64 offset, int no_quota);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
@@ -3293,6 +3437,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int __get_raid_index(u64 flags);
+int btrfs_start_nocow_write(struct btrfs_root *root);
+void btrfs_end_nocow_write(struct btrfs_root *root);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
@@ -3300,6 +3446,8 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2);
int btrfs_previous_item(struct btrfs_root *root,
struct btrfs_path *path, u64 min_objectid,
int type);
+int btrfs_previous_extent_item(struct btrfs_root *root,
+ struct btrfs_path *path, u64 min_objectid);
void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *new_key);
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
@@ -3308,7 +3456,6 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *key, int lowest_level,
u64 min_trans);
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
- struct btrfs_key *max_key,
struct btrfs_path *path,
u64 min_trans);
enum btrfs_compare_tree_result {
@@ -3350,6 +3497,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_key *new_key);
+int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
+ u64 inum, u64 ioff, u8 key_type, struct btrfs_key *found_key);
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_key *key, struct btrfs_path *p, int
ins_len, int cow);
@@ -3399,6 +3548,7 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
}
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
u64 time_seq);
static inline int btrfs_next_old_item(struct btrfs_root *root,
@@ -3462,7 +3612,6 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem);
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem);
-u64 btrfs_tree_mod_seq_prev(u64 seq);
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
/* root-item.c */
@@ -3563,12 +3712,6 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
u64 inode_objectid, u64 ref_objectid, u64 *index);
-int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, int mod,
- u64 *ret_index);
int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid);
@@ -3613,11 +3756,14 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum *sums);
int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
struct bio *bio, u64 file_start, int contig);
-int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct btrfs_path *path,
- u64 isize);
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit);
+void btrfs_extent_item_to_extent_map(struct inode *inode,
+ const struct btrfs_path *path,
+ struct btrfs_file_extent_item *fi,
+ const bool new_inline,
+ struct extent_map *em);
+
/* inode.c */
struct btrfs_delalloc_work {
struct inode *inode;
@@ -3675,12 +3821,14 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u32 min_type);
int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
- int delay_iput);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
+ int nr);
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *new_root, u64 new_dirid);
+ struct btrfs_root *new_root,
+ struct btrfs_root *parent_root,
+ u64 new_dirid);
int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
size_t size, struct bio *bio,
unsigned long bio_flags);
@@ -3745,14 +3893,14 @@ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
int skip_pinned);
-int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace,
- u64 start, u64 end, int skip_pinned,
- int modified);
extern const struct file_operations btrfs_file_operations;
int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
struct btrfs_path *path, u64 start, u64 end,
- u64 *drop_end, int drop_cache);
+ u64 *drop_end, int drop_cache,
+ int replace_extent,
+ u32 extent_item_size,
+ int *key_inserted);
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode, u64 start,
u64 end, int drop_cache);
@@ -3771,6 +3919,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
/* sysfs.c */
int btrfs_init_sysfs(void);
void btrfs_exit_sysfs(void);
+int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info);
/* xattr.c */
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
@@ -3803,14 +3953,20 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
btrfs_printk(fs_info, KERN_NOTICE fmt, ##args)
#define btrfs_info(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_INFO fmt, ##args)
+
+#ifdef DEBUG
#define btrfs_debug(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
+#else
+#define btrfs_debug(fs_info, fmt, args...) \
+ no_printk(KERN_DEBUG fmt, ##args)
+#endif
#ifdef CONFIG_BTRFS_ASSERT
static inline void assfail(char *expr, char *file, int line)
{
- printk(KERN_ERR "BTRFS assertion failed: %s, file: %s, line: %d",
+ pr_err("BTRFS: assertion failed: %s, file: %s, line: %d",
expr, file, line);
BUG();
}
@@ -3848,7 +4004,7 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
if (!(features & flag)) {
features |= flag;
btrfs_set_super_incompat_flags(disk_super, features);
- printk(KERN_INFO "btrfs: setting %llu feature flag\n",
+ btrfs_info(fs_info, "setting %llu feature flag",
flag);
}
spin_unlock(&fs_info->super_lock);
@@ -3906,20 +4062,17 @@ do { \
/* acl.c */
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
+int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
int btrfs_init_acl(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir);
-int btrfs_acl_chmod(struct inode *inode);
#else
#define btrfs_get_acl NULL
+#define btrfs_set_acl NULL
static inline int btrfs_init_acl(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir)
{
return 0;
}
-static inline int btrfs_acl_chmod(struct inode *inode)
-{
- return 0;
-}
#endif
/* relocation.c */
@@ -3944,15 +4097,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
int readonly, int is_dev_replace);
void btrfs_scrub_pause(struct btrfs_root *root);
-void btrfs_scrub_pause_super(struct btrfs_root *root);
void btrfs_scrub_continue(struct btrfs_root *root);
-void btrfs_scrub_continue_super(struct btrfs_root *root);
int btrfs_scrub_cancel(struct btrfs_fs_info *info);
int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
struct btrfs_device *dev);
int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
struct btrfs_scrub_progress *progress);
+/* dev-replace.c */
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info);
+void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info);
+
/* reada.c */
struct reada_control {
struct btrfs_root *root; /* tree to prefetch */
@@ -3969,52 +4125,6 @@ void btrfs_reada_detach(void *handle);
int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
u64 start, int err);
-/* qgroup.c */
-struct qgroup_update {
- struct list_head list;
- struct btrfs_delayed_ref_node *node;
- struct btrfs_delayed_extent_op *extent_op;
-};
-
-int btrfs_quota_enable(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-int btrfs_quota_disable(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
-void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
-int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 src, u64 dst);
-int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 src, u64 dst);
-int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 qgroupid,
- char *name);
-int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 qgroupid);
-int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 qgroupid,
- struct btrfs_qgroup_limit *limit);
-int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
-void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
-struct btrfs_delayed_extent_op;
-int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *node,
- struct btrfs_delayed_extent_op *extent_op);
-int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_node *node,
- struct btrfs_delayed_extent_op *extent_op);
-int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
- struct btrfs_qgroup_inherit *inherit);
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
-
-void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
-
static inline int is_fstree(u64 rootid)
{
if (rootid == BTRFS_FS_TREE_OBJECTID ||
@@ -4028,5 +4138,11 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
return signal_pending(current);
}
+/* Sanity test specific functions */
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_destroy_inode(struct inode *inode);
+int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+ u64 rfer, u64 excl);
+#endif
#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index cbd9523ad09..da775bfdebc 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -55,8 +55,7 @@ static inline void btrfs_init_delayed_node(
delayed_node->inode_id = inode_id;
atomic_set(&delayed_node->refs, 0);
delayed_node->count = 0;
- delayed_node->in_list = 0;
- delayed_node->inode_dirty = 0;
+ delayed_node->flags = 0;
delayed_node->ins_root = RB_ROOT;
delayed_node->del_root = RB_ROOT;
mutex_init(&delayed_node->mutex);
@@ -108,8 +107,8 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode)
return node;
}
btrfs_inode->delayed_node = node;
- atomic_inc(&node->refs); /* can be accessed */
- atomic_inc(&node->refs); /* cached in the inode */
+ /* can be accessed and cached in the inode */
+ atomic_add(2, &node->refs);
spin_unlock(&root->inode_lock);
return node;
}
@@ -138,8 +137,8 @@ again:
return ERR_PTR(-ENOMEM);
btrfs_init_delayed_node(node, root, ino);
- atomic_inc(&node->refs); /* cached in the btrfs inode */
- atomic_inc(&node->refs); /* can be accessed */
+ /* cached in the btrfs inode and can be accessed */
+ atomic_add(2, &node->refs);
ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
if (ret) {
@@ -150,8 +149,8 @@ again:
spin_lock(&root->inode_lock);
ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
if (ret == -EEXIST) {
- kmem_cache_free(delayed_node_cache, node);
spin_unlock(&root->inode_lock);
+ kmem_cache_free(delayed_node_cache, node);
radix_tree_preload_end();
goto again;
}
@@ -172,7 +171,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
int mod)
{
spin_lock(&root->lock);
- if (node->in_list) {
+ if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
if (!list_empty(&node->p_list))
list_move_tail(&node->p_list, &root->prepare_list);
else if (mod)
@@ -182,7 +181,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
list_add_tail(&node->p_list, &root->prepare_list);
atomic_inc(&node->refs); /* inserted into list */
root->nodes++;
- node->in_list = 1;
+ set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
}
spin_unlock(&root->lock);
}
@@ -192,13 +191,13 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
struct btrfs_delayed_node *node)
{
spin_lock(&root->lock);
- if (node->in_list) {
+ if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
root->nodes--;
atomic_dec(&node->refs); /* not in the list */
list_del_init(&node->n_list);
if (!list_empty(&node->p_list))
list_del_init(&node->p_list);
- node->in_list = 0;
+ clear_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags);
}
spin_unlock(&root->lock);
}
@@ -231,7 +230,8 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node(
delayed_root = node->root->fs_info->delayed_root;
spin_lock(&delayed_root->lock);
- if (!node->in_list) { /* not in the list */
+ if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) {
+ /* not in the list */
if (list_empty(&delayed_root->node_list))
goto out;
p = delayed_root->node_list.next;
@@ -267,14 +267,17 @@ static void __btrfs_release_delayed_node(
mutex_unlock(&delayed_node->mutex);
if (atomic_dec_and_test(&delayed_node->refs)) {
+ bool free = false;
struct btrfs_root *root = delayed_node->root;
spin_lock(&root->inode_lock);
if (atomic_read(&delayed_node->refs) == 0) {
radix_tree_delete(&root->delayed_nodes_tree,
delayed_node->inode_id);
- kmem_cache_free(delayed_node_cache, delayed_node);
+ free = true;
}
spin_unlock(&root->inode_lock);
+ if (free)
+ kmem_cache_free(delayed_node_cache, delayed_node);
}
}
@@ -649,14 +652,13 @@ static int btrfs_delayed_inode_reserve_metadata(
goto out;
ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
- if (!ret)
+ if (!WARN_ON(ret))
goto out;
/*
* Ok this is a problem, let's just steal from the global rsv
* since this really shouldn't happen that often.
*/
- WARN_ON(1);
ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
dst_rsv, num_bytes);
goto out;
@@ -771,13 +773,13 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
*/
btrfs_set_path_blocking(path);
- keys = kmalloc(sizeof(struct btrfs_key) * nitems, GFP_NOFS);
+ keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS);
if (!keys) {
ret = -ENOMEM;
goto out;
}
- data_size = kmalloc(sizeof(u32) * nitems, GFP_NOFS);
+ data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS);
if (!data_size) {
ret = -ENOMEM;
goto error;
@@ -1005,9 +1007,10 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
{
struct btrfs_delayed_root *delayed_root;
- if (delayed_node && delayed_node->inode_dirty) {
+ if (delayed_node &&
+ test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
BUG_ON(!delayed_node->root);
- delayed_node->inode_dirty = 0;
+ clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
delayed_node->count--;
delayed_root = delayed_node->root->fs_info->delayed_root;
@@ -1015,6 +1018,18 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
}
}
+static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node)
+{
+ struct btrfs_delayed_root *delayed_root;
+
+ ASSERT(delayed_node->root);
+ clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
+ delayed_node->count--;
+
+ delayed_root = delayed_node->root->fs_info->delayed_root;
+ finish_one_item(delayed_root);
+}
+
static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
@@ -1023,13 +1038,19 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
struct btrfs_key key;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
+ int mod;
int ret;
key.objectid = node->inode_id;
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
key.offset = 0;
- ret = btrfs_lookup_inode(trans, root, path, &key, 1);
+ if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
+ mod = -1;
+ else
+ mod = 1;
+
+ ret = btrfs_lookup_inode(trans, root, path, &key, mod);
if (ret > 0) {
btrfs_release_path(path);
return -ENOENT;
@@ -1037,19 +1058,58 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
return ret;
}
- btrfs_unlock_up_safe(path, 1);
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
sizeof(struct btrfs_inode_item));
btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(path);
+ if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
+ goto no_iref;
+
+ path->slots[0]++;
+ if (path->slots[0] >= btrfs_header_nritems(leaf))
+ goto search;
+again:
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != node->inode_id)
+ goto out;
+
+ if (key.type != BTRFS_INODE_REF_KEY &&
+ key.type != BTRFS_INODE_EXTREF_KEY)
+ goto out;
+
+ /*
+ * Delayed iref deletion is for the inode who has only one link,
+ * so there is only one iref. The case that several irefs are
+ * in the same item doesn't exist.
+ */
+ btrfs_del_item(trans, root, path);
+out:
+ btrfs_release_delayed_iref(node);
+no_iref:
+ btrfs_release_path(path);
+err_out:
btrfs_delayed_inode_release_metadata(root, node);
btrfs_release_delayed_inode(node);
- return 0;
+ return ret;
+
+search:
+ btrfs_release_path(path);
+
+ btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+ key.offset = -1;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto err_out;
+ ASSERT(ret);
+
+ ret = 0;
+ leaf = path->nodes[0];
+ path->slots[0]--;
+ goto again;
}
static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
@@ -1060,7 +1120,7 @@ static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
int ret;
mutex_lock(&node->mutex);
- if (!node->inode_dirty) {
+ if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &node->flags)) {
mutex_unlock(&node->mutex);
return 0;
}
@@ -1174,8 +1234,10 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
mutex_unlock(&delayed_node->mutex);
path = btrfs_alloc_path();
- if (!path)
+ if (!path) {
+ btrfs_release_delayed_node(delayed_node);
return -ENOMEM;
+ }
path->leave_spinning = 1;
block_rsv = trans->block_rsv;
@@ -1202,7 +1264,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode)
return 0;
mutex_lock(&delayed_node->mutex);
- if (!delayed_node->inode_dirty) {
+ if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
mutex_unlock(&delayed_node->mutex);
btrfs_release_delayed_node(delayed_node);
return 0;
@@ -1226,7 +1288,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode)
trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
mutex_lock(&delayed_node->mutex);
- if (delayed_node->inode_dirty)
+ if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags))
ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
path, delayed_node);
else
@@ -1299,36 +1361,9 @@ again:
trans->block_rsv = &root->fs_info->delayed_block_rsv;
__btrfs_commit_inode_delayed_items(trans, path, delayed_node);
- /*
- * Maybe new delayed items have been inserted, so we need requeue
- * the work. Besides that, we must dequeue the empty delayed nodes
- * to avoid the race between delayed items balance and the worker.
- * The race like this:
- * Task1 Worker thread
- * count == 0, needn't requeue
- * also needn't insert the
- * delayed node into prepare
- * list again.
- * add lots of delayed items
- * queue the delayed node
- * already in the list,
- * and not in the prepare
- * list, it means the delayed
- * node is being dealt with
- * by the worker.
- * do delayed items balance
- * the delayed node is being
- * dealt with by the worker
- * now, just wait.
- * the worker goto idle.
- * Task1 will sleep until the transaction is commited.
- */
- mutex_lock(&delayed_node->mutex);
- btrfs_dequeue_delayed_node(root->fs_info->delayed_root, delayed_node);
- mutex_unlock(&delayed_node->mutex);
trans->block_rsv = block_rsv;
- btrfs_end_transaction_dmeta(trans, root);
+ btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty_nodelay(root);
release_path:
@@ -1360,11 +1395,11 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
return -ENOMEM;
async_work->delayed_root = delayed_root;
- async_work->work.func = btrfs_async_run_delayed_root;
- async_work->work.flags = 0;
+ btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root,
+ NULL, NULL);
async_work->nr = nr;
- btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work);
+ btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
return 0;
}
@@ -1375,52 +1410,41 @@ void btrfs_assert_delayed_root_empty(struct btrfs_root *root)
WARN_ON(btrfs_first_delayed_node(delayed_root));
}
-static int refs_newer(struct btrfs_delayed_root *delayed_root,
- int seq, int count)
+static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq)
{
int val = atomic_read(&delayed_root->items_seq);
- if (val < seq || val >= seq + count)
+ if (val < seq || val >= seq + BTRFS_DELAYED_BATCH)
+ return 1;
+
+ if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
return 1;
+
return 0;
}
void btrfs_balance_delayed_items(struct btrfs_root *root)
{
struct btrfs_delayed_root *delayed_root;
- int seq;
delayed_root = btrfs_get_delayed_root(root);
if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
return;
- seq = atomic_read(&delayed_root->items_seq);
-
if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
+ int seq;
int ret;
- DEFINE_WAIT(__wait);
+
+ seq = atomic_read(&delayed_root->items_seq);
ret = btrfs_wq_run_delayed_node(delayed_root, root, 0);
if (ret)
return;
- while (1) {
- prepare_to_wait(&delayed_root->wait, &__wait,
- TASK_INTERRUPTIBLE);
-
- if (refs_newer(delayed_root, seq,
- BTRFS_DELAYED_BATCH) ||
- atomic_read(&delayed_root->items) <
- BTRFS_DELAYED_BACKGROUND) {
- break;
- }
- if (!signal_pending(current))
- schedule();
- else
- break;
- }
- finish_wait(&delayed_root->wait, &__wait);
+ wait_event_interruptible(delayed_root->wait,
+ could_end_wait(delayed_root, seq));
+ return;
}
btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH);
@@ -1471,9 +1495,9 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
mutex_lock(&delayed_node->mutex);
ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
if (unlikely(ret)) {
- printk(KERN_ERR "err add delayed dir index item(name: %.*s) "
+ btrfs_err(root->fs_info, "err add delayed dir index item(name: %.*s) "
"into the insertion tree of the delayed node"
- "(root id: %llu, inode id: %llu, errno: %d)\n",
+ "(root id: %llu, inode id: %llu, errno: %d)",
name_len, name, delayed_node->root->objectid,
delayed_node->inode_id, ret);
BUG();
@@ -1543,9 +1567,9 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
mutex_lock(&node->mutex);
ret = __btrfs_add_delayed_deletion_item(node, item);
if (unlikely(ret)) {
- printk(KERN_ERR "err add delayed dir index item(index: %llu) "
+ btrfs_err(root->fs_info, "err add delayed dir index item(index: %llu) "
"into the deletion tree of the delayed node"
- "(root id: %llu, inode id: %llu, errno: %d)\n",
+ "(root id: %llu, inode id: %llu, errno: %d)",
index, node->root->objectid, node->inode_id,
ret);
BUG();
@@ -1758,7 +1782,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
return -ENOENT;
mutex_lock(&delayed_node->mutex);
- if (!delayed_node->inode_dirty) {
+ if (!test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
mutex_unlock(&delayed_node->mutex);
btrfs_release_delayed_node(delayed_node);
return -ENOENT;
@@ -1809,7 +1833,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
return PTR_ERR(delayed_node);
mutex_lock(&delayed_node->mutex);
- if (delayed_node->inode_dirty) {
+ if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
goto release_node;
}
@@ -1820,7 +1844,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
goto release_node;
fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
- delayed_node->inode_dirty = 1;
+ set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags);
delayed_node->count++;
atomic_inc(&root->fs_info->delayed_root->items);
release_node:
@@ -1829,6 +1853,41 @@ release_node:
return ret;
}
+int btrfs_delayed_delete_inode_ref(struct inode *inode)
+{
+ struct btrfs_delayed_node *delayed_node;
+
+ delayed_node = btrfs_get_or_create_delayed_node(inode);
+ if (IS_ERR(delayed_node))
+ return PTR_ERR(delayed_node);
+
+ /*
+ * We don't reserve space for inode ref deletion is because:
+ * - We ONLY do async inode ref deletion for the inode who has only
+ * one link(i_nlink == 1), it means there is only one inode ref.
+ * And in most case, the inode ref and the inode item are in the
+ * same leaf, and we will deal with them at the same time.
+ * Since we are sure we will reserve the space for the inode item,
+ * it is unnecessary to reserve space for inode ref deletion.
+ * - If the inode ref and the inode item are not in the same leaf,
+ * We also needn't worry about enospc problem, because we reserve
+ * much more space for the inode update than it needs.
+ * - At the worst, we can steal some space from the global reservation.
+ * It is very rare.
+ */
+ mutex_lock(&delayed_node->mutex);
+ if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
+ goto release_node;
+
+ set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags);
+ delayed_node->count++;
+ atomic_inc(&BTRFS_I(inode)->root->fs_info->delayed_root->items);
+release_node:
+ mutex_unlock(&delayed_node->mutex);
+ btrfs_release_delayed_node(delayed_node);
+ return 0;
+}
+
static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
{
struct btrfs_root *root = delayed_node->root;
@@ -1851,7 +1910,10 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
btrfs_release_delayed_item(prev_item);
}
- if (delayed_node->inode_dirty) {
+ if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags))
+ btrfs_release_delayed_iref(delayed_node);
+
+ if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) {
btrfs_delayed_inode_release_metadata(root, delayed_node);
btrfs_release_delayed_inode(delayed_node);
}
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index a4b38f934d1..f70119f2542 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -48,6 +48,10 @@ struct btrfs_delayed_root {
wait_queue_head_t wait;
};
+#define BTRFS_DELAYED_NODE_IN_LIST 0
+#define BTRFS_DELAYED_NODE_INODE_DIRTY 1
+#define BTRFS_DELAYED_NODE_DEL_IREF 2
+
struct btrfs_delayed_node {
u64 inode_id;
u64 bytes_reserved;
@@ -65,8 +69,7 @@ struct btrfs_delayed_node {
struct btrfs_inode_item inode_item;
atomic_t refs;
u64 index_cnt;
- bool in_list;
- bool inode_dirty;
+ unsigned long flags;
int count;
};
@@ -125,6 +128,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode);
int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode);
int btrfs_fill_inode(struct inode *inode, u32 *rdev);
+int btrfs_delayed_delete_inode_ref(struct inode *inode);
/* Used for drop dead root */
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index e4d467be2dd..6d16bea94e1 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -106,6 +106,10 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
return -1;
if (ref1->type > ref2->type)
return 1;
+ if (ref1->no_quota > ref2->no_quota)
+ return 1;
+ if (ref1->no_quota < ref2->no_quota)
+ return -1;
/* merging of sequenced refs is not allowed */
if (compare_seq) {
if (ref1->seq < ref2->seq)
@@ -161,56 +165,69 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
return NULL;
}
+/* insert a new ref to head ref rbtree */
+static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent_node = NULL;
+ struct btrfs_delayed_ref_head *entry;
+ struct btrfs_delayed_ref_head *ins;
+ u64 bytenr;
+
+ ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
+ bytenr = ins->node.bytenr;
+ while (*p) {
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
+ href_node);
+
+ if (bytenr < entry->node.bytenr)
+ p = &(*p)->rb_left;
+ else if (bytenr > entry->node.bytenr)
+ p = &(*p)->rb_right;
+ else
+ return entry;
+ }
+
+ rb_link_node(node, parent_node, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
/*
* find an head entry based on bytenr. This returns the delayed ref
* head if it was able to find one, or NULL if nothing was in that spot.
* If return_bigger is given, the next bigger entry is returned if no exact
* match is found.
*/
-static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
- u64 bytenr,
- struct btrfs_delayed_ref_node **last,
- int return_bigger)
+static struct btrfs_delayed_ref_head *
+find_ref_head(struct rb_root *root, u64 bytenr,
+ int return_bigger)
{
struct rb_node *n;
- struct btrfs_delayed_ref_node *entry;
- int cmp = 0;
+ struct btrfs_delayed_ref_head *entry;
-again:
n = root->rb_node;
entry = NULL;
while (n) {
- entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
- WARN_ON(!entry->in_tree);
- if (last)
- *last = entry;
-
- if (bytenr < entry->bytenr)
- cmp = -1;
- else if (bytenr > entry->bytenr)
- cmp = 1;
- else if (!btrfs_delayed_ref_is_head(entry))
- cmp = 1;
- else
- cmp = 0;
+ entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
- if (cmp < 0)
+ if (bytenr < entry->node.bytenr)
n = n->rb_left;
- else if (cmp > 0)
+ else if (bytenr > entry->node.bytenr)
n = n->rb_right;
else
return entry;
}
if (entry && return_bigger) {
- if (cmp > 0) {
- n = rb_next(&entry->rb_node);
+ if (bytenr > entry->node.bytenr) {
+ n = rb_next(&entry->href_node);
if (!n)
n = rb_first(root);
- entry = rb_entry(n, struct btrfs_delayed_ref_node,
- rb_node);
- bytenr = entry->bytenr;
- return_bigger = 0;
- goto again;
+ entry = rb_entry(n, struct btrfs_delayed_ref_head,
+ href_node);
+ return entry;
}
return entry;
}
@@ -243,33 +260,38 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *ref)
{
- rb_erase(&ref->rb_node, &delayed_refs->root);
+ if (btrfs_delayed_ref_is_head(ref)) {
+ head = btrfs_delayed_node_to_head(ref);
+ rb_erase(&head->href_node, &delayed_refs->href_root);
+ } else {
+ assert_spin_locked(&head->lock);
+ rb_erase(&ref->rb_node, &head->ref_root);
+ }
ref->in_tree = 0;
btrfs_put_delayed_ref(ref);
- delayed_refs->num_entries--;
+ atomic_dec(&delayed_refs->num_entries);
if (trans->delayed_ref_updates)
trans->delayed_ref_updates--;
}
static int merge_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *ref, u64 seq)
{
struct rb_node *node;
- int merged = 0;
int mod = 0;
int done = 0;
- node = rb_prev(&ref->rb_node);
- while (node) {
+ node = rb_next(&ref->rb_node);
+ while (!done && node) {
struct btrfs_delayed_ref_node *next;
next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- node = rb_prev(node);
- if (next->bytenr != ref->bytenr)
- break;
+ node = rb_next(node);
if (seq && next->seq >= seq)
break;
if (comp_entry(ref, next, 0))
@@ -289,12 +311,11 @@ static int merge_ref(struct btrfs_trans_handle *trans,
mod = -next->ref_mod;
}
- merged++;
- drop_delayed_ref(trans, delayed_refs, next);
+ drop_delayed_ref(trans, delayed_refs, head, next);
ref->ref_mod += mod;
if (ref->ref_mod == 0) {
- drop_delayed_ref(trans, delayed_refs, ref);
- break;
+ drop_delayed_ref(trans, delayed_refs, head, ref);
+ done = 1;
} else {
/*
* You can't have multiples of the same ref on a tree
@@ -303,13 +324,8 @@ static int merge_ref(struct btrfs_trans_handle *trans,
WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
}
-
- if (done)
- break;
- node = rb_prev(&ref->rb_node);
}
-
- return merged;
+ return done;
}
void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
@@ -320,6 +336,14 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
struct rb_node *node;
u64 seq = 0;
+ assert_spin_locked(&head->lock);
+ /*
+ * We don't have too much refs to merge in the case of delayed data
+ * refs.
+ */
+ if (head->is_data)
+ return;
+
spin_lock(&fs_info->tree_mod_seq_lock);
if (!list_empty(&fs_info->tree_mod_seq_list)) {
struct seq_list *elem;
@@ -330,22 +354,19 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
}
spin_unlock(&fs_info->tree_mod_seq_lock);
- node = rb_prev(&head->node.rb_node);
+ node = rb_first(&head->ref_root);
while (node) {
struct btrfs_delayed_ref_node *ref;
ref = rb_entry(node, struct btrfs_delayed_ref_node,
rb_node);
- if (ref->bytenr != head->node.bytenr)
- break;
-
/* We can't merge refs that are outside of our seq count */
if (seq && ref->seq >= seq)
break;
- if (merge_ref(trans, delayed_refs, ref, seq))
- node = rb_prev(&head->node.rb_node);
+ if (merge_ref(trans, delayed_refs, head, ref, seq))
+ node = rb_first(&head->ref_root);
else
- node = rb_prev(node);
+ node = rb_next(&ref->rb_node);
}
}
@@ -373,71 +394,52 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
return ret;
}
-int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
- struct list_head *cluster, u64 start)
+struct btrfs_delayed_ref_head *
+btrfs_select_ref_head(struct btrfs_trans_handle *trans)
{
- int count = 0;
struct btrfs_delayed_ref_root *delayed_refs;
- struct rb_node *node;
- struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_ref_head *head;
+ u64 start;
+ bool loop = false;
delayed_refs = &trans->transaction->delayed_refs;
- if (start == 0) {
- node = rb_first(&delayed_refs->root);
- } else {
- ref = NULL;
- find_ref_head(&delayed_refs->root, start + 1, &ref, 1);
- if (ref) {
- node = &ref->rb_node;
- } else
- node = rb_first(&delayed_refs->root);
- }
+
again:
- while (node && count < 32) {
- ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- if (btrfs_delayed_ref_is_head(ref)) {
- head = btrfs_delayed_node_to_head(ref);
- if (list_empty(&head->cluster)) {
- list_add_tail(&head->cluster, cluster);
- delayed_refs->run_delayed_start =
- head->node.bytenr;
- count++;
-
- WARN_ON(delayed_refs->num_heads_ready == 0);
- delayed_refs->num_heads_ready--;
- } else if (count) {
- /* the goal of the clustering is to find extents
- * that are likely to end up in the same extent
- * leaf on disk. So, we don't want them spread
- * all over the tree. Stop now if we've hit
- * a head that was already in use
- */
- break;
- }
- }
- node = rb_next(node);
- }
- if (count) {
- return 0;
- } else if (start) {
- /*
- * we've gone to the end of the rbtree without finding any
- * clusters. start from the beginning and try again
- */
+ start = delayed_refs->run_delayed_start;
+ head = find_ref_head(&delayed_refs->href_root, start, 1);
+ if (!head && !loop) {
+ delayed_refs->run_delayed_start = 0;
start = 0;
- node = rb_first(&delayed_refs->root);
- goto again;
+ loop = true;
+ head = find_ref_head(&delayed_refs->href_root, start, 1);
+ if (!head)
+ return NULL;
+ } else if (!head && loop) {
+ return NULL;
}
- return 1;
-}
-void btrfs_release_ref_cluster(struct list_head *cluster)
-{
- struct list_head *pos, *q;
+ while (head->processing) {
+ struct rb_node *node;
- list_for_each_safe(pos, q, cluster)
- list_del_init(pos);
+ node = rb_next(&head->href_node);
+ if (!node) {
+ if (loop)
+ return NULL;
+ delayed_refs->run_delayed_start = 0;
+ start = 0;
+ loop = true;
+ goto again;
+ }
+ head = rb_entry(node, struct btrfs_delayed_ref_head,
+ href_node);
+ }
+
+ head->processing = 1;
+ WARN_ON(delayed_refs->num_heads_ready == 0);
+ delayed_refs->num_heads_ready--;
+ delayed_refs->run_delayed_start = head->node.bytenr +
+ head->node.num_bytes;
+ return head;
}
/*
@@ -451,6 +453,7 @@ void btrfs_release_ref_cluster(struct list_head *cluster)
static noinline void
update_existing_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head,
struct btrfs_delayed_ref_node *existing,
struct btrfs_delayed_ref_node *update)
{
@@ -463,7 +466,7 @@ update_existing_ref(struct btrfs_trans_handle *trans,
*/
existing->ref_mod--;
if (existing->ref_mod == 0)
- drop_delayed_ref(trans, delayed_refs, existing);
+ drop_delayed_ref(trans, delayed_refs, head, existing);
else
WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
@@ -496,6 +499,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
ref = btrfs_delayed_node_to_head(update);
BUG_ON(existing_ref->is_data != ref->is_data);
+ spin_lock(&existing_ref->lock);
if (ref->must_insert_reserved) {
/* if the extent was freed and then
* reallocated before the delayed ref
@@ -533,9 +537,12 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
}
}
/*
- * update the reference mod on the head to reflect this new operation
+ * update the reference mod on the head to reflect this new operation,
+ * only need the lock for this case cause we could be processing it
+ * currently, for refs we just added we know we're a-ok.
*/
existing->ref_mod += update->ref_mod;
+ spin_unlock(&existing_ref->lock);
}
/*
@@ -543,13 +550,13 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
* this does all the dirty work in terms of maintaining the correct
* overall modification count.
*/
-static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
- struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *ref,
- u64 bytenr, u64 num_bytes,
- int action, int is_data)
+static noinline struct btrfs_delayed_ref_head *
+add_delayed_ref_head(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *ref, u64 bytenr,
+ u64 num_bytes, int action, int is_data)
{
- struct btrfs_delayed_ref_node *existing;
+ struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_head *head_ref = NULL;
struct btrfs_delayed_ref_root *delayed_refs;
int count_mod = 1;
@@ -596,38 +603,43 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
head_ref = btrfs_delayed_node_to_head(ref);
head_ref->must_insert_reserved = must_insert_reserved;
head_ref->is_data = is_data;
+ head_ref->ref_root = RB_ROOT;
+ head_ref->processing = 0;
- INIT_LIST_HEAD(&head_ref->cluster);
+ spin_lock_init(&head_ref->lock);
mutex_init(&head_ref->mutex);
trace_add_delayed_ref_head(ref, head_ref, action);
- existing = tree_insert(&delayed_refs->root, &ref->rb_node);
-
+ existing = htree_insert(&delayed_refs->href_root,
+ &head_ref->href_node);
if (existing) {
- update_existing_head_ref(existing, ref);
+ update_existing_head_ref(&existing->node, ref);
/*
* we've updated the existing ref, free the newly
* allocated ref
*/
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
+ head_ref = existing;
} else {
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
- delayed_refs->num_entries++;
+ atomic_inc(&delayed_refs->num_entries);
trans->delayed_ref_updates++;
}
+ return head_ref;
}
/*
* helper to insert a delayed tree ref into the rbtree.
*/
-static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
- struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *ref,
- u64 bytenr, u64 num_bytes, u64 parent,
- u64 ref_root, int level, int action,
- int for_cow)
+static noinline void
+add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head_ref,
+ struct btrfs_delayed_ref_node *ref, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 ref_root, int level,
+ int action, int no_quota)
{
struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_tree_ref *full_ref;
@@ -637,6 +649,8 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
if (action == BTRFS_ADD_DELAYED_EXTENT)
action = BTRFS_ADD_DELAYED_REF;
+ if (is_fstree(ref_root))
+ seq = atomic64_read(&fs_info->tree_mod_seq);
delayed_refs = &trans->transaction->delayed_refs;
/* first set the basic ref node struct up */
@@ -647,9 +661,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
ref->action = action;
ref->is_head = 0;
ref->in_tree = 1;
-
- if (need_ref_seq(for_cow, ref_root))
- seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+ ref->no_quota = no_quota;
ref->seq = seq;
full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -663,30 +675,33 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
trace_add_delayed_tree_ref(ref, full_ref, action);
- existing = tree_insert(&delayed_refs->root, &ref->rb_node);
-
+ spin_lock(&head_ref->lock);
+ existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
if (existing) {
- update_existing_ref(trans, delayed_refs, existing, ref);
+ update_existing_ref(trans, delayed_refs, head_ref, existing,
+ ref);
/*
* we've updated the existing ref, free the newly
* allocated ref
*/
kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
} else {
- delayed_refs->num_entries++;
+ atomic_inc(&delayed_refs->num_entries);
trans->delayed_ref_updates++;
}
+ spin_unlock(&head_ref->lock);
}
/*
* helper to insert a delayed data ref into the rbtree.
*/
-static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
- struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *ref,
- u64 bytenr, u64 num_bytes, u64 parent,
- u64 ref_root, u64 owner, u64 offset,
- int action, int for_cow)
+static noinline void
+add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head_ref,
+ struct btrfs_delayed_ref_node *ref, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
+ u64 offset, int action, int no_quota)
{
struct btrfs_delayed_ref_node *existing;
struct btrfs_delayed_data_ref *full_ref;
@@ -698,6 +713,9 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
delayed_refs = &trans->transaction->delayed_refs;
+ if (is_fstree(ref_root))
+ seq = atomic64_read(&fs_info->tree_mod_seq);
+
/* first set the basic ref node struct up */
atomic_set(&ref->refs, 1);
ref->bytenr = bytenr;
@@ -706,9 +724,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
ref->action = action;
ref->is_head = 0;
ref->in_tree = 1;
-
- if (need_ref_seq(for_cow, ref_root))
- seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+ ref->no_quota = no_quota;
ref->seq = seq;
full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -724,19 +740,21 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
trace_add_delayed_data_ref(ref, full_ref, action);
- existing = tree_insert(&delayed_refs->root, &ref->rb_node);
-
+ spin_lock(&head_ref->lock);
+ existing = tree_insert(&head_ref->ref_root, &ref->rb_node);
if (existing) {
- update_existing_ref(trans, delayed_refs, existing, ref);
+ update_existing_ref(trans, delayed_refs, head_ref, existing,
+ ref);
/*
* we've updated the existing ref, free the newly
* allocated ref
*/
kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
} else {
- delayed_refs->num_entries++;
+ atomic_inc(&delayed_refs->num_entries);
trans->delayed_ref_updates++;
}
+ spin_unlock(&head_ref->lock);
}
/*
@@ -749,12 +767,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
struct btrfs_delayed_extent_op *extent_op,
- int for_cow)
+ int no_quota)
{
struct btrfs_delayed_tree_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+ no_quota = 0;
+
BUG_ON(extent_op && extent_op->is_data);
ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
if (!ref)
@@ -775,15 +796,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
- num_bytes, action, 0);
+ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
+ bytenr, num_bytes, action, 0);
- add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
+ add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
num_bytes, parent, ref_root, level, action,
- for_cow);
+ no_quota);
spin_unlock(&delayed_refs->lock);
- if (need_ref_seq(for_cow, ref_root))
- btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
return 0;
}
@@ -797,12 +816,15 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
u64 parent, u64 ref_root,
u64 owner, u64 offset, int action,
struct btrfs_delayed_extent_op *extent_op,
- int for_cow)
+ int no_quota)
{
struct btrfs_delayed_data_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
+ if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+ no_quota = 0;
+
BUG_ON(extent_op && !extent_op->is_data);
ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
if (!ref)
@@ -823,15 +845,13 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
* insert both the head node and the new ref without dropping
* the spin lock
*/
- add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
- num_bytes, action, 1);
+ head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
+ bytenr, num_bytes, action, 1);
- add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
+ add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
num_bytes, parent, ref_root, owner, offset,
- action, for_cow);
+ action, no_quota);
spin_unlock(&delayed_refs->lock);
- if (need_ref_seq(for_cow, ref_root))
- btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
return 0;
}
@@ -869,14 +889,10 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
{
- struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_ref_root *delayed_refs;
delayed_refs = &trans->transaction->delayed_refs;
- ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0);
- if (ref)
- return btrfs_delayed_node_to_head(ref);
- return NULL;
+ return find_ref_head(&delayed_refs->href_root, bytenr, 0);
}
void btrfs_delayed_ref_exit(void)
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 70b962cc177..a764e2340d4 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -52,6 +52,7 @@ struct btrfs_delayed_ref_node {
unsigned int action:8;
unsigned int type:8;
+ unsigned int no_quota:1;
/* is this node still in the rbtree? */
unsigned int is_head:1;
unsigned int in_tree:1;
@@ -81,7 +82,10 @@ struct btrfs_delayed_ref_head {
*/
struct mutex mutex;
- struct list_head cluster;
+ spinlock_t lock;
+ struct rb_root ref_root;
+
+ struct rb_node href_node;
struct btrfs_delayed_extent_op *extent_op;
/*
@@ -98,6 +102,7 @@ struct btrfs_delayed_ref_head {
*/
unsigned int must_insert_reserved:1;
unsigned int is_data:1;
+ unsigned int processing:1;
};
struct btrfs_delayed_tree_ref {
@@ -116,7 +121,8 @@ struct btrfs_delayed_data_ref {
};
struct btrfs_delayed_ref_root {
- struct rb_root root;
+ /* head ref rbtree */
+ struct rb_root href_root;
/* this spin lock protects the rbtree and the entries inside */
spinlock_t lock;
@@ -124,7 +130,7 @@ struct btrfs_delayed_ref_root {
/* how many delayed ref updates we've queued, used by the
* throttling code
*/
- unsigned long num_entries;
+ atomic_t num_entries;
/* total number of head nodes in tree */
unsigned long num_heads;
@@ -133,15 +139,6 @@ struct btrfs_delayed_ref_root {
unsigned long num_heads_ready;
/*
- * bumped when someone is making progress on the delayed
- * refs, so that other procs know they are just adding to
- * contention intead of helping
- */
- atomic_t procs_running_refs;
- atomic_t ref_seq;
- wait_queue_head_t wait;
-
- /*
* set when the tree is flushing before a transaction commit,
* used by the throttling code to decide if new updates need
* to be run right away
@@ -200,14 +197,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
struct btrfs_delayed_extent_op *extent_op,
- int for_cow);
+ int no_quota);
int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
u64 owner, u64 offset, int action,
struct btrfs_delayed_extent_op *extent_op,
- int for_cow);
+ int no_quota);
int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
@@ -226,34 +223,15 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
mutex_unlock(&head->mutex);
}
-int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
- struct list_head *cluster, u64 search_start);
-void btrfs_release_ref_cluster(struct list_head *cluster);
+
+struct btrfs_delayed_ref_head *
+btrfs_select_ref_head(struct btrfs_trans_handle *trans);
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
u64 seq);
/*
- * delayed refs with a ref_seq > 0 must be held back during backref walking.
- * this only applies to items in one of the fs-trees. for_cow items never need
- * to be held back, so they won't get a ref_seq number.
- */
-static inline int need_ref_seq(int for_cow, u64 rootid)
-{
- if (for_cow)
- return 0;
-
- if (rootid == BTRFS_FS_TREE_OBJECTID)
- return 1;
-
- if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
- return 1;
-
- return 0;
-}
-
-/*
* a node might live in a head or a regular ref, this lets you
* test for the proper type to use.
*/
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 9efb94e9585..eea26e1b2fd 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -26,7 +26,6 @@
#include <linux/kthread.h>
#include <linux/math64.h>
#include <asm/div64.h>
-#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
@@ -37,8 +36,8 @@
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
+#include "sysfs.h"
-static u64 btrfs_get_seconds_since_1970(void);
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret);
static void btrfs_dev_replace_update_device_in_mapping_tree(
@@ -104,7 +103,8 @@ no_valid_dev_replace_entry_found:
ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
if (item_size != sizeof(struct btrfs_dev_replace_item)) {
- pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n");
+ btrfs_warn(fs_info,
+ "dev_replace entry found has unexpected size, ignore entry");
goto no_valid_dev_replace_entry_found;
}
@@ -147,13 +147,19 @@ no_valid_dev_replace_entry_found:
if (!dev_replace->srcdev &&
!btrfs_test_opt(dev_root, DEGRADED)) {
ret = -EIO;
- pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
- src_devid);
+ btrfs_warn(fs_info,
+ "cannot mount because device replace operation is ongoing and");
+ btrfs_warn(fs_info,
+ "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
+ src_devid);
}
if (!dev_replace->tgtdev &&
!btrfs_test_opt(dev_root, DEGRADED)) {
ret = -EIO;
- pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n",
+ btrfs_warn(fs_info,
+ "cannot mount because device replace operation is ongoing and");
+ btrfs_warn(fs_info,
+ "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
BTRFS_DEV_REPLACE_DEVID);
}
if (dev_replace->tgtdev) {
@@ -212,7 +218,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
}
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
- pr_warn("btrfs: error %d while searching for dev_replace item!\n",
+ btrfs_warn(fs_info, "error %d while searching for dev_replace item!",
ret);
goto out;
}
@@ -232,7 +238,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
*/
ret = btrfs_del_item(trans, dev_root, path);
if (ret != 0) {
- pr_warn("btrfs: delete too small dev_replace item failed %d!\n",
+ btrfs_warn(fs_info, "delete too small dev_replace item failed %d!",
ret);
goto out;
}
@@ -245,7 +251,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, dev_root, path,
&key, sizeof(*ptr));
if (ret < 0) {
- pr_warn("btrfs: insert dev_replace item failed %d!\n",
+ btrfs_warn(fs_info, "insert dev_replace item failed %d!",
ret);
goto out;
}
@@ -296,13 +302,6 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
dev_replace->cursor_left_last_write_of_item;
}
-static u64 btrfs_get_seconds_since_1970(void)
-{
- struct timespec t = CURRENT_TIME_SEC;
-
- return t.tv_sec;
-}
-
int btrfs_dev_replace_start(struct btrfs_root *root,
struct btrfs_ioctl_dev_replace_args *args)
{
@@ -314,8 +313,8 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
struct btrfs_device *src_device = NULL;
if (btrfs_fs_incompat(fs_info, RAID56)) {
- pr_warn("btrfs: dev_replace cannot yet handle RAID5/RAID6\n");
- return -EINVAL;
+ btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6");
+ return -EOPNOTSUPP;
}
switch (args->start.cont_reading_from_srcdev_mode) {
@@ -334,7 +333,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
&tgt_device);
if (ret) {
- pr_err("btrfs: target device %s is invalid!\n",
+ btrfs_err(fs_info, "target device %s is invalid!",
args->start.tgtdev_name);
mutex_unlock(&fs_info->volume_mutex);
return -EINVAL;
@@ -350,7 +349,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
}
if (tgt_device->total_bytes < src_device->total_bytes) {
- pr_err("btrfs: target device is smaller than source device!\n");
+ btrfs_err(fs_info, "target device is smaller than source device!");
ret = -EINVAL;
goto leave_no_lock;
}
@@ -375,7 +374,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
dev_replace->tgtdev = tgt_device;
printk_in_rcu(KERN_INFO
- "btrfs: dev_replace from %s (devid %llu) to %s) started\n",
+ "BTRFS: dev_replace from %s (devid %llu) to %s started\n",
src_device->missing ? "<missing disk>" :
rcu_str_deref(src_device->name),
src_device->devid,
@@ -390,7 +389,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
* go to the tgtdev as well (refer to btrfs_map_block()).
*/
dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
- dev_replace->time_started = btrfs_get_seconds_since_1970();
+ dev_replace->time_started = get_seconds();
dev_replace->cursor_left = 0;
dev_replace->committed_cursor_left = 0;
dev_replace->cursor_left_last_write_of_item = 0;
@@ -400,7 +399,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
btrfs_dev_replace_unlock(dev_replace);
- btrfs_wait_all_ordered_extents(root->fs_info);
+ btrfs_wait_ordered_roots(root->fs_info, -1);
/* force writing the updated state information to disk */
trans = btrfs_start_transaction(root, 0);
@@ -433,6 +432,35 @@ leave_no_lock:
return ret;
}
+/*
+ * blocked until all flighting bios are finished.
+ */
+static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
+{
+ s64 writers;
+ DEFINE_WAIT(wait);
+
+ set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
+ do {
+ prepare_to_wait(&fs_info->replace_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ writers = percpu_counter_sum(&fs_info->bio_counter);
+ if (writers)
+ schedule();
+ finish_wait(&fs_info->replace_wait, &wait);
+ } while (writers);
+}
+
+/*
+ * we have removed target device, it is safe to allow new bios request.
+ */
+static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
+{
+ clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
+ if (waitqueue_active(&fs_info->replace_wait))
+ wake_up(&fs_info->replace_wait);
+}
+
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret)
{
@@ -460,22 +488,16 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
src_device = dev_replace->srcdev;
btrfs_dev_replace_unlock(dev_replace);
- /* replace old device with new one in mapping tree */
- if (!scrub_ret)
- btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
- src_device,
- tgt_device);
-
/*
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
- ret = btrfs_start_all_delalloc_inodes(root->fs_info, 0);
+ ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
}
- btrfs_wait_all_ordered_extents(root->fs_info);
+ btrfs_wait_ordered_roots(root->fs_info, -1);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
@@ -486,6 +508,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
WARN_ON(ret);
/* keep away write_all_supers() during the finishing procedure */
+ mutex_lock(&root->fs_info->chunk_mutex);
mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
btrfs_dev_replace_lock(dev_replace);
dev_replace->replace_state =
@@ -493,18 +516,24 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
: BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
dev_replace->tgtdev = NULL;
dev_replace->srcdev = NULL;
- dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+ dev_replace->time_stopped = get_seconds();
dev_replace->item_needs_writeback = 1;
- if (scrub_ret) {
+ /* replace old device with new one in mapping tree */
+ if (!scrub_ret) {
+ btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
+ src_device,
+ tgt_device);
+ } else {
printk_in_rcu(KERN_ERR
- "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
+ "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
src_device->missing ? "<missing disk>" :
rcu_str_deref(src_device->name),
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
btrfs_dev_replace_unlock(dev_replace);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&root->fs_info->chunk_mutex);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
@@ -513,7 +542,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
}
printk_in_rcu(KERN_INFO
- "btrfs: dev_replace from %s (devid %llu) to %s) finished\n",
+ "BTRFS: dev_replace from %s (devid %llu) to %s) finished\n",
src_device->missing ? "<missing disk>" :
rcu_str_deref(src_device->name),
src_device->devid,
@@ -534,8 +563,16 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
fs_info->fs_devices->latest_bdev = tgt_device->bdev;
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
+ /* replace the sysfs entry */
+ btrfs_kobj_rm_device(fs_info, src_device);
+ btrfs_kobj_add_device(fs_info, tgt_device);
+
+ btrfs_rm_dev_replace_blocked(fs_info);
+
btrfs_rm_dev_replace_srcdev(fs_info, src_device);
+ btrfs_rm_dev_replace_unblocked(fs_info);
+
/*
* this is again a consistent state where no dev_replace procedure
* is running, the target device is part of the filesystem, the
@@ -545,6 +582,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
*/
btrfs_dev_replace_unlock(dev_replace);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&root->fs_info->chunk_mutex);
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
@@ -650,6 +688,9 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
u64 result;
int ret;
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
btrfs_dev_replace_lock(dev_replace);
switch (dev_replace->replace_state) {
@@ -668,7 +709,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
break;
}
dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
- dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+ dev_replace->time_stopped = get_seconds();
dev_replace->item_needs_writeback = 1;
btrfs_dev_replace_unlock(dev_replace);
btrfs_scrub_cancel(fs_info);
@@ -703,9 +744,9 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
- dev_replace->time_stopped = btrfs_get_seconds_since_1970();
+ dev_replace->time_stopped = get_seconds();
dev_replace->item_needs_writeback = 1;
- pr_info("btrfs: suspending dev_replace for unmount\n");
+ btrfs_info(fs_info, "suspending dev_replace for unmount");
break;
}
@@ -734,8 +775,9 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
break;
}
if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
- pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n"
- "btrfs: you may cancel the operation after 'mount -o degraded'\n");
+ btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing");
+ btrfs_info(fs_info,
+ "you may cancel the operation after 'mount -o degraded'");
btrfs_dev_replace_unlock(dev_replace);
return 0;
}
@@ -761,14 +803,14 @@ static int btrfs_dev_replace_kthread(void *data)
kfree(status_args);
do_div(progress, 10);
printk_in_rcu(KERN_INFO
- "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
- dev_replace->srcdev->missing ? "<missing disk>" :
- rcu_str_deref(dev_replace->srcdev->name),
- dev_replace->srcdev->devid,
- dev_replace->tgtdev ?
- rcu_str_deref(dev_replace->tgtdev->name) :
- "<missing target disk>",
- (unsigned int)progress);
+ "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
+ dev_replace->srcdev->missing ? "<missing disk>" :
+ rcu_str_deref(dev_replace->srcdev->name),
+ dev_replace->srcdev->devid,
+ dev_replace->tgtdev ?
+ rcu_str_deref(dev_replace->tgtdev->name) :
+ "<missing target disk>",
+ (unsigned int)progress);
}
btrfs_dev_replace_continue_on_mount(fs_info);
atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
@@ -860,3 +902,31 @@ void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
mutex_unlock(&dev_replace->lock_management_lock);
}
}
+
+void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
+{
+ percpu_counter_inc(&fs_info->bio_counter);
+}
+
+void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
+{
+ percpu_counter_dec(&fs_info->bio_counter);
+
+ if (waitqueue_active(&fs_info->replace_wait))
+ wake_up(&fs_info->replace_wait);
+}
+
+void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
+{
+ DEFINE_WAIT(wait);
+again:
+ percpu_counter_inc(&fs_info->bio_counter);
+ if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
+ btrfs_bio_counter_dec(fs_info);
+ wait_event(fs_info->replace_wait,
+ !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
+ &fs_info->fs_state));
+ goto again;
+ }
+
+}
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 79e594e341c..a0691df5dce 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -58,7 +58,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
return ERR_PTR(ret);
WARN_ON(ret > 0);
leaf = path->nodes[0];
- item = btrfs_item_nr(leaf, path->slots[0]);
+ item = btrfs_item_nr(path->slots[0]);
ptr = btrfs_item_ptr(leaf, path->slots[0], char);
BUG_ON(data_size > btrfs_item_size(leaf, item));
ptr += btrfs_item_size(leaf, item) - data_size;
@@ -261,7 +261,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
* see if there is room in the item to insert this
* name
*/
- data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item);
+ data_size = sizeof(*di) + name_len;
leaf = path->nodes[0];
slot = path->slots[0];
if (data_size + btrfs_item_size_nr(leaf, slot) +
@@ -459,7 +459,7 @@ int verify_dir_item(struct btrfs_root *root,
u8 type = btrfs_dir_type(leaf, dir_item);
if (type >= BTRFS_FT_MAX) {
- printk(KERN_CRIT "btrfs: invalid dir item type: %d\n",
+ btrfs_crit(root->fs_info, "invalid dir item type: %d",
(int)type);
return 1;
}
@@ -468,14 +468,16 @@ int verify_dir_item(struct btrfs_root *root,
namelen = XATTR_NAME_MAX;
if (btrfs_dir_name_len(leaf, dir_item) > namelen) {
- printk(KERN_CRIT "btrfs: invalid dir item name len: %u\n",
+ btrfs_crit(root->fs_info, "invalid dir item name len: %u",
(unsigned)btrfs_dir_data_len(leaf, dir_item));
return 1;
}
/* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
- if (btrfs_dir_data_len(leaf, dir_item) > BTRFS_MAX_XATTR_SIZE(root)) {
- printk(KERN_CRIT "btrfs: invalid dir item data len: %u\n",
+ if ((btrfs_dir_data_len(leaf, dir_item) +
+ btrfs_dir_name_len(leaf, dir_item)) > BTRFS_MAX_XATTR_SIZE(root)) {
+ btrfs_crit(root->fs_info, "invalid dir item name + data len: %u + %u",
+ (unsigned)btrfs_dir_name_len(leaf, dir_item),
(unsigned)btrfs_dir_data_len(leaf, dir_item));
return 1;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4ae17ed13b3..08e65e9cf2a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -26,16 +26,15 @@
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
-#include <linux/crc32c.h>
#include <linux/slab.h>
#include <linux/migrate.h>
#include <linux/ratelimit.h>
#include <linux/uuid.h>
#include <linux/semaphore.h>
#include <asm/unaligned.h>
-#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
+#include "hash.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
@@ -49,6 +48,8 @@
#include "rcu-string.h"
#include "dev-replace.h"
#include "raid56.h"
+#include "sysfs.h"
+#include "qgroup.h"
#ifdef CONFIG_X86
#include <asm/cpufeature.h>
@@ -64,7 +65,6 @@ static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
struct btrfs_root *root);
-static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t);
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
static int btrfs_destroy_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages,
@@ -245,7 +245,7 @@ out:
u32 btrfs_csum_data(char *data, u32 seed, size_t len)
{
- return crc32c(seed, data, len);
+ return btrfs_crc32c(seed, data, len);
}
void btrfs_csum_final(u32 crc, char *result)
@@ -301,11 +301,11 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
memcpy(&found, result, csum_size);
read_extent_buffer(buf, &val, 0, csum_size);
- printk_ratelimited(KERN_INFO "btrfs: %s checksum verify "
- "failed on %llu wanted %X found %X "
- "level %d\n",
- root->fs_info->sb->s_id, buf->start,
- val, found, btrfs_header_level(buf));
+ printk_ratelimited(KERN_INFO
+ "BTRFS: %s checksum verify failed on %llu wanted %X found %X "
+ "level %d\n",
+ root->fs_info->sb->s_id, buf->start,
+ val, found, btrfs_header_level(buf));
if (result != (char *)&inline_result)
kfree(result);
return 1;
@@ -330,6 +330,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
{
struct extent_state *cached_state = NULL;
int ret;
+ bool need_lock = (current->journal_info ==
+ (void *)BTRFS_SEND_TRANS_STUB);
if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
return 0;
@@ -337,6 +339,11 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
if (atomic)
return -EAGAIN;
+ if (need_lock) {
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ }
+
lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
0, &cached_state);
if (extent_buffer_uptodate(eb) &&
@@ -348,10 +355,22 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
"found %llu\n",
eb->start, parent_transid, btrfs_header_generation(eb));
ret = 1;
- clear_extent_buffer_uptodate(eb);
+
+ /*
+ * Things reading via commit roots that don't have normal protection,
+ * like send, can have a really old block in cache that may point at a
+ * block that has been free'd and re-allocated. So don't clear uptodate
+ * if we find an eb that is under IO (dirty/writeback) because we could
+ * end up reading in the stale data and then writing it back out and
+ * making everybody very sad.
+ */
+ if (!extent_buffer_under_io(eb))
+ clear_extent_buffer_uptodate(eb);
out:
unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
&cached_state, GFP_NOFS);
+ if (need_lock)
+ btrfs_tree_read_unlock_blocking(eb);
return ret;
}
@@ -384,13 +403,14 @@ static int btrfs_check_super_csum(char *raw_disk_sb)
ret = 1;
if (ret && btrfs_super_generation(disk_sb) < 10) {
- printk(KERN_WARNING "btrfs: super block crcs don't match, older mkfs detected\n");
+ printk(KERN_WARNING
+ "BTRFS: super block crcs don't match, older mkfs detected\n");
ret = 0;
}
}
if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
- printk(KERN_ERR "btrfs: unsupported checksum algorithm %u\n",
+ printk(KERN_ERR "BTRFS: unsupported checksum algorithm %u\n",
csum_type);
ret = 1;
}
@@ -466,25 +486,16 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
{
- struct extent_io_tree *tree;
u64 start = page_offset(page);
u64 found_start;
struct extent_buffer *eb;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
-
eb = (struct extent_buffer *)page->private;
if (page != eb->pages[0])
return 0;
found_start = btrfs_header_bytenr(eb);
- if (found_start != start) {
- WARN_ON(1);
+ if (WARN_ON(found_start != start || !PageUptodate(page)))
return 0;
- }
- if (!PageUptodate(page)) {
- WARN_ON(1);
- return 0;
- }
csum_tree_block(root, eb, 0);
return 0;
}
@@ -496,7 +507,7 @@ static int check_tree_block_fsid(struct btrfs_root *root,
u8 fsid[BTRFS_UUID_SIZE];
int ret = 1;
- read_extent_buffer(eb, fsid, btrfs_header_fsid(eb), BTRFS_FSID_SIZE);
+ read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
while (fs_devices) {
if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
ret = 0;
@@ -508,8 +519,8 @@ static int check_tree_block_fsid(struct btrfs_root *root,
}
#define CORRUPT(reason, eb, root, slot) \
- printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \
- "root=%llu, slot=%d\n", reason, \
+ btrfs_crit(root->fs_info, "corrupt leaf, %s: block=%llu," \
+ "root=%llu, slot=%d", reason, \
btrfs_header_bytenr(eb), root->objectid, slot)
static noinline int check_leaf(struct btrfs_root *root,
@@ -577,7 +588,6 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
u64 phy_offset, struct page *page,
u64 start, u64 end, int mirror)
{
- struct extent_io_tree *tree;
u64 found_start;
int found_level;
struct extent_buffer *eb;
@@ -588,7 +598,6 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
if (!page->private)
goto out;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
eb = (struct extent_buffer *)page->private;
/* the pending IO might have been the only thing that kept this buffer
@@ -608,21 +617,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
- printk_ratelimited(KERN_INFO "btrfs bad tree block start "
+ printk_ratelimited(KERN_INFO "BTRFS: bad tree block start "
"%llu %llu\n",
found_start, eb->start);
ret = -EIO;
goto err;
}
if (check_tree_block_fsid(root, eb)) {
- printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
+ printk_ratelimited(KERN_INFO "BTRFS: bad fsid on block %llu\n",
eb->start);
ret = -EIO;
goto err;
}
found_level = btrfs_header_level(eb);
if (found_level >= BTRFS_MAX_LEVEL) {
- btrfs_info(root->fs_info, "bad tree block level %d\n",
+ btrfs_info(root->fs_info, "bad tree block level %d",
(int)btrfs_header_level(eb));
ret = -EIO;
goto err;
@@ -689,32 +698,31 @@ static void end_workqueue_bio(struct bio *bio, int err)
fs_info = end_io_wq->info;
end_io_wq->error = err;
- end_io_wq->work.func = end_workqueue_fn;
- end_io_wq->work.flags = 0;
+ btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
if (bio->bi_rw & REQ_WRITE) {
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
- btrfs_queue_worker(&fs_info->endio_meta_write_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_meta_write_workers,
+ &end_io_wq->work);
else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
- btrfs_queue_worker(&fs_info->endio_freespace_worker,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_freespace_worker,
+ &end_io_wq->work);
else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- btrfs_queue_worker(&fs_info->endio_raid56_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_raid56_workers,
+ &end_io_wq->work);
else
- btrfs_queue_worker(&fs_info->endio_write_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_write_workers,
+ &end_io_wq->work);
} else {
if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
- btrfs_queue_worker(&fs_info->endio_raid56_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_raid56_workers,
+ &end_io_wq->work);
else if (end_io_wq->metadata)
- btrfs_queue_worker(&fs_info->endio_meta_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_meta_workers,
+ &end_io_wq->work);
else
- btrfs_queue_worker(&fs_info->endio_workers,
- &end_io_wq->work);
+ btrfs_queue_work(fs_info->endio_workers,
+ &end_io_wq->work);
}
}
@@ -749,7 +757,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
{
unsigned long limit = min_t(unsigned long,
- info->workers.max_workers,
+ info->thread_pool_size,
info->fs_devices->open_devices);
return 256 * limit;
}
@@ -822,11 +830,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
async->submit_bio_start = submit_bio_start;
async->submit_bio_done = submit_bio_done;
- async->work.func = run_one_async_start;
- async->work.ordered_func = run_one_async_done;
- async->work.ordered_free = run_one_async_free;
+ btrfs_init_work(&async->work, run_one_async_start,
+ run_one_async_done, run_one_async_free);
- async->work.flags = 0;
async->bio_flags = bio_flags;
async->bio_offset = bio_offset;
@@ -835,9 +841,9 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
atomic_inc(&fs_info->nr_async_submits);
if (rw & REQ_SYNC)
- btrfs_set_work_high_prio(&async->work);
+ btrfs_set_work_high_priority(&async->work);
- btrfs_queue_worker(&fs_info->workers, &async->work);
+ btrfs_queue_work(fs_info->workers, &async->work);
while (atomic_read(&fs_info->async_submit_draining) &&
atomic_read(&fs_info->nr_async_submits)) {
@@ -850,20 +856,17 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
static int btree_csum_one_bio(struct bio *bio)
{
- struct bio_vec *bvec = bio->bi_io_vec;
- int bio_index = 0;
+ struct bio_vec *bvec;
struct btrfs_root *root;
- int ret = 0;
+ int i, ret = 0;
- WARN_ON(bio->bi_vcnt <= 0);
- while (bio_index < bio->bi_vcnt) {
+ bio_for_each_segment_all(bvec, bio, i) {
root = BTRFS_I(bvec->bv_page->mapping->host)->root;
ret = csum_dirty_buffer(root, bvec->bv_page);
if (ret)
break;
- bio_index++;
- bvec++;
}
+
return ret;
}
@@ -975,11 +978,9 @@ static int btree_migratepage(struct address_space *mapping,
static int btree_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- struct extent_io_tree *tree;
struct btrfs_fs_info *fs_info;
int ret;
- tree = &BTRFS_I(mapping->host)->io_tree;
if (wbc->sync_mode == WB_SYNC_NONE) {
if (wbc->for_kupdate)
@@ -1018,8 +1019,9 @@ static void btree_invalidatepage(struct page *page, unsigned int offset,
extent_invalidatepage(tree, page, offset);
btree_releasepage(page, GFP_NOFS);
if (PagePrivate(page)) {
- printk(KERN_WARNING "btrfs warning page private not zero "
- "on page %llu\n", (unsigned long long)page_offset(page));
+ btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
+ "page private not zero on page %llu",
+ (unsigned long long)page_offset(page));
ClearPagePrivate(page);
set_page_private(page, 0);
page_cache_release(page);
@@ -1103,22 +1105,18 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize)
{
- struct inode *btree_inode = root->fs_info->btree_inode;
- struct extent_buffer *eb;
- eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
- bytenr, blocksize);
- return eb;
+ return find_extent_buffer(root->fs_info, bytenr);
}
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr, u32 blocksize)
{
- struct inode *btree_inode = root->fs_info->btree_inode;
- struct extent_buffer *eb;
-
- eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
- bytenr, blocksize);
- return eb;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return alloc_test_extent_buffer(root->fs_info, bytenr,
+ blocksize);
+#endif
+ return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
}
@@ -1173,6 +1171,32 @@ void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
}
+static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
+{
+ struct btrfs_subvolume_writers *writers;
+ int ret;
+
+ writers = kmalloc(sizeof(*writers), GFP_NOFS);
+ if (!writers)
+ return ERR_PTR(-ENOMEM);
+
+ ret = percpu_counter_init(&writers->counter, 0);
+ if (ret < 0) {
+ kfree(writers);
+ return ERR_PTR(ret);
+ }
+
+ init_waitqueue_head(&writers->wait);
+ return writers;
+}
+
+static void
+btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
+{
+ percpu_counter_destroy(&writers->counter);
+ kfree(writers);
+}
+
static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
u32 stripesize, struct btrfs_root *root,
struct btrfs_fs_info *fs_info,
@@ -1184,10 +1208,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->nodesize = nodesize;
root->leafsize = leafsize;
root->stripesize = stripesize;
- root->ref_cows = 0;
- root->track_dirty = 0;
- root->in_radix = 0;
- root->orphan_item_inserted = 0;
+ root->state = 0;
root->orphan_cleanup_state = 0;
root->objectid = objectid;
@@ -1218,27 +1239,36 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
spin_lock_init(&root->log_extents_lock[1]);
mutex_init(&root->objectid_mutex);
mutex_init(&root->log_mutex);
+ mutex_init(&root->ordered_extent_mutex);
+ mutex_init(&root->delalloc_mutex);
init_waitqueue_head(&root->log_writer_wait);
init_waitqueue_head(&root->log_commit_wait[0]);
init_waitqueue_head(&root->log_commit_wait[1]);
+ INIT_LIST_HEAD(&root->log_ctxs[0]);
+ INIT_LIST_HEAD(&root->log_ctxs[1]);
atomic_set(&root->log_commit[0], 0);
atomic_set(&root->log_commit[1], 0);
atomic_set(&root->log_writers, 0);
atomic_set(&root->log_batch, 0);
atomic_set(&root->orphan_inodes, 0);
atomic_set(&root->refs, 1);
+ atomic_set(&root->will_be_snapshoted, 0);
root->log_transid = 0;
+ root->log_transid_committed = -1;
root->last_log_commit = 0;
- extent_io_tree_init(&root->dirty_log_pages,
- fs_info->btree_inode->i_mapping);
+ if (fs_info)
+ extent_io_tree_init(&root->dirty_log_pages,
+ fs_info->btree_inode->i_mapping);
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
memset(&root->root_kobj, 0, sizeof(root->root_kobj));
- root->defrag_trans_start = fs_info->generation;
+ if (fs_info)
+ root->defrag_trans_start = fs_info->generation;
+ else
+ root->defrag_trans_start = 0;
init_completion(&root->kobj_unregister);
- root->defrag_running = 0;
root->root_key.objectid = objectid;
root->anon_dev = 0;
@@ -1253,6 +1283,23 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
return root;
}
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+/* Should only be used by the testing infrastructure */
+struct btrfs_root *btrfs_alloc_dummy_root(void)
+{
+ struct btrfs_root *root;
+
+ root = btrfs_alloc_root(NULL);
+ if (!root)
+ return ERR_PTR(-ENOMEM);
+ __setup_root(4096, 4096, 4096, 4096, root, NULL, 1);
+ set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
+ root->alloc_bytenr = 0;
+
+ return root;
+}
+#endif
+
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
u64 objectid)
@@ -1262,7 +1309,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root;
struct btrfs_key key;
int ret = 0;
- u64 bytenr;
uuid_le uuid;
root = btrfs_alloc_root(fs_info);
@@ -1284,7 +1330,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
goto fail;
}
- bytenr = leaf->start;
memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
btrfs_set_header_bytenr(leaf, leaf->start);
btrfs_set_header_generation(leaf, trans->transid);
@@ -1292,7 +1337,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
btrfs_set_header_owner(leaf, objectid);
root->node = leaf;
- write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(leaf),
+ write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(),
BTRFS_FSID_SIZE);
write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
btrfs_header_chunk_tree_uuid(leaf),
@@ -1300,8 +1345,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
root->commit_root = btrfs_root_node(root);
- root->track_dirty = 1;
-
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
root->root_item.flags = 0;
root->root_item.byte_limit = 0;
@@ -1330,6 +1374,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
fail:
if (leaf) {
btrfs_tree_unlock(leaf);
+ free_extent_buffer(root->commit_root);
free_extent_buffer(leaf);
}
kfree(root);
@@ -1355,13 +1400,15 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
root->root_key.type = BTRFS_ROOT_ITEM_KEY;
root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
+
/*
+ * DON'T set REF_COWS for log trees
+ *
* log trees do not get reference counted because they go away
* before a real commit is actually done. They do store pointers
* to file data extents, and those reference counts still get
* updated (along with back refs to the log tree).
*/
- root->ref_cows = 0;
leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
BTRFS_TREE_LOG_OBJECTID, NULL,
@@ -1379,7 +1426,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
root->node = leaf;
write_extent_buffer(root->node, root->fs_info->fsid,
- btrfs_header_fsid(root->node), BTRFS_FSID_SIZE);
+ btrfs_header_fsid(), BTRFS_FSID_SIZE);
btrfs_mark_buffer_dirty(root->node);
btrfs_tree_unlock(root->node);
return root;
@@ -1423,6 +1470,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(root->log_root);
root->log_root = log_root;
root->log_transid = 0;
+ root->log_transid_committed = -1;
root->last_log_commit = 0;
return 0;
}
@@ -1494,7 +1542,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
return root;
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
- root->ref_cows = 1;
+ set_bit(BTRFS_ROOT_REF_COWS, &root->state);
btrfs_check_and_init_root_item(&root->root_item);
}
@@ -1504,6 +1552,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
int btrfs_init_fs_root(struct btrfs_root *root)
{
int ret;
+ struct btrfs_subvolume_writers *writers;
root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
@@ -1513,15 +1562,24 @@ int btrfs_init_fs_root(struct btrfs_root *root)
goto fail;
}
+ writers = btrfs_alloc_subvolume_writers();
+ if (IS_ERR(writers)) {
+ ret = PTR_ERR(writers);
+ goto fail;
+ }
+ root->subv_writers = writers;
+
btrfs_init_free_ino_ctl(root);
- mutex_init(&root->fs_commit_mutex);
spin_lock_init(&root->cache_lock);
init_waitqueue_head(&root->cache_wait);
ret = get_anon_bdev(&root->anon_dev);
if (ret)
- goto fail;
+ goto free_writers;
return 0;
+
+free_writers:
+ btrfs_free_subvolume_writers(root->subv_writers);
fail:
kfree(root->free_ino_ctl);
kfree(root->free_ino_pinned);
@@ -1554,15 +1612,16 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
(unsigned long)root->root_key.objectid,
root);
if (ret == 0)
- root->in_radix = 1;
+ set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
spin_unlock(&fs_info->fs_roots_radix_lock);
radix_tree_preload_end();
return ret;
}
-struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
- struct btrfs_key *location)
+struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *location,
+ bool check_ref)
{
struct btrfs_root *root;
int ret;
@@ -1586,7 +1645,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
again:
root = btrfs_lookup_fs_root(fs_info, location->objectid);
if (root) {
- if (btrfs_root_refs(&root->root_item) == 0)
+ if (check_ref && btrfs_root_refs(&root->root_item) == 0)
return ERR_PTR(-ENOENT);
return root;
}
@@ -1595,7 +1654,7 @@ again:
if (IS_ERR(root))
return root;
- if (btrfs_root_refs(&root->root_item) == 0) {
+ if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
ret = -ENOENT;
goto fail;
}
@@ -1604,11 +1663,12 @@ again:
if (ret)
goto fail;
- ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
+ ret = btrfs_find_item(fs_info->tree_root, NULL, BTRFS_ORPHAN_OBJECTID,
+ location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL);
if (ret < 0)
goto fail;
if (ret == 0)
- root->orphan_item_inserted = 1;
+ set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
ret = btrfs_insert_fs_root(fs_info, root);
if (ret) {
@@ -1672,18 +1732,16 @@ static void end_workqueue_fn(struct btrfs_work *work)
{
struct bio *bio;
struct end_io_wq *end_io_wq;
- struct btrfs_fs_info *fs_info;
int error;
end_io_wq = container_of(work, struct end_io_wq, work);
bio = end_io_wq->bio;
- fs_info = end_io_wq->info;
error = end_io_wq->error;
bio->bi_private = end_io_wq->private;
bio->bi_end_io = end_io_wq->end_io;
kfree(end_io_wq);
- bio_endio(bio, error);
+ bio_endio_nodec(bio, error);
}
static int cleaner_kthread(void *arg)
@@ -1779,6 +1837,9 @@ sleep:
wake_up_process(root->fs_info->cleaner_kthread);
mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+ if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
+ &root->fs_info->fs_state)))
+ btrfs_cleanup_transaction(root);
if (!try_to_freeze()) {
set_current_state(TASK_INTERRUPTIBLE);
if (!kthread_should_stop() &&
@@ -1993,72 +2054,50 @@ static noinline int next_root_backup(struct btrfs_fs_info *info,
/* helper to cleanup workers */
static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
{
- btrfs_stop_workers(&fs_info->generic_worker);
- btrfs_stop_workers(&fs_info->fixup_workers);
- btrfs_stop_workers(&fs_info->delalloc_workers);
- btrfs_stop_workers(&fs_info->workers);
- btrfs_stop_workers(&fs_info->endio_workers);
- btrfs_stop_workers(&fs_info->endio_meta_workers);
- btrfs_stop_workers(&fs_info->endio_raid56_workers);
- btrfs_stop_workers(&fs_info->rmw_workers);
- btrfs_stop_workers(&fs_info->endio_meta_write_workers);
- btrfs_stop_workers(&fs_info->endio_write_workers);
- btrfs_stop_workers(&fs_info->endio_freespace_worker);
- btrfs_stop_workers(&fs_info->submit_workers);
- btrfs_stop_workers(&fs_info->delayed_workers);
- btrfs_stop_workers(&fs_info->caching_workers);
- btrfs_stop_workers(&fs_info->readahead_workers);
- btrfs_stop_workers(&fs_info->flush_workers);
- btrfs_stop_workers(&fs_info->qgroup_rescan_workers);
+ btrfs_destroy_workqueue(fs_info->fixup_workers);
+ btrfs_destroy_workqueue(fs_info->delalloc_workers);
+ btrfs_destroy_workqueue(fs_info->workers);
+ btrfs_destroy_workqueue(fs_info->endio_workers);
+ btrfs_destroy_workqueue(fs_info->endio_meta_workers);
+ btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+ btrfs_destroy_workqueue(fs_info->rmw_workers);
+ btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
+ btrfs_destroy_workqueue(fs_info->endio_write_workers);
+ btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
+ btrfs_destroy_workqueue(fs_info->submit_workers);
+ btrfs_destroy_workqueue(fs_info->delayed_workers);
+ btrfs_destroy_workqueue(fs_info->caching_workers);
+ btrfs_destroy_workqueue(fs_info->readahead_workers);
+ btrfs_destroy_workqueue(fs_info->flush_workers);
+ btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
+ btrfs_destroy_workqueue(fs_info->extent_workers);
+}
+
+static void free_root_extent_buffers(struct btrfs_root *root)
+{
+ if (root) {
+ free_extent_buffer(root->node);
+ free_extent_buffer(root->commit_root);
+ root->node = NULL;
+ root->commit_root = NULL;
+ }
}
/* helper to cleanup tree roots */
static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
{
- free_extent_buffer(info->tree_root->node);
- free_extent_buffer(info->tree_root->commit_root);
- info->tree_root->node = NULL;
- info->tree_root->commit_root = NULL;
-
- if (info->dev_root) {
- free_extent_buffer(info->dev_root->node);
- free_extent_buffer(info->dev_root->commit_root);
- info->dev_root->node = NULL;
- info->dev_root->commit_root = NULL;
- }
- if (info->extent_root) {
- free_extent_buffer(info->extent_root->node);
- free_extent_buffer(info->extent_root->commit_root);
- info->extent_root->node = NULL;
- info->extent_root->commit_root = NULL;
- }
- if (info->csum_root) {
- free_extent_buffer(info->csum_root->node);
- free_extent_buffer(info->csum_root->commit_root);
- info->csum_root->node = NULL;
- info->csum_root->commit_root = NULL;
- }
- if (info->quota_root) {
- free_extent_buffer(info->quota_root->node);
- free_extent_buffer(info->quota_root->commit_root);
- info->quota_root->node = NULL;
- info->quota_root->commit_root = NULL;
- }
- if (info->uuid_root) {
- free_extent_buffer(info->uuid_root->node);
- free_extent_buffer(info->uuid_root->commit_root);
- info->uuid_root->node = NULL;
- info->uuid_root->commit_root = NULL;
- }
- if (chunk_root) {
- free_extent_buffer(info->chunk_root->node);
- free_extent_buffer(info->chunk_root->commit_root);
- info->chunk_root->node = NULL;
- info->chunk_root->commit_root = NULL;
- }
+ free_root_extent_buffers(info->tree_root);
+
+ free_root_extent_buffers(info->dev_root);
+ free_root_extent_buffers(info->extent_root);
+ free_root_extent_buffers(info->csum_root);
+ free_root_extent_buffers(info->quota_root);
+ free_root_extent_buffers(info->uuid_root);
+ if (chunk_root)
+ free_root_extent_buffers(info->chunk_root);
}
-static void del_fs_roots(struct btrfs_fs_info *fs_info)
+void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
{
int ret;
struct btrfs_root *gang[8];
@@ -2069,7 +2108,7 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
struct btrfs_root, root_list);
list_del(&gang[0]->root_list);
- if (gang[0]->in_radix) {
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) {
btrfs_drop_and_free_fs_root(fs_info, gang[0]);
} else {
free_extent_buffer(gang[0]->node);
@@ -2087,6 +2126,12 @@ static void del_fs_roots(struct btrfs_fs_info *fs_info)
for (i = 0; i < ret; i++)
btrfs_drop_and_free_fs_root(fs_info, gang[i]);
}
+
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ btrfs_free_log_root_tree(NULL, fs_info);
+ btrfs_destroy_pinned_extent(fs_info->tree_root,
+ fs_info->pinned_extents);
+ }
}
int open_ctree(struct super_block *sb,
@@ -2116,6 +2161,8 @@ int open_ctree(struct super_block *sb,
int err = -EINVAL;
int num_backups_tried = 0;
int backup_index = 0;
+ int max_active;
+ int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
bool create_uuid_tree;
bool check_uuid_tree;
@@ -2152,15 +2199,22 @@ int open_ctree(struct super_block *sb,
goto fail_dirty_metadata_bytes;
}
+ ret = percpu_counter_init(&fs_info->bio_counter, 0);
+ if (ret) {
+ err = ret;
+ goto fail_delalloc_bytes;
+ }
+
fs_info->btree_inode = new_inode(sb);
if (!fs_info->btree_inode) {
err = -ENOMEM;
- goto fail_delalloc_bytes;
+ goto fail_bio_counter;
}
mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+ INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
@@ -2174,8 +2228,11 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->free_chunk_lock);
spin_lock_init(&fs_info->tree_mod_seq_lock);
spin_lock_init(&fs_info->super_lock);
+ spin_lock_init(&fs_info->qgroup_op_lock);
+ spin_lock_init(&fs_info->buffer_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->reloc_mutex);
+ mutex_init(&fs_info->delalloc_root_mutex);
seqlock_init(&fs_info->profiles_lock);
init_completion(&fs_info->kobj_unregister);
@@ -2197,6 +2254,7 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->async_submit_draining, 0);
atomic_set(&fs_info->nr_async_bios, 0);
atomic_set(&fs_info->defrag_running, 0);
+ atomic_set(&fs_info->qgroup_op_seq, 0);
atomic64_set(&fs_info->tree_mod_seq, 0);
fs_info->sb = sb;
fs_info->max_inline = 8192 * 1024;
@@ -2205,7 +2263,7 @@ int open_ctree(struct super_block *sb,
fs_info->free_chunk_space = 0;
fs_info->tree_mod_log = RB_ROOT;
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
-
+ fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64);
/* readahead state */
INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
spin_lock_init(&fs_info->reada_lock);
@@ -2228,8 +2286,8 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->scrub_pause_req, 0);
atomic_set(&fs_info->scrubs_paused, 0);
atomic_set(&fs_info->scrub_cancel_req, 0);
+ init_waitqueue_head(&fs_info->replace_wait);
init_waitqueue_head(&fs_info->scrub_pause_wait);
- init_rwsem(&fs_info->scrub_super_lock);
fs_info->scrub_workers_refcnt = 0;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
fs_info->check_integrity_print_mask = 0;
@@ -2242,6 +2300,7 @@ int open_ctree(struct super_block *sb,
atomic_set(&fs_info->balance_cancel_req, 0);
fs_info->balance_ctl = NULL;
init_waitqueue_head(&fs_info->balance_wait_q);
+ btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
sb->s_blocksize = 4096;
sb->s_blocksize_bits = blksize_bits(4096);
@@ -2271,7 +2330,7 @@ int open_ctree(struct super_block *sb,
sizeof(struct btrfs_key));
set_bit(BTRFS_INODE_DUMMY,
&BTRFS_I(fs_info->btree_inode)->runtime_flags);
- insert_inode_hash(fs_info->btree_inode);
+ btrfs_insert_inode_hash(fs_info->btree_inode);
spin_lock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT;
@@ -2292,7 +2351,7 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
- init_rwsem(&fs_info->extent_commit_sem);
+ init_rwsem(&fs_info->commit_root_sem);
init_rwsem(&fs_info->cleanup_work_sem);
init_rwsem(&fs_info->subvol_sem);
sema_init(&fs_info->uuid_tree_rescan_sem, 1);
@@ -2305,6 +2364,7 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->qgroup_lock);
mutex_init(&fs_info->qgroup_ioctl_lock);
fs_info->qgroup_tree = RB_ROOT;
+ fs_info->qgroup_op_tree = RB_ROOT;
INIT_LIST_HEAD(&fs_info->dirty_qgroups);
fs_info->qgroup_seq = 1;
fs_info->quota_enabled = 0;
@@ -2345,7 +2405,7 @@ int open_ctree(struct super_block *sb,
* Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
*/
if (btrfs_check_super_csum(bh->b_data)) {
- printk(KERN_ERR "btrfs: superblock checksum mismatch\n");
+ printk(KERN_ERR "BTRFS: superblock checksum mismatch\n");
err = -EINVAL;
goto fail_alloc;
}
@@ -2364,7 +2424,7 @@ int open_ctree(struct super_block *sb,
ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
if (ret) {
- printk(KERN_ERR "btrfs: superblock contains fatal errors\n");
+ printk(KERN_ERR "BTRFS: superblock contains fatal errors\n");
err = -EINVAL;
goto fail_alloc;
}
@@ -2429,7 +2489,7 @@ int open_ctree(struct super_block *sb,
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
- printk(KERN_ERR "btrfs: has skinny extents\n");
+ printk(KERN_ERR "BTRFS: has skinny extents\n");
/*
* flag our filesystem as having big metadata blocks if
@@ -2437,7 +2497,7 @@ int open_ctree(struct super_block *sb,
*/
if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
- printk(KERN_INFO "btrfs flagging fs with big metadata feature\n");
+ printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
}
@@ -2454,7 +2514,7 @@ int open_ctree(struct super_block *sb,
*/
if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
(sectorsize != leafsize)) {
- printk(KERN_WARNING "btrfs: unequal leaf/node/sector sizes "
+ printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes "
"are not allowed for mixed block groups on %s\n",
sb->s_id);
goto fail_alloc;
@@ -2476,104 +2536,73 @@ int open_ctree(struct super_block *sb,
goto fail_alloc;
}
- btrfs_init_workers(&fs_info->generic_worker,
- "genwork", 1, NULL);
-
- btrfs_init_workers(&fs_info->workers, "worker",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
+ max_active = fs_info->thread_pool_size;
- btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
- fs_info->thread_pool_size, NULL);
+ fs_info->workers =
+ btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
+ max_active, 16);
- btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
- fs_info->thread_pool_size, NULL);
+ fs_info->delalloc_workers =
+ btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
- btrfs_init_workers(&fs_info->submit_workers, "submit",
- min_t(u64, fs_devices->num_devices,
- fs_info->thread_pool_size), NULL);
+ fs_info->flush_workers =
+ btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
- btrfs_init_workers(&fs_info->caching_workers, "cache",
- fs_info->thread_pool_size, NULL);
+ fs_info->caching_workers =
+ btrfs_alloc_workqueue("cache", flags, max_active, 0);
- /* a higher idle thresh on the submit workers makes it much more
+ /*
+ * a higher idle thresh on the submit workers makes it much more
* likely that bios will be send down in a sane order to the
* devices
*/
- fs_info->submit_workers.idle_thresh = 64;
-
- fs_info->workers.idle_thresh = 16;
- fs_info->workers.ordered = 1;
-
- fs_info->delalloc_workers.idle_thresh = 2;
- fs_info->delalloc_workers.ordered = 1;
-
- btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_workers, "endio",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_meta_write_workers,
- "endio-meta-write", fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_raid56_workers,
- "endio-raid56", fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->rmw_workers,
- "rmw", fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
- 1, &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->readahead_workers, "readahead",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1,
- &fs_info->generic_worker);
+ fs_info->submit_workers =
+ btrfs_alloc_workqueue("submit", flags,
+ min_t(u64, fs_devices->num_devices,
+ max_active), 64);
+
+ fs_info->fixup_workers =
+ btrfs_alloc_workqueue("fixup", flags, 1, 0);
/*
* endios are largely parallel and should have a very
* low idle thresh
*/
- fs_info->endio_workers.idle_thresh = 4;
- fs_info->endio_meta_workers.idle_thresh = 4;
- fs_info->endio_raid56_workers.idle_thresh = 4;
- fs_info->rmw_workers.idle_thresh = 2;
-
- fs_info->endio_write_workers.idle_thresh = 2;
- fs_info->endio_meta_write_workers.idle_thresh = 2;
- fs_info->readahead_workers.idle_thresh = 2;
-
- /*
- * btrfs_start_workers can really only fail because of ENOMEM so just
- * return -ENOMEM if any of these fail.
- */
- ret = btrfs_start_workers(&fs_info->workers);
- ret |= btrfs_start_workers(&fs_info->generic_worker);
- ret |= btrfs_start_workers(&fs_info->submit_workers);
- ret |= btrfs_start_workers(&fs_info->delalloc_workers);
- ret |= btrfs_start_workers(&fs_info->fixup_workers);
- ret |= btrfs_start_workers(&fs_info->endio_workers);
- ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
- ret |= btrfs_start_workers(&fs_info->rmw_workers);
- ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
- ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
- ret |= btrfs_start_workers(&fs_info->endio_write_workers);
- ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
- ret |= btrfs_start_workers(&fs_info->delayed_workers);
- ret |= btrfs_start_workers(&fs_info->caching_workers);
- ret |= btrfs_start_workers(&fs_info->readahead_workers);
- ret |= btrfs_start_workers(&fs_info->flush_workers);
- ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers);
- if (ret) {
+ fs_info->endio_workers =
+ btrfs_alloc_workqueue("endio", flags, max_active, 4);
+ fs_info->endio_meta_workers =
+ btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
+ fs_info->endio_meta_write_workers =
+ btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
+ fs_info->endio_raid56_workers =
+ btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+ fs_info->rmw_workers =
+ btrfs_alloc_workqueue("rmw", flags, max_active, 2);
+ fs_info->endio_write_workers =
+ btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
+ fs_info->endio_freespace_worker =
+ btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
+ fs_info->delayed_workers =
+ btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
+ fs_info->readahead_workers =
+ btrfs_alloc_workqueue("readahead", flags, max_active, 2);
+ fs_info->qgroup_rescan_workers =
+ btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
+ fs_info->extent_workers =
+ btrfs_alloc_workqueue("extent-refs", flags,
+ min_t(u64, fs_devices->num_devices,
+ max_active), 8);
+
+ if (!(fs_info->workers && fs_info->delalloc_workers &&
+ fs_info->submit_workers && fs_info->flush_workers &&
+ fs_info->endio_workers && fs_info->endio_meta_workers &&
+ fs_info->endio_meta_write_workers &&
+ fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
+ fs_info->endio_freespace_worker && fs_info->rmw_workers &&
+ fs_info->caching_workers && fs_info->readahead_workers &&
+ fs_info->fixup_workers && fs_info->delayed_workers &&
+ fs_info->fixup_workers && fs_info->extent_workers &&
+ fs_info->qgroup_rescan_workers)) {
err = -ENOMEM;
goto fail_sb_buffer;
}
@@ -2591,12 +2620,12 @@ int open_ctree(struct super_block *sb,
sb->s_blocksize_bits = blksize_bits(sectorsize);
if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
- printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
+ printk(KERN_INFO "BTRFS: valid FS not found on %s\n", sb->s_id);
goto fail_sb_buffer;
}
if (sectorsize != PAGE_SIZE) {
- printk(KERN_WARNING "btrfs: Incompatible sector size(%lu) "
+ printk(KERN_WARNING "BTRFS: Incompatible sector size(%lu) "
"found on %s\n", (unsigned long)sectorsize, sb->s_id);
goto fail_sb_buffer;
}
@@ -2605,7 +2634,7 @@ int open_ctree(struct super_block *sb,
ret = btrfs_read_sys_array(tree_root);
mutex_unlock(&fs_info->chunk_mutex);
if (ret) {
- printk(KERN_WARNING "btrfs: failed to read the system "
+ printk(KERN_WARNING "BTRFS: failed to read the system "
"array on %s\n", sb->s_id);
goto fail_sb_buffer;
}
@@ -2622,7 +2651,7 @@ int open_ctree(struct super_block *sb,
blocksize, generation);
if (!chunk_root->node ||
!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
- printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
+ printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n",
sb->s_id);
goto fail_tree_roots;
}
@@ -2634,7 +2663,7 @@ int open_ctree(struct super_block *sb,
ret = btrfs_read_chunk_tree(chunk_root);
if (ret) {
- printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
+ printk(KERN_WARNING "BTRFS: failed to read chunk tree on %s\n",
sb->s_id);
goto fail_tree_roots;
}
@@ -2646,7 +2675,7 @@ int open_ctree(struct super_block *sb,
btrfs_close_extra_devices(fs_info, fs_devices, 0);
if (!fs_devices->latest_bdev) {
- printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
+ printk(KERN_CRIT "BTRFS: failed to read devices on %s\n",
sb->s_id);
goto fail_tree_roots;
}
@@ -2661,7 +2690,7 @@ retry_root_backup:
blocksize, generation);
if (!tree_root->node ||
!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
- printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
+ printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
sb->s_id);
goto recovery_tree_root;
@@ -2669,6 +2698,7 @@ retry_root_backup:
btrfs_set_root_node(&tree_root->root_item, tree_root->node);
tree_root->commit_root = btrfs_root_node(tree_root);
+ btrfs_set_root_refs(&tree_root->root_item, 1);
location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
location.type = BTRFS_ROOT_ITEM_KEY;
@@ -2679,7 +2709,7 @@ retry_root_backup:
ret = PTR_ERR(extent_root);
goto recovery_tree_root;
}
- extent_root->track_dirty = 1;
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state);
fs_info->extent_root = extent_root;
location.objectid = BTRFS_DEV_TREE_OBJECTID;
@@ -2688,7 +2718,7 @@ retry_root_backup:
ret = PTR_ERR(dev_root);
goto recovery_tree_root;
}
- dev_root->track_dirty = 1;
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state);
fs_info->dev_root = dev_root;
btrfs_init_devices_late(fs_info);
@@ -2698,13 +2728,13 @@ retry_root_backup:
ret = PTR_ERR(csum_root);
goto recovery_tree_root;
}
- csum_root->track_dirty = 1;
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state);
fs_info->csum_root = csum_root;
location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
quota_root = btrfs_read_tree_root(tree_root, &location);
if (!IS_ERR(quota_root)) {
- quota_root->track_dirty = 1;
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &quota_root->state);
fs_info->quota_enabled = 1;
fs_info->pending_quota_state = 1;
fs_info->quota_root = quota_root;
@@ -2719,7 +2749,7 @@ retry_root_backup:
create_uuid_tree = true;
check_uuid_tree = false;
} else {
- uuid_root->track_dirty = 1;
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state);
fs_info->uuid_root = uuid_root;
create_uuid_tree = false;
check_uuid_tree =
@@ -2731,50 +2761,56 @@ retry_root_backup:
ret = btrfs_recover_balance(fs_info);
if (ret) {
- printk(KERN_WARNING "btrfs: failed to recover balance\n");
+ printk(KERN_WARNING "BTRFS: failed to recover balance\n");
goto fail_block_groups;
}
ret = btrfs_init_dev_stats(fs_info);
if (ret) {
- printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n",
+ printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n",
ret);
goto fail_block_groups;
}
ret = btrfs_init_dev_replace(fs_info);
if (ret) {
- pr_err("btrfs: failed to init dev_replace: %d\n", ret);
+ pr_err("BTRFS: failed to init dev_replace: %d\n", ret);
goto fail_block_groups;
}
btrfs_close_extra_devices(fs_info, fs_devices, 1);
- ret = btrfs_init_space_info(fs_info);
+ ret = btrfs_sysfs_add_one(fs_info);
if (ret) {
- printk(KERN_ERR "Failed to initial space info: %d\n", ret);
+ pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
goto fail_block_groups;
}
+ ret = btrfs_init_space_info(fs_info);
+ if (ret) {
+ printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret);
+ goto fail_sysfs;
+ }
+
ret = btrfs_read_block_groups(extent_root);
if (ret) {
- printk(KERN_ERR "Failed to read block groups: %d\n", ret);
- goto fail_block_groups;
+ printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret);
+ goto fail_sysfs;
}
fs_info->num_tolerated_disk_barrier_failures =
btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
if (fs_info->fs_devices->missing_devices >
fs_info->num_tolerated_disk_barrier_failures &&
!(sb->s_flags & MS_RDONLY)) {
- printk(KERN_WARNING
- "Btrfs: too many missing devices, writeable mount is not allowed\n");
- goto fail_block_groups;
+ printk(KERN_WARNING "BTRFS: "
+ "too many missing devices, writeable mount is not allowed\n");
+ goto fail_sysfs;
}
fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
"btrfs-cleaner");
if (IS_ERR(fs_info->cleaner_kthread))
- goto fail_block_groups;
+ goto fail_sysfs;
fs_info->transaction_kthread = kthread_run(transaction_kthread,
tree_root,
@@ -2785,11 +2821,15 @@ retry_root_backup:
if (!btrfs_test_opt(tree_root, SSD) &&
!btrfs_test_opt(tree_root, NOSSD) &&
!fs_info->fs_devices->rotating) {
- printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD "
+ printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD "
"mode\n");
btrfs_set_opt(fs_info->mount_opt, SSD);
}
+ /* Set the real inode map cache flag */
+ if (btrfs_test_opt(tree_root, CHANGE_INODE_CACHE))
+ btrfs_set_opt(tree_root->fs_info->mount_opt, INODE_MAP_CACHE);
+
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
ret = btrfsic_mount(tree_root, fs_devices,
@@ -2798,7 +2838,7 @@ retry_root_backup:
1 : 0,
fs_info->check_integrity_print_mask);
if (ret)
- printk(KERN_WARNING "btrfs: failed to initialize"
+ printk(KERN_WARNING "BTRFS: failed to initialize"
" integrity check module %s\n", sb->s_id);
}
#endif
@@ -2811,7 +2851,7 @@ retry_root_backup:
u64 bytenr = btrfs_super_log_root(disk_super);
if (fs_devices->rw_devices == 0) {
- printk(KERN_WARNING "Btrfs log replay required "
+ printk(KERN_WARNING "BTRFS: log replay required "
"on RO media\n");
err = -EIO;
goto fail_qgroup;
@@ -2834,10 +2874,10 @@ retry_root_backup:
generation + 1);
if (!log_tree_root->node ||
!extent_buffer_uptodate(log_tree_root->node)) {
- printk(KERN_ERR "btrfs: failed to read log tree\n");
+ printk(KERN_ERR "BTRFS: failed to read log tree\n");
free_extent_buffer(log_tree_root->node);
kfree(log_tree_root);
- goto fail_trans_kthread;
+ goto fail_qgroup;
}
/* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
@@ -2846,29 +2886,31 @@ retry_root_backup:
"Failed to recover log tree");
free_extent_buffer(log_tree_root->node);
kfree(log_tree_root);
- goto fail_trans_kthread;
+ goto fail_qgroup;
}
if (sb->s_flags & MS_RDONLY) {
ret = btrfs_commit_super(tree_root);
if (ret)
- goto fail_trans_kthread;
+ goto fail_qgroup;
}
}
ret = btrfs_find_orphan_roots(tree_root);
if (ret)
- goto fail_trans_kthread;
+ goto fail_qgroup;
if (!(sb->s_flags & MS_RDONLY)) {
ret = btrfs_cleanup_fs_roots(fs_info);
if (ret)
- goto fail_trans_kthread;
+ goto fail_qgroup;
+ mutex_lock(&fs_info->cleaner_mutex);
ret = btrfs_recover_relocation(tree_root);
+ mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
printk(KERN_WARNING
- "btrfs: failed to recover relocation\n");
+ "BTRFS: failed to recover relocation\n");
err = -EINVAL;
goto fail_qgroup;
}
@@ -2898,14 +2940,14 @@ retry_root_backup:
ret = btrfs_resume_balance_async(fs_info);
if (ret) {
- printk(KERN_WARNING "btrfs: failed to resume balance\n");
+ printk(KERN_WARNING "BTRFS: failed to resume balance\n");
close_ctree(tree_root);
return ret;
}
ret = btrfs_resume_dev_replace_async(fs_info);
if (ret) {
- pr_warn("btrfs: failed to resume dev_replace\n");
+ pr_warn("BTRFS: failed to resume dev_replace\n");
close_ctree(tree_root);
return ret;
}
@@ -2913,20 +2955,20 @@ retry_root_backup:
btrfs_qgroup_rescan_resume(fs_info);
if (create_uuid_tree) {
- pr_info("btrfs: creating UUID tree\n");
+ pr_info("BTRFS: creating UUID tree\n");
ret = btrfs_create_uuid_tree(fs_info);
if (ret) {
- pr_warn("btrfs: failed to create the UUID tree %d\n",
+ pr_warn("BTRFS: failed to create the UUID tree %d\n",
ret);
close_ctree(tree_root);
return ret;
}
} else if (check_uuid_tree ||
btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) {
- pr_info("btrfs: checking UUID tree\n");
+ pr_info("BTRFS: checking UUID tree\n");
ret = btrfs_check_uuid_tree(fs_info);
if (ret) {
- pr_warn("btrfs: failed to check the UUID tree %d\n",
+ pr_warn("BTRFS: failed to check the UUID tree %d\n",
ret);
close_ctree(tree_root);
return ret;
@@ -2942,7 +2984,7 @@ fail_qgroup:
fail_trans_kthread:
kthread_stop(fs_info->transaction_kthread);
btrfs_cleanup_transaction(fs_info->tree_root);
- del_fs_roots(fs_info);
+ btrfs_free_fs_roots(fs_info);
fail_cleaner:
kthread_stop(fs_info->cleaner_kthread);
@@ -2952,6 +2994,9 @@ fail_cleaner:
*/
filemap_write_and_wait(fs_info->btree_inode->i_mapping);
+fail_sysfs:
+ btrfs_sysfs_remove_one(fs_info);
+
fail_block_groups:
btrfs_put_block_group_cache(fs_info);
btrfs_free_block_groups(fs_info);
@@ -2967,6 +3012,8 @@ fail_iput:
btrfs_mapping_tree_free(&fs_info->mapping_tree);
iput(fs_info->btree_inode);
+fail_bio_counter:
+ percpu_counter_destroy(&fs_info->bio_counter);
fail_delalloc_bytes:
percpu_counter_destroy(&fs_info->delalloc_bytes);
fail_dirty_metadata_bytes:
@@ -3007,7 +3054,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
struct btrfs_device *device = (struct btrfs_device *)
bh->b_private;
- printk_ratelimited_in_rcu(KERN_WARNING "lost page write due to "
+ printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to "
"I/O error on %s\n",
rcu_str_deref(device->name));
/* note, we dont' set_buffer_write_io_error because we have
@@ -3126,7 +3173,7 @@ static int write_dev_supers(struct btrfs_device *device,
bh = __getblk(device->bdev, bytenr / 4096,
BTRFS_SUPER_INFO_SIZE);
if (!bh) {
- printk(KERN_ERR "btrfs: couldn't get super "
+ printk(KERN_ERR "BTRFS: couldn't get super "
"buffer head for bytenr %Lu\n", bytenr);
errors++;
continue;
@@ -3147,7 +3194,10 @@ static int write_dev_supers(struct btrfs_device *device,
* we fua the first super. The others we allow
* to go down lazy.
*/
- ret = btrfsic_submit_bh(WRITE_FUA, bh);
+ if (i == 0)
+ ret = btrfsic_submit_bh(WRITE_FUA, bh);
+ else
+ ret = btrfsic_submit_bh(WRITE_SYNC, bh);
if (ret)
errors++;
}
@@ -3193,7 +3243,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
wait_for_completion(&device->flush_wait);
if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
- printk_in_rcu("btrfs: disabling barriers on dev %s\n",
+ printk_in_rcu("BTRFS: disabling barriers on dev %s\n",
rcu_str_deref(device->name));
device->nobarriers = 1;
} else if (!bio_flagged(bio, BIO_UPTODATE)) {
@@ -3245,6 +3295,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
/* send down all the barriers */
head = &info->fs_devices->devices;
list_for_each_entry_rcu(dev, head, dev_list) {
+ if (dev->missing)
+ continue;
if (!dev->bdev) {
errors_send++;
continue;
@@ -3259,6 +3311,8 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
/* wait for all the barriers */
list_for_each_entry_rcu(dev, head, dev_list) {
+ if (dev->missing)
+ continue;
if (!dev->bdev) {
errors_wait++;
continue;
@@ -3414,7 +3468,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
total_errors++;
}
if (total_errors > max_errors) {
- printk(KERN_ERR "btrfs: %d errors while writing supers\n",
+ btrfs_err(root->fs_info, "%d errors while writing supers",
total_errors);
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
@@ -3447,10 +3501,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
int write_ctree_super(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int max_mirrors)
{
- int ret;
-
- ret = write_all_supers(root, max_mirrors);
- return ret;
+ return write_all_supers(root, max_mirrors);
}
/* Drop a fs root from the radix tree and free it. */
@@ -3465,13 +3516,13 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
if (btrfs_root_refs(&root->root_item) == 0)
synchronize_srcu(&fs_info->subvol_srcu);
- if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
btrfs_free_log(NULL, root);
- btrfs_free_log_root_tree(NULL, fs_info);
- }
- __btrfs_remove_free_space_cache(root->free_ino_pinned);
- __btrfs_remove_free_space_cache(root->free_ino_ctl);
+ if (root->free_ino_pinned)
+ __btrfs_remove_free_space_cache(root->free_ino_pinned);
+ if (root->free_ino_ctl)
+ __btrfs_remove_free_space_cache(root->free_ino_ctl);
free_fs_root(root);
}
@@ -3483,6 +3534,8 @@ static void free_fs_root(struct btrfs_root *root)
root->orphan_block_rsv = NULL;
if (root->anon_dev)
free_anon_bdev(root->anon_dev);
+ if (root->subv_writers)
+ btrfs_free_subvolume_writers(root->subv_writers);
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
kfree(root->free_ino_ctl);
@@ -3500,34 +3553,56 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
{
u64 root_objectid = 0;
struct btrfs_root *gang[8];
- int i;
- int ret;
+ int i = 0;
+ int err = 0;
+ unsigned int ret = 0;
+ int index;
while (1) {
+ index = srcu_read_lock(&fs_info->subvol_srcu);
ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)gang, root_objectid,
ARRAY_SIZE(gang));
- if (!ret)
+ if (!ret) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
break;
-
+ }
root_objectid = gang[ret - 1]->root_key.objectid + 1;
+
for (i = 0; i < ret; i++) {
- int err;
+ /* Avoid to grab roots in dead_roots */
+ if (btrfs_root_refs(&gang[i]->root_item) == 0) {
+ gang[i] = NULL;
+ continue;
+ }
+ /* grab all the search result for later use */
+ gang[i] = btrfs_grab_fs_root(gang[i]);
+ }
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ for (i = 0; i < ret; i++) {
+ if (!gang[i])
+ continue;
root_objectid = gang[i]->root_key.objectid;
err = btrfs_orphan_cleanup(gang[i]);
if (err)
- return err;
+ break;
+ btrfs_put_fs_root(gang[i]);
}
root_objectid++;
}
- return 0;
+
+ /* release the uncleaned roots due to error */
+ for (; i < ret; i++) {
+ if (gang[i])
+ btrfs_put_fs_root(gang[i]);
+ }
+ return err;
}
int btrfs_commit_super(struct btrfs_root *root)
{
struct btrfs_trans_handle *trans;
- int ret;
mutex_lock(&root->fs_info->cleaner_mutex);
btrfs_run_delayed_iputs(root);
@@ -3541,25 +3616,7 @@ int btrfs_commit_super(struct btrfs_root *root)
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_commit_transaction(trans, root);
- if (ret)
- return ret;
- /* run commit again to drop the original snapshot */
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
- ret = btrfs_commit_transaction(trans, root);
- if (ret)
- return ret;
- ret = btrfs_write_and_wait_transaction(NULL, root);
- if (ret) {
- btrfs_error(root->fs_info, ret,
- "Failed to sync btree inode to disk.");
- return ret;
- }
-
- ret = write_ctree_super(NULL, root, 0);
- return ret;
+ return btrfs_commit_transaction(trans, root);
}
int close_ctree(struct btrfs_root *root)
@@ -3589,17 +3646,17 @@ int close_ctree(struct btrfs_root *root)
/* clear out the rbtree of defraggable inodes */
btrfs_cleanup_defrag_inodes(fs_info);
+ cancel_work_sync(&fs_info->async_reclaim_work);
+
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
ret = btrfs_commit_super(root);
if (ret)
- printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
+ btrfs_err(root->fs_info, "commit super ret %d", ret);
}
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
btrfs_error_commit_super(root);
- btrfs_put_block_group_cache(fs_info);
-
kthread_stop(fs_info->transaction_kthread);
kthread_stop(fs_info->cleaner_kthread);
@@ -3609,16 +3666,25 @@ int close_ctree(struct btrfs_root *root)
btrfs_free_qgroup_config(root->fs_info);
if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
- printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n",
+ btrfs_info(root->fs_info, "at unmount delalloc count %lld",
percpu_counter_sum(&fs_info->delalloc_bytes));
}
+ btrfs_sysfs_remove_one(fs_info);
+
+ btrfs_free_fs_roots(fs_info);
+
+ btrfs_put_block_group_cache(fs_info);
+
btrfs_free_block_groups(fs_info);
+ /*
+ * we must make sure there is not any read request to
+ * submit after we stopping all workers.
+ */
+ invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
btrfs_stop_all_workers(fs_info);
- del_fs_roots(fs_info);
-
free_root_pointers(fs_info, 1);
iput(fs_info->btree_inode);
@@ -3633,6 +3699,7 @@ int close_ctree(struct btrfs_root *root)
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
+ percpu_counter_destroy(&fs_info->bio_counter);
bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
@@ -3668,10 +3735,20 @@ int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{
- struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+ struct btrfs_root *root;
u64 transid = btrfs_header_generation(buf);
int was_dirty;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ /*
+ * This is a fast path so only do this check if we have sanity tests
+ * enabled. Normal people shouldn't be marking dummy buffers as dirty
+ * outside of the sanity tests.
+ */
+ if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags)))
+ return;
+#endif
+ root = BTRFS_I(buf->pages[0]->mapping->host)->root;
btrfs_assert_tree_locked(buf);
if (transid != root->fs_info->generation)
WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
@@ -3682,6 +3759,12 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
__percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
buf->len,
root->fs_info->dirty_metadata_batch);
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
+ btrfs_print_leaf(root, buf);
+ ASSERT(0);
+ }
+#endif
}
static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
@@ -3801,11 +3884,14 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
while (!list_empty(&splice)) {
root = list_first_entry(&splice, struct btrfs_root,
ordered_root);
- list_del_init(&root->ordered_root);
+ list_move_tail(&root->ordered_root,
+ &fs_info->ordered_roots);
+ spin_unlock(&fs_info->ordered_root_lock);
btrfs_destroy_ordered_extents(root);
- cond_resched_lock(&fs_info->ordered_root_lock);
+ cond_resched();
+ spin_lock(&fs_info->ordered_root_lock);
}
spin_unlock(&fs_info->ordered_root_lock);
}
@@ -3821,55 +3907,54 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
delayed_refs = &trans->delayed_refs;
spin_lock(&delayed_refs->lock);
- if (delayed_refs->num_entries == 0) {
+ if (atomic_read(&delayed_refs->num_entries) == 0) {
spin_unlock(&delayed_refs->lock);
- printk(KERN_INFO "delayed_refs has NO entry\n");
+ btrfs_info(root->fs_info, "delayed_refs has NO entry");
return ret;
}
- while ((node = rb_first(&delayed_refs->root)) != NULL) {
- struct btrfs_delayed_ref_head *head = NULL;
+ while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
+ struct btrfs_delayed_ref_head *head;
bool pin_bytes = false;
- ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- atomic_set(&ref->refs, 1);
- if (btrfs_delayed_ref_is_head(ref)) {
-
- head = btrfs_delayed_node_to_head(ref);
- if (!mutex_trylock(&head->mutex)) {
- atomic_inc(&ref->refs);
- spin_unlock(&delayed_refs->lock);
-
- /* Need to wait for the delayed ref to run */
- mutex_lock(&head->mutex);
- mutex_unlock(&head->mutex);
- btrfs_put_delayed_ref(ref);
+ head = rb_entry(node, struct btrfs_delayed_ref_head,
+ href_node);
+ if (!mutex_trylock(&head->mutex)) {
+ atomic_inc(&head->node.refs);
+ spin_unlock(&delayed_refs->lock);
- spin_lock(&delayed_refs->lock);
- continue;
- }
-
- if (head->must_insert_reserved)
- pin_bytes = true;
- btrfs_free_delayed_extent_op(head->extent_op);
- delayed_refs->num_heads--;
- if (list_empty(&head->cluster))
- delayed_refs->num_heads_ready--;
- list_del_init(&head->cluster);
- }
-
- ref->in_tree = 0;
- rb_erase(&ref->rb_node, &delayed_refs->root);
- delayed_refs->num_entries--;
- spin_unlock(&delayed_refs->lock);
- if (head) {
- if (pin_bytes)
- btrfs_pin_extent(root, ref->bytenr,
- ref->num_bytes, 1);
+ mutex_lock(&head->mutex);
mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(&head->node);
+ spin_lock(&delayed_refs->lock);
+ continue;
}
- btrfs_put_delayed_ref(ref);
+ spin_lock(&head->lock);
+ while ((node = rb_first(&head->ref_root)) != NULL) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node,
+ rb_node);
+ ref->in_tree = 0;
+ rb_erase(&ref->rb_node, &head->ref_root);
+ atomic_dec(&delayed_refs->num_entries);
+ btrfs_put_delayed_ref(ref);
+ }
+ if (head->must_insert_reserved)
+ pin_bytes = true;
+ btrfs_free_delayed_extent_op(head->extent_op);
+ delayed_refs->num_heads--;
+ if (head->processing == 0)
+ delayed_refs->num_heads_ready--;
+ atomic_dec(&delayed_refs->num_entries);
+ head->node.in_tree = 0;
+ rb_erase(&head->href_node, &delayed_refs->href_root);
+ spin_unlock(&head->lock);
+ spin_unlock(&delayed_refs->lock);
+ mutex_unlock(&head->mutex);
+ if (pin_bytes)
+ btrfs_pin_extent(root, head->node.bytenr,
+ head->node.num_bytes, 1);
+ btrfs_put_delayed_ref(&head->node);
cond_resched();
spin_lock(&delayed_refs->lock);
}
@@ -3879,24 +3964,6 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
return ret;
}
-static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t)
-{
- struct btrfs_pending_snapshot *snapshot;
- struct list_head splice;
-
- INIT_LIST_HEAD(&splice);
-
- list_splice_init(&t->pending_snapshots, &splice);
-
- while (!list_empty(&splice)) {
- snapshot = list_entry(splice.next,
- struct btrfs_pending_snapshot,
- list);
- snapshot->error = -ECANCELED;
- list_del_init(&snapshot->list);
- }
-}
-
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
{
struct btrfs_inode *btrfs_inode;
@@ -4026,15 +4093,13 @@ again:
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
struct btrfs_root *root)
{
+ btrfs_destroy_ordered_operations(cur_trans, root);
+
btrfs_destroy_delayed_refs(cur_trans, root);
- btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
- cur_trans->dirty_pages.dirty_bytes);
cur_trans->state = TRANS_STATE_COMMIT_START;
wake_up(&root->fs_info->transaction_blocked_wait);
- btrfs_evict_pending_snapshots(cur_trans);
-
cur_trans->state = TRANS_STATE_UNBLOCKED;
wake_up(&root->fs_info->transaction_wait);
@@ -4058,63 +4123,51 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
static int btrfs_cleanup_transaction(struct btrfs_root *root)
{
struct btrfs_transaction *t;
- LIST_HEAD(list);
mutex_lock(&root->fs_info->transaction_kthread_mutex);
spin_lock(&root->fs_info->trans_lock);
- list_splice_init(&root->fs_info->trans_list, &list);
- root->fs_info->running_transaction = NULL;
- spin_unlock(&root->fs_info->trans_lock);
-
- while (!list_empty(&list)) {
- t = list_entry(list.next, struct btrfs_transaction, list);
-
- btrfs_destroy_ordered_operations(t, root);
-
- btrfs_destroy_all_ordered_extents(root->fs_info);
-
- btrfs_destroy_delayed_refs(t, root);
-
- /*
- * FIXME: cleanup wait for commit
- * We needn't acquire the lock here, because we are during
- * the umount, there is no other task which will change it.
- */
- t->state = TRANS_STATE_COMMIT_START;
- smp_mb();
- if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
- wake_up(&root->fs_info->transaction_blocked_wait);
-
- btrfs_evict_pending_snapshots(t);
-
- t->state = TRANS_STATE_UNBLOCKED;
- smp_mb();
- if (waitqueue_active(&root->fs_info->transaction_wait))
- wake_up(&root->fs_info->transaction_wait);
-
- btrfs_destroy_delayed_inodes(root);
- btrfs_assert_delayed_root_empty(root);
-
- btrfs_destroy_all_delalloc_inodes(root->fs_info);
-
- btrfs_destroy_marked_extents(root, &t->dirty_pages,
- EXTENT_DIRTY);
-
- btrfs_destroy_pinned_extent(root,
- root->fs_info->pinned_extents);
-
- t->state = TRANS_STATE_COMPLETED;
- smp_mb();
- if (waitqueue_active(&t->commit_wait))
- wake_up(&t->commit_wait);
+ while (!list_empty(&root->fs_info->trans_list)) {
+ t = list_first_entry(&root->fs_info->trans_list,
+ struct btrfs_transaction, list);
+ if (t->state >= TRANS_STATE_COMMIT_START) {
+ atomic_inc(&t->use_count);
+ spin_unlock(&root->fs_info->trans_lock);
+ btrfs_wait_for_commit(root, t->transid);
+ btrfs_put_transaction(t);
+ spin_lock(&root->fs_info->trans_lock);
+ continue;
+ }
+ if (t == root->fs_info->running_transaction) {
+ t->state = TRANS_STATE_COMMIT_DOING;
+ spin_unlock(&root->fs_info->trans_lock);
+ /*
+ * We wait for 0 num_writers since we don't hold a trans
+ * handle open currently for this transaction.
+ */
+ wait_event(t->writer_wait,
+ atomic_read(&t->num_writers) == 0);
+ } else {
+ spin_unlock(&root->fs_info->trans_lock);
+ }
+ btrfs_cleanup_one_transaction(t, root);
- atomic_set(&t->use_count, 0);
+ spin_lock(&root->fs_info->trans_lock);
+ if (t == root->fs_info->running_transaction)
+ root->fs_info->running_transaction = NULL;
list_del_init(&t->list);
- memset(t, 0, sizeof(*t));
- kmem_cache_free(btrfs_transaction_cachep, t);
- }
+ spin_unlock(&root->fs_info->trans_lock);
+ btrfs_put_transaction(t);
+ trace_btrfs_transaction_commit(root);
+ spin_lock(&root->fs_info->trans_lock);
+ }
+ spin_unlock(&root->fs_info->trans_lock);
+ btrfs_destroy_all_ordered_extents(root->fs_info);
+ btrfs_destroy_delayed_inodes(root);
+ btrfs_assert_delayed_root_empty(root);
+ btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents);
+ btrfs_destroy_all_delalloc_inodes(root->fs_info);
mutex_unlock(&root->fs_info->transaction_kthread_mutex);
return 0;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index b71acd6e1e5..23ce3ceba0a 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -68,8 +68,18 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
int btrfs_init_fs_root(struct btrfs_root *root);
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
-struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
- struct btrfs_key *location);
+void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
+
+struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *key,
+ bool check_ref);
+static inline struct btrfs_root *
+btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
+ struct btrfs_key *location)
+{
+ return btrfs_get_fs_root(fs_info, location, true);
+}
+
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
void btrfs_btree_balance_dirty(struct btrfs_root *root);
void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
@@ -77,6 +87,10 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
void btrfs_free_fs_root(struct btrfs_root *root);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct btrfs_root *btrfs_alloc_dummy_root(void);
+#endif
+
/*
* This function is used to grab the root, and avoid it is freed when we
* access it. But it doesn't ensure that the tree is not dropped.
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 4b869160737..41422a3de8e 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -5,7 +5,6 @@
#include "btrfs_inode.h"
#include "print-tree.h"
#include "export.h"
-#include "compat.h"
#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
parent_objectid) / 4)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d58bef130a4..813537f362f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -25,17 +25,17 @@
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
-#include "compat.h"
#include "hash.h"
-#include "ctree.h"
+#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
-#include "transaction.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "math.h"
+#include "sysfs.h"
+#include "qgroup.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -81,7 +81,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner_objectid,
u64 owner_offset, int refs_to_drop,
- struct btrfs_delayed_extent_op *extra_op);
+ struct btrfs_delayed_extent_op *extra_op,
+ int no_quota);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
struct extent_buffer *leaf,
struct btrfs_extent_item *ei);
@@ -94,7 +95,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
- int level, struct btrfs_key *ins);
+ int level, struct btrfs_key *ins,
+ int no_quota);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 flags,
int force);
@@ -103,7 +105,8 @@ static int find_next_key(struct btrfs_path *path, int level,
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve);
+ u64 num_bytes, int reserve,
+ int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
int btrfs_pin_extent(struct btrfs_root *root,
@@ -419,7 +422,7 @@ static noinline void caching_thread(struct btrfs_work *work)
again:
mutex_lock(&caching_ctl->mutex);
/* need to make sure the commit_root doesn't disappear */
- down_read(&fs_info->extent_commit_sem);
+ down_read(&fs_info->commit_root_sem);
next:
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
@@ -442,10 +445,11 @@ next:
if (ret)
break;
- if (need_resched()) {
+ if (need_resched() ||
+ rwsem_is_contended(&fs_info->commit_root_sem)) {
caching_ctl->progress = last;
btrfs_release_path(path);
- up_read(&fs_info->extent_commit_sem);
+ up_read(&fs_info->commit_root_sem);
mutex_unlock(&caching_ctl->mutex);
cond_resched();
goto again;
@@ -512,7 +516,7 @@ next:
err:
btrfs_free_path(path);
- up_read(&fs_info->extent_commit_sem);
+ up_read(&fs_info->commit_root_sem);
free_excluded_extents(extent_root, block_group);
@@ -548,7 +552,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
caching_ctl->block_group = cache;
caching_ctl->progress = cache->key.objectid;
atomic_set(&caching_ctl->count, 1);
- caching_ctl->work.func = caching_thread;
+ btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
spin_lock(&cache->lock);
/*
@@ -632,14 +636,14 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
return 0;
}
- down_write(&fs_info->extent_commit_sem);
+ down_write(&fs_info->commit_root_sem);
atomic_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
- up_write(&fs_info->extent_commit_sem);
+ up_write(&fs_info->commit_root_sem);
btrfs_get_block_group(cache);
- btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
+ btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
return ret;
}
@@ -768,20 +772,19 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- if (metadata) {
- key.objectid = bytenr;
- key.type = BTRFS_METADATA_ITEM_KEY;
- key.offset = offset;
- } else {
- key.objectid = bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = offset;
- }
-
if (!trans) {
path->skip_locking = 1;
path->search_commit_root = 1;
}
+
+search_again:
+ key.objectid = bytenr;
+ key.offset = offset;
+ if (metadata)
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+
again:
ret = btrfs_search_slot(trans, root->fs_info->extent_root,
&key, path, 0, 0);
@@ -789,7 +792,6 @@ again:
goto out_free;
if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
- metadata = 0;
if (path->slots[0]) {
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &key,
@@ -856,14 +858,16 @@ again:
mutex_lock(&head->mutex);
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref(&head->node);
- goto again;
+ goto search_again;
}
+ spin_lock(&head->lock);
if (head->extent_op && head->extent_op->update_flags)
extent_flags |= head->extent_op->flags_to_set;
else
BUG_ON(num_refs == 0);
num_refs += head->node.ref_mod;
+ spin_unlock(&head->lock);
mutex_unlock(&head->mutex);
}
spin_unlock(&delayed_refs->lock);
@@ -1073,11 +1077,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
__le64 lenum;
lenum = cpu_to_le64(root_objectid);
- high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
+ high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(owner);
- low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
lenum = cpu_to_le64(offset);
- low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
+ low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
return ((u64)high_crc << 31) ^ (u64)low_crc;
}
@@ -1270,7 +1274,7 @@ fail:
static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
- int refs_to_drop)
+ int refs_to_drop, int *last_ref)
{
struct btrfs_key key;
struct btrfs_extent_data_ref *ref1 = NULL;
@@ -1306,6 +1310,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
if (num_refs == 0) {
ret = btrfs_del_item(trans, root, path);
+ *last_ref = 1;
} else {
if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
@@ -1541,6 +1546,7 @@ again:
ret = 0;
}
if (ret) {
+ key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = num_bytes;
btrfs_release_path(path);
@@ -1551,9 +1557,8 @@ again:
if (ret && !insert) {
err = -ENOENT;
goto out;
- } else if (ret) {
+ } else if (WARN_ON(ret)) {
err = -EIO;
- WARN_ON(1);
goto out;
}
@@ -1763,7 +1768,8 @@ void update_inline_extent_backref(struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
int refs_to_mod,
- struct btrfs_delayed_extent_op *extent_op)
+ struct btrfs_delayed_extent_op *extent_op,
+ int *last_ref)
{
struct extent_buffer *leaf;
struct btrfs_extent_item *ei;
@@ -1807,6 +1813,7 @@ void update_inline_extent_backref(struct btrfs_root *root,
else
btrfs_set_shared_data_ref_count(leaf, sref, refs);
} else {
+ *last_ref = 1;
size = btrfs_extent_inline_ref_size(type);
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
ptr = (unsigned long)iref;
@@ -1838,7 +1845,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
if (ret == 0) {
BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
update_inline_extent_backref(root, path, iref,
- refs_to_add, extent_op);
+ refs_to_add, extent_op, NULL);
} else if (ret == -ENOENT) {
setup_inline_extent_backref(root, path, iref, parent,
root_objectid, owner, offset,
@@ -1871,17 +1878,19 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_extent_inline_ref *iref,
- int refs_to_drop, int is_data)
+ int refs_to_drop, int is_data, int *last_ref)
{
int ret = 0;
BUG_ON(!is_data && refs_to_drop != 1);
if (iref) {
update_inline_extent_backref(root, path, iref,
- -refs_to_drop, NULL);
+ -refs_to_drop, NULL, last_ref);
} else if (is_data) {
- ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
+ ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
+ last_ref);
} else {
+ *last_ref = 1;
ret = btrfs_del_item(trans, root, path);
}
return ret;
@@ -1945,7 +1954,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset, int for_cow)
+ u64 root_objectid, u64 owner, u64 offset,
+ int no_quota)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1957,12 +1967,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_ADD_DELAYED_REF, NULL, for_cow);
+ BTRFS_ADD_DELAYED_REF, NULL, no_quota);
} else {
ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, owner, offset,
- BTRFS_ADD_DELAYED_REF, NULL, for_cow);
+ BTRFS_ADD_DELAYED_REF, NULL, no_quota);
}
return ret;
}
@@ -1972,37 +1982,64 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 root_objectid,
u64 owner, u64 offset, int refs_to_add,
+ int no_quota,
struct btrfs_delayed_extent_op *extent_op)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_extent_item *item;
+ struct btrfs_key key;
u64 refs;
int ret;
- int err = 0;
+ enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
+ if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
+ no_quota = 1;
+
path->reada = 1;
path->leave_spinning = 1;
/* this will setup the path even if it fails to insert the back ref */
- ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
- path, bytenr, num_bytes, parent,
+ ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
+ bytenr, num_bytes, parent,
root_objectid, owner, offset,
refs_to_add, extent_op);
- if (ret == 0)
+ if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))
goto out;
+ /*
+ * Ok we were able to insert an inline extent and it appears to be a new
+ * reference, deal with the qgroup accounting.
+ */
+ if (!ret && !no_quota) {
+ ASSERT(root->fs_info->quota_enabled);
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ item = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item);
+ if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add)
+ type = BTRFS_QGROUP_OPER_ADD_SHARED;
+ btrfs_release_path(path);
- if (ret != -EAGAIN) {
- err = ret;
+ ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+ bytenr, num_bytes, type, 0);
goto out;
}
+ /*
+ * Ok we had -EAGAIN which means we didn't have space to insert and
+ * inline extent ref, so just update the reference count and add a
+ * normal backref.
+ */
leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, item);
+ if (refs)
+ type = BTRFS_QGROUP_OPER_ADD_SHARED;
btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, item);
@@ -2010,9 +2047,15 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
+ if (!no_quota) {
+ ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+ bytenr, num_bytes, type, 0);
+ if (ret)
+ goto out;
+ }
+
path->reada = 1;
path->leave_spinning = 1;
-
/* now insert the actual backref */
ret = insert_extent_backref(trans, root->fs_info->extent_root,
path, bytenr, parent, root_objectid,
@@ -2021,7 +2064,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, root, ret);
out:
btrfs_free_path(path);
- return err;
+ return ret;
}
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
@@ -2046,8 +2089,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
if (node->type == BTRFS_SHARED_DATA_REF_KEY)
parent = ref->parent;
- else
- ref_root = ref->root;
+ ref_root = ref->root;
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
if (extent_op)
@@ -2061,13 +2103,13 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
node->num_bytes, parent,
ref_root, ref->objectid,
ref->offset, node->ref_mod,
- extent_op);
+ node->no_quota, extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
ret = __btrfs_free_extent(trans, root, node->bytenr,
node->num_bytes, parent,
ref_root, ref->objectid,
ref->offset, node->ref_mod,
- extent_op);
+ extent_op, node->no_quota);
} else {
BUG();
}
@@ -2137,15 +2179,28 @@ again:
}
if (ret > 0) {
if (metadata) {
- btrfs_release_path(path);
- metadata = 0;
+ if (path->slots[0] > 0) {
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == node->bytenr &&
+ key.type == BTRFS_EXTENT_ITEM_KEY &&
+ key.offset == node->num_bytes)
+ ret = 0;
+ }
+ if (ret > 0) {
+ btrfs_release_path(path);
+ metadata = 0;
- key.offset = node->num_bytes;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- goto again;
+ key.objectid = node->bytenr;
+ key.offset = node->num_bytes;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ goto again;
+ }
+ } else {
+ err = -EIO;
+ goto out;
}
- err = -EIO;
- goto out;
}
leaf = path->nodes[0];
@@ -2191,8 +2246,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
parent = ref->parent;
- else
- ref_root = ref->root;
+ ref_root = ref->root;
ins.objectid = node->bytenr;
if (skinny_metadata) {
@@ -2210,15 +2264,18 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
parent, ref_root,
extent_op->flags_to_set,
&extent_op->key,
- ref->level, &ins);
+ ref->level, &ins,
+ node->no_quota);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
node->num_bytes, parent, ref_root,
- ref->level, 0, 1, extent_op);
+ ref->level, 0, 1, node->no_quota,
+ extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
ret = __btrfs_free_extent(trans, root, node->bytenr,
node->num_bytes, parent, ref_root,
- ref->level, 0, 1, extent_op);
+ ref->level, 0, 1, extent_op,
+ node->no_quota);
} else {
BUG();
}
@@ -2234,8 +2291,12 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
{
int ret = 0;
- if (trans->aborted)
+ if (trans->aborted) {
+ if (insert_reserved)
+ btrfs_pin_extent(root, node->bytenr,
+ node->num_bytes, 1);
return 0;
+ }
if (btrfs_delayed_ref_is_head(node)) {
struct btrfs_delayed_ref_head *head;
@@ -2278,64 +2339,62 @@ static noinline struct btrfs_delayed_ref_node *
select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
struct rb_node *node;
- struct btrfs_delayed_ref_node *ref;
- int action = BTRFS_ADD_DELAYED_REF;
-again:
+ struct btrfs_delayed_ref_node *ref, *last = NULL;;
+
/*
* select delayed ref of type BTRFS_ADD_DELAYED_REF first.
* this prevents ref count from going down to zero when
* there still are pending delayed ref.
*/
- node = rb_prev(&head->node.rb_node);
- while (1) {
- if (!node)
- break;
+ node = rb_first(&head->ref_root);
+ while (node) {
ref = rb_entry(node, struct btrfs_delayed_ref_node,
rb_node);
- if (ref->bytenr != head->node.bytenr)
- break;
- if (ref->action == action)
+ if (ref->action == BTRFS_ADD_DELAYED_REF)
return ref;
- node = rb_prev(node);
+ else if (last == NULL)
+ last = ref;
+ node = rb_next(node);
}
- if (action == BTRFS_ADD_DELAYED_REF) {
- action = BTRFS_DROP_DELAYED_REF;
- goto again;
- }
- return NULL;
+ return last;
}
/*
* Returns 0 on success or if called with an already aborted transaction.
* Returns -ENOMEM or -EIO on failure and will abort the transaction.
*/
-static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct list_head *cluster)
+static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ unsigned long nr)
{
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_node *ref;
struct btrfs_delayed_ref_head *locked_ref = NULL;
struct btrfs_delayed_extent_op *extent_op;
struct btrfs_fs_info *fs_info = root->fs_info;
+ ktime_t start = ktime_get();
int ret;
- int count = 0;
+ unsigned long count = 0;
+ unsigned long actual_count = 0;
int must_insert_reserved = 0;
delayed_refs = &trans->transaction->delayed_refs;
while (1) {
if (!locked_ref) {
- /* pick a new head ref from the cluster list */
- if (list_empty(cluster))
+ if (count >= nr)
break;
- locked_ref = list_entry(cluster->next,
- struct btrfs_delayed_ref_head, cluster);
+ spin_lock(&delayed_refs->lock);
+ locked_ref = btrfs_select_ref_head(trans);
+ if (!locked_ref) {
+ spin_unlock(&delayed_refs->lock);
+ break;
+ }
/* grab the lock that says we are going to process
* all the refs for this head */
ret = btrfs_delayed_ref_lock(trans, locked_ref);
-
+ spin_unlock(&delayed_refs->lock);
/*
* we may have dropped the spin lock to get the head
* mutex lock, and that might have given someone else
@@ -2356,6 +2415,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
* finish. If we merged anything we need to re-loop so we can
* get a good ref.
*/
+ spin_lock(&locked_ref->lock);
btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
locked_ref);
@@ -2367,17 +2427,15 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
if (ref && ref->seq &&
btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
- /*
- * there are still refs with lower seq numbers in the
- * process of being added. Don't run this ref yet.
- */
- list_del_init(&locked_ref->cluster);
+ spin_unlock(&locked_ref->lock);
btrfs_delayed_ref_unlock(locked_ref);
- locked_ref = NULL;
+ spin_lock(&delayed_refs->lock);
+ locked_ref->processing = 0;
delayed_refs->num_heads_ready++;
spin_unlock(&delayed_refs->lock);
+ locked_ref = NULL;
cond_resched();
- spin_lock(&delayed_refs->lock);
+ count++;
continue;
}
@@ -2392,6 +2450,8 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
locked_ref->extent_op = NULL;
if (!ref) {
+
+
/* All delayed refs have been processed, Go ahead
* and send the head node to run_one_delayed_ref,
* so that any accounting fixes can happen
@@ -2404,26 +2464,54 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
}
if (extent_op) {
- spin_unlock(&delayed_refs->lock);
-
+ spin_unlock(&locked_ref->lock);
ret = run_delayed_extent_op(trans, root,
ref, extent_op);
btrfs_free_delayed_extent_op(extent_op);
if (ret) {
+ /*
+ * Need to reset must_insert_reserved if
+ * there was an error so the abort stuff
+ * can cleanup the reserved space
+ * properly.
+ */
+ if (must_insert_reserved)
+ locked_ref->must_insert_reserved = 1;
+ locked_ref->processing = 0;
btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
- spin_lock(&delayed_refs->lock);
btrfs_delayed_ref_unlock(locked_ref);
return ret;
}
+ continue;
+ }
- goto next;
+ /*
+ * Need to drop our head ref lock and re-aqcuire the
+ * delayed ref lock and then re-check to make sure
+ * nobody got added.
+ */
+ spin_unlock(&locked_ref->lock);
+ spin_lock(&delayed_refs->lock);
+ spin_lock(&locked_ref->lock);
+ if (rb_first(&locked_ref->ref_root) ||
+ locked_ref->extent_op) {
+ spin_unlock(&locked_ref->lock);
+ spin_unlock(&delayed_refs->lock);
+ continue;
}
+ ref->in_tree = 0;
+ delayed_refs->num_heads--;
+ rb_erase(&locked_ref->href_node,
+ &delayed_refs->href_root);
+ spin_unlock(&delayed_refs->lock);
+ } else {
+ actual_count++;
+ ref->in_tree = 0;
+ rb_erase(&ref->rb_node, &locked_ref->ref_root);
}
+ atomic_dec(&delayed_refs->num_entries);
- ref->in_tree = 0;
- rb_erase(&ref->rb_node, &delayed_refs->root);
- delayed_refs->num_entries--;
if (!btrfs_delayed_ref_is_head(ref)) {
/*
* when we play the delayed ref, also correct the
@@ -2440,20 +2528,18 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
default:
WARN_ON(1);
}
- } else {
- list_del_init(&locked_ref->cluster);
}
- spin_unlock(&delayed_refs->lock);
+ spin_unlock(&locked_ref->lock);
ret = run_one_delayed_ref(trans, root, ref, extent_op,
must_insert_reserved);
btrfs_free_delayed_extent_op(extent_op);
if (ret) {
+ locked_ref->processing = 0;
btrfs_delayed_ref_unlock(locked_ref);
btrfs_put_delayed_ref(ref);
btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
- spin_lock(&delayed_refs->lock);
return ret;
}
@@ -2469,11 +2555,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
}
btrfs_put_delayed_ref(ref);
count++;
-next:
cond_resched();
+ }
+
+ /*
+ * We don't want to include ref heads since we can have empty ref heads
+ * and those will drastically skew our runtime down since we just do
+ * accounting, no actual extent tree updates.
+ */
+ if (actual_count > 0) {
+ u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
+ u64 avg;
+
+ /*
+ * We weigh the current average higher than our current runtime
+ * to avoid large swings in the average.
+ */
spin_lock(&delayed_refs->lock);
+ avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
+ avg = div64_u64(avg, 4);
+ fs_info->avg_delayed_ref_runtime = avg;
+ spin_unlock(&delayed_refs->lock);
}
- return count;
+ return 0;
}
#ifdef SCRAMBLE_DELAYED_REFS
@@ -2519,52 +2623,6 @@ static u64 find_middle(struct rb_root *root)
}
#endif
-int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
-{
- struct qgroup_update *qgroup_update;
- int ret = 0;
-
- if (list_empty(&trans->qgroup_ref_list) !=
- !trans->delayed_ref_elem.seq) {
- /* list without seq or seq without list */
- btrfs_err(fs_info,
- "qgroup accounting update error, list is%s empty, seq is %#x.%x",
- list_empty(&trans->qgroup_ref_list) ? "" : " not",
- (u32)(trans->delayed_ref_elem.seq >> 32),
- (u32)trans->delayed_ref_elem.seq);
- BUG();
- }
-
- if (!trans->delayed_ref_elem.seq)
- return 0;
-
- while (!list_empty(&trans->qgroup_ref_list)) {
- qgroup_update = list_first_entry(&trans->qgroup_ref_list,
- struct qgroup_update, list);
- list_del(&qgroup_update->list);
- if (!ret)
- ret = btrfs_qgroup_account_ref(
- trans, fs_info, qgroup_update->node,
- qgroup_update->extent_op);
- kfree(qgroup_update);
- }
-
- btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
-
- return ret;
-}
-
-static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
- int count)
-{
- int val = atomic_read(&delayed_refs->ref_seq);
-
- if (val < seq || val >= seq + count)
- return 1;
- return 0;
-}
-
static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
{
u64 num_bytes;
@@ -2581,7 +2639,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
}
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_block_rsv *global_rsv;
@@ -2610,6 +2668,101 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
return ret;
}
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ u64 num_entries =
+ atomic_read(&trans->transaction->delayed_refs.num_entries);
+ u64 avg_runtime;
+ u64 val;
+
+ smp_mb();
+ avg_runtime = fs_info->avg_delayed_ref_runtime;
+ val = num_entries * avg_runtime;
+ if (num_entries * avg_runtime >= NSEC_PER_SEC)
+ return 1;
+ if (val >= NSEC_PER_SEC / 2)
+ return 2;
+
+ return btrfs_check_space_for_delayed_refs(trans, root);
+}
+
+struct async_delayed_refs {
+ struct btrfs_root *root;
+ int count;
+ int error;
+ int sync;
+ struct completion wait;
+ struct btrfs_work work;
+};
+
+static void delayed_ref_async_start(struct btrfs_work *work)
+{
+ struct async_delayed_refs *async;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ async = container_of(work, struct async_delayed_refs, work);
+
+ trans = btrfs_join_transaction(async->root);
+ if (IS_ERR(trans)) {
+ async->error = PTR_ERR(trans);
+ goto done;
+ }
+
+ /*
+ * trans->sync means that when we call end_transaciton, we won't
+ * wait on delayed refs
+ */
+ trans->sync = true;
+ ret = btrfs_run_delayed_refs(trans, async->root, async->count);
+ if (ret)
+ async->error = ret;
+
+ ret = btrfs_end_transaction(trans, async->root);
+ if (ret && !async->error)
+ async->error = ret;
+done:
+ if (async->sync)
+ complete(&async->wait);
+ else
+ kfree(async);
+}
+
+int btrfs_async_run_delayed_refs(struct btrfs_root *root,
+ unsigned long count, int wait)
+{
+ struct async_delayed_refs *async;
+ int ret;
+
+ async = kmalloc(sizeof(*async), GFP_NOFS);
+ if (!async)
+ return -ENOMEM;
+
+ async->root = root->fs_info->tree_root;
+ async->count = count;
+ async->error = 0;
+ if (wait)
+ async->sync = 1;
+ else
+ async->sync = 0;
+ init_completion(&async->wait);
+
+ btrfs_init_work(&async->work, delayed_ref_async_start,
+ NULL, NULL);
+
+ btrfs_queue_work(root->fs_info->extent_workers, &async->work);
+
+ if (wait) {
+ wait_for_completion(&async->wait);
+ ret = async->error;
+ kfree(async);
+ return ret;
+ }
+ return 0;
+}
+
/*
* this starts processing the delayed reference count updates and
* extent insertions we have queued up so far. count can be
@@ -2625,13 +2778,10 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
{
struct rb_node *node;
struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_delayed_ref_node *ref;
- struct list_head cluster;
+ struct btrfs_delayed_ref_head *head;
int ret;
- u64 delayed_start;
int run_all = count == (unsigned long)-1;
int run_most = 0;
- int loops;
/* We'll clean this up in btrfs_cleanup_transaction */
if (trans->aborted)
@@ -2640,133 +2790,41 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
if (root == root->fs_info->extent_root)
root = root->fs_info->tree_root;
- btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
-
delayed_refs = &trans->transaction->delayed_refs;
- INIT_LIST_HEAD(&cluster);
if (count == 0) {
- count = delayed_refs->num_entries * 2;
+ count = atomic_read(&delayed_refs->num_entries) * 2;
run_most = 1;
}
- if (!run_all && !run_most) {
- int old;
- int seq = atomic_read(&delayed_refs->ref_seq);
-
-progress:
- old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
- if (old) {
- DEFINE_WAIT(__wait);
- if (delayed_refs->flushing ||
- !btrfs_should_throttle_delayed_refs(trans, root))
- return 0;
-
- prepare_to_wait(&delayed_refs->wait, &__wait,
- TASK_UNINTERRUPTIBLE);
-
- old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
- if (old) {
- schedule();
- finish_wait(&delayed_refs->wait, &__wait);
-
- if (!refs_newer(delayed_refs, seq, 256))
- goto progress;
- else
- return 0;
- } else {
- finish_wait(&delayed_refs->wait, &__wait);
- goto again;
- }
- }
-
- } else {
- atomic_inc(&delayed_refs->procs_running_refs);
- }
-
again:
- loops = 0;
- spin_lock(&delayed_refs->lock);
-
#ifdef SCRAMBLE_DELAYED_REFS
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
-
- while (1) {
- if (!(run_all || run_most) &&
- !btrfs_should_throttle_delayed_refs(trans, root))
- break;
-
- /*
- * go find something we can process in the rbtree. We start at
- * the beginning of the tree, and then build a cluster
- * of refs to process starting at the first one we are able to
- * lock
- */
- delayed_start = delayed_refs->run_delayed_start;
- ret = btrfs_find_ref_cluster(trans, &cluster,
- delayed_refs->run_delayed_start);
- if (ret)
- break;
-
- ret = run_clustered_refs(trans, root, &cluster);
- if (ret < 0) {
- btrfs_release_ref_cluster(&cluster);
- spin_unlock(&delayed_refs->lock);
- btrfs_abort_transaction(trans, root, ret);
- atomic_dec(&delayed_refs->procs_running_refs);
- wake_up(&delayed_refs->wait);
- return ret;
- }
-
- atomic_add(ret, &delayed_refs->ref_seq);
-
- count -= min_t(unsigned long, ret, count);
-
- if (count == 0)
- break;
-
- if (delayed_start >= delayed_refs->run_delayed_start) {
- if (loops == 0) {
- /*
- * btrfs_find_ref_cluster looped. let's do one
- * more cycle. if we don't run any delayed ref
- * during that cycle (because we can't because
- * all of them are blocked), bail out.
- */
- loops = 1;
- } else {
- /*
- * no runnable refs left, stop trying
- */
- BUG_ON(run_all);
- break;
- }
- }
- if (ret) {
- /* refs were run, let's reset staleness detection */
- loops = 0;
- }
+ ret = __btrfs_run_delayed_refs(trans, root, count);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, root, ret);
+ return ret;
}
if (run_all) {
- if (!list_empty(&trans->new_bgs)) {
- spin_unlock(&delayed_refs->lock);
+ if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
- spin_lock(&delayed_refs->lock);
- }
- node = rb_first(&delayed_refs->root);
- if (!node)
+ spin_lock(&delayed_refs->lock);
+ node = rb_first(&delayed_refs->href_root);
+ if (!node) {
+ spin_unlock(&delayed_refs->lock);
goto out;
+ }
count = (unsigned long)-1;
while (node) {
- ref = rb_entry(node, struct btrfs_delayed_ref_node,
- rb_node);
- if (btrfs_delayed_ref_is_head(ref)) {
- struct btrfs_delayed_ref_head *head;
+ head = rb_entry(node, struct btrfs_delayed_ref_head,
+ href_node);
+ if (btrfs_delayed_ref_is_head(&head->node)) {
+ struct btrfs_delayed_ref_node *ref;
- head = btrfs_delayed_node_to_head(ref);
+ ref = &head->node;
atomic_inc(&ref->refs);
spin_unlock(&delayed_refs->lock);
@@ -2780,20 +2838,19 @@ again:
btrfs_put_delayed_ref(ref);
cond_resched();
goto again;
+ } else {
+ WARN_ON(1);
}
node = rb_next(node);
}
spin_unlock(&delayed_refs->lock);
- schedule_timeout(1);
+ cond_resched();
goto again;
}
out:
- atomic_dec(&delayed_refs->procs_running_refs);
- smp_mb();
- if (waitqueue_active(&delayed_refs->wait))
- wake_up(&delayed_refs->wait);
-
- spin_unlock(&delayed_refs->lock);
+ ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info);
+ if (ret)
+ return ret;
assert_qgroups_uptodate(trans);
return 0;
}
@@ -2835,12 +2892,13 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
struct rb_node *node;
int ret = 0;
- ret = -ENOENT;
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
head = btrfs_find_delayed_ref_head(trans, bytenr);
- if (!head)
- goto out;
+ if (!head) {
+ spin_unlock(&delayed_refs->lock);
+ return 0;
+ }
if (!mutex_trylock(&head->mutex)) {
atomic_inc(&head->node.refs);
@@ -2857,40 +2915,35 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
btrfs_put_delayed_ref(&head->node);
return -EAGAIN;
}
+ spin_unlock(&delayed_refs->lock);
- node = rb_prev(&head->node.rb_node);
- if (!node)
- goto out_unlock;
-
- ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-
- if (ref->bytenr != bytenr)
- goto out_unlock;
-
- ret = 1;
- if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
- goto out_unlock;
+ spin_lock(&head->lock);
+ node = rb_first(&head->ref_root);
+ while (node) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ node = rb_next(node);
- data_ref = btrfs_delayed_node_to_data_ref(ref);
+ /* If it's a shared ref we know a cross reference exists */
+ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
+ ret = 1;
+ break;
+ }
- node = rb_prev(node);
- if (node) {
- int seq = ref->seq;
+ data_ref = btrfs_delayed_node_to_data_ref(ref);
- ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- if (ref->bytenr == bytenr && ref->seq == seq)
- goto out_unlock;
+ /*
+ * If our ref doesn't match the one we're currently looking at
+ * then we have a cross reference.
+ */
+ if (data_ref->root != root->root_key.objectid ||
+ data_ref->objectid != objectid ||
+ data_ref->offset != offset) {
+ ret = 1;
+ break;
+ }
}
-
- if (data_ref->root != root->root_key.objectid ||
- data_ref->objectid != objectid || data_ref->offset != offset)
- goto out_unlock;
-
- ret = 0;
-out_unlock:
+ spin_unlock(&head->lock);
mutex_unlock(&head->mutex);
-out:
- spin_unlock(&delayed_refs->lock);
return ret;
}
@@ -3004,7 +3057,7 @@ out:
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
- int full_backref, int inc, int for_cow)
+ int full_backref, int inc, int no_quota)
{
u64 bytenr;
u64 num_bytes;
@@ -3019,11 +3072,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
u64, u64, u64, u64, u64, u64, int);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return 0;
+#endif
ref_root = btrfs_header_owner(buf);
nritems = btrfs_header_nritems(buf);
level = btrfs_header_level(buf);
- if (!root->ref_cows && level == 0)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
return 0;
if (inc)
@@ -3054,7 +3111,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(buf, fi);
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, key.objectid,
- key.offset, for_cow);
+ key.offset, no_quota);
if (ret)
goto fail;
} else {
@@ -3062,7 +3119,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
num_bytes = btrfs_level_size(root, level - 1);
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, level - 1, 0,
- for_cow);
+ no_quota);
if (ret)
goto fail;
}
@@ -3073,15 +3130,15 @@ fail:
}
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int for_cow)
+ struct extent_buffer *buf, int full_backref, int no_quota)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota);
}
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- struct extent_buffer *buf, int full_backref, int for_cow)
+ struct extent_buffer *buf, int full_backref, int no_quota)
{
- return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
+ return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota);
}
static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -3197,15 +3254,15 @@ again:
if (ret)
goto out_put;
- ret = btrfs_truncate_free_space_cache(root, trans, path,
- inode);
+ ret = btrfs_truncate_free_space_cache(root, trans, inode);
if (ret)
goto out_put;
}
spin_lock(&block_group->lock);
if (block_group->cached != BTRFS_CACHE_FINISHED ||
- !btrfs_test_opt(root, SPACE_CACHE)) {
+ !btrfs_test_opt(root, SPACE_CACHE) ||
+ block_group->delalloc_bytes) {
/*
* don't bother trying to write stuff out _if_
* a) we're not cached,
@@ -3318,10 +3375,9 @@ again:
last = cache->key.objectid + cache->key.offset;
err = write_one_cache_group(trans, root, path, cache);
+ btrfs_put_block_group(cache);
if (err) /* File system offline */
goto out;
-
- btrfs_put_block_group(cache);
}
while (1) {
@@ -3389,6 +3445,23 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
return readonly;
}
+static const char *alloc_name(u64 flags)
+{
+ switch (flags) {
+ case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
+ return "mixed";
+ case BTRFS_BLOCK_GROUP_METADATA:
+ return "metadata";
+ case BTRFS_BLOCK_GROUP_DATA:
+ return "data";
+ case BTRFS_BLOCK_GROUP_SYSTEM:
+ return "system";
+ default:
+ WARN_ON(1);
+ return "invalid-combination";
+ };
+}
+
static int update_space_info(struct btrfs_fs_info *info, u64 flags,
u64 total_bytes, u64 bytes_used,
struct btrfs_space_info **space_info)
@@ -3444,11 +3517,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->chunk_alloc = 0;
found->flush = 0;
init_waitqueue_head(&found->wait);
+
+ ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
+ info->space_info_kobj, "%s",
+ alloc_name(found->flags));
+ if (ret) {
+ kfree(found);
+ return ret;
+ }
+
*space_info = found;
list_add_rcu(&found->list, &info->space_info);
if (flags & BTRFS_BLOCK_GROUP_DATA)
info->data_sinfo = found;
- return 0;
+
+ return ret;
}
static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
@@ -3556,11 +3639,13 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
return extended_to_chunk(flags | tmp);
}
-static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
+static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
{
unsigned seq;
+ u64 flags;
do {
+ flags = orig_flags;
seq = read_seqbegin(&root->fs_info->profiles_lock);
if (flags & BTRFS_BLOCK_GROUP_DATA)
@@ -3605,10 +3690,9 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
/* make sure bytes are sectorsize aligned */
bytes = ALIGN(bytes, root->sectorsize);
- if (root == root->fs_info->tree_root ||
- BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
- alloc_chunk = 0;
+ if (btrfs_is_free_space_inode(inode)) {
committed = 1;
+ ASSERT(current->journal_info);
}
data_sinfo = fs_info->data_sinfo;
@@ -3636,6 +3720,16 @@ again:
spin_unlock(&data_sinfo->lock);
alloc:
alloc_target = btrfs_get_alloc_profile(root, 1);
+ /*
+ * It is ugly that we don't call nolock join
+ * transaction for the free space inode case here.
+ * But it is safe because we only do the data space
+ * reservation for the free space cache in the
+ * transaction context, the common join transaction
+ * just increase the counter of the current transaction
+ * handler, doesn't try to acquire the trans_lock of
+ * the fs.
+ */
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);
@@ -3681,6 +3775,9 @@ commit_trans:
goto again;
}
+ trace_btrfs_space_reservation(root->fs_info,
+ "space_info:enospc",
+ data_sinfo->flags, bytes, 1);
return -ENOSPC;
}
data_sinfo->bytes_may_use += bytes;
@@ -3974,7 +4071,7 @@ static int can_overcommit(struct btrfs_root *root,
}
static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
- unsigned long nr_pages)
+ unsigned long nr_pages, int nr_items)
{
struct super_block *sb = root->fs_info->sb;
@@ -3989,12 +4086,26 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
* the filesystem is readonly(all dirty pages are written to
* the disk).
*/
- btrfs_start_all_delalloc_inodes(root->fs_info, 0);
+ btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
if (!current->journal_info)
- btrfs_wait_all_ordered_extents(root->fs_info);
+ btrfs_wait_ordered_roots(root->fs_info, nr_items);
}
}
+static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
+{
+ u64 bytes;
+ int nr;
+
+ bytes = btrfs_calc_trans_metadata_size(root, 1);
+ nr = (int)div64_u64(to_reclaim, bytes);
+ if (!nr)
+ nr = 1;
+ return nr;
+}
+
+#define EXTENT_SIZE_PER_ITEM (256 * 1024)
+
/*
* shrink metadata reservation for delalloc
*/
@@ -4007,35 +4118,51 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
u64 delalloc_bytes;
u64 max_reclaim;
long time_left;
- unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
- int loops = 0;
+ unsigned long nr_pages;
+ int loops;
+ int items;
enum btrfs_reserve_flush_enum flush;
+ /* Calc the number of the pages we need flush for space reservation */
+ items = calc_reclaim_items_nr(root, to_reclaim);
+ to_reclaim = items * EXTENT_SIZE_PER_ITEM;
+
trans = (struct btrfs_trans_handle *)current->journal_info;
block_rsv = &root->fs_info->delalloc_block_rsv;
space_info = block_rsv->space_info;
- smp_mb();
delalloc_bytes = percpu_counter_sum_positive(
&root->fs_info->delalloc_bytes);
if (delalloc_bytes == 0) {
if (trans)
return;
- btrfs_wait_all_ordered_extents(root->fs_info);
+ if (wait_ordered)
+ btrfs_wait_ordered_roots(root->fs_info, items);
return;
}
+ loops = 0;
while (delalloc_bytes && loops < 3) {
max_reclaim = min(delalloc_bytes, to_reclaim);
nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
- btrfs_writeback_inodes_sb_nr(root, nr_pages);
+ btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
/*
* We need to wait for the async pages to actually start before
* we do anything.
*/
- wait_event(root->fs_info->async_submit_wait,
- !atomic_read(&root->fs_info->async_delalloc_pages));
+ max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
+ if (!max_reclaim)
+ goto skip_async;
+
+ if (max_reclaim <= nr_pages)
+ max_reclaim = 0;
+ else
+ max_reclaim -= nr_pages;
+ wait_event(root->fs_info->async_submit_wait,
+ atomic_read(&root->fs_info->async_delalloc_pages) <=
+ (int)max_reclaim);
+skip_async:
if (!trans)
flush = BTRFS_RESERVE_FLUSH_ALL;
else
@@ -4049,13 +4176,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
loops++;
if (wait_ordered && !trans) {
- btrfs_wait_all_ordered_extents(root->fs_info);
+ btrfs_wait_ordered_roots(root->fs_info, items);
} else {
time_left = schedule_timeout_killable(1);
if (time_left)
break;
}
- smp_mb();
delalloc_bytes = percpu_counter_sum_positive(
&root->fs_info->delalloc_bytes);
}
@@ -4086,13 +4212,9 @@ static int may_commit_transaction(struct btrfs_root *root,
goto commit;
/* See if there is enough pinned space to make this reservation */
- spin_lock(&space_info->lock);
if (percpu_counter_compare(&space_info->total_bytes_pinned,
- bytes) >= 0) {
- spin_unlock(&space_info->lock);
+ bytes) >= 0)
goto commit;
- }
- spin_unlock(&space_info->lock);
/*
* See if there is some space in the delayed insertion reservation for
@@ -4101,16 +4223,13 @@ static int may_commit_transaction(struct btrfs_root *root,
if (space_info != delayed_rsv->space_info)
return -ENOSPC;
- spin_lock(&space_info->lock);
spin_lock(&delayed_rsv->lock);
if (percpu_counter_compare(&space_info->total_bytes_pinned,
bytes - delayed_rsv->size) >= 0) {
spin_unlock(&delayed_rsv->lock);
- spin_unlock(&space_info->lock);
return -ENOSPC;
}
spin_unlock(&delayed_rsv->lock);
- spin_unlock(&space_info->lock);
commit:
trans = btrfs_join_transaction(root);
@@ -4140,16 +4259,11 @@ static int flush_space(struct btrfs_root *root,
switch (state) {
case FLUSH_DELAYED_ITEMS_NR:
case FLUSH_DELAYED_ITEMS:
- if (state == FLUSH_DELAYED_ITEMS_NR) {
- u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
-
- nr = (int)div64_u64(num_bytes, bytes);
- if (!nr)
- nr = 1;
- nr *= 2;
- } else {
+ if (state == FLUSH_DELAYED_ITEMS_NR)
+ nr = calc_reclaim_items_nr(root, num_bytes) * 2;
+ else
nr = -1;
- }
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -4160,7 +4274,7 @@ static int flush_space(struct btrfs_root *root,
break;
case FLUSH_DELALLOC:
case FLUSH_DELALLOC_WAIT:
- shrink_delalloc(root, num_bytes, orig_bytes,
+ shrink_delalloc(root, num_bytes * 2, orig_bytes,
state == FLUSH_DELALLOC_WAIT);
break;
case ALLOC_CHUNK:
@@ -4186,6 +4300,104 @@ static int flush_space(struct btrfs_root *root,
return ret;
}
+
+static inline u64
+btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
+ struct btrfs_space_info *space_info)
+{
+ u64 used;
+ u64 expected;
+ u64 to_reclaim;
+
+ to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024,
+ 16 * 1024 * 1024);
+ spin_lock(&space_info->lock);
+ if (can_overcommit(root, space_info, to_reclaim,
+ BTRFS_RESERVE_FLUSH_ALL)) {
+ to_reclaim = 0;
+ goto out;
+ }
+
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ space_info->bytes_may_use;
+ if (can_overcommit(root, space_info, 1024 * 1024,
+ BTRFS_RESERVE_FLUSH_ALL))
+ expected = div_factor_fine(space_info->total_bytes, 95);
+ else
+ expected = div_factor_fine(space_info->total_bytes, 90);
+
+ if (used > expected)
+ to_reclaim = used - expected;
+ else
+ to_reclaim = 0;
+ to_reclaim = min(to_reclaim, space_info->bytes_may_use +
+ space_info->bytes_reserved);
+out:
+ spin_unlock(&space_info->lock);
+
+ return to_reclaim;
+}
+
+static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
+ struct btrfs_fs_info *fs_info, u64 used)
+{
+ return (used >= div_factor_fine(space_info->total_bytes, 98) &&
+ !btrfs_fs_closing(fs_info) &&
+ !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
+}
+
+static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
+ struct btrfs_fs_info *fs_info)
+{
+ u64 used;
+
+ spin_lock(&space_info->lock);
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ space_info->bytes_may_use;
+ if (need_do_async_reclaim(space_info, fs_info, used)) {
+ spin_unlock(&space_info->lock);
+ return 1;
+ }
+ spin_unlock(&space_info->lock);
+
+ return 0;
+}
+
+static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_space_info *space_info;
+ u64 to_reclaim;
+ int flush_state;
+
+ fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
+ space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
+ space_info);
+ if (!to_reclaim)
+ return;
+
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ do {
+ flush_space(fs_info->fs_root, space_info, to_reclaim,
+ to_reclaim, flush_state);
+ flush_state++;
+ if (!btrfs_need_do_async_reclaim(space_info, fs_info))
+ return;
+ } while (flush_state <= COMMIT_TRANS);
+
+ if (btrfs_need_do_async_reclaim(space_info, fs_info))
+ queue_work(system_unbound_wq, work);
+}
+
+void btrfs_init_async_reclaim_work(struct work_struct *work)
+{
+ INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+}
+
/**
* reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
* @root - the root we're allocating for
@@ -4293,8 +4505,13 @@ again:
if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
flushing = true;
space_info->flush = 1;
+ } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ used += orig_bytes;
+ if (need_do_async_reclaim(space_info, root->fs_info, used) &&
+ !work_busy(&root->fs_info->async_reclaim_work))
+ queue_work(system_unbound_wq,
+ &root->fs_info->async_reclaim_work);
}
-
spin_unlock(&space_info->lock);
if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
@@ -4332,6 +4549,10 @@ out:
!block_rsv_use_bytes(global_rsv, orig_bytes))
ret = 0;
}
+ if (ret == -ENOSPC)
+ trace_btrfs_space_reservation(root->fs_info,
+ "space_info:enospc",
+ space_info->flags, orig_bytes, 1);
if (flushing) {
spin_lock(&space_info->lock);
space_info->flush = 0;
@@ -4347,7 +4568,7 @@ static struct btrfs_block_rsv *get_block_rsv(
{
struct btrfs_block_rsv *block_rsv = NULL;
- if (root->ref_cows)
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
block_rsv = trans->block_rsv;
if (root == root->fs_info->csum_root && trans->adding_csums)
@@ -4584,7 +4805,7 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
u64 num_bytes)
{
struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
- if (global_rsv->full || global_rsv == block_rsv ||
+ if (global_rsv == block_rsv ||
block_rsv->space_info != global_rsv->space_info)
global_rsv = NULL;
block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
@@ -4986,7 +5207,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
if (to_reserve)
- trace_btrfs_space_reservation(root->fs_info,"delalloc",
+ trace_btrfs_space_reservation(root->fs_info, "delalloc",
btrfs_ino(inode), to_reserve, 1);
block_rsv_add_bytes(block_rsv, to_reserve, 1);
@@ -5264,6 +5485,8 @@ static int pin_down_extent(struct btrfs_root *root,
set_extent_dirty(root->fs_info->pinned_extents, bytenr,
bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
+ if (reserved)
+ trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);
return 0;
}
@@ -5392,6 +5615,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
* @cache: The cache we are manipulating
* @num_bytes: The number of bytes in question
* @reserve: One of the reservation enums
+ * @delalloc: The blocks are allocated for the delalloc write
*
* This is called by the allocator when it reserves space, or by somebody who is
* freeing space that was never actually used on disk. For example if you
@@ -5410,7 +5634,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
* succeeds.
*/
static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int reserve)
+ u64 num_bytes, int reserve, int delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
int ret = 0;
@@ -5429,12 +5653,18 @@ static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
num_bytes, 0);
space_info->bytes_may_use -= num_bytes;
}
+
+ if (delalloc)
+ cache->delalloc_bytes += num_bytes;
}
} else {
if (cache->ro)
space_info->bytes_readonly += num_bytes;
cache->reserved -= num_bytes;
space_info->bytes_reserved -= num_bytes;
+
+ if (delalloc)
+ cache->delalloc_bytes -= num_bytes;
}
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
@@ -5448,9 +5678,8 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_caching_control *next;
struct btrfs_caching_control *caching_ctl;
struct btrfs_block_group_cache *cache;
- struct btrfs_space_info *space_info;
- down_write(&fs_info->extent_commit_sem);
+ down_write(&fs_info->commit_root_sem);
list_for_each_entry_safe(caching_ctl, next,
&fs_info->caching_block_groups, list) {
@@ -5469,10 +5698,7 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
else
fs_info->pinned_extents = &fs_info->freed_extents[0];
- up_write(&fs_info->extent_commit_sem);
-
- list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
- percpu_counter_set(&space_info->total_bytes_pinned, 0);
+ up_write(&fs_info->commit_root_sem);
update_global_block_rsv(fs_info);
}
@@ -5511,6 +5737,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
spin_lock(&cache->lock);
cache->pinned -= len;
space_info->bytes_pinned -= len;
+ percpu_counter_add(&space_info->total_bytes_pinned, -len);
if (cache->ro) {
space_info->bytes_readonly += len;
readonly = true;
@@ -5597,7 +5824,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner_objectid,
u64 owner_offset, int refs_to_drop,
- struct btrfs_delayed_extent_op *extent_op)
+ struct btrfs_delayed_extent_op *extent_op,
+ int no_quota)
{
struct btrfs_key key;
struct btrfs_path *path;
@@ -5613,9 +5841,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
int num_to_del = 1;
u32 item_size;
u64 refs;
+ int last_ref = 0;
+ enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
+ if (!info->quota_enabled || !is_fstree(root_objectid))
+ no_quota = 1;
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -5663,7 +5896,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
BUG_ON(iref);
ret = remove_extent_backref(trans, extent_root, path,
NULL, refs_to_drop,
- is_data);
+ is_data, &last_ref);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
@@ -5698,6 +5931,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (ret > 0 && skinny_metadata) {
skinny_metadata = false;
+ key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = num_bytes;
btrfs_release_path(path);
@@ -5718,13 +5952,14 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
extent_slot = path->slots[0];
}
- } else if (ret == -ENOENT) {
+ } else if (WARN_ON(ret == -ENOENT)) {
btrfs_print_leaf(extent_root, path->nodes[0]);
- WARN_ON(1);
btrfs_err(info,
"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
bytenr, parent, root_objectid, owner_objectid,
owner_offset);
+ btrfs_abort_transaction(trans, extent_root, ret);
+ goto out;
} else {
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
@@ -5780,7 +6015,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
refs = btrfs_extent_refs(leaf, ei);
if (refs < refs_to_drop) {
btrfs_err(info, "trying to drop %d refs but we only have %Lu "
- "for bytenr %Lu\n", refs_to_drop, refs, bytenr);
+ "for bytenr %Lu", refs_to_drop, refs, bytenr);
ret = -EINVAL;
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
@@ -5788,6 +6023,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
refs -= refs_to_drop;
if (refs > 0) {
+ type = BTRFS_QGROUP_OPER_SUB_SHARED;
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, ei);
/*
@@ -5803,7 +6039,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (found_extent) {
ret = remove_extent_backref(trans, extent_root, path,
iref, refs_to_drop,
- is_data);
+ is_data, &last_ref);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
@@ -5824,6 +6060,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
+ last_ref = 1;
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
if (ret) {
@@ -5846,6 +6083,20 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
goto out;
}
}
+ btrfs_release_path(path);
+
+ /* Deal with the quota accounting */
+ if (!ret && last_ref && !no_quota) {
+ int mod_seq = 0;
+
+ if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
+ type == BTRFS_QGROUP_OPER_SUB_SHARED)
+ mod_seq = 1;
+
+ ret = btrfs_qgroup_record_ref(trans, info, root_objectid,
+ bytenr, num_bytes, type,
+ mod_seq);
+ }
out:
btrfs_free_path(path);
return ret;
@@ -5862,24 +6113,16 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
{
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_delayed_ref_node *ref;
- struct rb_node *node;
int ret = 0;
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
head = btrfs_find_delayed_ref_head(trans, bytenr);
if (!head)
- goto out;
+ goto out_delayed_unlock;
- node = rb_prev(&head->node.rb_node);
- if (!node)
- goto out;
-
- ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-
- /* there are still entries for this ref, we can't drop it */
- if (ref->bytenr == bytenr)
+ spin_lock(&head->lock);
+ if (rb_first(&head->ref_root))
goto out;
if (head->extent_op) {
@@ -5901,19 +6144,19 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
* ahead and process it.
*/
head->node.in_tree = 0;
- rb_erase(&head->node.rb_node, &delayed_refs->root);
+ rb_erase(&head->href_node, &delayed_refs->href_root);
- delayed_refs->num_entries--;
+ atomic_dec(&delayed_refs->num_entries);
/*
* we don't take a ref on the node because we're removing it from the
* tree, so we just steal the ref the tree was holding.
*/
delayed_refs->num_heads--;
- if (list_empty(&head->cluster))
+ if (head->processing == 0)
delayed_refs->num_heads_ready--;
-
- list_del_init(&head->cluster);
+ head->processing = 0;
+ spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
BUG_ON(head->extent_op);
@@ -5924,6 +6167,9 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
btrfs_put_delayed_ref(&head->node);
return ret;
out:
+ spin_unlock(&head->lock);
+
+out_delayed_unlock:
spin_unlock(&delayed_refs->lock);
return 0;
}
@@ -5966,7 +6212,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
btrfs_add_free_space(cache, buf->start, buf->len);
- btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
+ btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
+ trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
pin = 0;
}
out:
@@ -5986,11 +6233,15 @@ out:
/* Can return -ENOMEM */
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int for_cow)
+ u64 owner, u64 offset, int no_quota)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return 0;
+#endif
add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
/*
@@ -6006,13 +6257,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_DROP_DELAYED_REF, NULL, for_cow);
+ BTRFS_DROP_DELAYED_REF, NULL, no_quota);
} else {
ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, owner,
offset, BTRFS_DROP_DELAYED_REF,
- NULL, for_cow);
+ NULL, no_quota);
}
return ret;
}
@@ -6090,11 +6341,29 @@ int __get_raid_index(u64 flags)
return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
}
-static int get_block_group_index(struct btrfs_block_group_cache *cache)
+int get_block_group_index(struct btrfs_block_group_cache *cache)
{
return __get_raid_index(cache->flags);
}
+static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
+ [BTRFS_RAID_RAID10] = "raid10",
+ [BTRFS_RAID_RAID1] = "raid1",
+ [BTRFS_RAID_DUP] = "dup",
+ [BTRFS_RAID_RAID0] = "raid0",
+ [BTRFS_RAID_SINGLE] = "single",
+ [BTRFS_RAID_RAID5] = "raid5",
+ [BTRFS_RAID_RAID6] = "raid6",
+};
+
+static const char *get_raid_name(enum btrfs_raid_types type)
+{
+ if (type >= BTRFS_NR_RAID_TYPES)
+ return NULL;
+
+ return btrfs_raid_type_names[type];
+}
+
enum btrfs_loop_type {
LOOP_CACHING_NOWAIT = 0,
LOOP_CACHING_WAIT = 1,
@@ -6102,6 +6371,70 @@ enum btrfs_loop_type {
LOOP_NO_EMPTY_SIZE = 3,
};
+static inline void
+btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
+ int delalloc)
+{
+ if (delalloc)
+ down_read(&cache->data_rwsem);
+}
+
+static inline void
+btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
+ int delalloc)
+{
+ btrfs_get_block_group(cache);
+ if (delalloc)
+ down_read(&cache->data_rwsem);
+}
+
+static struct btrfs_block_group_cache *
+btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
+ struct btrfs_free_cluster *cluster,
+ int delalloc)
+{
+ struct btrfs_block_group_cache *used_bg;
+ bool locked = false;
+again:
+ spin_lock(&cluster->refill_lock);
+ if (locked) {
+ if (used_bg == cluster->block_group)
+ return used_bg;
+
+ up_read(&used_bg->data_rwsem);
+ btrfs_put_block_group(used_bg);
+ }
+
+ used_bg = cluster->block_group;
+ if (!used_bg)
+ return NULL;
+
+ if (used_bg == block_group)
+ return used_bg;
+
+ btrfs_get_block_group(used_bg);
+
+ if (!delalloc)
+ return used_bg;
+
+ if (down_read_trylock(&used_bg->data_rwsem))
+ return used_bg;
+
+ spin_unlock(&cluster->refill_lock);
+ down_read(&used_bg->data_rwsem);
+ locked = true;
+ goto again;
+}
+
+static inline void
+btrfs_release_block_group(struct btrfs_block_group_cache *cache,
+ int delalloc)
+{
+ if (delalloc)
+ up_read(&cache->data_rwsem);
+ btrfs_put_block_group(cache);
+}
+
/*
* walks the btree of allocated extents and find a hole of a given size.
* The key ins is changed to record the hole:
@@ -6116,13 +6449,12 @@ enum btrfs_loop_type {
static noinline int find_free_extent(struct btrfs_root *orig_root,
u64 num_bytes, u64 empty_size,
u64 hint_byte, struct btrfs_key *ins,
- u64 flags)
+ u64 flags, int delalloc)
{
int ret = 0;
struct btrfs_root *root = orig_root->fs_info->extent_root;
struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group_cache *block_group = NULL;
- struct btrfs_block_group_cache *used_block_group;
u64 search_start = 0;
u64 max_extent_size = 0;
int empty_cluster = 2 * 1024 * 1024;
@@ -6131,7 +6463,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
int index = __get_raid_index(flags);
int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
- bool found_uncached_bg = false;
bool failed_cluster_refill = false;
bool failed_alloc = false;
bool use_cluster = true;
@@ -6184,7 +6515,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
if (search_start == hint_byte) {
block_group = btrfs_lookup_block_group(root->fs_info,
search_start);
- used_block_group = block_group;
/*
* we don't want to use the block group if it doesn't match our
* allocation bits, or if its not cached.
@@ -6207,6 +6537,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
up_read(&space_info->groups_sem);
} else {
index = get_block_group_index(block_group);
+ btrfs_lock_block_group(block_group, delalloc);
goto have_block_group;
}
} else if (block_group) {
@@ -6221,8 +6552,7 @@ search:
u64 offset;
int cached;
- used_block_group = block_group;
- btrfs_get_block_group(block_group);
+ btrfs_grab_block_group(block_group, delalloc);
search_start = block_group->key.objectid;
/*
@@ -6249,7 +6579,6 @@ search:
have_block_group:
cached = block_group_cache_done(block_group);
if (unlikely(!cached)) {
- found_uncached_bg = true;
ret = cache_block_group(block_group, 0);
BUG_ON(ret < 0);
ret = 0;
@@ -6265,23 +6594,22 @@ have_block_group:
* lets look there
*/
if (last_ptr) {
+ struct btrfs_block_group_cache *used_block_group;
unsigned long aligned_cluster;
/*
* the refill lock keeps out other
* people trying to start a new cluster
*/
- spin_lock(&last_ptr->refill_lock);
- used_block_group = last_ptr->block_group;
- if (used_block_group != block_group &&
- (!used_block_group ||
- used_block_group->ro ||
- !block_group_bits(used_block_group, flags))) {
- used_block_group = block_group;
+ used_block_group = btrfs_lock_cluster(block_group,
+ last_ptr,
+ delalloc);
+ if (!used_block_group)
goto refill_cluster;
- }
- if (used_block_group != block_group)
- btrfs_get_block_group(used_block_group);
+ if (used_block_group != block_group &&
+ (used_block_group->ro ||
+ !block_group_bits(used_block_group, flags)))
+ goto release_cluster;
offset = btrfs_alloc_from_cluster(used_block_group,
last_ptr,
@@ -6292,17 +6620,18 @@ have_block_group:
/* we have a block, we're done */
spin_unlock(&last_ptr->refill_lock);
trace_btrfs_reserve_extent_cluster(root,
- block_group, search_start, num_bytes);
+ used_block_group,
+ search_start, num_bytes);
+ if (used_block_group != block_group) {
+ btrfs_release_block_group(block_group,
+ delalloc);
+ block_group = used_block_group;
+ }
goto checks;
}
WARN_ON(last_ptr->block_group != used_block_group);
- if (used_block_group != block_group) {
- btrfs_put_block_group(used_block_group);
- used_block_group = block_group;
- }
-refill_cluster:
- BUG_ON(used_block_group != block_group);
+release_cluster:
/* If we are on LOOP_NO_EMPTY_SIZE, we can't
* set up a new clusters, so lets just skip it
* and let the allocator find whatever block
@@ -6319,8 +6648,10 @@ refill_cluster:
* succeeding in the unclustered
* allocation. */
if (loop >= LOOP_NO_EMPTY_SIZE &&
- last_ptr->block_group != block_group) {
+ used_block_group != block_group) {
spin_unlock(&last_ptr->refill_lock);
+ btrfs_release_block_group(used_block_group,
+ delalloc);
goto unclustered_alloc;
}
@@ -6330,6 +6661,10 @@ refill_cluster:
*/
btrfs_return_cluster_to_free_space(NULL, last_ptr);
+ if (used_block_group != block_group)
+ btrfs_release_block_group(used_block_group,
+ delalloc);
+refill_cluster:
if (loop >= LOOP_NO_EMPTY_SIZE) {
spin_unlock(&last_ptr->refill_lock);
goto unclustered_alloc;
@@ -6421,25 +6756,25 @@ unclustered_alloc:
goto loop;
}
checks:
- search_start = stripe_align(root, used_block_group,
+ search_start = stripe_align(root, block_group,
offset, num_bytes);
/* move on to the next group */
if (search_start + num_bytes >
- used_block_group->key.objectid + used_block_group->key.offset) {
- btrfs_add_free_space(used_block_group, offset, num_bytes);
+ block_group->key.objectid + block_group->key.offset) {
+ btrfs_add_free_space(block_group, offset, num_bytes);
goto loop;
}
if (offset < search_start)
- btrfs_add_free_space(used_block_group, offset,
+ btrfs_add_free_space(block_group, offset,
search_start - offset);
BUG_ON(offset > search_start);
- ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
- alloc_type);
+ ret = btrfs_update_reserved_bytes(block_group, num_bytes,
+ alloc_type, delalloc);
if (ret == -EAGAIN) {
- btrfs_add_free_space(used_block_group, offset, num_bytes);
+ btrfs_add_free_space(block_group, offset, num_bytes);
goto loop;
}
@@ -6449,17 +6784,13 @@ checks:
trace_btrfs_reserve_extent(orig_root, block_group,
search_start, num_bytes);
- if (used_block_group != block_group)
- btrfs_put_block_group(used_block_group);
- btrfs_put_block_group(block_group);
+ btrfs_release_block_group(block_group, delalloc);
break;
loop:
failed_cluster_refill = false;
failed_alloc = false;
BUG_ON(index != get_block_group_index(block_group));
- if (used_block_group != block_group)
- btrfs_put_block_group(used_block_group);
- btrfs_put_block_group(block_group);
+ btrfs_release_block_group(block_group, delalloc);
}
up_read(&space_info->groups_sem);
@@ -6482,8 +6813,14 @@ loop:
loop++;
if (loop == LOOP_ALLOC_CHUNK) {
struct btrfs_trans_handle *trans;
+ int exist = 0;
+
+ trans = current->journal_info;
+ if (trans)
+ exist = 1;
+ else
+ trans = btrfs_join_transaction(root);
- trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
@@ -6500,7 +6837,8 @@ loop:
root, ret);
else
ret = 0;
- btrfs_end_transaction(trans, root);
+ if (!exist)
+ btrfs_end_transaction(trans, root);
if (ret)
goto out;
}
@@ -6529,12 +6867,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int index = 0;
spin_lock(&info->lock);
- printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
+ printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
info->flags,
info->total_bytes - info->bytes_used - info->bytes_pinned -
info->bytes_reserved - info->bytes_readonly,
(info->full) ? "" : "not ");
- printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
+ printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
"reserved=%llu, may_use=%llu, readonly=%llu\n",
info->total_bytes, info->bytes_used, info->bytes_pinned,
info->bytes_reserved, info->bytes_may_use,
@@ -6548,7 +6886,9 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
again:
list_for_each_entry(cache, &info->block_groups[index], list) {
spin_lock(&cache->lock);
- printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
+ printk(KERN_INFO "BTRFS: "
+ "block group %llu has %llu bytes, "
+ "%llu used %llu pinned %llu reserved %s\n",
cache->key.objectid, cache->key.offset,
btrfs_block_group_used(&cache->item), cache->pinned,
cache->reserved, cache->ro ? "[readonly]" : "");
@@ -6563,7 +6903,7 @@ again:
int btrfs_reserve_extent(struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
- struct btrfs_key *ins, int is_data)
+ struct btrfs_key *ins, int is_data, int delalloc)
{
bool final_tried = false;
u64 flags;
@@ -6573,7 +6913,7 @@ int btrfs_reserve_extent(struct btrfs_root *root,
again:
WARN_ON(num_bytes < root->sectorsize);
ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
- flags);
+ flags, delalloc);
if (ret == -ENOSPC) {
if (!final_tried && ins->offset) {
@@ -6594,13 +6934,12 @@ again:
}
}
- trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
-
return ret;
}
static int __btrfs_free_reserved_extent(struct btrfs_root *root,
- u64 start, u64 len, int pin)
+ u64 start, u64 len,
+ int pin, int delalloc)
{
struct btrfs_block_group_cache *cache;
int ret = 0;
@@ -6619,7 +6958,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
pin_down_extent(root, cache, start, len, 1);
else {
btrfs_add_free_space(cache, start, len);
- btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
+ btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
}
btrfs_put_block_group(cache);
@@ -6629,15 +6968,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
}
int btrfs_free_reserved_extent(struct btrfs_root *root,
- u64 start, u64 len)
+ u64 start, u64 len, int delalloc)
{
- return __btrfs_free_reserved_extent(root, start, len, 0);
+ return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
}
int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
u64 start, u64 len)
{
- return __btrfs_free_reserved_extent(root, start, len, 1);
+ return __btrfs_free_reserved_extent(root, start, len, 1, 0);
}
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
@@ -6701,12 +7040,20 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
+ /* Always set parent to 0 here since its exclusive anyway. */
+ ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+ ins->objectid, ins->offset,
+ BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ if (ret)
+ return ret;
+
ret = update_block_group(root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
BUG();
}
+ trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
return ret;
}
@@ -6714,7 +7061,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
- int level, struct btrfs_key *ins)
+ int level, struct btrfs_key *ins,
+ int no_quota)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -6724,6 +7072,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct extent_buffer *leaf;
u32 size = sizeof(*extent_item) + sizeof(*iref);
+ u64 num_bytes = ins->offset;
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
@@ -6731,13 +7080,18 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
size += sizeof(*block_info);
path = btrfs_alloc_path();
- if (!path)
+ if (!path) {
+ btrfs_free_and_pin_reserved_extent(root, ins->objectid,
+ root->leafsize);
return -ENOMEM;
+ }
path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
ins, size);
if (ret) {
+ btrfs_free_and_pin_reserved_extent(root, ins->objectid,
+ root->leafsize);
btrfs_free_path(path);
return ret;
}
@@ -6752,6 +7106,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
if (skinny_metadata) {
iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+ num_bytes = root->leafsize;
} else {
block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
btrfs_set_tree_block_key(leaf, block_info, key);
@@ -6773,12 +7128,22 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_free_path(path);
+ if (!no_quota) {
+ ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
+ ins->objectid, num_bytes,
+ BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ if (ret)
+ return ret;
+ }
+
ret = update_block_group(root, ins->objectid, root->leafsize, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
BUG();
}
+
+ trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize);
return ret;
}
@@ -6826,7 +7191,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
return -EINVAL;
ret = btrfs_update_reserved_bytes(block_group, ins->offset,
- RESERVE_ALLOC_NO_ACCOUNT);
+ RESERVE_ALLOC_NO_ACCOUNT, 0);
BUG_ON(ret); /* logic error */
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
@@ -6905,7 +7270,7 @@ again:
/*DEFAULT_RATELIMIT_BURST*/ 1);
if (__ratelimit(&_rs))
WARN(1, KERN_DEBUG
- "btrfs: block rsv returned %d\n", ret);
+ "BTRFS: block rsv returned %d\n", ret);
}
try_reserve:
ret = reserve_metadata_bytes(root, block_rsv, blocksize,
@@ -6954,12 +7319,21 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) {
+ buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
+ blocksize, level);
+ if (!IS_ERR(buf))
+ root->alloc_bytenr += blocksize;
+ return buf;
+ }
+#endif
block_rsv = use_block_rsv(trans, root, blocksize);
if (IS_ERR(block_rsv))
return ERR_CAST(block_rsv);
ret = btrfs_reserve_extent(root, blocksize, blocksize,
- empty_size, hint, &ins, 0);
+ empty_size, hint, &ins, 0, 0);
if (ret) {
unuse_block_rsv(root->fs_info, block_rsv, blocksize);
return ERR_PTR(ret);
@@ -7653,7 +8027,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
btrfs_end_transaction_throttle(trans, tree_root);
if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
- pr_debug("btrfs: drop snapshot early exit\n");
+ pr_debug("BTRFS: drop snapshot early exit\n");
err = -EAGAIN;
goto out_free;
}
@@ -7695,7 +8069,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
}
- if (root->in_radix) {
+ if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
} else {
free_extent_buffer(root->node);
@@ -7718,7 +8092,7 @@ out:
*/
if (!for_reloc && root_dropped == false)
btrfs_add_dead_root(root);
- if (err)
+ if (err && err != -EAGAIN)
btrfs_std_error(root->fs_info, err);
return err;
}
@@ -7983,7 +8357,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
spin_lock(&sinfo->lock);
- for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
if (!list_empty(&sinfo->block_groups[i]))
free_bytes += __btrfs_get_ro_block_group_free_space(
&sinfo->block_groups[i]);
@@ -8222,14 +8596,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
- down_write(&info->extent_commit_sem);
+ down_write(&info->commit_root_sem);
while (!list_empty(&info->caching_block_groups)) {
caching_ctl = list_entry(info->caching_block_groups.next,
struct btrfs_caching_control, list);
list_del(&caching_ctl->list);
put_caching_control(caching_ctl);
}
- up_write(&info->extent_commit_sem);
+ up_write(&info->commit_root_sem);
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
@@ -8271,21 +8645,31 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
release_global_block_rsv(info);
- while(!list_empty(&info->space_info)) {
+ while (!list_empty(&info->space_info)) {
+ int i;
+
space_info = list_entry(info->space_info.next,
struct btrfs_space_info,
list);
if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
- if (space_info->bytes_pinned > 0 ||
+ if (WARN_ON(space_info->bytes_pinned > 0 ||
space_info->bytes_reserved > 0 ||
- space_info->bytes_may_use > 0) {
- WARN_ON(1);
+ space_info->bytes_may_use > 0)) {
dump_space_info(space_info, 0, 0);
}
}
- percpu_counter_destroy(&space_info->total_bytes_pinned);
list_del(&space_info->list);
- kfree(space_info);
+ for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ struct kobject *kobj;
+ kobj = space_info->block_group_kobjs[i];
+ space_info->block_group_kobjs[i] = NULL;
+ if (kobj) {
+ kobject_del(kobj);
+ kobject_put(kobj);
+ }
+ }
+ kobject_del(&space_info->kobj);
+ kobject_put(&space_info->kobj);
}
return 0;
}
@@ -8294,10 +8678,71 @@ static void __link_block_group(struct btrfs_space_info *space_info,
struct btrfs_block_group_cache *cache)
{
int index = get_block_group_index(cache);
+ bool first = false;
down_write(&space_info->groups_sem);
+ if (list_empty(&space_info->block_groups[index]))
+ first = true;
list_add_tail(&cache->list, &space_info->block_groups[index]);
up_write(&space_info->groups_sem);
+
+ if (first) {
+ struct raid_kobject *rkobj;
+ int ret;
+
+ rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
+ if (!rkobj)
+ goto out_err;
+ rkobj->raid_type = index;
+ kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
+ ret = kobject_add(&rkobj->kobj, &space_info->kobj,
+ "%s", get_raid_name(index));
+ if (ret) {
+ kobject_put(&rkobj->kobj);
+ goto out_err;
+ }
+ space_info->block_group_kobjs[index] = &rkobj->kobj;
+ }
+
+ return;
+out_err:
+ pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
+}
+
+static struct btrfs_block_group_cache *
+btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = kzalloc(sizeof(*cache), GFP_NOFS);
+ if (!cache)
+ return NULL;
+
+ cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+ GFP_NOFS);
+ if (!cache->free_space_ctl) {
+ kfree(cache);
+ return NULL;
+ }
+
+ cache->key.objectid = start;
+ cache->key.offset = size;
+ cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+
+ cache->sectorsize = root->sectorsize;
+ cache->fs_info = root->fs_info;
+ cache->full_stripe_len = btrfs_full_stripe_len(root,
+ &root->fs_info->mapping_tree,
+ start);
+ atomic_set(&cache->count, 1);
+ spin_lock_init(&cache->lock);
+ init_rwsem(&cache->data_rwsem);
+ INIT_LIST_HEAD(&cache->list);
+ INIT_LIST_HEAD(&cache->cluster_list);
+ INIT_LIST_HEAD(&cache->new_bg_list);
+ btrfs_init_free_space_ctl(cache);
+
+ return cache;
}
int btrfs_read_block_groups(struct btrfs_root *root)
@@ -8335,26 +8780,16 @@ int btrfs_read_block_groups(struct btrfs_root *root)
break;
if (ret != 0)
goto error;
+
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- cache = kzalloc(sizeof(*cache), GFP_NOFS);
+
+ cache = btrfs_create_block_group_cache(root, found_key.objectid,
+ found_key.offset);
if (!cache) {
ret = -ENOMEM;
goto error;
}
- cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
- GFP_NOFS);
- if (!cache->free_space_ctl) {
- kfree(cache);
- ret = -ENOMEM;
- goto error;
- }
-
- atomic_set(&cache->count, 1);
- spin_lock_init(&cache->lock);
- cache->fs_info = info;
- INIT_LIST_HEAD(&cache->list);
- INIT_LIST_HEAD(&cache->cluster_list);
if (need_clear) {
/*
@@ -8375,16 +8810,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
read_extent_buffer(leaf, &cache->item,
btrfs_item_ptr_offset(leaf, path->slots[0]),
sizeof(cache->item));
- memcpy(&cache->key, &found_key, sizeof(found_key));
+ cache->flags = btrfs_block_group_flags(&cache->item);
key.objectid = found_key.objectid + found_key.offset;
btrfs_release_path(path);
- cache->flags = btrfs_block_group_flags(&cache->item);
- cache->sectorsize = root->sectorsize;
- cache->full_stripe_len = btrfs_full_stripe_len(root,
- &root->fs_info->mapping_tree,
- found_key.objectid);
- btrfs_init_free_space_ctl(cache);
/*
* We need to exclude the super stripes now so that the space
@@ -8398,8 +8827,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
* case.
*/
free_excluded_extents(root, cache);
- kfree(cache->free_space_ctl);
- kfree(cache);
+ btrfs_put_block_group(cache);
goto error;
}
@@ -8528,40 +8956,17 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
extent_root = root->fs_info->extent_root;
- root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
- cache = kzalloc(sizeof(*cache), GFP_NOFS);
+ cache = btrfs_create_block_group_cache(root, chunk_offset, size);
if (!cache)
return -ENOMEM;
- cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
- GFP_NOFS);
- if (!cache->free_space_ctl) {
- kfree(cache);
- return -ENOMEM;
- }
-
- cache->key.objectid = chunk_offset;
- cache->key.offset = size;
- cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
- cache->sectorsize = root->sectorsize;
- cache->fs_info = root->fs_info;
- cache->full_stripe_len = btrfs_full_stripe_len(root,
- &root->fs_info->mapping_tree,
- chunk_offset);
-
- atomic_set(&cache->count, 1);
- spin_lock_init(&cache->lock);
- INIT_LIST_HEAD(&cache->list);
- INIT_LIST_HEAD(&cache->cluster_list);
- INIT_LIST_HEAD(&cache->new_bg_list);
-
- btrfs_init_free_space_ctl(cache);
btrfs_set_block_group_used(&cache->item, bytes_used);
btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
- cache->flags = type;
btrfs_set_block_group_flags(&cache->item, type);
+ cache->flags = type;
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
ret = exclude_super_stripes(root, cache);
@@ -8571,8 +8976,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
* case.
*/
free_excluded_extents(root, cache);
- kfree(cache->free_space_ctl);
- kfree(cache);
+ btrfs_put_block_group(cache);
return ret;
}
@@ -8638,6 +9042,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root = root->fs_info->tree_root;
struct btrfs_key key;
struct inode *inode;
+ struct kobject *kobj = NULL;
int ret;
int index;
int factor;
@@ -8736,9 +9141,16 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* are still on the list after taking the semaphore
*/
list_del_init(&block_group->list);
- if (list_empty(&block_group->space_info->block_groups[index]))
+ if (list_empty(&block_group->space_info->block_groups[index])) {
+ kobj = block_group->space_info->block_group_kobjs[index];
+ block_group->space_info->block_group_kobjs[index] = NULL;
clear_avail_alloc_bits(root->fs_info, block_group->flags);
+ }
up_write(&block_group->space_info->groups_sem);
+ if (kobj) {
+ kobject_del(kobj);
+ kobject_put(kobj);
+ }
if (block_group->cached == BTRFS_CACHE_STARTED)
wait_block_group_cache_done(block_group);
@@ -8880,3 +9292,38 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
range->len = trimmed;
return ret;
}
+
+/*
+ * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(),
+ * they are used to prevent the some tasks writing data into the page cache
+ * by nocow before the subvolume is snapshoted, but flush the data into
+ * the disk after the snapshot creation.
+ */
+void btrfs_end_nocow_write(struct btrfs_root *root)
+{
+ percpu_counter_dec(&root->subv_writers->counter);
+ /*
+ * Make sure counter is updated before we wake up
+ * waiters.
+ */
+ smp_mb();
+ if (waitqueue_active(&root->subv_writers->wait))
+ wake_up(&root->subv_writers->wait);
+}
+
+int btrfs_start_nocow_write(struct btrfs_root *root)
+{
+ if (unlikely(atomic_read(&root->will_be_snapshoted)))
+ return 0;
+
+ percpu_counter_inc(&root->subv_writers->counter);
+ /*
+ * Make sure counter is updated before we check for snapshot creation.
+ */
+ smp_mb();
+ if (unlikely(atomic_read(&root->will_be_snapshoted))) {
+ btrfs_end_nocow_write(root);
+ return 0;
+ }
+ return 1;
+}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 22bda32acb8..a389820d158 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -13,13 +13,13 @@
#include <linux/cleancache.h>
#include "extent_io.h"
#include "extent_map.h"
-#include "compat.h"
#include "ctree.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "check-integrity.h"
#include "locking.h"
#include "rcu-string.h"
+#include "backref.h"
static struct kmem_cache *extent_state_cache;
static struct kmem_cache *extent_buffer_cache;
@@ -59,7 +59,7 @@ void btrfs_leak_debug_check(void)
while (!list_empty(&states)) {
state = list_entry(states.next, struct extent_state, leak_list);
- printk(KERN_ERR "btrfs state leak: start %llu end %llu "
+ printk(KERN_ERR "BTRFS: state leak: start %llu end %llu "
"state %lu in tree %p refs %d\n",
state->start, state->end, state->state, state->tree,
atomic_read(&state->refs));
@@ -69,7 +69,7 @@ void btrfs_leak_debug_check(void)
while (!list_empty(&buffers)) {
eb = list_entry(buffers.next, struct extent_buffer, leak_list);
- printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
+ printk(KERN_ERR "BTRFS: buffer leak start %llu len %lu "
"refs %d\n",
eb->start, eb->len, atomic_read(&eb->refs));
list_del(&eb->leak_list);
@@ -77,16 +77,22 @@ void btrfs_leak_debug_check(void)
}
}
-#define btrfs_debug_check_extent_io_range(inode, start, end) \
- __btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end))
+#define btrfs_debug_check_extent_io_range(tree, start, end) \
+ __btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
static inline void __btrfs_debug_check_extent_io_range(const char *caller,
- struct inode *inode, u64 start, u64 end)
+ struct extent_io_tree *tree, u64 start, u64 end)
{
- u64 isize = i_size_read(inode);
+ struct inode *inode;
+ u64 isize;
+
+ if (!tree->mapping)
+ return;
+ inode = tree->mapping->host;
+ isize = i_size_read(inode);
if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
printk_ratelimited(KERN_DEBUG
- "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
+ "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
caller, btrfs_ino(inode), isize, start, end);
}
}
@@ -124,6 +130,8 @@ static noinline void flush_write_bio(void *data);
static inline struct btrfs_fs_info *
tree_fs_info(struct extent_io_tree *tree)
{
+ if (!tree->mapping)
+ return NULL;
return btrfs_sb(tree->mapping->host->i_sb);
}
@@ -186,11 +194,9 @@ void extent_io_tree_init(struct extent_io_tree *tree,
struct address_space *mapping)
{
tree->state = RB_ROOT;
- INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
tree->ops = NULL;
tree->dirty_bytes = 0;
spin_lock_init(&tree->lock);
- spin_lock_init(&tree->buffer_lock);
tree->mapping = mapping;
}
@@ -223,13 +229,24 @@ void free_extent_state(struct extent_state *state)
}
}
-static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
- struct rb_node *node)
+static struct rb_node *tree_insert(struct rb_root *root,
+ struct rb_node *search_start,
+ u64 offset,
+ struct rb_node *node,
+ struct rb_node ***p_in,
+ struct rb_node **parent_in)
{
- struct rb_node **p = &root->rb_node;
+ struct rb_node **p;
struct rb_node *parent = NULL;
struct tree_entry *entry;
+ if (p_in && parent_in) {
+ p = *p_in;
+ parent = *parent_in;
+ goto do_insert;
+ }
+
+ p = search_start ? &search_start : &root->rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct tree_entry, rb_node);
@@ -242,35 +259,43 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
return parent;
}
+do_insert:
rb_link_node(node, parent, p);
rb_insert_color(node, root);
return NULL;
}
static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
- struct rb_node **prev_ret,
- struct rb_node **next_ret)
+ struct rb_node **prev_ret,
+ struct rb_node **next_ret,
+ struct rb_node ***p_ret,
+ struct rb_node **parent_ret)
{
struct rb_root *root = &tree->state;
- struct rb_node *n = root->rb_node;
+ struct rb_node **n = &root->rb_node;
struct rb_node *prev = NULL;
struct rb_node *orig_prev = NULL;
struct tree_entry *entry;
struct tree_entry *prev_entry = NULL;
- while (n) {
- entry = rb_entry(n, struct tree_entry, rb_node);
- prev = n;
+ while (*n) {
+ prev = *n;
+ entry = rb_entry(prev, struct tree_entry, rb_node);
prev_entry = entry;
if (offset < entry->start)
- n = n->rb_left;
+ n = &(*n)->rb_left;
else if (offset > entry->end)
- n = n->rb_right;
+ n = &(*n)->rb_right;
else
- return n;
+ return *n;
}
+ if (p_ret)
+ *p_ret = n;
+ if (parent_ret)
+ *parent_ret = prev;
+
if (prev_ret) {
orig_prev = prev;
while (prev && offset > prev_entry->end) {
@@ -292,18 +317,27 @@ static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
return NULL;
}
-static inline struct rb_node *tree_search(struct extent_io_tree *tree,
- u64 offset)
+static inline struct rb_node *
+tree_search_for_insert(struct extent_io_tree *tree,
+ u64 offset,
+ struct rb_node ***p_ret,
+ struct rb_node **parent_ret)
{
struct rb_node *prev = NULL;
struct rb_node *ret;
- ret = __etree_search(tree, offset, &prev, NULL);
+ ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
if (!ret)
return prev;
return ret;
}
+static inline struct rb_node *tree_search(struct extent_io_tree *tree,
+ u64 offset)
+{
+ return tree_search_for_insert(tree, offset, NULL, NULL);
+}
+
static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
struct extent_state *other)
{
@@ -385,23 +419,25 @@ static void set_state_bits(struct extent_io_tree *tree,
*/
static int insert_state(struct extent_io_tree *tree,
struct extent_state *state, u64 start, u64 end,
+ struct rb_node ***p,
+ struct rb_node **parent,
unsigned long *bits)
{
struct rb_node *node;
if (end < start)
- WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",
+ WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n",
end, start);
state->start = start;
state->end = end;
set_state_bits(tree, state, bits);
- node = tree_insert(&tree->state, end, &state->rb_node);
+ node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
if (node) {
struct extent_state *found;
found = rb_entry(node, struct extent_state, rb_node);
- printk(KERN_ERR "btrfs found node %llu %llu on insert of "
+ printk(KERN_ERR "BTRFS: found node %llu %llu on insert of "
"%llu %llu\n",
found->start, found->end, start, end);
return -EEXIST;
@@ -444,7 +480,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
prealloc->state = orig->state;
orig->start = split;
- node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
+ node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
+ &prealloc->rb_node, NULL, NULL);
if (node) {
free_extent_state(prealloc);
return -EEXIST;
@@ -542,7 +579,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int err;
int clear = 0;
- btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+ btrfs_debug_check_extent_io_range(tree, start, end);
if (bits & EXTENT_DELALLOC)
bits |= EXTENT_NORESERVE;
@@ -702,7 +739,7 @@ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state *state;
struct rb_node *node;
- btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+ btrfs_debug_check_extent_io_range(tree, start, end);
spin_lock(&tree->lock);
again:
@@ -712,6 +749,7 @@ again:
* our range starts
*/
node = tree_search(tree, start);
+process_node:
if (!node)
break;
@@ -732,7 +770,10 @@ again:
if (start > end)
break;
- cond_resched_lock(&tree->lock);
+ if (!cond_resched_lock(&tree->lock)) {
+ node = rb_next(node);
+ goto process_node;
+ }
}
out:
spin_unlock(&tree->lock);
@@ -783,11 +824,13 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state *state;
struct extent_state *prealloc = NULL;
struct rb_node *node;
+ struct rb_node **p;
+ struct rb_node *parent;
int err = 0;
u64 last_start;
u64 last_end;
- btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+ btrfs_debug_check_extent_io_range(tree, start, end);
bits |= EXTENT_FIRST_DELALLOC;
again:
@@ -809,14 +852,16 @@ again:
* this search will find all the extents that end after
* our range starts.
*/
- node = tree_search(tree, start);
+ node = tree_search_for_insert(tree, start, &p, &parent);
if (!node) {
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
- err = insert_state(tree, prealloc, start, end, &bits);
+ err = insert_state(tree, prealloc, start, end,
+ &p, &parent, &bits);
if (err)
extent_io_tree_panic(tree, err);
+ cache_state(prealloc, cached_state);
prealloc = NULL;
goto out;
}
@@ -919,7 +964,7 @@ hit_next:
* the later extent.
*/
err = insert_state(tree, prealloc, start, this_end,
- &bits);
+ NULL, NULL, &bits);
if (err)
extent_io_tree_panic(tree, err);
@@ -1005,11 +1050,13 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state *state;
struct extent_state *prealloc = NULL;
struct rb_node *node;
+ struct rb_node **p;
+ struct rb_node *parent;
int err = 0;
u64 last_start;
u64 last_end;
- btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+ btrfs_debug_check_extent_io_range(tree, start, end);
again:
if (!prealloc && (mask & __GFP_WAIT)) {
@@ -1032,17 +1079,19 @@ again:
* this search will find all the extents that end after
* our range starts.
*/
- node = tree_search(tree, start);
+ node = tree_search_for_insert(tree, start, &p, &parent);
if (!node) {
prealloc = alloc_extent_state_atomic(prealloc);
if (!prealloc) {
err = -ENOMEM;
goto out;
}
- err = insert_state(tree, prealloc, start, end, &bits);
- prealloc = NULL;
+ err = insert_state(tree, prealloc, start, end,
+ &p, &parent, &bits);
if (err)
extent_io_tree_panic(tree, err);
+ cache_state(prealloc, cached_state);
+ prealloc = NULL;
goto out;
}
state = rb_entry(node, struct extent_state, rb_node);
@@ -1135,7 +1184,7 @@ hit_next:
* the later extent.
*/
err = insert_state(tree, prealloc, start, this_end,
- &bits);
+ NULL, NULL, &bits);
if (err)
extent_io_tree_panic(tree, err);
cache_state(prealloc, cached_state);
@@ -1490,10 +1539,8 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
cur_start = state->end + 1;
node = rb_next(node);
total_bytes += state->end - state->start + 1;
- if (total_bytes >= max_bytes) {
- *end = *start + max_bytes - 1;
+ if (total_bytes >= max_bytes)
break;
- }
if (!node)
break;
}
@@ -1599,11 +1646,10 @@ done:
*
* 1 is returned if we find something, 0 if nothing was in the tree
*/
-static noinline u64 find_lock_delalloc_range(struct inode *inode,
- struct extent_io_tree *tree,
- struct page *locked_page,
- u64 *start, u64 *end,
- u64 max_bytes)
+STATIC u64 find_lock_delalloc_range(struct inode *inode,
+ struct extent_io_tree *tree,
+ struct page *locked_page, u64 *start,
+ u64 *end, u64 max_bytes)
{
u64 delalloc_start;
u64 delalloc_end;
@@ -1635,10 +1681,9 @@ again:
/*
* make sure to limit the number of pages we try to lock down
- * if we're looping.
*/
- if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
- delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
+ if (delalloc_end + 1 - delalloc_start > max_bytes)
+ delalloc_end = delalloc_start + max_bytes - 1;
/* step two, lock all the pages after the page that has start */
ret = lock_delalloc_pages(inode, locked_page,
@@ -1648,9 +1693,9 @@ again:
* shortening the size of the delalloc range we're searching
*/
free_extent_state(cached_state);
+ cached_state = NULL;
if (!loops) {
- unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
- max_bytes = PAGE_CACHE_SIZE - offset;
+ max_bytes = PAGE_CACHE_SIZE;
loops = 1;
goto again;
} else {
@@ -1744,10 +1789,8 @@ u64 count_range_bits(struct extent_io_tree *tree,
u64 last = 0;
int found = 0;
- if (search_end <= cur_start) {
- WARN_ON(1);
+ if (WARN_ON(search_end <= cur_start))
return 0;
- }
spin_lock(&tree->lock);
if (cur_start == 0 && bits == EXTENT_DIRTY) {
@@ -1959,11 +2002,6 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
return err;
}
-static void repair_io_failure_callback(struct bio *bio, int err)
-{
- complete(bio->bi_private);
-}
-
/*
* this bypasses the standard btrfs submit functions deliberately, as
* the standard behavior is to write all copies in a raid setup. here we only
@@ -1980,13 +2018,13 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
{
struct bio *bio;
struct btrfs_device *dev;
- DECLARE_COMPLETION_ONSTACK(compl);
u64 map_length = 0;
u64 sector;
struct btrfs_bio *bbio = NULL;
struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
int ret;
+ ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
BUG_ON(!mirror_num);
/* we can't repair anything in raid56 yet */
@@ -1996,9 +2034,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
if (!bio)
return -EIO;
- bio->bi_private = &compl;
- bio->bi_end_io = repair_io_failure_callback;
- bio->bi_size = 0;
+ bio->bi_iter.bi_size = 0;
map_length = length;
ret = btrfs_map_block(fs_info, WRITE, logical,
@@ -2009,7 +2045,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
}
BUG_ON(mirror_num != bbio->mirror_num);
sector = bbio->stripes[mirror_num-1].physical >> 9;
- bio->bi_sector = sector;
+ bio->bi_iter.bi_sector = sector;
dev = bbio->stripes[mirror_num-1].dev;
kfree(bbio);
if (!dev || !dev->bdev || !dev->writeable) {
@@ -2018,19 +2054,18 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
}
bio->bi_bdev = dev->bdev;
bio_add_page(bio, page, length, start - page_offset(page));
- btrfsic_submit_bio(WRITE_SYNC, bio);
- wait_for_completion(&compl);
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+ if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
/* try to remap that extent elsewhere? */
bio_put(bio);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
return -EIO;
}
- printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu "
- "(dev %s sector %llu)\n", page->mapping->host->i_ino,
- start, rcu_str_deref(dev->name), sector);
+ printk_ratelimited_in_rcu(KERN_INFO
+ "BTRFS: read error corrected: ino %lu off %llu "
+ "(dev %s sector %llu)\n", page->mapping->host->i_ino,
+ start, rcu_str_deref(dev->name), sector);
bio_put(bio);
return 0;
@@ -2043,6 +2078,9 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
int ret = 0;
+ if (root->fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
for (i = 0; i < num_pages; i++) {
struct page *p = extent_buffer_page(eb, i);
ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
@@ -2064,12 +2102,12 @@ static int clean_io_failure(u64 start, struct page *page)
u64 private;
u64 private_failure;
struct io_failure_record *failrec;
- struct btrfs_fs_info *fs_info;
+ struct inode *inode = page->mapping->host;
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct extent_state *state;
int num_copies;
int did_repair = 0;
int ret;
- struct inode *inode = page->mapping->host;
private = 0;
ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
@@ -2092,6 +2130,8 @@ static int clean_io_failure(u64 start, struct page *page)
did_repair = 1;
goto out;
}
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ goto out;
spin_lock(&BTRFS_I(inode)->io_tree.lock);
state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
@@ -2101,7 +2141,6 @@ static int clean_io_failure(u64 start, struct page *page)
if (state && state->start <= failrec->start &&
state->end >= failrec->start + failrec->len - 1) {
- fs_info = BTRFS_I(inode)->root->fs_info;
num_copies = btrfs_num_copies(fs_info, failrec->logical,
failrec->len);
if (num_copies > 1) {
@@ -2168,7 +2207,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
return -EIO;
}
- if (em->start > start || em->start + em->len < start) {
+ if (em->start > start || em->start + em->len <= start) {
free_extent_map(em);
em = NULL;
}
@@ -2280,9 +2319,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
return -EIO;
}
bio->bi_end_io = failed_bio->bi_end_io;
- bio->bi_sector = failrec->logical >> 9;
+ bio->bi_iter.bi_sector = failrec->logical >> 9;
bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
- bio->bi_size = 0;
+ bio->bi_iter.bi_size = 0;
btrfs_failed_bio = btrfs_io_bio(failed_bio);
if (btrfs_failed_bio->csum) {
@@ -2315,7 +2354,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
{
int uptodate = (err == 0);
struct extent_io_tree *tree;
- int ret;
+ int ret = 0;
tree = &BTRFS_I(page->mapping->host)->io_tree;
@@ -2329,6 +2368,8 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
if (!uptodate) {
ClearPageUptodate(page);
SetPageError(page);
+ ret = ret < 0 ? ret : -EIO;
+ mapping_set_error(page->mapping, ret);
}
return 0;
}
@@ -2344,37 +2385,39 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
*/
static void end_bio_extent_writepage(struct bio *bio, int err)
{
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
- struct extent_io_tree *tree;
+ struct bio_vec *bvec;
u64 start;
u64 end;
+ int i;
- do {
+ bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
- tree = &BTRFS_I(page->mapping->host)->io_tree;
/* We always issue full-page reads, but if some block
* in a page fails to read, blk_update_request() will
* advance bv_offset and adjust bv_len to compensate.
* Print a warning for nonzero offsets, and an error
* if they don't add up to a full page. */
- if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE)
- printk("%s page write in btrfs with offset %u and length %u\n",
- bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE
- ? KERN_ERR "partial" : KERN_INFO "incomplete",
- bvec->bv_offset, bvec->bv_len);
+ if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
+ if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
+ btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
+ "partial page write in btrfs with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+ else
+ btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
+ "incomplete page write in btrfs with offset %u and "
+ "length %u",
+ bvec->bv_offset, bvec->bv_len);
+ }
start = page_offset(page);
end = start + bvec->bv_offset + bvec->bv_len - 1;
- if (--bvec >= bio->bi_io_vec)
- prefetchw(&bvec->bv_page->flags);
-
if (end_extent_writepage(page, err, start, end))
continue;
end_page_writeback(page);
- } while (bvec >= bio->bi_io_vec);
+ }
bio_put(bio);
}
@@ -2404,9 +2447,8 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
*/
static void end_bio_extent_readpage(struct bio *bio, int err)
{
+ struct bio_vec *bvec;
int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
- struct bio_vec *bvec = bio->bi_io_vec;
struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
struct extent_io_tree *tree;
u64 offset = 0;
@@ -2417,16 +2459,17 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
u64 extent_len = 0;
int mirror;
int ret;
+ int i;
if (err)
uptodate = 0;
- do {
+ bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
struct inode *inode = page->mapping->host;
pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
- "mirror=%lu\n", (u64)bio->bi_sector, err,
+ "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err,
io_bio->mirror_num);
tree = &BTRFS_I(inode)->io_tree;
@@ -2435,19 +2478,22 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
* advance bv_offset and adjust bv_len to compensate.
* Print a warning for nonzero offsets, and an error
* if they don't add up to a full page. */
- if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE)
- printk("%s page read in btrfs with offset %u and length %u\n",
- bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE
- ? KERN_ERR "partial" : KERN_INFO "incomplete",
- bvec->bv_offset, bvec->bv_len);
+ if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
+ if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
+ btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
+ "partial page read in btrfs with offset %u and length %u",
+ bvec->bv_offset, bvec->bv_len);
+ else
+ btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
+ "incomplete page read in btrfs with offset %u and "
+ "length %u",
+ bvec->bv_offset, bvec->bv_len);
+ }
start = page_offset(page);
end = start + bvec->bv_offset + bvec->bv_len - 1;
len = bvec->bv_len;
- if (++bvec <= bvec_end)
- prefetchw(&bvec->bv_page->flags);
-
mirror = io_bio->mirror_num;
if (likely(uptodate && tree->ops &&
tree->ops->readpage_end_io_hook)) {
@@ -2528,7 +2574,7 @@ readpage_ok:
extent_start = start;
extent_len = end + 1 - start;
}
- } while (bvec <= bvec_end);
+ }
if (extent_len)
endio_readpage_release_extent(tree, extent_start, extent_len,
@@ -2559,9 +2605,8 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
}
if (bio) {
- bio->bi_size = 0;
bio->bi_bdev = bdev;
- bio->bi_sector = first_sector;
+ bio->bi_iter.bi_sector = first_sector;
btrfs_bio = btrfs_io_bio(bio);
btrfs_bio->csum = NULL;
btrfs_bio->csum_allocated = NULL;
@@ -2655,7 +2700,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
if (bio_ret && *bio_ret) {
bio = *bio_ret;
if (old_compressed)
- contig = bio->bi_sector == sector;
+ contig = bio->bi_iter.bi_sector == sector;
else
contig = bio_end_sector(bio) == sector;
@@ -2722,7 +2767,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
if (em_cached && *em_cached) {
em = *em_cached;
- if (em->in_tree && start >= em->start &&
+ if (extent_map_in_tree(em) && start >= em->start &&
start < extent_map_end(em)) {
atomic_inc(&em->refs);
return em;
@@ -3056,143 +3101,130 @@ static noinline void update_nr_written(struct page *page,
}
/*
- * the writepage semantics are similar to regular writepage. extent
- * records are inserted to lock ranges in the tree, and as dirty areas
- * are found, they are marked writeback. Then the lock bits are removed
- * and the end_io handler clears the writeback ranges
+ * helper for __extent_writepage, doing all of the delayed allocation setup.
+ *
+ * This returns 1 if our fill_delalloc function did all the work required
+ * to write the page (copy into inline extent). In this case the IO has
+ * been started and the page is already unlocked.
+ *
+ * This returns 0 if all went well (page still locked)
+ * This returns < 0 if there were errors (page still locked)
*/
-static int __extent_writepage(struct page *page, struct writeback_control *wbc,
- void *data)
+static noinline_for_stack int writepage_delalloc(struct inode *inode,
+ struct page *page, struct writeback_control *wbc,
+ struct extent_page_data *epd,
+ u64 delalloc_start,
+ unsigned long *nr_written)
+{
+ struct extent_io_tree *tree = epd->tree;
+ u64 page_end = delalloc_start + PAGE_CACHE_SIZE - 1;
+ u64 nr_delalloc;
+ u64 delalloc_to_write = 0;
+ u64 delalloc_end = 0;
+ int ret;
+ int page_started = 0;
+
+ if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc)
+ return 0;
+
+ while (delalloc_end < page_end) {
+ nr_delalloc = find_lock_delalloc_range(inode, tree,
+ page,
+ &delalloc_start,
+ &delalloc_end,
+ 128 * 1024 * 1024);
+ if (nr_delalloc == 0) {
+ delalloc_start = delalloc_end + 1;
+ continue;
+ }
+ ret = tree->ops->fill_delalloc(inode, page,
+ delalloc_start,
+ delalloc_end,
+ &page_started,
+ nr_written);
+ /* File system has been set read-only */
+ if (ret) {
+ SetPageError(page);
+ /* fill_delalloc should be return < 0 for error
+ * but just in case, we use > 0 here meaning the
+ * IO is started, so we don't want to return > 0
+ * unless things are going well.
+ */
+ ret = ret < 0 ? ret : -EIO;
+ goto done;
+ }
+ /*
+ * delalloc_end is already one less than the total
+ * length, so we don't subtract one from
+ * PAGE_CACHE_SIZE
+ */
+ delalloc_to_write += (delalloc_end - delalloc_start +
+ PAGE_CACHE_SIZE) >>
+ PAGE_CACHE_SHIFT;
+ delalloc_start = delalloc_end + 1;
+ }
+ if (wbc->nr_to_write < delalloc_to_write) {
+ int thresh = 8192;
+
+ if (delalloc_to_write < thresh * 2)
+ thresh = delalloc_to_write;
+ wbc->nr_to_write = min_t(u64, delalloc_to_write,
+ thresh);
+ }
+
+ /* did the fill delalloc function already unlock and start
+ * the IO?
+ */
+ if (page_started) {
+ /*
+ * we've unlocked the page, so we can't update
+ * the mapping's writeback index, just update
+ * nr_to_write.
+ */
+ wbc->nr_to_write -= *nr_written;
+ return 1;
+ }
+
+ ret = 0;
+
+done:
+ return ret;
+}
+
+/*
+ * helper for __extent_writepage. This calls the writepage start hooks,
+ * and does the loop to map the page into extents and bios.
+ *
+ * We return 1 if the IO is started and the page is unlocked,
+ * 0 if all went well (page still locked)
+ * < 0 if there were errors (page still locked)
+ */
+static noinline_for_stack int __extent_writepage_io(struct inode *inode,
+ struct page *page,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd,
+ loff_t i_size,
+ unsigned long nr_written,
+ int write_flags, int *nr_ret)
{
- struct inode *inode = page->mapping->host;
- struct extent_page_data *epd = data;
struct extent_io_tree *tree = epd->tree;
u64 start = page_offset(page);
- u64 delalloc_start;
u64 page_end = start + PAGE_CACHE_SIZE - 1;
u64 end;
u64 cur = start;
u64 extent_offset;
- u64 last_byte = i_size_read(inode);
u64 block_start;
u64 iosize;
sector_t sector;
struct extent_state *cached_state = NULL;
struct extent_map *em;
struct block_device *bdev;
- int ret;
- int nr = 0;
size_t pg_offset = 0;
size_t blocksize;
- loff_t i_size = i_size_read(inode);
- unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
- u64 nr_delalloc;
- u64 delalloc_end;
- int page_started;
- int compressed;
- int write_flags;
- unsigned long nr_written = 0;
- bool fill_delalloc = true;
-
- if (wbc->sync_mode == WB_SYNC_ALL)
- write_flags = WRITE_SYNC;
- else
- write_flags = WRITE;
-
- trace___extent_writepage(page, inode, wbc);
-
- WARN_ON(!PageLocked(page));
-
- ClearPageError(page);
-
- pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
- if (page->index > end_index ||
- (page->index == end_index && !pg_offset)) {
- page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
- unlock_page(page);
- return 0;
- }
-
- if (page->index == end_index) {
- char *userpage;
-
- userpage = kmap_atomic(page);
- memset(userpage + pg_offset, 0,
- PAGE_CACHE_SIZE - pg_offset);
- kunmap_atomic(userpage);
- flush_dcache_page(page);
- }
- pg_offset = 0;
-
- set_page_extent_mapped(page);
-
- if (!tree->ops || !tree->ops->fill_delalloc)
- fill_delalloc = false;
-
- delalloc_start = start;
- delalloc_end = 0;
- page_started = 0;
- if (!epd->extent_locked && fill_delalloc) {
- u64 delalloc_to_write = 0;
- /*
- * make sure the wbc mapping index is at least updated
- * to this page.
- */
- update_nr_written(page, wbc, 0);
-
- while (delalloc_end < page_end) {
- nr_delalloc = find_lock_delalloc_range(inode, tree,
- page,
- &delalloc_start,
- &delalloc_end,
- 128 * 1024 * 1024);
- if (nr_delalloc == 0) {
- delalloc_start = delalloc_end + 1;
- continue;
- }
- ret = tree->ops->fill_delalloc(inode, page,
- delalloc_start,
- delalloc_end,
- &page_started,
- &nr_written);
- /* File system has been set read-only */
- if (ret) {
- SetPageError(page);
- goto done;
- }
- /*
- * delalloc_end is already one less than the total
- * length, so we don't subtract one from
- * PAGE_CACHE_SIZE
- */
- delalloc_to_write += (delalloc_end - delalloc_start +
- PAGE_CACHE_SIZE) >>
- PAGE_CACHE_SHIFT;
- delalloc_start = delalloc_end + 1;
- }
- if (wbc->nr_to_write < delalloc_to_write) {
- int thresh = 8192;
-
- if (delalloc_to_write < thresh * 2)
- thresh = delalloc_to_write;
- wbc->nr_to_write = min_t(u64, delalloc_to_write,
- thresh);
- }
+ int ret = 0;
+ int nr = 0;
+ bool compressed;
- /* did the fill delalloc function already unlock and start
- * the IO?
- */
- if (page_started) {
- ret = 0;
- /*
- * we've unlocked the page, so we can't update
- * the mapping's writeback index, just update
- * nr_to_write.
- */
- wbc->nr_to_write -= nr_written;
- goto done_unlocked;
- }
- }
if (tree->ops && tree->ops->writepage_start_hook) {
ret = tree->ops->writepage_start_hook(page, start,
page_end);
@@ -3202,9 +3234,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
wbc->pages_skipped++;
else
redirty_page_for_writepage(wbc, page);
+
update_nr_written(page, wbc, nr_written);
unlock_page(page);
- ret = 0;
+ ret = 1;
goto done_unlocked;
}
}
@@ -3216,7 +3249,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
update_nr_written(page, wbc, nr_written + 1);
end = page_end;
- if (last_byte <= start) {
+ if (i_size <= start) {
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, start,
page_end, NULL, 1);
@@ -3226,7 +3259,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
blocksize = inode->i_sb->s_blocksize;
while (cur <= end) {
- if (cur >= last_byte) {
+ u64 em_end;
+ if (cur >= i_size) {
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, cur,
page_end, NULL, 1);
@@ -3236,13 +3270,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
end - cur + 1, 1);
if (IS_ERR_OR_NULL(em)) {
SetPageError(page);
+ ret = PTR_ERR_OR_ZERO(em);
break;
}
extent_offset = cur - em->start;
- BUG_ON(extent_map_end(em) <= cur);
+ em_end = extent_map_end(em);
+ BUG_ON(em_end <= cur);
BUG_ON(end < cur);
- iosize = min(extent_map_end(em) - cur, end - cur + 1);
+ iosize = min(em_end - cur, end - cur + 1);
iosize = ALIGN(iosize, blocksize);
sector = (em->block_start + extent_offset) >> 9;
bdev = em->bdev;
@@ -3278,13 +3314,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
pg_offset += iosize;
continue;
}
- /* leave this out until we have a page_mkwrite call */
- if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
- EXTENT_DIRTY, 0, NULL)) {
- cur = cur + iosize;
- pg_offset += iosize;
- continue;
- }
if (tree->ops && tree->ops->writepage_io_hook) {
ret = tree->ops->writepage_io_hook(page, cur,
@@ -3295,12 +3324,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
if (ret) {
SetPageError(page);
} else {
- unsigned long max_nr = end_index + 1;
+ unsigned long max_nr = (i_size >> PAGE_CACHE_SHIFT) + 1;
set_range_writeback(tree, cur, cur + iosize - 1);
if (!PageWriteback(page)) {
- printk(KERN_ERR "btrfs warning page %lu not "
- "writeback, cur %llu end %llu\n",
+ btrfs_err(BTRFS_I(inode)->root->fs_info,
+ "page %lu not writeback, cur %llu end %llu",
page->index, cur, end);
}
@@ -3317,17 +3346,94 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
nr++;
}
done:
+ *nr_ret = nr;
+
+done_unlocked:
+
+ /* drop our reference on any cached states */
+ free_extent_state(cached_state);
+ return ret;
+}
+
+/*
+ * the writepage semantics are similar to regular writepage. extent
+ * records are inserted to lock ranges in the tree, and as dirty areas
+ * are found, they are marked writeback. Then the lock bits are removed
+ * and the end_io handler clears the writeback ranges
+ */
+static int __extent_writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
+{
+ struct inode *inode = page->mapping->host;
+ struct extent_page_data *epd = data;
+ u64 start = page_offset(page);
+ u64 page_end = start + PAGE_CACHE_SIZE - 1;
+ int ret;
+ int nr = 0;
+ size_t pg_offset = 0;
+ loff_t i_size = i_size_read(inode);
+ unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
+ int write_flags;
+ unsigned long nr_written = 0;
+
+ if (wbc->sync_mode == WB_SYNC_ALL)
+ write_flags = WRITE_SYNC;
+ else
+ write_flags = WRITE;
+
+ trace___extent_writepage(page, inode, wbc);
+
+ WARN_ON(!PageLocked(page));
+
+ ClearPageError(page);
+
+ pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
+ if (page->index > end_index ||
+ (page->index == end_index && !pg_offset)) {
+ page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
+ unlock_page(page);
+ return 0;
+ }
+
+ if (page->index == end_index) {
+ char *userpage;
+
+ userpage = kmap_atomic(page);
+ memset(userpage + pg_offset, 0,
+ PAGE_CACHE_SIZE - pg_offset);
+ kunmap_atomic(userpage);
+ flush_dcache_page(page);
+ }
+
+ pg_offset = 0;
+
+ set_page_extent_mapped(page);
+
+ ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written);
+ if (ret == 1)
+ goto done_unlocked;
+ if (ret)
+ goto done;
+
+ ret = __extent_writepage_io(inode, page, wbc, epd,
+ i_size, nr_written, write_flags, &nr);
+ if (ret == 1)
+ goto done_unlocked;
+
+done:
if (nr == 0) {
/* make sure the mapping tag for page dirty gets cleared */
set_page_writeback(page);
end_page_writeback(page);
}
+ if (PageError(page)) {
+ ret = ret < 0 ? ret : -EIO;
+ end_extent_writepage(page, ret, start, page_end);
+ }
unlock_page(page);
+ return ret;
done_unlocked:
-
- /* drop our reference on any cached states */
- free_extent_state(cached_state);
return 0;
}
@@ -3343,9 +3449,10 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
TASK_UNINTERRUPTIBLE);
}
-static int lock_extent_buffer_for_io(struct extent_buffer *eb,
- struct btrfs_fs_info *fs_info,
- struct extent_page_data *epd)
+static noinline_for_stack int
+lock_extent_buffer_for_io(struct extent_buffer *eb,
+ struct btrfs_fs_info *fs_info,
+ struct extent_page_data *epd)
{
unsigned long i, num_pages;
int flush = 0;
@@ -3416,26 +3523,24 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
static void end_extent_buffer_writeback(struct extent_buffer *eb)
{
clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
}
static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
{
- int uptodate = err == 0;
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct bio_vec *bvec;
struct extent_buffer *eb;
- int done;
+ int i, done;
- do {
+ bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
- bvec--;
eb = (struct extent_buffer *)page->private;
BUG_ON(!eb);
done = atomic_dec_and_test(&eb->io_pages);
- if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
+ if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
ClearPageUptodate(page);
SetPageError(page);
@@ -3447,18 +3552,18 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
continue;
end_extent_buffer_writeback(eb);
- } while (bvec >= bio->bi_io_vec);
+ }
bio_put(bio);
-
}
-static int write_one_eb(struct extent_buffer *eb,
+static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
struct btrfs_fs_info *fs_info,
struct writeback_control *wbc,
struct extent_page_data *epd)
{
struct block_device *bdev = fs_info->fs_devices->latest_bdev;
+ struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
u64 offset = eb->start;
unsigned long i, num_pages;
unsigned long bio_flags = 0;
@@ -3476,7 +3581,7 @@ static int write_one_eb(struct extent_buffer *eb,
clear_page_dirty_for_io(p);
set_page_writeback(p);
- ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
+ ret = submit_extent_page(rw, tree, p, offset >> 9,
PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
-1, end_bio_extent_buffer_writepage,
0, epd->bio_flags, bio_flags);
@@ -3573,9 +3678,8 @@ retry:
* but no sense in crashing the users box for something
* we can survive anyway.
*/
- if (!eb) {
+ if (WARN_ON(!eb)) {
spin_unlock(&mapping->private_lock);
- WARN_ON(1);
continue;
}
@@ -3651,6 +3755,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
struct inode *inode = mapping->host;
int ret = 0;
int done = 0;
+ int err = 0;
int nr_to_write_done = 0;
struct pagevec pvec;
int nr_pages;
@@ -3737,8 +3842,8 @@ retry:
unlock_page(page);
ret = 0;
}
- if (ret)
- done = 1;
+ if (!err && ret < 0)
+ err = ret;
/*
* the filesystem may choose to bump up nr_to_write.
@@ -3750,7 +3855,7 @@ retry:
pagevec_release(&pvec);
cond_resched();
}
- if (!scanned && !done) {
+ if (!scanned && !done && !err) {
/*
* We hit the last page and there is more work to be done: wrap
* back to the start of the file
@@ -3760,7 +3865,7 @@ retry:
goto retry;
}
btrfs_add_delayed_iput(inode);
- return ret;
+ return err;
}
static void flush_epd_write_bio(struct extent_page_data *epd)
@@ -4042,7 +4147,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
if (offset >= last)
return NULL;
- while(1) {
+ while (1) {
len = last - offset;
if (len == 0)
break;
@@ -4066,6 +4171,19 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
return NULL;
}
+static noinline int count_ext_ref(u64 inum, u64 offset, u64 root_id, void *ctx)
+{
+ unsigned long cnt = *((unsigned long *)ctx);
+
+ cnt++;
+ *((unsigned long *)ctx) = cnt;
+
+ /* Now we're sure that the extent is shared. */
+ if (cnt > 1)
+ return 1;
+ return 0;
+}
+
int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len, get_extent_t *get_extent)
{
@@ -4082,12 +4200,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
struct btrfs_path *path;
- struct btrfs_file_extent_item *item;
int end = 0;
u64 em_start = 0;
u64 em_len = 0;
u64 em_end = 0;
- unsigned long emflags;
if (len == 0)
return -EINVAL;
@@ -4112,8 +4228,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
WARN_ON(!ret);
path->slots[0]--;
- item = btrfs_item_ptr(path->nodes[0], path->slots[0],
- struct btrfs_file_extent_item);
btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
found_type = btrfs_key_type(&found_key);
@@ -4132,7 +4246,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
last = found_key.offset;
last_for_get_extent = last + 1;
}
- btrfs_free_path(path);
+ btrfs_release_path(path);
/*
* we might have some extents allocated but more delalloc past those
@@ -4181,7 +4295,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
offset_in_extent = em_start - em->start;
em_end = extent_map_end(em);
em_len = em_end - em_start;
- emflags = em->flags;
disko = 0;
flags = 0;
@@ -4202,7 +4315,24 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
flags |= (FIEMAP_EXTENT_DELALLOC |
FIEMAP_EXTENT_UNKNOWN);
} else {
+ unsigned long ref_cnt = 0;
+
disko = em->block_start + offset_in_extent;
+
+ /*
+ * As btrfs supports shared space, this information
+ * can be exported to userspace tools via
+ * flag FIEMAP_EXTENT_SHARED.
+ */
+ ret = iterate_inodes_from_logical(
+ em->block_start,
+ BTRFS_I(inode)->root->fs_info,
+ path, count_ext_ref, &ref_cnt);
+ if (ret < 0 && ret != -ENOENT)
+ goto out_free;
+
+ if (ref_cnt > 1)
+ flags |= FIEMAP_EXTENT_SHARED;
}
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
flags |= FIEMAP_EXTENT_ENCODED;
@@ -4234,6 +4364,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
out_free:
free_extent_map(em);
out:
+ btrfs_free_path(path);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
&cached_state, GFP_NOFS);
return ret;
@@ -4245,7 +4376,7 @@ static void __free_extent_buffer(struct extent_buffer *eb)
kmem_cache_free(extent_buffer_cache, eb);
}
-static int extent_buffer_under_io(struct extent_buffer *eb)
+int extent_buffer_under_io(struct extent_buffer *eb)
{
return (atomic_read(&eb->io_pages) ||
test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
@@ -4315,10 +4446,9 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
__free_extent_buffer(eb);
}
-static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
- u64 start,
- unsigned long len,
- gfp_t mask)
+static struct extent_buffer *
+__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
+ unsigned long len, gfp_t mask)
{
struct extent_buffer *eb = NULL;
@@ -4327,7 +4457,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
return NULL;
eb->start = start;
eb->len = len;
- eb->tree = tree;
+ eb->fs_info = fs_info;
eb->bflags = 0;
rwlock_init(&eb->lock);
atomic_set(&eb->write_locks, 0);
@@ -4446,7 +4576,8 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
spin_unlock(&eb->refs_lock);
}
-static void mark_extent_buffer_accessed(struct extent_buffer *eb)
+static void mark_extent_buffer_accessed(struct extent_buffer *eb,
+ struct page *accessed)
{
unsigned long num_pages, i;
@@ -4455,11 +4586,77 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb)
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
struct page *p = extent_buffer_page(eb, i);
- mark_page_accessed(p);
+ if (p != accessed)
+ mark_page_accessed(p);
}
}
-struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
+struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
+{
+ struct extent_buffer *eb;
+
+ rcu_read_lock();
+ eb = radix_tree_lookup(&fs_info->buffer_radix,
+ start >> PAGE_CACHE_SHIFT);
+ if (eb && atomic_inc_not_zero(&eb->refs)) {
+ rcu_read_unlock();
+ mark_extent_buffer_accessed(eb, NULL);
+ return eb;
+ }
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start, unsigned long len)
+{
+ struct extent_buffer *eb, *exists = NULL;
+ int ret;
+
+ eb = find_extent_buffer(fs_info, start);
+ if (eb)
+ return eb;
+ eb = alloc_dummy_extent_buffer(start, len);
+ if (!eb)
+ return NULL;
+ eb->fs_info = fs_info;
+again:
+ ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
+ if (ret)
+ goto free_eb;
+ spin_lock(&fs_info->buffer_lock);
+ ret = radix_tree_insert(&fs_info->buffer_radix,
+ start >> PAGE_CACHE_SHIFT, eb);
+ spin_unlock(&fs_info->buffer_lock);
+ radix_tree_preload_end();
+ if (ret == -EEXIST) {
+ exists = find_extent_buffer(fs_info, start);
+ if (exists)
+ goto free_eb;
+ else
+ goto again;
+ }
+ check_buffer_tree_ref(eb);
+ set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
+
+ /*
+ * We will free dummy extent buffer's if they come into
+ * free_extent_buffer with a ref count of 2, but if we are using this we
+ * want the buffers to stay in memory until we're done with them, so
+ * bump the ref count again.
+ */
+ atomic_inc(&eb->refs);
+ return eb;
+free_eb:
+ btrfs_release_extent_buffer(eb);
+ return exists;
+}
+#endif
+
+struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, unsigned long len)
{
unsigned long num_pages = num_extent_pages(start, len);
@@ -4468,20 +4665,15 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
struct extent_buffer *eb;
struct extent_buffer *exists = NULL;
struct page *p;
- struct address_space *mapping = tree->mapping;
+ struct address_space *mapping = fs_info->btree_inode->i_mapping;
int uptodate = 1;
int ret;
- rcu_read_lock();
- eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
- if (eb && atomic_inc_not_zero(&eb->refs)) {
- rcu_read_unlock();
- mark_extent_buffer_accessed(eb);
+ eb = find_extent_buffer(fs_info, start);
+ if (eb)
return eb;
- }
- rcu_read_unlock();
- eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
+ eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS);
if (!eb)
return NULL;
@@ -4504,7 +4696,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
spin_unlock(&mapping->private_lock);
unlock_page(p);
page_cache_release(p);
- mark_extent_buffer_accessed(exists);
+ mark_extent_buffer_accessed(exists, p);
goto free_eb;
}
@@ -4519,7 +4711,6 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
attach_extent_buffer_page(eb, p);
spin_unlock(&mapping->private_lock);
WARN_ON(PageDirty(p));
- mark_page_accessed(p);
eb->pages[i] = p;
if (!PageUptodate(p))
uptodate = 0;
@@ -4536,26 +4727,21 @@ again:
if (ret)
goto free_eb;
- spin_lock(&tree->buffer_lock);
- ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
+ spin_lock(&fs_info->buffer_lock);
+ ret = radix_tree_insert(&fs_info->buffer_radix,
+ start >> PAGE_CACHE_SHIFT, eb);
+ spin_unlock(&fs_info->buffer_lock);
+ radix_tree_preload_end();
if (ret == -EEXIST) {
- exists = radix_tree_lookup(&tree->buffer,
- start >> PAGE_CACHE_SHIFT);
- if (!atomic_inc_not_zero(&exists->refs)) {
- spin_unlock(&tree->buffer_lock);
- radix_tree_preload_end();
- exists = NULL;
+ exists = find_extent_buffer(fs_info, start);
+ if (exists)
+ goto free_eb;
+ else
goto again;
- }
- spin_unlock(&tree->buffer_lock);
- radix_tree_preload_end();
- mark_extent_buffer_accessed(exists);
- goto free_eb;
}
/* add one reference for the tree */
check_buffer_tree_ref(eb);
- spin_unlock(&tree->buffer_lock);
- radix_tree_preload_end();
+ set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
/*
* there is a race where release page may have
@@ -4586,23 +4772,6 @@ free_eb:
return exists;
}
-struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
- u64 start, unsigned long len)
-{
- struct extent_buffer *eb;
-
- rcu_read_lock();
- eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
- if (eb && atomic_inc_not_zero(&eb->refs)) {
- rcu_read_unlock();
- mark_extent_buffer_accessed(eb);
- return eb;
- }
- rcu_read_unlock();
-
- return NULL;
-}
-
static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
{
struct extent_buffer *eb =
@@ -4616,17 +4785,17 @@ static int release_extent_buffer(struct extent_buffer *eb)
{
WARN_ON(atomic_read(&eb->refs) == 0);
if (atomic_dec_and_test(&eb->refs)) {
- if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
- spin_unlock(&eb->refs_lock);
- } else {
- struct extent_io_tree *tree = eb->tree;
+ if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
+ struct btrfs_fs_info *fs_info = eb->fs_info;
spin_unlock(&eb->refs_lock);
- spin_lock(&tree->buffer_lock);
- radix_tree_delete(&tree->buffer,
+ spin_lock(&fs_info->buffer_lock);
+ radix_tree_delete(&fs_info->buffer_radix,
eb->start >> PAGE_CACHE_SHIFT);
- spin_unlock(&tree->buffer_lock);
+ spin_unlock(&fs_info->buffer_lock);
+ } else {
+ spin_unlock(&eb->refs_lock);
}
/* Should be safe to release our pages at this point */
@@ -4899,6 +5068,43 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
}
}
+int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
+ unsigned long start,
+ unsigned long len)
+{
+ size_t cur;
+ size_t offset;
+ struct page *page;
+ char *kaddr;
+ char __user *dst = (char __user *)dstv;
+ size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+ unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+ int ret = 0;
+
+ WARN_ON(start > eb->len);
+ WARN_ON(start + len > eb->start + eb->len);
+
+ offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+
+ while (len > 0) {
+ page = extent_buffer_page(eb, i);
+
+ cur = min(len, (PAGE_CACHE_SIZE - offset));
+ kaddr = page_address(page);
+ if (copy_to_user(dst, kaddr + offset, cur)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ dst += cur;
+ len -= cur;
+ offset = 0;
+ i++;
+ }
+
+ return ret;
+}
+
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
unsigned long min_len, char **map,
unsigned long *map_start,
@@ -5066,23 +5272,6 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
}
}
-static void move_pages(struct page *dst_page, struct page *src_page,
- unsigned long dst_off, unsigned long src_off,
- unsigned long len)
-{
- char *dst_kaddr = page_address(dst_page);
- if (dst_page == src_page) {
- memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
- } else {
- char *src_kaddr = page_address(src_page);
- char *p = dst_kaddr + dst_off + len;
- char *s = src_kaddr + src_off + len;
-
- while (len--)
- *--p = *--s;
- }
-}
-
static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
unsigned long distance = (src > dst) ? src - dst : dst - src;
@@ -5122,12 +5311,12 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_i;
if (src_offset + len > dst->len) {
- printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
+ printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
"len %lu dst len %lu\n", src_offset, len, dst->len);
BUG_ON(1);
}
if (dst_offset + len > dst->len) {
- printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
+ printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
"len %lu dst len %lu\n", dst_offset, len, dst->len);
BUG_ON(1);
}
@@ -5169,12 +5358,12 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_i;
if (src_offset + len > dst->len) {
- printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
+ printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
"len %lu len %lu\n", src_offset, len, dst->len);
BUG_ON(1);
}
if (dst_offset + len > dst->len) {
- printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
+ printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
"len %lu len %lu\n", dst_offset, len, dst->len);
BUG_ON(1);
}
@@ -5193,7 +5382,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
cur = min_t(unsigned long, len, src_off_in_page + 1);
cur = min(cur, dst_off_in_page + 1);
- move_pages(extent_buffer_page(dst, dst_i),
+ copy_pages(extent_buffer_page(dst, dst_i),
extent_buffer_page(dst, src_i),
dst_off_in_page - cur + 1,
src_off_in_page - cur + 1, cur);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 6dbc645f1f3..ccc264e7bde 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -43,6 +43,7 @@
#define EXTENT_BUFFER_WRITEBACK 7
#define EXTENT_BUFFER_IOERR 8
#define EXTENT_BUFFER_DUMMY 9
+#define EXTENT_BUFFER_IN_TREE 10
/* these are flags for extent_clear_unlock_delalloc */
#define PAGE_UNLOCK (1 << 0)
@@ -94,12 +95,10 @@ struct extent_io_ops {
struct extent_io_tree {
struct rb_root state;
- struct radix_tree_root buffer;
struct address_space *mapping;
u64 dirty_bytes;
int track_uptodate;
spinlock_t lock;
- spinlock_t buffer_lock;
struct extent_io_ops *ops;
};
@@ -130,7 +129,7 @@ struct extent_buffer {
unsigned long map_start;
unsigned long map_len;
unsigned long bflags;
- struct extent_io_tree *tree;
+ struct btrfs_fs_info *fs_info;
spinlock_t refs_lock;
atomic_t refs;
atomic_t io_pages;
@@ -159,7 +158,6 @@ struct extent_buffer {
* to unlock
*/
wait_queue_head_t read_lock_wq;
- wait_queue_head_t lock_wq;
struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
@@ -266,12 +264,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
void set_page_extent_mapped(struct page *page);
-struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
+struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, unsigned long len);
struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
-struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
- u64 start, unsigned long len);
+struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start);
void free_extent_buffer(struct extent_buffer *eb);
void free_extent_buffer_stale(struct extent_buffer *eb);
#define WAIT_NONE 0
@@ -305,6 +303,9 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
void read_extent_buffer(struct extent_buffer *eb, void *dst,
unsigned long start,
unsigned long len);
+int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst,
+ unsigned long start,
+ unsigned long len);
void write_extent_buffer(struct extent_buffer *eb, const void *src,
unsigned long start, unsigned long len);
void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
@@ -321,6 +322,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb);
int set_extent_buffer_uptodate(struct extent_buffer *eb);
int clear_extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_uptodate(struct extent_buffer *eb);
+int extent_buffer_under_io(struct extent_buffer *eb);
int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
unsigned long min_len, char **map,
unsigned long *map_start,
@@ -345,4 +347,12 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
int mirror_num);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+noinline u64 find_lock_delalloc_range(struct inode *inode,
+ struct extent_io_tree *tree,
+ struct page *locked_page, u64 *start,
+ u64 *end, u64 max_bytes);
+struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start, unsigned long len);
+#endif
#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index a4a7a1a8da9..225302b39af 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -51,7 +51,7 @@ struct extent_map *alloc_extent_map(void)
em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
if (!em)
return NULL;
- em->in_tree = 0;
+ RB_CLEAR_NODE(&em->rb_node);
em->flags = 0;
em->compress_type = BTRFS_COMPRESS_NONE;
em->generation = 0;
@@ -73,38 +73,64 @@ void free_extent_map(struct extent_map *em)
return;
WARN_ON(atomic_read(&em->refs) == 0);
if (atomic_dec_and_test(&em->refs)) {
- WARN_ON(em->in_tree);
+ WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
+ if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
+ kfree(em->bdev);
kmem_cache_free(extent_map_cache, em);
}
}
-static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
- struct rb_node *node)
+/* simple helper to do math around the end of an extent, handling wrap */
+static u64 range_end(u64 start, u64 len)
+{
+ if (start + len < start)
+ return (u64)-1;
+ return start + len;
+}
+
+static int tree_insert(struct rb_root *root, struct extent_map *em)
{
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
- struct extent_map *entry;
+ struct extent_map *entry = NULL;
+ struct rb_node *orig_parent = NULL;
+ u64 end = range_end(em->start, em->len);
while (*p) {
parent = *p;
entry = rb_entry(parent, struct extent_map, rb_node);
- WARN_ON(!entry->in_tree);
-
- if (offset < entry->start)
+ if (em->start < entry->start)
p = &(*p)->rb_left;
- else if (offset >= extent_map_end(entry))
+ else if (em->start >= extent_map_end(entry))
p = &(*p)->rb_right;
else
- return parent;
+ return -EEXIST;
}
- entry = rb_entry(node, struct extent_map, rb_node);
- entry->in_tree = 1;
- rb_link_node(node, parent, p);
- rb_insert_color(node, root);
- return NULL;
+ orig_parent = parent;
+ while (parent && em->start >= extent_map_end(entry)) {
+ parent = rb_next(parent);
+ entry = rb_entry(parent, struct extent_map, rb_node);
+ }
+ if (parent)
+ if (end > entry->start && em->start < extent_map_end(entry))
+ return -EEXIST;
+
+ parent = orig_parent;
+ entry = rb_entry(parent, struct extent_map, rb_node);
+ while (parent && em->start < entry->start) {
+ parent = rb_prev(parent);
+ entry = rb_entry(parent, struct extent_map, rb_node);
+ }
+ if (parent)
+ if (end > entry->start && em->start < extent_map_end(entry))
+ return -EEXIST;
+
+ rb_link_node(&em->rb_node, orig_parent, p);
+ rb_insert_color(&em->rb_node, root);
+ return 0;
}
/*
@@ -126,8 +152,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
prev = n;
prev_entry = entry;
- WARN_ON(!entry->in_tree);
-
if (offset < entry->start)
n = n->rb_left;
else if (offset >= extent_map_end(entry))
@@ -213,12 +237,12 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->len += merge->len;
em->block_len += merge->block_len;
em->block_start = merge->block_start;
- merge->in_tree = 0;
em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
rb_erase(&merge->rb_node, &tree->map);
+ RB_CLEAR_NODE(&merge->rb_node);
free_extent_map(merge);
}
}
@@ -228,9 +252,9 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
merge = rb_entry(rb, struct extent_map, rb_node);
if (rb && mergable_maps(em, merge)) {
em->len += merge->len;
- em->block_len += merge->len;
+ em->block_len += merge->block_len;
rb_erase(&merge->rb_node, &tree->map);
- merge->in_tree = 0;
+ RB_CLEAR_NODE(&merge->rb_node);
em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
free_extent_map(merge);
@@ -292,7 +316,21 @@ out:
void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
{
clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
- if (em->in_tree)
+ if (extent_map_in_tree(em))
+ try_merge_map(tree, em);
+}
+
+static inline void setup_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *em,
+ int modified)
+{
+ atomic_inc(&em->refs);
+ em->mod_start = em->start;
+ em->mod_len = em->len;
+
+ if (modified)
+ list_move(&em->list, &tree->modified_extents);
+ else
try_merge_map(tree, em);
}
@@ -310,41 +348,16 @@ int add_extent_mapping(struct extent_map_tree *tree,
struct extent_map *em, int modified)
{
int ret = 0;
- struct rb_node *rb;
- struct extent_map *exist;
- exist = lookup_extent_mapping(tree, em->start, em->len);
- if (exist) {
- free_extent_map(exist);
- ret = -EEXIST;
- goto out;
- }
- rb = tree_insert(&tree->map, em->start, &em->rb_node);
- if (rb) {
- ret = -EEXIST;
+ ret = tree_insert(&tree->map, em);
+ if (ret)
goto out;
- }
- atomic_inc(&em->refs);
- em->mod_start = em->start;
- em->mod_len = em->len;
-
- if (modified)
- list_move(&em->list, &tree->modified_extents);
- else
- try_merge_map(tree, em);
+ setup_extent_mapping(tree, em, modified);
out:
return ret;
}
-/* simple helper to do math around the end of an extent, handling wrap */
-static u64 range_end(u64 start, u64 len)
-{
- if (start + len < start)
- return (u64)-1;
- return start + len;
-}
-
static struct extent_map *
__lookup_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len, int strict)
@@ -424,6 +437,21 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
rb_erase(&em->rb_node, &tree->map);
if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
list_del_init(&em->list);
- em->in_tree = 0;
+ RB_CLEAR_NODE(&em->rb_node);
return ret;
}
+
+void replace_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *cur,
+ struct extent_map *new,
+ int modified)
+{
+ WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
+ ASSERT(extent_map_in_tree(cur));
+ if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
+ list_del_init(&cur->list);
+ rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map);
+ RB_CLEAR_NODE(&cur->rb_node);
+
+ setup_extent_mapping(tree, new, modified);
+}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 61adc44b780..b2991fd8583 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -3,10 +3,10 @@
#include <linux/rbtree.h>
-#define EXTENT_MAP_LAST_BYTE (u64)-4
-#define EXTENT_MAP_HOLE (u64)-3
-#define EXTENT_MAP_INLINE (u64)-2
-#define EXTENT_MAP_DELALLOC (u64)-1
+#define EXTENT_MAP_LAST_BYTE ((u64)-4)
+#define EXTENT_MAP_HOLE ((u64)-3)
+#define EXTENT_MAP_INLINE ((u64)-2)
+#define EXTENT_MAP_DELALLOC ((u64)-1)
/* bits for the flags field */
#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */
@@ -15,6 +15,7 @@
#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
+#define EXTENT_FLAG_FS_MAPPING 6 /* filesystem extent mapping type */
struct extent_map {
struct rb_node rb_node;
@@ -33,7 +34,6 @@ struct extent_map {
unsigned long flags;
struct block_device *bdev;
atomic_t refs;
- unsigned int in_tree;
unsigned int compress_type;
struct list_head list;
};
@@ -44,6 +44,11 @@ struct extent_map_tree {
rwlock_t lock;
};
+static inline int extent_map_in_tree(const struct extent_map *em)
+{
+ return !RB_EMPTY_NODE(&em->rb_node);
+}
+
static inline u64 extent_map_end(struct extent_map *em)
{
if (em->start + em->len < em->start)
@@ -64,6 +69,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
int add_extent_mapping(struct extent_map_tree *tree,
struct extent_map *em, int modified);
int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+void replace_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *cur,
+ struct extent_map *new,
+ int modified);
struct extent_map *alloc_extent_map(void);
void free_extent_map(struct extent_map *em);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 4f53159bdb9..f46cfe45d68 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -182,7 +182,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
if (!path)
return -ENOMEM;
- nblocks = bio->bi_size >> inode->i_sb->s_blocksize_bits;
+ nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits;
if (!dst) {
if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size,
@@ -201,7 +201,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
csum = (u8 *)dst;
}
- if (bio->bi_size > PAGE_CACHE_SIZE * 8)
+ if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8)
path->reada = 2;
WARN_ON(bio->bi_vcnt <= 0);
@@ -217,7 +217,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
path->skip_locking = 1;
}
- disk_bytenr = (u64)bio->bi_sector << 9;
+ disk_bytenr = (u64)bio->bi_iter.bi_sector << 9;
if (dio)
offset = logical_offset;
while (bio_index < bio->bi_vcnt) {
@@ -246,8 +246,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
offset + bvec->bv_len - 1,
EXTENT_NODATASUM, GFP_NOFS);
} else {
- printk(KERN_INFO "btrfs no csum found "
- "for inode %llu start %llu\n",
+ btrfs_info(BTRFS_I(inode)->root->fs_info,
+ "no csum found for inode %llu start %llu",
btrfs_ino(inode), offset);
}
item = NULL;
@@ -281,10 +281,10 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
found:
csum += count * csum_size;
nblocks -= count;
+ bio_index += count;
while (count--) {
disk_bytenr += bvec->bv_len;
offset += bvec->bv_len;
- bio_index++;
bvec++;
}
}
@@ -302,7 +302,7 @@ int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
struct btrfs_dio_private *dip, struct bio *bio,
u64 offset)
{
- int len = (bio->bi_sector << 9) - dip->disk_bytenr;
+ int len = (bio->bi_iter.bi_sector << 9) - dip->disk_bytenr;
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
int ret;
@@ -329,6 +329,9 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
u64 csum_end;
u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+ ASSERT(start == ALIGN(start, root->sectorsize) &&
+ (end + 1) == ALIGN(end + 1, root->sectorsize));
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -444,11 +447,12 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
u64 offset;
WARN_ON(bio->bi_vcnt <= 0);
- sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
+ sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_iter.bi_size),
+ GFP_NOFS);
if (!sums)
return -ENOMEM;
- sums->len = bio->bi_size;
+ sums->len = bio->bi_iter.bi_size;
INIT_LIST_HEAD(&sums->list);
if (contig)
@@ -458,7 +462,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
ordered = btrfs_lookup_ordered_extent(inode, offset);
BUG_ON(!ordered); /* Logic error */
- sums->bytenr = (u64)bio->bi_sector << 9;
+ sums->bytenr = (u64)bio->bi_iter.bi_sector << 9;
index = 0;
while (bio_index < bio->bi_vcnt) {
@@ -473,7 +477,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
btrfs_add_ordered_sum(inode, ordered, sums);
btrfs_put_ordered_extent(ordered);
- bytes_left = bio->bi_size - total_bytes;
+ bytes_left = bio->bi_iter.bi_size - total_bytes;
sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
GFP_NOFS);
@@ -481,7 +485,7 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
sums->len = bytes_left;
ordered = btrfs_lookup_ordered_extent(inode, offset);
BUG_ON(!ordered); /* Logic error */
- sums->bytenr = ((u64)bio->bi_sector << 9) +
+ sums->bytenr = ((u64)bio->bi_iter.bi_sector << 9) +
total_bytes;
index = 0;
}
@@ -746,7 +750,7 @@ again:
int slot = path->slots[0] + 1;
/* we didn't find a csum item, insert one */
nritems = btrfs_header_nritems(path->nodes[0]);
- if (path->slots[0] >= nritems - 1) {
+ if (!nritems || (path->slots[0] >= nritems - 1)) {
ret = btrfs_next_leaf(root, path);
if (ret == 1)
found_next = 1;
@@ -846,10 +850,8 @@ insert:
path->leave_spinning = 0;
if (ret < 0)
goto fail_unlock;
- if (ret != 0) {
- WARN_ON(1);
+ if (WARN_ON(ret != 0))
goto fail_unlock;
- }
leaf = path->nodes[0];
csum:
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
@@ -883,3 +885,79 @@ out:
fail_unlock:
goto out;
}
+
+void btrfs_extent_item_to_extent_map(struct inode *inode,
+ const struct btrfs_path *path,
+ struct btrfs_file_extent_item *fi,
+ const bool new_inline,
+ struct extent_map *em)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_buffer *leaf = path->nodes[0];
+ const int slot = path->slots[0];
+ struct btrfs_key key;
+ u64 extent_start, extent_end;
+ u64 bytenr;
+ u8 type = btrfs_file_extent_type(leaf, fi);
+ int compress_type = btrfs_file_extent_compression(leaf, fi);
+
+ em->bdev = root->fs_info->fs_devices->latest_bdev;
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ extent_start = key.offset;
+
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ extent_end = extent_start +
+ btrfs_file_extent_num_bytes(leaf, fi);
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ size_t size;
+ size = btrfs_file_extent_inline_len(leaf, slot, fi);
+ extent_end = ALIGN(extent_start + size, root->sectorsize);
+ }
+
+ em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
+ if (type == BTRFS_FILE_EXTENT_REG ||
+ type == BTRFS_FILE_EXTENT_PREALLOC) {
+ em->start = extent_start;
+ em->len = extent_end - extent_start;
+ em->orig_start = extent_start -
+ btrfs_file_extent_offset(leaf, fi);
+ em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
+ bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+ if (bytenr == 0) {
+ em->block_start = EXTENT_MAP_HOLE;
+ return;
+ }
+ if (compress_type != BTRFS_COMPRESS_NONE) {
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->compress_type = compress_type;
+ em->block_start = bytenr;
+ em->block_len = em->orig_block_len;
+ } else {
+ bytenr += btrfs_file_extent_offset(leaf, fi);
+ em->block_start = bytenr;
+ em->block_len = em->len;
+ if (type == BTRFS_FILE_EXTENT_PREALLOC)
+ set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ }
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ em->block_start = EXTENT_MAP_INLINE;
+ em->start = extent_start;
+ em->len = extent_end - extent_start;
+ /*
+ * Initialize orig_start and block_len with the same values
+ * as in inode.c:btrfs_get_extent().
+ */
+ em->orig_start = EXTENT_MAP_HOLE;
+ em->block_len = (u64)-1;
+ if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) {
+ set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->compress_type = compress_type;
+ }
+ } else {
+ btrfs_err(root->fs_info,
+ "unknown file extent item type %d, inode %llu, offset %llu, root %llu",
+ type, btrfs_ino(inode), extent_start,
+ root->root_key.objectid);
+ }
+}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 72da4df53c9..1f2b99cb55e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -39,8 +39,8 @@
#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
-#include "compat.h"
#include "volumes.h"
+#include "qgroup.h"
static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
@@ -370,7 +370,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
u64 root_objectid = 0;
atomic_inc(&fs_info->defrag_running);
- while(1) {
+ while (1) {
/* Pause the auto defragger. */
if (test_bit(BTRFS_FS_STATE_REMOUNTING,
&fs_info->fs_state))
@@ -426,13 +426,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
struct page *page = prepared_pages[pg];
/*
* Copy data from userspace to the current page
- *
- * Disable pagefault to avoid recursive lock since
- * the pages are already locked
*/
- pagefault_disable();
copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
- pagefault_enable();
/* Flush processor's dcache for this page */
flush_dcache_page(page);
@@ -453,7 +448,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
write_bytes -= copied;
total_copied += copied;
- /* Return to btrfs_file_aio_write to fault page */
+ /* Return to btrfs_file_write_iter to fault page */
if (unlikely(copied == 0))
break;
@@ -476,11 +471,12 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
for (i = 0; i < num_pages; i++) {
/* page checked is some magic around finding pages that
* have been modified without going through btrfs_set_page_dirty
- * clear it here
+ * clear it here. There should be no need to mark the pages
+ * accessed as prepare_pages should have marked them accessed
+ * in prepare_pages via find_or_create_page()
*/
ClearPageChecked(pages[i]);
unlock_page(pages[i]);
- mark_page_accessed(pages[i]);
page_cache_release(pages[i]);
}
}
@@ -592,7 +588,6 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
clear_bit(EXTENT_FLAG_LOGGING, &flags);
modified = !list_empty(&em->list);
- remove_extent_mapping(em_tree, em);
if (no_splits)
goto next;
@@ -623,8 +618,7 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->bdev = em->bdev;
split->flags = flags;
split->compress_type = em->compress_type;
- ret = add_extent_mapping(em_tree, split, modified);
- BUG_ON(ret); /* Logic error */
+ replace_extent_mapping(em_tree, em, split, modified);
free_extent_map(split);
split = split2;
split2 = NULL;
@@ -662,12 +656,20 @@ void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
split->orig_block_len = 0;
}
- ret = add_extent_mapping(em_tree, split, modified);
- BUG_ON(ret); /* Logic error */
+ if (extent_map_in_tree(em)) {
+ replace_extent_mapping(em_tree, em, split,
+ modified);
+ } else {
+ ret = add_extent_mapping(em_tree, split,
+ modified);
+ ASSERT(ret == 0); /* Logic error */
+ }
free_extent_map(split);
split = NULL;
}
next:
+ if (extent_map_in_tree(em))
+ remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
/* once for us */
@@ -693,7 +695,10 @@ next:
int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
struct btrfs_path *path, u64 start, u64 end,
- u64 *drop_end, int drop_cache)
+ u64 *drop_end, int drop_cache,
+ int replace_extent,
+ u32 extent_item_size,
+ int *key_inserted)
{
struct extent_buffer *leaf;
struct btrfs_file_extent_item *fi;
@@ -711,15 +716,18 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
int recow;
int ret;
int modify_tree = -1;
- int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
+ int update_refs;
int found = 0;
+ int leafs_visited = 0;
if (drop_cache)
btrfs_drop_extent_cache(inode, start, end - 1, 0);
- if (start >= BTRFS_I(inode)->disk_i_size)
+ if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
modify_tree = 0;
+ update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ root == root->fs_info->tree_root);
while (1) {
recow = 0;
ret = btrfs_lookup_file_extent(trans, root, path, ino,
@@ -734,6 +742,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
path->slots[0]--;
}
ret = 0;
+ leafs_visited++;
next_slot:
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
@@ -745,6 +754,7 @@ next_slot:
ret = 0;
break;
}
+ leafs_visited++;
leaf = path->nodes[0];
recow = 1;
}
@@ -767,12 +777,25 @@ next_slot:
btrfs_file_extent_num_bytes(leaf, fi);
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
extent_end = key.offset +
- btrfs_file_extent_inline_len(leaf, fi);
+ btrfs_file_extent_inline_len(leaf,
+ path->slots[0], fi);
} else {
WARN_ON(1);
extent_end = search_start;
}
+ /*
+ * Don't skip extent items representing 0 byte lengths. They
+ * used to be created (bug) if while punching holes we hit
+ * -ENOSPC condition. So if we find one here, just ensure we
+ * delete it, otherwise we would insert a new file extent item
+ * with the same key (offset) as that 0 bytes length file
+ * extent item in the call to setup_items_for_insert() later
+ * in this function.
+ */
+ if (extent_end == key.offset && extent_end >= search_start)
+ goto delete_extent_item;
+
if (extent_end <= search_start) {
path->slots[0]++;
goto next_slot;
@@ -792,7 +815,10 @@ next_slot:
*/
if (start > key.offset && end < extent_end) {
BUG_ON(del_nr > 0);
- BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = start;
@@ -825,7 +851,7 @@ next_slot:
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
new_key.objectid,
- start - extent_offset, 0);
+ start - extent_offset, 1);
BUG_ON(ret); /* -ENOMEM */
}
key.offset = start;
@@ -835,7 +861,10 @@ next_slot:
* | -------- extent -------- |
*/
if (start <= key.offset && end < extent_end) {
- BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
memcpy(&new_key, &key, sizeof(new_key));
new_key.offset = end;
@@ -858,7 +887,10 @@ next_slot:
*/
if (start > key.offset && end >= extent_end) {
BUG_ON(del_nr > 0);
- BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);
+ if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
btrfs_set_file_extent_num_bytes(leaf, fi,
start - key.offset);
@@ -877,6 +909,7 @@ next_slot:
* | ------ extent ------ |
*/
if (start <= key.offset && end >= extent_end) {
+delete_extent_item:
if (del_nr == 0) {
del_slot = path->slots[0];
del_nr = 1;
@@ -928,14 +961,52 @@ next_slot:
}
if (!ret && del_nr > 0) {
+ /*
+ * Set path->slots[0] to first slot, so that after the delete
+ * if items are move off from our leaf to its immediate left or
+ * right neighbor leafs, we end up with a correct and adjusted
+ * path->slots[0] for our insertion (if replace_extent != 0).
+ */
+ path->slots[0] = del_slot;
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
if (ret)
btrfs_abort_transaction(trans, root, ret);
}
+ leaf = path->nodes[0];
+ /*
+ * If btrfs_del_items() was called, it might have deleted a leaf, in
+ * which case it unlocked our path, so check path->locks[0] matches a
+ * write lock.
+ */
+ if (!ret && replace_extent && leafs_visited == 1 &&
+ (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
+ path->locks[0] == BTRFS_WRITE_LOCK) &&
+ btrfs_leaf_free_space(root, leaf) >=
+ sizeof(struct btrfs_item) + extent_item_size) {
+
+ key.objectid = ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+ if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
+ struct btrfs_key slot_key;
+
+ btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
+ if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
+ path->slots[0]++;
+ }
+ setup_items_for_insert(root, path, &key,
+ &extent_item_size,
+ extent_item_size,
+ sizeof(struct btrfs_item) +
+ extent_item_size, 1);
+ *key_inserted = 1;
+ }
+
+ if (!replace_extent || !(*key_inserted))
+ btrfs_release_path(path);
if (drop_end)
*drop_end = found ? min(end, extent_end) : end;
- btrfs_release_path(path);
return ret;
}
@@ -950,7 +1021,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
- drop_cache);
+ drop_cache, 0, 0, NULL);
btrfs_free_path(path);
return ret;
}
@@ -1137,7 +1208,7 @@ again:
ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
root->root_key.objectid,
- ino, orig_offset, 0);
+ ino, orig_offset, 1);
BUG_ON(ret); /* -ENOMEM */
if (split == start) {
@@ -1236,29 +1307,18 @@ static int prepare_uptodate_page(struct page *page, u64 pos,
}
/*
- * this gets pages into the page cache and locks them down, it also properly
- * waits for data=ordered extents to finish before allowing the pages to be
- * modified.
+ * this just gets pages into the page cache and locks them down.
*/
-static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
- struct page **pages, size_t num_pages,
- loff_t pos, unsigned long first_index,
- size_t write_bytes, bool force_uptodate)
+static noinline int prepare_pages(struct inode *inode, struct page **pages,
+ size_t num_pages, loff_t pos,
+ size_t write_bytes, bool force_uptodate)
{
- struct extent_state *cached_state = NULL;
int i;
unsigned long index = pos >> PAGE_CACHE_SHIFT;
- struct inode *inode = file_inode(file);
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
int err = 0;
- int faili = 0;
- u64 start_pos;
- u64 last_pos;
+ int faili;
- start_pos = pos & ~((u64)root->sectorsize - 1);
- last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
-
-again:
for (i = 0; i < num_pages; i++) {
pages[i] = find_or_create_page(inode->i_mapping, index + i,
mask | __GFP_WRITE);
@@ -1281,54 +1341,82 @@ again:
}
wait_on_page_writeback(pages[i]);
}
- err = 0;
+
+ return 0;
+fail:
+ while (faili >= 0) {
+ unlock_page(pages[faili]);
+ page_cache_release(pages[faili]);
+ faili--;
+ }
+ return err;
+
+}
+
+/*
+ * This function locks the extent and properly waits for data=ordered extents
+ * to finish before allowing the pages to be modified if need.
+ *
+ * The return value:
+ * 1 - the extent is locked
+ * 0 - the extent is not locked, and everything is OK
+ * -EAGAIN - need re-prepare the pages
+ * the other < 0 number - Something wrong happens
+ */
+static noinline int
+lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
+ size_t num_pages, loff_t pos,
+ u64 *lockstart, u64 *lockend,
+ struct extent_state **cached_state)
+{
+ u64 start_pos;
+ u64 last_pos;
+ int i;
+ int ret = 0;
+
+ start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
+ last_pos = start_pos + ((u64)num_pages << PAGE_CACHE_SHIFT) - 1;
+
if (start_pos < inode->i_size) {
struct btrfs_ordered_extent *ordered;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
- start_pos, last_pos - 1, 0, &cached_state);
- ordered = btrfs_lookup_first_ordered_extent(inode,
- last_pos - 1);
+ start_pos, last_pos, 0, cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, start_pos,
+ last_pos - start_pos + 1);
if (ordered &&
ordered->file_offset + ordered->len > start_pos &&
- ordered->file_offset < last_pos) {
- btrfs_put_ordered_extent(ordered);
+ ordered->file_offset <= last_pos) {
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- start_pos, last_pos - 1,
- &cached_state, GFP_NOFS);
+ start_pos, last_pos,
+ cached_state, GFP_NOFS);
for (i = 0; i < num_pages; i++) {
unlock_page(pages[i]);
page_cache_release(pages[i]);
}
- btrfs_wait_ordered_range(inode, start_pos,
- last_pos - start_pos);
- goto again;
+ btrfs_start_ordered_extent(inode, ordered, 1);
+ btrfs_put_ordered_extent(ordered);
+ return -EAGAIN;
}
if (ordered)
btrfs_put_ordered_extent(ordered);
clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
- last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+ last_pos, EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, &cached_state, GFP_NOFS);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree,
- start_pos, last_pos - 1, &cached_state,
- GFP_NOFS);
+ 0, 0, cached_state, GFP_NOFS);
+ *lockstart = start_pos;
+ *lockend = last_pos;
+ ret = 1;
}
+
for (i = 0; i < num_pages; i++) {
if (clear_page_dirty_for_io(pages[i]))
account_page_redirty(pages[i]);
set_page_extent_mapped(pages[i]);
WARN_ON(!PageLocked(pages[i]));
}
- return 0;
-fail:
- while (faili >= 0) {
- unlock_page(pages[faili]);
- page_cache_release(pages[faili]);
- faili--;
- }
- return err;
+ return ret;
}
static noinline int check_can_nocow(struct inode *inode, loff_t pos,
@@ -1340,8 +1428,12 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
u64 num_bytes;
int ret;
+ ret = btrfs_start_nocow_write(root);
+ if (!ret)
+ return -ENOSPC;
+
lockstart = round_down(pos, root->sectorsize);
- lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1;
+ lockend = round_up(pos + *write_bytes, root->sectorsize) - 1;
while (1) {
lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@ -1359,12 +1451,10 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
if (ret <= 0) {
ret = 0;
+ btrfs_end_nocow_write(root);
} else {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
- NULL, GFP_NOFS);
- *write_bytes = min_t(size_t, *write_bytes, num_bytes);
+ *write_bytes = min_t(size_t, *write_bytes ,
+ num_bytes - pos + lockstart);
}
unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
@@ -1379,13 +1469,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct page **pages = NULL;
+ struct extent_state *cached_state = NULL;
u64 release_bytes = 0;
+ u64 lockstart;
+ u64 lockend;
unsigned long first_index;
size_t num_written = 0;
int nrptrs;
int ret = 0;
bool only_release_metadata = false;
bool force_page_uptodate = false;
+ bool need_unlock;
nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
@@ -1450,22 +1544,37 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
if (!only_release_metadata)
btrfs_free_reserved_data_space(inode,
reserve_bytes);
+ else
+ btrfs_end_nocow_write(root);
break;
}
release_bytes = reserve_bytes;
-
+ need_unlock = false;
+again:
/*
* This is going to setup the pages array with the number of
* pages we want, so we don't really need to worry about the
* contents of pages from loop to loop
*/
- ret = prepare_pages(root, file, pages, num_pages,
- pos, first_index, write_bytes,
+ ret = prepare_pages(inode, pages, num_pages,
+ pos, write_bytes,
force_page_uptodate);
if (ret)
break;
+ ret = lock_and_cleanup_extent_if_need(inode, pages, num_pages,
+ pos, &lockstart, &lockend,
+ &cached_state);
+ if (ret < 0) {
+ if (ret == -EAGAIN)
+ goto again;
+ break;
+ } else if (ret > 0) {
+ need_unlock = true;
+ ret = 0;
+ }
+
copied = btrfs_copy_from_user(pos, num_pages,
write_bytes, pages, i);
@@ -1510,18 +1619,23 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
}
release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
- if (copied > 0) {
+
+ if (copied > 0)
ret = btrfs_dirty_pages(root, inode, pages,
dirty_pages, pos, copied,
NULL);
- if (ret) {
- btrfs_drop_pages(pages, num_pages);
- break;
- }
+ if (need_unlock)
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+ lockstart, lockend, &cached_state,
+ GFP_NOFS);
+ if (ret) {
+ btrfs_drop_pages(pages, num_pages);
+ break;
}
release_bytes = 0;
- btrfs_drop_pages(pages, num_pages);
+ if (only_release_metadata)
+ btrfs_end_nocow_write(root);
if (only_release_metadata && copied > 0) {
u64 lockstart = round_down(pos, root->sectorsize);
@@ -1534,6 +1648,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
only_release_metadata = false;
}
+ btrfs_drop_pages(pages, num_pages);
+
cond_resched();
balance_dirty_pages_ratelimited(inode->i_mapping);
@@ -1547,37 +1663,34 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
kfree(pages);
if (release_bytes) {
- if (only_release_metadata)
+ if (only_release_metadata) {
+ btrfs_end_nocow_write(root);
btrfs_delalloc_release_metadata(inode, release_bytes);
- else
+ } else {
btrfs_delalloc_release_space(inode, release_bytes);
+ }
}
return num_written ? num_written : ret;
}
static ssize_t __btrfs_direct_write(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs, loff_t pos,
- loff_t *ppos, size_t count, size_t ocount)
+ struct iov_iter *from,
+ loff_t pos)
{
struct file *file = iocb->ki_filp;
- struct iov_iter i;
ssize_t written;
ssize_t written_buffered;
loff_t endbyte;
int err;
- written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
- count, ocount);
+ written = generic_file_direct_write(iocb, from, pos);
- if (written < 0 || written == count)
+ if (written < 0 || !iov_iter_count(from))
return written;
pos += written;
- count -= written;
- iov_iter_init(&i, iov, nr_segs, count, written);
- written_buffered = __btrfs_buffered_write(file, &i, pos);
+ written_buffered = __btrfs_buffered_write(file, from, pos);
if (written_buffered < 0) {
err = written_buffered;
goto out;
@@ -1587,7 +1700,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb,
if (err)
goto out;
written += written_buffered;
- *ppos = pos + written_buffered;
+ iocb->ki_pos = pos + written_buffered;
invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
endbyte >> PAGE_CACHE_SHIFT);
out:
@@ -1612,29 +1725,22 @@ static void update_time_for_write(struct inode *inode)
inode_inc_iversion(inode);
}
-static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
- loff_t *ppos = &iocb->ki_pos;
u64 start_pos;
+ u64 end_pos;
ssize_t num_written = 0;
ssize_t err = 0;
- size_t count, ocount;
+ size_t count = iov_iter_count(from);
bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
+ loff_t pos = iocb->ki_pos;
mutex_lock(&inode->i_mutex);
- err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
- if (err) {
- mutex_unlock(&inode->i_mutex);
- goto out;
- }
- count = ocount;
-
current->backing_dev_info = inode->i_mapping->backing_dev_info;
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err) {
@@ -1647,6 +1753,8 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
goto out;
}
+ iov_iter_truncate(from, count);
+
err = file_remove_suid(file);
if (err) {
mutex_unlock(&inode->i_mutex);
@@ -1675,7 +1783,9 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
start_pos = round_down(pos, root->sectorsize);
if (start_pos > i_size_read(inode)) {
- err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
+ /* Expand hole size to cover write data, preventing empty gap */
+ end_pos = round_up(pos + count, root->sectorsize);
+ err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
if (err) {
mutex_unlock(&inode->i_mutex);
goto out;
@@ -1686,16 +1796,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
atomic_inc(&BTRFS_I(inode)->sync_writers);
if (unlikely(file->f_flags & O_DIRECT)) {
- num_written = __btrfs_direct_write(iocb, iov, nr_segs,
- pos, ppos, count, ocount);
+ num_written = __btrfs_direct_write(iocb, from, pos);
} else {
- struct iov_iter i;
-
- iov_iter_init(&i, iov, nr_segs, count, num_written);
-
- num_written = __btrfs_buffered_write(file, &i, pos);
+ num_written = __btrfs_buffered_write(file, from, pos);
if (num_written > 0)
- *ppos = pos + num_written;
+ iocb->ki_pos = pos + num_written;
}
mutex_unlock(&inode->i_mutex);
@@ -1720,7 +1825,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
BTRFS_I(inode)->last_sub_trans = root->log_transid;
if (num_written > 0) {
err = generic_write_sync(file, pos, num_written);
- if (err < 0 && num_written > 0)
+ if (err < 0)
num_written = err;
}
@@ -1779,8 +1884,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
- int ret = 0;
struct btrfs_trans_handle *trans;
+ struct btrfs_log_ctx ctx;
+ int ret = 0;
bool full_sync = 0;
trace_btrfs_sync_file(file, datasync);
@@ -1809,8 +1915,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
- if (full_sync)
- btrfs_wait_ordered_range(inode, start, end - start + 1);
+ if (full_sync) {
+ ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
+ }
atomic_inc(&root->log_batch);
/*
@@ -1850,14 +1961,28 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (file->private_data)
btrfs_ioctl_trans_end(file);
+ /*
+ * We use start here because we will need to wait on the IO to complete
+ * in btrfs_sync_log, which could require joining a transaction (for
+ * example checking cross references in the nocow path). If we use join
+ * here we could get into a situation where we're waiting on IO to
+ * happen that is blocked on a transaction trying to commit. With start
+ * we inc the extwriter counter, so we wait for all extwriters to exit
+ * before we start blocking join'ers. This comment is to keep somebody
+ * from thinking they are super smart and changing this to
+ * btrfs_join_transaction *cough*Josef*cough*.
+ */
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
mutex_unlock(&inode->i_mutex);
goto out;
}
+ trans->sync = true;
+
+ btrfs_init_log_ctx(&ctx);
- ret = btrfs_log_dentry_safe(trans, root, dentry);
+ ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
if (ret < 0) {
/* Fallthrough and commit/free transaction. */
ret = 1;
@@ -1876,27 +2001,22 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
mutex_unlock(&inode->i_mutex);
if (ret != BTRFS_NO_LOG_SYNC) {
- if (ret > 0) {
- /*
- * If we didn't already wait for ordered extents we need
- * to do that now.
- */
- if (!full_sync)
- btrfs_wait_ordered_range(inode, start,
- end - start + 1);
- ret = btrfs_commit_transaction(trans, root);
- } else {
- ret = btrfs_sync_log(trans, root);
- if (ret == 0) {
+ if (!ret) {
+ ret = btrfs_sync_log(trans, root, &ctx);
+ if (!ret) {
ret = btrfs_end_transaction(trans, root);
- } else {
- if (!full_sync)
- btrfs_wait_ordered_range(inode, start,
- end -
- start + 1);
- ret = btrfs_commit_transaction(trans, root);
+ goto out;
+ }
+ }
+ if (!full_sync) {
+ ret = btrfs_wait_ordered_range(inode, start,
+ end - start + 1);
+ if (ret) {
+ btrfs_end_transaction(trans, root);
+ goto out;
}
}
+ ret = btrfs_commit_transaction(trans, root);
} else {
ret = btrfs_end_transaction(trans, root);
}
@@ -1906,6 +2026,7 @@ out:
static const struct vm_operations_struct btrfs_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = btrfs_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
@@ -1963,11 +2084,13 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
struct btrfs_key key;
int ret;
+ if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
+ goto out;
+
key.objectid = btrfs_ino(inode);
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = offset;
-
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0)
return ret;
@@ -2049,6 +2172,37 @@ out:
return 0;
}
+/*
+ * Find a hole extent on given inode and change start/len to the end of hole
+ * extent.(hole/vacuum extent whose em->start <= start &&
+ * em->start + em->len > start)
+ * When a hole extent is found, return 1 and modify start/len.
+ */
+static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
+{
+ struct extent_map *em;
+ int ret = 0;
+
+ em = btrfs_get_extent(inode, NULL, 0, *start, *len, 0);
+ if (IS_ERR_OR_NULL(em)) {
+ if (!em)
+ ret = -ENOMEM;
+ else
+ ret = PTR_ERR(em);
+ return ret;
+ }
+
+ /* Hole or vacuum extent(only exists in no-hole mode) */
+ if (em->block_start == EXTENT_MAP_HOLE) {
+ ret = 1;
+ *len = em->start + em->len > *start + *len ?
+ 0 : *start + *len - em->start - em->len;
+ *start = em->start + em->len;
+ }
+ free_extent_map(em);
+ return ret;
+}
+
static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2056,20 +2210,42 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
struct btrfs_path *path;
struct btrfs_block_rsv *rsv;
struct btrfs_trans_handle *trans;
- u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
- u64 lockend = round_down(offset + len,
- BTRFS_I(inode)->root->sectorsize) - 1;
- u64 cur_offset = lockstart;
+ u64 lockstart;
+ u64 lockend;
+ u64 tail_start;
+ u64 tail_len;
+ u64 orig_start = offset;
+ u64 cur_offset;
u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
u64 drop_end;
int ret = 0;
int err = 0;
- bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
- ((offset + len - 1) >> PAGE_CACHE_SHIFT));
+ int rsv_count;
+ bool same_page;
+ bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
+ u64 ino_size;
- btrfs_wait_ordered_range(inode, offset, len);
+ ret = btrfs_wait_ordered_range(inode, offset, len);
+ if (ret)
+ return ret;
mutex_lock(&inode->i_mutex);
+ ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
+ ret = find_first_non_hole(inode, &offset, &len);
+ if (ret < 0)
+ goto out_only_mutex;
+ if (ret && !len) {
+ /* Already in a large hole */
+ ret = 0;
+ goto out_only_mutex;
+ }
+
+ lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize);
+ lockend = round_down(offset + len,
+ BTRFS_I(inode)->root->sectorsize) - 1;
+ same_page = ((offset >> PAGE_CACHE_SHIFT) ==
+ ((offset + len - 1) >> PAGE_CACHE_SHIFT));
+
/*
* We needn't truncate any page which is beyond the end of the file
* because we are sure there is no data there.
@@ -2079,14 +2255,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
* entire page.
*/
if (same_page && len < PAGE_CACHE_SIZE) {
- if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
+ if (offset < ino_size)
ret = btrfs_truncate_page(inode, offset, len, 0);
- mutex_unlock(&inode->i_mutex);
- return ret;
+ goto out_only_mutex;
}
/* zero back part of the first page */
- if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+ if (offset < ino_size) {
ret = btrfs_truncate_page(inode, offset, 0, 0);
if (ret) {
mutex_unlock(&inode->i_mutex);
@@ -2094,12 +2269,39 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
}
- /* zero the front end of the last page */
- if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
- ret = btrfs_truncate_page(inode, offset + len, 0, 1);
- if (ret) {
- mutex_unlock(&inode->i_mutex);
- return ret;
+ /* Check the aligned pages after the first unaligned page,
+ * if offset != orig_start, which means the first unaligned page
+ * including serveral following pages are already in holes,
+ * the extra check can be skipped */
+ if (offset == orig_start) {
+ /* after truncate page, check hole again */
+ len = offset + len - lockstart;
+ offset = lockstart;
+ ret = find_first_non_hole(inode, &offset, &len);
+ if (ret < 0)
+ goto out_only_mutex;
+ if (ret && !len) {
+ ret = 0;
+ goto out_only_mutex;
+ }
+ lockstart = offset;
+ }
+
+ /* Check the tail unaligned part is in a hole */
+ tail_start = lockend + 1;
+ tail_len = offset + len - tail_start;
+ if (tail_len) {
+ ret = find_first_non_hole(inode, &tail_start, &tail_len);
+ if (unlikely(ret < 0))
+ goto out_only_mutex;
+ if (!ret) {
+ /* zero the front end of the last page */
+ if (tail_start + tail_len < ino_size) {
+ ret = btrfs_truncate_page(inode,
+ tail_start + tail_len, 0, 1);
+ if (ret)
+ goto out_only_mutex;
+ }
}
}
@@ -2123,11 +2325,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
* we need to try again.
*/
if ((!ordered ||
- (ordered->file_offset + ordered->len < lockstart ||
+ (ordered->file_offset + ordered->len <= lockstart ||
ordered->file_offset > lockend)) &&
- !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockend, EXTENT_UPTODATE, 0,
- cached_state)) {
+ !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
if (ordered)
btrfs_put_ordered_extent(ordered);
break;
@@ -2136,8 +2336,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
btrfs_put_ordered_extent(ordered);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
lockend, &cached_state, GFP_NOFS);
- btrfs_wait_ordered_range(inode, lockstart,
- lockend - lockstart + 1);
+ ret = btrfs_wait_ordered_range(inode, lockstart,
+ lockend - lockstart + 1);
+ if (ret) {
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+ }
}
path = btrfs_alloc_path();
@@ -2157,9 +2361,10 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
/*
* 1 - update the inode
* 1 - removing the extents in the range
- * 1 - adding the hole extent
+ * 1 - adding the hole extent if no_holes isn't set
*/
- trans = btrfs_start_transaction(root, 3);
+ rsv_count = no_holes ? 2 : 3;
+ trans = btrfs_start_transaction(root, rsv_count);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out_free;
@@ -2170,19 +2375,24 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
BUG_ON(ret);
trans->block_rsv = rsv;
+ cur_offset = lockstart;
+ len = lockend - cur_offset;
while (cur_offset < lockend) {
ret = __btrfs_drop_extents(trans, root, inode, path,
cur_offset, lockend + 1,
- &drop_end, 1);
+ &drop_end, 1, 0, 0, NULL);
if (ret != -ENOSPC)
break;
trans->block_rsv = &root->fs_info->trans_block_rsv;
- ret = fill_holes(trans, inode, path, cur_offset, drop_end);
- if (ret) {
- err = ret;
- break;
+ if (cur_offset < ino_size) {
+ ret = fill_holes(trans, inode, path, cur_offset,
+ drop_end);
+ if (ret) {
+ err = ret;
+ break;
+ }
}
cur_offset = drop_end;
@@ -2196,7 +2406,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root);
- trans = btrfs_start_transaction(root, 3);
+ trans = btrfs_start_transaction(root, rsv_count);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
@@ -2207,6 +2417,14 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
rsv, min_size);
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
+
+ ret = find_first_non_hole(inode, &cur_offset, &len);
+ if (unlikely(ret < 0))
+ break;
+ if (ret && !len) {
+ ret = 0;
+ break;
+ }
}
if (ret) {
@@ -2215,10 +2433,17 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
trans->block_rsv = &root->fs_info->trans_block_rsv;
- ret = fill_holes(trans, inode, path, cur_offset, drop_end);
- if (ret) {
- err = ret;
- goto out_trans;
+ /*
+ * Don't insert file hole extent item if it's for a range beyond eof
+ * (because it's useless) or if it represents a 0 bytes range (when
+ * cur_offset == drop_end).
+ */
+ if (cur_offset < ino_size && cur_offset < drop_end) {
+ ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+ if (ret) {
+ err = ret;
+ goto out_trans;
+ }
}
out_trans:
@@ -2238,6 +2463,7 @@ out_free:
out:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state, GFP_NOFS);
+out_only_mutex:
mutex_unlock(&inode->i_mutex);
if (ret && !err)
err = ret;
@@ -2308,7 +2534,10 @@ static long btrfs_fallocate(struct file *file, int mode,
* wait for ordered IO before we have any locks. We'll loop again
* below with the locks held.
*/
- btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+ ret = btrfs_wait_ordered_range(inode, alloc_start,
+ alloc_end - alloc_start);
+ if (ret)
+ goto out;
locked_end = alloc_end - 1;
while (1) {
@@ -2332,8 +2561,10 @@ static long btrfs_fallocate(struct file *file, int mode,
* we can't wait on the range with the transaction
* running or with the extent lock held
*/
- btrfs_wait_ordered_range(inode, alloc_start,
- alloc_end - alloc_start);
+ ret = btrfs_wait_ordered_range(inode, alloc_start,
+ alloc_end - alloc_start);
+ if (ret)
+ goto out;
} else {
if (ordered)
btrfs_put_ordered_extent(ordered);
@@ -2405,14 +2636,12 @@ out_reserve_fail:
static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct extent_map *em;
+ struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
u64 lockstart = *offset;
u64 lockend = i_size_read(inode);
u64 start = *offset;
- u64 orig_start = *offset;
u64 len = i_size_read(inode);
- u64 last_end = 0;
int ret = 0;
lockend = max_t(u64, root->sectorsize, lockend);
@@ -2429,89 +2658,35 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
&cached_state);
- /*
- * Delalloc is such a pain. If we have a hole and we have pending
- * delalloc for a portion of the hole we will get back a hole that
- * exists for the entire range since it hasn't been actually written
- * yet. So to take care of this case we need to look for an extent just
- * before the position we want in case there is outstanding delalloc
- * going on here.
- */
- if (whence == SEEK_HOLE && start != 0) {
- if (start <= root->sectorsize)
- em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
- root->sectorsize, 0);
- else
- em = btrfs_get_extent_fiemap(inode, NULL, 0,
- start - root->sectorsize,
- root->sectorsize, 0);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
- goto out;
- }
- last_end = em->start + em->len;
- if (em->block_start == EXTENT_MAP_DELALLOC)
- last_end = min_t(u64, last_end, inode->i_size);
- free_extent_map(em);
- }
-
- while (1) {
+ while (start < inode->i_size) {
em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
+ em = NULL;
break;
}
- if (em->block_start == EXTENT_MAP_HOLE) {
- if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
- if (last_end <= orig_start) {
- free_extent_map(em);
- ret = -ENXIO;
- break;
- }
- }
-
- if (whence == SEEK_HOLE) {
- *offset = start;
- free_extent_map(em);
- break;
- }
- } else {
- if (whence == SEEK_DATA) {
- if (em->block_start == EXTENT_MAP_DELALLOC) {
- if (start >= inode->i_size) {
- free_extent_map(em);
- ret = -ENXIO;
- break;
- }
- }
-
- if (!test_bit(EXTENT_FLAG_PREALLOC,
- &em->flags)) {
- *offset = start;
- free_extent_map(em);
- break;
- }
- }
- }
+ if (whence == SEEK_HOLE &&
+ (em->block_start == EXTENT_MAP_HOLE ||
+ test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
+ break;
+ else if (whence == SEEK_DATA &&
+ (em->block_start != EXTENT_MAP_HOLE &&
+ !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
+ break;
start = em->start + em->len;
- last_end = em->start + em->len;
-
- if (em->block_start == EXTENT_MAP_DELALLOC)
- last_end = min_t(u64, last_end, inode->i_size);
-
- if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
- free_extent_map(em);
- ret = -ENXIO;
- break;
- }
free_extent_map(em);
+ em = NULL;
cond_resched();
}
- if (!ret)
- *offset = min(*offset, inode->i_size);
-out:
+ free_extent_map(em);
+ if (!ret) {
+ if (whence == SEEK_DATA && start >= inode->i_size)
+ ret = -ENXIO;
+ else
+ *offset = min_t(loff_t, start, inode->i_size);
+ }
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
&cached_state, GFP_NOFS);
return ret;
@@ -2550,11 +2725,11 @@ out:
const struct file_operations btrfs_file_operations = {
.llseek = btrfs_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
.splice_read = generic_file_splice_read,
- .aio_write = btrfs_file_aio_write,
+ .write_iter = btrfs_file_write_iter,
.mmap = btrfs_file_mmap,
.open = generic_file_open,
.release = btrfs_release_file,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index b4f9904c4c6..2b0a627cb5f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -218,7 +218,6 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
int btrfs_truncate_free_space_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
struct inode *inode)
{
int ret = 0;
@@ -275,18 +274,32 @@ struct io_ctl {
};
static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
- struct btrfs_root *root)
+ struct btrfs_root *root, int write)
{
+ int num_pages;
+ int check_crcs = 0;
+
+ num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+
+ if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
+ check_crcs = 1;
+
+ /* Make sure we can fit our crcs into the first page */
+ if (write && check_crcs &&
+ (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
+ return -ENOSPC;
+
memset(io_ctl, 0, sizeof(struct io_ctl));
- io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT;
- io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
- GFP_NOFS);
+
+ io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
if (!io_ctl->pages)
return -ENOMEM;
+
+ io_ctl->num_pages = num_pages;
io_ctl->root = root;
- if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
- io_ctl->check_crcs = 1;
+ io_ctl->check_crcs = check_crcs;
+
return 0;
}
@@ -348,8 +361,8 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
btrfs_readpage(NULL, page);
lock_page(page);
if (!PageUptodate(page)) {
- printk(KERN_ERR "btrfs: error reading free "
- "space cache\n");
+ btrfs_err(BTRFS_I(inode)->root->fs_info,
+ "error reading free space cache");
io_ctl_drop_pages(io_ctl);
return -EIO;
}
@@ -406,7 +419,7 @@ static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
gen = io_ctl->cur;
if (le64_to_cpu(*gen) != generation) {
- printk_ratelimited(KERN_ERR "btrfs: space cache generation "
+ printk_ratelimited(KERN_ERR "BTRFS: space cache generation "
"(%Lu) does not match inode (%Lu)\n", *gen,
generation);
io_ctl_unmap_page(io_ctl);
@@ -464,7 +477,7 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
PAGE_CACHE_SIZE - offset);
btrfs_csum_final(crc, (char *)&crc);
if (val != crc) {
- printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free "
+ printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free "
"space cache\n");
io_ctl_unmap_page(io_ctl);
return -EIO;
@@ -667,6 +680,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
generation = btrfs_free_space_generation(leaf, header);
btrfs_release_path(path);
+ if (!BTRFS_I(inode)->generation) {
+ btrfs_info(root->fs_info,
+ "The free space cache file (%llu) is invalid. skip it\n",
+ offset);
+ return 0;
+ }
+
if (BTRFS_I(inode)->generation != generation) {
btrfs_err(root->fs_info,
"free space inode generation (%llu) "
@@ -678,7 +698,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
if (!num_entries)
return 0;
- ret = io_ctl_init(&io_ctl, inode, root);
+ ret = io_ctl_init(&io_ctl, inode, root, 0);
if (ret)
return ret;
@@ -832,7 +852,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
if (!matched) {
__btrfs_remove_free_space_cache(ctl);
- btrfs_err(fs_info, "block group %llu has wrong amount of free space",
+ btrfs_warn(fs_info, "block group %llu has wrong amount of free space",
block_group->key.objectid);
ret = -1;
}
@@ -844,7 +864,7 @@ out:
spin_unlock(&block_group->lock);
ret = 0;
- btrfs_err(fs_info, "failed to load free space cache for block group %llu",
+ btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now",
block_group->key.objectid);
}
@@ -852,90 +872,44 @@ out:
return ret;
}
-/**
- * __btrfs_write_out_cache - write out cached info to an inode
- * @root - the root the inode belongs to
- * @ctl - the free space cache we are going to write out
- * @block_group - the block_group for this cache if it belongs to a block_group
- * @trans - the trans handle
- * @path - the path to use
- * @offset - the offset for the key we'll insert
- *
- * This function writes out a free space cache struct to disk for quick recovery
- * on mount. This will return 0 if it was successfull in writing the cache out,
- * and -1 if it was not.
- */
-static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
- struct btrfs_free_space_ctl *ctl,
- struct btrfs_block_group_cache *block_group,
- struct btrfs_trans_handle *trans,
- struct btrfs_path *path, u64 offset)
+static noinline_for_stack
+int write_cache_extent_entries(struct io_ctl *io_ctl,
+ struct btrfs_free_space_ctl *ctl,
+ struct btrfs_block_group_cache *block_group,
+ int *entries, int *bitmaps,
+ struct list_head *bitmap_list)
{
- struct btrfs_free_space_header *header;
- struct extent_buffer *leaf;
- struct rb_node *node;
- struct list_head *pos, *n;
- struct extent_state *cached_state = NULL;
- struct btrfs_free_cluster *cluster = NULL;
- struct extent_io_tree *unpin = NULL;
- struct io_ctl io_ctl;
- struct list_head bitmap_list;
- struct btrfs_key key;
- u64 start, extent_start, extent_end, len;
- int entries = 0;
- int bitmaps = 0;
int ret;
- int err = -1;
-
- INIT_LIST_HEAD(&bitmap_list);
-
- if (!i_size_read(inode))
- return -1;
-
- ret = io_ctl_init(&io_ctl, inode, root);
- if (ret)
- return -1;
+ struct btrfs_free_cluster *cluster = NULL;
+ struct rb_node *node = rb_first(&ctl->free_space_offset);
/* Get the cluster for this block_group if it exists */
- if (block_group && !list_empty(&block_group->cluster_list))
+ if (block_group && !list_empty(&block_group->cluster_list)) {
cluster = list_entry(block_group->cluster_list.next,
struct btrfs_free_cluster,
block_group_list);
+ }
- /* Lock all pages first so we can lock the extent safely. */
- io_ctl_prepare_pages(&io_ctl, inode, 0);
-
- lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
- 0, &cached_state);
-
- node = rb_first(&ctl->free_space_offset);
if (!node && cluster) {
node = rb_first(&cluster->root);
cluster = NULL;
}
- /* Make sure we can fit our crcs into the first page */
- if (io_ctl.check_crcs &&
- (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
- goto out_nospc;
-
- io_ctl_set_generation(&io_ctl, trans->transid);
-
/* Write out the extent entries */
while (node) {
struct btrfs_free_space *e;
e = rb_entry(node, struct btrfs_free_space, offset_index);
- entries++;
+ *entries += 1;
- ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes,
+ ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
e->bitmap);
if (ret)
- goto out_nospc;
+ goto fail;
if (e->bitmap) {
- list_add_tail(&e->list, &bitmap_list);
- bitmaps++;
+ list_add_tail(&e->list, bitmap_list);
+ *bitmaps += 1;
}
node = rb_next(node);
if (!node && cluster) {
@@ -943,131 +917,289 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
cluster = NULL;
}
}
+ return 0;
+fail:
+ return -ENOSPC;
+}
+
+static noinline_for_stack int
+update_cache_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode,
+ struct btrfs_path *path, u64 offset,
+ int entries, int bitmaps)
+{
+ struct btrfs_key key;
+ struct btrfs_free_space_header *header;
+ struct extent_buffer *leaf;
+ int ret;
+
+ key.objectid = BTRFS_FREE_SPACE_OBJECTID;
+ key.offset = offset;
+ key.type = 0;
+
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ if (ret < 0) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
+ GFP_NOFS);
+ goto fail;
+ }
+ leaf = path->nodes[0];
+ if (ret > 0) {
+ struct btrfs_key found_key;
+ ASSERT(path->slots[0]);
+ path->slots[0]--;
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+ if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
+ found_key.offset != offset) {
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
+ inode->i_size - 1,
+ EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
+ NULL, GFP_NOFS);
+ btrfs_release_path(path);
+ goto fail;
+ }
+ }
+
+ BTRFS_I(inode)->generation = trans->transid;
+ header = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_free_space_header);
+ btrfs_set_free_space_entries(leaf, header, entries);
+ btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
+ btrfs_set_free_space_generation(leaf, header, trans->transid);
+ btrfs_mark_buffer_dirty(leaf);
+ btrfs_release_path(path);
+
+ return 0;
+
+fail:
+ return -1;
+}
+
+static noinline_for_stack int
+write_pinned_extent_entries(struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group,
+ struct io_ctl *io_ctl,
+ int *entries)
+{
+ u64 start, extent_start, extent_end, len;
+ struct extent_io_tree *unpin = NULL;
+ int ret;
+
+ if (!block_group)
+ return 0;
/*
* We want to add any pinned extents to our free space cache
* so we don't leak the space
- */
-
- /*
+ *
* We shouldn't have switched the pinned extents yet so this is the
* right one
*/
unpin = root->fs_info->pinned_extents;
- if (block_group)
- start = block_group->key.objectid;
+ start = block_group->key.objectid;
- while (block_group && (start < block_group->key.objectid +
- block_group->key.offset)) {
+ while (start < block_group->key.objectid + block_group->key.offset) {
ret = find_first_extent_bit(unpin, start,
&extent_start, &extent_end,
EXTENT_DIRTY, NULL);
- if (ret) {
- ret = 0;
- break;
- }
+ if (ret)
+ return 0;
/* This pinned extent is out of our range */
if (extent_start >= block_group->key.objectid +
block_group->key.offset)
- break;
+ return 0;
extent_start = max(extent_start, start);
extent_end = min(block_group->key.objectid +
block_group->key.offset, extent_end + 1);
len = extent_end - extent_start;
- entries++;
- ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
+ *entries += 1;
+ ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
if (ret)
- goto out_nospc;
+ return -ENOSPC;
start = extent_end;
}
+ return 0;
+}
+
+static noinline_for_stack int
+write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list)
+{
+ struct list_head *pos, *n;
+ int ret;
+
/* Write out the bitmaps */
- list_for_each_safe(pos, n, &bitmap_list) {
+ list_for_each_safe(pos, n, bitmap_list) {
struct btrfs_free_space *entry =
list_entry(pos, struct btrfs_free_space, list);
- ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap);
+ ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
if (ret)
- goto out_nospc;
+ return -ENOSPC;
list_del_init(&entry->list);
}
- /* Zero out the rest of the pages just to make sure */
- io_ctl_zero_remaining_pages(&io_ctl);
+ return 0;
+}
- ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
- 0, i_size_read(inode), &cached_state);
- io_ctl_drop_pages(&io_ctl);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+static int flush_dirty_cache(struct inode *inode)
+{
+ int ret;
+ ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (ret)
- goto out;
-
-
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
-
- key.objectid = BTRFS_FREE_SPACE_OBJECTID;
- key.offset = offset;
- key.type = 0;
-
- ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
- if (ret < 0) {
clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
GFP_NOFS);
- goto out;
+
+ return ret;
+}
+
+static void noinline_for_stack
+cleanup_write_cache_enospc(struct inode *inode,
+ struct io_ctl *io_ctl,
+ struct extent_state **cached_state,
+ struct list_head *bitmap_list)
+{
+ struct list_head *pos, *n;
+
+ list_for_each_safe(pos, n, bitmap_list) {
+ struct btrfs_free_space *entry =
+ list_entry(pos, struct btrfs_free_space, list);
+ list_del_init(&entry->list);
}
- leaf = path->nodes[0];
- if (ret > 0) {
- struct btrfs_key found_key;
- ASSERT(path->slots[0]);
- path->slots[0]--;
- btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
- found_key.offset != offset) {
- clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
- inode->i_size - 1,
- EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
- NULL, GFP_NOFS);
- btrfs_release_path(path);
+ io_ctl_drop_pages(io_ctl);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+ i_size_read(inode) - 1, cached_state,
+ GFP_NOFS);
+}
+
+/**
+ * __btrfs_write_out_cache - write out cached info to an inode
+ * @root - the root the inode belongs to
+ * @ctl - the free space cache we are going to write out
+ * @block_group - the block_group for this cache if it belongs to a block_group
+ * @trans - the trans handle
+ * @path - the path to use
+ * @offset - the offset for the key we'll insert
+ *
+ * This function writes out a free space cache struct to disk for quick recovery
+ * on mount. This will return 0 if it was successfull in writing the cache out,
+ * and -1 if it was not.
+ */
+static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
+ struct btrfs_free_space_ctl *ctl,
+ struct btrfs_block_group_cache *block_group,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u64 offset)
+{
+ struct extent_state *cached_state = NULL;
+ struct io_ctl io_ctl;
+ LIST_HEAD(bitmap_list);
+ int entries = 0;
+ int bitmaps = 0;
+ int ret;
+
+ if (!i_size_read(inode))
+ return -1;
+
+ ret = io_ctl_init(&io_ctl, inode, root, 1);
+ if (ret)
+ return -1;
+
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
+ down_write(&block_group->data_rwsem);
+ spin_lock(&block_group->lock);
+ if (block_group->delalloc_bytes) {
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ spin_unlock(&block_group->lock);
+ up_write(&block_group->data_rwsem);
+ BTRFS_I(inode)->generation = 0;
+ ret = 0;
goto out;
}
+ spin_unlock(&block_group->lock);
}
- BTRFS_I(inode)->generation = trans->transid;
- header = btrfs_item_ptr(leaf, path->slots[0],
- struct btrfs_free_space_header);
- btrfs_set_free_space_entries(leaf, header, entries);
- btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
- btrfs_set_free_space_generation(leaf, header, trans->transid);
- btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(path);
+ /* Lock all pages first so we can lock the extent safely. */
+ io_ctl_prepare_pages(&io_ctl, inode, 0);
+
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
+ 0, &cached_state);
- err = 0;
+ io_ctl_set_generation(&io_ctl, trans->transid);
+
+ /* Write out the extent entries in the free space cache */
+ ret = write_cache_extent_entries(&io_ctl, ctl,
+ block_group, &entries, &bitmaps,
+ &bitmap_list);
+ if (ret)
+ goto out_nospc;
+
+ /*
+ * Some spaces that are freed in the current transaction are pinned,
+ * they will be added into free space cache after the transaction is
+ * committed, we shouldn't lose them.
+ */
+ ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
+ if (ret)
+ goto out_nospc;
+
+ /* At last, we write out all the bitmaps. */
+ ret = write_bitmap_entries(&io_ctl, &bitmap_list);
+ if (ret)
+ goto out_nospc;
+
+ /* Zero out the rest of the pages just to make sure */
+ io_ctl_zero_remaining_pages(&io_ctl);
+
+ /* Everything is written out, now we dirty the pages in the file. */
+ ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
+ 0, i_size_read(inode), &cached_state);
+ if (ret)
+ goto out_nospc;
+
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ up_write(&block_group->data_rwsem);
+ /*
+ * Release the pages and unlock the extent, we will flush
+ * them out later
+ */
+ io_ctl_drop_pages(&io_ctl);
+
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+ i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+
+ /* Flush the dirty pages in the cache file. */
+ ret = flush_dirty_cache(inode);
+ if (ret)
+ goto out;
+
+ /* Update the cache item to tell everyone this cache file is valid. */
+ ret = update_cache_item(trans, root, inode, path, offset,
+ entries, bitmaps);
out:
io_ctl_free(&io_ctl);
- if (err) {
+ if (ret) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
}
btrfs_update_inode(trans, root, inode);
- return err;
+ return ret;
out_nospc:
- list_for_each_safe(pos, n, &bitmap_list) {
- struct btrfs_free_space *entry =
- list_entry(pos, struct btrfs_free_space, list);
- list_del_init(&entry->list);
- }
- io_ctl_drop_pages(&io_ctl);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
- i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+ cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list);
+
+ if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
+ up_write(&block_group->data_rwsem);
+
goto out;
}
@@ -1087,6 +1219,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
spin_unlock(&block_group->lock);
return 0;
}
+
+ if (block_group->delalloc_bytes) {
+ block_group->disk_cache_state = BTRFS_DC_WRITTEN;
+ spin_unlock(&block_group->lock);
+ return 0;
+ }
spin_unlock(&block_group->lock);
inode = lookup_free_space_inode(root, block_group, path);
@@ -1898,7 +2036,7 @@ out:
spin_unlock(&ctl->tree_lock);
if (ret) {
- printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
+ printk(KERN_CRIT "BTRFS: unable to add free space :%d\n", ret);
ASSERT(ret != -EEXIST);
}
@@ -2007,14 +2145,15 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
info = rb_entry(n, struct btrfs_free_space, offset_index);
if (info->bytes >= bytes && !block_group->ro)
count++;
- printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
- info->offset, info->bytes,
+ btrfs_crit(block_group->fs_info,
+ "entry offset %llu, bytes %llu, bitmap %s",
+ info->offset, info->bytes,
(info->bitmap) ? "yes" : "no");
}
- printk(KERN_INFO "block group has cluster?: %s\n",
+ btrfs_info(block_group->fs_info, "block group has cluster?: %s",
list_empty(&block_group->cluster_list) ? "no" : "yes");
- printk(KERN_INFO "%d blocks of free space at or bigger than bytes is"
- "\n", count);
+ btrfs_info(block_group->fs_info,
+ "%d blocks of free space at or bigger than bytes is", count);
}
void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
@@ -2276,7 +2415,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
goto out;
entry = rb_entry(node, struct btrfs_free_space, offset_index);
- while(1) {
+ while (1) {
if (entry->bytes < bytes && entry->bytes > *max_extent_size)
*max_extent_size = entry->bytes;
@@ -2417,7 +2556,6 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
struct btrfs_free_space *entry = NULL;
struct btrfs_free_space *last;
struct rb_node *node;
- u64 window_start;
u64 window_free;
u64 max_extent;
u64 total_size = 0;
@@ -2439,7 +2577,6 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
entry = rb_entry(node, struct btrfs_free_space, offset_index);
}
- window_start = entry->offset;
window_free = entry->bytes;
max_extent = entry->bytes;
first = entry;
@@ -2967,19 +3104,15 @@ out:
int btrfs_write_out_ino_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
- struct btrfs_path *path)
+ struct btrfs_path *path,
+ struct inode *inode)
{
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
- struct inode *inode;
int ret;
if (!btrfs_test_opt(root, INODE_MAP_CACHE))
return 0;
- inode = lookup_free_ino_inode(root, path);
- if (IS_ERR(inode))
- return 0;
-
ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
if (ret) {
btrfs_delalloc_release_metadata(inode, inode->i_size);
@@ -2990,7 +3123,6 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
#endif
}
- iput(inode);
return ret;
}
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index e737f92cf6d..0cf4977ef70 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -58,7 +58,6 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
int btrfs_truncate_free_space_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
- struct btrfs_path *path,
struct inode *inode);
int load_free_space_cache(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *block_group);
@@ -76,7 +75,8 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info,
struct btrfs_root *root);
int btrfs_write_out_ino_cache(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
- struct btrfs_path *path);
+ struct btrfs_path *path,
+ struct inode *inode);
void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group);
int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
new file mode 100644
index 00000000000..85889aa82c6
--- /dev/null
+++ b/fs/btrfs/hash.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <crypto/hash.h>
+#include <linux/err.h>
+#include "hash.h"
+
+static struct crypto_shash *tfm;
+
+int __init btrfs_hash_init(void)
+{
+ tfm = crypto_alloc_shash("crc32c", 0, 0);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ return 0;
+}
+
+void btrfs_hash_exit(void)
+{
+ crypto_free_shash(tfm);
+}
+
+u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length)
+{
+ struct {
+ struct shash_desc shash;
+ char ctx[crypto_shash_descsize(tfm)];
+ } desc;
+ int err;
+
+ desc.shash.tfm = tfm;
+ desc.shash.flags = 0;
+ *(u32 *)desc.ctx = crc;
+
+ err = crypto_shash_update(&desc.shash, address, length);
+ BUG_ON(err);
+
+ return *(u32 *)desc.ctx;
+}
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index 1d982812ab6..118a2316e5d 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -19,10 +19,15 @@
#ifndef __HASH__
#define __HASH__
-#include <linux/crc32c.h>
+int __init btrfs_hash_init(void);
+
+void btrfs_hash_exit(void);
+
+u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length);
+
static inline u64 btrfs_name_hash(const char *name, int len)
{
- return crc32c((u32)~1, name, len);
+ return btrfs_crc32c((u32)~1, name, len);
}
/*
@@ -31,7 +36,7 @@ static inline u64 btrfs_name_hash(const char *name, int len)
static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
int len)
{
- return (u64) crc32c(parent_objectid, name, len);
+ return (u64) btrfs_crc32c(parent_objectid, name, len);
}
#endif
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index e0b7034d634..2be38df703c 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -91,32 +91,6 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid,
return 0;
}
-static struct btrfs_inode_ref *
-btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, int ins_len,
- int cow)
-{
- int ret;
- struct btrfs_key key;
- struct btrfs_inode_ref *ref;
-
- key.objectid = inode_objectid;
- key.type = BTRFS_INODE_REF_KEY;
- key.offset = ref_objectid;
-
- ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
- return NULL;
- if (!find_name_in_backref(path, name, name_len, &ref))
- return NULL;
- return ref;
-}
-
/* Returns NULL if no extref found */
struct btrfs_inode_extref *
btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
@@ -144,45 +118,6 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
return extref;
}
-int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- const char *name, int name_len,
- u64 inode_objectid, u64 ref_objectid, int mod,
- u64 *ret_index)
-{
- struct btrfs_inode_ref *ref;
- struct btrfs_inode_extref *extref;
- int ins_len = mod < 0 ? -1 : 0;
- int cow = mod != 0;
-
- ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len,
- inode_objectid, ref_objectid, ins_len,
- cow);
- if (IS_ERR(ref))
- return PTR_ERR(ref);
-
- if (ref != NULL) {
- *ret_index = btrfs_inode_ref_index(path->nodes[0], ref);
- return 0;
- }
-
- btrfs_release_path(path);
-
- extref = btrfs_lookup_inode_extref(trans, root, path, name,
- name_len, inode_objectid,
- ref_objectid, ins_len, cow);
- if (IS_ERR(extref))
- return PTR_ERR(extref);
-
- if (extref) {
- *ret_index = btrfs_inode_extref_index(path->nodes[0], extref);
- return 0;
- }
-
- return -ENOENT;
-}
-
static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
@@ -369,7 +304,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
goto out;
leaf = path->nodes[0];
- item = btrfs_item_nr(leaf, path->slots[0]);
+ item = btrfs_item_nr(path->slots[0]);
ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
ptr += btrfs_item_size(leaf, item) - ins_len;
extref = (struct btrfs_inode_extref *)ptr;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 2c66ddbbe67..888fbe19079 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -55,7 +55,7 @@ static int caching_kthread(void *data)
key.type = BTRFS_INODE_ITEM_KEY;
again:
/* need to make sure the commit_root doesn't disappear */
- mutex_lock(&root->fs_commit_mutex);
+ down_read(&fs_info->commit_root_sem);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -78,10 +78,8 @@ again:
btrfs_transaction_in_commit(fs_info)) {
leaf = path->nodes[0];
- if (btrfs_header_nritems(leaf) == 0) {
- WARN_ON(1);
+ if (WARN_ON(btrfs_header_nritems(leaf) == 0))
break;
- }
/*
* Save the key so we can advances forward
@@ -90,7 +88,7 @@ again:
btrfs_item_key_to_cpu(leaf, &key, 0);
btrfs_release_path(path);
root->cache_progress = last;
- mutex_unlock(&root->fs_commit_mutex);
+ up_read(&fs_info->commit_root_sem);
schedule_timeout(1);
goto again;
} else
@@ -129,7 +127,7 @@ next:
btrfs_unpin_free_ino(root);
out:
wake_up(&root->cache_wait);
- mutex_unlock(&root->fs_commit_mutex);
+ up_read(&fs_info->commit_root_sem);
btrfs_free_path(path);
@@ -176,9 +174,13 @@ static void start_caching(struct btrfs_root *root)
BTRFS_LAST_FREE_OBJECTID - objectid + 1);
}
- tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n",
+ tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu",
root->root_key.objectid);
- BUG_ON(IS_ERR(tsk)); /* -ENOMEM */
+ if (IS_ERR(tsk)) {
+ btrfs_warn(root->fs_info, "failed to start inode caching task");
+ btrfs_clear_and_info(root, CHANGE_INODE_CACHE,
+ "disabling inode map caching");
+ }
}
int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
@@ -207,42 +209,28 @@ again:
void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
{
- struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
if (!btrfs_test_opt(root, INODE_MAP_CACHE))
return;
-
again:
if (root->cached == BTRFS_CACHE_FINISHED) {
- __btrfs_add_free_space(ctl, objectid, 1);
+ __btrfs_add_free_space(pinned, objectid, 1);
} else {
- /*
- * If we are in the process of caching free ino chunks,
- * to avoid adding the same inode number to the free_ino
- * tree twice due to cross transaction, we'll leave it
- * in the pinned tree until a transaction is committed
- * or the caching work is done.
- */
-
- mutex_lock(&root->fs_commit_mutex);
+ down_write(&root->fs_info->commit_root_sem);
spin_lock(&root->cache_lock);
if (root->cached == BTRFS_CACHE_FINISHED) {
spin_unlock(&root->cache_lock);
- mutex_unlock(&root->fs_commit_mutex);
+ up_write(&root->fs_info->commit_root_sem);
goto again;
}
spin_unlock(&root->cache_lock);
start_caching(root);
- if (objectid <= root->cache_progress ||
- objectid > root->highest_objectid)
- __btrfs_add_free_space(ctl, objectid, 1);
- else
- __btrfs_add_free_space(pinned, objectid, 1);
+ __btrfs_add_free_space(pinned, objectid, 1);
- mutex_unlock(&root->fs_commit_mutex);
+ up_write(&root->fs_info->commit_root_sem);
}
}
@@ -252,7 +240,7 @@ again:
* and others will just be dropped, because the commit root we were
* searching has changed.
*
- * Must be called with root->fs_commit_mutex held
+ * Must be called with root->fs_info->commit_root_sem held
*/
void btrfs_unpin_free_ino(struct btrfs_root *root)
{
@@ -412,8 +400,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
return 0;
/* Don't save inode cache if we are deleting this root */
- if (btrfs_root_refs(&root->root_item) == 0 &&
- root != root->fs_info->tree_root)
+ if (btrfs_root_refs(&root->root_item) == 0)
return 0;
if (!btrfs_test_opt(root, INODE_MAP_CACHE))
@@ -467,7 +454,7 @@ again:
}
if (i_size_read(inode) > 0) {
- ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
+ ret = btrfs_truncate_free_space_cache(root, trans, inode);
if (ret) {
if (ret != -ENOSPC)
btrfs_abort_transaction(trans, root, ret);
@@ -504,7 +491,7 @@ again:
}
btrfs_free_reserved_data_space(inode, prealloc);
- ret = btrfs_write_out_ino_cache(root, trans, path);
+ ret = btrfs_write_out_ino_cache(root, trans, path, inode);
out_put:
iput(inode);
out_release:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 22ebc13b6c9..3668048e16f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -43,7 +43,6 @@
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
-#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -59,9 +58,10 @@
#include "inode-map.h"
#include "backref.h"
#include "hash.h"
+#include "props.h"
struct btrfs_iget_args {
- u64 ino;
+ struct btrfs_key *location;
struct btrfs_root *root;
};
@@ -125,14 +125,13 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
* the btree. The caller should have done a btrfs_drop_extents so that
* no overlapping inline items exist in the btree
*/
-static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
+static int insert_inline_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, int extent_inserted,
struct btrfs_root *root, struct inode *inode,
u64 start, size_t size, size_t compressed_size,
int compress_type,
struct page **compressed_pages)
{
- struct btrfs_key key;
- struct btrfs_path *path;
struct extent_buffer *leaf;
struct page *page = NULL;
char *kaddr;
@@ -141,29 +140,29 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
int err = 0;
int ret;
size_t cur_size = size;
- size_t datasize;
unsigned long offset;
if (compressed_size && compressed_pages)
cur_size = compressed_size;
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
+ inode_add_bytes(inode, size);
- path->leave_spinning = 1;
+ if (!extent_inserted) {
+ struct btrfs_key key;
+ size_t datasize;
- key.objectid = btrfs_ino(inode);
- key.offset = start;
- btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
- datasize = btrfs_file_extent_calc_inline_size(cur_size);
+ key.objectid = btrfs_ino(inode);
+ key.offset = start;
+ btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
- inode_add_bytes(inode, size);
- ret = btrfs_insert_empty_item(trans, root, path, &key,
- datasize);
- if (ret) {
- err = ret;
- goto fail;
+ datasize = btrfs_file_extent_calc_inline_size(cur_size);
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(trans, root, path, &key,
+ datasize);
+ if (ret) {
+ err = ret;
+ goto fail;
+ }
}
leaf = path->nodes[0];
ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -204,7 +203,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
page_cache_release(page);
}
btrfs_mark_buffer_dirty(leaf);
- btrfs_free_path(path);
+ btrfs_release_path(path);
/*
* we're an inline extent, so nobody can
@@ -220,7 +219,6 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
return ret;
fail:
- btrfs_free_path(path);
return err;
}
@@ -243,6 +241,9 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
u64 aligned_end = ALIGN(end, root->sectorsize);
u64 data_len = inline_len;
int ret;
+ struct btrfs_path *path;
+ int extent_inserted = 0;
+ u32 extent_item_size;
if (compressed_size)
data_len = compressed_size;
@@ -257,12 +258,27 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
return 1;
}
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
trans = btrfs_join_transaction(root);
- if (IS_ERR(trans))
+ if (IS_ERR(trans)) {
+ btrfs_free_path(path);
return PTR_ERR(trans);
+ }
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
- ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
+ if (compressed_size && compressed_pages)
+ extent_item_size = btrfs_file_extent_calc_inline_size(
+ compressed_size);
+ else
+ extent_item_size = btrfs_file_extent_calc_inline_size(
+ inline_len);
+
+ ret = __btrfs_drop_extents(trans, root, inode, path,
+ start, aligned_end, NULL,
+ 1, 1, extent_item_size, &extent_inserted);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto out;
@@ -270,7 +286,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
if (isize > actual_end)
inline_len = min_t(u64, isize, actual_end);
- ret = insert_inline_extent(trans, root, inode, start,
+ ret = insert_inline_extent(trans, path, extent_inserted,
+ root, inode, start,
inline_len, compressed_size,
compress_type, compressed_pages);
if (ret && ret != -ENOSPC) {
@@ -285,6 +302,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
btrfs_delalloc_release_metadata(inode, end + 1 - start);
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
out:
+ btrfs_free_path(path);
btrfs_end_transaction(trans, root);
return ret;
}
@@ -376,6 +394,14 @@ static noinline int compress_file_range(struct inode *inode,
(start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
btrfs_add_inode_defrag(NULL, inode);
+ /*
+ * skip compression for a small file range(<=blocksize) that
+ * isn't an inline extent, since it dosen't save disk space at all.
+ */
+ if ((end - start + 1) <= blocksize &&
+ (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+ goto cleanup_and_bail_uncompressed;
+
actual_end = min_t(u64, isize, end + 1);
again:
will_compress = 0;
@@ -667,7 +693,7 @@ retry:
ret = btrfs_reserve_extent(root,
async_extent->compressed_size,
async_extent->compressed_size,
- 0, alloc_hint, &ins, 1);
+ 0, alloc_hint, &ins, 1, 1);
if (ret) {
int i;
@@ -768,7 +794,7 @@ retry:
out:
return ret;
out_free_reserve:
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_free:
extent_clear_unlock_delalloc(inode, async_extent->start,
async_extent->start +
@@ -844,7 +870,11 @@ static noinline int cow_file_range(struct inode *inode,
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
int ret = 0;
- BUG_ON(btrfs_is_free_space_inode(inode));
+ if (btrfs_is_free_space_inode(inode)) {
+ WARN_ON_ONCE(1);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
num_bytes = ALIGN(end - start + 1, blocksize);
num_bytes = max(blocksize, num_bytes);
@@ -887,7 +917,7 @@ static noinline int cow_file_range(struct inode *inode,
cur_alloc_size = disk_num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size,
root->sectorsize, 0, alloc_hint,
- &ins, 1);
+ &ins, 1, 1);
if (ret < 0)
goto out_unlock;
@@ -965,7 +995,7 @@ out:
return ret;
out_reserve:
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
out_unlock:
extent_clear_unlock_delalloc(inode, start, end, locked_page,
EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
@@ -1054,17 +1084,15 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
async_cow->end = cur_end;
INIT_LIST_HEAD(&async_cow->extents);
- async_cow->work.func = async_cow_start;
- async_cow->work.ordered_func = async_cow_submit;
- async_cow->work.ordered_free = async_cow_free;
- async_cow->work.flags = 0;
+ btrfs_init_work(&async_cow->work, async_cow_start,
+ async_cow_submit, async_cow_free);
nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
PAGE_CACHE_SHIFT;
atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
- btrfs_queue_worker(&root->fs_info->delalloc_workers,
- &async_cow->work);
+ btrfs_queue_work(root->fs_info->delalloc_workers,
+ &async_cow->work);
if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
wait_event(root->fs_info->async_submit_wait,
@@ -1178,10 +1206,8 @@ static noinline int run_delalloc_nocow(struct inode *inode,
while (1) {
ret = btrfs_lookup_file_extent(trans, root, path, ino,
cur_offset, 0);
- if (ret < 0) {
- btrfs_abort_transaction(trans, root, ret);
+ if (ret < 0)
goto error;
- }
if (ret > 0 && path->slots[0] > 0 && check_prev) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &found_key,
@@ -1195,10 +1221,8 @@ next_slot:
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- btrfs_abort_transaction(trans, root, ret);
+ if (ret < 0)
goto error;
- }
if (ret > 0)
break;
leaf = path->nodes[0];
@@ -1255,6 +1279,15 @@ next_slot:
disk_bytenr += cur_offset - found_key.offset;
num_bytes = min(end + 1, extent_end) - cur_offset;
/*
+ * if there are pending snapshots for this root,
+ * we fall into common COW way.
+ */
+ if (!nolock) {
+ err = btrfs_start_nocow_write(root);
+ if (!err)
+ goto out_check;
+ }
+ /*
* force cow if csum exists in the range.
* this ensure that csum for a given extent are
* either valid or do not exist.
@@ -1264,7 +1297,8 @@ next_slot:
nocow = 1;
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
extent_end = found_key.offset +
- btrfs_file_extent_inline_len(leaf, fi);
+ btrfs_file_extent_inline_len(leaf,
+ path->slots[0], fi);
extent_end = ALIGN(extent_end, root->sectorsize);
} else {
BUG_ON(1);
@@ -1272,6 +1306,8 @@ next_slot:
out_check:
if (extent_end <= start) {
path->slots[0]++;
+ if (!nolock && nocow)
+ btrfs_end_nocow_write(root);
goto next_slot;
}
if (!nocow) {
@@ -1290,7 +1326,8 @@ out_check:
cow_start, found_key.offset - 1,
page_started, nr_written, 1);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ if (!nolock && nocow)
+ btrfs_end_nocow_write(root);
goto error;
}
cow_start = (u64)-1;
@@ -1340,7 +1377,8 @@ out_check:
ret = btrfs_reloc_clone_csums(inode, cur_offset,
num_bytes);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ if (!nolock && nocow)
+ btrfs_end_nocow_write(root);
goto error;
}
}
@@ -1350,6 +1388,8 @@ out_check:
locked_page, EXTENT_LOCKED |
EXTENT_DELALLOC, PAGE_UNLOCK |
PAGE_SET_PRIVATE2);
+ if (!nolock && nocow)
+ btrfs_end_nocow_write(root);
cur_offset = extent_end;
if (cur_offset > end)
break;
@@ -1364,10 +1404,8 @@ out_check:
if (cow_start != (u64)-1) {
ret = cow_file_range(inode, locked_page, cow_start, end,
page_started, nr_written, 1);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ if (ret)
goto error;
- }
}
error:
@@ -1551,7 +1589,13 @@ static void btrfs_clear_bit_hook(struct inode *inode,
spin_unlock(&BTRFS_I(inode)->lock);
}
- if (*bits & EXTENT_DO_ACCOUNTING)
+ /*
+ * We don't reserve metadata space for space cache inodes so we
+ * don't need to call dellalloc_release_metadata if there is an
+ * error.
+ */
+ if (*bits & EXTENT_DO_ACCOUNTING &&
+ root != root->fs_info->tree_root)
btrfs_delalloc_release_metadata(inode, len);
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
@@ -1579,7 +1623,7 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
unsigned long bio_flags)
{
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
- u64 logical = (u64)bio->bi_sector << 9;
+ u64 logical = (u64)bio->bi_iter.bi_sector << 9;
u64 length = 0;
u64 map_length;
int ret;
@@ -1587,7 +1631,7 @@ int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
if (bio_flags & EXTENT_BIO_COMPRESSED)
return 0;
- length = bio->bi_size;
+ length = bio->bi_iter.bi_size;
map_length = length;
ret = btrfs_map_block(root->fs_info, rw, logical,
&map_length, NULL, 0);
@@ -1825,9 +1869,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
SetPageChecked(page);
page_cache_get(page);
- fixup->work.func = btrfs_writepage_fixup_worker;
+ btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
- btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
+ btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
return -EBUSY;
}
@@ -1843,14 +1887,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_key ins;
+ int extent_inserted = 0;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- path->leave_spinning = 1;
-
/*
* we may be replacing one extent in the tree with another.
* The new extent is pinned in the extent map, and we don't want
@@ -1860,17 +1903,23 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
* the caller is expected to unpin it and allow it to be merged
* with the others.
*/
- ret = btrfs_drop_extents(trans, root, inode, file_pos,
- file_pos + num_bytes, 0);
+ ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
+ file_pos + num_bytes, NULL, 0,
+ 1, sizeof(*fi), &extent_inserted);
if (ret)
goto out;
- ins.objectid = btrfs_ino(inode);
- ins.offset = file_pos;
- ins.type = BTRFS_EXTENT_DATA_KEY;
- ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
- if (ret)
- goto out;
+ if (!extent_inserted) {
+ ins.objectid = btrfs_ino(inode);
+ ins.offset = file_pos;
+ ins.type = BTRFS_EXTENT_DATA_KEY;
+
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(trans, root, path, &ins,
+ sizeof(*fi));
+ if (ret)
+ goto out;
+ }
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -2041,10 +2090,8 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
key.offset = offset;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0) {
- WARN_ON(1);
+ if (WARN_ON(ret < 0))
return ret;
- }
ret = 0;
while (1) {
@@ -2133,7 +2180,8 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path,
old->extent_offset, fs_info,
path, record_one_backref,
old);
- BUG_ON(ret < 0 && ret != -ENOENT);
+ if (ret < 0 && ret != -ENOENT)
+ return false;
/* no backref to be processed for this extent */
if (!old->count) {
@@ -2217,6 +2265,11 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
return PTR_ERR(root);
}
+ if (btrfs_root_readonly(root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ return 0;
+ }
+
/* step 2: get inode */
key.objectid = backref->inum;
key.type = BTRFS_INODE_ITEM_KEY;
@@ -2293,7 +2346,7 @@ again:
u64 extent_len;
struct btrfs_key found_key;
- ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
+ ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
if (ret < 0)
goto out_free_path;
@@ -2367,10 +2420,23 @@ out_unlock:
return ret;
}
+static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
+{
+ struct old_sa_defrag_extent *old, *tmp;
+
+ if (!new)
+ return;
+
+ list_for_each_entry_safe(old, tmp, &new->head, list) {
+ list_del(&old->list);
+ kfree(old);
+ }
+ kfree(new);
+}
+
static void relink_file_extents(struct new_sa_defrag_extent *new)
{
struct btrfs_path *path;
- struct old_sa_defrag_extent *old, *tmp;
struct sa_defrag_extent_backref *backref;
struct sa_defrag_extent_backref *prev = NULL;
struct inode *inode;
@@ -2413,16 +2479,11 @@ static void relink_file_extents(struct new_sa_defrag_extent *new)
kfree(prev);
btrfs_free_path(path);
-
- list_for_each_entry_safe(old, tmp, &new->head, list) {
- list_del(&old->list);
- kfree(old);
- }
out:
+ free_sa_defrag_extent(new);
+
atomic_dec(&root->fs_info->defrag_running);
wake_up(&root->fs_info->transaction_wait);
-
- kfree(new);
}
static struct new_sa_defrag_extent *
@@ -2432,7 +2493,7 @@ record_old_file_extents(struct inode *inode,
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
struct btrfs_key key;
- struct old_sa_defrag_extent *old, *tmp;
+ struct old_sa_defrag_extent *old;
struct new_sa_defrag_extent *new;
int ret;
@@ -2480,7 +2541,7 @@ record_old_file_extents(struct inode *inode,
if (slot >= btrfs_header_nritems(l)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
- goto out_free_list;
+ goto out_free_path;
else if (ret > 0)
break;
continue;
@@ -2509,7 +2570,7 @@ record_old_file_extents(struct inode *inode,
old = kmalloc(sizeof(*old), GFP_NOFS);
if (!old)
- goto out_free_list;
+ goto out_free_path;
offset = max(new->file_pos, key.offset);
end = min(new->file_pos + new->len, key.offset + num_bytes);
@@ -2531,24 +2592,28 @@ next:
return new;
-out_free_list:
- list_for_each_entry_safe(old, tmp, &new->head, list) {
- list_del(&old->list);
- kfree(old);
- }
out_free_path:
btrfs_free_path(path);
out_kfree:
- kfree(new);
+ free_sa_defrag_extent(new);
return NULL;
}
-/*
- * helper function for btrfs_finish_ordered_io, this
- * just reads in some of the csum leaves to prime them into ram
- * before we start the transaction. It limits the amount of btree
- * reads required while inside the transaction.
- */
+static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
+ u64 start, u64 len)
+{
+ struct btrfs_block_group_cache *cache;
+
+ cache = btrfs_lookup_block_group(root->fs_info, start);
+ ASSERT(cache);
+
+ spin_lock(&cache->lock);
+ cache->delalloc_bytes -= len;
+ spin_unlock(&cache->lock);
+
+ btrfs_put_block_group(cache);
+}
+
/* as ordered data IO finishes, this gets called so we can finish
* an ordered extent if the range of bytes in the file it covers are
* fully written.
@@ -2610,7 +2675,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
EXTENT_DEFRAG, 1, cached_state);
if (ret) {
u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
- if (last_snapshot >= BTRFS_I(inode)->generation)
+ if (0 && last_snapshot >= BTRFS_I(inode)->generation)
/* the inode is shared */
new = record_old_file_extents(inode, ordered_extent);
@@ -2628,6 +2693,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
trans = NULL;
goto out_unlock;
}
+
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
@@ -2647,6 +2713,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
logical_len, logical_len,
compress_type, 0, 0,
BTRFS_FILE_EXTENT_REG);
+ if (!ret)
+ btrfs_release_delalloc_bytes(root,
+ ordered_extent->start,
+ ordered_extent->disk_len);
}
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
ordered_extent->file_offset, ordered_extent->len,
@@ -2699,7 +2769,7 @@ out:
!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
!test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
btrfs_free_reserved_extent(root, ordered_extent->start,
- ordered_extent->disk_len);
+ ordered_extent->disk_len, 1);
}
@@ -2710,8 +2780,14 @@ out:
btrfs_remove_ordered_extent(inode, ordered_extent);
/* for snapshot-aware defrag */
- if (new)
- relink_file_extents(new);
+ if (new) {
+ if (ret) {
+ free_sa_defrag_extent(new);
+ atomic_dec(&root->fs_info->defrag_running);
+ } else {
+ relink_file_extents(new);
+ }
+ }
/* once for us */
btrfs_put_ordered_extent(ordered_extent);
@@ -2734,7 +2810,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
struct inode *inode = page->mapping->host;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ordered_extent *ordered_extent = NULL;
- struct btrfs_workers *workers;
+ struct btrfs_workqueue *workers;
trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
@@ -2743,14 +2819,13 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
end - start + 1, uptodate))
return 0;
- ordered_extent->work.func = finish_ordered_fn;
- ordered_extent->work.flags = 0;
+ btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
if (btrfs_is_free_space_inode(inode))
- workers = &root->fs_info->endio_freespace_worker;
+ workers = root->fs_info->endio_freespace_worker;
else
- workers = &root->fs_info->endio_write_workers;
- btrfs_queue_worker(workers, &ordered_extent->work);
+ workers = root->fs_info->endio_write_workers;
+ btrfs_queue_work(workers, &ordered_extent->work);
return 0;
}
@@ -2892,14 +2967,15 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
root->orphan_block_rsv = NULL;
spin_unlock(&root->orphan_lock);
- if (root->orphan_item_inserted &&
+ if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
btrfs_root_refs(&root->root_item) > 0) {
ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
root->root_key.objectid);
if (ret)
btrfs_abort_transaction(trans, root, ret);
else
- root->orphan_item_inserted = 0;
+ clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
+ &root->state);
}
if (block_rsv) {
@@ -2969,6 +3045,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
if (insert >= 1) {
ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
if (ret) {
+ atomic_dec(&root->orphan_inodes);
if (reserve) {
clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
&BTRFS_I(inode)->runtime_flags);
@@ -3018,14 +3095,16 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
release_rsv = 1;
spin_unlock(&root->orphan_lock);
- if (trans && delete_item)
- ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
-
- if (release_rsv) {
- btrfs_orphan_release_metadata(inode);
+ if (delete_item) {
atomic_dec(&root->orphan_inodes);
+ if (trans)
+ ret = btrfs_del_orphan_item(trans, root,
+ btrfs_ino(inode));
}
+ if (release_rsv)
+ btrfs_orphan_release_metadata(inode);
+
return ret;
}
@@ -3172,8 +3251,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
/* if we have links, this was a truncate, lets do that */
if (inode->i_nlink) {
- if (!S_ISREG(inode->i_mode)) {
- WARN_ON(1);
+ if (WARN_ON(!S_ISREG(inode->i_mode))) {
iput(inode);
continue;
}
@@ -3214,7 +3292,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
btrfs_block_rsv_release(root, root->orphan_block_rsv,
(u64)-1);
- if (root->orphan_block_rsv || root->orphan_item_inserted) {
+ if (root->orphan_block_rsv ||
+ test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
trans = btrfs_join_transaction(root);
if (!IS_ERR(trans))
btrfs_end_transaction(trans, root);
@@ -3240,7 +3319,8 @@ out:
* slot is the slot the inode is in, objectid is the objectid of the inode
*/
static noinline int acls_after_inode_item(struct extent_buffer *leaf,
- int slot, u64 objectid)
+ int slot, u64 objectid,
+ int *first_xattr_slot)
{
u32 nritems = btrfs_header_nritems(leaf);
struct btrfs_key found_key;
@@ -3256,6 +3336,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
}
slot++;
+ *first_xattr_slot = -1;
while (slot < nritems) {
btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -3265,6 +3346,8 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
/* we found an xattr, assume we've got an acl */
if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
+ if (*first_xattr_slot == -1)
+ *first_xattr_slot = slot;
if (found_key.offset == xattr_access ||
found_key.offset == xattr_default)
return 1;
@@ -3293,6 +3376,8 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
* something larger than an xattr. We have to assume the inode
* has acls
*/
+ if (*first_xattr_slot == -1)
+ *first_xattr_slot = slot;
return 1;
}
@@ -3307,10 +3392,12 @@ static void btrfs_read_locked_inode(struct inode *inode)
struct btrfs_timespec *tspec;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key location;
+ unsigned long ptr;
int maybe_acls;
u32 rdev;
int ret;
bool filled = false;
+ int first_xattr_slot;
ret = btrfs_fill_inode(inode, &rdev);
if (!ret)
@@ -3320,7 +3407,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
if (!path)
goto make_bad;
- path->leave_spinning = 1;
memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
@@ -3330,7 +3416,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
leaf = path->nodes[0];
if (filled)
- goto cache_acl;
+ goto cache_index;
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
@@ -3373,18 +3459,51 @@ static void btrfs_read_locked_inode(struct inode *inode)
BTRFS_I(inode)->index_cnt = (u64)-1;
BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
+
+cache_index:
+ path->slots[0]++;
+ if (inode->i_nlink != 1 ||
+ path->slots[0] >= btrfs_header_nritems(leaf))
+ goto cache_acl;
+
+ btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
+ if (location.objectid != btrfs_ino(inode))
+ goto cache_acl;
+
+ ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ if (location.type == BTRFS_INODE_REF_KEY) {
+ struct btrfs_inode_ref *ref;
+
+ ref = (struct btrfs_inode_ref *)ptr;
+ BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
+ } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
+ struct btrfs_inode_extref *extref;
+
+ extref = (struct btrfs_inode_extref *)ptr;
+ BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
+ extref);
+ }
cache_acl:
/*
* try to precache a NULL acl entry for files that don't have
* any xattrs or acls
*/
maybe_acls = acls_after_inode_item(leaf, path->slots[0],
- btrfs_ino(inode));
+ btrfs_ino(inode), &first_xattr_slot);
+ if (first_xattr_slot != -1) {
+ path->slots[0] = first_xattr_slot;
+ ret = btrfs_load_inode_props(inode, path);
+ if (ret)
+ btrfs_err(root->fs_info,
+ "error loading props for ino %llu (root %llu): %d",
+ btrfs_ino(inode),
+ root->root_key.objectid, ret);
+ }
+ btrfs_free_path(path);
+
if (!maybe_acls)
cache_no_acl(inode);
- btrfs_free_path(path);
-
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_mapping->a_ops = &btrfs_aops;
@@ -3488,7 +3607,6 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
goto failed;
}
- btrfs_unlock_up_safe(path, 1);
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_inode_item);
@@ -3585,6 +3703,24 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
goto err;
btrfs_release_path(path);
+ /*
+ * If we don't have dir index, we have to get it by looking up
+ * the inode ref, since we get the inode ref, remove it directly,
+ * it is unnecessary to do delayed deletion.
+ *
+ * But if we have dir index, needn't search inode ref to get it.
+ * Since the inode ref is close to the inode item, it is better
+ * that we delay to delete it, and just do this deletion when
+ * we update the inode item.
+ */
+ if (BTRFS_I(inode)->dir_index) {
+ ret = btrfs_delayed_delete_inode_ref(inode);
+ if (!ret) {
+ index = BTRFS_I(inode)->dir_index;
+ goto skip_backref;
+ }
+ }
+
ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
dir_ino, &index);
if (ret) {
@@ -3594,7 +3730,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, root, ret);
goto err;
}
-
+skip_backref:
ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
@@ -3636,7 +3772,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
int ret;
ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
if (!ret) {
- btrfs_drop_nlink(inode);
+ drop_nlink(inode);
ret = btrfs_update_inode(trans, root, inode);
}
return ret;
@@ -3884,7 +4020,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
* not block aligned since we will be keeping the last block of the
* extent just the way it is.
*/
- if (root->ref_cows || root == root->fs_info->tree_root)
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ root == root->fs_info->tree_root)
btrfs_drop_extent_cache(inode, ALIGN(new_size,
root->sectorsize), (u64)-1, 0);
@@ -3940,7 +4077,7 @@ search_again:
btrfs_file_extent_num_bytes(leaf, fi);
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
item_end += btrfs_file_extent_inline_len(leaf,
- fi);
+ path->slots[0], fi);
}
item_end--;
}
@@ -3977,7 +4114,9 @@ search_again:
extent_num_bytes);
num_dec = (orig_num_bytes -
extent_num_bytes);
- if (root->ref_cows && extent_start != 0)
+ if (test_bit(BTRFS_ROOT_REF_COWS,
+ &root->state) &&
+ extent_start != 0)
inode_sub_bytes(inode, num_dec);
btrfs_mark_buffer_dirty(leaf);
} else {
@@ -3991,7 +4130,8 @@ search_again:
num_dec = btrfs_file_extent_num_bytes(leaf, fi);
if (extent_start != 0) {
found_extent = 1;
- if (root->ref_cows)
+ if (test_bit(BTRFS_ROOT_REF_COWS,
+ &root->state))
inode_sub_bytes(inode, num_dec);
}
}
@@ -4006,14 +4146,20 @@ search_again:
btrfs_file_extent_other_encoding(leaf, fi) == 0) {
u32 size = new_size - found_key.offset;
- if (root->ref_cows) {
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
inode_sub_bytes(inode, item_end + 1 -
new_size);
- }
+
+ /*
+ * update the ram bytes to properly reflect
+ * the new size of our item
+ */
+ btrfs_set_file_extent_ram_bytes(leaf, fi, size);
size =
btrfs_file_extent_calc_inline_size(size);
btrfs_truncate_item(root, path, size, 1);
- } else if (root->ref_cows) {
+ } else if (test_bit(BTRFS_ROOT_REF_COWS,
+ &root->state)) {
inode_sub_bytes(inode, item_end + 1 -
found_key.offset);
}
@@ -4035,8 +4181,9 @@ delete:
} else {
break;
}
- if (found_extent && (root->ref_cows ||
- root == root->fs_info->tree_root)) {
+ if (found_extent &&
+ (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ root == root->fs_info->tree_root)) {
btrfs_set_path_blocking(path);
ret = btrfs_free_extent(trans, root, extent_start,
extent_num_bytes, 0,
@@ -4195,6 +4342,49 @@ out:
return ret;
}
+static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
+ u64 offset, u64 len)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ /*
+ * Still need to make sure the inode looks like it's been updated so
+ * that any holes get logged if we fsync.
+ */
+ if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
+ BTRFS_I(inode)->last_trans = root->fs_info->generation;
+ BTRFS_I(inode)->last_sub_trans = root->log_transid;
+ BTRFS_I(inode)->last_log_commit = root->last_log_commit;
+ return 0;
+ }
+
+ /*
+ * 1 - for the one we're dropping
+ * 1 - for the one we're adding
+ * 1 - for updating the inode.
+ */
+ trans = btrfs_start_transaction(root, 3);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_end_transaction(trans, root);
+ return ret;
+ }
+
+ ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
+ 0, 0, len, 0, len, 0, 0, 0);
+ if (ret)
+ btrfs_abort_transaction(trans, root, ret);
+ else
+ btrfs_update_inode(trans, root, inode);
+ btrfs_end_transaction(trans, root);
+ return ret;
+}
+
/*
* This function puts in dummy file extents for the area we're creating a hole
* for. So if we are truncating this file to a larger size we need to insert
@@ -4203,7 +4393,6 @@ out:
*/
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
{
- struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map *em = NULL;
@@ -4230,15 +4419,16 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
while (1) {
struct btrfs_ordered_extent *ordered;
- btrfs_wait_ordered_range(inode, hole_start,
- block_end - hole_start);
+
lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
&cached_state);
- ordered = btrfs_lookup_ordered_extent(inode, hole_start);
+ ordered = btrfs_lookup_ordered_range(inode, hole_start,
+ block_end - hole_start);
if (!ordered)
break;
unlock_extent_cached(io_tree, hole_start, block_end - 1,
&cached_state, GFP_NOFS);
+ btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
}
@@ -4257,31 +4447,10 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
struct extent_map *hole_em;
hole_size = last_byte - cur_offset;
- trans = btrfs_start_transaction(root, 3);
- if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
- break;
- }
-
- err = btrfs_drop_extents(trans, root, inode,
- cur_offset,
- cur_offset + hole_size, 1);
- if (err) {
- btrfs_abort_transaction(trans, root, err);
- btrfs_end_transaction(trans, root);
- break;
- }
-
- err = btrfs_insert_file_extent(trans, root,
- btrfs_ino(inode), cur_offset, 0,
- 0, hole_size, 0, hole_size,
- 0, 0, 0);
- if (err) {
- btrfs_abort_transaction(trans, root, err);
- btrfs_end_transaction(trans, root);
+ err = maybe_insert_hole(root, inode, cur_offset,
+ hole_size);
+ if (err)
break;
- }
-
btrfs_drop_extent_cache(inode, cur_offset,
cur_offset + hole_size - 1, 0);
hole_em = alloc_extent_map();
@@ -4300,7 +4469,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
hole_em->ram_bytes = hole_size;
hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
hole_em->compress_type = BTRFS_COMPRESS_NONE;
- hole_em->generation = trans->transid;
+ hole_em->generation = root->fs_info->generation;
while (1) {
write_lock(&em_tree->lock);
@@ -4313,17 +4482,14 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
hole_size - 1, 0);
}
free_extent_map(hole_em);
-next:
- btrfs_update_inode(trans, root, inode);
- btrfs_end_transaction(trans, root);
}
+next:
free_extent_map(em);
em = NULL;
cur_offset = last_byte;
if (cur_offset >= block_end)
break;
}
-
free_extent_map(em);
unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
GFP_NOFS);
@@ -4345,8 +4511,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
* these flags set. For all other operations the VFS set these flags
* explicitly if it wants a timestamp update.
*/
- if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
- inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
+ if (newsize != oldsize) {
+ inode_inc_iversion(inode);
+ if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
+ inode->i_ctime = inode->i_mtime =
+ current_fs_time(inode->i_sb);
+ }
if (newsize > oldsize) {
truncate_pagecache(inode, newsize);
@@ -4455,12 +4625,70 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
err = btrfs_dirty_inode(inode);
if (!err && attr->ia_valid & ATTR_MODE)
- err = btrfs_acl_chmod(inode);
+ err = posix_acl_chmod(inode, inode->i_mode);
}
return err;
}
+/*
+ * While truncating the inode pages during eviction, we get the VFS calling
+ * btrfs_invalidatepage() against each page of the inode. This is slow because
+ * the calls to btrfs_invalidatepage() result in a huge amount of calls to
+ * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
+ * extent_state structures over and over, wasting lots of time.
+ *
+ * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
+ * those expensive operations on a per page basis and do only the ordered io
+ * finishing, while we release here the extent_map and extent_state structures,
+ * without the excessive merging and splitting.
+ */
+static void evict_inode_truncate_pages(struct inode *inode)
+{
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
+ struct rb_node *node;
+
+ ASSERT(inode->i_state & I_FREEING);
+ truncate_inode_pages_final(&inode->i_data);
+
+ write_lock(&map_tree->lock);
+ while (!RB_EMPTY_ROOT(&map_tree->map)) {
+ struct extent_map *em;
+
+ node = rb_first(&map_tree->map);
+ em = rb_entry(node, struct extent_map, rb_node);
+ clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ remove_extent_mapping(map_tree, em);
+ free_extent_map(em);
+ }
+ write_unlock(&map_tree->lock);
+
+ spin_lock(&io_tree->lock);
+ while (!RB_EMPTY_ROOT(&io_tree->state)) {
+ struct extent_state *state;
+ struct extent_state *cached_state = NULL;
+
+ node = rb_first(&io_tree->state);
+ state = rb_entry(node, struct extent_state, rb_node);
+ atomic_inc(&state->refs);
+ spin_unlock(&io_tree->lock);
+
+ lock_extent_bits(io_tree, state->start, state->end,
+ 0, &cached_state);
+ clear_extent_bit(io_tree, state->start, state->end,
+ EXTENT_LOCKED | EXTENT_DIRTY |
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, 1, 1,
+ &cached_state, GFP_NOFS);
+ free_extent_state(state);
+
+ spin_lock(&io_tree->lock);
+ }
+ spin_unlock(&io_tree->lock);
+}
+
void btrfs_evict_inode(struct inode *inode)
{
struct btrfs_trans_handle *trans;
@@ -4471,9 +4699,12 @@ void btrfs_evict_inode(struct inode *inode)
trace_btrfs_inode_evict(inode);
- truncate_inode_pages(&inode->i_data, 0);
- if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
- btrfs_is_free_space_inode(inode)))
+ evict_inode_truncate_pages(inode);
+
+ if (inode->i_nlink &&
+ ((btrfs_root_refs(&root->root_item) != 0 &&
+ root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
+ btrfs_is_free_space_inode(inode)))
goto no_delete;
if (is_bad_inode(inode)) {
@@ -4490,7 +4721,8 @@ void btrfs_evict_inode(struct inode *inode)
}
if (inode->i_nlink > 0) {
- BUG_ON(btrfs_root_refs(&root->root_item) != 0);
+ BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
+ root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
goto no_delete;
}
@@ -4643,9 +4875,9 @@ static int fixup_tree_root_location(struct btrfs_root *root,
}
err = -ENOENT;
- ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
- BTRFS_I(dir)->root->root_key.objectid,
- location->objectid);
+ ret = btrfs_find_item(root->fs_info->tree_root, path,
+ BTRFS_I(dir)->root->root_key.objectid,
+ location->objectid, BTRFS_ROOT_REF_KEY, NULL);
if (ret) {
if (ret < 0)
err = ret;
@@ -4731,14 +4963,7 @@ static void inode_tree_del(struct inode *inode)
}
spin_unlock(&root->inode_lock);
- /*
- * Free space cache has inodes in the tree root, but the tree root has a
- * root_refs of 0, so this could end up dropping the tree root as a
- * snapshot, so we need the extra !root->fs_info->tree_root check to
- * make sure we don't drop it.
- */
- if (empty && btrfs_root_refs(&root->root_item) == 0 &&
- root != root->fs_info->tree_root) {
+ if (empty && btrfs_root_refs(&root->root_item) == 0) {
synchronize_srcu(&root->fs_info->subvol_srcu);
spin_lock(&root->inode_lock);
empty = RB_EMPTY_ROOT(&root->inode_tree);
@@ -4756,7 +4981,8 @@ void btrfs_invalidate_inodes(struct btrfs_root *root)
struct inode *inode;
u64 objectid = 0;
- WARN_ON(btrfs_root_refs(&root->root_item) != 0);
+ if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+ WARN_ON(btrfs_root_refs(&root->root_item) != 0);
spin_lock(&root->inode_lock);
again:
@@ -4813,7 +5039,9 @@ again:
static int btrfs_init_locked_inode(struct inode *inode, void *p)
{
struct btrfs_iget_args *args = p;
- inode->i_ino = args->ino;
+ inode->i_ino = args->location->objectid;
+ memcpy(&BTRFS_I(inode)->location, args->location,
+ sizeof(*args->location));
BTRFS_I(inode)->root = args->root;
return 0;
}
@@ -4821,20 +5049,22 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
static int btrfs_find_actor(struct inode *inode, void *opaque)
{
struct btrfs_iget_args *args = opaque;
- return args->ino == btrfs_ino(inode) &&
+ return args->location->objectid == BTRFS_I(inode)->location.objectid &&
args->root == BTRFS_I(inode)->root;
}
static struct inode *btrfs_iget_locked(struct super_block *s,
- u64 objectid,
+ struct btrfs_key *location,
struct btrfs_root *root)
{
struct inode *inode;
struct btrfs_iget_args args;
- args.ino = objectid;
+ unsigned long hashval = btrfs_inode_hash(location->objectid, root);
+
+ args.location = location;
args.root = root;
- inode = iget5_locked(s, objectid, btrfs_find_actor,
+ inode = iget5_locked(s, hashval, btrfs_find_actor,
btrfs_init_locked_inode,
(void *)&args);
return inode;
@@ -4848,13 +5078,11 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
{
struct inode *inode;
- inode = btrfs_iget_locked(s, location->objectid, root);
+ inode = btrfs_iget_locked(s, location, root);
if (!inode)
return ERR_PTR(-ENOMEM);
if (inode->i_state & I_NEW) {
- BTRFS_I(inode)->root = root;
- memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
btrfs_read_locked_inode(inode);
if (!is_bad_inode(inode)) {
inode_tree_add(inode);
@@ -4910,7 +5138,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
return ERR_PTR(ret);
if (location.objectid == 0)
- return NULL;
+ return ERR_PTR(-ENOENT);
if (location.type == BTRFS_INODE_ITEM_KEY) {
inode = btrfs_iget(dir->i_sb, &location, root, NULL);
@@ -4967,17 +5195,23 @@ static int btrfs_dentry_delete(const struct dentry *dentry)
static void btrfs_dentry_release(struct dentry *dentry)
{
- if (dentry->d_fsdata)
- kfree(dentry->d_fsdata);
+ kfree(dentry->d_fsdata);
}
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
- struct dentry *ret;
+ struct inode *inode;
- ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry);
- return ret;
+ inode = btrfs_lookup_dentry(dir, dentry);
+ if (IS_ERR(inode)) {
+ if (PTR_ERR(inode) == -ENOENT)
+ inode = NULL;
+ else
+ return ERR_CAST(inode);
+ }
+
+ return d_materialise_unique(dentry, inode);
}
unsigned char btrfs_filetype_table[] = {
@@ -5048,7 +5282,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
continue;
}
- item = btrfs_item_nr(leaf, slot);
+ item = btrfs_item_nr(slot);
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != key.objectid)
@@ -5345,9 +5579,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode_ref *ref;
struct btrfs_key key[2];
u32 sizes[2];
+ int nitems = name ? 2 : 1;
unsigned long ptr;
int ret;
- int owner;
path = btrfs_alloc_path();
if (!path)
@@ -5365,7 +5599,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
*/
inode->i_ino = objectid;
- if (dir) {
+ if (dir && name) {
trace_btrfs_inode_request(dir);
ret = btrfs_set_inode_index(dir, index);
@@ -5374,6 +5608,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
iput(inode);
return ERR_PTR(ret);
}
+ } else if (dir) {
+ *index = 0;
}
/*
* index_cnt is ignored for everything but a dir,
@@ -5381,6 +5617,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
* number
*/
BTRFS_I(inode)->index_cnt = 2;
+ BTRFS_I(inode)->dir_index = *index;
BTRFS_I(inode)->root = root;
BTRFS_I(inode)->generation = trans->transid;
inode->i_generation = BTRFS_I(inode)->generation;
@@ -5393,30 +5630,28 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
*/
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
- if (S_ISDIR(mode))
- owner = 0;
- else
- owner = 1;
-
key[0].objectid = objectid;
btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
key[0].offset = 0;
- /*
- * Start new inodes with an inode_ref. This is slightly more
- * efficient for small numbers of hard links since they will
- * be packed into one item. Extended refs will kick in if we
- * add more hard links than can fit in the ref item.
- */
- key[1].objectid = objectid;
- btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
- key[1].offset = ref_objectid;
-
sizes[0] = sizeof(struct btrfs_inode_item);
- sizes[1] = name_len + sizeof(*ref);
+
+ if (name) {
+ /*
+ * Start new inodes with an inode_ref. This is slightly more
+ * efficient for small numbers of hard links since they will
+ * be packed into one item. Extended refs will kick in if we
+ * add more hard links than can fit in the ref item.
+ */
+ key[1].objectid = objectid;
+ btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
+ key[1].offset = ref_objectid;
+
+ sizes[1] = name_len + sizeof(*ref);
+ }
path->leave_spinning = 1;
- ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
+ ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
if (ret != 0)
goto fail;
@@ -5429,12 +5664,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
sizeof(*inode_item));
fill_inode_item(trans, path->nodes[0], inode_item, inode);
- ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
- struct btrfs_inode_ref);
- btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
- btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
- ptr = (unsigned long)(ref + 1);
- write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ if (name) {
+ ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
+ struct btrfs_inode_ref);
+ btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
+ btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
+ ptr = (unsigned long)(ref + 1);
+ write_extent_buffer(path->nodes[0], name, ptr, name_len);
+ }
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
@@ -5454,7 +5691,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
BTRFS_INODE_NODATASUM;
}
- insert_inode_hash(inode);
+ btrfs_insert_inode_hash(inode);
inode_tree_add(inode);
trace_btrfs_inode_new(inode);
@@ -5462,9 +5699,15 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
btrfs_update_root_times(trans, root);
+ ret = btrfs_inode_inherit_props(trans, inode, dir);
+ if (ret)
+ btrfs_err(root->fs_info,
+ "error inheriting props for ino %llu (root %llu): %d",
+ btrfs_ino(inode), root->root_key.objectid, ret);
+
return inode;
fail:
- if (dir)
+ if (dir && name)
BTRFS_I(dir)->index_cnt--;
btrfs_free_path(path);
iput(inode);
@@ -5621,6 +5864,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
}
out_unlock:
btrfs_end_transaction(trans, root);
+ btrfs_balance_delayed_items(root);
btrfs_btree_balance_dirty(root);
if (drop_inode) {
inode_dec_link_count(inode);
@@ -5694,6 +5938,7 @@ out_unlock:
inode_dec_link_count(inode);
iput(inode);
}
+ btrfs_balance_delayed_items(root);
btrfs_btree_balance_dirty(root);
return err;
}
@@ -5730,7 +5975,9 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
goto fail;
}
- btrfs_inc_nlink(inode);
+ /* There are several dir indexes for this inode, clear the cache. */
+ BTRFS_I(inode)->dir_index = 0ULL;
+ inc_nlink(inode);
inode_inc_iversion(inode);
inode->i_ctime = CURRENT_TIME;
ihold(inode);
@@ -5745,11 +5992,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
err = btrfs_update_inode(trans, root, inode);
if (err)
goto fail;
+ if (inode->i_nlink == 1) {
+ /*
+ * If new hard link count is 1, it's a file created
+ * with open(2) O_TMPFILE flag.
+ */
+ err = btrfs_orphan_del(trans, inode);
+ if (err)
+ goto fail;
+ }
d_instantiate(dentry, inode);
btrfs_log_new_name(trans, inode, NULL, parent);
}
btrfs_end_transaction(trans, root);
+ btrfs_balance_delayed_items(root);
fail:
if (drop_inode) {
inode_dec_link_count(inode);
@@ -5816,6 +6073,7 @@ out_fail:
btrfs_end_transaction(trans, root);
if (drop_on_err)
iput(inode);
+ btrfs_balance_delayed_items(root);
btrfs_btree_balance_dirty(root);
return err;
}
@@ -5860,7 +6118,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
compress_type = btrfs_file_extent_compression(leaf, item);
max_size = btrfs_file_extent_ram_bytes(leaf, item);
inline_size = btrfs_file_extent_inline_item_len(leaf,
- btrfs_item_nr(leaf, path->slots[0]));
+ btrfs_item_nr(path->slots[0]));
tmp = kmalloc(inline_size, GFP_NOFS);
if (!tmp)
return -ENOMEM;
@@ -5871,16 +6129,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
ret = btrfs_decompress(compress_type, tmp, page,
extent_offset, inline_size, max_size);
- if (ret) {
- char *kaddr = kmap_atomic(page);
- unsigned long copy_size = min_t(u64,
- PAGE_CACHE_SIZE - pg_offset,
- max_size - extent_offset);
- memset(kaddr + pg_offset, 0, copy_size);
- kunmap_atomic(kaddr);
- }
kfree(tmp);
- return 0;
+ return ret;
}
/*
@@ -5898,7 +6148,6 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
{
int ret;
int err = 0;
- u64 bytenr;
u64 extent_start = 0;
u64 extent_end = 0;
u64 objectid = btrfs_ino(inode);
@@ -5912,7 +6161,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_trans_handle *trans = NULL;
- int compress_type;
+ const bool new_inline = !page || create;
again:
read_lock(&em_tree->lock);
@@ -5974,22 +6223,28 @@ again:
found_type = btrfs_key_type(&found_key);
if (found_key.objectid != objectid ||
found_type != BTRFS_EXTENT_DATA_KEY) {
- goto not_found;
+ /*
+ * If we backup past the first extent we want to move forward
+ * and see if there is an extent in front of us, otherwise we'll
+ * say there is a hole for our whole search range which can
+ * cause problems.
+ */
+ extent_end = start;
+ goto next;
}
found_type = btrfs_file_extent_type(leaf, item);
extent_start = found_key.offset;
- compress_type = btrfs_file_extent_compression(leaf, item);
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
extent_end = extent_start +
btrfs_file_extent_num_bytes(leaf, item);
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
size_t size;
- size = btrfs_file_extent_inline_len(leaf, item);
+ size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
extent_end = ALIGN(extent_start + size, root->sectorsize);
}
-
+next:
if (start >= extent_end) {
path->slots[0]++;
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
@@ -6014,32 +6269,10 @@ again:
goto not_found_em;
}
- em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
+ btrfs_extent_item_to_extent_map(inode, path, item, new_inline, em);
+
if (found_type == BTRFS_FILE_EXTENT_REG ||
found_type == BTRFS_FILE_EXTENT_PREALLOC) {
- em->start = extent_start;
- em->len = extent_end - extent_start;
- em->orig_start = extent_start -
- btrfs_file_extent_offset(leaf, item);
- em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
- item);
- bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
- if (bytenr == 0) {
- em->block_start = EXTENT_MAP_HOLE;
- goto insert;
- }
- if (compress_type != BTRFS_COMPRESS_NONE) {
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
- em->block_start = bytenr;
- em->block_len = em->orig_block_len;
- } else {
- bytenr += btrfs_file_extent_offset(leaf, item);
- em->block_start = bytenr;
- em->block_len = em->len;
- if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
- }
goto insert;
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
unsigned long ptr;
@@ -6048,14 +6281,10 @@ again:
size_t extent_offset;
size_t copy_size;
- em->block_start = EXTENT_MAP_INLINE;
- if (!page || create) {
- em->start = extent_start;
- em->len = extent_end - extent_start;
+ if (new_inline)
goto out;
- }
- size = btrfs_file_extent_inline_len(leaf, item);
+ size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
extent_offset = page_offset(page) + pg_offset - extent_start;
copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
size - extent_offset);
@@ -6063,10 +6292,6 @@ again:
em->len = ALIGN(copy_size, root->sectorsize);
em->orig_block_len = em->len;
em->orig_start = em->start;
- if (compress_type) {
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
- }
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
if (create == 0 && !PageUptodate(page)) {
if (btrfs_file_extent_compression(leaf, item) !=
@@ -6074,7 +6299,10 @@ again:
ret = uncompress_inline(path, inode, page,
pg_offset,
extent_offset, item);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ err = ret;
+ goto out;
+ }
} else {
map = kmap(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
@@ -6110,8 +6338,6 @@ again:
set_extent_uptodate(io_tree, em->start,
extent_map_end(em) - 1, NULL, GFP_NOFS);
goto insert;
- } else {
- WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
}
not_found:
em->start = start;
@@ -6173,8 +6399,7 @@ insert:
write_unlock(&em_tree->lock);
out:
- if (em)
- trace_btrfs_get_extent(root, em);
+ trace_btrfs_get_extent(root, em);
if (path)
btrfs_free_path(path);
@@ -6249,7 +6474,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
/* adjust the range_start to make sure it doesn't
* go backwards from the start they passed in
*/
- range_start = max(start,range_start);
+ range_start = max(start, range_start);
found = found_end - range_start;
if (found > 0) {
@@ -6329,21 +6554,21 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
alloc_hint = get_extent_allocation_hint(inode, start, len);
ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
- alloc_hint, &ins, 1);
+ alloc_hint, &ins, 1, 1);
if (ret)
return ERR_PTR(ret);
em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
ins.offset, ins.offset, ins.offset, 0);
if (IS_ERR(em)) {
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
return em;
}
ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
ins.offset, ins.offset, 0);
if (ret) {
- btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+ btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
free_extent_map(em);
return ERR_PTR(ret);
}
@@ -6364,6 +6589,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
int ret;
struct extent_buffer *leaf;
struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct btrfs_file_extent_item *fi;
struct btrfs_key key;
u64 disk_bytenr;
@@ -6373,6 +6599,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
int slot;
int found_type;
bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -6416,6 +6643,10 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
goto out;
+ extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+ if (extent_end <= offset)
+ goto out;
+
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
if (disk_bytenr == 0)
goto out;
@@ -6433,11 +6664,24 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
}
- extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
-
if (btrfs_extent_readonly(root, disk_bytenr))
goto out;
+ num_bytes = min(offset + *len, extent_end) - offset;
+ if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ u64 range_end;
+
+ range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
+ ret = test_range_bit(io_tree, offset, range_end,
+ EXTENT_DELALLOC, 0, NULL);
+ if (ret) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ }
+
+ btrfs_release_path(path);
+
/*
* look for other files referencing this extent, if we
* find any we must cow
@@ -6464,7 +6708,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
*/
disk_bytenr += backref_offset;
disk_bytenr += offset - key.offset;
- num_bytes = min(offset + *len, extent_end) - offset;
if (csum_exist_in_range(root, disk_bytenr, num_bytes))
goto out;
/*
@@ -6478,6 +6721,76 @@ out:
return ret;
}
+bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
+{
+ struct radix_tree_root *root = &inode->i_mapping->page_tree;
+ int found = false;
+ void **pagep = NULL;
+ struct page *page = NULL;
+ int start_idx;
+ int end_idx;
+
+ start_idx = start >> PAGE_CACHE_SHIFT;
+
+ /*
+ * end is the last byte in the last page. end == start is legal
+ */
+ end_idx = end >> PAGE_CACHE_SHIFT;
+
+ rcu_read_lock();
+
+ /* Most of the code in this while loop is lifted from
+ * find_get_page. It's been modified to begin searching from a
+ * page and return just the first page found in that range. If the
+ * found idx is less than or equal to the end idx then we know that
+ * a page exists. If no pages are found or if those pages are
+ * outside of the range then we're fine (yay!) */
+ while (page == NULL &&
+ radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
+ page = radix_tree_deref_slot(pagep);
+ if (unlikely(!page))
+ break;
+
+ if (radix_tree_exception(page)) {
+ if (radix_tree_deref_retry(page)) {
+ page = NULL;
+ continue;
+ }
+ /*
+ * Otherwise, shmem/tmpfs must be storing a swap entry
+ * here as an exceptional entry: so return it without
+ * attempting to raise page count.
+ */
+ page = NULL;
+ break; /* TODO: Is this relevant for this use case? */
+ }
+
+ if (!page_cache_get_speculative(page)) {
+ page = NULL;
+ continue;
+ }
+
+ /*
+ * Has the page moved?
+ * This is part of the lockless pagecache protocol. See
+ * include/linux/pagemap.h for details.
+ */
+ if (unlikely(page != *pagep)) {
+ page_cache_release(page);
+ page = NULL;
+ }
+ }
+
+ if (page) {
+ if (page->index <= end_idx)
+ found = true;
+ page_cache_release(page);
+ }
+
+ rcu_read_unlock();
+ return found;
+}
+
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
struct extent_state **cached_state, int writing)
{
@@ -6502,10 +6815,9 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
* invalidate needs to happen so that reads after a write do not
* get stale data.
*/
- if (!ordered && (!writing ||
- !test_range_bit(&BTRFS_I(inode)->io_tree,
- lockstart, lockend, EXTENT_UPTODATE, 0,
- *cached_state)))
+ if (!ordered &&
+ (!writing ||
+ !btrfs_page_exists_in_range(inode, lockstart, lockend)))
break;
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
@@ -6765,17 +7077,16 @@ unlock_err:
static void btrfs_endio_direct_read(struct bio *bio, int err)
{
struct btrfs_dio_private *dip = bio->bi_private;
- struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
- struct bio_vec *bvec = bio->bi_io_vec;
+ struct bio_vec *bvec;
struct inode *inode = dip->inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct bio *dio_bio;
u32 *csums = (u32 *)dip->csum;
- int index = 0;
u64 start;
+ int i;
start = dip->logical_offset;
- do {
+ bio_for_each_segment_all(bvec, bio, i) {
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
struct page *page = bvec->bv_page;
char *kaddr;
@@ -6791,18 +7102,16 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
local_irq_restore(flags);
flush_dcache_page(bvec->bv_page);
- if (csum != csums[index]) {
+ if (csum != csums[i]) {
btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
btrfs_ino(inode), start, csum,
- csums[index]);
+ csums[i]);
err = -EIO;
}
}
start += bvec->bv_len;
- bvec++;
- index++;
- } while (bvec <= bvec_end);
+ }
unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
dip->logical_offset + dip->bytes - 1);
@@ -6837,10 +7146,9 @@ again:
if (!ret)
goto out_test;
- ordered->work.func = finish_ordered_fn;
- ordered->work.flags = 0;
- btrfs_queue_worker(&root->fs_info->endio_write_workers,
- &ordered->work);
+ btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL);
+ btrfs_queue_work(root->fs_info->endio_write_workers,
+ &ordered->work);
out_test:
/*
* our bio might span multiple ordered extents. If we haven't
@@ -6880,17 +7188,18 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
struct btrfs_dio_private *dip = bio->bi_private;
if (err) {
- printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu "
- "sector %#Lx len %u err no %d\n",
+ btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
+ "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
btrfs_ino(dip->inode), bio->bi_rw,
- (unsigned long long)bio->bi_sector, bio->bi_size, err);
+ (unsigned long long)bio->bi_iter.bi_sector,
+ bio->bi_iter.bi_size, err);
dip->errors = 1;
/*
* before atomic variable goto zero, we must make sure
* dip->errors is perceived to be set.
*/
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
}
/* if there are more bios still pending for this dio, just exit */
@@ -6974,7 +7283,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
struct bio *bio;
struct bio *orig_bio = dip->orig_bio;
struct bio_vec *bvec = orig_bio->bi_io_vec;
- u64 start_sector = orig_bio->bi_sector;
+ u64 start_sector = orig_bio->bi_iter.bi_sector;
u64 file_offset = dip->logical_offset;
u64 submit_len = 0;
u64 map_length;
@@ -6982,7 +7291,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
int ret = 0;
int async_submit = 0;
- map_length = orig_bio->bi_size;
+ map_length = orig_bio->bi_iter.bi_size;
ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
&map_length, NULL, 0);
if (ret) {
@@ -6990,7 +7299,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
return -EIO;
}
- if (map_length >= orig_bio->bi_size) {
+ if (map_length >= orig_bio->bi_iter.bi_size) {
bio = orig_bio;
goto submit;
}
@@ -7042,7 +7351,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
bio->bi_private = dip;
bio->bi_end_io = btrfs_end_dio_bio;
- map_length = orig_bio->bi_size;
+ map_length = orig_bio->bi_iter.bi_size;
ret = btrfs_map_block(root->fs_info, rw,
start_sector << 9,
&map_length, NULL, 0);
@@ -7052,7 +7361,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
}
} else {
submit_len += bvec->bv_len;
- nr_pages ++;
+ nr_pages++;
bvec++;
}
}
@@ -7070,7 +7379,7 @@ out_err:
* before atomic variable goto zero, we must
* make sure dip->errors is perceived to be set.
*/
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
if (atomic_dec_and_test(&dip->pending_bios))
bio_io_error(dip->orig_bio);
@@ -7100,7 +7409,8 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
if (!skip_sum && !write) {
csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
- sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits;
+ sum_len = dio_bio->bi_iter.bi_size >>
+ inode->i_sb->s_blocksize_bits;
sum_len *= csum_size;
} else {
sum_len = 0;
@@ -7115,8 +7425,8 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
dip->private = dio_bio->bi_private;
dip->inode = inode;
dip->logical_offset = file_offset;
- dip->bytes = dio_bio->bi_size;
- dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
+ dip->bytes = dio_bio->bi_iter.bi_size;
+ dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
io_bio->bi_private = dip;
dip->errors = 0;
dip->orig_bio = io_bio;
@@ -7146,7 +7456,7 @@ free_ordered:
if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
btrfs_free_reserved_extent(root, ordered->start,
- ordered->disk_len);
+ ordered->disk_len, 1);
btrfs_put_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
}
@@ -7154,39 +7464,30 @@ free_ordered:
}
static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ const struct iov_iter *iter, loff_t offset)
{
int seg;
int i;
- size_t size;
- unsigned long addr;
unsigned blocksize_mask = root->sectorsize - 1;
ssize_t retval = -EINVAL;
- loff_t end = offset;
if (offset & blocksize_mask)
goto out;
- /* Check the memory alignment. Blocks cannot straddle pages */
- for (seg = 0; seg < nr_segs; seg++) {
- addr = (unsigned long)iov[seg].iov_base;
- size = iov[seg].iov_len;
- end += size;
- if ((addr & blocksize_mask) || (size & blocksize_mask))
- goto out;
-
- /* If this is a write we don't need to check anymore */
- if (rw & WRITE)
- continue;
+ if (iov_iter_alignment(iter) & blocksize_mask)
+ goto out;
- /*
- * Check to make sure we don't have duplicate iov_base's in this
- * iovec, if so return EINVAL, otherwise we'll get csum errors
- * when reading back.
- */
- for (i = seg + 1; i < nr_segs; i++) {
- if (iov[seg].iov_base == iov[i].iov_base)
+ /* If this is a write we don't need to check anymore */
+ if (rw & WRITE)
+ return 0;
+ /*
+ * Check to make sure we don't have duplicate iov_base's in this
+ * iovec, if so return EINVAL, otherwise we'll get csum errors
+ * when reading back.
+ */
+ for (seg = 0; seg < iter->nr_segs; seg++) {
+ for (i = seg + 1; i < iter->nr_segs; i++) {
+ if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
goto out;
}
}
@@ -7196,8 +7497,7 @@ out:
}
static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -7207,21 +7507,22 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
bool relock = false;
ssize_t ret;
- if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
- offset, nr_segs))
+ if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset))
return 0;
atomic_inc(&inode->i_dio_count);
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
/*
- * The generic stuff only does filemap_write_and_wait_range, which isn't
- * enough if we've written compressed pages to this area, so we need to
- * call btrfs_wait_ordered_range to make absolutely sure that any
- * outstanding dirty pages are on disk.
+ * The generic stuff only does filemap_write_and_wait_range, which
+ * isn't enough if we've written compressed pages to this area, so
+ * we need to flush the dirty pages again to make absolutely sure
+ * that any outstanding dirty pages are on disk.
*/
- count = iov_length(iov, nr_segs);
- btrfs_wait_ordered_range(inode, offset, count);
+ count = iov_iter_count(iter);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_fdatawrite_range(inode->i_mapping, offset, count);
if (rw & WRITE) {
/*
@@ -7245,7 +7546,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
ret = __blockdev_direct_IO(rw, iocb, inode,
BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
- iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+ iter, offset, btrfs_get_blocks_direct, NULL,
btrfs_submit_direct, flags);
if (rw & WRITE) {
if (ret < 0 && ret != -EIOCBQUEUED)
@@ -7351,6 +7652,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
struct extent_state *cached_state = NULL;
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+ int inode_evicting = inode->i_state & I_FREEING;
/*
* we have the page locked, so new writeback can't start,
@@ -7366,17 +7668,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
btrfs_releasepage(page, GFP_NOFS);
return;
}
- lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
- ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
+
+ if (!inode_evicting)
+ lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
+ ordered = btrfs_lookup_ordered_extent(inode, page_start);
if (ordered) {
/*
* IO on this page will never be started, so we need
* to account for any ordered extents now
*/
- clear_extent_bit(tree, page_start, page_end,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
- EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS);
+ if (!inode_evicting)
+ clear_extent_bit(tree, page_start, page_end,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, 1, 0, &cached_state,
+ GFP_NOFS);
/*
* whoever cleared the private bit is responsible
* for the finish_ordered_io
@@ -7400,14 +7706,22 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
btrfs_finish_ordered_io(ordered);
}
btrfs_put_ordered_extent(ordered);
- cached_state = NULL;
- lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
+ if (!inode_evicting) {
+ cached_state = NULL;
+ lock_extent_bits(tree, page_start, page_end, 0,
+ &cached_state);
+ }
+ }
+
+ if (!inode_evicting) {
+ clear_extent_bit(tree, page_start, page_end,
+ EXTENT_LOCKED | EXTENT_DIRTY |
+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+ EXTENT_DEFRAG, 1, 1,
+ &cached_state, GFP_NOFS);
+
+ __btrfs_releasepage(page, GFP_NOFS);
}
- clear_extent_bit(tree, page_start, page_end,
- EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
- &cached_state, GFP_NOFS);
- __btrfs_releasepage(page, GFP_NOFS);
ClearPageChecked(page);
if (PagePrivate(page)) {
@@ -7562,7 +7876,10 @@ static int btrfs_truncate(struct inode *inode)
u64 mask = root->sectorsize - 1;
u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
- btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
+ ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
+ (u64)-1);
+ if (ret)
+ return ret;
/*
* Yes ladies and gentelment, this is indeed ugly. The fact is we have
@@ -7714,7 +8031,9 @@ out:
* create a new subvolume directory/inode (helper for the ioctl).
*/
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *new_root, u64 new_dirid)
+ struct btrfs_root *new_root,
+ struct btrfs_root *parent_root,
+ u64 new_dirid)
{
struct inode *inode;
int err;
@@ -7732,6 +8051,12 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
set_nlink(inode, 1);
btrfs_i_size_write(inode, 0);
+ err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
+ if (err)
+ btrfs_err(new_root->fs_info,
+ "error inheriting subvolume %llu properties: %d",
+ new_root->root_key.objectid, err);
+
err = btrfs_update_inode(trans, new_root, inode);
iput(inode);
@@ -7757,6 +8082,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->flags = 0;
ei->csum_bytes = 0;
ei->index_cnt = (u64)-1;
+ ei->dir_index = 0;
ei->last_unlink_trans = 0;
ei->last_log_commit = 0;
@@ -7786,6 +8112,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
return inode;
}
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_destroy_inode(struct inode *inode)
+{
+ btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+ kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+}
+#endif
+
static void btrfs_i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
@@ -7856,8 +8190,7 @@ int btrfs_drop_inode(struct inode *inode)
return 1;
/* the snap/subvol tree is on deleting */
- if (btrfs_root_refs(&root->root_item) == 0 &&
- root != root->fs_info->tree_root)
+ if (btrfs_root_refs(&root->root_item) == 0)
return 1;
else
return generic_drop_inode(inode);
@@ -7986,7 +8319,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
/* check for collisions, even if the name isn't there */
- ret = btrfs_check_dir_item_collision(root, new_dir->i_ino,
+ ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
new_dentry->d_name.name,
new_dentry->d_name.len);
@@ -7994,8 +8327,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (ret == -EEXIST) {
/* we shouldn't get
* eexist without a new_inode */
- if (!new_inode) {
- WARN_ON(1);
+ if (WARN_ON(!new_inode)) {
return ret;
}
} else {
@@ -8038,9 +8370,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (ret)
goto out_fail;
+ BTRFS_I(old_inode)->dir_index = 0ULL;
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
/* force full log commit if subvolume involved. */
- root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
} else {
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
@@ -8126,6 +8459,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out_fail;
}
+ if (old_inode->i_nlink == 1)
+ BTRFS_I(old_inode)->dir_index = index;
+
if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
struct dentry *parent = new_dentry->d_parent;
btrfs_log_new_name(trans, old_inode, old_dir, parent);
@@ -8143,18 +8479,24 @@ out_notrans:
static void btrfs_run_delalloc_work(struct btrfs_work *work)
{
struct btrfs_delalloc_work *delalloc_work;
+ struct inode *inode;
delalloc_work = container_of(work, struct btrfs_delalloc_work,
work);
- if (delalloc_work->wait)
- btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
- else
- filemap_flush(delalloc_work->inode->i_mapping);
+ inode = delalloc_work->inode;
+ if (delalloc_work->wait) {
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ } else {
+ filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
if (delalloc_work->delay_iput)
- btrfs_add_delayed_iput(delalloc_work->inode);
+ btrfs_add_delayed_iput(inode);
else
- iput(delalloc_work->inode);
+ iput(inode);
complete(&delalloc_work->completion);
}
@@ -8172,7 +8514,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
work->inode = inode;
work->wait = wait;
work->delay_iput = delay_iput;
- work->work.func = btrfs_run_delalloc_work;
+ btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
return work;
}
@@ -8187,7 +8529,8 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
* some fairly slow code that needs optimization. This walks the list
* of all the inodes with pending delalloc and forces them to disk.
*/
-static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
+ int nr)
{
struct btrfs_inode *binode;
struct inode *inode;
@@ -8199,6 +8542,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
INIT_LIST_HEAD(&works);
INIT_LIST_HEAD(&splice);
+ mutex_lock(&root->delalloc_mutex);
spin_lock(&root->delalloc_lock);
list_splice_init(&root->delalloc_inodes, &splice);
while (!list_empty(&splice)) {
@@ -8224,19 +8568,16 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
goto out;
}
list_add_tail(&work->list, &works);
- btrfs_queue_worker(&root->fs_info->flush_workers,
- &work->work);
-
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &work->work);
+ ret++;
+ if (nr != -1 && ret >= nr)
+ goto out;
cond_resched();
spin_lock(&root->delalloc_lock);
}
spin_unlock(&root->delalloc_lock);
- list_for_each_entry_safe(work, next, &works, list) {
- list_del_init(&work->list);
- btrfs_wait_and_free_delalloc_work(work);
- }
- return 0;
out:
list_for_each_entry_safe(work, next, &works, list) {
list_del_init(&work->list);
@@ -8248,6 +8589,7 @@ out:
list_splice_tail(&splice, &root->delalloc_inodes);
spin_unlock(&root->delalloc_lock);
}
+ mutex_unlock(&root->delalloc_mutex);
return ret;
}
@@ -8255,10 +8597,12 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
{
int ret;
- if (root->fs_info->sb->s_flags & MS_RDONLY)
+ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
return -EROFS;
- ret = __start_delalloc_inodes(root, delay_iput);
+ ret = __start_delalloc_inodes(root, delay_iput, -1);
+ if (ret > 0)
+ ret = 0;
/*
* the filemap_flush will queue IO into the worker threads, but
* we have to make sure the IO is actually started and that
@@ -8275,21 +8619,22 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
return ret;
}
-int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
- int delay_iput)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
+ int nr)
{
struct btrfs_root *root;
struct list_head splice;
int ret;
- if (fs_info->sb->s_flags & MS_RDONLY)
+ if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS;
INIT_LIST_HEAD(&splice);
+ mutex_lock(&fs_info->delalloc_root_mutex);
spin_lock(&fs_info->delalloc_root_lock);
list_splice_init(&fs_info->delalloc_roots, &splice);
- while (!list_empty(&splice)) {
+ while (!list_empty(&splice) && nr) {
root = list_first_entry(&splice, struct btrfs_root,
delalloc_root);
root = btrfs_grab_fs_root(root);
@@ -8298,15 +8643,20 @@ int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
&fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
- ret = __start_delalloc_inodes(root, delay_iput);
+ ret = __start_delalloc_inodes(root, delay_iput, nr);
btrfs_put_fs_root(root);
- if (ret)
+ if (ret < 0)
goto out;
+ if (nr != -1) {
+ nr -= ret;
+ WARN_ON(nr < 0);
+ }
spin_lock(&fs_info->delalloc_root_lock);
}
spin_unlock(&fs_info->delalloc_root_lock);
+ ret = 0;
atomic_inc(&fs_info->async_submit_draining);
while (atomic_read(&fs_info->nr_async_submits) ||
atomic_read(&fs_info->async_delalloc_pages)) {
@@ -8315,13 +8665,13 @@ int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
atomic_read(&fs_info->async_delalloc_pages) == 0));
}
atomic_dec(&fs_info->async_submit_draining);
- return 0;
out:
if (!list_empty_careful(&splice)) {
spin_lock(&fs_info->delalloc_root_lock);
list_splice_tail(&splice, &fs_info->delalloc_roots);
spin_unlock(&fs_info->delalloc_root_lock);
}
+ mutex_unlock(&fs_info->delalloc_root_mutex);
return ret;
}
@@ -8336,14 +8686,14 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
int err;
int drop_inode = 0;
u64 objectid;
- u64 index = 0 ;
+ u64 index = 0;
int name_len;
int datasize;
unsigned long ptr;
struct btrfs_file_extent_item *ei;
struct extent_buffer *leaf;
- name_len = strlen(symname) + 1;
+ name_len = strlen(symname);
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
return -ENAMETOOLONG;
@@ -8431,7 +8781,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_mapping->a_ops = &btrfs_symlink_aops;
inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
inode_set_bytes(inode, name_len);
- btrfs_i_size_write(inode, name_len - 1);
+ btrfs_i_size_write(inode, name_len);
err = btrfs_update_inode(trans, root, inode);
if (err)
drop_inode = 1;
@@ -8477,7 +8827,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
cur_bytes = max(cur_bytes, min_size);
ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
- *alloc_hint, &ins, 1);
+ *alloc_hint, &ins, 1, 0);
if (ret) {
if (own_trans)
btrfs_end_transaction(trans, root);
@@ -8490,6 +8840,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
ins.offset, 0, 0, 0,
BTRFS_FILE_EXTENT_PREALLOC);
if (ret) {
+ btrfs_free_reserved_extent(root, ins.objectid,
+ ins.offset, 0);
btrfs_abort_transaction(trans, root, ret);
if (own_trans)
btrfs_end_transaction(trans, root);
@@ -8599,6 +8951,66 @@ static int btrfs_permission(struct inode *inode, int mask)
return generic_permission(inode, mask);
}
+static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct inode *inode = NULL;
+ u64 objectid;
+ u64 index;
+ int ret = 0;
+
+ /*
+ * 5 units required for adding orphan entry
+ */
+ trans = btrfs_start_transaction(root, 5);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ ret = btrfs_find_free_ino(root, &objectid);
+ if (ret)
+ goto out;
+
+ inode = btrfs_new_inode(trans, root, dir, NULL, 0,
+ btrfs_ino(dir), objectid, mode, &index);
+ if (IS_ERR(inode)) {
+ ret = PTR_ERR(inode);
+ inode = NULL;
+ goto out;
+ }
+
+ ret = btrfs_init_inode_security(trans, inode, dir, NULL);
+ if (ret)
+ goto out;
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ goto out;
+
+ inode->i_fop = &btrfs_file_operations;
+ inode->i_op = &btrfs_file_inode_operations;
+
+ inode->i_mapping->a_ops = &btrfs_aops;
+ inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+
+ ret = btrfs_orphan_add(trans, inode);
+ if (ret)
+ goto out;
+
+ d_tmpfile(dentry, inode);
+ mark_inode_dirty(inode);
+
+out:
+ btrfs_end_transaction(trans, root);
+ if (ret)
+ iput(inode);
+ btrfs_balance_delayed_items(root);
+ btrfs_btree_balance_dirty(root);
+
+ return ret;
+}
+
static const struct inode_operations btrfs_dir_inode_operations = {
.getattr = btrfs_getattr,
.lookup = btrfs_lookup,
@@ -8617,12 +9029,15 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.removexattr = btrfs_removexattr,
.permission = btrfs_permission,
.get_acl = btrfs_get_acl,
+ .set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
+ .tmpfile = btrfs_tmpfile,
};
static const struct inode_operations btrfs_dir_ro_inode_operations = {
.lookup = btrfs_lookup,
.permission = btrfs_permission,
.get_acl = btrfs_get_acl,
+ .set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
};
@@ -8692,6 +9107,7 @@ static const struct inode_operations btrfs_file_inode_operations = {
.permission = btrfs_permission,
.fiemap = btrfs_fiemap,
.get_acl = btrfs_get_acl,
+ .set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
};
static const struct inode_operations btrfs_special_inode_operations = {
@@ -8703,6 +9119,7 @@ static const struct inode_operations btrfs_special_inode_operations = {
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
.get_acl = btrfs_get_acl,
+ .set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
};
static const struct inode_operations btrfs_symlink_inode_operations = {
@@ -8716,7 +9133,6 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
.getxattr = btrfs_getxattr,
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
- .get_acl = btrfs_get_acl,
.update_time = btrfs_update_time,
};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9d46f60cb94..47aceb494d1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -44,7 +44,6 @@
#include <linux/uuid.h>
#include <linux/btrfs.h>
#include <linux/uaccess.h>
-#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -57,6 +56,35 @@
#include "rcu-string.h"
#include "send.h"
#include "dev-replace.h"
+#include "props.h"
+#include "sysfs.h"
+#include "qgroup.h"
+
+#ifdef CONFIG_64BIT
+/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
+ * structures are incorrect, as the timespec structure from userspace
+ * is 4 bytes too small. We define these alternatives here to teach
+ * the kernel about the 32-bit struct packing.
+ */
+struct btrfs_ioctl_timespec_32 {
+ __u64 sec;
+ __u32 nsec;
+} __attribute__ ((__packed__));
+
+struct btrfs_ioctl_received_subvol_args_32 {
+ char uuid[BTRFS_UUID_SIZE]; /* in */
+ __u64 stransid; /* in */
+ __u64 rtransid; /* out */
+ struct btrfs_ioctl_timespec_32 stime; /* in */
+ struct btrfs_ioctl_timespec_32 rtime; /* out */
+ __u64 flags; /* in */
+ __u64 reserved[16]; /* in */
+} __attribute__ ((__packed__));
+
+#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
+ struct btrfs_ioctl_received_subvol_args_32)
+#endif
+
static int btrfs_clone(struct inode *src, struct inode *inode,
u64 off, u64 olen, u64 olen_aligned, u64 destoff);
@@ -108,19 +136,22 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
void btrfs_update_iflags(struct inode *inode)
{
struct btrfs_inode *ip = BTRFS_I(inode);
-
- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+ unsigned int new_fl = 0;
if (ip->flags & BTRFS_INODE_SYNC)
- inode->i_flags |= S_SYNC;
+ new_fl |= S_SYNC;
if (ip->flags & BTRFS_INODE_IMMUTABLE)
- inode->i_flags |= S_IMMUTABLE;
+ new_fl |= S_IMMUTABLE;
if (ip->flags & BTRFS_INODE_APPEND)
- inode->i_flags |= S_APPEND;
+ new_fl |= S_APPEND;
if (ip->flags & BTRFS_INODE_NOATIME)
- inode->i_flags |= S_NOATIME;
+ new_fl |= S_NOATIME;
if (ip->flags & BTRFS_INODE_DIRSYNC)
- inode->i_flags |= S_DIRSYNC;
+ new_fl |= S_DIRSYNC;
+
+ set_mask_bits(&inode->i_flags,
+ S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC,
+ new_fl);
}
/*
@@ -191,6 +222,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
unsigned int i_oldflags;
umode_t mode;
+ if (!inode_owner_or_capable(inode))
+ return -EPERM;
+
if (btrfs_root_readonly(root))
return -EROFS;
@@ -201,9 +235,6 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
if (ret)
return ret;
- if (!inode_owner_or_capable(inode))
- return -EACCES;
-
ret = mnt_want_write_file(file);
if (ret)
return ret;
@@ -281,9 +312,25 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
if (flags & FS_NOCOMP_FL) {
ip->flags &= ~BTRFS_INODE_COMPRESS;
ip->flags |= BTRFS_INODE_NOCOMPRESS;
+
+ ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
+ if (ret && ret != -ENODATA)
+ goto out_drop;
} else if (flags & FS_COMPR_FL) {
+ const char *comp;
+
ip->flags |= BTRFS_INODE_COMPRESS;
ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
+
+ if (root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
+ comp = "lzo";
+ else
+ comp = "zlib";
+ ret = btrfs_set_prop(inode, "btrfs.compression",
+ comp, strlen(comp), 0);
+ if (ret)
+ goto out_drop;
+
} else {
ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
}
@@ -321,7 +368,7 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb);
+ struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
struct btrfs_device *device;
struct request_queue *q;
struct fstrim_range range;
@@ -369,9 +416,13 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
int btrfs_is_empty_uuid(u8 *uuid)
{
- static char empty_uuid[BTRFS_UUID_SIZE] = {0};
+ int i;
- return !memcmp(uuid, empty_uuid, BTRFS_UUID_SIZE);
+ for (i = 0; i < BTRFS_UUID_SIZE; i++) {
+ if (uuid[i])
+ return 0;
+ }
+ return 1;
}
static noinline int create_subvol(struct inode *dir,
@@ -389,6 +440,7 @@ static noinline int create_subvol(struct inode *dir,
struct btrfs_root *new_root;
struct btrfs_block_rsv block_rsv;
struct timespec cur_time = CURRENT_TIME;
+ struct inode *inode;
int ret;
int err;
u64 objectid;
@@ -414,7 +466,9 @@ static noinline int create_subvol(struct inode *dir,
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- goto out;
+ btrfs_subvolume_release_metadata(root, &block_rsv,
+ qgroup_reserved);
+ return ret;
}
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
@@ -436,7 +490,7 @@ static noinline int create_subvol(struct inode *dir,
btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(leaf, objectid);
- write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(leaf),
+ write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(),
BTRFS_FSID_SIZE);
write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
btrfs_header_chunk_tree_uuid(leaf),
@@ -497,7 +551,7 @@ static noinline int create_subvol(struct inode *dir,
btrfs_record_root_in_trans(trans, new_root);
- ret = btrfs_create_subvol_root(trans, new_root, new_dirid);
+ ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
if (ret) {
/* We potentially lose an unused inode item here */
btrfs_abort_transaction(trans, root, ret);
@@ -539,6 +593,8 @@ static noinline int create_subvol(struct inode *dir,
fail:
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
+ btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
+
if (async_transid) {
*async_transid = trans->transid;
err = btrfs_commit_transaction_async(trans, root, 1);
@@ -550,13 +606,32 @@ fail:
if (err && !ret)
ret = err;
- if (!ret)
- d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
-out:
- btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
+ if (!ret) {
+ inode = btrfs_lookup_dentry(dir, dentry);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+ d_instantiate(dentry, inode);
+ }
return ret;
}
+static void btrfs_wait_nocow_write(struct btrfs_root *root)
+{
+ s64 writers;
+ DEFINE_WAIT(wait);
+
+ do {
+ prepare_to_wait(&root->subv_writers->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ writers = percpu_counter_sum(&root->subv_writers->counter);
+ if (writers)
+ schedule();
+
+ finish_wait(&root->subv_writers->wait, &wait);
+ } while (writers);
+}
+
static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct dentry *dentry, char *name, int namelen,
u64 *async_transid, bool readonly,
@@ -567,18 +642,24 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct btrfs_trans_handle *trans;
int ret;
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return -EINVAL;
+ atomic_inc(&root->will_be_snapshoted);
+ smp_mb__after_atomic();
+ btrfs_wait_nocow_write(root);
+
ret = btrfs_start_delalloc_inodes(root, 0);
if (ret)
- return ret;
+ goto out;
- btrfs_wait_ordered_extents(root);
+ btrfs_wait_ordered_extents(root, -1);
pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
- if (!pending_snapshot)
- return -ENOMEM;
+ if (!pending_snapshot) {
+ ret = -ENOMEM;
+ goto out;
+ }
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
@@ -595,7 +676,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
&pending_snapshot->qgroup_reserved,
false);
if (ret)
- goto out;
+ goto free;
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
@@ -634,20 +715,51 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (ret)
goto fail;
+ /*
+ * If orphan cleanup did remove any orphans, it means the tree was
+ * modified and therefore the commit root is not the same as the
+ * current root anymore. This is a problem, because send uses the
+ * commit root and therefore can see inode items that don't exist
+ * in the current root anymore, and for example make calls to
+ * btrfs_iget, which will do tree lookups based on the current root
+ * and not on the commit root. Those lookups will fail, returning a
+ * -ESTALE error, and making send fail with that error. So make sure
+ * a send does not see any orphans we have just removed, and that it
+ * will see the same inodes regardless of whether a transaction
+ * commit happened before it started (meaning that the commit root
+ * will be the same as the current root) or not.
+ */
+ if (readonly && pending_snapshot->snap->node !=
+ pending_snapshot->snap->commit_root) {
+ trans = btrfs_join_transaction(pending_snapshot->snap);
+ if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) {
+ ret = PTR_ERR(trans);
+ goto fail;
+ }
+ if (!IS_ERR(trans)) {
+ ret = btrfs_commit_transaction(trans,
+ pending_snapshot->snap);
+ if (ret)
+ goto fail;
+ }
+ }
+
inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto fail;
}
- BUG_ON(!inode);
+
d_instantiate(dentry, inode);
ret = 0;
fail:
btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
&pending_snapshot->block_rsv,
pending_snapshot->qgroup_reserved);
-out:
+free:
kfree(pending_snapshot);
+out:
+ atomic_dec(&root->will_be_snapshoted);
return ret;
}
@@ -688,7 +800,7 @@ static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
* nfs_async_unlink().
*/
-static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir)
+static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
{
int error;
@@ -842,7 +954,6 @@ static int find_new_extents(struct btrfs_root *root,
{
struct btrfs_path *path;
struct btrfs_key min_key;
- struct btrfs_key max_key;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *extent;
int type;
@@ -857,17 +968,14 @@ static int find_new_extents(struct btrfs_root *root,
min_key.type = BTRFS_EXTENT_DATA_KEY;
min_key.offset = *off;
- max_key.objectid = ino;
- max_key.type = (u8)-1;
- max_key.offset = (u64)-1;
-
- path->keep_locks = 1;
-
- while(1) {
- ret = btrfs_search_forward(root, &min_key, &max_key,
- path, newer_than);
+ while (1) {
+ path->keep_locks = 1;
+ ret = btrfs_search_forward(root, &min_key, path, newer_than);
if (ret != 0)
goto none;
+ path->keep_locks = 0;
+ btrfs_unlock_up_safe(path, 1);
+process_slot:
if (min_key.objectid != ino)
goto none;
if (min_key.type != BTRFS_EXTENT_DATA_KEY)
@@ -886,6 +994,12 @@ static int find_new_extents(struct btrfs_root *root,
return 0;
}
+ path->slots[0]++;
+ if (path->slots[0] < btrfs_header_nritems(leaf)) {
+ btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
+ goto process_slot;
+ }
+
if (min_key.offset == (u64)-1)
goto none;
@@ -913,10 +1027,13 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
read_unlock(&em_tree->lock);
if (!em) {
+ struct extent_state *cached = NULL;
+ u64 end = start + len - 1;
+
/* get the big lock and read metadata off disk */
- lock_extent(io_tree, start, start + len - 1);
+ lock_extent_bits(io_tree, start, end, 0, &cached);
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
- unlock_extent(io_tree, start, start + len - 1);
+ unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
if (IS_ERR(em))
return NULL;
@@ -935,7 +1052,8 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
return false;
next = defrag_lookup_extent(inode, em->start + em->len);
- if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+ if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE ||
+ (em->block_start + em->block_len == next->block_start))
ret = false;
free_extent_map(next);
@@ -1014,7 +1132,7 @@ out:
static int cluster_pages_for_defrag(struct inode *inode,
struct page **pages,
unsigned long start_index,
- int num_pages)
+ unsigned long num_pages)
{
unsigned long file_end;
u64 isize = i_size_read(inode);
@@ -1054,10 +1172,12 @@ again:
page_start = page_offset(page);
page_end = page_start + PAGE_CACHE_SIZE - 1;
while (1) {
- lock_extent(tree, page_start, page_end);
+ lock_extent_bits(tree, page_start, page_end,
+ 0, &cached_state);
ordered = btrfs_lookup_ordered_extent(inode,
page_start);
- unlock_extent(tree, page_start, page_end);
+ unlock_extent_cached(tree, page_start, page_end,
+ &cached_state, GFP_NOFS);
if (!ordered)
break;
@@ -1172,8 +1292,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
int defrag_count = 0;
int compress_type = BTRFS_COMPRESS_ZLIB;
int extent_thresh = range->extent_thresh;
- int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
- int cluster = max_cluster;
+ unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+ unsigned long cluster = max_cluster;
u64 new_align = ~((u64)128 * 1024 - 1);
struct page **pages = NULL;
@@ -1206,7 +1326,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
ra = &file->f_ra;
}
- pages = kmalloc(sizeof(struct page *) * max_cluster,
+ pages = kmalloc_array(max_cluster, sizeof(struct page *),
GFP_NOFS);
if (!pages) {
ret = -ENOMEM;
@@ -1257,7 +1377,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
break;
if (btrfs_defrag_cancelled(root->fs_info)) {
- printk(KERN_DEBUG "btrfs: defrag_file cancelled\n");
+ printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n");
ret = -EAGAIN;
break;
}
@@ -1334,8 +1454,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
}
}
- if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
+ if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
filemap_flush(inode->i_mapping);
+ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+ &BTRFS_I(inode)->runtime_flags))
+ filemap_flush(inode->i_mapping);
+ }
if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
/* the filemap_flush will queue IO into the worker threads, but
@@ -1381,6 +1505,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
struct btrfs_trans_handle *trans;
struct btrfs_device *device = NULL;
char *sizestr;
+ char *retptr;
char *devstr = NULL;
int ret = 0;
int mod = 0;
@@ -1410,29 +1535,30 @@ static noinline int btrfs_ioctl_resize(struct file *file,
sizestr = vol_args->name;
devstr = strchr(sizestr, ':');
if (devstr) {
- char *end;
sizestr = devstr + 1;
*devstr = '\0';
devstr = vol_args->name;
- devid = simple_strtoull(devstr, &end, 10);
+ ret = kstrtoull(devstr, 10, &devid);
+ if (ret)
+ goto out_free;
if (!devid) {
ret = -EINVAL;
goto out_free;
}
- printk(KERN_INFO "btrfs: resizing devid %llu\n", devid);
+ btrfs_info(root->fs_info, "resizing devid %llu", devid);
}
device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
if (!device) {
- printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
+ btrfs_info(root->fs_info, "resizer unable to find device %llu",
devid);
ret = -ENODEV;
goto out_free;
}
if (!device->writeable) {
- printk(KERN_INFO "btrfs: resizer unable to apply on "
- "readonly device %llu\n",
+ btrfs_info(root->fs_info,
+ "resizer unable to apply on readonly device %llu",
devid);
ret = -EPERM;
goto out_free;
@@ -1448,8 +1574,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
mod = 1;
sizestr++;
}
- new_size = memparse(sizestr, NULL);
- if (new_size == 0) {
+ new_size = memparse(sizestr, &retptr);
+ if (*retptr != '\0' || new_size == 0) {
ret = -EINVAL;
goto out_free;
}
@@ -1469,6 +1595,10 @@ static noinline int btrfs_ioctl_resize(struct file *file,
}
new_size = old_size - new_size;
} else if (mod > 0) {
+ if (new_size > ULLONG_MAX - old_size) {
+ ret = -ERANGE;
+ goto out_free;
+ }
new_size = old_size + new_size;
}
@@ -1484,7 +1614,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
do_div(new_size, root->sectorsize);
new_size *= root->sectorsize;
- printk_in_rcu(KERN_INFO "btrfs: new size for %s is %llu\n",
+ printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n",
rcu_str_deref(device->name), new_size);
if (new_size > old_size) {
@@ -1545,9 +1675,15 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
src_inode = file_inode(src.file);
if (src_inode->i_sb != file_inode(file)->i_sb) {
- printk(KERN_INFO "btrfs: Snapshot src from "
- "another FS\n");
- ret = -EINVAL;
+ btrfs_info(BTRFS_I(src_inode)->root->fs_info,
+ "Snapshot src from another FS");
+ ret = -EXDEV;
+ } else if (!inode_owner_or_capable(src_inode)) {
+ /*
+ * Subvolume creation is not restricted, but snapshots
+ * are limited to own subvolumes only
+ */
+ ret = -EPERM;
} else {
ret = btrfs_mksubvol(&file->f_path, name, namelen,
BTRFS_I(src_inode)->root,
@@ -1665,6 +1801,9 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
u64 flags;
int ret = 0;
+ if (!inode_owner_or_capable(inode))
+ return -EPERM;
+
ret = mnt_want_write_file(file);
if (ret)
goto out;
@@ -1689,11 +1828,6 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
goto out_drop_write;
}
- if (!inode_owner_or_capable(inode)) {
- ret = -EACCES;
- goto out_drop_write;
- }
-
down_write(&root->fs_info->subvol_sem);
/* nothing to do */
@@ -1701,12 +1835,28 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
goto out_drop_sem;
root_flags = btrfs_root_flags(&root->root_item);
- if (flags & BTRFS_SUBVOL_RDONLY)
+ if (flags & BTRFS_SUBVOL_RDONLY) {
btrfs_set_root_flags(&root->root_item,
root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
- else
- btrfs_set_root_flags(&root->root_item,
+ } else {
+ /*
+ * Block RO -> RW transition if this subvolume is involved in
+ * send
+ */
+ spin_lock(&root->root_item_lock);
+ if (root->send_in_progress == 0) {
+ btrfs_set_root_flags(&root->root_item,
root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
+ spin_unlock(&root->root_item_lock);
+ } else {
+ spin_unlock(&root->root_item_lock);
+ btrfs_warn(root->fs_info,
+ "Attempt to set subvolume %llu read-write during send",
+ root->root_key.objectid);
+ ret = -EPERM;
+ goto out_drop_sem;
+ }
+ }
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
@@ -1751,7 +1901,9 @@ static noinline int may_destroy_subvol(struct btrfs_root *root)
if (di && !IS_ERR(di)) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
if (key.objectid == root->root_key.objectid) {
- ret = -ENOTEMPTY;
+ ret = -EPERM;
+ btrfs_err(root->fs_info, "deleting default subvolume "
+ "%llu is not allowed", key.objectid);
goto out;
}
btrfs_release_path(path);
@@ -1808,7 +1960,8 @@ static noinline int copy_to_sk(struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_key *key,
struct btrfs_ioctl_search_key *sk,
- char *buf,
+ size_t *buf_size,
+ char __user *ubuf,
unsigned long *sk_offset,
int *num_found)
{
@@ -1840,13 +1993,25 @@ static noinline int copy_to_sk(struct btrfs_root *root,
if (!key_in_sk(key, sk))
continue;
- if (sizeof(sh) + item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
+ if (sizeof(sh) + item_len > *buf_size) {
+ if (*num_found) {
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * return one empty item back for v1, which does not
+ * handle -EOVERFLOW
+ */
+
+ *buf_size = sizeof(sh) + item_len;
item_len = 0;
+ ret = -EOVERFLOW;
+ }
- if (sizeof(sh) + item_len + *sk_offset >
- BTRFS_SEARCH_ARGS_BUFSIZE) {
+ if (sizeof(sh) + item_len + *sk_offset > *buf_size) {
ret = 1;
- goto overflow;
+ goto out;
}
sh.objectid = key->objectid;
@@ -1856,20 +2021,33 @@ static noinline int copy_to_sk(struct btrfs_root *root,
sh.transid = found_transid;
/* copy search result header */
- memcpy(buf + *sk_offset, &sh, sizeof(sh));
+ if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
*sk_offset += sizeof(sh);
if (item_len) {
- char *p = buf + *sk_offset;
+ char __user *up = ubuf + *sk_offset;
/* copy the item */
- read_extent_buffer(leaf, p,
- item_off, item_len);
+ if (read_extent_buffer_to_user(leaf, up,
+ item_off, item_len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
*sk_offset += item_len;
}
(*num_found)++;
- if (*num_found >= sk->nr_items)
- break;
+ if (ret) /* -EOVERFLOW from above */
+ goto out;
+
+ if (*num_found >= sk->nr_items) {
+ ret = 1;
+ goto out;
+ }
}
advance_key:
ret = 0;
@@ -1884,23 +2062,37 @@ advance_key:
key->objectid++;
} else
ret = 1;
-overflow:
+out:
+ /*
+ * 0: all items from this leaf copied, continue with next
+ * 1: * more items can be copied, but unused buffer is too small
+ * * all items were found
+ * Either way, it will stops the loop which iterates to the next
+ * leaf
+ * -EOVERFLOW: item was to large for buffer
+ * -EFAULT: could not copy extent buffer back to userspace
+ */
return ret;
}
static noinline int search_ioctl(struct inode *inode,
- struct btrfs_ioctl_search_args *args)
+ struct btrfs_ioctl_search_key *sk,
+ size_t *buf_size,
+ char __user *ubuf)
{
struct btrfs_root *root;
struct btrfs_key key;
- struct btrfs_key max_key;
struct btrfs_path *path;
- struct btrfs_ioctl_search_key *sk = &args->key;
struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
int ret;
int num_found = 0;
unsigned long sk_offset = 0;
+ if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) {
+ *buf_size = sizeof(struct btrfs_ioctl_search_header);
+ return -EOVERFLOW;
+ }
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -1914,7 +2106,7 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = (u64)-1;
root = btrfs_read_fs_root_no_name(info, &key);
if (IS_ERR(root)) {
- printk(KERN_ERR "could not find root %llu\n",
+ printk(KERN_ERR "BTRFS: could not find root %llu\n",
sk->tree_id);
btrfs_free_path(path);
return -ENOENT;
@@ -1925,28 +2117,24 @@ static noinline int search_ioctl(struct inode *inode,
key.type = sk->min_type;
key.offset = sk->min_offset;
- max_key.objectid = sk->max_objectid;
- max_key.type = sk->max_type;
- max_key.offset = sk->max_offset;
-
path->keep_locks = 1;
- while(1) {
- ret = btrfs_search_forward(root, &key, &max_key, path,
- sk->min_transid);
+ while (1) {
+ ret = btrfs_search_forward(root, &key, path, sk->min_transid);
if (ret != 0) {
if (ret > 0)
ret = 0;
goto err;
}
- ret = copy_to_sk(root, path, &key, sk, args->buf,
+ ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf,
&sk_offset, &num_found);
btrfs_release_path(path);
- if (ret || num_found >= sk->nr_items)
+ if (ret)
break;
}
- ret = 0;
+ if (ret > 0)
+ ret = 0;
err:
sk->nr_items = num_found;
btrfs_free_path(path);
@@ -1956,22 +2144,73 @@ err:
static noinline int btrfs_ioctl_tree_search(struct file *file,
void __user *argp)
{
- struct btrfs_ioctl_search_args *args;
- struct inode *inode;
- int ret;
+ struct btrfs_ioctl_search_args __user *uargs;
+ struct btrfs_ioctl_search_key sk;
+ struct inode *inode;
+ int ret;
+ size_t buf_size;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- args = memdup_user(argp, sizeof(*args));
- if (IS_ERR(args))
- return PTR_ERR(args);
+ uargs = (struct btrfs_ioctl_search_args __user *)argp;
+
+ if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
+ return -EFAULT;
+
+ buf_size = sizeof(uargs->buf);
inode = file_inode(file);
- ret = search_ioctl(inode, args);
- if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
+ ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
+
+ /*
+ * In the origin implementation an overflow is handled by returning a
+ * search header with a len of zero, so reset ret.
+ */
+ if (ret == -EOVERFLOW)
+ ret = 0;
+
+ if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk)))
ret = -EFAULT;
- kfree(args);
+ return ret;
+}
+
+static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
+ void __user *argp)
+{
+ struct btrfs_ioctl_search_args_v2 __user *uarg;
+ struct btrfs_ioctl_search_args_v2 args;
+ struct inode *inode;
+ int ret;
+ size_t buf_size;
+ const size_t buf_limit = 16 * 1024 * 1024;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* copy search header and buffer size */
+ uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
+ if (copy_from_user(&args, uarg, sizeof(args)))
+ return -EFAULT;
+
+ buf_size = args.buf_size;
+
+ if (buf_size < sizeof(struct btrfs_ioctl_search_header))
+ return -EOVERFLOW;
+
+ /* limit result size to 16MB */
+ if (buf_size > buf_limit)
+ buf_size = buf_limit;
+
+ inode = file_inode(file);
+ ret = search_ioctl(inode, &args.key, &buf_size,
+ (char *)(&uarg->buf[0]));
+ if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
+ ret = -EFAULT;
+ else if (ret == -EOVERFLOW &&
+ copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size)))
+ ret = -EFAULT;
+
return ret;
}
@@ -2009,7 +2248,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
key.offset = (u64)-1;
root = btrfs_read_fs_root_no_name(info, &key);
if (IS_ERR(root)) {
- printk(KERN_ERR "could not find root %llu\n", tree_id);
+ printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id);
ret = -ENOENT;
goto out;
}
@@ -2018,7 +2257,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
- while(1) {
+ while (1) {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
@@ -2047,7 +2286,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
}
*(ptr + len) = '/';
- read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len);
+ read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
break;
@@ -2058,7 +2297,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
dirid = key.objectid;
}
memmove(name, ptr, total_len);
- name[total_len]='\0';
+ name[total_len] = '\0';
ret = 0;
out:
btrfs_free_path(path);
@@ -2098,7 +2337,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
static noinline int btrfs_ioctl_snap_destroy(struct file *file,
void __user *arg)
{
- struct dentry *parent = fdentry(file);
+ struct dentry *parent = file->f_path.dentry;
struct dentry *dentry;
struct inode *dir = parent->d_inode;
struct inode *inode;
@@ -2107,6 +2346,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
struct btrfs_ioctl_vol_args *vol_args;
struct btrfs_trans_handle *trans;
struct btrfs_block_rsv block_rsv;
+ u64 root_flags;
u64 qgroup_reserved;
int namelen;
int ret;
@@ -2128,9 +2368,10 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
if (err)
goto out;
+
err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
if (err == -EINTR)
- goto out;
+ goto out_drop_write;
dentry = lookup_one_len(vol_args->name, parent, namelen);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
@@ -2144,7 +2385,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
inode = dentry->d_inode;
dest = BTRFS_I(inode)->root;
- if (!capable(CAP_SYS_ADMIN)){
+ if (!capable(CAP_SYS_ADMIN)) {
/*
* Regular user. Only allow this with a special mount
* option, when the user has write+exec access to the
@@ -2189,6 +2430,27 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
}
mutex_lock(&inode->i_mutex);
+
+ /*
+ * Don't allow to delete a subvolume with send in progress. This is
+ * inside the i_mutex so the error handling that has to drop the bit
+ * again is not run concurrently.
+ */
+ spin_lock(&dest->root_item_lock);
+ root_flags = btrfs_root_flags(&dest->root_item);
+ if (dest->send_in_progress == 0) {
+ btrfs_set_root_flags(&dest->root_item,
+ root_flags | BTRFS_ROOT_SUBVOL_DEAD);
+ spin_unlock(&dest->root_item_lock);
+ } else {
+ spin_unlock(&dest->root_item_lock);
+ btrfs_warn(root->fs_info,
+ "Attempt to delete subvolume %llu during send",
+ dest->root_key.objectid);
+ err = -EPERM;
+ goto out_dput;
+ }
+
err = d_invalidate(dentry);
if (err)
goto out_unlock;
@@ -2234,7 +2496,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
dest->root_item.drop_level = 0;
btrfs_set_root_refs(&dest->root_item, 0);
- if (!xchg(&dest->orphan_item_inserted, 1)) {
+ if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
ret = btrfs_insert_orphan_item(trans,
root->fs_info->tree_root,
dest->root_key.objectid);
@@ -2277,11 +2539,19 @@ out_release:
out_up_write:
up_write(&root->fs_info->subvol_sem);
out_unlock:
+ if (err) {
+ spin_lock(&dest->root_item_lock);
+ root_flags = btrfs_root_flags(&dest->root_item);
+ btrfs_set_root_flags(&dest->root_item,
+ root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
+ spin_unlock(&dest->root_item_lock);
+ }
mutex_unlock(&inode->i_mutex);
if (!err) {
shrink_dcache_sb(root->fs_info->sb);
btrfs_invalidate_inodes(dest);
d_delete(dentry);
+ ASSERT(dest->send_in_progress == 0);
/* the last ref */
if (dest->cache_inode) {
@@ -2293,6 +2563,7 @@ out_dput:
dput(dentry);
out_unlock_dir:
mutex_unlock(&dir->i_mutex);
+out_drop_write:
mnt_drop_write_file(file);
out:
kfree(vol_args);
@@ -2444,9 +2715,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
int ret = 0;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
if (!fi_args)
return -ENOMEM;
@@ -2461,6 +2729,10 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
}
mutex_unlock(&fs_devices->device_list_mutex);
+ fi_args->nodesize = root->fs_info->super_copy->nodesize;
+ fi_args->sectorsize = root->fs_info->super_copy->sectorsize;
+ fi_args->clone_alignment = root->fs_info->super_copy->sectorsize;
+
if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
ret = -EFAULT;
@@ -2476,9 +2748,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
int ret = 0;
char *s_uuid = NULL;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
di_args = memdup_user(arg, sizeof(*di_args));
if (IS_ERR(di_args))
return PTR_ERR(di_args);
@@ -2556,10 +2825,15 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
ordered = btrfs_lookup_first_ordered_extent(inode,
off + len - 1);
- if (!ordered &&
+ if ((!ordered ||
+ ordered->file_offset + ordered->len <= off ||
+ ordered->file_offset >= off + len) &&
!test_range_bit(&BTRFS_I(inode)->io_tree, off,
- off + len - 1, EXTENT_DELALLOC, 0, NULL))
+ off + len - 1, EXTENT_DELALLOC, 0, NULL)) {
+ if (ordered)
+ btrfs_put_ordered_extent(ordered);
break;
+ }
unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
if (ordered)
btrfs_put_ordered_extent(ordered);
@@ -2694,14 +2968,11 @@ out_unlock:
#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024)
static long btrfs_ioctl_file_extent_same(struct file *file,
- void __user *argp)
+ struct btrfs_ioctl_same_args __user *argp)
{
- struct btrfs_ioctl_same_args tmp;
struct btrfs_ioctl_same_args *same;
struct btrfs_ioctl_same_extent_info *info;
- struct inode *src = file->f_dentry->d_inode;
- struct file *dst_file = NULL;
- struct inode *dst;
+ struct inode *src = file_inode(file);
u64 off;
u64 len;
int i;
@@ -2709,6 +2980,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
unsigned long size;
u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
bool is_admin = capable(CAP_SYS_ADMIN);
+ u16 count;
if (!(file->f_mode & FMODE_READ))
return -EINVAL;
@@ -2717,25 +2989,17 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
if (ret)
return ret;
- if (copy_from_user(&tmp,
- (struct btrfs_ioctl_same_args __user *)argp,
- sizeof(tmp))) {
+ if (get_user(count, &argp->dest_count)) {
ret = -EFAULT;
goto out;
}
- size = sizeof(tmp) +
- tmp.dest_count * sizeof(struct btrfs_ioctl_same_extent_info);
+ size = offsetof(struct btrfs_ioctl_same_args __user, info[count]);
- same = kmalloc(size, GFP_NOFS);
- if (!same) {
- ret = -EFAULT;
- goto out;
- }
+ same = memdup_user(argp, size);
- if (copy_from_user(same,
- (struct btrfs_ioctl_same_args __user *)argp, size)) {
- ret = -EFAULT;
+ if (IS_ERR(same)) {
+ ret = PTR_ERR(same);
goto out;
}
@@ -2769,52 +3033,35 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
goto out;
/* pre-format output fields to sane values */
- for (i = 0; i < same->dest_count; i++) {
+ for (i = 0; i < count; i++) {
same->info[i].bytes_deduped = 0ULL;
same->info[i].status = 0;
}
- ret = 0;
- for (i = 0; i < same->dest_count; i++) {
- info = &same->info[i];
-
- dst_file = fget(info->fd);
- if (!dst_file) {
+ for (i = 0, info = same->info; i < count; i++, info++) {
+ struct inode *dst;
+ struct fd dst_file = fdget(info->fd);
+ if (!dst_file.file) {
info->status = -EBADF;
- goto next;
+ continue;
}
+ dst = file_inode(dst_file.file);
- if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
+ if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) {
info->status = -EINVAL;
- goto next;
- }
-
- info->status = -EXDEV;
- if (file->f_path.mnt != dst_file->f_path.mnt)
- goto next;
-
- dst = dst_file->f_dentry->d_inode;
- if (src->i_sb != dst->i_sb)
- goto next;
-
- if (S_ISDIR(dst->i_mode)) {
+ } else if (file->f_path.mnt != dst_file.file->f_path.mnt) {
+ info->status = -EXDEV;
+ } else if (S_ISDIR(dst->i_mode)) {
info->status = -EISDIR;
- goto next;
- }
-
- if (!S_ISREG(dst->i_mode)) {
+ } else if (!S_ISREG(dst->i_mode)) {
info->status = -EACCES;
- goto next;
+ } else {
+ info->status = btrfs_extent_same(src, off, len, dst,
+ info->logical_offset);
+ if (info->status == 0)
+ info->bytes_deduped += len;
}
-
- info->status = btrfs_extent_same(src, off, len, dst,
- info->logical_offset);
- if (info->status == 0)
- info->bytes_deduped += len;
-
-next:
- if (dst_file)
- fput(dst_file);
+ fdput(dst_file);
}
ret = copy_to_user(argp, same, size);
@@ -2826,6 +3073,129 @@ out:
return ret;
}
+/* Helper to check and see if this root currently has a ref on the given disk
+ * bytenr. If it does then we need to update the quota for this root. This
+ * doesn't do anything if quotas aren't enabled.
+ */
+static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+ u64 disko)
+{
+ struct seq_list tree_mod_seq_elem = {};
+ struct ulist *roots;
+ struct ulist_iterator uiter;
+ struct ulist_node *root_node = NULL;
+ int ret;
+
+ if (!root->fs_info->quota_enabled)
+ return 1;
+
+ btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
+ ret = btrfs_find_all_roots(trans, root->fs_info, disko,
+ tree_mod_seq_elem.seq, &roots);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+ ULIST_ITER_INIT(&uiter);
+ while ((root_node = ulist_next(roots, &uiter))) {
+ if (root_node->val == root->objectid) {
+ ret = 1;
+ break;
+ }
+ }
+ ulist_free(roots);
+out:
+ btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
+ return ret;
+}
+
+static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ u64 endoff,
+ const u64 destoff,
+ const u64 olen)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ inode_inc_iversion(inode);
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ /*
+ * We round up to the block size at eof when determining which
+ * extents to clone above, but shouldn't round up the file size.
+ */
+ if (endoff > destoff + olen)
+ endoff = destoff + olen;
+ if (endoff > inode->i_size)
+ btrfs_i_size_write(inode, endoff);
+
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+ ret = btrfs_end_transaction(trans, root);
+out:
+ return ret;
+}
+
+static void clone_update_extent_map(struct inode *inode,
+ const struct btrfs_trans_handle *trans,
+ const struct btrfs_path *path,
+ const u64 hole_offset,
+ const u64 hole_len)
+{
+ struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ struct extent_map *em;
+ int ret;
+
+ em = alloc_extent_map();
+ if (!em) {
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ return;
+ }
+
+ if (path) {
+ struct btrfs_file_extent_item *fi;
+
+ fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ btrfs_extent_item_to_extent_map(inode, path, fi, false, em);
+ em->generation = -1;
+ if (btrfs_file_extent_type(path->nodes[0], fi) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ } else {
+ em->start = hole_offset;
+ em->len = hole_len;
+ em->ram_bytes = em->len;
+ em->orig_start = hole_offset;
+ em->block_start = EXTENT_MAP_HOLE;
+ em->block_len = 0;
+ em->orig_block_len = 0;
+ em->compress_type = BTRFS_COMPRESS_NONE;
+ em->generation = trans->transid;
+ }
+
+ while (1) {
+ write_lock(&em_tree->lock);
+ ret = add_extent_mapping(em_tree, em, 1);
+ write_unlock(&em_tree->lock);
+ if (ret != -EEXIST) {
+ free_extent_map(em);
+ break;
+ }
+ btrfs_drop_extent_cache(inode, em->start,
+ em->start + em->len - 1, 0);
+ }
+
+ if (unlikely(ret))
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+}
+
/**
* btrfs_clone() - clone a range from inode file to another
*
@@ -2838,7 +3208,8 @@ out:
* @destoff: Offset within @inode to start clone
*/
static int btrfs_clone(struct inode *src, struct inode *inode,
- u64 off, u64 olen, u64 olen_aligned, u64 destoff)
+ const u64 off, const u64 olen, const u64 olen_aligned,
+ const u64 destoff)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path = NULL;
@@ -2849,7 +3220,10 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
u32 nritems;
int slot;
int ret;
- u64 len = olen_aligned;
+ int no_quota;
+ const u64 len = olen_aligned;
+ u64 last_disko = 0;
+ u64 last_dest_end = destoff;
ret = -ENOMEM;
buf = vmalloc(btrfs_level_size(root, 0));
@@ -2866,19 +3240,33 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
/* clone data */
key.objectid = btrfs_ino(src);
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = 0;
+ key.offset = off;
while (1) {
/*
* note the key will change type as we walk through the
* tree.
*/
+ path->leave_spinning = 1;
ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
0, 0);
if (ret < 0)
goto out;
+ /*
+ * First search, if no extent item that starts at offset off was
+ * found but the previous item is an extent item, it's possible
+ * it might overlap our target range, therefore process it.
+ */
+ if (key.offset == off && ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0] - 1);
+ if (key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
nritems = btrfs_header_nritems(path->nodes[0]);
+process_slot:
+ no_quota = 1;
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
if (ret < 0)
@@ -2903,12 +3291,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
u64 disko = 0, diskl = 0;
u64 datao = 0, datal = 0;
u8 comp;
- u64 endoff;
-
- size = btrfs_item_size_nr(leaf, slot);
- read_extent_buffer(leaf, buf,
- btrfs_item_ptr_offset(leaf, slot),
- size);
+ u64 drop_start;
extent = btrfs_item_ptr(leaf, slot,
struct btrfs_file_extent_item);
@@ -2928,11 +3311,26 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
datal = btrfs_file_extent_ram_bytes(leaf,
extent);
}
- btrfs_release_path(path);
- if (key.offset + datal <= off ||
- key.offset >= off + len - 1)
- goto next;
+ /*
+ * The first search might have left us at an extent
+ * item that ends before our target range's start, can
+ * happen if we have holes and NO_HOLES feature enabled.
+ */
+ if (key.offset + datal <= off) {
+ path->slots[0]++;
+ goto process_slot;
+ } else if (key.offset >= off + len) {
+ break;
+ }
+
+ size = btrfs_item_size_nr(leaf, slot);
+ read_extent_buffer(leaf, buf,
+ btrfs_item_ptr_offset(leaf, slot),
+ size);
+
+ btrfs_release_path(path);
+ path->leave_spinning = 0;
memcpy(&new_key, &key, sizeof(new_key));
new_key.objectid = btrfs_ino(inode);
@@ -2942,6 +3340,18 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
new_key.offset = destoff;
/*
+ * Deal with a hole that doesn't have an extent item
+ * that represents it (NO_HOLES feature enabled).
+ * This hole is either in the middle of the cloning
+ * range or at the beginning (fully overlaps it or
+ * partially overlaps it).
+ */
+ if (new_key.offset != last_dest_end)
+ drop_start = last_dest_end;
+ else
+ drop_start = new_key.offset;
+
+ /*
* 1 - adjusting old extent (we may have to split it)
* 1 - add new extent
* 1 - inode update
@@ -2959,23 +3369,24 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
* | ------------- extent ------------- |
*/
- /* substract range b */
+ /* subtract range b */
if (key.offset + datal > off + len)
datal = off + len - key.offset;
- /* substract range a */
+ /* subtract range a */
if (off > key.offset) {
datao += off - key.offset;
datal -= off - key.offset;
}
ret = btrfs_drop_extents(trans, root, inode,
- new_key.offset,
+ drop_start,
new_key.offset + datal,
1);
if (ret) {
- btrfs_abort_transaction(trans, root,
- ret);
+ if (ret != -EOPNOTSUPP)
+ btrfs_abort_transaction(trans,
+ root, ret);
btrfs_end_transaction(trans, root);
goto out;
}
@@ -3006,6 +3417,28 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
datao);
btrfs_set_file_extent_num_bytes(leaf, extent,
datal);
+
+ /*
+ * We need to look up the roots that point at
+ * this bytenr and see if the new root does. If
+ * it does not we need to make sure we update
+ * quotas appropriately.
+ */
+ if (disko && root != BTRFS_I(src)->root &&
+ disko != last_disko) {
+ no_quota = check_ref(trans, root,
+ disko);
+ if (no_quota < 0) {
+ btrfs_abort_transaction(trans,
+ root,
+ ret);
+ btrfs_end_transaction(trans,
+ root);
+ ret = no_quota;
+ goto out;
+ }
+ }
+
if (disko) {
inode_add_bytes(inode, datal);
ret = btrfs_inc_extent_ref(trans, root,
@@ -3013,7 +3446,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
root->root_key.objectid,
btrfs_ino(inode),
new_key.offset - datao,
- 0);
+ no_quota);
if (ret) {
btrfs_abort_transaction(trans,
root,
@@ -3027,6 +3460,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
u64 skip = 0;
u64 trim = 0;
+ u64 aligned_end = 0;
+
if (off > key.offset) {
skip = off - key.offset;
new_key.offset += skip;
@@ -3043,13 +3478,16 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
size -= skip + trim;
datal -= skip + trim;
+ aligned_end = ALIGN(new_key.offset + datal,
+ root->sectorsize);
ret = btrfs_drop_extents(trans, root, inode,
- new_key.offset,
- new_key.offset + datal,
+ drop_start,
+ aligned_end,
1);
if (ret) {
- btrfs_abort_transaction(trans, root,
- ret);
+ if (ret != -EOPNOTSUPP)
+ btrfs_abort_transaction(trans,
+ root, ret);
btrfs_end_transaction(trans, root);
goto out;
}
@@ -3078,39 +3516,62 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
inode_add_bytes(inode, datal);
}
+ /* If we have an implicit hole (NO_HOLES feature). */
+ if (drop_start < new_key.offset)
+ clone_update_extent_map(inode, trans,
+ NULL, drop_start,
+ new_key.offset - drop_start);
+
+ clone_update_extent_map(inode, trans, path, 0, 0);
+
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
- inode_inc_iversion(inode);
- inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-
- /*
- * we round up to the block size at eof when
- * determining which extents to clone above,
- * but shouldn't round up the file size
- */
- endoff = new_key.offset + datal;
- if (endoff > destoff+olen)
- endoff = destoff+olen;
- if (endoff > inode->i_size)
- btrfs_i_size_write(inode, endoff);
-
- ret = btrfs_update_inode(trans, root, inode);
- if (ret) {
- btrfs_abort_transaction(trans, root, ret);
- btrfs_end_transaction(trans, root);
+ last_dest_end = new_key.offset + datal;
+ ret = clone_finish_inode_update(trans, inode,
+ last_dest_end,
+ destoff, olen);
+ if (ret)
goto out;
- }
- ret = btrfs_end_transaction(trans, root);
+ if (new_key.offset + datal >= destoff + len)
+ break;
}
-next:
btrfs_release_path(path);
key.offset++;
}
ret = 0;
+ if (last_dest_end < destoff + len) {
+ /*
+ * We have an implicit hole (NO_HOLES feature is enabled) that
+ * fully or partially overlaps our cloning range at its end.
+ */
+ btrfs_release_path(path);
+
+ /*
+ * 1 - remove extent(s)
+ * 1 - inode update
+ */
+ trans = btrfs_start_transaction(root, 2);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+ ret = btrfs_drop_extents(trans, root, inode,
+ last_dest_end, destoff + len, 1);
+ if (ret) {
+ if (ret != -EOPNOTSUPP)
+ btrfs_abort_transaction(trans, root, ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+ clone_update_extent_map(inode, trans, NULL, last_dest_end,
+ destoff + len - last_dest_end);
+ ret = clone_finish_inode_update(trans, inode, destoff + len,
+ destoff, olen);
+ }
+
out:
- btrfs_release_path(path);
btrfs_free_path(path);
vfree(buf);
return ret;
@@ -3119,7 +3580,7 @@ out:
static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
u64 off, u64 olen, u64 destoff)
{
- struct inode *inode = fdentry(file)->d_inode;
+ struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct fd src_file;
struct inode *src;
@@ -3134,8 +3595,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
* decompress into destination's address_space (the file offset
* may change, so source mapping won't do), then recompress (or
* otherwise reinsert) a subrange.
- * - allow ranges within the same file to be cloned (provided
- * they don't overlap)?
+ *
+ * - split destination inode's inline extents. The inline extents can
+ * be either compressed or non-compressed.
*/
/* the destination must be opened for writing */
@@ -3221,19 +3683,53 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
goto out_unlock;
}
- /* truncate page cache pages from target inode range */
- truncate_inode_pages_range(&inode->i_data, destoff,
- PAGE_CACHE_ALIGN(destoff + len) - 1);
+ /*
+ * Lock the target range too. Right after we replace the file extent
+ * items in the fs tree (which now point to the cloned data), we might
+ * have a worker replace them with extent items relative to a write
+ * operation that was issued before this clone operation (i.e. confront
+ * with inode.c:btrfs_finish_ordered_io).
+ */
+ if (same_inode) {
+ u64 lock_start = min_t(u64, off, destoff);
+ u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
- lock_extent_range(src, off, len);
+ lock_extent_range(src, lock_start, lock_len);
+ } else {
+ lock_extent_range(src, off, len);
+ lock_extent_range(inode, destoff, len);
+ }
ret = btrfs_clone(src, inode, off, olen, len, destoff);
- unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
+ if (same_inode) {
+ u64 lock_start = min_t(u64, off, destoff);
+ u64 lock_end = max_t(u64, off, destoff) + len - 1;
+
+ unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
+ } else {
+ unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
+ unlock_extent(&BTRFS_I(inode)->io_tree, destoff,
+ destoff + len - 1);
+ }
+ /*
+ * Truncate page cache pages so that future reads will see the cloned
+ * data immediately and not the previous data.
+ */
+ truncate_inode_pages_range(&inode->i_data, destoff,
+ PAGE_CACHE_ALIGN(destoff + len) - 1);
out_unlock:
- mutex_unlock(&src->i_mutex);
- if (!same_inode)
- mutex_unlock(&inode->i_mutex);
+ if (!same_inode) {
+ if (inode < src) {
+ mutex_unlock(&src->i_mutex);
+ mutex_unlock(&inode->i_mutex);
+ } else {
+ mutex_unlock(&inode->i_mutex);
+ mutex_unlock(&src->i_mutex);
+ }
+ } else {
+ mutex_unlock(&src->i_mutex);
+ }
out_fput:
fdput(src_file);
out_drop_write:
@@ -3356,8 +3852,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
if (IS_ERR_OR_NULL(di)) {
btrfs_free_path(path);
btrfs_end_transaction(trans, root);
- printk(KERN_ERR "Umm, you don't have the default dir item, "
- "this isn't going to work\n");
+ btrfs_err(new_root->fs_info, "Umm, you don't have the default dir"
+ "item, this isn't going to work");
ret = -ENOENT;
goto out;
}
@@ -3438,6 +3934,11 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
up_read(&info->groups_sem);
}
+ /*
+ * Global block reserve, exported as a space_info
+ */
+ slot_count++;
+
/* space_slots == 0 means they are asking for a count */
if (space_args.space_slots == 0) {
space_args.total_spaces = slot_count;
@@ -3496,6 +3997,21 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
up_read(&info->groups_sem);
}
+ /*
+ * Add global block reserve
+ */
+ if (slot_count) {
+ struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv;
+
+ spin_lock(&block_rsv->lock);
+ space.total_bytes = block_rsv->size;
+ space.used_bytes = block_rsv->size - block_rsv->reserved;
+ spin_unlock(&block_rsv->lock);
+ space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV;
+ memcpy(dest, &space, sizeof(space));
+ space_args.total_spaces++;
+ }
+
user_dest = (struct btrfs_ioctl_space_info __user *)
(arg + sizeof(struct btrfs_ioctl_space_args));
@@ -3679,9 +4195,10 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
switch (p->cmd) {
case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
- if (root->fs_info->sb->s_flags & MS_RDONLY)
- return -EROFS;
-
+ if (root->fs_info->sb->s_flags & MS_RDONLY) {
+ ret = -EROFS;
+ goto out;
+ }
if (atomic_xchg(
&root->fs_info->mutually_exclusive_operation_running,
1)) {
@@ -3707,7 +4224,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
if (copy_to_user(arg, p, sizeof(*p)))
ret = -EFAULT;
-
+out:
kfree(p);
return ret;
}
@@ -4317,7 +4834,7 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
{
- struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -4325,10 +4842,9 @@ static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
return btrfs_qgroup_wait_for_completion(root->fs_info);
}
-static long btrfs_ioctl_set_received_subvol(struct file *file,
- void __user *arg)
+static long _btrfs_ioctl_set_received_subvol(struct file *file,
+ struct btrfs_ioctl_received_subvol_args *sa)
{
- struct btrfs_ioctl_received_subvol_args *sa = NULL;
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_root_item *root_item = &root->root_item;
@@ -4337,6 +4853,9 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
int ret = 0;
int received_uuid_changed;
+ if (!inode_owner_or_capable(inode))
+ return -EPERM;
+
ret = mnt_want_write_file(file);
if (ret < 0)
return ret;
@@ -4353,18 +4872,6 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
goto out;
}
- if (!inode_owner_or_capable(inode)) {
- ret = -EACCES;
- goto out;
- }
-
- sa = memdup_user(arg, sizeof(*sa));
- if (IS_ERR(sa)) {
- ret = PTR_ERR(sa);
- sa = NULL;
- goto out;
- }
-
/*
* 1 - root item
* 2 - uuid items (received uuid + subvol uuid)
@@ -4418,14 +4925,90 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
goto out;
}
+out:
+ up_write(&root->fs_info->subvol_sem);
+ mnt_drop_write_file(file);
+ return ret;
+}
+
+#ifdef CONFIG_64BIT
+static long btrfs_ioctl_set_received_subvol_32(struct file *file,
+ void __user *arg)
+{
+ struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
+ struct btrfs_ioctl_received_subvol_args *args64 = NULL;
+ int ret = 0;
+
+ args32 = memdup_user(arg, sizeof(*args32));
+ if (IS_ERR(args32)) {
+ ret = PTR_ERR(args32);
+ args32 = NULL;
+ goto out;
+ }
+
+ args64 = kmalloc(sizeof(*args64), GFP_NOFS);
+ if (!args64) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
+ args64->stransid = args32->stransid;
+ args64->rtransid = args32->rtransid;
+ args64->stime.sec = args32->stime.sec;
+ args64->stime.nsec = args32->stime.nsec;
+ args64->rtime.sec = args32->rtime.sec;
+ args64->rtime.nsec = args32->rtime.nsec;
+ args64->flags = args32->flags;
+
+ ret = _btrfs_ioctl_set_received_subvol(file, args64);
+ if (ret)
+ goto out;
+
+ memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
+ args32->stransid = args64->stransid;
+ args32->rtransid = args64->rtransid;
+ args32->stime.sec = args64->stime.sec;
+ args32->stime.nsec = args64->stime.nsec;
+ args32->rtime.sec = args64->rtime.sec;
+ args32->rtime.nsec = args64->rtime.nsec;
+ args32->flags = args64->flags;
+
+ ret = copy_to_user(arg, args32, sizeof(*args32));
+ if (ret)
+ ret = -EFAULT;
+
+out:
+ kfree(args32);
+ kfree(args64);
+ return ret;
+}
+#endif
+
+static long btrfs_ioctl_set_received_subvol(struct file *file,
+ void __user *arg)
+{
+ struct btrfs_ioctl_received_subvol_args *sa = NULL;
+ int ret = 0;
+
+ sa = memdup_user(arg, sizeof(*sa));
+ if (IS_ERR(sa)) {
+ ret = PTR_ERR(sa);
+ sa = NULL;
+ goto out;
+ }
+
+ ret = _btrfs_ioctl_set_received_subvol(file, sa);
+
+ if (ret)
+ goto out;
+
ret = copy_to_user(arg, sa, sizeof(*sa));
if (ret)
ret = -EFAULT;
out:
kfree(sa);
- up_write(&root->fs_info->subvol_sem);
- mnt_drop_write_file(file);
return ret;
}
@@ -4443,8 +5026,8 @@ static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
len = strnlen(label, BTRFS_LABEL_SIZE);
if (len == BTRFS_LABEL_SIZE) {
- pr_warn("btrfs: label is too long, return the first %zu bytes\n",
- --len);
+ btrfs_warn(root->fs_info,
+ "label is too long, return the first %zu bytes", --len);
}
ret = copy_to_user(arg, label, len);
@@ -4467,7 +5050,7 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
return -EFAULT;
if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
- pr_err("btrfs: unable to set label with more than %d bytes\n",
+ btrfs_err(root->fs_info, "unable to set label with more than %d bytes",
BTRFS_LABEL_SIZE - 1);
return -EINVAL;
}
@@ -4485,13 +5068,173 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
spin_lock(&root->fs_info->super_lock);
strcpy(super_block->label, label);
spin_unlock(&root->fs_info->super_lock);
- ret = btrfs_end_transaction(trans, root);
+ ret = btrfs_commit_transaction(trans, root);
out_unlock:
mnt_drop_write_file(file);
return ret;
}
+#define INIT_FEATURE_FLAGS(suffix) \
+ { .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \
+ .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \
+ .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix }
+
+static int btrfs_ioctl_get_supported_features(struct file *file,
+ void __user *arg)
+{
+ static struct btrfs_ioctl_feature_flags features[3] = {
+ INIT_FEATURE_FLAGS(SUPP),
+ INIT_FEATURE_FLAGS(SAFE_SET),
+ INIT_FEATURE_FLAGS(SAFE_CLEAR)
+ };
+
+ if (copy_to_user(arg, &features, sizeof(features)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int btrfs_ioctl_get_features(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_super_block *super_block = root->fs_info->super_copy;
+ struct btrfs_ioctl_feature_flags features;
+
+ features.compat_flags = btrfs_super_compat_flags(super_block);
+ features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block);
+ features.incompat_flags = btrfs_super_incompat_flags(super_block);
+
+ if (copy_to_user(arg, &features, sizeof(features)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int check_feature_bits(struct btrfs_root *root,
+ enum btrfs_feature_set set,
+ u64 change_mask, u64 flags, u64 supported_flags,
+ u64 safe_set, u64 safe_clear)
+{
+ const char *type = btrfs_feature_set_names[set];
+ char *names;
+ u64 disallowed, unsupported;
+ u64 set_mask = flags & change_mask;
+ u64 clear_mask = ~flags & change_mask;
+
+ unsupported = set_mask & ~supported_flags;
+ if (unsupported) {
+ names = btrfs_printable_features(set, unsupported);
+ if (names) {
+ btrfs_warn(root->fs_info,
+ "this kernel does not support the %s feature bit%s",
+ names, strchr(names, ',') ? "s" : "");
+ kfree(names);
+ } else
+ btrfs_warn(root->fs_info,
+ "this kernel does not support %s bits 0x%llx",
+ type, unsupported);
+ return -EOPNOTSUPP;
+ }
+
+ disallowed = set_mask & ~safe_set;
+ if (disallowed) {
+ names = btrfs_printable_features(set, disallowed);
+ if (names) {
+ btrfs_warn(root->fs_info,
+ "can't set the %s feature bit%s while mounted",
+ names, strchr(names, ',') ? "s" : "");
+ kfree(names);
+ } else
+ btrfs_warn(root->fs_info,
+ "can't set %s bits 0x%llx while mounted",
+ type, disallowed);
+ return -EPERM;
+ }
+
+ disallowed = clear_mask & ~safe_clear;
+ if (disallowed) {
+ names = btrfs_printable_features(set, disallowed);
+ if (names) {
+ btrfs_warn(root->fs_info,
+ "can't clear the %s feature bit%s while mounted",
+ names, strchr(names, ',') ? "s" : "");
+ kfree(names);
+ } else
+ btrfs_warn(root->fs_info,
+ "can't clear %s bits 0x%llx while mounted",
+ type, disallowed);
+ return -EPERM;
+ }
+
+ return 0;
+}
+
+#define check_feature(root, change_mask, flags, mask_base) \
+check_feature_bits(root, FEAT_##mask_base, change_mask, flags, \
+ BTRFS_FEATURE_ ## mask_base ## _SUPP, \
+ BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \
+ BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR)
+
+static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
+{
+ struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
+ struct btrfs_super_block *super_block = root->fs_info->super_copy;
+ struct btrfs_ioctl_feature_flags flags[2];
+ struct btrfs_trans_handle *trans;
+ u64 newflags;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (copy_from_user(flags, arg, sizeof(flags)))
+ return -EFAULT;
+
+ /* Nothing to do */
+ if (!flags[0].compat_flags && !flags[0].compat_ro_flags &&
+ !flags[0].incompat_flags)
+ return 0;
+
+ ret = check_feature(root, flags[0].compat_flags,
+ flags[1].compat_flags, COMPAT);
+ if (ret)
+ return ret;
+
+ ret = check_feature(root, flags[0].compat_ro_flags,
+ flags[1].compat_ro_flags, COMPAT_RO);
+ if (ret)
+ return ret;
+
+ ret = check_feature(root, flags[0].incompat_flags,
+ flags[1].incompat_flags, INCOMPAT);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ spin_lock(&root->fs_info->super_lock);
+ newflags = btrfs_super_compat_flags(super_block);
+ newflags |= flags[0].compat_flags & flags[1].compat_flags;
+ newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags);
+ btrfs_set_super_compat_flags(super_block, newflags);
+
+ newflags = btrfs_super_compat_ro_flags(super_block);
+ newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags;
+ newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags);
+ btrfs_set_super_compat_ro_flags(super_block, newflags);
+
+ newflags = btrfs_super_incompat_flags(super_block);
+ newflags |= flags[0].incompat_flags & flags[1].incompat_flags;
+ newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags);
+ btrfs_set_super_incompat_flags(super_block, newflags);
+ spin_unlock(&root->fs_info->super_lock);
+
+ return btrfs_commit_transaction(trans, root);
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -4549,6 +5292,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_trans_end(file);
case BTRFS_IOC_TREE_SEARCH:
return btrfs_ioctl_tree_search(file, argp);
+ case BTRFS_IOC_TREE_SEARCH_V2:
+ return btrfs_ioctl_tree_search_v2(file, argp);
case BTRFS_IOC_INO_LOOKUP:
return btrfs_ioctl_ino_lookup(file, argp);
case BTRFS_IOC_INO_PATHS:
@@ -4557,9 +5302,15 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_logical_to_ino(root, argp);
case BTRFS_IOC_SPACE_INFO:
return btrfs_ioctl_space_info(root, argp);
- case BTRFS_IOC_SYNC:
- btrfs_sync_fs(file->f_dentry->d_sb, 1);
- return 0;
+ case BTRFS_IOC_SYNC: {
+ int ret;
+
+ ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
+ if (ret)
+ return ret;
+ ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
+ return ret;
+ }
case BTRFS_IOC_START_SYNC:
return btrfs_ioctl_start_sync(root, argp);
case BTRFS_IOC_WAIT_SYNC:
@@ -4578,6 +5329,10 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_balance_progress(root, argp);
case BTRFS_IOC_SET_RECEIVED_SUBVOL:
return btrfs_ioctl_set_received_subvol(file, argp);
+#ifdef CONFIG_64BIT
+ case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
+ return btrfs_ioctl_set_received_subvol_32(file, argp);
+#endif
case BTRFS_IOC_SEND:
return btrfs_ioctl_send(file, argp);
case BTRFS_IOC_GET_DEV_STATS:
@@ -4604,6 +5359,12 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_set_fslabel(file, argp);
case BTRFS_IOC_FILE_EXTENT_SAME:
return btrfs_ioctl_file_extent_same(file, argp);
+ case BTRFS_IOC_GET_SUPPORTED_FEATURES:
+ return btrfs_ioctl_get_supported_features(file, argp);
+ case BTRFS_IOC_GET_FEATURES:
+ return btrfs_ioctl_get_features(file, argp);
+ case BTRFS_IOC_SET_FEATURES:
+ return btrfs_ioctl_set_features(file, argp);
}
return -ENOTTY;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 01277b8f237..5665d214924 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -33,14 +33,14 @@ static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
*/
void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
{
- if (eb->lock_nested) {
- read_lock(&eb->lock);
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- read_unlock(&eb->lock);
- return;
- }
- read_unlock(&eb->lock);
- }
+ /*
+ * no lock is required. The lock owner may change if
+ * we have a read lock, but it won't change to or away
+ * from us. If we have the write lock, we are the owner
+ * and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
if (rw == BTRFS_WRITE_LOCK) {
if (atomic_read(&eb->blocking_writers) == 0) {
WARN_ON(atomic_read(&eb->spinning_writers) != 1);
@@ -65,14 +65,15 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
*/
void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
{
- if (eb->lock_nested) {
- read_lock(&eb->lock);
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- read_unlock(&eb->lock);
- return;
- }
- read_unlock(&eb->lock);
- }
+ /*
+ * no lock is required. The lock owner may change if
+ * we have a read lock, but it won't change to or away
+ * from us. If we have the write lock, we are the owner
+ * and it'll never change.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner)
+ return;
+
if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
BUG_ON(atomic_read(&eb->blocking_writers) != 1);
write_lock(&eb->lock);
@@ -99,6 +100,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
void btrfs_tree_read_lock(struct extent_buffer *eb)
{
again:
+ BUG_ON(!atomic_read(&eb->blocking_writers) &&
+ current->pid == eb->lock_owner);
+
read_lock(&eb->lock);
if (atomic_read(&eb->blocking_writers) &&
current->pid == eb->lock_owner) {
@@ -132,7 +136,9 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
if (atomic_read(&eb->blocking_writers))
return 0;
- read_lock(&eb->lock);
+ if (!read_trylock(&eb->lock))
+ return 0;
+
if (atomic_read(&eb->blocking_writers)) {
read_unlock(&eb->lock);
return 0;
@@ -151,7 +157,10 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
if (atomic_read(&eb->blocking_writers) ||
atomic_read(&eb->blocking_readers))
return 0;
- write_lock(&eb->lock);
+
+ if (!write_trylock(&eb->lock))
+ return 0;
+
if (atomic_read(&eb->blocking_writers) ||
atomic_read(&eb->blocking_readers)) {
write_unlock(&eb->lock);
@@ -168,14 +177,15 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
*/
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
- if (eb->lock_nested) {
- read_lock(&eb->lock);
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = 0;
- read_unlock(&eb->lock);
- return;
- }
- read_unlock(&eb->lock);
+ /*
+ * if we're nested, we have the write lock. No new locking
+ * is needed as long as we are the lock owner.
+ * The write unlock will do a barrier for us, and the lock_nested
+ * field only matters to the lock owner.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner) {
+ eb->lock_nested = 0;
+ return;
}
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->spinning_readers) == 0);
@@ -189,14 +199,15 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb)
*/
void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
{
- if (eb->lock_nested) {
- read_lock(&eb->lock);
- if (eb->lock_nested && current->pid == eb->lock_owner) {
- eb->lock_nested = 0;
- read_unlock(&eb->lock);
- return;
- }
- read_unlock(&eb->lock);
+ /*
+ * if we're nested, we have the write lock. No new locking
+ * is needed as long as we are the lock owner.
+ * The write unlock will do a barrier for us, and the lock_nested
+ * field only matters to the lock owner.
+ */
+ if (eb->lock_nested && current->pid == eb->lock_owner) {
+ eb->lock_nested = 0;
+ return;
}
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->blocking_readers) == 0);
@@ -244,6 +255,7 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
BUG_ON(blockers > 1);
btrfs_assert_tree_locked(eb);
+ eb->lock_owner = 0;
atomic_dec(&eb->write_locks);
if (blockers) {
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index b6a6f07c5ce..dfad8514f0d 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -141,9 +141,9 @@ static int lzo_compress_pages(struct list_head *ws,
ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
&out_len, workspace->mem);
if (ret != LZO_E_OK) {
- printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
+ printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
ret);
- ret = -1;
+ ret = -EIO;
goto out;
}
@@ -189,7 +189,7 @@ static int lzo_compress_pages(struct list_head *ws,
kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
- ret = -1;
+ ret = -E2BIG;
goto out;
}
@@ -208,7 +208,7 @@ static int lzo_compress_pages(struct list_head *ws,
/* we're making it bigger, give up */
if (tot_in > 8192 && tot_in < tot_out) {
- ret = -1;
+ ret = -E2BIG;
goto out;
}
@@ -335,7 +335,7 @@ cont:
break;
if (page_in_index + 1 >= total_pages_in) {
- ret = -1;
+ ret = -EIO;
goto done;
}
@@ -357,8 +357,8 @@ cont:
if (need_unmap)
kunmap(pages_in[page_in_index - 1]);
if (ret != LZO_E_OK) {
- printk(KERN_WARNING "btrfs decompress failed\n");
- ret = -1;
+ printk(KERN_WARNING "BTRFS: decompress failed\n");
+ ret = -EIO;
break;
}
@@ -401,13 +401,13 @@ static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
out_len = PAGE_CACHE_SIZE;
ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
if (ret != LZO_E_OK) {
- printk(KERN_WARNING "btrfs decompress failed!\n");
- ret = -1;
+ printk(KERN_WARNING "BTRFS: decompress failed!\n");
+ ret = -EIO;
goto out;
}
if (out_len < start_byte) {
- ret = -1;
+ ret = -EIO;
goto out;
}
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index c702cb62f78..7187b14faa6 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -67,7 +67,7 @@ static void ordered_data_tree_panic(struct inode *inode, int errno,
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
- "%llu\n", offset);
+ "%llu", offset);
}
/*
@@ -336,22 +336,26 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
entry->len);
*file_offset = dec_end;
if (dec_start > dec_end) {
- printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n",
- dec_start, dec_end);
+ btrfs_crit(BTRFS_I(inode)->root->fs_info,
+ "bad ordering dec_start %llu end %llu", dec_start, dec_end);
}
to_dec = dec_end - dec_start;
if (to_dec > entry->bytes_left) {
- printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
- entry->bytes_left, to_dec);
+ btrfs_crit(BTRFS_I(inode)->root->fs_info,
+ "bad ordered accounting left %llu size %llu",
+ entry->bytes_left, to_dec);
}
entry->bytes_left -= to_dec;
if (!uptodate)
set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
- if (entry->bytes_left == 0)
+ if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
- else
+ if (waitqueue_active(&entry->wait))
+ wake_up(&entry->wait);
+ } else {
ret = 1;
+ }
out:
if (!ret && cached && entry) {
*cached = entry;
@@ -401,17 +405,21 @@ have_entry:
}
if (io_size > entry->bytes_left) {
- printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
+ btrfs_crit(BTRFS_I(inode)->root->fs_info,
+ "bad ordered accounting left %llu size %llu",
entry->bytes_left, io_size);
}
entry->bytes_left -= io_size;
if (!uptodate)
set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
- if (entry->bytes_left == 0)
+ if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
- else
+ if (waitqueue_active(&entry->wait))
+ wake_up(&entry->wait);
+ } else {
ret = 1;
+ }
out:
if (!ret && cached && entry) {
*cached = entry;
@@ -422,27 +430,48 @@ out:
}
/* Needs to either be called under a log transaction or the log_mutex */
-void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode)
+void btrfs_get_logged_extents(struct inode *inode,
+ struct list_head *logged_list)
{
struct btrfs_ordered_inode_tree *tree;
struct btrfs_ordered_extent *ordered;
struct rb_node *n;
- int index = log->log_transid % 2;
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irq(&tree->lock);
for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
- spin_lock(&log->log_extents_lock[index]);
- if (list_empty(&ordered->log_list)) {
- list_add_tail(&ordered->log_list, &log->logged_list[index]);
- atomic_inc(&ordered->refs);
- }
- spin_unlock(&log->log_extents_lock[index]);
+ if (!list_empty(&ordered->log_list))
+ continue;
+ list_add_tail(&ordered->log_list, logged_list);
+ atomic_inc(&ordered->refs);
}
spin_unlock_irq(&tree->lock);
}
+void btrfs_put_logged_extents(struct list_head *logged_list)
+{
+ struct btrfs_ordered_extent *ordered;
+
+ while (!list_empty(logged_list)) {
+ ordered = list_first_entry(logged_list,
+ struct btrfs_ordered_extent,
+ log_list);
+ list_del_init(&ordered->log_list);
+ btrfs_put_ordered_extent(ordered);
+ }
+}
+
+void btrfs_submit_logged_extents(struct list_head *logged_list,
+ struct btrfs_root *log)
+{
+ int index = log->log_transid % 2;
+
+ spin_lock_irq(&log->log_extents_lock[index]);
+ list_splice_tail(logged_list, &log->logged_list[index]);
+ spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
{
struct btrfs_ordered_extent *ordered;
@@ -455,8 +484,19 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
log_list);
list_del_init(&ordered->log_list);
spin_unlock_irq(&log->log_extents_lock[index]);
+
+ if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
+ !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
+ struct inode *inode = ordered->inode;
+ u64 start = ordered->file_offset;
+ u64 end = ordered->file_offset + ordered->len - 1;
+
+ WARN_ON(!inode);
+ filemap_fdatawrite_range(inode->i_mapping, start, end);
+ }
wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
&ordered->flags));
+
btrfs_put_ordered_extent(ordered);
spin_lock_irq(&log->log_extents_lock[index]);
}
@@ -520,7 +560,8 @@ void btrfs_remove_ordered_extent(struct inode *inode,
spin_lock_irq(&tree->lock);
node = &entry->rb_node;
rb_erase(node, &tree->tree);
- tree->last = NULL;
+ if (tree->last == node)
+ tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
spin_unlock_irq(&tree->lock);
@@ -537,7 +578,9 @@ void btrfs_remove_ordered_extent(struct inode *inode,
*/
if (RB_EMPTY_ROOT(&tree->tree) &&
!mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+ spin_lock(&root->fs_info->ordered_root_lock);
list_del_init(&BTRFS_I(inode)->ordered_operations);
+ spin_unlock(&root->fs_info->ordered_root_lock);
}
if (!root->nr_ordered_extents) {
@@ -563,18 +606,19 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
* wait for all the ordered extents in a root. This is done when balancing
* space between drives.
*/
-void btrfs_wait_ordered_extents(struct btrfs_root *root)
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
{
struct list_head splice, works;
struct btrfs_ordered_extent *ordered, *next;
+ int count = 0;
INIT_LIST_HEAD(&splice);
INIT_LIST_HEAD(&works);
- mutex_lock(&root->fs_info->ordered_operations_mutex);
+ mutex_lock(&root->ordered_extent_mutex);
spin_lock(&root->ordered_extent_lock);
list_splice_init(&root->ordered_extents, &splice);
- while (!list_empty(&splice)) {
+ while (!list_empty(&splice) && nr) {
ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
root_extent_list);
list_move_tail(&ordered->root_extent_list,
@@ -582,14 +626,19 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root)
atomic_inc(&ordered->refs);
spin_unlock(&root->ordered_extent_lock);
- ordered->flush_work.func = btrfs_run_ordered_extent_work;
+ btrfs_init_work(&ordered->flush_work,
+ btrfs_run_ordered_extent_work, NULL, NULL);
list_add_tail(&ordered->work_list, &works);
- btrfs_queue_worker(&root->fs_info->flush_workers,
- &ordered->flush_work);
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &ordered->flush_work);
cond_resched();
spin_lock(&root->ordered_extent_lock);
+ if (nr != -1)
+ nr--;
+ count++;
}
+ list_splice_tail(&splice, &root->ordered_extents);
spin_unlock(&root->ordered_extent_lock);
list_for_each_entry_safe(ordered, next, &works, work_list) {
@@ -598,19 +647,23 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root)
btrfs_put_ordered_extent(ordered);
cond_resched();
}
- mutex_unlock(&root->fs_info->ordered_operations_mutex);
+ mutex_unlock(&root->ordered_extent_mutex);
+
+ return count;
}
-void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info)
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
{
struct btrfs_root *root;
struct list_head splice;
+ int done;
INIT_LIST_HEAD(&splice);
+ mutex_lock(&fs_info->ordered_operations_mutex);
spin_lock(&fs_info->ordered_root_lock);
list_splice_init(&fs_info->ordered_roots, &splice);
- while (!list_empty(&splice)) {
+ while (!list_empty(&splice) && nr) {
root = list_first_entry(&splice, struct btrfs_root,
ordered_root);
root = btrfs_grab_fs_root(root);
@@ -619,12 +672,18 @@ void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info)
&fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
- btrfs_wait_ordered_extents(root);
+ done = btrfs_wait_ordered_extents(root, nr);
btrfs_put_fs_root(root);
spin_lock(&fs_info->ordered_root_lock);
+ if (nr != -1) {
+ nr -= done;
+ WARN_ON(nr < 0);
+ }
}
+ list_splice_tail(&splice, &fs_info->ordered_roots);
spin_unlock(&fs_info->ordered_root_lock);
+ mutex_unlock(&fs_info->ordered_operations_mutex);
}
/*
@@ -686,8 +745,8 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
goto out;
}
list_add_tail(&work->list, &works);
- btrfs_queue_worker(&root->fs_info->flush_workers,
- &work->work);
+ btrfs_queue_work(root->fs_info->flush_workers,
+ &work->work);
cond_resched();
spin_lock(&root->fs_info->ordered_root_lock);
@@ -734,8 +793,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
/*
* Used to wait on ordered extents across a large range of bytes.
*/
-void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
{
+ int ret = 0;
u64 end;
u64 orig_end;
struct btrfs_ordered_extent *ordered;
@@ -751,8 +811,9 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
/* start IO across the range first to instantiate any delalloc
* extents
*/
- filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
-
+ ret = filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
+ if (ret)
+ return ret;
/*
* So with compression we will find and lock a dirty page and clear the
* first one as dirty, setup an async extent, and immediately return
@@ -768,10 +829,15 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
* right and you are wrong.
*/
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
- &BTRFS_I(inode)->runtime_flags))
- filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
-
- filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+ &BTRFS_I(inode)->runtime_flags)) {
+ ret = filemap_fdatawrite_range(inode->i_mapping, start,
+ orig_end);
+ if (ret)
+ return ret;
+ }
+ ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+ if (ret)
+ return ret;
end = orig_end;
while (1) {
@@ -782,17 +848,20 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
btrfs_put_ordered_extent(ordered);
break;
}
- if (ordered->file_offset + ordered->len < start) {
+ if (ordered->file_offset + ordered->len <= start) {
btrfs_put_ordered_extent(ordered);
break;
}
btrfs_start_ordered_extent(inode, ordered, 1);
end = ordered->file_offset;
+ if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
+ ret = -EIO;
btrfs_put_ordered_extent(ordered);
- if (end == 0 || end == start)
+ if (ret || end == 0 || end == start)
break;
end--;
}
+ return ret;
}
/*
@@ -1076,7 +1145,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
* if this file hasn't been changed since the last transaction
* commit, we can safely return without doing anything
*/
- if (last_mod < root->fs_info->last_trans_committed)
+ if (last_mod <= root->fs_info->last_trans_committed)
return;
spin_lock(&root->fs_info->ordered_root_lock);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 0c0b35612d7..246897058ef 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -180,7 +180,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
u64 file_offset);
void btrfs_start_ordered_extent(struct inode *inode,
struct btrfs_ordered_extent *entry, int wait);
-void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
struct btrfs_ordered_extent *
btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
@@ -195,9 +195,13 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
-void btrfs_wait_ordered_extents(struct btrfs_root *root);
-void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info);
-void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
+int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
+void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
+void btrfs_get_logged_extents(struct inode *inode,
+ struct list_head *logged_list);
+void btrfs_put_logged_extents(struct list_head *logged_list);
+void btrfs_submit_logged_extents(struct list_head *logged_list,
+ struct btrfs_root *log);
void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
int __init ordered_data_init(void);
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 24cad1695af..65793edb38c 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -69,23 +69,3 @@ out:
btrfs_free_path(path);
return ret;
}
-
-int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset)
-{
- struct btrfs_path *path;
- struct btrfs_key key;
- int ret;
-
- key.objectid = BTRFS_ORPHAN_OBJECTID;
- key.type = BTRFS_ORPHAN_ITEM_KEY;
- key.offset = offset;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
-
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-
- btrfs_free_path(path);
- return ret;
-}
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 0088bedc863..9626b4ad3b9 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -54,7 +54,7 @@ static void print_extent_data_ref(struct extent_buffer *eb,
btrfs_extent_data_ref_count(eb, ref));
}
-static void print_extent_item(struct extent_buffer *eb, int slot)
+static void print_extent_item(struct extent_buffer *eb, int slot, int type)
{
struct btrfs_extent_item *ei;
struct btrfs_extent_inline_ref *iref;
@@ -63,7 +63,6 @@ static void print_extent_item(struct extent_buffer *eb, int slot)
struct btrfs_disk_key key;
unsigned long end;
unsigned long ptr;
- int type;
u32 item_size = btrfs_item_size_nr(eb, slot);
u64 flags;
u64 offset;
@@ -88,7 +87,8 @@ static void print_extent_item(struct extent_buffer *eb, int slot)
btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei),
flags);
- if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+ if ((type == BTRFS_EXTENT_ITEM_KEY) &&
+ flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
struct btrfs_tree_block_info *info;
info = (struct btrfs_tree_block_info *)(ei + 1);
btrfs_tree_block_key(eb, info, &key);
@@ -154,7 +154,7 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
u32 item_size)
{
if (!IS_ALIGNED(item_size, sizeof(u64))) {
- pr_warn("btrfs: uuid item with illegal size %lu!\n",
+ pr_warn("BTRFS: uuid item with illegal size %lu!\n",
(unsigned long)item_size);
return;
}
@@ -193,7 +193,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
btrfs_info(root->fs_info, "leaf %llu total ptrs %d free space %d",
btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l));
for (i = 0 ; i < nr ; i++) {
- item = btrfs_item_nr(l, i);
+ item = btrfs_item_nr(i);
btrfs_item_key_to_cpu(l, &key, i);
type = btrfs_key_type(&key);
printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d "
@@ -223,7 +223,8 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
btrfs_disk_root_refs(l, ri));
break;
case BTRFS_EXTENT_ITEM_KEY:
- print_extent_item(l, i);
+ case BTRFS_METADATA_ITEM_KEY:
+ print_extent_item(l, i, type);
break;
case BTRFS_TREE_BLOCK_REF_KEY:
printk(KERN_INFO "\t\ttree block backref\n");
@@ -249,7 +250,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
BTRFS_FILE_EXTENT_INLINE) {
printk(KERN_INFO "\t\tinline extent data "
"size %u\n",
- btrfs_file_extent_inline_len(l, fi));
+ btrfs_file_extent_inline_len(l, i, fi));
break;
}
printk(KERN_INFO "\t\textent data disk bytenr %llu "
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
new file mode 100644
index 00000000000..129b1dd2852
--- /dev/null
+++ b/fs/btrfs/props.c
@@ -0,0 +1,427 @@
+/*
+ * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/hashtable.h>
+#include "props.h"
+#include "btrfs_inode.h"
+#include "hash.h"
+#include "transaction.h"
+#include "xattr.h"
+
+#define BTRFS_PROP_HANDLERS_HT_BITS 8
+static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
+
+struct prop_handler {
+ struct hlist_node node;
+ const char *xattr_name;
+ int (*validate)(const char *value, size_t len);
+ int (*apply)(struct inode *inode, const char *value, size_t len);
+ const char *(*extract)(struct inode *inode);
+ int inheritable;
+};
+
+static int prop_compression_validate(const char *value, size_t len);
+static int prop_compression_apply(struct inode *inode,
+ const char *value,
+ size_t len);
+static const char *prop_compression_extract(struct inode *inode);
+
+static struct prop_handler prop_handlers[] = {
+ {
+ .xattr_name = XATTR_BTRFS_PREFIX "compression",
+ .validate = prop_compression_validate,
+ .apply = prop_compression_apply,
+ .extract = prop_compression_extract,
+ .inheritable = 1
+ },
+ {
+ .xattr_name = NULL
+ }
+};
+
+void __init btrfs_props_init(void)
+{
+ struct prop_handler *p;
+
+ hash_init(prop_handlers_ht);
+
+ for (p = &prop_handlers[0]; p->xattr_name; p++) {
+ u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name));
+
+ hash_add(prop_handlers_ht, &p->node, h);
+ }
+}
+
+static const struct hlist_head *find_prop_handlers_by_hash(const u64 hash)
+{
+ struct hlist_head *h;
+
+ h = &prop_handlers_ht[hash_min(hash, BTRFS_PROP_HANDLERS_HT_BITS)];
+ if (hlist_empty(h))
+ return NULL;
+
+ return h;
+}
+
+static const struct prop_handler *
+find_prop_handler(const char *name,
+ const struct hlist_head *handlers)
+{
+ struct prop_handler *h;
+
+ if (!handlers) {
+ u64 hash = btrfs_name_hash(name, strlen(name));
+
+ handlers = find_prop_handlers_by_hash(hash);
+ if (!handlers)
+ return NULL;
+ }
+
+ hlist_for_each_entry(h, handlers, node)
+ if (!strcmp(h->xattr_name, name))
+ return h;
+
+ return NULL;
+}
+
+static int __btrfs_set_prop(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ const char *name,
+ const char *value,
+ size_t value_len,
+ int flags)
+{
+ const struct prop_handler *handler;
+ int ret;
+
+ if (strlen(name) <= XATTR_BTRFS_PREFIX_LEN)
+ return -EINVAL;
+
+ handler = find_prop_handler(name, NULL);
+ if (!handler)
+ return -EINVAL;
+
+ if (value_len == 0) {
+ ret = __btrfs_setxattr(trans, inode, handler->xattr_name,
+ NULL, 0, flags);
+ if (ret)
+ return ret;
+
+ ret = handler->apply(inode, NULL, 0);
+ ASSERT(ret == 0);
+
+ return ret;
+ }
+
+ ret = handler->validate(value, value_len);
+ if (ret)
+ return ret;
+ ret = __btrfs_setxattr(trans, inode, handler->xattr_name,
+ value, value_len, flags);
+ if (ret)
+ return ret;
+ ret = handler->apply(inode, value, value_len);
+ if (ret) {
+ __btrfs_setxattr(trans, inode, handler->xattr_name,
+ NULL, 0, flags);
+ return ret;
+ }
+
+ set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
+
+ return 0;
+}
+
+int btrfs_set_prop(struct inode *inode,
+ const char *name,
+ const char *value,
+ size_t value_len,
+ int flags)
+{
+ return __btrfs_set_prop(NULL, inode, name, value, value_len, flags);
+}
+
+static int iterate_object_props(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 objectid,
+ void (*iterator)(void *,
+ const struct prop_handler *,
+ const char *,
+ size_t),
+ void *ctx)
+{
+ int ret;
+ char *name_buf = NULL;
+ char *value_buf = NULL;
+ int name_buf_len = 0;
+ int value_buf_len = 0;
+
+ while (1) {
+ struct btrfs_key key;
+ struct btrfs_dir_item *di;
+ struct extent_buffer *leaf;
+ u32 total_len, cur, this_len;
+ int slot;
+ const struct hlist_head *handlers;
+
+ slot = path->slots[0];
+ leaf = path->nodes[0];
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ if (key.objectid != objectid)
+ break;
+ if (key.type != BTRFS_XATTR_ITEM_KEY)
+ break;
+
+ handlers = find_prop_handlers_by_hash(key.offset);
+ if (!handlers)
+ goto next_slot;
+
+ di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ cur = 0;
+ total_len = btrfs_item_size_nr(leaf, slot);
+
+ while (cur < total_len) {
+ u32 name_len = btrfs_dir_name_len(leaf, di);
+ u32 data_len = btrfs_dir_data_len(leaf, di);
+ unsigned long name_ptr, data_ptr;
+ const struct prop_handler *handler;
+
+ this_len = sizeof(*di) + name_len + data_len;
+ name_ptr = (unsigned long)(di + 1);
+ data_ptr = name_ptr + name_len;
+
+ if (name_len <= XATTR_BTRFS_PREFIX_LEN ||
+ memcmp_extent_buffer(leaf, XATTR_BTRFS_PREFIX,
+ name_ptr,
+ XATTR_BTRFS_PREFIX_LEN))
+ goto next_dir_item;
+
+ if (name_len >= name_buf_len) {
+ kfree(name_buf);
+ name_buf_len = name_len + 1;
+ name_buf = kmalloc(name_buf_len, GFP_NOFS);
+ if (!name_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ read_extent_buffer(leaf, name_buf, name_ptr, name_len);
+ name_buf[name_len] = '\0';
+
+ handler = find_prop_handler(name_buf, handlers);
+ if (!handler)
+ goto next_dir_item;
+
+ if (data_len > value_buf_len) {
+ kfree(value_buf);
+ value_buf_len = data_len;
+ value_buf = kmalloc(data_len, GFP_NOFS);
+ if (!value_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ read_extent_buffer(leaf, value_buf, data_ptr, data_len);
+
+ iterator(ctx, handler, value_buf, data_len);
+next_dir_item:
+ cur += this_len;
+ di = (struct btrfs_dir_item *)((char *) di + this_len);
+ }
+
+next_slot:
+ path->slots[0]++;
+ }
+
+ ret = 0;
+out:
+ btrfs_release_path(path);
+ kfree(name_buf);
+ kfree(value_buf);
+
+ return ret;
+}
+
+static void inode_prop_iterator(void *ctx,
+ const struct prop_handler *handler,
+ const char *value,
+ size_t len)
+{
+ struct inode *inode = ctx;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ ret = handler->apply(inode, value, len);
+ if (unlikely(ret))
+ btrfs_warn(root->fs_info,
+ "error applying prop %s to ino %llu (root %llu): %d",
+ handler->xattr_name, btrfs_ino(inode),
+ root->root_key.objectid, ret);
+ else
+ set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags);
+}
+
+int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 ino = btrfs_ino(inode);
+ int ret;
+
+ ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode);
+
+ return ret;
+}
+
+static int inherit_props(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct inode *parent)
+{
+ const struct prop_handler *h;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ if (!test_bit(BTRFS_INODE_HAS_PROPS,
+ &BTRFS_I(parent)->runtime_flags))
+ return 0;
+
+ for (h = &prop_handlers[0]; h->xattr_name; h++) {
+ const char *value;
+ u64 num_bytes;
+
+ if (!h->inheritable)
+ continue;
+
+ value = h->extract(parent);
+ if (!value)
+ continue;
+
+ num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+ ret = btrfs_block_rsv_add(root, trans->block_rsv,
+ num_bytes, BTRFS_RESERVE_NO_FLUSH);
+ if (ret)
+ goto out;
+ ret = __btrfs_set_prop(trans, inode, h->xattr_name,
+ value, strlen(value), 0);
+ btrfs_block_rsv_release(root, trans->block_rsv, num_bytes);
+ if (ret)
+ goto out;
+ }
+ ret = 0;
+out:
+ return ret;
+}
+
+int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct inode *dir)
+{
+ if (!dir)
+ return 0;
+
+ return inherit_props(trans, inode, dir);
+}
+
+int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *parent_root)
+{
+ struct btrfs_key key;
+ struct inode *parent_inode, *child_inode;
+ int ret;
+
+ key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ parent_inode = btrfs_iget(parent_root->fs_info->sb, &key,
+ parent_root, NULL);
+ if (IS_ERR(parent_inode))
+ return PTR_ERR(parent_inode);
+
+ child_inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
+ if (IS_ERR(child_inode)) {
+ iput(parent_inode);
+ return PTR_ERR(child_inode);
+ }
+
+ ret = inherit_props(trans, child_inode, parent_inode);
+ iput(child_inode);
+ iput(parent_inode);
+
+ return ret;
+}
+
+static int prop_compression_validate(const char *value, size_t len)
+{
+ if (!strncmp("lzo", value, len))
+ return 0;
+ else if (!strncmp("zlib", value, len))
+ return 0;
+
+ return -EINVAL;
+}
+
+static int prop_compression_apply(struct inode *inode,
+ const char *value,
+ size_t len)
+{
+ int type;
+
+ if (len == 0) {
+ BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
+ BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
+
+ return 0;
+ }
+
+ if (!strncmp("lzo", value, len))
+ type = BTRFS_COMPRESS_LZO;
+ else if (!strncmp("zlib", value, len))
+ type = BTRFS_COMPRESS_ZLIB;
+ else
+ return -EINVAL;
+
+ BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
+ BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
+ BTRFS_I(inode)->force_compress = type;
+
+ return 0;
+}
+
+static const char *prop_compression_extract(struct inode *inode)
+{
+ switch (BTRFS_I(inode)->force_compress) {
+ case BTRFS_COMPRESS_ZLIB:
+ return "zlib";
+ case BTRFS_COMPRESS_LZO:
+ return "lzo";
+ }
+
+ return NULL;
+}
diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h
new file mode 100644
index 00000000000..100f18829d5
--- /dev/null
+++ b/fs/btrfs/props.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2014 Filipe David Borba Manana <fdmanana@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_PROPS_H
+#define __BTRFS_PROPS_H
+
+#include "ctree.h"
+
+void __init btrfs_props_init(void);
+
+int btrfs_set_prop(struct inode *inode,
+ const char *name,
+ const char *value,
+ size_t value_len,
+ int flags);
+
+int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path);
+
+int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct inode *dir);
+
+int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *parent_root);
+
+#endif
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 4e6ef490619..98cb6b2630f 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -32,6 +32,7 @@
#include "ulist.h"
#include "backref.h"
#include "extent_io.h"
+#include "qgroup.h"
/* TODO XXX FIXME
* - subvol delete -> delete when ref goes to 0? delete limits also?
@@ -84,8 +85,8 @@ struct btrfs_qgroup {
/*
* temp variables for accounting operations
*/
- u64 tag;
- u64 refcnt;
+ u64 old_refcnt;
+ u64 new_refcnt;
};
/*
@@ -98,6 +99,9 @@ struct btrfs_qgroup_list {
struct btrfs_qgroup *member;
};
+#define ptr_to_u64(x) ((u64)(uintptr_t)x)
+#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)x)
+
static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
int init_flags);
@@ -242,6 +246,21 @@ static int del_relation_rb(struct btrfs_fs_info *fs_info,
return -ENOENT;
}
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+ u64 rfer, u64 excl)
+{
+ struct btrfs_qgroup *qgroup;
+
+ qgroup = find_qgroup_rb(fs_info, qgroupid);
+ if (!qgroup)
+ return -EINVAL;
+ if (qgroup->rfer != rfer || qgroup->excl != excl)
+ return -EINVAL;
+ return 0;
+}
+#endif
+
/*
* The full config is read in one go, only called from open_ctree()
* It doesn't use any locking, as at this point we're still single-threaded
@@ -301,16 +320,16 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
if (btrfs_qgroup_status_version(l, ptr) !=
BTRFS_QGROUP_STATUS_VERSION) {
- printk(KERN_ERR
- "btrfs: old qgroup version, quota disabled\n");
+ btrfs_err(fs_info,
+ "old qgroup version, quota disabled");
goto out;
}
if (btrfs_qgroup_status_generation(l, ptr) !=
fs_info->generation) {
flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- printk(KERN_ERR
- "btrfs: qgroup generation mismatch, "
- "marked as inconsistent\n");
+ btrfs_err(fs_info,
+ "qgroup generation mismatch, "
+ "marked as inconsistent");
}
fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
ptr);
@@ -325,7 +344,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
qgroup = find_qgroup_rb(fs_info, found_key.offset);
if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
(!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
- printk(KERN_ERR "btrfs: inconsitent qgroup config\n");
+ btrfs_err(fs_info, "inconsitent qgroup config");
flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
}
if (!qgroup) {
@@ -396,8 +415,8 @@ next1:
ret = add_relation_rb(fs_info, found_key.objectid,
found_key.offset);
if (ret == -ENOENT) {
- printk(KERN_WARNING
- "btrfs: orphan qgroup relation 0x%llx->0x%llx\n",
+ btrfs_warn(fs_info,
+ "orphan qgroup relation 0x%llx->0x%llx",
found_key.objectid, found_key.offset);
ret = 0; /* ignore the error */
}
@@ -520,6 +539,10 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_key key;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &quota_root->state)))
+ return 0;
+#endif
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -644,8 +667,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
l = path->nodes[0];
slot = path->slots[0];
- qgroup_limit = btrfs_item_ptr(l, path->slots[0],
- struct btrfs_qgroup_limit_item);
+ qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags);
btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer);
btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl);
@@ -670,6 +692,10 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
int ret;
int slot;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ return 0;
+#endif
key.objectid = 0;
key.type = BTRFS_QGROUP_INFO_KEY;
key.offset = qgroup->qgroupid;
@@ -687,8 +713,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
l = path->nodes[0];
slot = path->slots[0];
- qgroup_info = btrfs_item_ptr(l, path->slots[0],
- struct btrfs_qgroup_info_item);
+ qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
@@ -1161,7 +1186,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
limit->rsv_excl);
if (ret) {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- printk(KERN_INFO "unable to update quota limit for %llu\n",
+ btrfs_info(fs_info, "unable to update quota limit for %llu",
qgroupid);
}
@@ -1176,33 +1201,198 @@ out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
}
+static int comp_oper(struct btrfs_qgroup_operation *oper1,
+ struct btrfs_qgroup_operation *oper2)
+{
+ if (oper1->bytenr < oper2->bytenr)
+ return -1;
+ if (oper1->bytenr > oper2->bytenr)
+ return 1;
+ if (oper1->seq < oper2->seq)
+ return -1;
+ if (oper1->seq > oper2->seq)
+ return -1;
+ if (oper1->ref_root < oper2->ref_root)
+ return -1;
+ if (oper1->ref_root > oper2->ref_root)
+ return 1;
+ if (oper1->type < oper2->type)
+ return -1;
+ if (oper1->type > oper2->type)
+ return 1;
+ return 0;
+}
+
+static int insert_qgroup_oper(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ struct btrfs_qgroup_operation *cur;
+ int cmp;
+
+ spin_lock(&fs_info->qgroup_op_lock);
+ p = &fs_info->qgroup_op_tree.rb_node;
+ while (*p) {
+ parent = *p;
+ cur = rb_entry(parent, struct btrfs_qgroup_operation, n);
+ cmp = comp_oper(cur, oper);
+ if (cmp < 0) {
+ p = &(*p)->rb_right;
+ } else if (cmp) {
+ p = &(*p)->rb_left;
+ } else {
+ spin_unlock(&fs_info->qgroup_op_lock);
+ return -EEXIST;
+ }
+ }
+ rb_link_node(&oper->n, parent, p);
+ rb_insert_color(&oper->n, &fs_info->qgroup_op_tree);
+ spin_unlock(&fs_info->qgroup_op_lock);
+ return 0;
+}
/*
- * btrfs_qgroup_record_ref is called when the ref is added or deleted. it puts
- * the modification into a list that's later used by btrfs_end_transaction to
- * pass the recorded modifications on to btrfs_qgroup_account_ref.
+ * Record a quota operation for processing later on.
+ * @trans: the transaction we are adding the delayed op to.
+ * @fs_info: the fs_info for this fs.
+ * @ref_root: the root of the reference we are acting on,
+ * @bytenr: the bytenr we are acting on.
+ * @num_bytes: the number of bytes in the reference.
+ * @type: the type of operation this is.
+ * @mod_seq: do we need to get a sequence number for looking up roots.
+ *
+ * We just add it to our trans qgroup_ref_list and carry on and process these
+ * operations in order at some later point. If the reference root isn't a fs
+ * root then we don't bother with doing anything.
+ *
+ * MUST BE HOLDING THE REF LOCK.
*/
int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
- struct btrfs_delayed_ref_node *node,
- struct btrfs_delayed_extent_op *extent_op)
+ struct btrfs_fs_info *fs_info, u64 ref_root,
+ u64 bytenr, u64 num_bytes,
+ enum btrfs_qgroup_operation_type type, int mod_seq)
{
- struct qgroup_update *u;
+ struct btrfs_qgroup_operation *oper;
+ int ret;
- BUG_ON(!trans->delayed_ref_elem.seq);
- u = kmalloc(sizeof(*u), GFP_NOFS);
- if (!u)
+ if (!is_fstree(ref_root) || !fs_info->quota_enabled)
+ return 0;
+
+ oper = kmalloc(sizeof(*oper), GFP_NOFS);
+ if (!oper)
return -ENOMEM;
- u->node = node;
- u->extent_op = extent_op;
- list_add_tail(&u->list, &trans->qgroup_ref_list);
+ oper->ref_root = ref_root;
+ oper->bytenr = bytenr;
+ oper->num_bytes = num_bytes;
+ oper->type = type;
+ oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
+ INIT_LIST_HEAD(&oper->elem.list);
+ oper->elem.seq = 0;
+ ret = insert_qgroup_oper(fs_info, oper);
+ if (ret) {
+ /* Shouldn't happen so have an assert for developers */
+ ASSERT(0);
+ kfree(oper);
+ return ret;
+ }
+ list_add_tail(&oper->list, &trans->qgroup_ref_list);
+
+ if (mod_seq)
+ btrfs_get_tree_mod_seq(fs_info, &oper->elem);
return 0;
}
-static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info,
- struct ulist *roots, struct ulist *tmp,
- u64 seq)
+/*
+ * The easy accounting, if we are adding/removing the only ref for an extent
+ * then this qgroup and all of the parent qgroups get their refrence and
+ * exclusive counts adjusted.
+ */
+static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ struct btrfs_qgroup *qgroup;
+ struct ulist *tmp;
+ struct btrfs_qgroup_list *glist;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ int sign = 0;
+ int ret = 0;
+
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+
+ spin_lock(&fs_info->qgroup_lock);
+ if (!fs_info->quota_root)
+ goto out;
+ qgroup = find_qgroup_rb(fs_info, oper->ref_root);
+ if (!qgroup)
+ goto out;
+ switch (oper->type) {
+ case BTRFS_QGROUP_OPER_ADD_EXCL:
+ sign = 1;
+ break;
+ case BTRFS_QGROUP_OPER_SUB_EXCL:
+ sign = -1;
+ break;
+ default:
+ ASSERT(0);
+ }
+ qgroup->rfer += sign * oper->num_bytes;
+ qgroup->rfer_cmpr += sign * oper->num_bytes;
+
+ WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
+ qgroup->excl += sign * oper->num_bytes;
+ qgroup->excl_cmpr += sign * oper->num_bytes;
+
+ qgroup_dirty(fs_info, qgroup);
+
+ /* Get all of the parent groups that contain this qgroup */
+ list_for_each_entry(glist, &qgroup->groups, next_group) {
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+
+ /* Iterate all of the parents and adjust their reference counts */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(tmp, &uiter))) {
+ qgroup = u64_to_ptr(unode->aux);
+ qgroup->rfer += sign * oper->num_bytes;
+ qgroup->rfer_cmpr += sign * oper->num_bytes;
+ qgroup->excl += sign * oper->num_bytes;
+ if (sign < 0)
+ WARN_ON(qgroup->excl < oper->num_bytes);
+ qgroup->excl_cmpr += sign * oper->num_bytes;
+ qgroup_dirty(fs_info, qgroup);
+
+ /* Add any parents of the parents */
+ list_for_each_entry(glist, &qgroup->groups, next_group) {
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ goto out;
+ }
+ }
+ ret = 0;
+out:
+ spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(tmp);
+ return ret;
+}
+
+/*
+ * Walk all of the roots that pointed to our bytenr and adjust their refcnts as
+ * properly.
+ */
+static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
+ u64 root_to_skip, struct ulist *tmp,
+ struct ulist *roots, struct ulist *qgroups,
+ u64 seq, int *old_roots, int rescan)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -1213,261 +1403,551 @@ static int qgroup_account_ref_step1(struct btrfs_fs_info *fs_info,
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(roots, &uiter))) {
+ /* We don't count our current root here */
+ if (unode->val == root_to_skip)
+ continue;
qg = find_qgroup_rb(fs_info, unode->val);
if (!qg)
continue;
+ /*
+ * We could have a pending removal of this same ref so we may
+ * not have actually found our ref root when doing
+ * btrfs_find_all_roots, so we need to keep track of how many
+ * old roots we find in case we removed ours and added a
+ * different one at the same time. I don't think this could
+ * happen in practice but that sort of thinking leads to pain
+ * and suffering and to the dark side.
+ */
+ (*old_roots)++;
ulist_reinit(tmp);
- /* XXX id not needed */
- ret = ulist_add(tmp, qg->qgroupid,
- (u64)(uintptr_t)qg, GFP_ATOMIC);
+ ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC);
if (ret < 0)
return ret;
ULIST_ITER_INIT(&tmp_uiter);
while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
struct btrfs_qgroup_list *glist;
- qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
- if (qg->refcnt < seq)
- qg->refcnt = seq + 1;
+ qg = u64_to_ptr(tmp_unode->aux);
+ /*
+ * We use this sequence number to keep from having to
+ * run the whole list and 0 out the refcnt every time.
+ * We basically use sequnce as the known 0 count and
+ * then add 1 everytime we see a qgroup. This is how we
+ * get how many of the roots actually point up to the
+ * upper level qgroups in order to determine exclusive
+ * counts.
+ *
+ * For rescan we want to set old_refcnt to seq so our
+ * exclusive calculations end up correct.
+ */
+ if (rescan)
+ qg->old_refcnt = seq;
+ else if (qg->old_refcnt < seq)
+ qg->old_refcnt = seq + 1;
else
- ++qg->refcnt;
+ qg->old_refcnt++;
+ if (qg->new_refcnt < seq)
+ qg->new_refcnt = seq + 1;
+ else
+ qg->new_refcnt++;
list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(qgroups, glist->group->qgroupid,
+ ptr_to_u64(glist->group),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
ret = ulist_add(tmp, glist->group->qgroupid,
- (u64)(uintptr_t)glist->group,
+ ptr_to_u64(glist->group),
GFP_ATOMIC);
if (ret < 0)
return ret;
}
}
}
+ return 0;
+}
+
+/*
+ * We need to walk forward in our operation tree and account for any roots that
+ * were deleted after we made this operation.
+ */
+static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper,
+ struct ulist *tmp,
+ struct ulist *qgroups, u64 seq,
+ int *old_roots)
+{
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct btrfs_qgroup *qg;
+ struct btrfs_qgroup_operation *tmp_oper;
+ struct rb_node *n;
+ int ret;
+ ulist_reinit(tmp);
+
+ /*
+ * We only walk forward in the tree since we're only interested in
+ * removals that happened _after_ our operation.
+ */
+ spin_lock(&fs_info->qgroup_op_lock);
+ n = rb_next(&oper->n);
+ spin_unlock(&fs_info->qgroup_op_lock);
+ if (!n)
+ return 0;
+ tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
+ while (tmp_oper->bytenr == oper->bytenr) {
+ /*
+ * If it's not a removal we don't care, additions work out
+ * properly with our refcnt tracking.
+ */
+ if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED &&
+ tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL)
+ goto next;
+ qg = find_qgroup_rb(fs_info, tmp_oper->ref_root);
+ if (!qg)
+ goto next;
+ ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
+ GFP_ATOMIC);
+ if (ret) {
+ if (ret < 0)
+ return ret;
+ /*
+ * We only want to increase old_roots if this qgroup is
+ * not already in the list of qgroups. If it is already
+ * there then that means it must have been re-added or
+ * the delete will be discarded because we had an
+ * existing ref that we haven't looked up yet. In this
+ * case we don't want to increase old_roots. So if ret
+ * == 1 then we know that this is the first time we've
+ * seen this qgroup and we can bump the old_roots.
+ */
+ (*old_roots)++;
+ ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ }
+next:
+ spin_lock(&fs_info->qgroup_op_lock);
+ n = rb_next(&tmp_oper->n);
+ spin_unlock(&fs_info->qgroup_op_lock);
+ if (!n)
+ break;
+ tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
+ }
+
+ /* Ok now process the qgroups we found */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(tmp, &uiter))) {
+ struct btrfs_qgroup_list *glist;
+
+ qg = u64_to_ptr(unode->aux);
+ if (qg->old_refcnt < seq)
+ qg->old_refcnt = seq + 1;
+ else
+ qg->old_refcnt++;
+ if (qg->new_refcnt < seq)
+ qg->new_refcnt = seq + 1;
+ else
+ qg->new_refcnt++;
+ list_for_each_entry(glist, &qg->groups, next_group) {
+ ret = ulist_add(qgroups, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(tmp, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ }
+ }
return 0;
}
-static int qgroup_account_ref_step2(struct btrfs_fs_info *fs_info,
- struct ulist *roots, struct ulist *tmp,
- u64 seq, int sgn, u64 num_bytes,
- struct btrfs_qgroup *qgroup)
+/* Add refcnt for the newly added reference. */
+static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper,
+ struct btrfs_qgroup *qgroup,
+ struct ulist *tmp, struct ulist *qgroups,
+ u64 seq)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
struct btrfs_qgroup *qg;
- struct btrfs_qgroup_list *glist;
int ret;
ulist_reinit(tmp);
- ret = ulist_add(tmp, qgroup->qgroupid, (uintptr_t)qgroup, GFP_ATOMIC);
+ ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup),
+ GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup),
+ GFP_ATOMIC);
if (ret < 0)
return ret;
-
ULIST_ITER_INIT(&uiter);
while ((unode = ulist_next(tmp, &uiter))) {
- qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
- if (qg->refcnt < seq) {
- /* not visited by step 1 */
- qg->rfer += sgn * num_bytes;
- qg->rfer_cmpr += sgn * num_bytes;
- if (roots->nnodes == 0) {
- qg->excl += sgn * num_bytes;
- qg->excl_cmpr += sgn * num_bytes;
- }
- qgroup_dirty(fs_info, qg);
- }
- WARN_ON(qg->tag >= seq);
- qg->tag = seq;
+ struct btrfs_qgroup_list *glist;
+ qg = u64_to_ptr(unode->aux);
+ if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
+ if (qg->new_refcnt < seq)
+ qg->new_refcnt = seq + 1;
+ else
+ qg->new_refcnt++;
+ } else {
+ if (qg->old_refcnt < seq)
+ qg->old_refcnt = seq + 1;
+ else
+ qg->old_refcnt++;
+ }
list_for_each_entry(glist, &qg->groups, next_group) {
ret = ulist_add(tmp, glist->group->qgroupid,
- (uintptr_t)glist->group, GFP_ATOMIC);
+ ptr_to_u64(glist->group), GFP_ATOMIC);
+ if (ret < 0)
+ return ret;
+ ret = ulist_add(qgroups, glist->group->qgroupid,
+ ptr_to_u64(glist->group), GFP_ATOMIC);
if (ret < 0)
return ret;
}
}
-
return 0;
}
-static int qgroup_account_ref_step3(struct btrfs_fs_info *fs_info,
- struct ulist *roots, struct ulist *tmp,
- u64 seq, int sgn, u64 num_bytes)
+/*
+ * This adjusts the counters for all referenced qgroups if need be.
+ */
+static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
+ u64 root_to_skip, u64 num_bytes,
+ struct ulist *qgroups, u64 seq,
+ int old_roots, int new_roots, int rescan)
{
struct ulist_node *unode;
struct ulist_iterator uiter;
struct btrfs_qgroup *qg;
- struct ulist_node *tmp_unode;
- struct ulist_iterator tmp_uiter;
- int ret;
+ u64 cur_new_count, cur_old_count;
ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(roots, &uiter))) {
- qg = find_qgroup_rb(fs_info, unode->val);
- if (!qg)
- continue;
+ while ((unode = ulist_next(qgroups, &uiter))) {
+ bool dirty = false;
- ulist_reinit(tmp);
- ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg, GFP_ATOMIC);
- if (ret < 0)
- return ret;
+ qg = u64_to_ptr(unode->aux);
+ /*
+ * Wasn't referenced before but is now, add to the reference
+ * counters.
+ */
+ if (qg->old_refcnt <= seq && qg->new_refcnt > seq) {
+ qg->rfer += num_bytes;
+ qg->rfer_cmpr += num_bytes;
+ dirty = true;
+ }
- ULIST_ITER_INIT(&tmp_uiter);
- while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) {
- struct btrfs_qgroup_list *glist;
+ /*
+ * Was referenced before but isn't now, subtract from the
+ * reference counters.
+ */
+ if (qg->old_refcnt > seq && qg->new_refcnt <= seq) {
+ qg->rfer -= num_bytes;
+ qg->rfer_cmpr -= num_bytes;
+ dirty = true;
+ }
- qg = (struct btrfs_qgroup *)(uintptr_t)tmp_unode->aux;
- if (qg->tag == seq)
- continue;
+ if (qg->old_refcnt < seq)
+ cur_old_count = 0;
+ else
+ cur_old_count = qg->old_refcnt - seq;
+ if (qg->new_refcnt < seq)
+ cur_new_count = 0;
+ else
+ cur_new_count = qg->new_refcnt - seq;
- if (qg->refcnt - seq == roots->nnodes) {
- qg->excl -= sgn * num_bytes;
- qg->excl_cmpr -= sgn * num_bytes;
- qgroup_dirty(fs_info, qg);
- }
+ /*
+ * If our refcount was the same as the roots previously but our
+ * new count isn't the same as the number of roots now then we
+ * went from having a exclusive reference on this range to not.
+ */
+ if (old_roots && cur_old_count == old_roots &&
+ (cur_new_count != new_roots || new_roots == 0)) {
+ WARN_ON(cur_new_count != new_roots && new_roots == 0);
+ qg->excl -= num_bytes;
+ qg->excl_cmpr -= num_bytes;
+ dirty = true;
+ }
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- (uintptr_t)glist->group,
- GFP_ATOMIC);
- if (ret < 0)
- return ret;
- }
+ /*
+ * If we didn't reference all the roots before but now we do we
+ * have an exclusive reference to this range.
+ */
+ if ((!old_roots || (old_roots && cur_old_count != old_roots))
+ && cur_new_count == new_roots) {
+ qg->excl += num_bytes;
+ qg->excl_cmpr += num_bytes;
+ dirty = true;
}
- }
+ if (dirty)
+ qgroup_dirty(fs_info, qg);
+ }
return 0;
}
/*
- * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
- * from the fs. First, all roots referencing the extent are searched, and
- * then the space is accounted accordingly to the different roots. The
- * accounting algorithm works in 3 steps documented inline.
+ * If we removed a data extent and there were other references for that bytenr
+ * then we need to lookup all referenced roots to make sure we still don't
+ * reference this bytenr. If we do then we can just discard this operation.
*/
-int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_node *node,
- struct btrfs_delayed_extent_op *extent_op)
+static int check_existing_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
{
- struct btrfs_key ins;
- struct btrfs_root *quota_root;
- u64 ref_root;
- struct btrfs_qgroup *qgroup;
struct ulist *roots = NULL;
- u64 seq;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
int ret = 0;
- int sgn;
- if (!fs_info->quota_enabled)
- return 0;
-
- BUG_ON(!fs_info->quota_root);
+ ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
+ oper->elem.seq, &roots);
+ if (ret < 0)
+ return ret;
+ ret = 0;
- ins.objectid = node->bytenr;
- ins.offset = node->num_bytes;
- ins.type = BTRFS_EXTENT_ITEM_KEY;
-
- if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
- node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
- struct btrfs_delayed_tree_ref *ref;
- ref = btrfs_delayed_node_to_tree_ref(node);
- ref_root = ref->root;
- } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
- node->type == BTRFS_SHARED_DATA_REF_KEY) {
- struct btrfs_delayed_data_ref *ref;
- ref = btrfs_delayed_node_to_data_ref(node);
- ref_root = ref->root;
- } else {
- BUG();
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(roots, &uiter))) {
+ if (unode->val == oper->ref_root) {
+ ret = 1;
+ break;
+ }
}
+ ulist_free(roots);
+ btrfs_put_tree_mod_seq(fs_info, &oper->elem);
- if (!is_fstree(ref_root)) {
- /*
- * non-fs-trees are not being accounted
- */
- return 0;
- }
+ return ret;
+}
- switch (node->action) {
- case BTRFS_ADD_DELAYED_REF:
- case BTRFS_ADD_DELAYED_EXTENT:
- sgn = 1;
- seq = btrfs_tree_mod_seq_prev(node->seq);
- break;
- case BTRFS_DROP_DELAYED_REF:
- sgn = -1;
- seq = node->seq;
- break;
- case BTRFS_UPDATE_DELAYED_HEAD:
- return 0;
- default:
- BUG();
- }
+/*
+ * If we share a reference across multiple roots then we may need to adjust
+ * various qgroups referenced and exclusive counters. The basic premise is this
+ *
+ * 1) We have seq to represent a 0 count. Instead of looping through all of the
+ * qgroups and resetting their refcount to 0 we just constantly bump this
+ * sequence number to act as the base reference count. This means that if
+ * anybody is equal to or below this sequence they were never referenced. We
+ * jack this sequence up by the number of roots we found each time in order to
+ * make sure we don't have any overlap.
+ *
+ * 2) We first search all the roots that reference the area _except_ the root
+ * we're acting on currently. This makes up the old_refcnt of all the qgroups
+ * before.
+ *
+ * 3) We walk all of the qgroups referenced by the root we are currently acting
+ * on, and will either adjust old_refcnt in the case of a removal or the
+ * new_refcnt in the case of an addition.
+ *
+ * 4) Finally we walk all the qgroups that are referenced by this range
+ * including the root we are acting on currently. We will adjust the counters
+ * based on the number of roots we had and will have after this operation.
+ *
+ * Take this example as an illustration
+ *
+ * [qgroup 1/0]
+ * / | \
+ * [qg 0/0] [qg 0/1] [qg 0/2]
+ * \ | /
+ * [ extent ]
+ *
+ * Say we are adding a reference that is covered by qg 0/0. The first step
+ * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with
+ * old_roots being 2. Because it is adding new_roots will be 1. We then go
+ * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's
+ * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we
+ * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a
+ * reference and thus must add the size to the referenced bytes. Everything
+ * else is the same so nothing else changes.
+ */
+static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ struct ulist *roots = NULL;
+ struct ulist *qgroups, *tmp;
+ struct btrfs_qgroup *qgroup;
+ struct seq_list elem = {};
+ u64 seq;
+ int old_roots = 0;
+ int new_roots = 0;
+ int ret = 0;
- mutex_lock(&fs_info->qgroup_rescan_lock);
- if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
- if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) {
- mutex_unlock(&fs_info->qgroup_rescan_lock);
+ if (oper->elem.seq) {
+ ret = check_existing_refs(trans, fs_info, oper);
+ if (ret < 0)
+ return ret;
+ if (ret)
return 0;
- }
}
- mutex_unlock(&fs_info->qgroup_rescan_lock);
-
- /*
- * the delayed ref sequence number we pass depends on the direction of
- * the operation. for add operations, we pass
- * tree_mod_log_prev_seq(node->seq) to skip
- * the delayed ref's current sequence number, because we need the state
- * of the tree before the add operation. for delete operations, we pass
- * (node->seq) to include the delayed ref's current sequence number,
- * because we need the state of the tree after the delete operation.
- */
- ret = btrfs_find_all_roots(trans, fs_info, node->bytenr, seq, &roots);
- if (ret < 0)
- return ret;
- spin_lock(&fs_info->qgroup_lock);
+ qgroups = ulist_alloc(GFP_NOFS);
+ if (!qgroups)
+ return -ENOMEM;
- quota_root = fs_info->quota_root;
- if (!quota_root)
- goto unlock;
+ tmp = ulist_alloc(GFP_NOFS);
+ if (!tmp) {
+ ulist_free(qgroups);
+ return -ENOMEM;
+ }
- qgroup = find_qgroup_rb(fs_info, ref_root);
+ btrfs_get_tree_mod_seq(fs_info, &elem);
+ ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
+ &roots);
+ btrfs_put_tree_mod_seq(fs_info, &elem);
+ if (ret < 0) {
+ ulist_free(qgroups);
+ ulist_free(tmp);
+ return ret;
+ }
+ spin_lock(&fs_info->qgroup_lock);
+ qgroup = find_qgroup_rb(fs_info, oper->ref_root);
if (!qgroup)
- goto unlock;
+ goto out;
+ seq = fs_info->qgroup_seq;
/*
- * step 1: for each old ref, visit all nodes once and inc refcnt
+ * So roots is the list of all the roots currently pointing at the
+ * bytenr, including the ref we are adding if we are adding, or not if
+ * we are removing a ref. So we pass in the ref_root to skip that root
+ * in our calculations. We set old_refnct and new_refcnt cause who the
+ * hell knows what everything looked like before, and it doesn't matter
+ * except...
*/
- ulist_reinit(fs_info->qgroup_ulist);
- seq = fs_info->qgroup_seq;
- fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
+ ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups,
+ seq, &old_roots, 0);
+ if (ret < 0)
+ goto out;
- ret = qgroup_account_ref_step1(fs_info, roots, fs_info->qgroup_ulist,
- seq);
- if (ret)
- goto unlock;
+ /*
+ * Now adjust the refcounts of the qgroups that care about this
+ * reference, either the old_count in the case of removal or new_count
+ * in the case of an addition.
+ */
+ ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups,
+ seq);
+ if (ret < 0)
+ goto out;
/*
- * step 2: walk from the new root
+ * ...in the case of removals. If we had a removal before we got around
+ * to processing this operation then we need to find that guy and count
+ * his references as if they really existed so we don't end up screwing
+ * up the exclusive counts. Then whenever we go to process the delete
+ * everything will be grand and we can account for whatever exclusive
+ * changes need to be made there. We also have to pass in old_roots so
+ * we have an accurate count of the roots as it pertains to this
+ * operations view of the world.
*/
- ret = qgroup_account_ref_step2(fs_info, roots, fs_info->qgroup_ulist,
- seq, sgn, node->num_bytes, qgroup);
- if (ret)
- goto unlock;
+ ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq,
+ &old_roots);
+ if (ret < 0)
+ goto out;
/*
- * step 3: walk again from old refs
+ * We are adding our root, need to adjust up the number of roots,
+ * otherwise old_roots is the number of roots we want.
*/
- ret = qgroup_account_ref_step3(fs_info, roots, fs_info->qgroup_ulist,
- seq, sgn, node->num_bytes);
- if (ret)
- goto unlock;
+ if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
+ new_roots = old_roots + 1;
+ } else {
+ new_roots = old_roots;
+ old_roots++;
+ }
+ fs_info->qgroup_seq += old_roots + 1;
-unlock:
+
+ /*
+ * And now the magic happens, bless Arne for having a pretty elegant
+ * solution for this.
+ */
+ qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes,
+ qgroups, seq, old_roots, new_roots, 0);
+out:
spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(qgroups);
ulist_free(roots);
+ ulist_free(tmp);
+ return ret;
+}
+
+/*
+ * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
+ * from the fs. First, all roots referencing the extent are searched, and
+ * then the space is accounted accordingly to the different roots. The
+ * accounting algorithm works in 3 steps documented inline.
+ */
+static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper)
+{
+ int ret = 0;
+
+ if (!fs_info->quota_enabled)
+ return 0;
+
+ BUG_ON(!fs_info->quota_root);
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) {
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ return 0;
+ }
+ }
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ ASSERT(is_fstree(oper->ref_root));
+
+ switch (oper->type) {
+ case BTRFS_QGROUP_OPER_ADD_EXCL:
+ case BTRFS_QGROUP_OPER_SUB_EXCL:
+ ret = qgroup_excl_accounting(fs_info, oper);
+ break;
+ case BTRFS_QGROUP_OPER_ADD_SHARED:
+ case BTRFS_QGROUP_OPER_SUB_SHARED:
+ ret = qgroup_shared_accounting(trans, fs_info, oper);
+ break;
+ default:
+ ASSERT(0);
+ }
+ return ret;
+}
+
+/*
+ * Needs to be called everytime we run delayed refs, even if there is an error
+ * in order to cleanup outstanding operations.
+ */
+int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info)
+{
+ struct btrfs_qgroup_operation *oper;
+ int ret = 0;
+ while (!list_empty(&trans->qgroup_ref_list)) {
+ oper = list_first_entry(&trans->qgroup_ref_list,
+ struct btrfs_qgroup_operation, list);
+ list_del_init(&oper->list);
+ if (!ret || !trans->aborted)
+ ret = btrfs_qgroup_account(trans, fs_info, oper);
+ spin_lock(&fs_info->qgroup_op_lock);
+ rb_erase(&oper->n, &fs_info->qgroup_op_tree);
+ spin_unlock(&fs_info->qgroup_op_lock);
+ btrfs_put_tree_mod_seq(fs_info, &oper->elem);
+ kfree(oper);
+ }
return ret;
}
@@ -1516,8 +1996,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
ret = qgroup_rescan_init(fs_info, 0, 1);
if (!ret) {
qgroup_rescan_zero_tracking(fs_info);
- btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
- &fs_info->qgroup_rescan_work);
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
}
ret = 0;
}
@@ -1636,8 +2116,16 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
srcgroup = find_qgroup_rb(fs_info, srcid);
if (!srcgroup)
goto unlock;
- dstgroup->rfer = srcgroup->rfer - level_size;
- dstgroup->rfer_cmpr = srcgroup->rfer_cmpr - level_size;
+
+ /*
+ * We call inherit after we clone the root in order to make sure
+ * our counts don't go crazy, so at this point the only
+ * difference between the two roots should be the root node.
+ */
+ dstgroup->rfer = srcgroup->rfer;
+ dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
+ dstgroup->excl = level_size;
+ dstgroup->excl_cmpr = level_size;
srcgroup->excl = level_size;
srcgroup->excl_cmpr = level_size;
qgroup_dirty(fs_info, dstgroup);
@@ -1741,7 +2229,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
struct btrfs_qgroup *qg;
struct btrfs_qgroup_list *glist;
- qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
+ qg = u64_to_ptr(unode->aux);
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
qg->reserved + (s64)qg->rfer + num_bytes >
@@ -1773,7 +2261,7 @@ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) {
struct btrfs_qgroup *qg;
- qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
+ qg = u64_to_ptr(unode->aux);
qg->reserved += num_bytes;
}
@@ -1819,7 +2307,7 @@ void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
struct btrfs_qgroup *qg;
struct btrfs_qgroup_list *glist;
- qg = (struct btrfs_qgroup *)(uintptr_t)unode->aux;
+ qg = u64_to_ptr(unode->aux);
qg->reserved -= num_bytes;
@@ -1840,7 +2328,9 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
{
if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
return;
- pr_err("btrfs: qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x\n",
+ btrfs_err(trans->root->fs_info,
+ "qgroups not uptodate in trans handle %p: list is%s empty, "
+ "seq is %#x.%x",
trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
(u32)(trans->delayed_ref_elem.seq >> 32),
(u32)trans->delayed_ref_elem.seq);
@@ -1853,15 +2343,15 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
*/
static int
qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
- struct btrfs_trans_handle *trans, struct ulist *tmp,
- struct extent_buffer *scratch_leaf)
+ struct btrfs_trans_handle *trans, struct ulist *qgroups,
+ struct ulist *tmp, struct extent_buffer *scratch_leaf)
{
struct btrfs_key found;
struct ulist *roots = NULL;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
struct seq_list tree_mod_seq_elem = {};
+ u64 num_bytes;
u64 seq;
+ int new_roots;
int slot;
int ret;
@@ -1903,78 +2393,42 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
- if (found.type != BTRFS_EXTENT_ITEM_KEY)
+ if (found.type != BTRFS_EXTENT_ITEM_KEY &&
+ found.type != BTRFS_METADATA_ITEM_KEY)
continue;
- ret = btrfs_find_all_roots(trans, fs_info, found.objectid,
- tree_mod_seq_elem.seq, &roots);
+ if (found.type == BTRFS_METADATA_ITEM_KEY)
+ num_bytes = fs_info->extent_root->leafsize;
+ else
+ num_bytes = found.offset;
+
+ ulist_reinit(qgroups);
+ ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
+ &roots);
if (ret < 0)
goto out;
spin_lock(&fs_info->qgroup_lock);
seq = fs_info->qgroup_seq;
fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
- ret = qgroup_account_ref_step1(fs_info, roots, tmp, seq);
- if (ret) {
+ new_roots = 0;
+ ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups,
+ seq, &new_roots, 1);
+ if (ret < 0) {
spin_unlock(&fs_info->qgroup_lock);
ulist_free(roots);
goto out;
}
- /*
- * step2 of btrfs_qgroup_account_ref works from a single root,
- * we're doing all at once here.
- */
- ulist_reinit(tmp);
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(roots, &uiter))) {
- struct btrfs_qgroup *qg;
-
- qg = find_qgroup_rb(fs_info, unode->val);
- if (!qg)
- continue;
-
- ret = ulist_add(tmp, qg->qgroupid, (uintptr_t)qg,
- GFP_ATOMIC);
- if (ret < 0) {
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(roots);
- goto out;
- }
- }
-
- /* this loop is similar to step 2 of btrfs_qgroup_account_ref */
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(tmp, &uiter))) {
- struct btrfs_qgroup *qg;
- struct btrfs_qgroup_list *glist;
-
- qg = (struct btrfs_qgroup *)(uintptr_t) unode->aux;
- qg->rfer += found.offset;
- qg->rfer_cmpr += found.offset;
- WARN_ON(qg->tag >= seq);
- if (qg->refcnt - seq == roots->nnodes) {
- qg->excl += found.offset;
- qg->excl_cmpr += found.offset;
- }
- qgroup_dirty(fs_info, qg);
-
- list_for_each_entry(glist, &qg->groups, next_group) {
- ret = ulist_add(tmp, glist->group->qgroupid,
- (uintptr_t)glist->group,
- GFP_ATOMIC);
- if (ret < 0) {
- spin_unlock(&fs_info->qgroup_lock);
- ulist_free(roots);
- goto out;
- }
- }
+ ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups,
+ seq, 0, new_roots, 1);
+ if (ret < 0) {
+ spin_unlock(&fs_info->qgroup_lock);
+ ulist_free(roots);
+ goto out;
}
-
spin_unlock(&fs_info->qgroup_lock);
ulist_free(roots);
- ret = 0;
}
-
out:
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
@@ -1987,13 +2441,16 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
qgroup_rescan_work);
struct btrfs_path *path;
struct btrfs_trans_handle *trans = NULL;
- struct ulist *tmp = NULL;
+ struct ulist *tmp = NULL, *qgroups = NULL;
struct extent_buffer *scratch_leaf = NULL;
int err = -ENOMEM;
path = btrfs_alloc_path();
if (!path)
goto out;
+ qgroups = ulist_alloc(GFP_NOFS);
+ if (!qgroups)
+ goto out;
tmp = ulist_alloc(GFP_NOFS);
if (!tmp)
goto out;
@@ -2012,7 +2469,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
err = -EINTR;
} else {
err = qgroup_rescan_leaf(fs_info, path, trans,
- tmp, scratch_leaf);
+ qgroups, tmp, scratch_leaf);
}
if (err > 0)
btrfs_commit_transaction(trans, fs_info->fs_root);
@@ -2022,6 +2479,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
out:
kfree(scratch_leaf);
+ ulist_free(qgroups);
ulist_free(tmp);
btrfs_free_path(path);
@@ -2037,10 +2495,10 @@ out:
mutex_unlock(&fs_info->qgroup_rescan_lock);
if (err >= 0) {
- pr_info("btrfs: qgroup scan completed%s\n",
+ btrfs_info(fs_info, "qgroup scan completed%s",
err == 2 ? " (inconsistency flag cleared)" : "");
} else {
- pr_err("btrfs: qgroup scan failed with %d\n", err);
+ btrfs_err(fs_info, "qgroup scan failed with %d", err);
}
complete_all(&fs_info->qgroup_rescan_completion);
@@ -2092,11 +2550,12 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
memset(&fs_info->qgroup_rescan_work, 0,
sizeof(fs_info->qgroup_rescan_work));
- fs_info->qgroup_rescan_work.func = btrfs_qgroup_rescan_worker;
+ btrfs_init_work(&fs_info->qgroup_rescan_work,
+ btrfs_qgroup_rescan_worker, NULL, NULL);
if (ret) {
err:
- pr_info("btrfs: qgroup_rescan_init failed with %d\n", ret);
+ btrfs_info(fs_info, "qgroup_rescan_init failed with %d", ret);
return ret;
}
@@ -2155,8 +2614,8 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
qgroup_rescan_zero_tracking(fs_info);
- btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
- &fs_info->qgroup_rescan_work);
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
return 0;
}
@@ -2187,6 +2646,6 @@ void
btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
{
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
- btrfs_queue_worker(&fs_info->qgroup_rescan_workers,
- &fs_info->qgroup_rescan_work);
+ btrfs_queue_work(fs_info->qgroup_rescan_workers,
+ &fs_info->qgroup_rescan_work);
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
new file mode 100644
index 00000000000..5952ff1fbd7
--- /dev/null
+++ b/fs/btrfs/qgroup.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2014 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_QGROUP__
+#define __BTRFS_QGROUP__
+
+/*
+ * A description of the operations, all of these operations only happen when we
+ * are adding the 1st reference for that subvolume in the case of adding space
+ * or on the last reference delete in the case of subtraction. The only
+ * exception is the last one, which is added for confusion.
+ *
+ * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only
+ * one pointing at the bytes we are adding. This is called on the first
+ * allocation.
+ *
+ * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be
+ * shared between subvols. This is called on the creation of a ref that already
+ * has refs from a different subvolume, so basically reflink.
+ *
+ * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only
+ * one referencing the range.
+ *
+ * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with
+ * refs with other subvolumes.
+ */
+enum btrfs_qgroup_operation_type {
+ BTRFS_QGROUP_OPER_ADD_EXCL,
+ BTRFS_QGROUP_OPER_ADD_SHARED,
+ BTRFS_QGROUP_OPER_SUB_EXCL,
+ BTRFS_QGROUP_OPER_SUB_SHARED,
+};
+
+struct btrfs_qgroup_operation {
+ u64 ref_root;
+ u64 bytenr;
+ u64 num_bytes;
+ u64 seq;
+ enum btrfs_qgroup_operation_type type;
+ struct seq_list elem;
+ struct rb_node n;
+ struct list_head list;
+};
+
+int btrfs_quota_enable(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_quota_disable(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
+void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid,
+ char *name);
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid);
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 qgroupid,
+ struct btrfs_qgroup_limit *limit);
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
+struct btrfs_delayed_extent_op;
+int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 ref_root,
+ u64 bytenr, u64 num_bytes,
+ enum btrfs_qgroup_operation_type type,
+ int mod_seq);
+int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_qgroup_operation *oper);
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
+ struct btrfs_qgroup_inherit *inherit);
+int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
+void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
+
+void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+ u64 rfer, u64 excl);
+#endif
+
+#endif /* __BTRFS_QGROUP__ */
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index d0ecfbd9cc9..4a88f073fdd 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -33,7 +33,6 @@
#include <linux/raid/xor.h>
#include <linux/vmalloc.h>
#include <asm/div64.h>
-#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
@@ -1033,8 +1032,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
/* see if we can add this page onto our existing bio */
if (last) {
- last_end = (u64)last->bi_sector << 9;
- last_end += last->bi_size;
+ last_end = (u64)last->bi_iter.bi_sector << 9;
+ last_end += last->bi_iter.bi_size;
/*
* we can't merge these if they are from different
@@ -1054,9 +1053,9 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
if (!bio)
return -ENOMEM;
- bio->bi_size = 0;
+ bio->bi_iter.bi_size = 0;
bio->bi_bdev = stripe->dev->bdev;
- bio->bi_sector = disk_start >> 9;
+ bio->bi_iter.bi_sector = disk_start >> 9;
set_bit(BIO_UPTODATE, &bio->bi_flags);
bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
@@ -1112,7 +1111,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
spin_lock_irq(&rbio->bio_list_lock);
bio_list_for_each(bio, &rbio->bio_list) {
- start = (u64)bio->bi_sector << 9;
+ start = (u64)bio->bi_iter.bi_sector << 9;
stripe_offset = start - rbio->raid_map[0];
page_index = stripe_offset >> PAGE_CACHE_SHIFT;
@@ -1273,7 +1272,7 @@ cleanup:
static int find_bio_stripe(struct btrfs_raid_bio *rbio,
struct bio *bio)
{
- u64 physical = bio->bi_sector;
+ u64 physical = bio->bi_iter.bi_sector;
u64 stripe_start;
int i;
struct btrfs_bio_stripe *stripe;
@@ -1299,7 +1298,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
struct bio *bio)
{
- u64 logical = bio->bi_sector;
+ u64 logical = bio->bi_iter.bi_sector;
u64 stripe_start;
int i;
@@ -1417,20 +1416,18 @@ cleanup:
static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
{
- rbio->work.flags = 0;
- rbio->work.func = rmw_work;
+ btrfs_init_work(&rbio->work, rmw_work, NULL, NULL);
- btrfs_queue_worker(&rbio->fs_info->rmw_workers,
- &rbio->work);
+ btrfs_queue_work(rbio->fs_info->rmw_workers,
+ &rbio->work);
}
static void async_read_rebuild(struct btrfs_raid_bio *rbio)
{
- rbio->work.flags = 0;
- rbio->work.func = read_rebuild_work;
+ btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL);
- btrfs_queue_worker(&rbio->fs_info->rmw_workers,
- &rbio->work);
+ btrfs_queue_work(rbio->fs_info->rmw_workers,
+ &rbio->work);
}
/*
@@ -1603,8 +1600,8 @@ static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
plug_list);
struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
plug_list);
- u64 a_sector = ra->bio_list.head->bi_sector;
- u64 b_sector = rb->bio_list.head->bi_sector;
+ u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
+ u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
if (a_sector < b_sector)
return -1;
@@ -1668,10 +1665,9 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
plug = container_of(cb, struct btrfs_plug_cb, cb);
if (from_schedule) {
- plug->work.flags = 0;
- plug->work.func = unplug_work;
- btrfs_queue_worker(&plug->info->rmw_workers,
- &plug->work);
+ btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
+ btrfs_queue_work(plug->info->rmw_workers,
+ &plug->work);
return;
}
run_plug(plug);
@@ -1692,7 +1688,7 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
if (IS_ERR(rbio))
return PTR_ERR(rbio);
bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_size;
+ rbio->bio_list_bytes = bio->bi_iter.bi_size;
/*
* don't plug on full rbios, just get them out the door
@@ -1960,9 +1956,10 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
* pages are going to be uptodate.
*/
for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
- if (rbio->faila == stripe ||
- rbio->failb == stripe)
+ if (rbio->faila == stripe || rbio->failb == stripe) {
+ atomic_inc(&rbio->bbio->error);
continue;
+ }
for (pagenr = 0; pagenr < nr_pages; pagenr++) {
struct page *p;
@@ -2045,7 +2042,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
rbio->read_rebuild = 1;
bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_size;
+ rbio->bio_list_bytes = bio->bi_iter.bi_size;
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 1031b69252c..09230cf3a24 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -189,8 +189,8 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
*/
#ifdef DEBUG
if (rec->generation != generation) {
- printk(KERN_DEBUG "generation mismatch for "
- "(%llu,%d,%llu) %llu != %llu\n",
+ btrfs_debug(root->fs_info,
+ "generation mismatch for (%llu,%d,%llu) %llu != %llu",
key.objectid, key.type, key.offset,
rec->generation, generation);
}
@@ -365,8 +365,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
goto error;
if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
- printk(KERN_ERR "btrfs readahead: more than %d copies not "
- "supported", BTRFS_MAX_MIRRORS);
+ btrfs_err(root->fs_info,
+ "readahead: more than %d copies not supported",
+ BTRFS_MAX_MIRRORS);
goto error;
}
@@ -427,8 +428,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
continue;
}
if (!dev->bdev) {
- /* cannot read ahead on missing device */
- continue;
+ /*
+ * cannot read ahead on missing device, but for RAID5/6,
+ * REQ_GET_READ_MIRRORS return 1. So don't skip missing
+ * device for such case.
+ */
+ if (nzones > 1)
+ continue;
}
if (dev_replace_is_ongoing &&
dev == fs_info->dev_replace.tgtdev) {
@@ -792,10 +798,10 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
/* FIXME we cannot handle this properly right now */
BUG();
}
- rmw->work.func = reada_start_machine_worker;
+ btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
rmw->fs_info = fs_info;
- btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
+ btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
}
#ifdef DEBUG
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index a5a26320503..65245a07275 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -94,6 +94,7 @@ struct backref_edge {
#define LOWER 0
#define UPPER 1
+#define RELOCATION_RESERVED_NODES 256
struct backref_cache {
/* red black tree of all backref nodes in the cache */
@@ -176,6 +177,8 @@ struct reloc_control {
u64 merging_rsv_size;
/* size of relocated tree nodes */
u64 nodes_relocated;
+ /* reserved size for block group relocation*/
+ u64 reserved_bytes;
u64 search_start;
u64 extents_found;
@@ -184,7 +187,6 @@ struct reloc_control {
unsigned int create_reloc_tree:1;
unsigned int merge_reloc_tree:1;
unsigned int found_file_extent:1;
- unsigned int commit_transaction:1;
};
/* stages of data relocation */
@@ -335,7 +337,7 @@ static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
if (bnode->root)
fs_info = bnode->root->fs_info;
btrfs_panic(fs_info, errno, "Inconsistency in backref cache "
- "found at offset %llu\n", bytenr);
+ "found at offset %llu", bytenr);
}
/*
@@ -526,7 +528,7 @@ static int should_ignore_root(struct btrfs_root *root)
{
struct btrfs_root *reloc_root;
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return 0;
reloc_root = root->reloc_root;
@@ -571,7 +573,9 @@ static int is_cowonly_root(u64 root_objectid)
root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
root_objectid == BTRFS_DEV_TREE_OBJECTID ||
root_objectid == BTRFS_TREE_LOG_OBJECTID ||
- root_objectid == BTRFS_CSUM_TREE_OBJECTID)
+ root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
+ root_objectid == BTRFS_UUID_TREE_OBJECTID ||
+ root_objectid == BTRFS_QUOTA_TREE_OBJECTID)
return 1;
return 0;
}
@@ -588,7 +592,7 @@ static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
else
key.offset = (u64)-1;
- return btrfs_read_fs_root_no_name(fs_info, &key);
+ return btrfs_get_fs_root(fs_info, &key, false);
}
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
@@ -606,7 +610,7 @@ struct btrfs_root *find_tree_root(struct reloc_control *rc,
root = read_fs_root(rc->extent_root->fs_info, root_objectid);
BUG_ON(IS_ERR(root));
- if (root->ref_cows &&
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
generation != btrfs_root_generation(&root->root_item))
return NULL;
@@ -883,7 +887,7 @@ again:
goto out;
}
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
cur->cowonly = 1;
if (btrfs_root_level(&root->root_item) == cur->level) {
@@ -950,7 +954,8 @@ again:
upper->bytenr = eb->start;
upper->owner = btrfs_header_owner(eb);
upper->level = lower->level + 1;
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS,
+ &root->state))
upper->cowonly = 1;
/*
@@ -1254,7 +1259,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
if (rb_node) {
btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "
"for start=%llu while inserting into relocation "
- "tree\n", node->bytenr);
+ "tree", node->bytenr);
kfree(node);
return -EEXIST;
}
@@ -1264,10 +1269,10 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
}
/*
- * helper to update/delete the 'address of tree root -> reloc tree'
+ * helper to delete the 'address of tree root -> reloc tree'
* mapping
*/
-static int __update_reloc_root(struct btrfs_root *root, int del)
+static void __del_reloc_root(struct btrfs_root *root)
{
struct rb_node *rb_node;
struct mapping_node *node = NULL;
@@ -1275,7 +1280,7 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
spin_lock(&rc->reloc_root_tree.lock);
rb_node = tree_search(&rc->reloc_root_tree.rb_root,
- root->commit_root->start);
+ root->node->start);
if (rb_node) {
node = rb_entry(rb_node, struct mapping_node, rb_node);
rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
@@ -1283,23 +1288,45 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
spin_unlock(&rc->reloc_root_tree.lock);
if (!node)
- return 0;
+ return;
BUG_ON((struct btrfs_root *)node->data != root);
- if (!del) {
- spin_lock(&rc->reloc_root_tree.lock);
- node->bytenr = root->node->start;
- rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
- node->bytenr, &node->rb_node);
- spin_unlock(&rc->reloc_root_tree.lock);
- if (rb_node)
- backref_tree_panic(rb_node, -EEXIST, node->bytenr);
- } else {
- spin_lock(&root->fs_info->trans_lock);
- list_del_init(&root->root_list);
- spin_unlock(&root->fs_info->trans_lock);
- kfree(node);
+ spin_lock(&root->fs_info->trans_lock);
+ list_del_init(&root->root_list);
+ spin_unlock(&root->fs_info->trans_lock);
+ kfree(node);
+}
+
+/*
+ * helper to update the 'address of tree root -> reloc tree'
+ * mapping
+ */
+static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
+{
+ struct rb_node *rb_node;
+ struct mapping_node *node = NULL;
+ struct reloc_control *rc = root->fs_info->reloc_ctl;
+
+ spin_lock(&rc->reloc_root_tree.lock);
+ rb_node = tree_search(&rc->reloc_root_tree.rb_root,
+ root->node->start);
+ if (rb_node) {
+ node = rb_entry(rb_node, struct mapping_node, rb_node);
+ rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
}
+ spin_unlock(&rc->reloc_root_tree.lock);
+
+ if (!node)
+ return 0;
+ BUG_ON((struct btrfs_root *)node->data != root);
+
+ spin_lock(&rc->reloc_root_tree.lock);
+ node->bytenr = new_bytenr;
+ rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
+ node->bytenr, &node->rb_node);
+ spin_unlock(&rc->reloc_root_tree.lock);
+ if (rb_node)
+ backref_tree_panic(rb_node, -EEXIST, node->bytenr);
return 0;
}
@@ -1383,6 +1410,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
{
struct btrfs_root *reloc_root;
struct reloc_control *rc = root->fs_info->reloc_ctl;
+ struct btrfs_block_rsv *rsv;
int clear_rsv = 0;
int ret;
@@ -1396,13 +1424,14 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
return 0;
- if (!trans->block_rsv) {
+ if (!trans->reloc_reserved) {
+ rsv = trans->block_rsv;
trans->block_rsv = rc->block_rsv;
clear_rsv = 1;
}
reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
if (clear_rsv)
- trans->block_rsv = NULL;
+ trans->block_rsv = rsv;
ret = __add_reloc_root(reloc_root);
BUG_ON(ret < 0);
@@ -1418,7 +1447,6 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
{
struct btrfs_root *reloc_root;
struct btrfs_root_item *root_item;
- int del = 0;
int ret;
if (!root->reloc_root)
@@ -1430,11 +1458,9 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
if (root->fs_info->reloc_ctl->merge_reloc_tree &&
btrfs_root_refs(root_item) == 0) {
root->reloc_root = NULL;
- del = 1;
+ __del_reloc_root(reloc_root);
}
- __update_reloc_root(reloc_root, del);
-
if (reloc_root->commit_root != reloc_root->node) {
btrfs_set_root_node(root_item, reloc_root->node);
free_extent_buffer(reloc_root->commit_root);
@@ -1775,8 +1801,7 @@ again:
new_ptr_gen = 0;
}
- if (new_bytenr > 0 && new_bytenr == old_bytenr) {
- WARN_ON(1);
+ if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
ret = level;
break;
}
@@ -2058,7 +2083,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
LIST_HEAD(inode_list);
struct btrfs_key key;
struct btrfs_key next_key;
- struct btrfs_trans_handle *trans;
+ struct btrfs_trans_handle *trans = NULL;
struct btrfs_root *reloc_root;
struct btrfs_root_item *root_item;
struct btrfs_path *path;
@@ -2107,18 +2132,19 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
memset(&next_key, 0, sizeof(next_key));
while (1) {
- trans = btrfs_start_transaction(root, 0);
- BUG_ON(IS_ERR(trans));
- trans->block_rsv = rc->block_rsv;
-
ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
BTRFS_RESERVE_FLUSH_ALL);
if (ret) {
- BUG_ON(ret != -EAGAIN);
- ret = btrfs_commit_transaction(trans, root);
- BUG_ON(ret);
- continue;
+ err = ret;
+ goto out;
+ }
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ trans = NULL;
+ goto out;
}
+ trans->block_rsv = rc->block_rsv;
replaced = 0;
max_level = level;
@@ -2164,6 +2190,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
root_item->drop_level = level;
btrfs_end_transaction_throttle(trans, root);
+ trans = NULL;
btrfs_btree_balance_dirty(root);
@@ -2192,7 +2219,8 @@ out:
btrfs_update_reloc_root(trans, root);
}
- btrfs_end_transaction_throttle(trans, root);
+ if (trans)
+ btrfs_end_transaction_throttle(trans, root);
btrfs_btree_balance_dirty(root);
@@ -2283,17 +2311,13 @@ void free_reloc_roots(struct list_head *list)
while (!list_empty(list)) {
reloc_root = list_entry(list->next, struct btrfs_root,
root_list);
- __update_reloc_root(reloc_root, 1);
- free_extent_buffer(reloc_root->node);
- free_extent_buffer(reloc_root->commit_root);
- kfree(reloc_root);
+ __del_reloc_root(reloc_root);
}
}
static noinline_for_stack
int merge_reloc_roots(struct reloc_control *rc)
{
- struct btrfs_trans_handle *trans;
struct btrfs_root *root;
struct btrfs_root *reloc_root;
u64 last_snap;
@@ -2328,10 +2352,9 @@ again:
ret = merge_reloc_root(rc, root);
if (ret) {
- __update_reloc_root(reloc_root, 1);
- free_extent_buffer(reloc_root->node);
- free_extent_buffer(reloc_root->commit_root);
- kfree(reloc_root);
+ if (list_empty(&reloc_root->root_list))
+ list_add_tail(&reloc_root->root_list,
+ &reloc_roots);
goto out;
}
} else {
@@ -2352,26 +2375,6 @@ again:
list_add_tail(&reloc_root->root_list,
&reloc_roots);
goto out;
- } else if (!ret) {
- /*
- * recover the last snapshot tranid to avoid
- * the space balance break NOCOW.
- */
- root = read_fs_root(rc->extent_root->fs_info,
- objectid);
- if (IS_ERR(root))
- continue;
-
- trans = btrfs_join_transaction(root);
- BUG_ON(IS_ERR(trans));
-
- /* Check if the fs/file tree was snapshoted or not. */
- if (btrfs_root_last_snapshot(&root->root_item) ==
- otransid - 1)
- btrfs_set_root_last_snapshot(&root->root_item,
- last_snap);
-
- btrfs_end_transaction(trans, root);
}
}
@@ -2384,6 +2387,13 @@ out:
btrfs_std_error(root->fs_info, ret);
if (!list_empty(&reloc_roots))
free_reloc_roots(&reloc_roots);
+
+ /* new reloc root may be added */
+ mutex_lock(&root->fs_info->reloc_mutex);
+ list_splice_init(&rc->reloc_roots, &reloc_roots);
+ mutex_unlock(&root->fs_info->reloc_mutex);
+ if (!list_empty(&reloc_roots))
+ free_reloc_roots(&reloc_roots);
}
BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
@@ -2420,7 +2430,7 @@ static noinline_for_stack
struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
struct reloc_control *rc,
struct backref_node *node,
- struct backref_edge *edges[], int *nr)
+ struct backref_edge *edges[])
{
struct backref_node *next;
struct btrfs_root *root;
@@ -2432,7 +2442,7 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
next = walk_up_backref(next, edges, &index);
root = next->root;
BUG_ON(!root);
- BUG_ON(!root->ref_cows);
+ BUG_ON(!test_bit(BTRFS_ROOT_REF_COWS, &root->state));
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
record_reloc_root_in_trans(trans, root);
@@ -2462,7 +2472,6 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
if (!root)
return NULL;
- *nr = index;
next = node;
/* setup backref node path for btrfs_reloc_cow_block */
while (1) {
@@ -2498,7 +2507,7 @@ struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
BUG_ON(!root);
/* no other choice for non-references counted tree */
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return root;
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
@@ -2558,28 +2567,36 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
struct btrfs_root *root = rc->extent_root;
u64 num_bytes;
int ret;
+ u64 tmp;
num_bytes = calcu_metadata_size(rc, node, 1) * 2;
trans->block_rsv = rc->block_rsv;
- ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
- BTRFS_RESERVE_FLUSH_ALL);
+ rc->reserved_bytes += num_bytes;
+ ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
+ BTRFS_RESERVE_FLUSH_ALL);
if (ret) {
- if (ret == -EAGAIN)
- rc->commit_transaction = 1;
+ if (ret == -EAGAIN) {
+ tmp = rc->extent_root->nodesize *
+ RELOCATION_RESERVED_NODES;
+ while (tmp <= rc->reserved_bytes)
+ tmp <<= 1;
+ /*
+ * only one thread can access block_rsv at this point,
+ * so we don't need hold lock to protect block_rsv.
+ * we expand more reservation size here to allow enough
+ * space for relocation and we will return eailer in
+ * enospc case.
+ */
+ rc->block_rsv->size = tmp + rc->extent_root->nodesize *
+ RELOCATION_RESERVED_NODES;
+ }
return ret;
}
return 0;
}
-static void release_metadata_space(struct reloc_control *rc,
- struct backref_node *node)
-{
- u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
- btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
-}
-
/*
* relocate a block tree, and then update pointers in upper level
* blocks that reference the block to point to the new location.
@@ -2601,7 +2618,6 @@ static int do_relocation(struct btrfs_trans_handle *trans,
u32 blocksize;
u64 bytenr;
u64 generation;
- int nr;
int slot;
int ret;
int err = 0;
@@ -2614,7 +2630,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
cond_resched();
upper = edge->node[UPPER];
- root = select_reloc_root(trans, rc, upper, edges, &nr);
+ root = select_reloc_root(trans, rc, upper, edges);
BUG_ON(!root);
if (upper->eb && !upper->locked) {
@@ -2866,7 +2882,6 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_path *path)
{
struct btrfs_root *root;
- int release = 0;
int ret = 0;
if (!node)
@@ -2879,15 +2894,14 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
goto out;
}
- if (!root || root->ref_cows) {
+ if (!root || test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
ret = reserve_metadata_space(trans, rc, node);
if (ret)
goto out;
- release = 1;
}
if (root) {
- if (root->ref_cows) {
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
BUG_ON(node->new_bytenr);
BUG_ON(!list_empty(&node->list));
btrfs_record_root_in_trans(trans, root);
@@ -2908,11 +2922,8 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
ret = do_relocation(trans, rc, node, key, path, 1);
}
out:
- if (ret || node->level == 0 || node->cowonly) {
- if (release)
- release_metadata_space(rc, node);
+ if (ret || node->level == 0 || node->cowonly)
remove_backref_node(&rc->backref_cache, node);
- }
return ret;
}
@@ -3258,7 +3269,7 @@ static int add_tree_block(struct reloc_control *rc,
struct rb_node *rb_node;
u32 item_size;
int level = -1;
- int generation;
+ u64 generation;
eb = path->nodes[0];
item_size = btrfs_item_size_nr(eb, path->slots[0]);
@@ -3407,7 +3418,6 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
struct inode *inode, u64 ino)
{
struct btrfs_key key;
- struct btrfs_path *path;
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_trans_handle *trans;
int ret = 0;
@@ -3432,22 +3442,14 @@ truncate:
if (ret)
goto out;
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
-
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
- btrfs_free_path(path);
ret = PTR_ERR(trans);
goto out;
}
- ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
+ ret = btrfs_truncate_free_space_cache(root, trans, inode);
- btrfs_free_path(path);
btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root);
out:
@@ -3549,10 +3551,8 @@ static int find_data_references(struct reloc_control *rc,
err = ret;
goto out;
}
- if (ret > 0) {
- WARN_ON(1);
+ if (WARN_ON(ret > 0))
goto out;
- }
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
@@ -3572,11 +3572,9 @@ static int find_data_references(struct reloc_control *rc,
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid != ref_objectid ||
- key.type != BTRFS_EXTENT_DATA_KEY) {
- WARN_ON(1);
+ if (WARN_ON(key.objectid != ref_objectid ||
+ key.type != BTRFS_EXTENT_DATA_KEY))
break;
- }
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -3848,29 +3846,20 @@ static noinline_for_stack
int prepare_to_relocate(struct reloc_control *rc)
{
struct btrfs_trans_handle *trans;
- int ret;
rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
BTRFS_BLOCK_RSV_TEMP);
if (!rc->block_rsv)
return -ENOMEM;
- /*
- * reserve some space for creating reloc trees.
- * btrfs_init_reloc_root will use them when there
- * is no reservation in transaction handle.
- */
- ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
- rc->extent_root->nodesize * 256,
- BTRFS_RESERVE_FLUSH_ALL);
- if (ret)
- return ret;
-
memset(&rc->cluster, 0, sizeof(rc->cluster));
rc->search_start = rc->block_group->key.objectid;
rc->extents_found = 0;
rc->nodes_relocated = 0;
rc->merging_rsv_size = 0;
+ rc->reserved_bytes = 0;
+ rc->block_rsv->size = rc->extent_root->nodesize *
+ RELOCATION_RESERVED_NODES;
rc->create_reloc_tree = 1;
set_reloc_control(rc);
@@ -3914,6 +3903,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
}
while (1) {
+ rc->reserved_bytes = 0;
+ ret = btrfs_block_rsv_refill(rc->extent_root,
+ rc->block_rsv, rc->block_rsv->size,
+ BTRFS_RESERVE_FLUSH_ALL);
+ if (ret) {
+ err = ret;
+ break;
+ }
progress++;
trans = btrfs_start_transaction(rc->extent_root, 0);
if (IS_ERR(trans)) {
@@ -3992,6 +3989,12 @@ restart:
if (!RB_EMPTY_ROOT(&blocks)) {
ret = relocate_tree_blocks(trans, rc, &blocks);
if (ret < 0) {
+ /*
+ * if we fail to relocate tree blocks, force to update
+ * backref cache when committing transaction.
+ */
+ rc->backref_cache.last_trans = trans->transid - 1;
+
if (ret != -EAGAIN) {
err = ret;
break;
@@ -4001,24 +4004,8 @@ restart:
}
}
- ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
- if (ret < 0) {
- if (ret != -ENOSPC) {
- err = ret;
- WARN_ON(1);
- break;
- }
- rc->commit_transaction = 1;
- }
-
- if (rc->commit_transaction) {
- rc->commit_transaction = 0;
- ret = btrfs_commit_transaction(trans, rc->extent_root);
- BUG_ON(ret);
- } else {
- btrfs_end_transaction_throttle(trans, rc->extent_root);
- btrfs_btree_balance_dirty(rc->extent_root);
- }
+ btrfs_end_transaction_throttle(trans, rc->extent_root);
+ btrfs_btree_balance_dirty(rc->extent_root);
trans = NULL;
if (rc->stage == MOVE_DATA_EXTENTS &&
@@ -4238,15 +4225,15 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
goto out;
}
- printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
+ btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
rc->block_group->key.objectid, rc->block_group->flags);
- ret = btrfs_start_all_delalloc_inodes(fs_info, 0);
+ ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
if (ret < 0) {
err = ret;
goto out;
}
- btrfs_wait_all_ordered_extents(fs_info);
+ btrfs_wait_ordered_roots(fs_info, -1);
while (1) {
mutex_lock(&fs_info->cleaner_mutex);
@@ -4260,22 +4247,22 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
if (rc->extents_found == 0)
break;
- printk(KERN_INFO "btrfs: found %llu extents\n",
+ btrfs_info(extent_root->fs_info, "found %llu extents",
rc->extents_found);
if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
- btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
+ ret = btrfs_wait_ordered_range(rc->data_inode, 0,
+ (u64)-1);
+ if (ret) {
+ err = ret;
+ goto out;
+ }
invalidate_mapping_pages(rc->data_inode->i_mapping,
0, -1);
rc->stage = UPDATE_DATA_PTRS;
}
}
- filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
- rc->block_group->key.objectid,
- rc->block_group->key.objectid +
- rc->block_group->key.offset - 1);
-
WARN_ON(rc->block_group->pinned > 0);
WARN_ON(rc->block_group->reserved > 0);
WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
@@ -4481,6 +4468,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
u64 disk_bytenr;
+ u64 new_bytenr;
LIST_HEAD(list);
ordered = btrfs_lookup_ordered_extent(inode, file_pos);
@@ -4492,13 +4480,24 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
if (ret)
goto out;
- disk_bytenr = ordered->start;
while (!list_empty(&list)) {
sums = list_entry(list.next, struct btrfs_ordered_sum, list);
list_del_init(&sums->list);
- sums->bytenr = disk_bytenr;
- disk_bytenr += sums->len;
+ /*
+ * We need to offset the new_bytenr based on where the csum is.
+ * We need to do this because we will read in entire prealloc
+ * extents but we may have written to say the middle of the
+ * prealloc extent, so we need to make sure the csum goes with
+ * the right disk offset.
+ *
+ * We can do this because the data reloc inode refers strictly
+ * to the on disk bytes, so we don't have to worry about
+ * disk_len vs real len like with real inodes since it's all
+ * disk length.
+ */
+ new_bytenr = ordered->start + (sums->bytenr - disk_bytenr);
+ sums->bytenr = new_bytenr;
btrfs_add_ordered_sum(inode, ordered, sums);
}
@@ -4524,6 +4523,11 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
+ if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+ if (buf == root->node)
+ __update_reloc_root(root, cow->start);
+ }
+
level = btrfs_header_level(buf);
if (btrfs_header_generation(buf) <=
btrfs_root_last_snapshot(&root->root_item))
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 0b1f4ef8db9..360a728a639 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -16,6 +16,7 @@
* Boston, MA 021110-1307, USA.
*/
+#include <linux/err.h>
#include <linux/uuid.h>
#include "ctree.h"
#include "transaction.h"
@@ -44,7 +45,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
if (!need_reset && btrfs_root_generation(item)
!= btrfs_root_generation_v2(item)) {
if (btrfs_root_generation_v2(item) != 0) {
- printk(KERN_WARNING "btrfs: mismatching "
+ printk(KERN_WARNING "BTRFS: mismatching "
"generation and generation_v2 "
"found in root item. This root "
"was probably mounted with an "
@@ -154,7 +155,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
if (ret != 0) {
btrfs_print_leaf(root, path->nodes[0]);
- printk(KERN_CRIT "unable to update root key %llu %u %llu\n",
+ btrfs_crit(root->fs_info, "unable to update root key %llu %u %llu",
key->objectid, key->type, key->offset);
BUG_ON(1);
}
@@ -271,7 +272,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
key.offset++;
root = btrfs_read_fs_root(tree_root, &root_key);
- err = PTR_RET(root);
+ err = PTR_ERR_OR_ZERO(root);
if (err && err != -ENOENT) {
break;
} else if (err == -ENOENT) {
@@ -299,18 +300,13 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
continue;
}
- if (btrfs_root_refs(&root->root_item) == 0) {
- btrfs_add_dead_root(root);
- continue;
- }
-
err = btrfs_init_fs_root(root);
if (err) {
btrfs_free_fs_root(root);
break;
}
- root->orphan_item_inserted = 1;
+ set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
err = btrfs_insert_fs_root(root->fs_info, root);
if (err) {
@@ -318,6 +314,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
btrfs_free_fs_root(root);
break;
}
+
+ if (btrfs_root_refs(&root->root_item) == 0)
+ btrfs_add_dead_root(root);
}
btrfs_free_path(path);
@@ -402,21 +401,6 @@ out:
return err;
}
-int btrfs_find_root_ref(struct btrfs_root *tree_root,
- struct btrfs_path *path,
- u64 root_id, u64 ref_id)
-{
- struct btrfs_key key;
- int ret;
-
- key.objectid = root_id;
- key.type = BTRFS_ROOT_REF_KEY;
- key.offset = ref_id;
-
- ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
- return ret;
-}
-
/*
* add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
* or BTRFS_ROOT_BACKREF_KEY.
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a18e0e23f6a..b6d198f5181 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -208,7 +208,6 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
int is_metadata, int have_csum,
const u8 *csum, u64 generation,
u16 csum_size);
-static void scrub_complete_bio_end_io(struct bio *bio, int err);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good,
int force_write);
@@ -257,6 +256,8 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
int mirror_num, u64 physical_for_dev_replace);
static void copy_nocow_pages_worker(struct btrfs_work *work);
+static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
+static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
@@ -270,6 +271,29 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
wake_up(&sctx->list_wait);
}
+static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+{
+ while (atomic_read(&fs_info->scrub_pause_req)) {
+ mutex_unlock(&fs_info->scrub_lock);
+ wait_event(fs_info->scrub_pause_wait,
+ atomic_read(&fs_info->scrub_pause_req) == 0);
+ mutex_lock(&fs_info->scrub_lock);
+ }
+}
+
+static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
+{
+ atomic_inc(&fs_info->scrubs_paused);
+ wake_up(&fs_info->scrub_pause_wait);
+
+ mutex_lock(&fs_info->scrub_lock);
+ __scrub_blocked_if_needed(fs_info);
+ atomic_dec(&fs_info->scrubs_paused);
+ mutex_unlock(&fs_info->scrub_lock);
+
+ wake_up(&fs_info->scrub_pause_wait);
+}
+
/*
* used for workers that require transaction commits (i.e., for the
* NOCOW case)
@@ -291,6 +315,16 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
atomic_inc(&fs_info->scrubs_running);
atomic_inc(&fs_info->scrubs_paused);
mutex_unlock(&fs_info->scrub_lock);
+
+ /*
+ * check if @scrubs_running=@scrubs_paused condition
+ * inside wait_event() is not an atomic operation.
+ * which means we may inc/dec @scrub_running/paused
+ * at any time. Let's wake up @scrub_pause_wait as
+ * much as we can to let commit transaction blocked less.
+ */
+ wake_up(&fs_info->scrub_pause_wait);
+
atomic_inc(&sctx->workers_pending);
}
@@ -394,7 +428,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
sbio->index = i;
sbio->sctx = sctx;
sbio->page_count = 0;
- sbio->work.func = scrub_bio_end_io_worker;
+ btrfs_init_work(&sbio->work, scrub_bio_end_io_worker,
+ NULL, NULL);
if (i != SCRUB_BIOS_PER_SCTX - 1)
sctx->bios[i]->next_free = i + 1;
@@ -481,7 +516,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
* hold all of the paths here
*/
for (i = 0; i < ipath->fspath->elem_cnt; ++i)
- printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
+ printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
"%s, sector %llu, root %llu, inode %llu, offset %llu, "
"length %llu, links %u (path: %s)\n", swarn->errstr,
swarn->logical, rcu_str_deref(swarn->dev->name),
@@ -493,7 +528,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
return 0;
err:
- printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
+ printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
"resolving failed with ret=%d\n", swarn->errstr,
swarn->logical, rcu_str_deref(swarn->dev->name),
@@ -553,10 +588,11 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
do {
- ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
- &ref_root, &ref_level);
+ ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
+ item_size, &ref_root,
+ &ref_level);
printk_in_rcu(KERN_WARNING
- "btrfs: %s at logical %llu on dev %s, "
+ "BTRFS: %s at logical %llu on dev %s, "
"sector %llu: metadata %s (level %d) in tree "
"%llu\n", errstr, swarn.logical,
rcu_str_deref(dev->name),
@@ -682,8 +718,8 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
out:
if (page)
put_page(page);
- if (inode)
- iput(inode);
+
+ iput(inode);
if (ret < 0)
return ret;
@@ -705,13 +741,11 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work)
struct scrub_fixup_nodatasum *fixup;
struct scrub_ctx *sctx;
struct btrfs_trans_handle *trans = NULL;
- struct btrfs_fs_info *fs_info;
struct btrfs_path *path;
int uncorrectable = 0;
fixup = container_of(work, struct scrub_fixup_nodatasum, work);
sctx = fixup->sctx;
- fs_info = fixup->root->fs_info;
path = btrfs_alloc_path();
if (!path) {
@@ -760,8 +794,8 @@ out:
btrfs_dev_replace_stats_inc(
&sctx->dev_root->fs_info->dev_replace.
num_uncorrectable_read_errors);
- printk_ratelimited_in_rcu(KERN_ERR
- "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
+ printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
+ "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
fixup->logical, rcu_str_deref(fixup->dev->name));
}
@@ -938,8 +972,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
BTRFS_DEV_STAT_CORRUPTION_ERRS);
}
- if (sctx->readonly && !sctx->is_dev_replace)
- goto did_not_correct_error;
+ if (sctx->readonly) {
+ ASSERT(!sctx->is_dev_replace);
+ goto out;
+ }
if (!is_metadata && !have_csum) {
struct scrub_fixup_nodatasum *fixup_nodatasum;
@@ -963,9 +999,10 @@ nodatasum_case:
fixup_nodatasum->root = fs_info->extent_root;
fixup_nodatasum->mirror_num = failed_mirror_index + 1;
scrub_pending_trans_workers_inc(sctx);
- fixup_nodatasum->work.func = scrub_fixup_nodatasum;
- btrfs_queue_worker(&fs_info->scrub_workers,
- &fixup_nodatasum->work);
+ btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum,
+ NULL, NULL);
+ btrfs_queue_work(fs_info->scrub_workers,
+ &fixup_nodatasum->work);
goto out;
}
@@ -1160,7 +1197,7 @@ corrected_error:
sctx->stat.corrected_errors++;
spin_unlock(&sctx->stat_lock);
printk_ratelimited_in_rcu(KERN_ERR
- "btrfs: fixed up error at logical %llu on dev %s\n",
+ "BTRFS: fixed up error at logical %llu on dev %s\n",
logical, rcu_str_deref(dev->name));
}
} else {
@@ -1169,7 +1206,7 @@ did_not_correct_error:
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
printk_ratelimited_in_rcu(KERN_ERR
- "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
+ "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
logical, rcu_str_deref(dev->name));
}
@@ -1292,7 +1329,6 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
for (page_num = 0; page_num < sblock->page_count; page_num++) {
struct bio *bio;
struct scrub_page *page = sblock->pagev[page_num];
- DECLARE_COMPLETION_ONSTACK(complete);
if (page->dev->bdev == NULL) {
page->io_error = 1;
@@ -1308,19 +1344,12 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
continue;
}
bio->bi_bdev = page->dev->bdev;
- bio->bi_sector = page->physical >> 9;
- bio->bi_end_io = scrub_complete_bio_end_io;
- bio->bi_private = &complete;
+ bio->bi_iter.bi_sector = page->physical >> 9;
bio_add_page(bio, page->page, PAGE_SIZE, 0);
- btrfsic_submit_bio(READ, bio);
-
- /* this will also unplug the queue */
- wait_for_completion(&complete);
-
- page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ if (btrfsic_submit_bio_wait(READ, bio))
sblock->no_io_error_seen = 0;
+
bio_put(bio);
}
@@ -1389,11 +1418,6 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
sblock->checksum_error = 1;
}
-static void scrub_complete_bio_end_io(struct bio *bio, int err)
-{
- complete((struct completion *)bio->bi_private);
-}
-
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good,
int force_write)
@@ -1428,11 +1452,11 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
sblock_bad->checksum_error || page_bad->io_error) {
struct bio *bio;
int ret;
- DECLARE_COMPLETION_ONSTACK(complete);
if (!page_bad->dev->bdev) {
- printk_ratelimited(KERN_WARNING
- "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
+ printk_ratelimited(KERN_WARNING "BTRFS: "
+ "scrub_repair_page_from_good_copy(bdev == NULL) "
+ "is unexpected!\n");
return -EIO;
}
@@ -1440,20 +1464,15 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
if (!bio)
return -EIO;
bio->bi_bdev = page_bad->dev->bdev;
- bio->bi_sector = page_bad->physical >> 9;
- bio->bi_end_io = scrub_complete_bio_end_io;
- bio->bi_private = &complete;
+ bio->bi_iter.bi_sector = page_bad->physical >> 9;
ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
if (PAGE_SIZE != ret) {
bio_put(bio);
return -EIO;
}
- btrfsic_submit_bio(WRITE, bio);
- /* this will also unplug the queue */
- wait_for_completion(&complete);
- if (!bio_flagged(bio, BIO_UPTODATE)) {
+ if (btrfsic_submit_bio_wait(WRITE, bio)) {
btrfs_dev_stat_inc_and_print(page_bad->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
btrfs_dev_replace_stats_inc(
@@ -1538,7 +1557,7 @@ again:
bio->bi_private = sbio;
bio->bi_end_io = scrub_wr_bio_end_io;
bio->bi_bdev = sbio->dev->bdev;
- bio->bi_sector = sbio->physical >> 9;
+ bio->bi_iter.bi_sector = sbio->physical >> 9;
sbio->err = 0;
} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
spage->physical_for_dev_replace ||
@@ -1597,8 +1616,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err)
sbio->err = err;
sbio->bio = bio;
- sbio->work.func = scrub_wr_bio_end_io_worker;
- btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
+ btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
+ btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
}
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
@@ -1895,7 +1914,7 @@ static void scrub_submit(struct scrub_ctx *sctx)
* This case is handled correctly (but _very_ slowly).
*/
printk_ratelimited(KERN_WARNING
- "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
+ "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
bio_endio(sbio->bio, -EIO);
} else {
btrfsic_submit_bio(READ, sbio->bio);
@@ -1944,7 +1963,7 @@ again:
bio->bi_private = sbio;
bio->bi_end_io = scrub_bio_end_io;
bio->bi_bdev = sbio->dev->bdev;
- bio->bi_sector = sbio->physical >> 9;
+ bio->bi_iter.bi_sector = sbio->physical >> 9;
sbio->err = 0;
} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
spage->physical ||
@@ -2066,7 +2085,7 @@ static void scrub_bio_end_io(struct bio *bio, int err)
sbio->err = err;
sbio->bio = bio;
- btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
+ btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
}
static void scrub_bio_end_io_worker(struct btrfs_work *work)
@@ -2217,6 +2236,47 @@ behind_scrub_pages:
return 0;
}
+/*
+ * Given a physical address, this will calculate it's
+ * logical offset. if this is a parity stripe, it will return
+ * the most left data stripe's logical offset.
+ *
+ * return 0 if it is a data stripe, 1 means parity stripe.
+ */
+static int get_raid56_logic_offset(u64 physical, int num,
+ struct map_lookup *map, u64 *offset)
+{
+ int i;
+ int j = 0;
+ u64 stripe_nr;
+ u64 last_offset;
+ int stripe_index;
+ int rot;
+
+ last_offset = (physical - map->stripes[num].physical) *
+ nr_data_stripes(map);
+ *offset = last_offset;
+ for (i = 0; i < nr_data_stripes(map); i++) {
+ *offset = last_offset + i * map->stripe_len;
+
+ stripe_nr = *offset;
+ do_div(stripe_nr, map->stripe_len);
+ do_div(stripe_nr, nr_data_stripes(map));
+
+ /* Work out the disk rotation on this stripe-set */
+ rot = do_div(stripe_nr, map->num_stripes);
+ /* calculate which stripe this data locates */
+ rot += i;
+ stripe_index = rot % map->num_stripes;
+ if (stripe_index == num)
+ return 0;
+ if (stripe_index < num)
+ j++;
+ }
+ *offset = last_offset + j * map->stripe_len;
+ return 1;
+}
+
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct map_lookup *map,
struct btrfs_device *scrub_dev,
@@ -2238,6 +2298,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
u64 physical;
u64 logical;
u64 logic_end;
+ u64 physical_end;
u64 generation;
int mirror_num;
struct reada_control *reada1;
@@ -2251,16 +2312,10 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
u64 extent_len;
struct btrfs_device *extent_dev;
int extent_mirror_num;
- int stop_loop;
-
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
- if (num >= nr_data_stripes(map)) {
- return 0;
- }
- }
+ int stop_loop = 0;
nstripes = length;
+ physical = map->stripes[num].physical;
offset = 0;
do_div(nstripes, map->stripe_len);
if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
@@ -2278,6 +2333,11 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
increment = map->stripe_len;
mirror_num = num % map->num_stripes + 1;
+ } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ get_raid56_logic_offset(physical, num, map, &offset);
+ increment = map->stripe_len * nr_data_stripes(map);
+ mirror_num = 1;
} else {
increment = map->stripe_len;
mirror_num = 1;
@@ -2301,17 +2361,24 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
* to not hold off transaction commits
*/
logical = base + offset;
-
+ physical_end = physical + nstripes * map->stripe_len;
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ get_raid56_logic_offset(physical_end, num,
+ map, &logic_end);
+ logic_end += base;
+ } else {
+ logic_end = logical + increment * nstripes;
+ }
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
- atomic_inc(&fs_info->scrubs_paused);
- wake_up(&fs_info->scrub_pause_wait);
+ scrub_blocked_if_needed(fs_info);
/* FIXME it might be better to start readahead at commit root */
key_start.objectid = logical;
key_start.type = BTRFS_EXTENT_ITEM_KEY;
key_start.offset = (u64)0;
- key_end.objectid = base + offset + nstripes * increment;
+ key_end.objectid = logic_end;
key_end.type = BTRFS_METADATA_ITEM_KEY;
key_end.offset = (u64)-1;
reada1 = btrfs_reada_add(root, &key_start, &key_end);
@@ -2321,7 +2388,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
key_start.offset = logical;
key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key_end.type = BTRFS_EXTENT_CSUM_KEY;
- key_end.offset = base + offset + nstripes * increment;
+ key_end.offset = logic_end;
reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
if (!IS_ERR(reada1))
@@ -2329,16 +2396,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
if (!IS_ERR(reada2))
btrfs_reada_wait(reada2);
- mutex_lock(&fs_info->scrub_lock);
- while (atomic_read(&fs_info->scrub_pause_req)) {
- mutex_unlock(&fs_info->scrub_lock);
- wait_event(fs_info->scrub_pause_wait,
- atomic_read(&fs_info->scrub_pause_req) == 0);
- mutex_lock(&fs_info->scrub_lock);
- }
- atomic_dec(&fs_info->scrubs_paused);
- mutex_unlock(&fs_info->scrub_lock);
- wake_up(&fs_info->scrub_pause_wait);
/*
* collect all data csums for the stripe to avoid seeking during
@@ -2349,11 +2406,17 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
/*
* now find all extents for each stripe and scrub them
*/
- logical = base + offset;
- physical = map->stripes[num].physical;
- logic_end = logical + increment * nstripes;
ret = 0;
- while (logical < logic_end) {
+ while (physical < physical_end) {
+ /* for raid56, we skip parity stripe */
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ ret = get_raid56_logic_offset(physical, num,
+ map, &logical);
+ logical += base;
+ if (ret)
+ goto skip;
+ }
/*
* canceled?
*/
@@ -2375,22 +2438,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
- atomic_inc(&fs_info->scrubs_paused);
- wake_up(&fs_info->scrub_pause_wait);
- mutex_lock(&fs_info->scrub_lock);
- while (atomic_read(&fs_info->scrub_pause_req)) {
- mutex_unlock(&fs_info->scrub_lock);
- wait_event(fs_info->scrub_pause_wait,
- atomic_read(&fs_info->scrub_pause_req) == 0);
- mutex_lock(&fs_info->scrub_lock);
- }
- atomic_dec(&fs_info->scrubs_paused);
- mutex_unlock(&fs_info->scrub_lock);
- wake_up(&fs_info->scrub_pause_wait);
+ scrub_blocked_if_needed(fs_info);
}
+ if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+ key.type = BTRFS_METADATA_ITEM_KEY;
+ else
+ key.type = BTRFS_EXTENT_ITEM_KEY;
key.objectid = logical;
- key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -2398,8 +2453,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
goto out;
if (ret > 0) {
- ret = btrfs_previous_item(root, path, 0,
- BTRFS_EXTENT_ITEM_KEY);
+ ret = btrfs_previous_extent_item(root, path, 0);
if (ret < 0)
goto out;
if (ret > 0) {
@@ -2457,9 +2511,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
if (key.objectid < logical &&
(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
- printk(KERN_ERR
- "btrfs scrub: tree block %llu spanning "
- "stripes, ignored. logical=%llu\n",
+ btrfs_err(fs_info,
+ "scrub: tree block %llu spanning "
+ "stripes, ignored. logical=%llu",
key.objectid, logical);
goto next;
}
@@ -2506,15 +2560,29 @@ again:
scrub_free_csums(sctx);
if (extent_logical + extent_len <
key.objectid + bytes) {
- logical += increment;
- physical += map->stripe_len;
-
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ /*
+ * loop until we find next data stripe
+ * or we have finished all stripes.
+ */
+ do {
+ physical += map->stripe_len;
+ ret = get_raid56_logic_offset(
+ physical, num,
+ map, &logical);
+ logical += base;
+ } while (physical < physical_end && ret);
+ } else {
+ physical += map->stripe_len;
+ logical += increment;
+ }
if (logical < key.objectid + bytes) {
cond_resched();
goto again;
}
- if (logical >= logic_end) {
+ if (physical >= physical_end) {
stop_loop = 1;
break;
}
@@ -2523,6 +2591,7 @@ next:
path->slots[0]++;
}
btrfs_release_path(path);
+skip:
logical += increment;
physical += map->stripe_len;
spin_lock(&sctx->stat_lock);
@@ -2656,11 +2725,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
length = btrfs_dev_extent_length(l, dev_extent);
- if (found_key.offset + length <= start) {
- key.offset = found_key.offset + length;
- btrfs_release_path(path);
- continue;
- }
+ if (found_key.offset + length <= start)
+ goto skip;
chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
@@ -2671,10 +2737,12 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
* the chunk from going away while we scrub it
*/
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
- if (!cache) {
- ret = -ENOENT;
- break;
- }
+
+ /* some chunks are removed but not committed to disk yet,
+ * continue scrubbing */
+ if (!cache)
+ goto skip;
+
dev_replace->cursor_right = found_key.offset + length;
dev_replace->cursor_left = found_key.offset;
dev_replace->item_needs_writeback = 1;
@@ -2700,25 +2768,24 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
wait_event(sctx->list_wait,
atomic_read(&sctx->bios_in_flight) == 0);
- atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
atomic_inc(&fs_info->scrubs_paused);
wake_up(&fs_info->scrub_pause_wait);
+
+ /*
+ * must be called before we decrease @scrub_paused.
+ * make sure we don't block transaction commit while
+ * we are waiting pending workers finished.
+ */
wait_event(sctx->list_wait,
atomic_read(&sctx->workers_pending) == 0);
+ atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
mutex_lock(&fs_info->scrub_lock);
- while (atomic_read(&fs_info->scrub_pause_req)) {
- mutex_unlock(&fs_info->scrub_lock);
- wait_event(fs_info->scrub_pause_wait,
- atomic_read(&fs_info->scrub_pause_req) == 0);
- mutex_lock(&fs_info->scrub_lock);
- }
+ __scrub_blocked_if_needed(fs_info);
atomic_dec(&fs_info->scrubs_paused);
mutex_unlock(&fs_info->scrub_lock);
wake_up(&fs_info->scrub_pause_wait);
- dev_replace->cursor_left = dev_replace->cursor_right;
- dev_replace->item_needs_writeback = 1;
btrfs_put_block_group(cache);
if (ret)
break;
@@ -2732,6 +2799,9 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
break;
}
+ dev_replace->cursor_left = dev_replace->cursor_right;
+ dev_replace->item_needs_writeback = 1;
+skip:
key.offset = found_key.offset + length;
btrfs_release_path(path);
}
@@ -2782,52 +2852,49 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
int is_dev_replace)
{
int ret = 0;
+ int flags = WQ_FREEZABLE | WQ_UNBOUND;
+ int max_active = fs_info->thread_pool_size;
- mutex_lock(&fs_info->scrub_lock);
if (fs_info->scrub_workers_refcnt == 0) {
if (is_dev_replace)
- btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
- &fs_info->generic_worker);
+ fs_info->scrub_workers =
+ btrfs_alloc_workqueue("btrfs-scrub", flags,
+ 1, 4);
else
- btrfs_init_workers(&fs_info->scrub_workers, "scrub",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- fs_info->scrub_workers.idle_thresh = 4;
- ret = btrfs_start_workers(&fs_info->scrub_workers);
- if (ret)
+ fs_info->scrub_workers =
+ btrfs_alloc_workqueue("btrfs-scrub", flags,
+ max_active, 4);
+ if (!fs_info->scrub_workers) {
+ ret = -ENOMEM;
goto out;
- btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
- "scrubwrc",
- fs_info->thread_pool_size,
- &fs_info->generic_worker);
- fs_info->scrub_wr_completion_workers.idle_thresh = 2;
- ret = btrfs_start_workers(
- &fs_info->scrub_wr_completion_workers);
- if (ret)
+ }
+ fs_info->scrub_wr_completion_workers =
+ btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
+ max_active, 2);
+ if (!fs_info->scrub_wr_completion_workers) {
+ ret = -ENOMEM;
goto out;
- btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
- &fs_info->generic_worker);
- ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
- if (ret)
+ }
+ fs_info->scrub_nocow_workers =
+ btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
+ if (!fs_info->scrub_nocow_workers) {
+ ret = -ENOMEM;
goto out;
+ }
}
++fs_info->scrub_workers_refcnt;
out:
- mutex_unlock(&fs_info->scrub_lock);
-
return ret;
}
static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
{
- mutex_lock(&fs_info->scrub_lock);
if (--fs_info->scrub_workers_refcnt == 0) {
- btrfs_stop_workers(&fs_info->scrub_workers);
- btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
- btrfs_stop_workers(&fs_info->scrub_nocow_workers);
+ btrfs_destroy_workqueue(fs_info->scrub_workers);
+ btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+ btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
}
WARN_ON(fs_info->scrub_workers_refcnt < 0);
- mutex_unlock(&fs_info->scrub_lock);
}
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
@@ -2845,8 +2912,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
* check some assumptions
*/
if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
- printk(KERN_ERR
- "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
+ btrfs_err(fs_info,
+ "scrub: size assumption nodesize == leafsize (%d == %d) fails",
fs_info->chunk_root->nodesize,
fs_info->chunk_root->leafsize);
return -EINVAL;
@@ -2858,16 +2925,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
* the way scrub is implemented. Do not handle this
* situation at all because it won't ever happen.
*/
- printk(KERN_ERR
- "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
+ btrfs_err(fs_info,
+ "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
return -EINVAL;
}
if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
/* not supported for data w/o checksums */
- printk(KERN_ERR
- "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails\n",
+ btrfs_err(fs_info,
+ "scrub: size assumption sectorsize != PAGE_SIZE "
+ "(%d != %lu) fails",
fs_info->chunk_root->sectorsize, PAGE_SIZE);
return -EINVAL;
}
@@ -2880,7 +2948,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
* would exhaust the array bounds of pagev member in
* struct scrub_block
*/
- pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
+ btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize "
+ "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
fs_info->chunk_root->nodesize,
SCRUB_MAX_PAGES_PER_BLOCK,
fs_info->chunk_root->sectorsize,
@@ -2888,23 +2957,18 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
return -EINVAL;
}
- ret = scrub_workers_get(fs_info, is_dev_replace);
- if (ret)
- return ret;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
dev = btrfs_find_device(fs_info, devid, NULL, NULL);
if (!dev || (dev->missing && !is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- scrub_workers_put(fs_info);
return -ENODEV;
}
- mutex_lock(&fs_info->scrub_lock);
+ mutex_lock(&fs_info->scrub_lock);
if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- scrub_workers_put(fs_info);
return -EIO;
}
@@ -2915,10 +2979,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
btrfs_dev_replace_unlock(&fs_info->dev_replace);
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- scrub_workers_put(fs_info);
return -EINPROGRESS;
}
btrfs_dev_replace_unlock(&fs_info->dev_replace);
+
+ ret = scrub_workers_get(fs_info, is_dev_replace);
+ if (ret) {
+ mutex_unlock(&fs_info->scrub_lock);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ return ret;
+ }
+
sctx = scrub_setup_ctx(dev, is_dev_replace);
if (IS_ERR(sctx)) {
mutex_unlock(&fs_info->scrub_lock);
@@ -2928,15 +2999,24 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
}
sctx->readonly = readonly;
dev->scrub_device = sctx;
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ /*
+ * checking @scrub_pause_req here, we can avoid
+ * race between committing transaction and scrubbing.
+ */
+ __scrub_blocked_if_needed(fs_info);
atomic_inc(&fs_info->scrubs_running);
mutex_unlock(&fs_info->scrub_lock);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
if (!is_dev_replace) {
- down_read(&fs_info->scrub_super_lock);
+ /*
+ * by holding device list mutex, we can
+ * kick off writing super in log tree sync.
+ */
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
ret = scrub_supers(sctx, dev);
- up_read(&fs_info->scrub_super_lock);
+ mutex_unlock(&fs_info->fs_devices->device_list_mutex);
}
if (!ret)
@@ -2954,10 +3034,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_lock(&fs_info->scrub_lock);
dev->scrub_device = NULL;
+ scrub_workers_put(fs_info);
mutex_unlock(&fs_info->scrub_lock);
scrub_free_ctx(sctx);
- scrub_workers_put(fs_info);
return ret;
}
@@ -2987,16 +3067,6 @@ void btrfs_scrub_continue(struct btrfs_root *root)
wake_up(&fs_info->scrub_pause_wait);
}
-void btrfs_scrub_pause_super(struct btrfs_root *root)
-{
- down_write(&root->fs_info->scrub_super_lock);
-}
-
-void btrfs_scrub_continue_super(struct btrfs_root *root)
-{
- up_write(&root->fs_info->scrub_super_lock);
-}
-
int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
{
mutex_lock(&fs_info->scrub_lock);
@@ -3133,10 +3203,10 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
nocow_ctx->len = len;
nocow_ctx->mirror_num = mirror_num;
nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
- nocow_ctx->work.func = copy_nocow_pages_worker;
+ btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL);
INIT_LIST_HEAD(&nocow_ctx->inodes);
- btrfs_queue_worker(&fs_info->scrub_nocow_workers,
- &nocow_ctx->work);
+ btrfs_queue_work(fs_info->scrub_nocow_workers,
+ &nocow_ctx->work);
return 0;
}
@@ -3195,7 +3265,8 @@ static void copy_nocow_pages_worker(struct btrfs_work *work)
ret = iterate_inodes_from_logical(logical, fs_info, path,
record_inode_for_nocow, nocow_ctx);
if (ret != 0 && ret != -ENOENT) {
- pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d\n",
+ btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, "
+ "phys %llu, len %llu, mir %u, ret %d",
logical, physical_for_dev_replace, len, mirror_num,
ret);
not_written = 1;
@@ -3317,7 +3388,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
again:
page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
if (!page) {
- pr_err("find_or_create_page() failed\n");
+ btrfs_err(fs_info, "find_or_create_page() failed");
ret = -ENOMEM;
goto out;
}
@@ -3383,14 +3454,13 @@ static int write_page_nocow(struct scrub_ctx *sctx,
struct bio *bio;
struct btrfs_device *dev;
int ret;
- DECLARE_COMPLETION_ONSTACK(compl);
dev = sctx->wr_ctx.tgtdev;
if (!dev)
return -EIO;
if (!dev->bdev) {
printk_ratelimited(KERN_WARNING
- "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
+ "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
return -EIO;
}
bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
@@ -3400,10 +3470,8 @@ static int write_page_nocow(struct scrub_ctx *sctx,
spin_unlock(&sctx->stat_lock);
return -ENOMEM;
}
- bio->bi_private = &compl;
- bio->bi_end_io = scrub_complete_bio_end_io;
- bio->bi_size = 0;
- bio->bi_sector = physical_for_dev_replace >> 9;
+ bio->bi_iter.bi_size = 0;
+ bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
bio->bi_bdev = dev->bdev;
ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
if (ret != PAGE_CACHE_SIZE) {
@@ -3412,10 +3480,8 @@ leave_with_eio:
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
return -EIO;
}
- btrfsic_submit_bio(WRITE_SYNC, bio);
- wait_for_completion(&compl);
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))
goto leave_with_eio;
bio_put(bio);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e46e0ed7492..6528aa66218 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -24,12 +24,12 @@
#include <linux/xattr.h>
#include <linux/posix_acl_xattr.h>
#include <linux/radix-tree.h>
-#include <linux/crc32c.h>
#include <linux/vmalloc.h>
#include <linux/string.h>
#include "send.h"
#include "backref.h"
+#include "hash.h"
#include "locking.h"
#include "disk-io.h"
#include "btrfs_inode.h"
@@ -51,15 +51,18 @@ struct fs_path {
struct {
char *start;
char *end;
- char *prepared;
char *buf;
- int buf_len;
- unsigned int reversed:1;
- unsigned int virtual_mem:1;
+ unsigned short buf_len:15;
+ unsigned short reversed:1;
char inline_buf[];
};
- char pad[PAGE_SIZE];
+ /*
+ * Average path length does not exceed 200 bytes, we'll have
+ * better packing in the slab and higher chance to satisfy
+ * a allocation later during send.
+ */
+ char pad[256];
};
};
#define FS_PATH_INLINE_SIZE \
@@ -88,8 +91,6 @@ struct send_ctx {
u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
- struct vfsmount *mnt;
-
struct btrfs_root *send_root;
struct btrfs_root *parent_root;
struct clone_root *clone_roots;
@@ -111,6 +112,8 @@ struct send_ctx {
int cur_inode_deleted;
u64 cur_inode_size;
u64 cur_inode_mode;
+ u64 cur_inode_rdev;
+ u64 cur_inode_last_extent;
u64 send_progress;
@@ -121,8 +124,130 @@ struct send_ctx {
struct list_head name_cache_list;
int name_cache_size;
- struct file *cur_inode_filp;
+ struct file_ra_state ra;
+
char *read_buf;
+
+ /*
+ * We process inodes by their increasing order, so if before an
+ * incremental send we reverse the parent/child relationship of
+ * directories such that a directory with a lower inode number was
+ * the parent of a directory with a higher inode number, and the one
+ * becoming the new parent got renamed too, we can't rename/move the
+ * directory with lower inode number when we finish processing it - we
+ * must process the directory with higher inode number first, then
+ * rename/move it and then rename/move the directory with lower inode
+ * number. Example follows.
+ *
+ * Tree state when the first send was performed:
+ *
+ * .
+ * |-- a (ino 257)
+ * |-- b (ino 258)
+ * |
+ * |
+ * |-- c (ino 259)
+ * | |-- d (ino 260)
+ * |
+ * |-- c2 (ino 261)
+ *
+ * Tree state when the second (incremental) send is performed:
+ *
+ * .
+ * |-- a (ino 257)
+ * |-- b (ino 258)
+ * |-- c2 (ino 261)
+ * |-- d2 (ino 260)
+ * |-- cc (ino 259)
+ *
+ * The sequence of steps that lead to the second state was:
+ *
+ * mv /a/b/c/d /a/b/c2/d2
+ * mv /a/b/c /a/b/c2/d2/cc
+ *
+ * "c" has lower inode number, but we can't move it (2nd mv operation)
+ * before we move "d", which has higher inode number.
+ *
+ * So we just memorize which move/rename operations must be performed
+ * later when their respective parent is processed and moved/renamed.
+ */
+
+ /* Indexed by parent directory inode number. */
+ struct rb_root pending_dir_moves;
+
+ /*
+ * Reverse index, indexed by the inode number of a directory that
+ * is waiting for the move/rename of its immediate parent before its
+ * own move/rename can be performed.
+ */
+ struct rb_root waiting_dir_moves;
+
+ /*
+ * A directory that is going to be rm'ed might have a child directory
+ * which is in the pending directory moves index above. In this case,
+ * the directory can only be removed after the move/rename of its child
+ * is performed. Example:
+ *
+ * Parent snapshot:
+ *
+ * . (ino 256)
+ * |-- a/ (ino 257)
+ * |-- b/ (ino 258)
+ * |-- c/ (ino 259)
+ * | |-- x/ (ino 260)
+ * |
+ * |-- y/ (ino 261)
+ *
+ * Send snapshot:
+ *
+ * . (ino 256)
+ * |-- a/ (ino 257)
+ * |-- b/ (ino 258)
+ * |-- YY/ (ino 261)
+ * |-- x/ (ino 260)
+ *
+ * Sequence of steps that lead to the send snapshot:
+ * rm -f /a/b/c/foo.txt
+ * mv /a/b/y /a/b/YY
+ * mv /a/b/c/x /a/b/YY
+ * rmdir /a/b/c
+ *
+ * When the child is processed, its move/rename is delayed until its
+ * parent is processed (as explained above), but all other operations
+ * like update utimes, chown, chgrp, etc, are performed and the paths
+ * that it uses for those operations must use the orphanized name of
+ * its parent (the directory we're going to rm later), so we need to
+ * memorize that name.
+ *
+ * Indexed by the inode number of the directory to be deleted.
+ */
+ struct rb_root orphan_dirs;
+};
+
+struct pending_dir_move {
+ struct rb_node node;
+ struct list_head list;
+ u64 parent_ino;
+ u64 ino;
+ u64 gen;
+ struct list_head update_refs;
+};
+
+struct waiting_dir_move {
+ struct rb_node node;
+ u64 ino;
+ /*
+ * There might be some directory that could not be removed because it
+ * was waiting for this directory inode to be moved first. Therefore
+ * after this directory is moved, we can try to rmdir the ino rmdir_ino.
+ */
+ u64 rmdir_ino;
+};
+
+struct orphan_dir_info {
+ struct rb_node node;
+ u64 ino;
+ u64 gen;
};
struct name_cache_entry {
@@ -146,6 +271,20 @@ struct name_cache_entry {
char name[];
};
+static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
+
+static struct waiting_dir_move *
+get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino);
+
+static int need_send_hole(struct send_ctx *sctx)
+{
+ return (sctx->parent_root && !sctx->cur_inode_new &&
+ !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
+ S_ISREG(sctx->cur_inode_mode));
+}
+
static void fs_path_reset(struct fs_path *p)
{
if (p->reversed) {
@@ -167,7 +306,6 @@ static struct fs_path *fs_path_alloc(void)
if (!p)
return NULL;
p->reversed = 0;
- p->virtual_mem = 0;
p->buf = p->inline_buf;
p->buf_len = FS_PATH_INLINE_SIZE;
fs_path_reset(p);
@@ -190,12 +328,8 @@ static void fs_path_free(struct fs_path *p)
{
if (!p)
return;
- if (p->buf != p->inline_buf) {
- if (p->virtual_mem)
- vfree(p->buf);
- else
- kfree(p->buf);
- }
+ if (p->buf != p->inline_buf)
+ kfree(p->buf);
kfree(p);
}
@@ -215,42 +349,33 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
if (p->buf_len >= len)
return 0;
+ if (len > PATH_MAX) {
+ WARN_ON(1);
+ return -ENOMEM;
+ }
+
path_len = p->end - p->start;
old_buf_len = p->buf_len;
- len = PAGE_ALIGN(len);
+ /*
+ * First time the inline_buf does not suffice
+ */
if (p->buf == p->inline_buf) {
- tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN);
- if (!tmp_buf) {
- tmp_buf = vmalloc(len);
- if (!tmp_buf)
- return -ENOMEM;
- p->virtual_mem = 1;
- }
- memcpy(tmp_buf, p->buf, p->buf_len);
- p->buf = tmp_buf;
- p->buf_len = len;
+ tmp_buf = kmalloc(len, GFP_NOFS);
+ if (tmp_buf)
+ memcpy(tmp_buf, p->buf, old_buf_len);
} else {
- if (p->virtual_mem) {
- tmp_buf = vmalloc(len);
- if (!tmp_buf)
- return -ENOMEM;
- memcpy(tmp_buf, p->buf, p->buf_len);
- vfree(p->buf);
- } else {
- tmp_buf = krealloc(p->buf, len, GFP_NOFS);
- if (!tmp_buf) {
- tmp_buf = vmalloc(len);
- if (!tmp_buf)
- return -ENOMEM;
- memcpy(tmp_buf, p->buf, p->buf_len);
- kfree(p->buf);
- p->virtual_mem = 1;
- }
- }
- p->buf = tmp_buf;
- p->buf_len = len;
+ tmp_buf = krealloc(p->buf, len, GFP_NOFS);
}
+ if (!tmp_buf)
+ return -ENOMEM;
+ p->buf = tmp_buf;
+ /*
+ * The real size of the buffer is bigger, this will let the fast path
+ * happen most of the time
+ */
+ p->buf_len = ksize(p->buf);
+
if (p->reversed) {
tmp_buf = p->buf + old_buf_len - path_len - 1;
p->end = p->buf + p->buf_len - 1;
@@ -263,7 +388,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len)
return 0;
}
-static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
+static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
+ char **prepared)
{
int ret;
int new_len;
@@ -279,11 +405,11 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len)
if (p->start != p->end)
*--p->start = '/';
p->start -= name_len;
- p->prepared = p->start;
+ *prepared = p->start;
} else {
if (p->start != p->end)
*p->end++ = '/';
- p->prepared = p->end;
+ *prepared = p->end;
p->end += name_len;
*p->end = 0;
}
@@ -295,12 +421,12 @@ out:
static int fs_path_add(struct fs_path *p, const char *name, int name_len)
{
int ret;
+ char *prepared;
- ret = fs_path_prepare_for_add(p, name_len);
+ ret = fs_path_prepare_for_add(p, name_len, &prepared);
if (ret < 0)
goto out;
- memcpy(p->prepared, name, name_len);
- p->prepared = NULL;
+ memcpy(prepared, name, name_len);
out:
return ret;
@@ -309,12 +435,12 @@ out:
static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
{
int ret;
+ char *prepared;
- ret = fs_path_prepare_for_add(p, p2->end - p2->start);
+ ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
if (ret < 0)
goto out;
- memcpy(p->prepared, p2->start, p2->end - p2->start);
- p->prepared = NULL;
+ memcpy(prepared, p2->start, p2->end - p2->start);
out:
return ret;
@@ -325,28 +451,18 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p,
unsigned long off, int len)
{
int ret;
+ char *prepared;
- ret = fs_path_prepare_for_add(p, len);
+ ret = fs_path_prepare_for_add(p, len, &prepared);
if (ret < 0)
goto out;
- read_extent_buffer(eb, p->prepared, off, len);
- p->prepared = NULL;
+ read_extent_buffer(eb, prepared, off, len);
out:
return ret;
}
-#if 0
-static void fs_path_remove(struct fs_path *p)
-{
- BUG_ON(p->reversed);
- while (p->start != p->end && *p->end != '/')
- p->end--;
- *p->end = 0;
-}
-#endif
-
static int fs_path_copy(struct fs_path *p, struct fs_path *from)
{
int ret;
@@ -385,6 +501,7 @@ static struct btrfs_path *alloc_path_for_send(void)
return NULL;
path->search_commit_root = 1;
path->skip_locking = 1;
+ path->need_commit_sem = 1;
return path;
}
@@ -437,30 +554,15 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
return 0;
}
-#if 0
-static int tlv_put_u8(struct send_ctx *sctx, u16 attr, u8 value)
-{
- return tlv_put(sctx, attr, &value, sizeof(value));
-}
-
-static int tlv_put_u16(struct send_ctx *sctx, u16 attr, u16 value)
-{
- __le16 tmp = cpu_to_le16(value);
- return tlv_put(sctx, attr, &tmp, sizeof(tmp));
-}
-
-static int tlv_put_u32(struct send_ctx *sctx, u16 attr, u32 value)
-{
- __le32 tmp = cpu_to_le32(value);
- return tlv_put(sctx, attr, &tmp, sizeof(tmp));
-}
-#endif
+#define TLV_PUT_DEFINE_INT(bits) \
+ static int tlv_put_u##bits(struct send_ctx *sctx, \
+ u##bits attr, u##bits value) \
+ { \
+ __le##bits __tmp = cpu_to_le##bits(value); \
+ return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \
+ }
-static int tlv_put_u64(struct send_ctx *sctx, u16 attr, u64 value)
-{
- __le64 tmp = cpu_to_le64(value);
- return tlv_put(sctx, attr, &tmp, sizeof(tmp));
-}
+TLV_PUT_DEFINE_INT(64)
static int tlv_put_string(struct send_ctx *sctx, u16 attr,
const char *str, int len)
@@ -476,17 +578,6 @@ static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);
}
-#if 0
-static int tlv_put_timespec(struct send_ctx *sctx, u16 attr,
- struct timespec *ts)
-{
- struct btrfs_timespec bts;
- bts.sec = cpu_to_le64(ts->tv_sec);
- bts.nsec = cpu_to_le32(ts->tv_nsec);
- return tlv_put(sctx, attr, &bts, sizeof(bts));
-}
-#endif
-
static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
struct extent_buffer *eb,
struct btrfs_timespec *ts)
@@ -534,12 +625,6 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
if (ret < 0) \
goto tlv_put_failure; \
} while (0)
-#define TLV_PUT_TIMESPEC(sctx, attrtype, ts) \
- do { \
- ret = tlv_put_timespec(sctx, attrtype, ts); \
- if (ret < 0) \
- goto tlv_put_failure; \
- } while (0)
#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
do { \
ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
@@ -565,10 +650,8 @@ static int begin_cmd(struct send_ctx *sctx, int cmd)
{
struct btrfs_cmd_header *hdr;
- if (!sctx->send_buf) {
- WARN_ON(1);
+ if (WARN_ON(!sctx->send_buf))
return -EINVAL;
- }
BUG_ON(sctx->send_size);
@@ -589,7 +672,7 @@ static int send_cmd(struct send_ctx *sctx)
hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
hdr->crc = 0;
- crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
+ crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
hdr->crc = cpu_to_le32(crc);
ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
@@ -697,29 +780,22 @@ out:
/*
* Helper function to retrieve some fields from an inode item.
*/
-static int get_inode_info(struct btrfs_root *root,
- u64 ino, u64 *size, u64 *gen,
- u64 *mode, u64 *uid, u64 *gid,
- u64 *rdev)
+static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
+ u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid,
+ u64 *gid, u64 *rdev)
{
int ret;
struct btrfs_inode_item *ii;
struct btrfs_key key;
- struct btrfs_path *path;
-
- path = alloc_path_for_send();
- if (!path)
- return -ENOMEM;
key.objectid = ino;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
- goto out;
if (ret) {
- ret = -ENOENT;
- goto out;
+ if (ret > 0)
+ ret = -ENOENT;
+ return ret;
}
ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -737,7 +813,22 @@ static int get_inode_info(struct btrfs_root *root,
if (rdev)
*rdev = btrfs_inode_rdev(path->nodes[0], ii);
-out:
+ return ret;
+}
+
+static int get_inode_info(struct btrfs_root *root,
+ u64 ino, u64 *size, u64 *gen,
+ u64 *mode, u64 *uid, u64 *gid,
+ u64 *rdev)
+{
+ struct btrfs_path *path;
+ int ret;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+ ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid,
+ rdev);
btrfs_free_path(path);
return ret;
}
@@ -791,7 +882,7 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
if (found_key->type == BTRFS_INODE_REF_KEY) {
ptr = (unsigned long)btrfs_item_ptr(eb, slot,
struct btrfs_inode_ref);
- item = btrfs_item_nr(eb, slot);
+ item = btrfs_item_nr(slot);
total = btrfs_item_size(eb, item);
elem_size = sizeof(*iref);
} else {
@@ -884,9 +975,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_dir_item *di;
struct btrfs_key di_key;
char *buf = NULL;
- char *buf2 = NULL;
int buf_len;
- int buf_virtual = 0;
u32 name_len;
u32 data_len;
u32 cur;
@@ -896,7 +985,11 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
int num;
u8 type;
- buf_len = PAGE_SIZE;
+ if (found_key->type == BTRFS_XATTR_ITEM_KEY)
+ buf_len = BTRFS_MAX_XATTR_SIZE(root);
+ else
+ buf_len = PATH_MAX;
+
buf = kmalloc(buf_len, GFP_NOFS);
if (!buf) {
ret = -ENOMEM;
@@ -905,7 +998,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
eb = path->nodes[0];
slot = path->slots[0];
- item = btrfs_item_nr(eb, slot);
+ item = btrfs_item_nr(slot);
di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
cur = 0;
len = 0;
@@ -918,30 +1011,23 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
type = btrfs_dir_type(eb, di);
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
- if (name_len + data_len > buf_len) {
- buf_len = PAGE_ALIGN(name_len + data_len);
- if (buf_virtual) {
- buf2 = vmalloc(buf_len);
- if (!buf2) {
- ret = -ENOMEM;
- goto out;
- }
- vfree(buf);
- } else {
- buf2 = krealloc(buf, buf_len, GFP_NOFS);
- if (!buf2) {
- buf2 = vmalloc(buf_len);
- if (!buf2) {
- ret = -ENOMEM;
- goto out;
- }
- kfree(buf);
- buf_virtual = 1;
- }
+ if (type == BTRFS_FT_XATTR) {
+ if (name_len > XATTR_NAME_MAX) {
+ ret = -ENAMETOOLONG;
+ goto out;
+ }
+ if (name_len + data_len > buf_len) {
+ ret = -E2BIG;
+ goto out;
+ }
+ } else {
+ /*
+ * Path too long
+ */
+ if (name_len + data_len > buf_len) {
+ ret = -ENAMETOOLONG;
+ goto out;
}
-
- buf = buf2;
- buf2 = NULL;
}
read_extent_buffer(eb, buf, (unsigned long)(di + 1),
@@ -964,10 +1050,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
}
out:
- if (buf_virtual)
- vfree(buf);
- else
- kfree(buf);
+ kfree(buf);
return ret;
}
@@ -1035,6 +1118,7 @@ out:
struct backref_ctx {
struct send_ctx *sctx;
+ struct btrfs_path *path;
/* number of total found references */
u64 found;
@@ -1105,8 +1189,9 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
* There are inodes that have extents that lie behind its i_size. Don't
* accept clones from these extents.
*/
- ret = get_inode_info(found->root, ino, &i_size, NULL, NULL, NULL, NULL,
- NULL);
+ ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL,
+ NULL, NULL, NULL);
+ btrfs_release_path(bctx->path);
if (ret < 0)
return ret;
@@ -1185,12 +1270,17 @@ static int find_extent_clone(struct send_ctx *sctx,
if (!tmp_path)
return -ENOMEM;
+ /* We only use this path under the commit sem */
+ tmp_path->need_commit_sem = 0;
+
backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_NOFS);
if (!backref_ctx) {
ret = -ENOMEM;
goto out;
}
+ backref_ctx->path = tmp_path;
+
if (data_offset >= ino_size) {
/*
* There may be extents that lie behind the file's size.
@@ -1218,8 +1308,10 @@ static int find_extent_clone(struct send_ctx *sctx,
}
logical = disk_byte + btrfs_file_extent_offset(eb, fi);
+ down_read(&sctx->send_root->fs_info->commit_root_sem);
ret = extent_from_logical(sctx->send_root->fs_info, disk_byte, tmp_path,
&found_key, &flags);
+ up_read(&sctx->send_root->fs_info->commit_root_sem);
btrfs_release_path(tmp_path);
if (ret < 0)
@@ -1261,8 +1353,6 @@ static int find_extent_clone(struct send_ctx *sctx,
extent_item_pos = logical - found_key.objectid;
else
extent_item_pos = 0;
-
- extent_item_pos = logical - found_key.objectid;
ret = iterate_extent_inodes(sctx->send_root->fs_info,
found_key.objectid, extent_item_pos, 1,
__iterate_backrefs, backref_ctx);
@@ -1273,9 +1363,9 @@ static int find_extent_clone(struct send_ctx *sctx,
if (!backref_ctx->found_itself) {
/* found a bug in backref code? */
ret = -EIO;
- printk(KERN_ERR "btrfs: ERROR did not find backref in "
+ btrfs_err(sctx->send_root->fs_info, "did not find backref in "
"send_root. inode=%llu, offset=%llu, "
- "disk_byte=%llu found extent=%llu\n",
+ "disk_byte=%llu found extent=%llu",
ino, data_offset, disk_byte, found_key.objectid);
goto out;
}
@@ -1301,6 +1391,16 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
}
if (cur_clone_root) {
+ if (compressed != BTRFS_COMPRESS_NONE) {
+ /*
+ * Offsets given by iterate_extent_inodes() are relative
+ * to the start of the extent, we need to add logical
+ * offset from the file extent item.
+ * (See why at backref.c:check_extent_in_eb())
+ */
+ cur_clone_root->offset += btrfs_file_extent_offset(eb,
+ fi);
+ }
*found = cur_clone_root;
ret = 0;
} else {
@@ -1346,7 +1446,7 @@ static int read_symlink(struct btrfs_root *root,
BUG_ON(compression);
off = btrfs_file_extent_inline_start(ei);
- len = btrfs_file_extent_inline_len(path->nodes[0], ei);
+ len = btrfs_file_extent_inline_len(path->nodes[0], path->slots[0], ei);
ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
@@ -1375,13 +1475,9 @@ static int gen_unique_name(struct send_ctx *sctx,
return -ENOMEM;
while (1) {
- len = snprintf(tmp, sizeof(tmp) - 1, "o%llu-%llu-%llu",
+ len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
ino, gen, idx);
- if (len >= sizeof(tmp)) {
- /* should really not happen */
- ret = -EOVERFLOW;
- goto out;
- }
+ ASSERT(len < sizeof(tmp));
di = btrfs_lookup_dir_item(NULL, sctx->send_root,
path, BTRFS_FIRST_FREE_OBJECTID,
@@ -1548,6 +1644,10 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
goto out;
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
+ if (key.type == BTRFS_ROOT_ITEM_KEY) {
+ ret = -ENOENT;
+ goto out;
+ }
*found_inode = key.objectid;
*found_type = btrfs_dir_type(path->nodes[0], di);
@@ -1591,7 +1691,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
goto out;
}
- if (key.type == BTRFS_INODE_REF_KEY) {
+ if (found_key.type == BTRFS_INODE_REF_KEY) {
struct btrfs_inode_ref *iref;
iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
@@ -1613,10 +1713,12 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
goto out;
btrfs_release_path(path);
- ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, NULL,
- NULL, NULL);
- if (ret < 0)
- goto out;
+ if (dir_gen) {
+ ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL,
+ NULL, NULL, NULL);
+ if (ret < 0)
+ goto out;
+ }
*dir = parent_dir;
@@ -1632,13 +1734,12 @@ static int is_first_ref(struct btrfs_root *root,
int ret;
struct fs_path *tmp_name;
u64 tmp_dir;
- u64 tmp_dir_gen;
tmp_name = fs_path_alloc();
if (!tmp_name)
return -ENOMEM;
- ret = get_first_ref(root, ino, &tmp_dir, &tmp_dir_gen, tmp_name);
+ ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name);
if (ret < 0)
goto out;
@@ -1857,13 +1958,20 @@ static void name_cache_delete(struct send_ctx *sctx,
nce_head = radix_tree_lookup(&sctx->name_cache,
(unsigned long)nce->ino);
- BUG_ON(!nce_head);
+ if (!nce_head) {
+ btrfs_err(sctx->send_root->fs_info,
+ "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
+ nce->ino, sctx->name_cache_size);
+ }
list_del(&nce->radix_list);
list_del(&nce->list);
sctx->name_cache_size--;
- if (list_empty(nce_head)) {
+ /*
+ * We may not get to the final release of nce_head if the lookup fails
+ */
+ if (nce_head && list_empty(nce_head)) {
radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
kfree(nce_head);
}
@@ -1942,7 +2050,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
{
int ret;
int nce_ret;
- struct btrfs_path *path = NULL;
struct name_cache_entry *nce = NULL;
/*
@@ -1968,10 +2075,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx,
}
}
- path = alloc_path_for_send();
- if (!path)
- return -ENOMEM;
-
/*
* If the inode is not existent yet, add the orphan name and return 1.
* This should only happen for the parent dir that we determine in
@@ -2047,7 +2150,6 @@ out_cache:
name_cache_clean_unused(sctx);
out:
- btrfs_free_path(path);
return ret;
}
@@ -2097,12 +2199,27 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
fs_path_reset(name);
- ret = __get_cur_name_and_parent(sctx, ino, gen,
- &parent_inode, &parent_gen, name);
+ if (is_waiting_for_rm(sctx, ino)) {
+ ret = gen_unique_name(sctx, ino, gen, name);
+ if (ret < 0)
+ goto out;
+ ret = fs_path_add_path(dest, name);
+ break;
+ }
+
+ if (is_waiting_for_move(sctx, ino)) {
+ ret = get_first_ref(sctx->parent_root, ino,
+ &parent_inode, &parent_gen, name);
+ } else {
+ ret = __get_cur_name_and_parent(sctx, ino, gen,
+ &parent_inode,
+ &parent_gen, name);
+ if (ret)
+ stop = 1;
+ }
+
if (ret < 0)
goto out;
- if (ret)
- stop = 1;
ret = fs_path_add_path(dest, name);
if (ret < 0)
@@ -2120,77 +2237,6 @@ out:
}
/*
- * Called for regular files when sending extents data. Opens a struct file
- * to read from the file.
- */
-static int open_cur_inode_file(struct send_ctx *sctx)
-{
- int ret = 0;
- struct btrfs_key key;
- struct path path;
- struct inode *inode;
- struct dentry *dentry;
- struct file *filp;
- int new = 0;
-
- if (sctx->cur_inode_filp)
- goto out;
-
- key.objectid = sctx->cur_ino;
- key.type = BTRFS_INODE_ITEM_KEY;
- key.offset = 0;
-
- inode = btrfs_iget(sctx->send_root->fs_info->sb, &key, sctx->send_root,
- &new);
- if (IS_ERR(inode)) {
- ret = PTR_ERR(inode);
- goto out;
- }
-
- dentry = d_obtain_alias(inode);
- inode = NULL;
- if (IS_ERR(dentry)) {
- ret = PTR_ERR(dentry);
- goto out;
- }
-
- path.mnt = sctx->mnt;
- path.dentry = dentry;
- filp = dentry_open(&path, O_RDONLY | O_LARGEFILE, current_cred());
- dput(dentry);
- dentry = NULL;
- if (IS_ERR(filp)) {
- ret = PTR_ERR(filp);
- goto out;
- }
- sctx->cur_inode_filp = filp;
-
-out:
- /*
- * no xxxput required here as every vfs op
- * does it by itself on failure
- */
- return ret;
-}
-
-/*
- * Closes the struct file that was created in open_cur_inode_file
- */
-static int close_cur_inode_file(struct send_ctx *sctx)
-{
- int ret = 0;
-
- if (!sctx->cur_inode_filp)
- goto out;
-
- ret = filp_close(sctx->cur_inode_filp, NULL);
- sctx->cur_inode_filp = NULL;
-
-out:
- return ret;
-}
-
-/*
* Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
*/
static int send_subvol_begin(struct send_ctx *sctx)
@@ -2205,7 +2251,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
char *name = NULL;
int namelen;
- path = alloc_path_for_send();
+ path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -2254,12 +2300,12 @@ static int send_subvol_begin(struct send_ctx *sctx)
TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
sctx->send_root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
- sctx->send_root->root_item.ctransid);
+ le64_to_cpu(sctx->send_root->root_item.ctransid));
if (parent_root) {
TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
sctx->parent_root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
- sctx->parent_root->root_item.ctransid);
+ le64_to_cpu(sctx->parent_root->root_item.ctransid));
}
ret = send_cmd(sctx);
@@ -2437,10 +2483,16 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
if (!p)
return -ENOMEM;
- ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, NULL,
- NULL, &rdev);
- if (ret < 0)
- goto out;
+ if (ino != sctx->cur_ino) {
+ ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
+ NULL, NULL, &rdev);
+ if (ret < 0)
+ goto out;
+ } else {
+ gen = sctx->cur_inode_gen;
+ mode = sctx->cur_inode_mode;
+ rdev = sctx->cur_inode_rdev;
+ }
if (S_ISREG(mode)) {
cmd = BTRFS_SEND_C_MKFILE;
@@ -2520,17 +2572,26 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
+ ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
while (1) {
- ret = btrfs_search_slot_for_read(sctx->send_root, &key, path,
- 1, 0);
- if (ret < 0)
- goto out;
- if (!ret) {
- eb = path->nodes[0];
- slot = path->slots[0];
- btrfs_item_key_to_cpu(eb, &found_key, slot);
+ eb = path->nodes[0];
+ slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(sctx->send_root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
}
- if (ret || found_key.objectid != key.objectid ||
+
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
+ if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
goto out;
@@ -2545,8 +2606,7 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
goto out;
}
- key.offset = found_key.offset + 1;
- btrfs_release_path(path);
+ path->slots[0]++;
}
out:
@@ -2598,7 +2658,7 @@ struct recorded_ref {
* everything mixed. So we first record all refs and later process them.
* This function is a helper to record one ref.
*/
-static int record_ref(struct list_head *head, u64 dir,
+static int __record_ref(struct list_head *head, u64 dir,
u64 dir_gen, struct fs_path *path)
{
struct recorded_ref *ref;
@@ -2684,12 +2744,78 @@ out:
return ret;
}
+static struct orphan_dir_info *
+add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+{
+ struct rb_node **p = &sctx->orphan_dirs.rb_node;
+ struct rb_node *parent = NULL;
+ struct orphan_dir_info *entry, *odi;
+
+ odi = kmalloc(sizeof(*odi), GFP_NOFS);
+ if (!odi)
+ return ERR_PTR(-ENOMEM);
+ odi->ino = dir_ino;
+ odi->gen = 0;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct orphan_dir_info, node);
+ if (dir_ino < entry->ino) {
+ p = &(*p)->rb_left;
+ } else if (dir_ino > entry->ino) {
+ p = &(*p)->rb_right;
+ } else {
+ kfree(odi);
+ return entry;
+ }
+ }
+
+ rb_link_node(&odi->node, parent, p);
+ rb_insert_color(&odi->node, &sctx->orphan_dirs);
+ return odi;
+}
+
+static struct orphan_dir_info *
+get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+{
+ struct rb_node *n = sctx->orphan_dirs.rb_node;
+ struct orphan_dir_info *entry;
+
+ while (n) {
+ entry = rb_entry(n, struct orphan_dir_info, node);
+ if (dir_ino < entry->ino)
+ n = n->rb_left;
+ else if (dir_ino > entry->ino)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino)
+{
+ struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino);
+
+ return odi != NULL;
+}
+
+static void free_orphan_dir_info(struct send_ctx *sctx,
+ struct orphan_dir_info *odi)
+{
+ if (!odi)
+ return;
+ rb_erase(&odi->node, &sctx->orphan_dirs);
+ kfree(odi);
+}
+
/*
* Returns 1 if a directory can be removed at this point in time.
* We check this by iterating all dir items and checking if the inode behind
* the dir item was already processed.
*/
-static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
+static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
+ u64 send_progress)
{
int ret = 0;
struct btrfs_root *root = sctx->parent_root;
@@ -2712,31 +2838,52 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 send_progress)
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
while (1) {
- ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
- if (ret < 0)
- goto out;
- if (!ret) {
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- path->slots[0]);
+ struct waiting_dir_move *dm;
+
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
}
- if (ret || found_key.objectid != key.objectid ||
- found_key.type != key.type) {
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+ path->slots[0]);
+ if (found_key.objectid != key.objectid ||
+ found_key.type != key.type)
break;
- }
di = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_dir_item);
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
+ dm = get_waiting_dir_move(sctx, loc.objectid);
+ if (dm) {
+ struct orphan_dir_info *odi;
+
+ odi = add_orphan_dir_info(sctx, dir);
+ if (IS_ERR(odi)) {
+ ret = PTR_ERR(odi);
+ goto out;
+ }
+ odi->gen = dir_gen;
+ dm->rmdir_ino = dir;
+ ret = 0;
+ goto out;
+ }
+
if (loc.objectid > send_progress) {
ret = 0;
goto out;
}
- btrfs_release_path(path);
- key.offset = found_key.offset + 1;
+ path->slots[0]++;
}
ret = 1;
@@ -2746,10 +2893,452 @@ out:
return ret;
}
+static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
+{
+ struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
+
+ return entry != NULL;
+}
+
+static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+{
+ struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
+ struct rb_node *parent = NULL;
+ struct waiting_dir_move *entry, *dm;
+
+ dm = kmalloc(sizeof(*dm), GFP_NOFS);
+ if (!dm)
+ return -ENOMEM;
+ dm->ino = ino;
+ dm->rmdir_ino = 0;
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct waiting_dir_move, node);
+ if (ino < entry->ino) {
+ p = &(*p)->rb_left;
+ } else if (ino > entry->ino) {
+ p = &(*p)->rb_right;
+ } else {
+ kfree(dm);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&dm->node, parent, p);
+ rb_insert_color(&dm->node, &sctx->waiting_dir_moves);
+ return 0;
+}
+
+static struct waiting_dir_move *
+get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
+{
+ struct rb_node *n = sctx->waiting_dir_moves.rb_node;
+ struct waiting_dir_move *entry;
+
+ while (n) {
+ entry = rb_entry(n, struct waiting_dir_move, node);
+ if (ino < entry->ino)
+ n = n->rb_left;
+ else if (ino > entry->ino)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+static void free_waiting_dir_move(struct send_ctx *sctx,
+ struct waiting_dir_move *dm)
+{
+ if (!dm)
+ return;
+ rb_erase(&dm->node, &sctx->waiting_dir_moves);
+ kfree(dm);
+}
+
+static int add_pending_dir_move(struct send_ctx *sctx,
+ u64 ino,
+ u64 ino_gen,
+ u64 parent_ino,
+ struct list_head *new_refs,
+ struct list_head *deleted_refs)
+{
+ struct rb_node **p = &sctx->pending_dir_moves.rb_node;
+ struct rb_node *parent = NULL;
+ struct pending_dir_move *entry = NULL, *pm;
+ struct recorded_ref *cur;
+ int exists = 0;
+ int ret;
+
+ pm = kmalloc(sizeof(*pm), GFP_NOFS);
+ if (!pm)
+ return -ENOMEM;
+ pm->parent_ino = parent_ino;
+ pm->ino = ino;
+ pm->gen = ino_gen;
+ INIT_LIST_HEAD(&pm->list);
+ INIT_LIST_HEAD(&pm->update_refs);
+ RB_CLEAR_NODE(&pm->node);
+
+ while (*p) {
+ parent = *p;
+ entry = rb_entry(parent, struct pending_dir_move, node);
+ if (parent_ino < entry->parent_ino) {
+ p = &(*p)->rb_left;
+ } else if (parent_ino > entry->parent_ino) {
+ p = &(*p)->rb_right;
+ } else {
+ exists = 1;
+ break;
+ }
+ }
+
+ list_for_each_entry(cur, deleted_refs, list) {
+ ret = dup_ref(cur, &pm->update_refs);
+ if (ret < 0)
+ goto out;
+ }
+ list_for_each_entry(cur, new_refs, list) {
+ ret = dup_ref(cur, &pm->update_refs);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = add_waiting_dir_move(sctx, pm->ino);
+ if (ret)
+ goto out;
+
+ if (exists) {
+ list_add_tail(&pm->list, &entry->list);
+ } else {
+ rb_link_node(&pm->node, parent, p);
+ rb_insert_color(&pm->node, &sctx->pending_dir_moves);
+ }
+ ret = 0;
+out:
+ if (ret) {
+ __free_recorded_refs(&pm->update_refs);
+ kfree(pm);
+ }
+ return ret;
+}
+
+static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
+ u64 parent_ino)
+{
+ struct rb_node *n = sctx->pending_dir_moves.rb_node;
+ struct pending_dir_move *entry;
+
+ while (n) {
+ entry = rb_entry(n, struct pending_dir_move, node);
+ if (parent_ino < entry->parent_ino)
+ n = n->rb_left;
+ else if (parent_ino > entry->parent_ino)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+static int path_loop(struct send_ctx *sctx, struct fs_path *name,
+ u64 ino, u64 gen, u64 *ancestor_ino)
+{
+ int ret = 0;
+ u64 parent_inode = 0;
+ u64 parent_gen = 0;
+ u64 start_ino = ino;
+
+ *ancestor_ino = 0;
+ while (ino != BTRFS_FIRST_FREE_OBJECTID) {
+ fs_path_reset(name);
+
+ if (is_waiting_for_rm(sctx, ino))
+ break;
+ if (is_waiting_for_move(sctx, ino)) {
+ if (*ancestor_ino == 0)
+ *ancestor_ino = ino;
+ ret = get_first_ref(sctx->parent_root, ino,
+ &parent_inode, &parent_gen, name);
+ } else {
+ ret = __get_cur_name_and_parent(sctx, ino, gen,
+ &parent_inode,
+ &parent_gen, name);
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ }
+ if (ret < 0)
+ break;
+ if (parent_inode == start_ino) {
+ ret = 1;
+ if (*ancestor_ino == 0)
+ *ancestor_ino = ino;
+ break;
+ }
+ ino = parent_inode;
+ gen = parent_gen;
+ }
+ return ret;
+}
+
+static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
+{
+ struct fs_path *from_path = NULL;
+ struct fs_path *to_path = NULL;
+ struct fs_path *name = NULL;
+ u64 orig_progress = sctx->send_progress;
+ struct recorded_ref *cur;
+ u64 parent_ino, parent_gen;
+ struct waiting_dir_move *dm = NULL;
+ u64 rmdir_ino = 0;
+ int ret;
+ u64 ancestor = 0;
+
+ name = fs_path_alloc();
+ from_path = fs_path_alloc();
+ if (!name || !from_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ dm = get_waiting_dir_move(sctx, pm->ino);
+ ASSERT(dm);
+ rmdir_ino = dm->rmdir_ino;
+ free_waiting_dir_move(sctx, dm);
+
+ ret = get_first_ref(sctx->parent_root, pm->ino,
+ &parent_ino, &parent_gen, name);
+ if (ret < 0)
+ goto out;
+
+ ret = get_cur_path(sctx, parent_ino, parent_gen,
+ from_path);
+ if (ret < 0)
+ goto out;
+ ret = fs_path_add_path(from_path, name);
+ if (ret < 0)
+ goto out;
+
+ sctx->send_progress = sctx->cur_ino + 1;
+ ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
+ if (ret) {
+ LIST_HEAD(deleted_refs);
+ ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
+ ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
+ &pm->update_refs, &deleted_refs);
+ if (ret < 0)
+ goto out;
+ if (rmdir_ino) {
+ dm = get_waiting_dir_move(sctx, pm->ino);
+ ASSERT(dm);
+ dm->rmdir_ino = rmdir_ino;
+ }
+ goto out;
+ }
+ fs_path_reset(name);
+ to_path = name;
+ name = NULL;
+ ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
+ if (ret < 0)
+ goto out;
+
+ ret = send_rename(sctx, from_path, to_path);
+ if (ret < 0)
+ goto out;
+
+ if (rmdir_ino) {
+ struct orphan_dir_info *odi;
+
+ odi = get_orphan_dir_info(sctx, rmdir_ino);
+ if (!odi) {
+ /* already deleted */
+ goto finish;
+ }
+ ret = can_rmdir(sctx, rmdir_ino, odi->gen, sctx->cur_ino + 1);
+ if (ret < 0)
+ goto out;
+ if (!ret)
+ goto finish;
+
+ name = fs_path_alloc();
+ if (!name) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = get_cur_path(sctx, rmdir_ino, odi->gen, name);
+ if (ret < 0)
+ goto out;
+ ret = send_rmdir(sctx, name);
+ if (ret < 0)
+ goto out;
+ free_orphan_dir_info(sctx, odi);
+ }
+
+finish:
+ ret = send_utimes(sctx, pm->ino, pm->gen);
+ if (ret < 0)
+ goto out;
+
+ /*
+ * After rename/move, need to update the utimes of both new parent(s)
+ * and old parent(s).
+ */
+ list_for_each_entry(cur, &pm->update_refs, list) {
+ if (cur->dir == rmdir_ino)
+ continue;
+ ret = send_utimes(sctx, cur->dir, cur->dir_gen);
+ if (ret < 0)
+ goto out;
+ }
+
+out:
+ fs_path_free(name);
+ fs_path_free(from_path);
+ fs_path_free(to_path);
+ sctx->send_progress = orig_progress;
+
+ return ret;
+}
+
+static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m)
+{
+ if (!list_empty(&m->list))
+ list_del(&m->list);
+ if (!RB_EMPTY_NODE(&m->node))
+ rb_erase(&m->node, &sctx->pending_dir_moves);
+ __free_recorded_refs(&m->update_refs);
+ kfree(m);
+}
+
+static void tail_append_pending_moves(struct pending_dir_move *moves,
+ struct list_head *stack)
+{
+ if (list_empty(&moves->list)) {
+ list_add_tail(&moves->list, stack);
+ } else {
+ LIST_HEAD(list);
+ list_splice_init(&moves->list, &list);
+ list_add_tail(&moves->list, stack);
+ list_splice_tail(&list, stack);
+ }
+}
+
+static int apply_children_dir_moves(struct send_ctx *sctx)
+{
+ struct pending_dir_move *pm;
+ struct list_head stack;
+ u64 parent_ino = sctx->cur_ino;
+ int ret = 0;
+
+ pm = get_pending_dir_moves(sctx, parent_ino);
+ if (!pm)
+ return 0;
+
+ INIT_LIST_HEAD(&stack);
+ tail_append_pending_moves(pm, &stack);
+
+ while (!list_empty(&stack)) {
+ pm = list_first_entry(&stack, struct pending_dir_move, list);
+ parent_ino = pm->ino;
+ ret = apply_dir_move(sctx, pm);
+ free_pending_move(sctx, pm);
+ if (ret)
+ goto out;
+ pm = get_pending_dir_moves(sctx, parent_ino);
+ if (pm)
+ tail_append_pending_moves(pm, &stack);
+ }
+ return 0;
+
+out:
+ while (!list_empty(&stack)) {
+ pm = list_first_entry(&stack, struct pending_dir_move, list);
+ free_pending_move(sctx, pm);
+ }
+ return ret;
+}
+
+static int wait_for_parent_move(struct send_ctx *sctx,
+ struct recorded_ref *parent_ref)
+{
+ int ret = 0;
+ u64 ino = parent_ref->dir;
+ u64 parent_ino_before, parent_ino_after;
+ struct fs_path *path_before = NULL;
+ struct fs_path *path_after = NULL;
+ int len1, len2;
+
+ path_after = fs_path_alloc();
+ path_before = fs_path_alloc();
+ if (!path_after || !path_before) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Our current directory inode may not yet be renamed/moved because some
+ * ancestor (immediate or not) has to be renamed/moved first. So find if
+ * such ancestor exists and make sure our own rename/move happens after
+ * that ancestor is processed.
+ */
+ while (ino > BTRFS_FIRST_FREE_OBJECTID) {
+ if (is_waiting_for_move(sctx, ino)) {
+ ret = 1;
+ break;
+ }
+
+ fs_path_reset(path_before);
+ fs_path_reset(path_after);
+
+ ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
+ NULL, path_after);
+ if (ret < 0)
+ goto out;
+ ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
+ NULL, path_before);
+ if (ret < 0 && ret != -ENOENT) {
+ goto out;
+ } else if (ret == -ENOENT) {
+ ret = 1;
+ break;
+ }
+
+ len1 = fs_path_len(path_before);
+ len2 = fs_path_len(path_after);
+ if (ino > sctx->cur_ino &&
+ (parent_ino_before != parent_ino_after || len1 != len2 ||
+ memcmp(path_before->start, path_after->start, len1))) {
+ ret = 1;
+ break;
+ }
+ ino = parent_ino_after;
+ }
+
+out:
+ fs_path_free(path_before);
+ fs_path_free(path_after);
+
+ if (ret == 1) {
+ ret = add_pending_dir_move(sctx,
+ sctx->cur_ino,
+ sctx->cur_inode_gen,
+ ino,
+ &sctx->new_refs,
+ &sctx->deleted_refs);
+ if (!ret)
+ ret = 1;
+ }
+
+ return ret;
+}
+
/*
* This does all the move/link/unlink/rmdir magic.
*/
-static int process_recorded_refs(struct send_ctx *sctx)
+static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
{
int ret = 0;
struct recorded_ref *cur;
@@ -2760,6 +3349,7 @@ static int process_recorded_refs(struct send_ctx *sctx)
u64 ow_gen;
int did_overwrite = 0;
int is_orphan = 0;
+ u64 last_dir_ino_rm = 0;
verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
@@ -2898,11 +3488,18 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
* dirs, we always have one new and one deleted
* ref. The deleted ref is ignored later.
*/
- ret = send_rename(sctx, valid_path,
- cur->full_path);
+ ret = wait_for_parent_move(sctx, cur);
if (ret < 0)
goto out;
- ret = fs_path_copy(valid_path, cur->full_path);
+ if (ret) {
+ *pending_move = 1;
+ } else {
+ ret = send_rename(sctx, valid_path,
+ cur->full_path);
+ if (!ret)
+ ret = fs_path_copy(valid_path,
+ cur->full_path);
+ }
if (ret < 0)
goto out;
} else {
@@ -2924,7 +3521,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
* later, we do this check again and rmdir it then if possible.
* See the use of check_dirs for more details.
*/
- ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_ino);
+ ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
+ sctx->cur_ino);
if (ret < 0)
goto out;
if (ret) {
@@ -3015,8 +3613,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
ret = send_utimes(sctx, cur->dir, cur->dir_gen);
if (ret < 0)
goto out;
- } else if (ret == inode_state_did_delete) {
- ret = can_rmdir(sctx, cur->dir, sctx->cur_ino);
+ } else if (ret == inode_state_did_delete &&
+ cur->dir != last_dir_ino_rm) {
+ ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
+ sctx->cur_ino);
if (ret < 0)
goto out;
if (ret) {
@@ -3027,6 +3627,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
ret = send_rmdir(sctx, valid_path);
if (ret < 0)
goto out;
+ last_dir_ino_rm = cur->dir;
}
}
}
@@ -3040,9 +3641,8 @@ out:
return ret;
}
-static int __record_new_ref(int num, u64 dir, int index,
- struct fs_path *name,
- void *ctx)
+static int record_ref(struct btrfs_root *root, int num, u64 dir, int index,
+ struct fs_path *name, void *ctx, struct list_head *refs)
{
int ret = 0;
struct send_ctx *sctx = ctx;
@@ -3053,7 +3653,7 @@ static int __record_new_ref(int num, u64 dir, int index,
if (!p)
return -ENOMEM;
- ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, NULL,
+ ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
NULL, NULL);
if (ret < 0)
goto out;
@@ -3065,7 +3665,7 @@ static int __record_new_ref(int num, u64 dir, int index,
if (ret < 0)
goto out;
- ret = record_ref(&sctx->new_refs, dir, gen, p);
+ ret = __record_ref(refs, dir, gen, p);
out:
if (ret)
@@ -3073,37 +3673,23 @@ out:
return ret;
}
+static int __record_new_ref(int num, u64 dir, int index,
+ struct fs_path *name,
+ void *ctx)
+{
+ struct send_ctx *sctx = ctx;
+ return record_ref(sctx->send_root, num, dir, index, name,
+ ctx, &sctx->new_refs);
+}
+
+
static int __record_deleted_ref(int num, u64 dir, int index,
struct fs_path *name,
void *ctx)
{
- int ret = 0;
struct send_ctx *sctx = ctx;
- struct fs_path *p;
- u64 gen;
-
- p = fs_path_alloc();
- if (!p)
- return -ENOMEM;
-
- ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, NULL,
- NULL, NULL);
- if (ret < 0)
- goto out;
-
- ret = get_cur_path(sctx, dir, gen, p);
- if (ret < 0)
- goto out;
- ret = fs_path_add_path(p, name);
- if (ret < 0)
- goto out;
-
- ret = record_ref(&sctx->deleted_refs, dir, gen, p);
-
-out:
- if (ret)
- fs_path_free(p);
- return ret;
+ return record_ref(sctx->parent_root, num, dir, index, name,
+ ctx, &sctx->deleted_refs);
}
static int record_new_ref(struct send_ctx *sctx)
@@ -3271,6 +3857,7 @@ static int process_all_refs(struct send_ctx *sctx,
struct extent_buffer *eb;
int slot;
iterate_inode_ref_t cb;
+ int pending_move = 0;
path = alloc_path_for_send();
if (!path)
@@ -3283,21 +3870,31 @@ static int process_all_refs(struct send_ctx *sctx,
root = sctx->parent_root;
cb = __record_deleted_ref;
} else {
- BUG();
+ btrfs_err(sctx->send_root->fs_info,
+ "Wrong command %d in process_all_refs", cmd);
+ ret = -EINVAL;
+ goto out;
}
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
- while (1) {
- ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
- if (ret < 0)
- goto out;
- if (ret)
- break;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ while (1) {
eb = path->nodes[0];
slot = path->slots[0];
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
btrfs_item_key_to_cpu(eb, &found_key, slot);
if (found_key.objectid != key.objectid ||
@@ -3306,15 +3903,16 @@ static int process_all_refs(struct send_ctx *sctx,
break;
ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
- btrfs_release_path(path);
if (ret < 0)
goto out;
- key.offset = found_key.offset + 1;
+ path->slots[0]++;
}
btrfs_release_path(path);
- ret = process_recorded_refs(sctx);
+ ret = process_recorded_refs(sctx, &pending_move);
+ /* Only applicable to an incremental send. */
+ ASSERT(pending_move == 0);
out:
btrfs_free_path(path);
@@ -3589,19 +4187,25 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = 0;
- while (1) {
- ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
- if (ret < 0)
- goto out;
- if (ret) {
- ret = 0;
- goto out;
- }
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ while (1) {
eb = path->nodes[0];
slot = path->slots[0];
- btrfs_item_key_to_cpu(eb, &found_key, slot);
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
+ }
+ btrfs_item_key_to_cpu(eb, &found_key, slot);
if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
@@ -3613,8 +4217,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
if (ret < 0)
goto out;
- btrfs_release_path(path);
- key.offset = found_key.offset + 1;
+ path->slots[0]++;
}
out:
@@ -3622,6 +4225,79 @@ out:
return ret;
}
+static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
+{
+ struct btrfs_root *root = sctx->send_root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct inode *inode;
+ struct page *page;
+ char *addr;
+ struct btrfs_key key;
+ pgoff_t index = offset >> PAGE_CACHE_SHIFT;
+ pgoff_t last_index;
+ unsigned pg_offset = offset & ~PAGE_CACHE_MASK;
+ ssize_t ret = 0;
+
+ key.objectid = sctx->cur_ino;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ if (offset + len > i_size_read(inode)) {
+ if (offset > i_size_read(inode))
+ len = 0;
+ else
+ len = offset - i_size_read(inode);
+ }
+ if (len == 0)
+ goto out;
+
+ last_index = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+
+ /* initial readahead */
+ memset(&sctx->ra, 0, sizeof(struct file_ra_state));
+ file_ra_state_init(&sctx->ra, inode->i_mapping);
+ btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index,
+ last_index - index + 1);
+
+ while (index <= last_index) {
+ unsigned cur_len = min_t(unsigned, len,
+ PAGE_CACHE_SIZE - pg_offset);
+ page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ if (!page) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ if (!PageUptodate(page)) {
+ btrfs_readpage(NULL, page);
+ lock_page(page);
+ if (!PageUptodate(page)) {
+ unlock_page(page);
+ page_cache_release(page);
+ ret = -EIO;
+ break;
+ }
+ }
+
+ addr = kmap(page);
+ memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len);
+ kunmap(page);
+ unlock_page(page);
+ page_cache_release(page);
+ index++;
+ pg_offset = 0;
+ len -= cur_len;
+ ret += cur_len;
+ }
+out:
+ iput(inode);
+ return ret;
+}
+
/*
* Read some bytes from the current inode/file and send a write command to
* user space.
@@ -3630,35 +4306,20 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
{
int ret = 0;
struct fs_path *p;
- loff_t pos = offset;
- int num_read = 0;
- mm_segment_t old_fs;
+ ssize_t num_read = 0;
p = fs_path_alloc();
if (!p)
return -ENOMEM;
- /*
- * vfs normally only accepts user space buffers for security reasons.
- * we only read from the file and also only provide the read_buf buffer
- * to vfs. As this buffer does not come from a user space call, it's
- * ok to temporary allow kernel space buffers.
- */
- old_fs = get_fs();
- set_fs(KERNEL_DS);
-
verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
- ret = open_cur_inode_file(sctx);
- if (ret < 0)
- goto out;
-
- ret = vfs_read(sctx->cur_inode_filp, sctx->read_buf, len, &pos);
- if (ret < 0)
- goto out;
- num_read = ret;
- if (!num_read)
+ num_read = fill_read_buf(sctx, offset, len);
+ if (num_read <= 0) {
+ if (num_read < 0)
+ ret = num_read;
goto out;
+ }
ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
if (ret < 0)
@@ -3677,7 +4338,6 @@ verbose_printk("btrfs: send_write offset=%llu, len=%d\n", offset, len);
tlv_put_failure:
out:
fs_path_free(p);
- set_fs(old_fs);
if (ret < 0)
return ret;
return num_read;
@@ -3730,7 +4390,7 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, "
TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
clone_root->root->root_item.uuid);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
- clone_root->root->root_item.ctransid);
+ le64_to_cpu(clone_root->root->root_item.ctransid));
TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
clone_root->offset);
@@ -3776,6 +4436,39 @@ out:
return ret;
}
+static int send_hole(struct send_ctx *sctx, u64 end)
+{
+ struct fs_path *p = NULL;
+ u64 offset = sctx->cur_inode_last_extent;
+ u64 len;
+ int ret = 0;
+
+ p = fs_path_alloc();
+ if (!p)
+ return -ENOMEM;
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
+ if (ret < 0)
+ goto tlv_put_failure;
+ memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
+ while (offset < end) {
+ len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
+
+ ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
+ if (ret < 0)
+ break;
+ TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
+ TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
+ TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
+ ret = send_cmd(sctx);
+ if (ret < 0)
+ break;
+ offset += len;
+ }
+tlv_put_failure:
+ fs_path_free(p);
+ return ret;
+}
+
static int send_write_or_clone(struct send_ctx *sctx,
struct btrfs_path *path,
struct btrfs_key *key,
@@ -3788,12 +4481,14 @@ static int send_write_or_clone(struct send_ctx *sctx,
u64 len;
u32 l;
u8 type;
+ u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_file_extent_item);
type = btrfs_file_extent_type(path->nodes[0], ei);
if (type == BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_inline_len(path->nodes[0], ei);
+ len = btrfs_file_extent_inline_len(path->nodes[0],
+ path->slots[0], ei);
/*
* it is possible the inline item won't cover the whole page,
* but there may be items after this page. Make
@@ -3811,7 +4506,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
goto out;
}
- if (clone_root) {
+ if (clone_root && IS_ALIGNED(offset + len, bs)) {
ret = send_clone(sctx, offset, len, clone_root);
} else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
ret = send_update_extent(sctx, offset, len);
@@ -3926,16 +4621,16 @@ static int is_extent_unchanged(struct send_ctx *sctx,
while (key.offset < ekey->offset + left_len) {
ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
right_type = btrfs_file_extent_type(eb, ei);
- right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
- right_len = btrfs_file_extent_num_bytes(eb, ei);
- right_offset = btrfs_file_extent_offset(eb, ei);
- right_gen = btrfs_file_extent_generation(eb, ei);
-
if (right_type != BTRFS_FILE_EXTENT_REG) {
ret = 0;
goto out;
}
+ right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
+ right_len = btrfs_file_extent_num_bytes(eb, ei);
+ right_offset = btrfs_file_extent_offset(eb, ei);
+ right_gen = btrfs_file_extent_generation(eb, ei);
+
/*
* Are we at extent 8? If yes, we know the extent is changed.
* This may only happen on the first iteration.
@@ -4003,6 +4698,101 @@ out:
return ret;
}
+static int get_last_extent(struct send_ctx *sctx, u64 offset)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *root = sctx->send_root;
+ struct btrfs_file_extent_item *fi;
+ struct btrfs_key key;
+ u64 extent_end;
+ u8 type;
+ int ret;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ sctx->cur_inode_last_extent = 0;
+
+ key.objectid = sctx->cur_ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = offset;
+ ret = btrfs_search_slot_for_read(root, &key, path, 0, 1);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ goto out;
+
+ fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(path->nodes[0], fi);
+ if (type == BTRFS_FILE_EXTENT_INLINE) {
+ u64 size = btrfs_file_extent_inline_len(path->nodes[0],
+ path->slots[0], fi);
+ extent_end = ALIGN(key.offset + size,
+ sctx->send_root->sectorsize);
+ } else {
+ extent_end = key.offset +
+ btrfs_file_extent_num_bytes(path->nodes[0], fi);
+ }
+ sctx->cur_inode_last_extent = extent_end;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
+ struct btrfs_key *key)
+{
+ struct btrfs_file_extent_item *fi;
+ u64 extent_end;
+ u8 type;
+ int ret = 0;
+
+ if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
+ return 0;
+
+ if (sctx->cur_inode_last_extent == (u64)-1) {
+ ret = get_last_extent(sctx, key->offset - 1);
+ if (ret)
+ return ret;
+ }
+
+ fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(path->nodes[0], fi);
+ if (type == BTRFS_FILE_EXTENT_INLINE) {
+ u64 size = btrfs_file_extent_inline_len(path->nodes[0],
+ path->slots[0], fi);
+ extent_end = ALIGN(key->offset + size,
+ sctx->send_root->sectorsize);
+ } else {
+ extent_end = key->offset +
+ btrfs_file_extent_num_bytes(path->nodes[0], fi);
+ }
+
+ if (path->slots[0] == 0 &&
+ sctx->cur_inode_last_extent < key->offset) {
+ /*
+ * We might have skipped entire leafs that contained only
+ * file extent items for our current inode. These leafs have
+ * a generation number smaller (older) than the one in the
+ * current leaf and the leaf our last extent came from, and
+ * are located between these 2 leafs.
+ */
+ ret = get_last_extent(sctx, key->offset - 1);
+ if (ret)
+ return ret;
+ }
+
+ if (sctx->cur_inode_last_extent < key->offset)
+ ret = send_hole(sctx, key->offset);
+ sctx->cur_inode_last_extent = extent_end;
+ return ret;
+}
+
static int process_extent(struct send_ctx *sctx,
struct btrfs_path *path,
struct btrfs_key *key)
@@ -4019,7 +4809,7 @@ static int process_extent(struct send_ctx *sctx,
goto out;
if (ret) {
ret = 0;
- goto out;
+ goto out_hole;
}
} else {
struct btrfs_file_extent_item *ei;
@@ -4055,7 +4845,10 @@ static int process_extent(struct send_ctx *sctx,
goto out;
ret = send_write_or_clone(sctx, path, key, found_clone);
-
+ if (ret)
+ goto out;
+out_hole:
+ ret = maybe_send_hole(sctx, path, key);
out:
return ret;
}
@@ -4078,17 +4871,25 @@ static int process_all_extents(struct send_ctx *sctx)
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = 0;
- while (1) {
- ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
- if (ret < 0)
- goto out;
- if (ret) {
- ret = 0;
- goto out;
- }
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ while (1) {
eb = path->nodes[0];
slot = path->slots[0];
+
+ if (slot >= btrfs_header_nritems(eb)) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ continue;
+ }
+
btrfs_item_key_to_cpu(eb, &found_key, slot);
if (found_key.objectid != key.objectid ||
@@ -4101,8 +4902,7 @@ static int process_all_extents(struct send_ctx *sctx)
if (ret < 0)
goto out;
- btrfs_release_path(path);
- key.offset = found_key.offset + 1;
+ path->slots[0]++;
}
out:
@@ -4110,7 +4910,9 @@ out:
return ret;
}
-static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
+static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
+ int *pending_move,
+ int *refs_processed)
{
int ret = 0;
@@ -4122,17 +4924,11 @@ static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end)
if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
goto out;
- ret = process_recorded_refs(sctx);
+ ret = process_recorded_refs(sctx, pending_move);
if (ret < 0)
goto out;
- /*
- * We have processed the refs and thus need to advance send_progress.
- * Now, calls to get_cur_xxx will take the updated refs of the current
- * inode into account.
- */
- sctx->send_progress = sctx->cur_ino + 1;
-
+ *refs_processed = 1;
out:
return ret;
}
@@ -4148,11 +4944,29 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
u64 right_gid;
int need_chmod = 0;
int need_chown = 0;
+ int pending_move = 0;
+ int refs_processed = 0;
- ret = process_recorded_refs_if_needed(sctx, at_end);
+ ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
+ &refs_processed);
if (ret < 0)
goto out;
+ /*
+ * We have processed the refs and thus need to advance send_progress.
+ * Now, calls to get_cur_xxx will take the updated refs of the current
+ * inode into account.
+ *
+ * On the other hand, if our current inode is a directory and couldn't
+ * be moved/renamed because its parent was renamed/moved too and it has
+ * a higher inode number, we can only move/rename our current inode
+ * after we moved/renamed its parent. Therefore in this case operate on
+ * the old path (pre move/rename) of our current inode, and the
+ * move/rename will be performed later.
+ */
+ if (refs_processed && !pending_move)
+ sctx->send_progress = sctx->cur_ino + 1;
+
if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
goto out;
if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
@@ -4181,6 +4995,21 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
}
if (S_ISREG(sctx->cur_inode_mode)) {
+ if (need_send_hole(sctx)) {
+ if (sctx->cur_inode_last_extent == (u64)-1 ||
+ sctx->cur_inode_last_extent <
+ sctx->cur_inode_size) {
+ ret = get_last_extent(sctx, (u64)-1);
+ if (ret)
+ goto out;
+ }
+ if (sctx->cur_inode_last_extent <
+ sctx->cur_inode_size) {
+ ret = send_hole(sctx, sctx->cur_inode_size);
+ if (ret)
+ goto out;
+ }
+ }
ret = send_truncate(sctx, sctx->cur_ino, sctx->cur_inode_gen,
sctx->cur_inode_size);
if (ret < 0)
@@ -4201,12 +5030,25 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
}
/*
- * Need to send that every time, no matter if it actually changed
- * between the two trees as we have done changes to the inode before.
+ * If other directory inodes depended on our current directory
+ * inode's move/rename, now do their move/rename operations.
*/
- ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
- if (ret < 0)
- goto out;
+ if (!is_waiting_for_move(sctx, sctx->cur_ino)) {
+ ret = apply_children_dir_moves(sctx);
+ if (ret)
+ goto out;
+ /*
+ * Need to send that every time, no matter if it actually
+ * changed between the two trees as we have done changes to
+ * the inode before. If our inode is a directory and it's
+ * waiting to be moved/renamed, we will send its utimes when
+ * it's moved/renamed, therefore we don't need to do it here.
+ */
+ sctx->send_progress = sctx->cur_ino + 1;
+ ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
+ if (ret < 0)
+ goto out;
+ }
out:
return ret;
@@ -4222,12 +5064,9 @@ static int changed_inode(struct send_ctx *sctx,
u64 left_gen = 0;
u64 right_gen = 0;
- ret = close_cur_inode_file(sctx);
- if (ret < 0)
- goto out;
-
sctx->cur_ino = key->objectid;
sctx->cur_inode_new_gen = 0;
+ sctx->cur_inode_last_extent = (u64)-1;
/*
* Set send_progress to current inode. This will tell all get_cur_xxx
@@ -4276,6 +5115,8 @@ static int changed_inode(struct send_ctx *sctx,
sctx->left_path->nodes[0], left_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
sctx->left_path->nodes[0], left_ii);
+ sctx->cur_inode_rdev = btrfs_inode_rdev(
+ sctx->left_path->nodes[0], left_ii);
if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
ret = send_create_inode_if_needed(sctx);
} else if (result == BTRFS_COMPARE_TREE_DELETED) {
@@ -4320,6 +5161,8 @@ static int changed_inode(struct send_ctx *sctx,
sctx->left_path->nodes[0], left_ii);
sctx->cur_inode_mode = btrfs_inode_mode(
sctx->left_path->nodes[0], left_ii);
+ sctx->cur_inode_rdev = btrfs_inode_rdev(
+ sctx->left_path->nodes[0], left_ii);
ret = send_create_inode_if_needed(sctx);
if (ret < 0)
goto out;
@@ -4508,14 +5351,18 @@ static int changed_cb(struct btrfs_root *left_root,
struct send_ctx *sctx = ctx;
if (result == BTRFS_COMPARE_TREE_SAME) {
- if (key->type != BTRFS_INODE_REF_KEY &&
- key->type != BTRFS_INODE_EXTREF_KEY)
- return 0;
- ret = compare_refs(sctx, left_path, key);
- if (!ret)
+ if (key->type == BTRFS_INODE_REF_KEY ||
+ key->type == BTRFS_INODE_EXTREF_KEY) {
+ ret = compare_refs(sctx, left_path, key);
+ if (!ret)
+ return 0;
+ if (ret < 0)
+ return ret;
+ } else if (key->type == BTRFS_EXTENT_DATA_KEY) {
+ return maybe_send_hole(sctx, left_path, key);
+ } else {
return 0;
- if (ret < 0)
- return ret;
+ }
result = BTRFS_COMPARE_TREE_CHANGED;
ret = 0;
}
@@ -4550,57 +5397,21 @@ out:
static int full_send_tree(struct send_ctx *sctx)
{
int ret;
- struct btrfs_trans_handle *trans = NULL;
struct btrfs_root *send_root = sctx->send_root;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_path *path;
struct extent_buffer *eb;
int slot;
- u64 start_ctransid;
- u64 ctransid;
path = alloc_path_for_send();
if (!path)
return -ENOMEM;
- spin_lock(&send_root->root_item_lock);
- start_ctransid = btrfs_root_ctransid(&send_root->root_item);
- spin_unlock(&send_root->root_item_lock);
-
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
-join_trans:
- /*
- * We need to make sure the transaction does not get committed
- * while we do anything on commit roots. Join a transaction to prevent
- * this.
- */
- trans = btrfs_join_transaction(send_root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- trans = NULL;
- goto out;
- }
-
- /*
- * Make sure the tree has not changed after re-joining. We detect this
- * by comparing start_ctransid and ctransid. They should always match.
- */
- spin_lock(&send_root->root_item_lock);
- ctransid = btrfs_root_ctransid(&send_root->root_item);
- spin_unlock(&send_root->root_item_lock);
-
- if (ctransid != start_ctransid) {
- WARN(1, KERN_WARNING "btrfs: the root that you're trying to "
- "send was modified in between. This is "
- "probably a bug.\n");
- ret = -EIO;
- goto out;
- }
-
ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
if (ret < 0)
goto out;
@@ -4608,19 +5419,6 @@ join_trans:
goto out_finish;
while (1) {
- /*
- * When someone want to commit while we iterate, end the
- * joined transaction and rejoin.
- */
- if (btrfs_should_end_transaction(trans, send_root)) {
- ret = btrfs_end_transaction(trans, send_root);
- trans = NULL;
- if (ret < 0)
- goto out;
- btrfs_release_path(path);
- goto join_trans;
- }
-
eb = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(eb, &found_key, slot);
@@ -4648,12 +5446,6 @@ out_finish:
out:
btrfs_free_path(path);
- if (trans) {
- if (!ret)
- ret = btrfs_end_transaction(trans, send_root);
- else
- btrfs_end_transaction(trans, send_root);
- }
return ret;
}
@@ -4686,15 +5478,25 @@ static int send_subvol(struct send_ctx *sctx)
}
out:
- if (!ret)
- ret = close_cur_inode_file(sctx);
- else
- close_cur_inode_file(sctx);
-
free_recorded_refs(sctx);
return ret;
}
+static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
+{
+ spin_lock(&root->root_item_lock);
+ root->send_in_progress--;
+ /*
+ * Not much left to do, we don't know why it's unbalanced and
+ * can't blindly reset it to 0.
+ */
+ if (root->send_in_progress < 0)
+ btrfs_err(root->fs_info,
+ "send_in_progres unbalanced %d root %llu",
+ root->send_in_progress, root->root_key.objectid);
+ spin_unlock(&root->root_item_lock);
+}
+
long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
{
int ret = 0;
@@ -4706,6 +5508,9 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
struct send_ctx *sctx = NULL;
u32 i;
u64 *clone_sources_tmp = NULL;
+ int clone_sources_to_rollback = 0;
+ int sort_clone_roots = 0;
+ int index;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -4714,38 +5519,26 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
fs_info = send_root->fs_info;
/*
+ * The subvolume must remain read-only during send, protect against
+ * making it RW. This also protects against deletion.
+ */
+ spin_lock(&send_root->root_item_lock);
+ send_root->send_in_progress++;
+ spin_unlock(&send_root->root_item_lock);
+
+ /*
* This is done when we lookup the root, it should already be complete
* by the time we get here.
*/
WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
/*
- * If we just created this root we need to make sure that the orphan
- * cleanup has been done and committed since we search the commit root,
- * so check its commit root transid with our otransid and if they match
- * commit the transaction to make sure everything is updated.
+ * Userspace tools do the checks and warn the user if it's
+ * not RO.
*/
- down_read(&send_root->fs_info->extent_commit_sem);
- if (btrfs_header_generation(send_root->commit_root) ==
- btrfs_root_otransid(&send_root->root_item)) {
- struct btrfs_trans_handle *trans;
-
- up_read(&send_root->fs_info->extent_commit_sem);
-
- trans = btrfs_attach_transaction_barrier(send_root);
- if (IS_ERR(trans)) {
- if (PTR_ERR(trans) != -ENOENT) {
- ret = PTR_ERR(trans);
- goto out;
- }
- /* ENOENT means theres no transaction */
- } else {
- ret = btrfs_commit_transaction(trans, send_root);
- if (ret)
- goto out;
- }
- } else {
- up_read(&send_root->fs_info->extent_commit_sem);
+ if (!btrfs_root_readonly(send_root)) {
+ ret = -EPERM;
+ goto out;
}
arg = memdup_user(arg_, sizeof(*arg));
@@ -4756,8 +5549,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
}
if (!access_ok(VERIFY_READ, arg->clone_sources,
- sizeof(*arg->clone_sources *
- arg->clone_sources_count))) {
+ sizeof(*arg->clone_sources) *
+ arg->clone_sources_count)) {
ret = -EFAULT;
goto out;
}
@@ -4786,9 +5579,16 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
goto out;
}
- sctx->mnt = mnt_file->f_path.mnt;
-
sctx->send_root = send_root;
+ /*
+ * Unlikely but possible, if the subvolume is marked for deletion but
+ * is slow to remove the directory entry, send can still be started
+ */
+ if (btrfs_root_dead(sctx->send_root)) {
+ ret = -EPERM;
+ goto out;
+ }
+
sctx->clone_roots_cnt = arg->clone_sources_count;
sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
@@ -4804,6 +5604,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
goto out;
}
+ sctx->pending_dir_moves = RB_ROOT;
+ sctx->waiting_dir_moves = RB_ROOT;
+ sctx->orphan_dirs = RB_ROOT;
+
sctx->clone_roots = vzalloc(sizeof(struct clone_root) *
(arg->clone_sources_count + 1));
if (!sctx->clone_roots) {
@@ -4831,11 +5635,27 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
key.objectid = clone_sources_tmp[i];
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
+
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
if (IS_ERR(clone_root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = PTR_ERR(clone_root);
goto out;
}
+ clone_sources_to_rollback = i + 1;
+ spin_lock(&clone_root->root_item_lock);
+ clone_root->send_in_progress++;
+ if (!btrfs_root_readonly(clone_root)) {
+ spin_unlock(&clone_root->root_item_lock);
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ ret = -EPERM;
+ goto out;
+ }
+ spin_unlock(&clone_root->root_item_lock);
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+
sctx->clone_roots[i].root = clone_root;
}
vfree(clone_sources_tmp);
@@ -4846,11 +5666,28 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
key.objectid = arg->parent_root;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
+
+ index = srcu_read_lock(&fs_info->subvol_srcu);
+
sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
if (IS_ERR(sctx->parent_root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = PTR_ERR(sctx->parent_root);
goto out;
}
+
+ spin_lock(&sctx->parent_root->root_item_lock);
+ sctx->parent_root->send_in_progress++;
+ if (!btrfs_root_readonly(sctx->parent_root) ||
+ btrfs_root_dead(sctx->parent_root)) {
+ spin_unlock(&sctx->parent_root->root_item_lock);
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ ret = -EPERM;
+ goto out;
+ }
+ spin_unlock(&sctx->parent_root->root_item_lock);
+
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
}
/*
@@ -4864,8 +5701,11 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
sort(sctx->clone_roots, sctx->clone_roots_cnt,
sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
NULL);
+ sort_clone_roots = 1;
+ current->journal_info = (void *)BTRFS_SEND_TRANS_STUB;
ret = send_subvol(sctx);
+ current->journal_info = NULL;
if (ret < 0)
goto out;
@@ -4879,6 +5719,58 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
}
out:
+ WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
+ while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
+ struct rb_node *n;
+ struct pending_dir_move *pm;
+
+ n = rb_first(&sctx->pending_dir_moves);
+ pm = rb_entry(n, struct pending_dir_move, node);
+ while (!list_empty(&pm->list)) {
+ struct pending_dir_move *pm2;
+
+ pm2 = list_first_entry(&pm->list,
+ struct pending_dir_move, list);
+ free_pending_move(sctx, pm2);
+ }
+ free_pending_move(sctx, pm);
+ }
+
+ WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
+ while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
+ struct rb_node *n;
+ struct waiting_dir_move *dm;
+
+ n = rb_first(&sctx->waiting_dir_moves);
+ dm = rb_entry(n, struct waiting_dir_move, node);
+ rb_erase(&dm->node, &sctx->waiting_dir_moves);
+ kfree(dm);
+ }
+
+ WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
+ while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
+ struct rb_node *n;
+ struct orphan_dir_info *odi;
+
+ n = rb_first(&sctx->orphan_dirs);
+ odi = rb_entry(n, struct orphan_dir_info, node);
+ free_orphan_dir_info(sctx, odi);
+ }
+
+ if (sort_clone_roots) {
+ for (i = 0; i < sctx->clone_roots_cnt; i++)
+ btrfs_root_dec_send_in_progress(
+ sctx->clone_roots[i].root);
+ } else {
+ for (i = 0; sctx && i < clone_sources_to_rollback; i++)
+ btrfs_root_dec_send_in_progress(
+ sctx->clone_roots[i].root);
+
+ btrfs_root_dec_send_in_progress(send_root);
+ }
+ if (sctx && !IS_ERR_OR_NULL(sctx->parent_root))
+ btrfs_root_dec_send_in_progress(sctx->parent_root);
+
kfree(arg);
vfree(clone_sources_tmp);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index e913328d0f2..8e16bca69c5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -42,13 +42,14 @@
#include <linux/cleancache.h>
#include <linux/ratelimit.h>
#include <linux/btrfs.h>
-#include "compat.h"
#include "delayed-inode.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
+#include "hash.h"
+#include "props.h"
#include "xattr.h"
#include "volumes.h"
#include "export.h"
@@ -65,6 +66,8 @@
static const struct super_operations btrfs_super_ops;
static struct file_system_type btrfs_fs_type;
+static int btrfs_remount(struct super_block *sb, int *flags, char *data);
+
static const char *btrfs_decode_error(int errno)
{
char *errstr = "unknown";
@@ -153,11 +156,12 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
vaf.fmt = fmt;
vaf.va = &args;
- printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s (%pV)\n",
+ printk(KERN_CRIT
+ "BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n",
sb->s_id, function, line, errno, errstr, &vaf);
va_end(args);
} else {
- printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s\n",
+ printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
sb->s_id, function, line, errno, errstr);
}
@@ -251,7 +255,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
*/
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,
&root->fs_info->fs_state)) {
- WARN(1, KERN_DEBUG "btrfs: Transaction aborted (error %d)\n",
+ WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n",
errno);
}
trans->aborted = errno;
@@ -295,8 +299,8 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
s_id, function, line, &vaf, errno, errstr);
- printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
- s_id, function, line, &vaf, errno, errstr);
+ btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
+ function, line, &vaf, errno, errstr);
va_end(args);
/* Caller calls BUG() */
}
@@ -323,7 +327,9 @@ enum {
Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
Opt_check_integrity, Opt_check_integrity_including_extent_data,
Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
- Opt_commit_interval,
+ Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
+ Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
+ Opt_datasum, Opt_treelog, Opt_noinode_cache,
Opt_err,
};
@@ -333,8 +339,11 @@ static match_table_t tokens = {
{Opt_subvolid, "subvolid=%s"},
{Opt_device, "device=%s"},
{Opt_nodatasum, "nodatasum"},
+ {Opt_datasum, "datasum"},
{Opt_nodatacow, "nodatacow"},
+ {Opt_datacow, "datacow"},
{Opt_nobarrier, "nobarrier"},
+ {Opt_barrier, "barrier"},
{Opt_max_inline, "max_inline=%s"},
{Opt_alloc_start, "alloc_start=%s"},
{Opt_thread_pool, "thread_pool=%d"},
@@ -345,18 +354,25 @@ static match_table_t tokens = {
{Opt_ssd, "ssd"},
{Opt_ssd_spread, "ssd_spread"},
{Opt_nossd, "nossd"},
+ {Opt_acl, "acl"},
{Opt_noacl, "noacl"},
{Opt_notreelog, "notreelog"},
+ {Opt_treelog, "treelog"},
{Opt_flushoncommit, "flushoncommit"},
+ {Opt_noflushoncommit, "noflushoncommit"},
{Opt_ratio, "metadata_ratio=%d"},
{Opt_discard, "discard"},
+ {Opt_nodiscard, "nodiscard"},
{Opt_space_cache, "space_cache"},
{Opt_clear_cache, "clear_cache"},
{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
{Opt_enospc_debug, "enospc_debug"},
+ {Opt_noenospc_debug, "noenospc_debug"},
{Opt_subvolrootid, "subvolrootid=%d"},
{Opt_defrag, "autodefrag"},
+ {Opt_nodefrag, "noautodefrag"},
{Opt_inode_cache, "inode_cache"},
+ {Opt_noinode_cache, "noinode_cache"},
{Opt_no_space_cache, "nospace_cache"},
{Opt_recovery, "recovery"},
{Opt_skip_balance, "skip_balance"},
@@ -384,6 +400,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
int ret = 0;
char *compress_type;
bool compress_force = false;
+ bool compress = false;
cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
if (cache_gen)
@@ -410,7 +427,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
token = match_token(p, tokens, args);
switch (token) {
case Opt_degraded:
- printk(KERN_INFO "btrfs: allowing degraded mounts\n");
+ btrfs_info(root->fs_info, "allowing degraded mounts");
btrfs_set_opt(info->mount_opt, DEGRADED);
break;
case Opt_subvol:
@@ -423,28 +440,45 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
*/
break;
case Opt_nodatasum:
- printk(KERN_INFO "btrfs: setting nodatasum\n");
- btrfs_set_opt(info->mount_opt, NODATASUM);
+ btrfs_set_and_info(root, NODATASUM,
+ "setting nodatasum");
+ break;
+ case Opt_datasum:
+ if (btrfs_test_opt(root, NODATASUM)) {
+ if (btrfs_test_opt(root, NODATACOW))
+ btrfs_info(root->fs_info, "setting datasum, datacow enabled");
+ else
+ btrfs_info(root->fs_info, "setting datasum");
+ }
+ btrfs_clear_opt(info->mount_opt, NODATACOW);
+ btrfs_clear_opt(info->mount_opt, NODATASUM);
break;
case Opt_nodatacow:
- if (!btrfs_test_opt(root, COMPRESS) ||
- !btrfs_test_opt(root, FORCE_COMPRESS)) {
- printk(KERN_INFO "btrfs: setting nodatacow, compression disabled\n");
- } else {
- printk(KERN_INFO "btrfs: setting nodatacow\n");
+ if (!btrfs_test_opt(root, NODATACOW)) {
+ if (!btrfs_test_opt(root, COMPRESS) ||
+ !btrfs_test_opt(root, FORCE_COMPRESS)) {
+ btrfs_info(root->fs_info,
+ "setting nodatacow, compression disabled");
+ } else {
+ btrfs_info(root->fs_info, "setting nodatacow");
+ }
}
- info->compress_type = BTRFS_COMPRESS_NONE;
btrfs_clear_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
btrfs_set_opt(info->mount_opt, NODATACOW);
btrfs_set_opt(info->mount_opt, NODATASUM);
break;
+ case Opt_datacow:
+ btrfs_clear_and_info(root, NODATACOW,
+ "setting datacow");
+ break;
case Opt_compress_force:
case Opt_compress_force_type:
compress_force = true;
/* Fallthrough */
case Opt_compress:
case Opt_compress_type:
+ compress = true;
if (token == Opt_compress ||
token == Opt_compress_force ||
strcmp(args[0].from, "zlib") == 0) {
@@ -462,7 +496,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
btrfs_set_fs_incompat(info, COMPRESS_LZO);
} else if (strncmp(args[0].from, "no", 2) == 0) {
compress_type = "no";
- info->compress_type = BTRFS_COMPRESS_NONE;
btrfs_clear_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
compress_force = false;
@@ -472,33 +505,37 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
}
if (compress_force) {
- btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
- pr_info("btrfs: force %s compression\n",
- compress_type);
- } else
- pr_info("btrfs: use %s compression\n",
- compress_type);
+ btrfs_set_and_info(root, FORCE_COMPRESS,
+ "force %s compression",
+ compress_type);
+ } else if (compress) {
+ if (!btrfs_test_opt(root, COMPRESS))
+ btrfs_info(root->fs_info,
+ "btrfs: use %s compression",
+ compress_type);
+ }
break;
case Opt_ssd:
- printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
- btrfs_set_opt(info->mount_opt, SSD);
+ btrfs_set_and_info(root, SSD,
+ "use ssd allocation scheme");
break;
case Opt_ssd_spread:
- printk(KERN_INFO "btrfs: use spread ssd "
- "allocation scheme\n");
+ btrfs_set_and_info(root, SSD_SPREAD,
+ "use spread ssd allocation scheme");
btrfs_set_opt(info->mount_opt, SSD);
- btrfs_set_opt(info->mount_opt, SSD_SPREAD);
break;
case Opt_nossd:
- printk(KERN_INFO "btrfs: not using ssd allocation "
- "scheme\n");
- btrfs_set_opt(info->mount_opt, NOSSD);
+ btrfs_set_and_info(root, NOSSD,
+ "not using ssd allocation scheme");
btrfs_clear_opt(info->mount_opt, SSD);
- btrfs_clear_opt(info->mount_opt, SSD_SPREAD);
+ break;
+ case Opt_barrier:
+ btrfs_clear_and_info(root, NOBARRIER,
+ "turning on barriers");
break;
case Opt_nobarrier:
- printk(KERN_INFO "btrfs: turning off barriers\n");
- btrfs_set_opt(info->mount_opt, NOBARRIER);
+ btrfs_set_and_info(root, NOBARRIER,
+ "turning off barriers");
break;
case Opt_thread_pool:
ret = match_int(&args[0], &intarg);
@@ -518,11 +555,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
kfree(num);
if (info->max_inline) {
- info->max_inline = max_t(u64,
+ info->max_inline = min_t(u64,
info->max_inline,
root->sectorsize);
}
- printk(KERN_INFO "btrfs: max_inline at %llu\n",
+ btrfs_info(root->fs_info, "max_inline at %llu",
info->max_inline);
} else {
ret = -ENOMEM;
@@ -536,24 +573,41 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
info->alloc_start = memparse(num, NULL);
mutex_unlock(&info->chunk_mutex);
kfree(num);
- printk(KERN_INFO
- "btrfs: allocations start at %llu\n",
+ btrfs_info(root->fs_info, "allocations start at %llu",
info->alloc_start);
} else {
ret = -ENOMEM;
goto out;
}
break;
+ case Opt_acl:
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ root->fs_info->sb->s_flags |= MS_POSIXACL;
+ break;
+#else
+ btrfs_err(root->fs_info,
+ "support for ACL not compiled in!");
+ ret = -EINVAL;
+ goto out;
+#endif
case Opt_noacl:
root->fs_info->sb->s_flags &= ~MS_POSIXACL;
break;
case Opt_notreelog:
- printk(KERN_INFO "btrfs: disabling tree log\n");
- btrfs_set_opt(info->mount_opt, NOTREELOG);
+ btrfs_set_and_info(root, NOTREELOG,
+ "disabling tree log");
+ break;
+ case Opt_treelog:
+ btrfs_clear_and_info(root, NOTREELOG,
+ "enabling tree log");
break;
case Opt_flushoncommit:
- printk(KERN_INFO "btrfs: turning on flush-on-commit\n");
- btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT);
+ btrfs_set_and_info(root, FLUSHONCOMMIT,
+ "turning on flush-on-commit");
+ break;
+ case Opt_noflushoncommit:
+ btrfs_clear_and_info(root, FLUSHONCOMMIT,
+ "turning off flush-on-commit");
break;
case Opt_ratio:
ret = match_int(&args[0], &intarg);
@@ -561,7 +615,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
goto out;
} else if (intarg >= 0) {
info->metadata_ratio = intarg;
- printk(KERN_INFO "btrfs: metadata ratio %d\n",
+ btrfs_info(root->fs_info, "metadata ratio %d",
info->metadata_ratio);
} else {
ret = -EINVAL;
@@ -569,25 +623,35 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
}
break;
case Opt_discard:
- btrfs_set_opt(info->mount_opt, DISCARD);
+ btrfs_set_and_info(root, DISCARD,
+ "turning on discard");
+ break;
+ case Opt_nodiscard:
+ btrfs_clear_and_info(root, DISCARD,
+ "turning off discard");
break;
case Opt_space_cache:
- btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+ btrfs_set_and_info(root, SPACE_CACHE,
+ "enabling disk space caching");
break;
case Opt_rescan_uuid_tree:
btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
break;
case Opt_no_space_cache:
- printk(KERN_INFO "btrfs: disabling disk space caching\n");
- btrfs_clear_opt(info->mount_opt, SPACE_CACHE);
+ btrfs_clear_and_info(root, SPACE_CACHE,
+ "disabling disk space caching");
break;
case Opt_inode_cache:
- printk(KERN_INFO "btrfs: enabling inode map caching\n");
- btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
+ btrfs_set_and_info(root, CHANGE_INODE_CACHE,
+ "enabling inode map caching");
+ break;
+ case Opt_noinode_cache:
+ btrfs_clear_and_info(root, CHANGE_INODE_CACHE,
+ "disabling inode map caching");
break;
case Opt_clear_cache:
- printk(KERN_INFO "btrfs: force clearing of disk cache\n");
- btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
+ btrfs_set_and_info(root, CLEAR_CACHE,
+ "force clearing of disk cache");
break;
case Opt_user_subvol_rm_allowed:
btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
@@ -595,12 +659,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
case Opt_enospc_debug:
btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
break;
+ case Opt_noenospc_debug:
+ btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG);
+ break;
case Opt_defrag:
- printk(KERN_INFO "btrfs: enabling auto defrag\n");
- btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
+ btrfs_set_and_info(root, AUTO_DEFRAG,
+ "enabling auto defrag");
+ break;
+ case Opt_nodefrag:
+ btrfs_clear_and_info(root, AUTO_DEFRAG,
+ "disabling auto defrag");
break;
case Opt_recovery:
- printk(KERN_INFO "btrfs: enabling auto recovery\n");
+ btrfs_info(root->fs_info, "enabling auto recovery");
btrfs_set_opt(info->mount_opt, RECOVERY);
break;
case Opt_skip_balance:
@@ -608,14 +679,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
break;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
case Opt_check_integrity_including_extent_data:
- printk(KERN_INFO "btrfs: enabling check integrity"
- " including extent data\n");
+ btrfs_info(root->fs_info,
+ "enabling check integrity including extent data");
btrfs_set_opt(info->mount_opt,
CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
break;
case Opt_check_integrity:
- printk(KERN_INFO "btrfs: enabling check integrity\n");
+ btrfs_info(root->fs_info, "enabling check integrity");
btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
break;
case Opt_check_integrity_print_mask:
@@ -624,8 +695,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
goto out;
} else if (intarg >= 0) {
info->check_integrity_print_mask = intarg;
- printk(KERN_INFO "btrfs:"
- " check_integrity_print_mask 0x%x\n",
+ btrfs_info(root->fs_info, "check_integrity_print_mask 0x%x",
info->check_integrity_print_mask);
} else {
ret = -EINVAL;
@@ -636,8 +706,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
case Opt_check_integrity_including_extent_data:
case Opt_check_integrity:
case Opt_check_integrity_print_mask:
- printk(KERN_ERR "btrfs: support for check_integrity*"
- " not compiled in!\n");
+ btrfs_err(root->fs_info,
+ "support for check_integrity* not compiled in!");
ret = -EINVAL;
goto out;
#endif
@@ -657,28 +727,24 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
intarg = 0;
ret = match_int(&args[0], &intarg);
if (ret < 0) {
- printk(KERN_ERR
- "btrfs: invalid commit interval\n");
+ btrfs_err(root->fs_info, "invalid commit interval");
ret = -EINVAL;
goto out;
}
if (intarg > 0) {
if (intarg > 300) {
- printk(KERN_WARNING
- "btrfs: excessive commit interval %d\n",
+ btrfs_warn(root->fs_info, "excessive commit interval %d",
intarg);
}
info->commit_interval = intarg;
} else {
- printk(KERN_INFO
- "btrfs: using default commit interval %ds\n",
+ btrfs_info(root->fs_info, "using default commit interval %ds",
BTRFS_DEFAULT_COMMIT_INTERVAL);
info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
}
break;
case Opt_err:
- printk(KERN_INFO "btrfs: unrecognized mount option "
- "'%s'\n", p);
+ btrfs_info(root->fs_info, "unrecognized mount option '%s'", p);
ret = -EINVAL;
goto out;
default:
@@ -687,7 +753,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
}
out:
if (!ret && btrfs_test_opt(root, SPACE_CACHE))
- printk(KERN_INFO "btrfs: disk space caching is enabled\n");
+ btrfs_info(root->fs_info, "disk space caching is enabled");
kfree(orig);
return ret;
}
@@ -750,7 +816,8 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
break;
case Opt_subvolrootid:
printk(KERN_WARNING
- "btrfs: 'subvolrootid' mount option is deprecated and has no effect\n");
+ "BTRFS: 'subvolrootid' mount option is deprecated and has "
+ "no effect\n");
break;
case Opt_device:
device_name = match_strdup(&args[0]);
@@ -784,6 +851,7 @@ static struct dentry *get_default_root(struct super_block *sb,
struct btrfs_path *path;
struct btrfs_key location;
struct inode *inode;
+ struct dentry *dentry;
u64 dir_id;
int new = 0;
@@ -854,7 +922,13 @@ setup_root:
return dget(sb->s_root);
}
- return d_obtain_alias(inode);
+ dentry = d_obtain_alias(inode);
+ if (!IS_ERR(dentry)) {
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags &= ~DCACHE_DISCONNECTED;
+ spin_unlock(&dentry->d_lock);
+ }
+ return dentry;
}
static int btrfs_fill_super(struct super_block *sb,
@@ -879,7 +953,7 @@ static int btrfs_fill_super(struct super_block *sb,
sb->s_flags |= MS_I_VERSION;
err = open_ctree(sb, fs_devices, (char *)data);
if (err) {
- printk("btrfs: open_ctree failed\n");
+ printk(KERN_ERR "BTRFS: open_ctree failed\n");
return err;
}
@@ -921,7 +995,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
return 0;
}
- btrfs_wait_all_ordered_extents(fs_info);
+ btrfs_wait_ordered_roots(fs_info, -1);
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
@@ -1106,7 +1180,31 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
return ERR_PTR(-ENOMEM);
mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
newargs);
+
+ if (PTR_RET(mnt) == -EBUSY) {
+ if (flags & MS_RDONLY) {
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name,
+ newargs);
+ } else {
+ int r;
+ mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name,
+ newargs);
+ if (IS_ERR(mnt)) {
+ kfree(newargs);
+ return ERR_CAST(mnt);
+ }
+
+ r = btrfs_remount(mnt->mnt_sb, &flags, NULL);
+ if (r < 0) {
+ /* FIXME: release vfsmount mnt ??*/
+ kfree(newargs);
+ return ERR_PTR(r);
+ }
+ }
+ }
+
kfree(newargs);
+
if (IS_ERR(mnt))
return ERR_CAST(mnt);
@@ -1117,7 +1215,7 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
dput(root);
root = ERR_PTR(-EINVAL);
deactivate_locked_super(s);
- printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
+ printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n",
subvol_name);
}
@@ -1227,13 +1325,6 @@ error_fs_info:
return ERR_PTR(error);
}
-static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit)
-{
- spin_lock_irq(&workers->lock);
- workers->max_workers = new_limit;
- spin_unlock_irq(&workers->lock);
-}
-
static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
int new_pool_size, int old_pool_size)
{
@@ -1242,24 +1333,23 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
fs_info->thread_pool_size = new_pool_size;
- printk(KERN_INFO "btrfs: resize thread pool %d -> %d\n",
+ btrfs_info(fs_info, "resize thread pool %d -> %d",
old_pool_size, new_pool_size);
- btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size);
- btrfs_set_max_workers(&fs_info->workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
- btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
- btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers,
- new_pool_size);
+ btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->submit_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
+ new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
+ btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
+ new_pool_size);
}
static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
@@ -1310,6 +1400,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
unsigned int old_metadata_ratio = fs_info->metadata_ratio;
int ret;
+ sync_filesystem(sb);
btrfs_remount_prepare(fs_info);
ret = btrfs_parse_options(root, data);
@@ -1330,6 +1421,13 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
* this also happens on 'umount -rf' or on shutdown, when
* the filesystem is busy.
*/
+ cancel_work_sync(&fs_info->async_reclaim_work);
+
+ /* wait for the uuid_scan task to finish */
+ down(&fs_info->uuid_tree_rescan_sem);
+ /* avoid complains from lockdep et al. */
+ up(&fs_info->uuid_tree_rescan_sem);
+
sb->s_flags |= MS_RDONLY;
btrfs_dev_replace_suspend_for_unmount(fs_info);
@@ -1342,7 +1440,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
} else {
if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
btrfs_err(fs_info,
- "Remounting read-write after error is not allowed\n");
+ "Remounting read-write after error is not allowed");
ret = -EINVAL;
goto restore;
}
@@ -1354,8 +1452,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
if (fs_info->fs_devices->missing_devices >
fs_info->num_tolerated_disk_barrier_failures &&
!(*flags & MS_RDONLY)) {
- printk(KERN_WARNING
- "Btrfs: too many missing devices, writeable remount is not allowed\n");
+ btrfs_warn(fs_info,
+ "too many missing devices, writeable remount is not allowed");
ret = -EACCES;
goto restore;
}
@@ -1370,7 +1468,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
goto restore;
/* recover relocation */
+ mutex_lock(&fs_info->cleaner_mutex);
ret = btrfs_recover_relocation(root);
+ mutex_unlock(&fs_info->cleaner_mutex);
if (ret)
goto restore;
@@ -1380,22 +1480,22 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
ret = btrfs_resume_dev_replace_async(fs_info);
if (ret) {
- pr_warn("btrfs: failed to resume dev_replace\n");
+ btrfs_warn(fs_info, "failed to resume dev_replace");
goto restore;
}
if (!fs_info->uuid_root) {
- pr_info("btrfs: creating UUID tree\n");
+ btrfs_info(fs_info, "creating UUID tree");
ret = btrfs_create_uuid_tree(fs_info);
if (ret) {
- pr_warn("btrfs: failed to create the uuid tree"
- "%d\n", ret);
+ btrfs_warn(fs_info, "failed to create the UUID tree %d", ret);
goto restore;
}
}
sb->s_flags &= ~MS_RDONLY;
}
out:
+ wake_up_process(fs_info->transaction_kthread);
btrfs_remount_cleanup(fs_info, old_opts);
return 0;
@@ -1465,7 +1565,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
nr_devices = fs_info->fs_devices->open_devices;
BUG_ON(!nr_devices);
- devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
+ devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
GFP_NOFS);
if (!devices_info)
return -ENOMEM;
@@ -1711,6 +1811,8 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
list_for_each_entry(dev, head, dev_list) {
if (dev->missing)
continue;
+ if (!dev->name)
+ continue;
if (!first_dev || dev->devid < first_dev->devid)
first_dev = dev;
}
@@ -1769,7 +1871,7 @@ static int btrfs_interface_init(void)
static void btrfs_interface_exit(void)
{
if (misc_deregister(&btrfs_misc) < 0)
- printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
+ printk(KERN_INFO "BTRFS: misc_deregister failed for control device\n");
}
static void btrfs_print_info(void)
@@ -1789,17 +1891,44 @@ static void btrfs_print_info(void)
static int btrfs_run_sanity_tests(void)
{
- return btrfs_test_free_space_cache();
+ int ret;
+
+ ret = btrfs_init_test_fs();
+ if (ret)
+ return ret;
+
+ ret = btrfs_test_free_space_cache();
+ if (ret)
+ goto out;
+ ret = btrfs_test_extent_buffer_operations();
+ if (ret)
+ goto out;
+ ret = btrfs_test_extent_io();
+ if (ret)
+ goto out;
+ ret = btrfs_test_inodes();
+ if (ret)
+ goto out;
+ ret = btrfs_test_qgroups();
+out:
+ btrfs_destroy_test_fs();
+ return ret;
}
static int __init init_btrfs_fs(void)
{
int err;
- err = btrfs_init_sysfs();
+ err = btrfs_hash_init();
if (err)
return err;
+ btrfs_props_init();
+
+ err = btrfs_init_sysfs();
+ if (err)
+ goto free_hash;
+
btrfs_init_compress();
err = btrfs_init_cachep();
@@ -1873,6 +2002,8 @@ free_cachep:
free_compress:
btrfs_exit_compress();
btrfs_exit_sysfs();
+free_hash:
+ btrfs_hash_exit();
return err;
}
@@ -1891,9 +2022,10 @@ static void __exit exit_btrfs_fs(void)
btrfs_exit_sysfs();
btrfs_cleanup_fs_uuids();
btrfs_exit_compress();
+ btrfs_hash_exit();
}
-module_init(init_btrfs_fs)
+late_initcall(init_btrfs_fs);
module_exit(exit_btrfs_fs)
MODULE_LICENSE("GPL");
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 5b326cd60a4..78699364f53 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -22,24 +22,732 @@
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/kobject.h>
+#include <linux/bug.h>
+#include <linux/genhd.h>
+#include <linux/debugfs.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
+#include "sysfs.h"
+#include "volumes.h"
+
+static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj);
+
+static u64 get_features(struct btrfs_fs_info *fs_info,
+ enum btrfs_feature_set set)
+{
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ if (set == FEAT_COMPAT)
+ return btrfs_super_compat_flags(disk_super);
+ else if (set == FEAT_COMPAT_RO)
+ return btrfs_super_compat_ro_flags(disk_super);
+ else
+ return btrfs_super_incompat_flags(disk_super);
+}
+
+static void set_features(struct btrfs_fs_info *fs_info,
+ enum btrfs_feature_set set, u64 features)
+{
+ struct btrfs_super_block *disk_super = fs_info->super_copy;
+ if (set == FEAT_COMPAT)
+ btrfs_set_super_compat_flags(disk_super, features);
+ else if (set == FEAT_COMPAT_RO)
+ btrfs_set_super_compat_ro_flags(disk_super, features);
+ else
+ btrfs_set_super_incompat_flags(disk_super, features);
+}
+
+static int can_modify_feature(struct btrfs_feature_attr *fa)
+{
+ int val = 0;
+ u64 set, clear;
+ switch (fa->feature_set) {
+ case FEAT_COMPAT:
+ set = BTRFS_FEATURE_COMPAT_SAFE_SET;
+ clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR;
+ break;
+ case FEAT_COMPAT_RO:
+ set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET;
+ clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR;
+ break;
+ case FEAT_INCOMPAT:
+ set = BTRFS_FEATURE_INCOMPAT_SAFE_SET;
+ clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR;
+ break;
+ default:
+ printk(KERN_WARNING "btrfs: sysfs: unknown feature set %d\n",
+ fa->feature_set);
+ return 0;
+ }
+
+ if (set & fa->feature_bit)
+ val |= 1;
+ if (clear & fa->feature_bit)
+ val |= 2;
+
+ return val;
+}
+
+static ssize_t btrfs_feature_attr_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ int val = 0;
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
+ if (fs_info) {
+ u64 features = get_features(fs_info, fa->feature_set);
+ if (features & fa->feature_bit)
+ val = 1;
+ } else
+ val = can_modify_feature(fa);
+
+ return snprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+
+static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t count)
+{
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_feature_attr *fa = to_btrfs_feature_attr(a);
+ struct btrfs_trans_handle *trans;
+ u64 features, set, clear;
+ unsigned long val;
+ int ret;
+
+ fs_info = to_fs_info(kobj);
+ if (!fs_info)
+ return -EPERM;
+
+ ret = kstrtoul(skip_spaces(buf), 0, &val);
+ if (ret)
+ return ret;
+
+ if (fa->feature_set == FEAT_COMPAT) {
+ set = BTRFS_FEATURE_COMPAT_SAFE_SET;
+ clear = BTRFS_FEATURE_COMPAT_SAFE_CLEAR;
+ } else if (fa->feature_set == FEAT_COMPAT_RO) {
+ set = BTRFS_FEATURE_COMPAT_RO_SAFE_SET;
+ clear = BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR;
+ } else {
+ set = BTRFS_FEATURE_INCOMPAT_SAFE_SET;
+ clear = BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR;
+ }
+
+ features = get_features(fs_info, fa->feature_set);
+
+ /* Nothing to do */
+ if ((val && (features & fa->feature_bit)) ||
+ (!val && !(features & fa->feature_bit)))
+ return count;
+
+ if ((val && !(set & fa->feature_bit)) ||
+ (!val && !(clear & fa->feature_bit))) {
+ btrfs_info(fs_info,
+ "%sabling feature %s on mounted fs is not supported.",
+ val ? "En" : "Dis", fa->kobj_attr.attr.name);
+ return -EPERM;
+ }
+
+ btrfs_info(fs_info, "%s %s feature flag",
+ val ? "Setting" : "Clearing", fa->kobj_attr.attr.name);
+
+ trans = btrfs_start_transaction(fs_info->fs_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ spin_lock(&fs_info->super_lock);
+ features = get_features(fs_info, fa->feature_set);
+ if (val)
+ features |= fa->feature_bit;
+ else
+ features &= ~fa->feature_bit;
+ set_features(fs_info, fa->feature_set, features);
+ spin_unlock(&fs_info->super_lock);
+
+ ret = btrfs_commit_transaction(trans, fs_info->fs_root);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+static umode_t btrfs_feature_visible(struct kobject *kobj,
+ struct attribute *attr, int unused)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ umode_t mode = attr->mode;
+
+ if (fs_info) {
+ struct btrfs_feature_attr *fa;
+ u64 features;
+
+ fa = attr_to_btrfs_feature_attr(attr);
+ features = get_features(fs_info, fa->feature_set);
+
+ if (can_modify_feature(fa))
+ mode |= S_IWUSR;
+ else if (!(features & fa->feature_bit))
+ mode = 0;
+ }
+
+ return mode;
+}
+
+BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF);
+BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL);
+BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS);
+BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO);
+BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA);
+BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF);
+BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);
+BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
+BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
+
+static struct attribute *btrfs_supported_feature_attrs[] = {
+ BTRFS_FEAT_ATTR_PTR(mixed_backref),
+ BTRFS_FEAT_ATTR_PTR(default_subvol),
+ BTRFS_FEAT_ATTR_PTR(mixed_groups),
+ BTRFS_FEAT_ATTR_PTR(compress_lzo),
+ BTRFS_FEAT_ATTR_PTR(big_metadata),
+ BTRFS_FEAT_ATTR_PTR(extended_iref),
+ BTRFS_FEAT_ATTR_PTR(raid56),
+ BTRFS_FEAT_ATTR_PTR(skinny_metadata),
+ BTRFS_FEAT_ATTR_PTR(no_holes),
+ NULL
+};
+
+static const struct attribute_group btrfs_feature_attr_group = {
+ .name = "features",
+ .is_visible = btrfs_feature_visible,
+ .attrs = btrfs_supported_feature_attrs,
+};
+
+static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf)
+{
+ u64 val;
+ if (lock)
+ spin_lock(lock);
+ val = *value_ptr;
+ if (lock)
+ spin_unlock(lock);
+ return snprintf(buf, PAGE_SIZE, "%llu\n", val);
+}
+
+static ssize_t global_rsv_size_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent);
+ struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+ return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf);
+}
+BTRFS_ATTR(global_rsv_size, 0444, global_rsv_size_show);
+
+static ssize_t global_rsv_reserved_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj->parent);
+ struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+ return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf);
+}
+BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show);
+
+#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj)
+#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj)
+
+static ssize_t raid_bytes_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf);
+BTRFS_RAID_ATTR(total_bytes, raid_bytes_show);
+BTRFS_RAID_ATTR(used_bytes, raid_bytes_show);
+
+static ssize_t raid_bytes_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+
+{
+ struct btrfs_space_info *sinfo = to_space_info(kobj->parent);
+ struct btrfs_block_group_cache *block_group;
+ int index = to_raid_kobj(kobj)->raid_type;
+ u64 val = 0;
+
+ down_read(&sinfo->groups_sem);
+ list_for_each_entry(block_group, &sinfo->block_groups[index], list) {
+ if (&attr->attr == BTRFS_RAID_ATTR_PTR(total_bytes))
+ val += block_group->key.offset;
+ else
+ val += btrfs_block_group_used(&block_group->item);
+ }
+ up_read(&sinfo->groups_sem);
+ return snprintf(buf, PAGE_SIZE, "%llu\n", val);
+}
+
+static struct attribute *raid_attributes[] = {
+ BTRFS_RAID_ATTR_PTR(total_bytes),
+ BTRFS_RAID_ATTR_PTR(used_bytes),
+ NULL
+};
+
+static void release_raid_kobj(struct kobject *kobj)
+{
+ kfree(to_raid_kobj(kobj));
+}
+
+struct kobj_type btrfs_raid_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = release_raid_kobj,
+ .default_attrs = raid_attributes,
+};
+
+#define SPACE_INFO_ATTR(field) \
+static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
+ struct kobj_attribute *a, \
+ char *buf) \
+{ \
+ struct btrfs_space_info *sinfo = to_space_info(kobj); \
+ return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \
+} \
+BTRFS_ATTR(field, 0444, btrfs_space_info_show_##field)
+
+static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_space_info *sinfo = to_space_info(kobj);
+ s64 val = percpu_counter_sum(&sinfo->total_bytes_pinned);
+ return snprintf(buf, PAGE_SIZE, "%lld\n", val);
+}
+
+SPACE_INFO_ATTR(flags);
+SPACE_INFO_ATTR(total_bytes);
+SPACE_INFO_ATTR(bytes_used);
+SPACE_INFO_ATTR(bytes_pinned);
+SPACE_INFO_ATTR(bytes_reserved);
+SPACE_INFO_ATTR(bytes_may_use);
+SPACE_INFO_ATTR(disk_used);
+SPACE_INFO_ATTR(disk_total);
+BTRFS_ATTR(total_bytes_pinned, 0444, btrfs_space_info_show_total_bytes_pinned);
+
+static struct attribute *space_info_attrs[] = {
+ BTRFS_ATTR_PTR(flags),
+ BTRFS_ATTR_PTR(total_bytes),
+ BTRFS_ATTR_PTR(bytes_used),
+ BTRFS_ATTR_PTR(bytes_pinned),
+ BTRFS_ATTR_PTR(bytes_reserved),
+ BTRFS_ATTR_PTR(bytes_may_use),
+ BTRFS_ATTR_PTR(disk_used),
+ BTRFS_ATTR_PTR(disk_total),
+ BTRFS_ATTR_PTR(total_bytes_pinned),
+ NULL,
+};
+
+static void space_info_release(struct kobject *kobj)
+{
+ struct btrfs_space_info *sinfo = to_space_info(kobj);
+ percpu_counter_destroy(&sinfo->total_bytes_pinned);
+ kfree(sinfo);
+}
+
+struct kobj_type space_info_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = space_info_release,
+ .default_attrs = space_info_attrs,
+};
+
+static const struct attribute *allocation_attrs[] = {
+ BTRFS_ATTR_PTR(global_rsv_reserved),
+ BTRFS_ATTR_PTR(global_rsv_size),
+ NULL,
+};
+
+static ssize_t btrfs_label_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ return snprintf(buf, PAGE_SIZE, "%s\n", fs_info->super_copy->label);
+}
+
+static ssize_t btrfs_label_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = fs_info->fs_root;
+ int ret;
+
+ if (len >= BTRFS_LABEL_SIZE)
+ return -EINVAL;
+
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ spin_lock(&root->fs_info->super_lock);
+ strcpy(fs_info->super_copy->label, buf);
+ spin_unlock(&root->fs_info->super_lock);
+ ret = btrfs_commit_transaction(trans, root);
+
+ if (!ret)
+ return len;
+
+ return ret;
+}
+BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store);
+
+static ssize_t btrfs_no_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ return -EPERM;
+}
+
+static ssize_t btrfs_nodesize_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
+}
+
+BTRFS_ATTR_RW(nodesize, 0444, btrfs_nodesize_show, btrfs_no_store);
+
+static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
+}
+
+BTRFS_ATTR_RW(sectorsize, 0444, btrfs_sectorsize_show, btrfs_no_store);
+
+static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
+ struct kobj_attribute *a, char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
+}
+
+BTRFS_ATTR_RW(clone_alignment, 0444, btrfs_clone_alignment_show, btrfs_no_store);
+
+static struct attribute *btrfs_attrs[] = {
+ BTRFS_ATTR_PTR(label),
+ BTRFS_ATTR_PTR(nodesize),
+ BTRFS_ATTR_PTR(sectorsize),
+ BTRFS_ATTR_PTR(clone_alignment),
+ NULL,
+};
+
+static void btrfs_release_super_kobj(struct kobject *kobj)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ complete(&fs_info->kobj_unregister);
+}
+
+static struct kobj_type btrfs_ktype = {
+ .sysfs_ops = &kobj_sysfs_ops,
+ .release = btrfs_release_super_kobj,
+ .default_attrs = btrfs_attrs,
+};
+
+static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
+{
+ if (kobj->ktype != &btrfs_ktype)
+ return NULL;
+ return container_of(kobj, struct btrfs_fs_info, super_kobj);
+}
+
+#define NUM_FEATURE_BITS 64
+static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13];
+static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS];
+
+static u64 supported_feature_masks[3] = {
+ [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP,
+ [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP,
+ [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP,
+};
+
+static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
+{
+ int set;
+
+ for (set = 0; set < FEAT_MAX; set++) {
+ int i;
+ struct attribute *attrs[2];
+ struct attribute_group agroup = {
+ .name = "features",
+ .attrs = attrs,
+ };
+ u64 features = get_features(fs_info, set);
+ features &= ~supported_feature_masks[set];
+
+ if (!features)
+ continue;
+
+ attrs[1] = NULL;
+ for (i = 0; i < NUM_FEATURE_BITS; i++) {
+ struct btrfs_feature_attr *fa;
+
+ if (!(features & (1ULL << i)))
+ continue;
+
+ fa = &btrfs_feature_attrs[set][i];
+ attrs[0] = &fa->kobj_attr.attr;
+ if (add) {
+ int ret;
+ ret = sysfs_merge_group(&fs_info->super_kobj,
+ &agroup);
+ if (ret)
+ return ret;
+ } else
+ sysfs_unmerge_group(&fs_info->super_kobj,
+ &agroup);
+ }
+
+ }
+ return 0;
+}
+
+static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
+{
+ kobject_del(&fs_info->super_kobj);
+ kobject_put(&fs_info->super_kobj);
+ wait_for_completion(&fs_info->kobj_unregister);
+}
+
+void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
+{
+ if (fs_info->space_info_kobj) {
+ sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs);
+ kobject_del(fs_info->space_info_kobj);
+ kobject_put(fs_info->space_info_kobj);
+ }
+ kobject_del(fs_info->device_dir_kobj);
+ kobject_put(fs_info->device_dir_kobj);
+ addrm_unknown_feature_attrs(fs_info, false);
+ sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group);
+ __btrfs_sysfs_remove_one(fs_info);
+}
+
+const char * const btrfs_feature_set_names[3] = {
+ [FEAT_COMPAT] = "compat",
+ [FEAT_COMPAT_RO] = "compat_ro",
+ [FEAT_INCOMPAT] = "incompat",
+};
+
+char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags)
+{
+ size_t bufsize = 4096; /* safe max, 64 names * 64 bytes */
+ int len = 0;
+ int i;
+ char *str;
+
+ str = kmalloc(bufsize, GFP_KERNEL);
+ if (!str)
+ return str;
+
+ for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) {
+ const char *name;
+
+ if (!(flags & (1ULL << i)))
+ continue;
+
+ name = btrfs_feature_attrs[set][i].kobj_attr.attr.name;
+ len += snprintf(str + len, bufsize - len, "%s%s",
+ len ? "," : "", name);
+ }
+
+ return str;
+}
+
+static void init_feature_attrs(void)
+{
+ struct btrfs_feature_attr *fa;
+ int set, i;
+
+ BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names) !=
+ ARRAY_SIZE(btrfs_feature_attrs));
+ BUILD_BUG_ON(ARRAY_SIZE(btrfs_unknown_feature_names[0]) !=
+ ARRAY_SIZE(btrfs_feature_attrs[0]));
+
+ memset(btrfs_feature_attrs, 0, sizeof(btrfs_feature_attrs));
+ memset(btrfs_unknown_feature_names, 0,
+ sizeof(btrfs_unknown_feature_names));
+
+ for (i = 0; btrfs_supported_feature_attrs[i]; i++) {
+ struct btrfs_feature_attr *sfa;
+ struct attribute *a = btrfs_supported_feature_attrs[i];
+ int bit;
+ sfa = attr_to_btrfs_feature_attr(a);
+ bit = ilog2(sfa->feature_bit);
+ fa = &btrfs_feature_attrs[sfa->feature_set][bit];
+
+ fa->kobj_attr.attr.name = sfa->kobj_attr.attr.name;
+ }
+
+ for (set = 0; set < FEAT_MAX; set++) {
+ for (i = 0; i < ARRAY_SIZE(btrfs_feature_attrs[set]); i++) {
+ char *name = btrfs_unknown_feature_names[set][i];
+ fa = &btrfs_feature_attrs[set][i];
+
+ if (fa->kobj_attr.attr.name)
+ continue;
+
+ snprintf(name, 13, "%s:%u",
+ btrfs_feature_set_names[set], i);
+
+ fa->kobj_attr.attr.name = name;
+ fa->kobj_attr.attr.mode = S_IRUGO;
+ fa->feature_set = set;
+ fa->feature_bit = 1ULL << i;
+ }
+ }
+}
+
+int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device)
+{
+ struct hd_struct *disk;
+ struct kobject *disk_kobj;
+
+ if (!fs_info->device_dir_kobj)
+ return -EINVAL;
+
+ if (one_device) {
+ disk = one_device->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
+
+ sysfs_remove_link(fs_info->device_dir_kobj,
+ disk_kobj->name);
+ }
+
+ return 0;
+}
+
+int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device)
+{
+ int error = 0;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *dev;
+
+ if (!fs_info->device_dir_kobj)
+ fs_info->device_dir_kobj = kobject_create_and_add("devices",
+ &fs_info->super_kobj);
+
+ if (!fs_info->device_dir_kobj)
+ return -ENOMEM;
+
+ list_for_each_entry(dev, &fs_devices->devices, dev_list) {
+ struct hd_struct *disk;
+ struct kobject *disk_kobj;
+
+ if (!dev->bdev)
+ continue;
+
+ if (one_device && one_device != dev)
+ continue;
+
+ disk = dev->bdev->bd_part;
+ disk_kobj = &part_to_dev(disk)->kobj;
+
+ error = sysfs_create_link(fs_info->device_dir_kobj,
+ disk_kobj, disk_kobj->name);
+ if (error)
+ break;
+ }
+
+ return error;
+}
/* /sys/fs/btrfs/ entry */
static struct kset *btrfs_kset;
+/* /sys/kernel/debug/btrfs */
+static struct dentry *btrfs_debugfs_root_dentry;
+
+/* Debugging tunables and exported data */
+u64 btrfs_debugfs_test;
+
+int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
+{
+ int error;
+
+ init_completion(&fs_info->kobj_unregister);
+ fs_info->super_kobj.kset = btrfs_kset;
+ error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL,
+ "%pU", fs_info->fsid);
+ if (error)
+ return error;
+
+ error = sysfs_create_group(&fs_info->super_kobj,
+ &btrfs_feature_attr_group);
+ if (error) {
+ __btrfs_sysfs_remove_one(fs_info);
+ return error;
+ }
+
+ error = addrm_unknown_feature_attrs(fs_info, true);
+ if (error)
+ goto failure;
+
+ error = btrfs_kobj_add_device(fs_info, NULL);
+ if (error)
+ goto failure;
+
+ fs_info->space_info_kobj = kobject_create_and_add("allocation",
+ &fs_info->super_kobj);
+ if (!fs_info->space_info_kobj) {
+ error = -ENOMEM;
+ goto failure;
+ }
+
+ error = sysfs_create_files(fs_info->space_info_kobj, allocation_attrs);
+ if (error)
+ goto failure;
+
+ return 0;
+failure:
+ btrfs_sysfs_remove_one(fs_info);
+ return error;
+}
+
+static int btrfs_init_debugfs(void)
+{
+#ifdef CONFIG_DEBUG_FS
+ btrfs_debugfs_root_dentry = debugfs_create_dir("btrfs", NULL);
+ if (!btrfs_debugfs_root_dentry)
+ return -ENOMEM;
+
+ debugfs_create_u64("test", S_IRUGO | S_IWUGO, btrfs_debugfs_root_dentry,
+ &btrfs_debugfs_test);
+#endif
+ return 0;
+}
+
int btrfs_init_sysfs(void)
{
+ int ret;
+
btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
if (!btrfs_kset)
return -ENOMEM;
- return 0;
+
+ ret = btrfs_init_debugfs();
+ if (ret)
+ return ret;
+
+ init_feature_attrs();
+ ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
+
+ return ret;
}
void btrfs_exit_sysfs(void)
{
+ sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
kset_unregister(btrfs_kset);
+ debugfs_remove_recursive(btrfs_debugfs_root_dentry);
}
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
new file mode 100644
index 00000000000..ac46df37504
--- /dev/null
+++ b/fs/btrfs/sysfs.h
@@ -0,0 +1,73 @@
+#ifndef _BTRFS_SYSFS_H_
+#define _BTRFS_SYSFS_H_
+
+/*
+ * Data exported through sysfs
+ */
+extern u64 btrfs_debugfs_test;
+
+enum btrfs_feature_set {
+ FEAT_COMPAT,
+ FEAT_COMPAT_RO,
+ FEAT_INCOMPAT,
+ FEAT_MAX
+};
+
+#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store) \
+{ \
+ .attr = { .name = __stringify(_name), .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+}
+
+#define BTRFS_ATTR_RW(_name, _mode, _show, _store) \
+static struct kobj_attribute btrfs_attr_##_name = \
+ __INIT_KOBJ_ATTR(_name, _mode, _show, _store)
+#define BTRFS_ATTR(_name, _mode, _show) \
+ BTRFS_ATTR_RW(_name, _mode, _show, NULL)
+#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr)
+
+#define BTRFS_RAID_ATTR(_name, _show) \
+static struct kobj_attribute btrfs_raid_attr_##_name = \
+ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
+#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr)
+
+
+struct btrfs_feature_attr {
+ struct kobj_attribute kobj_attr;
+ enum btrfs_feature_set feature_set;
+ u64 feature_bit;
+};
+
+#define BTRFS_FEAT_ATTR(_name, _feature_set, _prefix, _feature_bit) \
+static struct btrfs_feature_attr btrfs_attr_##_name = { \
+ .kobj_attr = __INIT_KOBJ_ATTR(_name, S_IRUGO, \
+ btrfs_feature_attr_show, \
+ btrfs_feature_attr_store), \
+ .feature_set = _feature_set, \
+ .feature_bit = _prefix ##_## _feature_bit, \
+}
+#define BTRFS_FEAT_ATTR_PTR(_name) (&btrfs_attr_##_name.kobj_attr.attr)
+
+#define BTRFS_FEAT_ATTR_COMPAT(name, feature) \
+ BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature)
+#define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \
+ BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT, feature)
+#define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \
+ BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)
+
+/* convert from attribute */
+#define to_btrfs_feature_attr(a) \
+ container_of(a, struct btrfs_feature_attr, kobj_attr)
+#define attr_to_btrfs_attr(a) container_of(a, struct kobj_attribute, attr)
+#define attr_to_btrfs_feature_attr(a) \
+ to_btrfs_feature_attr(attr_to_btrfs_attr(a))
+char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
+extern const char * const btrfs_feature_set_names[3];
+extern struct kobj_type space_info_ktype;
+extern struct kobj_type btrfs_raid_ktype;
+int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device);
+int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
+ struct btrfs_device *one_device);
+#endif /* _BTRFS_SYSFS_H_ */
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
new file mode 100644
index 00000000000..9626252ee6b
--- /dev/null
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -0,0 +1,171 @@
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/magic.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../volumes.h"
+#include "../disk-io.h"
+#include "../qgroup.h"
+
+static struct vfsmount *test_mnt = NULL;
+
+static const struct super_operations btrfs_test_super_ops = {
+ .alloc_inode = btrfs_alloc_inode,
+ .destroy_inode = btrfs_test_destroy_inode,
+};
+
+static struct dentry *btrfs_test_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name,
+ void *data)
+{
+ return mount_pseudo(fs_type, "btrfs_test:", &btrfs_test_super_ops,
+ NULL, BTRFS_TEST_MAGIC);
+}
+
+static struct file_system_type test_type = {
+ .name = "btrfs_test_fs",
+ .mount = btrfs_test_mount,
+ .kill_sb = kill_anon_super,
+};
+
+struct inode *btrfs_new_test_inode(void)
+{
+ return new_inode(test_mnt->mnt_sb);
+}
+
+int btrfs_init_test_fs(void)
+{
+ int ret;
+
+ ret = register_filesystem(&test_type);
+ if (ret) {
+ printk(KERN_ERR "btrfs: cannot register test file system\n");
+ return ret;
+ }
+
+ test_mnt = kern_mount(&test_type);
+ if (IS_ERR(test_mnt)) {
+ printk(KERN_ERR "btrfs: cannot mount test file system\n");
+ unregister_filesystem(&test_type);
+ return ret;
+ }
+ return 0;
+}
+
+void btrfs_destroy_test_fs(void)
+{
+ kern_unmount(test_mnt);
+ unregister_filesystem(&test_type);
+}
+
+struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
+{
+ struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
+ GFP_NOFS);
+
+ if (!fs_info)
+ return fs_info;
+ fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
+ GFP_NOFS);
+ if (!fs_info->fs_devices) {
+ kfree(fs_info);
+ return NULL;
+ }
+ fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block),
+ GFP_NOFS);
+ if (!fs_info->super_copy) {
+ kfree(fs_info->fs_devices);
+ kfree(fs_info);
+ return NULL;
+ }
+
+ if (init_srcu_struct(&fs_info->subvol_srcu)) {
+ kfree(fs_info->fs_devices);
+ kfree(fs_info->super_copy);
+ kfree(fs_info);
+ return NULL;
+ }
+
+ spin_lock_init(&fs_info->buffer_lock);
+ spin_lock_init(&fs_info->qgroup_lock);
+ spin_lock_init(&fs_info->qgroup_op_lock);
+ spin_lock_init(&fs_info->super_lock);
+ spin_lock_init(&fs_info->fs_roots_radix_lock);
+ spin_lock_init(&fs_info->tree_mod_seq_lock);
+ mutex_init(&fs_info->qgroup_ioctl_lock);
+ mutex_init(&fs_info->qgroup_rescan_lock);
+ rwlock_init(&fs_info->tree_mod_log_lock);
+ fs_info->running_transaction = NULL;
+ fs_info->qgroup_tree = RB_ROOT;
+ fs_info->qgroup_ulist = NULL;
+ atomic64_set(&fs_info->tree_mod_seq, 0);
+ INIT_LIST_HEAD(&fs_info->dirty_qgroups);
+ INIT_LIST_HEAD(&fs_info->dead_roots);
+ INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
+ INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
+ INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+ return fs_info;
+}
+
+static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+
+ spin_lock(&fs_info->buffer_lock);
+restart:
+ radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
+ struct extent_buffer *eb;
+
+ eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
+ if (!eb)
+ continue;
+ /* Shouldn't happen but that kind of thinking creates CVE's */
+ if (radix_tree_exception(eb)) {
+ if (radix_tree_deref_retry(eb))
+ goto restart;
+ continue;
+ }
+ spin_unlock(&fs_info->buffer_lock);
+ free_extent_buffer_stale(eb);
+ spin_lock(&fs_info->buffer_lock);
+ }
+ spin_unlock(&fs_info->buffer_lock);
+
+ btrfs_free_qgroup_config(fs_info);
+ btrfs_free_fs_roots(fs_info);
+ cleanup_srcu_struct(&fs_info->subvol_srcu);
+ kfree(fs_info->super_copy);
+ kfree(fs_info->fs_devices);
+ kfree(fs_info);
+}
+
+void btrfs_free_dummy_root(struct btrfs_root *root)
+{
+ if (!root)
+ return;
+ if (root->node)
+ free_extent_buffer(root->node);
+ if (root->fs_info)
+ btrfs_free_dummy_fs_info(root->fs_info);
+ kfree(root);
+}
+
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 58087762577..fd395422448 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -21,14 +21,48 @@
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-#define test_msg(fmt, ...) pr_info("btrfs: selftest: " fmt, ##__VA_ARGS__)
+#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__)
+
+struct btrfs_root;
int btrfs_test_free_space_cache(void);
+int btrfs_test_extent_buffer_operations(void);
+int btrfs_test_extent_io(void);
+int btrfs_test_inodes(void);
+int btrfs_test_qgroups(void);
+int btrfs_init_test_fs(void);
+void btrfs_destroy_test_fs(void);
+struct inode *btrfs_new_test_inode(void);
+struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void);
+void btrfs_free_dummy_root(struct btrfs_root *root);
#else
static inline int btrfs_test_free_space_cache(void)
{
return 0;
}
+static inline int btrfs_test_extent_buffer_operations(void)
+{
+ return 0;
+}
+static inline int btrfs_init_test_fs(void)
+{
+ return 0;
+}
+static inline void btrfs_destroy_test_fs(void)
+{
+}
+static inline int btrfs_test_extent_io(void)
+{
+ return 0;
+}
+static inline int btrfs_test_inodes(void)
+{
+ return 0;
+}
+static inline int btrfs_test_qgroups(void)
+{
+ return 0;
+}
#endif
#endif
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
new file mode 100644
index 00000000000..cc286ce97d1
--- /dev/null
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/slab.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../extent_io.h"
+#include "../disk-io.h"
+
+static int test_btrfs_split_item(void)
+{
+ struct btrfs_path *path;
+ struct btrfs_root *root;
+ struct extent_buffer *eb;
+ struct btrfs_item *item;
+ char *value = "mary had a little lamb";
+ char *split1 = "mary had a little";
+ char *split2 = " lamb";
+ char *split3 = "mary";
+ char *split4 = " had a little";
+ char buf[32];
+ struct btrfs_key key;
+ u32 value_len = strlen(value);
+ int ret = 0;
+
+ test_msg("Running btrfs_split_item tests\n");
+
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ test_msg("Could not allocate root\n");
+ return PTR_ERR(root);
+ }
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Could not allocate path\n");
+ kfree(root);
+ return -ENOMEM;
+ }
+
+ path->nodes[0] = eb = alloc_dummy_extent_buffer(0, 4096);
+ if (!eb) {
+ test_msg("Could not allocate dummy buffer\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+ path->slots[0] = 0;
+
+ key.objectid = 0;
+ key.type = BTRFS_EXTENT_CSUM_KEY;
+ key.offset = 0;
+
+ setup_items_for_insert(root, path, &key, &value_len, value_len,
+ value_len + sizeof(struct btrfs_item), 1);
+ item = btrfs_item_nr(0);
+ write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0),
+ value_len);
+
+ key.offset = 3;
+
+ /*
+ * Passing NULL trans here should be safe because we have plenty of
+ * space in this leaf to split the item without having to split the
+ * leaf.
+ */
+ ret = btrfs_split_item(NULL, root, path, &key, 17);
+ if (ret) {
+ test_msg("Split item failed %d\n", ret);
+ goto out;
+ }
+
+ /*
+ * Read the first slot, it should have the original key and contain only
+ * 'mary had a little'
+ */
+ btrfs_item_key_to_cpu(eb, &key, 0);
+ if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset != 0) {
+ test_msg("Invalid key at slot 0\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ item = btrfs_item_nr(0);
+ if (btrfs_item_size(eb, item) != strlen(split1)) {
+ test_msg("Invalid len in the first split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
+ strlen(split1));
+ if (memcmp(buf, split1, strlen(split1))) {
+ test_msg("Data in the buffer doesn't match what it should "
+ "in the first split have='%.*s' want '%s'\n",
+ (int)strlen(split1), buf, split1);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, 1);
+ if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset != 3) {
+ test_msg("Invalid key at slot 1\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ item = btrfs_item_nr(1);
+ if (btrfs_item_size(eb, item) != strlen(split2)) {
+ test_msg("Invalid len in the second split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
+ strlen(split2));
+ if (memcmp(buf, split2, strlen(split2))) {
+ test_msg("Data in the buffer doesn't match what it should "
+ "in the second split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ key.offset = 1;
+ /* Do it again so we test memmoving the other items in the leaf */
+ ret = btrfs_split_item(NULL, root, path, &key, 4);
+ if (ret) {
+ test_msg("Second split item failed %d\n", ret);
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, 0);
+ if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset != 0) {
+ test_msg("Invalid key at slot 0\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ item = btrfs_item_nr(0);
+ if (btrfs_item_size(eb, item) != strlen(split3)) {
+ test_msg("Invalid len in the first split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 0),
+ strlen(split3));
+ if (memcmp(buf, split3, strlen(split3))) {
+ test_msg("Data in the buffer doesn't match what it should "
+ "in the third split");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, 1);
+ if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset != 1) {
+ test_msg("Invalid key at slot 1\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ item = btrfs_item_nr(1);
+ if (btrfs_item_size(eb, item) != strlen(split4)) {
+ test_msg("Invalid len in the second split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 1),
+ strlen(split4));
+ if (memcmp(buf, split4, strlen(split4))) {
+ test_msg("Data in the buffer doesn't match what it should "
+ "in the fourth split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ btrfs_item_key_to_cpu(eb, &key, 2);
+ if (key.objectid != 0 || key.type != BTRFS_EXTENT_CSUM_KEY ||
+ key.offset != 3) {
+ test_msg("Invalid key at slot 2\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ item = btrfs_item_nr(2);
+ if (btrfs_item_size(eb, item) != strlen(split2)) {
+ test_msg("Invalid len in the second split\n");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ read_extent_buffer(eb, buf, btrfs_item_ptr_offset(eb, 2),
+ strlen(split2));
+ if (memcmp(buf, split2, strlen(split2))) {
+ test_msg("Data in the buffer doesn't match what it should "
+ "in the last chunk\n");
+ ret = -EINVAL;
+ goto out;
+ }
+out:
+ btrfs_free_path(path);
+ kfree(root);
+ return ret;
+}
+
+int btrfs_test_extent_buffer_operations(void)
+{
+ test_msg("Running extent buffer operation tests");
+ return test_btrfs_split_item();
+}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
new file mode 100644
index 00000000000..7e99c2f98dd
--- /dev/null
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -0,0 +1,276 @@
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include "btrfs-tests.h"
+#include "../extent_io.h"
+
+#define PROCESS_UNLOCK (1 << 0)
+#define PROCESS_RELEASE (1 << 1)
+#define PROCESS_TEST_LOCKED (1 << 2)
+
+static noinline int process_page_range(struct inode *inode, u64 start, u64 end,
+ unsigned long flags)
+{
+ int ret;
+ struct page *pages[16];
+ unsigned long index = start >> PAGE_CACHE_SHIFT;
+ unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+ unsigned long nr_pages = end_index - index + 1;
+ int i;
+ int count = 0;
+ int loops = 0;
+
+ while (nr_pages > 0) {
+ ret = find_get_pages_contig(inode->i_mapping, index,
+ min_t(unsigned long, nr_pages,
+ ARRAY_SIZE(pages)), pages);
+ for (i = 0; i < ret; i++) {
+ if (flags & PROCESS_TEST_LOCKED &&
+ !PageLocked(pages[i]))
+ count++;
+ if (flags & PROCESS_UNLOCK && PageLocked(pages[i]))
+ unlock_page(pages[i]);
+ page_cache_release(pages[i]);
+ if (flags & PROCESS_RELEASE)
+ page_cache_release(pages[i]);
+ }
+ nr_pages -= ret;
+ index += ret;
+ cond_resched();
+ loops++;
+ if (loops > 100000) {
+ printk(KERN_ERR "stuck in a loop, start %Lu, end %Lu, nr_pages %lu, ret %d\n", start, end, nr_pages, ret);
+ break;
+ }
+ }
+ return count;
+}
+
+static int test_find_delalloc(void)
+{
+ struct inode *inode;
+ struct extent_io_tree tmp;
+ struct page *page;
+ struct page *locked_page = NULL;
+ unsigned long index = 0;
+ u64 total_dirty = 256 * 1024 * 1024;
+ u64 max_bytes = 128 * 1024 * 1024;
+ u64 start, end, test_start;
+ u64 found;
+ int ret = -EINVAL;
+
+ inode = btrfs_new_test_inode();
+ if (!inode) {
+ test_msg("Failed to allocate test inode\n");
+ return -ENOMEM;
+ }
+
+ extent_io_tree_init(&tmp, &inode->i_data);
+
+ /*
+ * First go through and create and mark all of our pages dirty, we pin
+ * everything to make sure our pages don't get evicted and screw up our
+ * test.
+ */
+ for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) {
+ page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+ if (!page) {
+ test_msg("Failed to allocate test page\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+ SetPageDirty(page);
+ if (index) {
+ unlock_page(page);
+ } else {
+ page_cache_get(page);
+ locked_page = page;
+ }
+ }
+
+ /* Test this scenario
+ * |--- delalloc ---|
+ * |--- search ---|
+ */
+ set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_NOFS);
+ start = 0;
+ end = 0;
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end, max_bytes);
+ if (!found) {
+ test_msg("Should have found at least one delalloc\n");
+ goto out_bits;
+ }
+ if (start != 0 || end != 4095) {
+ test_msg("Expected start 0 end 4095, got start %Lu end %Lu\n",
+ start, end);
+ goto out_bits;
+ }
+ unlock_extent(&tmp, start, end);
+ unlock_page(locked_page);
+ page_cache_release(locked_page);
+
+ /*
+ * Test this scenario
+ *
+ * |--- delalloc ---|
+ * |--- search ---|
+ */
+ test_start = 64 * 1024 * 1024;
+ locked_page = find_lock_page(inode->i_mapping,
+ test_start >> PAGE_CACHE_SHIFT);
+ if (!locked_page) {
+ test_msg("Couldn't find the locked page\n");
+ goto out_bits;
+ }
+ set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_NOFS);
+ start = test_start;
+ end = 0;
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end, max_bytes);
+ if (!found) {
+ test_msg("Couldn't find delalloc in our range\n");
+ goto out_bits;
+ }
+ if (start != test_start || end != max_bytes - 1) {
+ test_msg("Expected start %Lu end %Lu, got start %Lu, end "
+ "%Lu\n", test_start, max_bytes - 1, start, end);
+ goto out_bits;
+ }
+ if (process_page_range(inode, start, end,
+ PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
+ test_msg("There were unlocked pages in the range\n");
+ goto out_bits;
+ }
+ unlock_extent(&tmp, start, end);
+ /* locked_page was unlocked above */
+ page_cache_release(locked_page);
+
+ /*
+ * Test this scenario
+ * |--- delalloc ---|
+ * |--- search ---|
+ */
+ test_start = max_bytes + 4096;
+ locked_page = find_lock_page(inode->i_mapping, test_start >>
+ PAGE_CACHE_SHIFT);
+ if (!locked_page) {
+ test_msg("Could'nt find the locked page\n");
+ goto out_bits;
+ }
+ start = test_start;
+ end = 0;
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end, max_bytes);
+ if (found) {
+ test_msg("Found range when we shouldn't have\n");
+ goto out_bits;
+ }
+ if (end != (u64)-1) {
+ test_msg("Did not return the proper end offset\n");
+ goto out_bits;
+ }
+
+ /*
+ * Test this scenario
+ * [------- delalloc -------|
+ * [max_bytes]|-- search--|
+ *
+ * We are re-using our test_start from above since it works out well.
+ */
+ set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_NOFS);
+ start = test_start;
+ end = 0;
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end, max_bytes);
+ if (!found) {
+ test_msg("Didn't find our range\n");
+ goto out_bits;
+ }
+ if (start != test_start || end != total_dirty - 1) {
+ test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
+ test_start, total_dirty - 1, start, end);
+ goto out_bits;
+ }
+ if (process_page_range(inode, start, end,
+ PROCESS_TEST_LOCKED | PROCESS_UNLOCK)) {
+ test_msg("Pages in range were not all locked\n");
+ goto out_bits;
+ }
+ unlock_extent(&tmp, start, end);
+
+ /*
+ * Now to test where we run into a page that is no longer dirty in the
+ * range we want to find.
+ */
+ page = find_get_page(inode->i_mapping, (max_bytes + (1 * 1024 * 1024))
+ >> PAGE_CACHE_SHIFT);
+ if (!page) {
+ test_msg("Couldn't find our page\n");
+ goto out_bits;
+ }
+ ClearPageDirty(page);
+ page_cache_release(page);
+
+ /* We unlocked it in the previous test */
+ lock_page(locked_page);
+ start = test_start;
+ end = 0;
+ /*
+ * Currently if we fail to find dirty pages in the delalloc range we
+ * will adjust max_bytes down to PAGE_CACHE_SIZE and then re-search. If
+ * this changes at any point in the future we will need to fix this
+ * tests expected behavior.
+ */
+ found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ &end, max_bytes);
+ if (!found) {
+ test_msg("Didn't find our range\n");
+ goto out_bits;
+ }
+ if (start != test_start && end != test_start + PAGE_CACHE_SIZE - 1) {
+ test_msg("Expected start %Lu end %Lu, got start %Lu end %Lu\n",
+ test_start, test_start + PAGE_CACHE_SIZE - 1, start,
+ end);
+ goto out_bits;
+ }
+ if (process_page_range(inode, start, end, PROCESS_TEST_LOCKED |
+ PROCESS_UNLOCK)) {
+ test_msg("Pages in range were not all locked\n");
+ goto out_bits;
+ }
+ ret = 0;
+out_bits:
+ clear_extent_bits(&tmp, 0, total_dirty - 1,
+ (unsigned long)-1, GFP_NOFS);
+out:
+ if (locked_page)
+ page_cache_release(locked_page);
+ process_page_range(inode, 0, total_dirty - 1,
+ PROCESS_UNLOCK | PROCESS_RELEASE);
+ iput(inode);
+ return ret;
+}
+
+int btrfs_test_extent_io(void)
+{
+ test_msg("Running find delalloc tests\n");
+ return test_find_delalloc();
+}
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 6fc82010dc1..c8d9ddf84c6 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -101,7 +101,7 @@ static int test_extents(struct btrfs_block_group_cache *cache)
ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
if (ret) {
- test_msg("Error removing middle peice %d\n", ret);
+ test_msg("Error removing middle piece %d\n", ret);
return ret;
}
@@ -266,7 +266,7 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
}
if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
- test_msg("Left over peices after removing overlapping\n");
+ test_msg("Left over pieces after removing overlapping\n");
return -1;
}
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
new file mode 100644
index 00000000000..3ae0f5b8bb8
--- /dev/null
+++ b/fs/btrfs/tests/inode-tests.c
@@ -0,0 +1,928 @@
+/*
+ * Copyright (C) 2013 Fusion IO. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../btrfs_inode.h"
+#include "../disk-io.h"
+#include "../extent_io.h"
+#include "../volumes.h"
+
+static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
+ u64 ram_bytes, u64 offset, u64 disk_bytenr,
+ u64 disk_len, u32 type, u8 compression, int slot)
+{
+ struct btrfs_path path;
+ struct btrfs_file_extent_item *fi;
+ struct extent_buffer *leaf = root->node;
+ struct btrfs_key key;
+ u32 value_len = sizeof(struct btrfs_file_extent_item);
+
+ if (type == BTRFS_FILE_EXTENT_INLINE)
+ value_len += len;
+ memset(&path, 0, sizeof(path));
+
+ path.nodes[0] = leaf;
+ path.slots[0] = slot;
+
+ key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = start;
+
+ setup_items_for_insert(root, &path, &key, &value_len, value_len,
+ value_len + sizeof(struct btrfs_item), 1);
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ btrfs_set_file_extent_generation(leaf, fi, 1);
+ btrfs_set_file_extent_type(leaf, fi, type);
+ btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
+ btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_len);
+ btrfs_set_file_extent_offset(leaf, fi, offset);
+ btrfs_set_file_extent_num_bytes(leaf, fi, len);
+ btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
+ btrfs_set_file_extent_compression(leaf, fi, compression);
+ btrfs_set_file_extent_encryption(leaf, fi, 0);
+ btrfs_set_file_extent_other_encoding(leaf, fi, 0);
+}
+
+static void insert_inode_item_key(struct btrfs_root *root)
+{
+ struct btrfs_path path;
+ struct extent_buffer *leaf = root->node;
+ struct btrfs_key key;
+ u32 value_len = 0;
+
+ memset(&path, 0, sizeof(path));
+
+ path.nodes[0] = leaf;
+ path.slots[0] = 0;
+
+ key.objectid = BTRFS_INODE_ITEM_KEY;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ setup_items_for_insert(root, &path, &key, &value_len, value_len,
+ value_len + sizeof(struct btrfs_item), 1);
+}
+
+/*
+ * Build the most complicated map of extents the earth has ever seen. We want
+ * this so we can test all of the corner cases of btrfs_get_extent. Here is a
+ * diagram of how the extents will look though this may not be possible we still
+ * want to make sure everything acts normally (the last number is not inclusive)
+ *
+ * [0 - 5][5 - 6][6 - 10][10 - 4096][ 4096 - 8192 ][8192 - 12288]
+ * [hole ][inline][ hole ][ regular ][regular1 split][ hole ]
+ *
+ * [ 12288 - 20480][20480 - 24576][ 24576 - 28672 ][28672 - 36864][36864 - 45056]
+ * [regular1 split][ prealloc1 ][prealloc1 written][ prealloc1 ][ compressed ]
+ *
+ * [45056 - 49152][49152-53248][53248-61440][61440-65536][ 65536+81920 ]
+ * [ compressed1 ][ regular ][compressed1][ regular ][ hole but no extent]
+ *
+ * [81920-86016]
+ * [ regular ]
+ */
+static void setup_file_extents(struct btrfs_root *root)
+{
+ int slot = 0;
+ u64 disk_bytenr = 1 * 1024 * 1024;
+ u64 offset = 0;
+
+ /* First we want a hole */
+ insert_extent(root, offset, 5, 5, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
+ slot);
+ slot++;
+ offset += 5;
+
+ /*
+ * Now we want an inline extent, I don't think this is possible but hey
+ * why not? Also keep in mind if we have an inline extent it counts as
+ * the whole first page. If we were to expand it we would have to cow
+ * and we wouldn't have an inline extent anymore.
+ */
+ insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0,
+ slot);
+ slot++;
+ offset = 4096;
+
+ /* Now another hole */
+ insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
+ slot);
+ slot++;
+ offset += 4;
+
+ /* Now for a regular extent */
+ insert_extent(root, offset, 4095, 4095, 0, disk_bytenr, 4096,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ disk_bytenr += 4096;
+ offset += 4095;
+
+ /*
+ * Now for 3 extents that were split from a hole punch so we test
+ * offsets properly.
+ */
+ insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ offset += 4096;
+ insert_extent(root, offset, 4096, 4096, 0, 0, 0, BTRFS_FILE_EXTENT_REG,
+ 0, slot);
+ slot++;
+ offset += 4096;
+ insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ offset += 8192;
+ disk_bytenr += 16384;
+
+ /* Now for a unwritten prealloc extent */
+ insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
+ BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+ slot++;
+ offset += 4096;
+
+ /*
+ * We want to jack up disk_bytenr a little more so the em stuff doesn't
+ * merge our records.
+ */
+ disk_bytenr += 8192;
+
+ /*
+ * Now for a partially written prealloc extent, basically the same as
+ * the hole punch example above. Ram_bytes never changes when you mark
+ * extents written btw.
+ */
+ insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 16384,
+ BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+ slot++;
+ offset += 4096;
+ insert_extent(root, offset, 4096, 16384, 4096, disk_bytenr, 16384,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ offset += 4096;
+ insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 16384,
+ BTRFS_FILE_EXTENT_PREALLOC, 0, slot);
+ slot++;
+ offset += 8192;
+ disk_bytenr += 16384;
+
+ /* Now a normal compressed extent */
+ insert_extent(root, offset, 8192, 8192, 0, disk_bytenr, 4096,
+ BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+ slot++;
+ offset += 8192;
+ /* No merges */
+ disk_bytenr += 8192;
+
+ /* Now a split compressed extent */
+ insert_extent(root, offset, 4096, 16384, 0, disk_bytenr, 4096,
+ BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+ slot++;
+ offset += 4096;
+ insert_extent(root, offset, 4096, 4096, 0, disk_bytenr + 4096, 4096,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ offset += 4096;
+ insert_extent(root, offset, 8192, 16384, 8192, disk_bytenr, 4096,
+ BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot);
+ slot++;
+ offset += 8192;
+ disk_bytenr += 8192;
+
+ /* Now extents that have a hole but no hole extent */
+ insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+ slot++;
+ offset += 16384;
+ disk_bytenr += 4096;
+ insert_extent(root, offset, 4096, 4096, 0, disk_bytenr, 4096,
+ BTRFS_FILE_EXTENT_REG, 0, slot);
+}
+
+static unsigned long prealloc_only = 0;
+static unsigned long compressed_only = 0;
+static unsigned long vacancy_only = 0;
+
+static noinline int test_btrfs_get_extent(void)
+{
+ struct inode *inode = NULL;
+ struct btrfs_root *root = NULL;
+ struct extent_map *em = NULL;
+ u64 orig_start;
+ u64 disk_bytenr;
+ u64 offset;
+ int ret = -ENOMEM;
+
+ inode = btrfs_new_test_inode();
+ if (!inode) {
+ test_msg("Couldn't allocate inode\n");
+ return ret;
+ }
+
+ BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+ BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ BTRFS_I(inode)->location.offset = 0;
+
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
+ goto out;
+ }
+
+ /*
+ * We do this since btrfs_get_extent wants to assign em->bdev to
+ * root->fs_info->fs_devices->latest_bdev.
+ */
+ root->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!root->fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
+ goto out;
+ }
+
+ root->node = alloc_dummy_extent_buffer(0, 4096);
+ if (!root->node) {
+ test_msg("Couldn't allocate dummy buffer\n");
+ goto out;
+ }
+
+ /*
+ * We will just free a dummy node if it's ref count is 2 so we need an
+ * extra ref so our searches don't accidently release our page.
+ */
+ extent_buffer_get(root->node);
+ btrfs_set_header_nritems(root->node, 0);
+ btrfs_set_header_level(root->node, 0);
+ ret = -EINVAL;
+
+ /* First with no extents */
+ BTRFS_I(inode)->root = root;
+ em = btrfs_get_extent(inode, NULL, 0, 0, 4096, 0);
+ if (IS_ERR(em)) {
+ em = NULL;
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_msg("Expected a hole, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
+ test_msg("Vacancy flag wasn't set properly\n");
+ goto out;
+ }
+ free_extent_map(em);
+ btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
+
+ /*
+ * All of the magic numbers are based on the mapping setup in
+ * setup_file_extents, so if you change anything there you need to
+ * update the comment and update the expected values below.
+ */
+ setup_file_extents(root);
+
+ em = btrfs_get_extent(inode, NULL, 0, 0, (u64)-1, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_msg("Expected a hole, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != 0 || em->len != 5) {
+ test_msg("Unexpected extent wanted start 0 len 5, got start "
+ "%llu len %llu\n", em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_INLINE) {
+ test_msg("Expected an inline, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4091) {
+ test_msg("Unexpected extent wanted start %llu len 1, got start "
+ "%llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ /*
+ * We don't test anything else for inline since it doesn't get set
+ * unless we have a page for it to write into. Maybe we should change
+ * this?
+ */
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_msg("Expected a hole, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4) {
+ test_msg("Unexpected extent wanted start %llu len 4, got start "
+ "%llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* Regular extent */
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4095) {
+ test_msg("Unexpected extent wanted start %llu len 4095, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* The next 3 are split extents */
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ disk_bytenr = em->block_start;
+ orig_start = em->start;
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_msg("Expected a hole, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 8192) {
+ test_msg("Unexpected extent wanted start %llu len 8192, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ if (em->orig_start != orig_start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n",
+ orig_start, em->orig_start);
+ goto out;
+ }
+ disk_bytenr += (em->start - orig_start);
+ if (em->block_start != disk_bytenr) {
+ test_msg("Wrong block start, want %llu, have %llu\n",
+ disk_bytenr, em->block_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* Prealloc extent */
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != prealloc_only) {
+ test_msg("Unexpected flags set, want %lu have %lu\n",
+ prealloc_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* The next 3 are a half written prealloc extent */
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != prealloc_only) {
+ test_msg("Unexpected flags set, want %lu have %lu\n",
+ prealloc_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ disk_bytenr = em->block_start;
+ orig_start = em->start;
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_HOLE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ if (em->orig_start != orig_start) {
+ test_msg("Unexpected orig offset, wanted %llu, have %llu\n",
+ orig_start, em->orig_start);
+ goto out;
+ }
+ if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+ test_msg("Unexpected block start, wanted %llu, have %llu\n",
+ disk_bytenr + (em->start - em->orig_start),
+ em->block_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 8192) {
+ test_msg("Unexpected extent wanted start %llu len 8192, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != prealloc_only) {
+ test_msg("Unexpected flags set, want %lu have %lu\n",
+ prealloc_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != orig_start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", orig_start,
+ em->orig_start);
+ goto out;
+ }
+ if (em->block_start != (disk_bytenr + (em->start - em->orig_start))) {
+ test_msg("Unexpected block start, wanted %llu, have %llu\n",
+ disk_bytenr + (em->start - em->orig_start),
+ em->block_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* Now for the compressed extent */
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 8192) {
+ test_msg("Unexpected extent wanted start %llu len 8192, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != compressed_only) {
+ test_msg("Unexpected flags set, want %lu have %lu\n",
+ compressed_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n",
+ em->start, em->orig_start);
+ goto out;
+ }
+ if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ test_msg("Unexpected compress type, wanted %d, got %d\n",
+ BTRFS_COMPRESS_ZLIB, em->compress_type);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* Split compressed extent */
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != compressed_only) {
+ test_msg("Unexpected flags set, want %lu have %lu\n",
+ compressed_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n",
+ em->start, em->orig_start);
+ goto out;
+ }
+ if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ test_msg("Unexpected compress type, wanted %d, got %d\n",
+ BTRFS_COMPRESS_ZLIB, em->compress_type);
+ goto out;
+ }
+ disk_bytenr = em->block_start;
+ orig_start = em->start;
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != disk_bytenr) {
+ test_msg("Block start does not match, want %llu got %llu\n",
+ disk_bytenr, em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 8192) {
+ test_msg("Unexpected extent wanted start %llu len 8192, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != compressed_only) {
+ test_msg("Unexpected flags set, want %lu have %lu\n",
+ compressed_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != orig_start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n",
+ em->start, orig_start);
+ goto out;
+ }
+ if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ test_msg("Unexpected compress type, wanted %d, got %d\n",
+ BTRFS_COMPRESS_ZLIB, em->compress_type);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ /* A hole between regular extents but no hole extent */
+ em = btrfs_get_extent(inode, NULL, 0, offset + 6, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096 * 1024, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_msg("Expected a hole extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ /*
+ * Currently we just return a length that we requested rather than the
+ * length of the actual hole, if this changes we'll have to change this
+ * test.
+ */
+ if (em->start != offset || em->len != 12288) {
+ test_msg("Unexpected extent wanted start %llu len 12288, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != vacancy_only) {
+ test_msg("Unexpected flags set, want %lu have %lu\n",
+ vacancy_only, em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ offset = em->start + em->len;
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, offset, 4096, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != offset || em->len != 4096) {
+ test_msg("Unexpected extent wanted start %llu len 4096, got "
+ "start %llu len %llu\n", offset, em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, want 0 have %lu\n", em->flags);
+ goto out;
+ }
+ if (em->orig_start != em->start) {
+ test_msg("Wrong orig offset, want %llu, have %llu\n", em->start,
+ em->orig_start);
+ goto out;
+ }
+ ret = 0;
+out:
+ if (!IS_ERR(em))
+ free_extent_map(em);
+ iput(inode);
+ btrfs_free_dummy_root(root);
+ return ret;
+}
+
+static int test_hole_first(void)
+{
+ struct inode *inode = NULL;
+ struct btrfs_root *root = NULL;
+ struct extent_map *em = NULL;
+ int ret = -ENOMEM;
+
+ inode = btrfs_new_test_inode();
+ if (!inode) {
+ test_msg("Couldn't allocate inode\n");
+ return ret;
+ }
+
+ BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+ BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
+ BTRFS_I(inode)->location.offset = 0;
+
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
+ goto out;
+ }
+
+ root->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!root->fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
+ goto out;
+ }
+
+ root->node = alloc_dummy_extent_buffer(0, 4096);
+ if (!root->node) {
+ test_msg("Couldn't allocate dummy buffer\n");
+ goto out;
+ }
+
+ extent_buffer_get(root->node);
+ btrfs_set_header_nritems(root->node, 0);
+ btrfs_set_header_level(root->node, 0);
+ BTRFS_I(inode)->root = root;
+ ret = -EINVAL;
+
+ /*
+ * Need a blank inode item here just so we don't confuse
+ * btrfs_get_extent.
+ */
+ insert_inode_item_key(root);
+ insert_extent(root, 4096, 4096, 4096, 0, 4096, 4096,
+ BTRFS_FILE_EXTENT_REG, 0, 1);
+ em = btrfs_get_extent(inode, NULL, 0, 0, 8192, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != EXTENT_MAP_HOLE) {
+ test_msg("Expected a hole, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != 0 || em->len != 4096) {
+ test_msg("Unexpected extent wanted start 0 len 4096, got start "
+ "%llu len %llu\n", em->start, em->len);
+ goto out;
+ }
+ if (em->flags != vacancy_only) {
+ test_msg("Wrong flags, wanted %lu, have %lu\n", vacancy_only,
+ em->flags);
+ goto out;
+ }
+ free_extent_map(em);
+
+ em = btrfs_get_extent(inode, NULL, 0, 4096, 8192, 0);
+ if (IS_ERR(em)) {
+ test_msg("Got an error when we shouldn't have\n");
+ goto out;
+ }
+ if (em->block_start != 4096) {
+ test_msg("Expected a real extent, got %llu\n", em->block_start);
+ goto out;
+ }
+ if (em->start != 4096 || em->len != 4096) {
+ test_msg("Unexpected extent wanted start 4096 len 4096, got "
+ "start %llu len %llu\n", em->start, em->len);
+ goto out;
+ }
+ if (em->flags != 0) {
+ test_msg("Unexpected flags set, wanted 0 got %lu\n",
+ em->flags);
+ goto out;
+ }
+ ret = 0;
+out:
+ if (!IS_ERR(em))
+ free_extent_map(em);
+ iput(inode);
+ btrfs_free_dummy_root(root);
+ return ret;
+}
+
+int btrfs_test_inodes(void)
+{
+ int ret;
+
+ set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
+ set_bit(EXTENT_FLAG_VACANCY, &vacancy_only);
+ set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
+
+ test_msg("Running btrfs_get_extent tests\n");
+ ret = test_btrfs_get_extent();
+ if (ret)
+ return ret;
+ test_msg("Running hole first btrfs_get_extent test\n");
+ return test_hole_first();
+}
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
new file mode 100644
index 00000000000..ec3dcb20235
--- /dev/null
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -0,0 +1,470 @@
+/*
+ * Copyright (C) 2013 Facebook. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../transaction.h"
+#include "../disk-io.h"
+#include "../qgroup.h"
+
+static void init_dummy_trans(struct btrfs_trans_handle *trans)
+{
+ memset(trans, 0, sizeof(*trans));
+ trans->transid = 1;
+ INIT_LIST_HEAD(&trans->qgroup_ref_list);
+ trans->type = __TRANS_DUMMY;
+}
+
+static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 root_objectid)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_extent_item *item;
+ struct btrfs_extent_inline_ref *iref;
+ struct btrfs_tree_block_info *block_info;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_key ins;
+ u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info);
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ ins.objectid = bytenr;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+ ins.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ return -ENOMEM;
+ }
+
+ path->leave_spinning = 1;
+ ret = btrfs_insert_empty_item(&trans, root, path, &ins, size);
+ if (ret) {
+ test_msg("Couldn't insert ref %d\n", ret);
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ leaf = path->nodes[0];
+ item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+ btrfs_set_extent_refs(leaf, item, 1);
+ btrfs_set_extent_generation(leaf, item, 1);
+ btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_TREE_BLOCK);
+ block_info = (struct btrfs_tree_block_info *)(item + 1);
+ btrfs_set_tree_block_level(leaf, block_info, 1);
+ iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
+ if (parent > 0) {
+ btrfs_set_extent_inline_ref_type(leaf, iref,
+ BTRFS_SHARED_BLOCK_REF_KEY);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+ } else {
+ btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_TREE_BLOCK_REF_KEY);
+ btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+ }
+ btrfs_free_path(path);
+ return 0;
+}
+
+static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
+ u64 parent, u64 root_objectid)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_extent_item *item;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 refs;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ return -ENOMEM;
+ }
+
+ path->leave_spinning = 1;
+ ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
+ if (ret) {
+ test_msg("Couldn't find extent ref\n");
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_extent_item);
+ refs = btrfs_extent_refs(path->nodes[0], item);
+ btrfs_set_extent_refs(path->nodes[0], item, refs + 1);
+ btrfs_release_path(path);
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ key.offset = parent;
+ } else {
+ key.type = BTRFS_TREE_BLOCK_REF_KEY;
+ key.offset = root_objectid;
+ }
+
+ ret = btrfs_insert_empty_item(&trans, root, path, &key, 0);
+ if (ret)
+ test_msg("Failed to insert backref\n");
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_key key;
+ struct btrfs_path *path;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ return -ENOMEM;
+ }
+ path->leave_spinning = 1;
+
+ ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
+ if (ret) {
+ test_msg("Didn't find our key %d\n", ret);
+ btrfs_free_path(path);
+ return ret;
+ }
+ btrfs_del_item(&trans, root, path);
+ btrfs_free_path(path);
+ return 0;
+}
+
+static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 root_objectid)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_extent_item *item;
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ u64 refs;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ test_msg("Couldn't allocate path\n");
+ return -ENOMEM;
+ }
+
+ path->leave_spinning = 1;
+ ret = btrfs_search_slot(&trans, root, &key, path, 0, 1);
+ if (ret) {
+ test_msg("Couldn't find extent ref\n");
+ btrfs_free_path(path);
+ return ret;
+ }
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_extent_item);
+ refs = btrfs_extent_refs(path->nodes[0], item);
+ btrfs_set_extent_refs(path->nodes[0], item, refs - 1);
+ btrfs_release_path(path);
+
+ key.objectid = bytenr;
+ if (parent) {
+ key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+ key.offset = parent;
+ } else {
+ key.type = BTRFS_TREE_BLOCK_REF_KEY;
+ key.offset = root_objectid;
+ }
+
+ ret = btrfs_search_slot(&trans, root, &key, path, -1, 1);
+ if (ret) {
+ test_msg("Couldn't find backref %d\n", ret);
+ btrfs_free_path(path);
+ return ret;
+ }
+ btrfs_del_item(&trans, root, path);
+ btrfs_free_path(path);
+ return ret;
+}
+
+static int test_no_shared_qgroup(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ test_msg("Qgroup basic add\n");
+ ret = btrfs_create_qgroup(NULL, fs_info, 5, NULL);
+ if (ret) {
+ test_msg("Couldn't create a qgroup %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
+ BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ if (ret) {
+ test_msg("Couldn't add space to a qgroup %d\n", ret);
+ return ret;
+ }
+
+ ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
+ if (ret)
+ return ret;
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Delayed qgroup accounting failed %d\n", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ ret = remove_extent_item(root, 4096, 4096);
+ if (ret)
+ return -EINVAL;
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
+ BTRFS_QGROUP_OPER_SUB_EXCL, 0);
+ if (ret) {
+ test_msg("Couldn't remove space from the qgroup %d\n", ret);
+ return -EINVAL;
+ }
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Qgroup accounting failed %d\n", ret);
+ return -EINVAL;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 0, 0)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Add a ref for two different roots to make sure the shared value comes out
+ * right, also remove one of the roots and make sure the exclusive count is
+ * adjusted properly.
+ */
+static int test_multiple_refs(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle trans;
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
+
+ init_dummy_trans(&trans);
+
+ test_msg("Qgroup multiple refs test\n");
+
+ /* We have 5 created already from the previous test */
+ ret = btrfs_create_qgroup(NULL, fs_info, 256, NULL);
+ if (ret) {
+ test_msg("Couldn't create a qgroup %d\n", ret);
+ return ret;
+ }
+
+ ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5);
+ if (ret)
+ return ret;
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096,
+ BTRFS_QGROUP_OPER_ADD_EXCL, 0);
+ if (ret) {
+ test_msg("Couldn't add space to a qgroup %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Delayed qgroup accounting failed %d\n", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ ret = add_tree_ref(root, 4096, 4096, 0, 256);
+ if (ret)
+ return ret;
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
+ BTRFS_QGROUP_OPER_ADD_SHARED, 0);
+ if (ret) {
+ test_msg("Qgroup record ref failed %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Qgroup accounting failed %d\n", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 0)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 256, 4096, 0)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ ret = remove_extent_ref(root, 4096, 4096, 0, 256);
+ if (ret)
+ return ret;
+
+ ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096,
+ BTRFS_QGROUP_OPER_SUB_SHARED, 0);
+ if (ret) {
+ test_msg("Qgroup record ref failed %d\n", ret);
+ return ret;
+ }
+
+ ret = btrfs_delayed_qgroup_accounting(&trans, fs_info);
+ if (ret) {
+ test_msg("Qgroup accounting failed %d\n", ret);
+ return ret;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 256, 0, 0)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ if (btrfs_verify_qgroup_counts(fs_info, 5, 4096, 4096)) {
+ test_msg("Qgroup counts didn't match expected values\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int btrfs_test_qgroups(void)
+{
+ struct btrfs_root *root;
+ struct btrfs_root *tmp_root;
+ int ret = 0;
+
+ root = btrfs_alloc_dummy_root();
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
+ return PTR_ERR(root);
+ }
+
+ root->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!root->fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Can't use bytenr 0, some things freak out
+ * *cough*backref walking code*cough*
+ */
+ root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096);
+ if (!root->node) {
+ test_msg("Couldn't allocate dummy buffer\n");
+ ret = -ENOMEM;
+ goto out;
+ }
+ btrfs_set_header_level(root->node, 0);
+ btrfs_set_header_nritems(root->node, 0);
+ root->alloc_bytenr += 8192;
+
+ tmp_root = btrfs_alloc_dummy_root();
+ if (IS_ERR(tmp_root)) {
+ test_msg("Couldn't allocate a fs root\n");
+ ret = PTR_ERR(tmp_root);
+ goto out;
+ }
+
+ tmp_root->root_key.objectid = 5;
+ root->fs_info->fs_root = tmp_root;
+ ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
+ if (ret) {
+ test_msg("Couldn't insert fs root %d\n", ret);
+ goto out;
+ }
+
+ tmp_root = btrfs_alloc_dummy_root();
+ if (IS_ERR(tmp_root)) {
+ test_msg("Couldn't allocate a fs root\n");
+ ret = PTR_ERR(tmp_root);
+ goto out;
+ }
+
+ tmp_root->root_key.objectid = 256;
+ ret = btrfs_insert_fs_root(root->fs_info, tmp_root);
+ if (ret) {
+ test_msg("Couldn't insert fs root %d\n", ret);
+ goto out;
+ }
+
+ /* We are using this root as our extent root */
+ root->fs_info->extent_root = root;
+
+ /*
+ * Some of the paths we test assume we have a filled out fs_info, so we
+ * just need to addt he root in there so we don't panic.
+ */
+ root->fs_info->tree_root = root;
+ root->fs_info->quota_root = root;
+ root->fs_info->quota_enabled = 1;
+
+ test_msg("Running qgroup tests\n");
+ ret = test_no_shared_qgroup(root);
+ if (ret)
+ goto out;
+ ret = test_multiple_refs(root);
+out:
+ btrfs_free_dummy_root(root);
+ return ret;
+}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8c81bdc1ef9..5f379affdf2 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -31,6 +31,7 @@
#include "inode-map.h"
#include "volumes.h"
#include "dev-replace.h"
+#include "qgroup.h"
#define BTRFS_ROOT_TRANS_TAG 0
@@ -57,12 +58,12 @@ static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
__TRANS_JOIN_NOLOCK),
};
-static void put_transaction(struct btrfs_transaction *transaction)
+void btrfs_put_transaction(struct btrfs_transaction *transaction)
{
WARN_ON(atomic_read(&transaction->use_count) == 0);
if (atomic_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
- WARN_ON(transaction->delayed_refs.root.rb_node);
+ WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
while (!list_empty(&transaction->pending_chunks)) {
struct extent_map *em;
@@ -75,10 +76,21 @@ static void put_transaction(struct btrfs_transaction *transaction)
}
}
-static noinline void switch_commit_root(struct btrfs_root *root)
+static noinline void switch_commit_roots(struct btrfs_transaction *trans,
+ struct btrfs_fs_info *fs_info)
{
- free_extent_buffer(root->commit_root);
- root->commit_root = btrfs_root_node(root);
+ struct btrfs_root *root, *tmp;
+
+ down_write(&fs_info->commit_root_sem);
+ list_for_each_entry_safe(root, tmp, &trans->switch_commits,
+ dirty_list) {
+ list_del_init(&root->dirty_list);
+ free_extent_buffer(root->commit_root);
+ root->commit_root = btrfs_root_node(root);
+ if (is_fstree(root->objectid))
+ btrfs_unpin_free_ino(root);
+ }
+ up_write(&fs_info->commit_root_sem);
}
static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
@@ -183,8 +195,8 @@ loop:
atomic_set(&cur_trans->use_count, 2);
cur_trans->start_time = get_seconds();
- cur_trans->delayed_refs.root = RB_ROOT;
- cur_trans->delayed_refs.num_entries = 0;
+ cur_trans->delayed_refs.href_root = RB_ROOT;
+ atomic_set(&cur_trans->delayed_refs.num_entries, 0);
cur_trans->delayed_refs.num_heads_ready = 0;
cur_trans->delayed_refs.num_heads = 0;
cur_trans->delayed_refs.flushing = 0;
@@ -196,21 +208,19 @@ loop:
*/
smp_mb();
if (!list_empty(&fs_info->tree_mod_seq_list))
- WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when "
+ WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when "
"creating a fresh transaction\n");
if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
- WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
+ WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when "
"creating a fresh transaction\n");
atomic64_set(&fs_info->tree_mod_seq, 0);
spin_lock_init(&cur_trans->delayed_refs.lock);
- atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
- atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
- init_waitqueue_head(&cur_trans->delayed_refs.wait);
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
INIT_LIST_HEAD(&cur_trans->ordered_operations);
INIT_LIST_HEAD(&cur_trans->pending_chunks);
+ INIT_LIST_HEAD(&cur_trans->switch_commits);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
fs_info->btree_inode->i_mapping);
@@ -232,18 +242,19 @@ loop:
static int record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- if (root->ref_cows && root->last_trans < trans->transid) {
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
+ root->last_trans < trans->transid) {
WARN_ON(root == root->fs_info->extent_root);
WARN_ON(root->commit_root != root->node);
/*
- * see below for in_trans_setup usage rules
+ * see below for IN_TRANS_SETUP usage rules
* we have the reloc mutex held now, so there
* is only one writer in this function
*/
- root->in_trans_setup = 1;
+ set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
- /* make sure readers find in_trans_setup before
+ /* make sure readers find IN_TRANS_SETUP before
* they find our root->last_trans update
*/
smp_wmb();
@@ -270,7 +281,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
* But, we have to set root->last_trans before we
* init the relocation root, otherwise, we trip over warnings
* in ctree.c. The solution used here is to flag ourselves
- * with root->in_trans_setup. When this is 1, we're still
+ * with root IN_TRANS_SETUP. When this is 1, we're still
* fixing up the reloc trees and everyone must wait.
*
* When this is zero, they can trust root->last_trans and fly
@@ -279,8 +290,8 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
* done before we pop in the zero below
*/
btrfs_init_reloc_root(trans, root);
- smp_wmb();
- root->in_trans_setup = 0;
+ smp_mb__before_atomic();
+ clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
}
return 0;
}
@@ -289,16 +300,16 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
- if (!root->ref_cows)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return 0;
/*
- * see record_root_in_trans for comments about in_trans_setup usage
+ * see record_root_in_trans for comments about IN_TRANS_SETUP usage
* and barriers
*/
smp_rmb();
if (root->last_trans == trans->transid &&
- !root->in_trans_setup)
+ !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
return 0;
mutex_lock(&root->fs_info->reloc_mutex);
@@ -332,7 +343,7 @@ static void wait_current_trans(struct btrfs_root *root)
wait_event(root->fs_info->transaction_wait,
cur_trans->state >= TRANS_STATE_UNBLOCKED ||
cur_trans->aborted);
- put_transaction(cur_trans);
+ btrfs_put_transaction(cur_trans);
} else {
spin_unlock(&root->fs_info->trans_lock);
}
@@ -353,6 +364,17 @@ static int may_wait_transaction(struct btrfs_root *root, int type)
return 0;
}
+static inline bool need_reserve_reloc_root(struct btrfs_root *root)
+{
+ if (!root->fs_info->reloc_ctl ||
+ !test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+ root->reloc_root)
+ return false;
+
+ return true;
+}
+
static struct btrfs_trans_handle *
start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
enum btrfs_reserve_flush_enum flush)
@@ -360,8 +382,12 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
u64 num_bytes = 0;
- int ret;
u64 qgroup_reserved = 0;
+ bool reloc_reserved = false;
+ int ret;
+
+ /* Send isn't supposed to start transactions. */
+ ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB);
if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
return ERR_PTR(-EROFS);
@@ -390,6 +416,14 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
}
num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+ /*
+ * Do the reservation for the relocation root creation
+ */
+ if (unlikely(need_reserve_reloc_root(root))) {
+ num_bytes += root->nodesize;
+ reloc_reserved = true;
+ }
+
ret = btrfs_block_rsv_add(root,
&root->fs_info->trans_block_rsv,
num_bytes, flush);
@@ -451,12 +485,15 @@ again:
h->delayed_ref_elem.seq = 0;
h->type = type;
h->allocating_chunk = false;
+ h->reloc_reserved = false;
+ h->sync = false;
INIT_LIST_HEAD(&h->qgroup_ref_list);
INIT_LIST_HEAD(&h->new_bgs);
smp_mb();
if (cur_trans->state >= TRANS_STATE_BLOCKED &&
may_wait_transaction(root, type)) {
+ current->journal_info = h;
btrfs_commit_transaction(h, root);
goto again;
}
@@ -466,6 +503,7 @@ again:
h->transid, num_bytes, 1);
h->block_rsv = &root->fs_info->trans_block_rsv;
h->bytes_reserved = num_bytes;
+ h->reloc_reserved = reloc_reserved;
}
h->qgroup_reserved = qgroup_reserved;
@@ -610,7 +648,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
}
wait_for_commit(root, cur_trans);
- put_transaction(cur_trans);
+ btrfs_put_transaction(cur_trans);
out:
return ret;
}
@@ -625,7 +663,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
if (root->fs_info->global_block_rsv.space_info->full &&
- btrfs_should_throttle_delayed_refs(trans, root))
+ btrfs_check_space_for_delayed_refs(trans, root))
return 1;
return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
@@ -662,20 +700,35 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
unsigned long cur = trans->delayed_ref_updates;
int lock = (trans->type != TRANS_JOIN_NOLOCK);
int err = 0;
+ int must_run_delayed_refs = 0;
- if (--trans->use_count) {
+ if (trans->use_count > 1) {
+ trans->use_count--;
trans->block_rsv = trans->orig_rsv;
return 0;
}
- /*
- * do the qgroup accounting as early as possible
- */
- err = btrfs_delayed_refs_qgroup_accounting(trans, info);
-
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
+ if (!list_empty(&trans->new_bgs))
+ btrfs_create_pending_block_groups(trans, root);
+
+ trans->delayed_ref_updates = 0;
+ if (!trans->sync) {
+ must_run_delayed_refs =
+ btrfs_should_throttle_delayed_refs(trans, root);
+ cur = max_t(unsigned long, cur, 32);
+
+ /*
+ * don't make the caller wait if they are from a NOLOCK
+ * or ATTACH transaction, it will deadlock with commit
+ */
+ if (must_run_delayed_refs == 1 &&
+ (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH)))
+ must_run_delayed_refs = 2;
+ }
+
if (trans->qgroup_reserved) {
/*
* the same root has to be passed here between start_transaction
@@ -685,16 +738,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
trans->qgroup_reserved = 0;
}
- if (!list_empty(&trans->new_bgs))
- btrfs_create_pending_block_groups(trans, root);
-
- trans->delayed_ref_updates = 0;
- if (btrfs_should_throttle_delayed_refs(trans, root)) {
- cur = max_t(unsigned long, cur, 1);
- trans->delayed_ref_updates = 0;
- btrfs_run_delayed_refs(trans, root, cur);
- }
-
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
@@ -711,17 +754,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
}
if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
- if (throttle) {
- /*
- * We may race with somebody else here so end up having
- * to call end_transaction on ourselves again, so inc
- * our use_count.
- */
- trans->use_count++;
+ if (throttle)
return btrfs_commit_transaction(trans, root);
- } else {
+ else
wake_up_process(info->transaction_kthread);
- }
}
if (trans->type & __TRANS_FREEZABLE)
@@ -735,7 +771,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
smp_mb();
if (waitqueue_active(&cur_trans->writer_wait))
wake_up(&cur_trans->writer_wait);
- put_transaction(cur_trans);
+ btrfs_put_transaction(cur_trans);
if (current->journal_info == trans)
current->journal_info = NULL;
@@ -744,11 +780,17 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_run_delayed_iputs(root);
if (trans->aborted ||
- test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+ test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
+ wake_up_process(info->transaction_kthread);
err = -EIO;
+ }
assert_qgroups_uptodate(trans);
kmem_cache_free(btrfs_trans_handle_cachep, trans);
+ if (must_run_delayed_refs) {
+ btrfs_async_run_delayed_refs(root, cur,
+ must_run_delayed_refs == 1);
+ }
return err;
}
@@ -764,12 +806,6 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
return __btrfs_end_transaction(trans, root, 1);
}
-int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
-{
- return __btrfs_end_transaction(trans, root, 1);
-}
-
/*
* when btree blocks are allocated, they have some corresponding bits set for
* them in one of two extent_io trees. This is used to make sure all of
@@ -909,9 +945,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
return ret;
}
- if (root != root->fs_info->extent_root)
- switch_commit_root(root);
-
return 0;
}
@@ -948,31 +981,35 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
return ret;
ret = btrfs_run_dev_stats(trans, root->fs_info);
- WARN_ON(ret);
+ if (ret)
+ return ret;
ret = btrfs_run_dev_replace(trans, root->fs_info);
- WARN_ON(ret);
-
+ if (ret)
+ return ret;
ret = btrfs_run_qgroups(trans, root->fs_info);
- BUG_ON(ret);
+ if (ret)
+ return ret;
/* run_qgroups might have added some more refs */
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- BUG_ON(ret);
+ if (ret)
+ return ret;
while (!list_empty(&fs_info->dirty_cowonly_roots)) {
next = fs_info->dirty_cowonly_roots.next;
list_del_init(next);
root = list_entry(next, struct btrfs_root, dirty_list);
+ if (root != fs_info->extent_root)
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
ret = update_cowonly_root(trans, root);
if (ret)
return ret;
}
- down_write(&fs_info->extent_commit_sem);
- switch_commit_root(fs_info->extent_root);
- up_write(&fs_info->extent_commit_sem);
-
+ list_add_tail(&fs_info->extent_root->dirty_list,
+ &trans->transaction->switch_commits);
btrfs_after_dev_replace_commit(fs_info);
return 0;
@@ -1025,15 +1062,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
btrfs_save_ino_cache(root, trans);
/* see comments in should_cow_block() */
- root->force_cow = 0;
- smp_wmb();
+ clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
+ smp_mb__after_atomic();
if (root->commit_root != root->node) {
- mutex_lock(&root->fs_commit_mutex);
- switch_commit_root(root);
- btrfs_unpin_free_ino(root);
- mutex_unlock(&root->fs_commit_mutex);
-
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
btrfs_set_root_node(&root->root_item,
root->node);
}
@@ -1060,7 +1094,7 @@ int btrfs_defrag_root(struct btrfs_root *root)
struct btrfs_trans_handle *trans;
int ret;
- if (xchg(&root->defrag_running, 1))
+ if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
return 0;
while (1) {
@@ -1078,12 +1112,12 @@ int btrfs_defrag_root(struct btrfs_root *root)
break;
if (btrfs_defrag_cancelled(root->fs_info)) {
- printk(KERN_DEBUG "btrfs: defrag_root cancelled\n");
+ pr_debug("BTRFS: defrag_root cancelled\n");
ret = -EAGAIN;
break;
}
}
- root->defrag_running = 0;
+ clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
return ret;
}
@@ -1147,12 +1181,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto no_free_objectid;
}
- pending->error = btrfs_qgroup_inherit(trans, fs_info,
- root->root_key.objectid,
- objectid, pending->inherit);
- if (pending->error)
- goto no_free_objectid;
-
key.objectid = objectid;
key.offset = (u64)-1;
key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1249,8 +1277,26 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
+ /*
+ * We need to flush delayed refs in order to make sure all of our quota
+ * operations have been done before we call btrfs_qgroup_inherit.
+ */
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
+ ret = btrfs_qgroup_inherit(trans, fs_info,
+ root->root_key.objectid,
+ objectid, pending->inherit);
+ if (ret) {
+ btrfs_abort_transaction(trans, root, ret);
+ goto fail;
+ }
+
/* see comments in should_cow_block() */
- root->force_cow = 1;
+ set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
smp_wmb();
btrfs_set_root_node(new_root_item, tmp);
@@ -1453,7 +1499,7 @@ static void do_async_commit(struct work_struct *work)
* We've got freeze protection passed with the transaction.
* Tell lockdep about it.
*/
- if (ac->newtrans->type < TRANS_JOIN_NOLOCK)
+ if (ac->newtrans->type & __TRANS_FREEZABLE)
rwsem_acquire_read(
&ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
0, 1, _THIS_IP_);
@@ -1494,7 +1540,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
* Tell lockdep we've released the freeze rwsem, since the
* async commit thread will be the one to unlock it.
*/
- if (trans->type < TRANS_JOIN_NOLOCK)
+ if (ac->newtrans->type & __TRANS_FREEZABLE)
rwsem_release(
&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1, _THIS_IP_);
@@ -1510,7 +1556,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
if (current->journal_info == trans)
current->journal_info = NULL;
- put_transaction(cur_trans);
+ btrfs_put_transaction(cur_trans);
return 0;
}
@@ -1552,15 +1598,16 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
root->fs_info->running_transaction = NULL;
spin_unlock(&root->fs_info->trans_lock);
- put_transaction(cur_trans);
- put_transaction(cur_trans);
+ if (trans->type & __TRANS_FREEZABLE)
+ sb_end_intwrite(root->fs_info->sb);
+ btrfs_put_transaction(cur_trans);
+ btrfs_put_transaction(cur_trans);
trace_btrfs_transaction_commit(root);
- btrfs_scrub_continue(root);
-
if (current->journal_info == trans)
current->journal_info = NULL;
+ btrfs_scrub_cancel(root->fs_info);
kmem_cache_free(btrfs_trans_handle_cachep, trans);
}
@@ -1575,13 +1622,6 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
return ret;
/*
- * running the delayed items may have added new refs. account
- * them now so that they hinder processing of more delayed refs
- * as little as possible.
- */
- btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
-
- /*
* rename don't use btrfs_join_transaction, so, once we
* set the transaction to blocked above, we aren't going
* to get any new ordered operations. We can safely run
@@ -1596,14 +1636,14 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
- return btrfs_start_all_delalloc_inodes(fs_info, 1);
+ return btrfs_start_delalloc_roots(fs_info, 1, -1);
return 0;
}
static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
{
if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
- btrfs_wait_all_ordered_extents(fs_info);
+ btrfs_wait_ordered_roots(fs_info, -1);
}
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
@@ -1669,7 +1709,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
wait_for_commit(root, cur_trans);
- put_transaction(cur_trans);
+ btrfs_put_transaction(cur_trans);
return ret;
}
@@ -1686,7 +1726,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
wait_for_commit(root, prev_trans);
- put_transaction(prev_trans);
+ btrfs_put_transaction(prev_trans);
} else {
spin_unlock(&root->fs_info->trans_lock);
}
@@ -1713,6 +1753,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
goto cleanup_transaction;
btrfs_wait_delalloc_flush(root->fs_info);
+
+ btrfs_scrub_pause(root);
/*
* Ok now we need to make sure to block out any other joins while we
* commit the transaction. We could have started a join before setting
@@ -1727,7 +1769,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
/* ->aborted might be set after the previous check, so check it */
if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
ret = cur_trans->aborted;
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
* the reloc mutex makes sure that we stop
@@ -1744,7 +1786,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = create_pending_snapshots(trans, root->fs_info);
if (ret) {
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
@@ -1760,13 +1802,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = btrfs_run_delayed_items(trans, root);
if (ret) {
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
if (ret) {
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
@@ -1777,7 +1819,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
WARN_ON(cur_trans != trans->transaction);
- btrfs_scrub_pause(root);
/* btrfs_commit_tree_roots is responsible for getting the
* various roots consistent with each other. Every pointer
* in the tree of tree roots has to point to the most up to date
@@ -1797,9 +1838,18 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
if (ret) {
mutex_unlock(&root->fs_info->tree_log_mutex);
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
+ /*
+ * Since the transaction is done, we should set the inode map cache flag
+ * before any other comming transaction.
+ */
+ if (btrfs_test_opt(root, CHANGE_INODE_CACHE))
+ btrfs_set_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
+ else
+ btrfs_clear_opt(root->fs_info->mount_opt, INODE_MAP_CACHE);
+
/* commit_fs_roots gets rid of all the tree log roots, it is now
* safe to free the root of tree log roots
*/
@@ -1809,7 +1859,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
if (ret) {
mutex_unlock(&root->fs_info->tree_log_mutex);
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
@@ -1820,7 +1870,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = cur_trans->aborted;
mutex_unlock(&root->fs_info->tree_log_mutex);
mutex_unlock(&root->fs_info->reloc_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
btrfs_prepare_extent_commit(trans, root);
@@ -1829,11 +1879,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_set_root_node(&root->fs_info->tree_root->root_item,
root->fs_info->tree_root->node);
- switch_commit_root(root->fs_info->tree_root);
+ list_add_tail(&root->fs_info->tree_root->dirty_list,
+ &cur_trans->switch_commits);
btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
root->fs_info->chunk_root->node);
- switch_commit_root(root->fs_info->chunk_root);
+ list_add_tail(&root->fs_info->chunk_root->dirty_list,
+ &cur_trans->switch_commits);
+
+ switch_commit_roots(cur_trans, root->fs_info);
assert_qgroups_uptodate(trans);
update_super_roots(root);
@@ -1856,13 +1910,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_error(root->fs_info, ret,
"Error while writing out transaction");
mutex_unlock(&root->fs_info->tree_log_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
ret = write_ctree_super(trans, root, 0);
if (ret) {
mutex_unlock(&root->fs_info->tree_log_mutex);
- goto cleanup_transaction;
+ goto scrub_continue;
}
/*
@@ -1885,8 +1939,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
list_del_init(&cur_trans->list);
spin_unlock(&root->fs_info->trans_lock);
- put_transaction(cur_trans);
- put_transaction(cur_trans);
+ btrfs_put_transaction(cur_trans);
+ btrfs_put_transaction(cur_trans);
if (trans->type & __TRANS_FREEZABLE)
sb_end_intwrite(root->fs_info->sb);
@@ -1905,6 +1959,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return ret;
+scrub_continue:
+ btrfs_scrub_continue(root);
cleanup_transaction:
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
@@ -1945,7 +2001,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
list_del_init(&root->root_list);
spin_unlock(&fs_info->trans_lock);
- pr_debug("btrfs: cleaner removing %llu\n", root->objectid);
+ pr_debug("BTRFS: cleaner removing %llu\n", root->objectid);
btrfs_kill_all_delayed_nodes(root);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 5c2af849162..7dd558ed071 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -57,6 +57,7 @@ struct btrfs_transaction {
struct list_head pending_snapshots;
struct list_head ordered_operations;
struct list_head pending_chunks;
+ struct list_head switch_commits;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
};
@@ -68,6 +69,7 @@ struct btrfs_transaction {
#define __TRANS_ATTACH (1U << 10)
#define __TRANS_JOIN (1U << 11)
#define __TRANS_JOIN_NOLOCK (1U << 12)
+#define __TRANS_DUMMY (1U << 13)
#define TRANS_USERSPACE (__TRANS_USERSPACE | __TRANS_FREEZABLE)
#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
@@ -78,6 +80,8 @@ struct btrfs_transaction {
#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
__TRANS_ATTACH)
+#define BTRFS_SEND_TRANS_STUB 1
+
struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
@@ -92,6 +96,8 @@ struct btrfs_trans_handle {
short aborted;
short adding_csums;
bool allocating_chunk;
+ bool reloc_reserved;
+ bool sync;
unsigned int type;
/*
* this root is only needed to validate that the root passed to
@@ -153,8 +159,6 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
int wait_for_unblock);
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
-int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
void btrfs_throttle(struct btrfs_root *root);
@@ -166,4 +170,5 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
struct extent_io_tree *dirty_pages, int mark);
int btrfs_transaction_blocked(struct btrfs_fs_info *info);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
+void btrfs_put_transaction(struct btrfs_transaction *transaction);
#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 94e05c1f118..a63719cc957 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -37,7 +37,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
int ret = 0;
int wret;
int level;
- int is_extent = 0;
int next_key_ret = 0;
u64 last_ret = 0;
u64 min_trans = 0;
@@ -50,7 +49,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
goto out;
}
- if (root->ref_cows == 0 && !is_extent)
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
goto out;
if (btrfs_test_opt(root, SSD))
@@ -85,7 +84,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
path->keep_locks = 1;
- ret = btrfs_search_forward(root, &key, NULL, path, min_trans);
+ ret = btrfs_search_forward(root, &key, path, min_trans);
if (ret < 0)
goto out;
if (ret > 0) {
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 79f057c0619..9e1f2cd5e67 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -20,14 +20,11 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/list_sort.h>
-#include "ctree.h"
-#include "transaction.h"
+#include "tree-log.h"
#include "disk-io.h"
#include "locking.h"
#include "print-tree.h"
#include "backref.h"
-#include "compat.h"
-#include "tree-log.h"
#include "hash.h"
/* magic values for the inode_only field in btrfs_log_inode:
@@ -137,43 +134,61 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
* syncing the tree wait for us to finish
*/
static int start_log_trans(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
{
+ int index;
int ret;
- int err = 0;
mutex_lock(&root->log_mutex);
if (root->log_root) {
+ if (btrfs_need_log_full_commit(root->fs_info, trans)) {
+ ret = -EAGAIN;
+ goto out;
+ }
if (!root->log_start_pid) {
root->log_start_pid = current->pid;
- root->log_multiple_pids = false;
+ clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
} else if (root->log_start_pid != current->pid) {
- root->log_multiple_pids = true;
+ set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
}
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
+ if (ctx) {
+ index = root->log_transid % 2;
+ list_add_tail(&ctx->list, &root->log_ctxs[index]);
+ ctx->log_transid = root->log_transid;
+ }
mutex_unlock(&root->log_mutex);
return 0;
}
- root->log_multiple_pids = false;
- root->log_start_pid = current->pid;
+
+ ret = 0;
mutex_lock(&root->fs_info->tree_log_mutex);
- if (!root->fs_info->log_root_tree) {
+ if (!root->fs_info->log_root_tree)
ret = btrfs_init_log_root_tree(trans, root->fs_info);
- if (ret)
- err = ret;
- }
- if (err == 0 && !root->log_root) {
+ mutex_unlock(&root->fs_info->tree_log_mutex);
+ if (ret)
+ goto out;
+
+ if (!root->log_root) {
ret = btrfs_add_log_tree(trans, root);
if (ret)
- err = ret;
+ goto out;
}
- mutex_unlock(&root->fs_info->tree_log_mutex);
+ clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+ root->log_start_pid = current->pid;
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
+ if (ctx) {
+ index = root->log_transid % 2;
+ list_add_tail(&ctx->list, &root->log_ctxs[index]);
+ ctx->log_transid = root->log_transid;
+ }
+out:
mutex_unlock(&root->log_mutex);
- return err;
+ return ret;
}
/*
@@ -571,7 +586,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
nbytes = 0;
} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
- size = btrfs_file_extent_inline_len(eb, item);
+ size = btrfs_file_extent_inline_len(eb, slot, item);
nbytes = btrfs_file_extent_ram_bytes(eb, item);
extent_end = ALIGN(start + size, root->sectorsize);
} else {
@@ -936,7 +951,7 @@ again:
parent_objectid,
victim_name,
victim_name_len)) {
- btrfs_inc_nlink(inode);
+ inc_nlink(inode);
btrfs_release_path(path);
ret = btrfs_unlink_inode(trans, root, dir,
@@ -1006,7 +1021,7 @@ again:
victim_parent = read_one_inode(root,
parent_objectid);
if (victim_parent) {
- btrfs_inc_nlink(inode);
+ inc_nlink(inode);
btrfs_release_path(path);
ret = btrfs_unlink_inode(trans, root,
@@ -1113,11 +1128,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, int slot,
struct btrfs_key *key)
{
- struct inode *dir;
- struct inode *inode;
+ struct inode *dir = NULL;
+ struct inode *inode = NULL;
unsigned long ref_ptr;
unsigned long ref_end;
- char *name;
+ char *name = NULL;
int namelen;
int ret;
int search_done = 0;
@@ -1150,13 +1165,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
* care of the rest
*/
dir = read_one_inode(root, parent_objectid);
- if (!dir)
- return -ENOENT;
+ if (!dir) {
+ ret = -ENOENT;
+ goto out;
+ }
inode = read_one_inode(root, inode_objectid);
if (!inode) {
- iput(dir);
- return -EIO;
+ ret = -EIO;
+ goto out;
}
while (ref_ptr < ref_end) {
@@ -1169,14 +1186,16 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
*/
if (!dir)
dir = read_one_inode(root, parent_objectid);
- if (!dir)
- return -ENOENT;
+ if (!dir) {
+ ret = -ENOENT;
+ goto out;
+ }
} else {
ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
&ref_index);
}
if (ret)
- return ret;
+ goto out;
/* if we already have a perfect match, we're done */
if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
@@ -1196,12 +1215,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
parent_objectid,
ref_index, name, namelen,
&search_done);
- if (ret == 1) {
- ret = 0;
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
goto out;
}
- if (ret)
- goto out;
}
/* insert our name */
@@ -1215,6 +1233,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
kfree(name);
+ name = NULL;
if (log_ref_ver) {
iput(dir);
dir = NULL;
@@ -1225,6 +1244,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
ret = overwrite_item(trans, root, path, eb, slot, key);
out:
btrfs_release_path(path);
+ kfree(name);
iput(dir);
iput(inode);
return ret;
@@ -1234,7 +1254,8 @@ static int insert_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset)
{
int ret;
- ret = btrfs_find_orphan_item(root, offset);
+ ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID,
+ offset, BTRFS_ORPHAN_ITEM_KEY, NULL);
if (ret > 0)
ret = btrfs_insert_orphan_item(trans, root, offset);
return ret;
@@ -1307,6 +1328,7 @@ static int count_inode_refs(struct btrfs_root *root,
break;
path->slots[0]--;
}
+process_slot:
btrfs_item_key_to_cpu(path->nodes[0], &key,
path->slots[0]);
if (key.objectid != ino ||
@@ -1327,6 +1349,10 @@ static int count_inode_refs(struct btrfs_root *root,
if (key.offset == 0)
break;
+ if (path->slots[0] > 0) {
+ path->slots[0]--;
+ goto process_slot;
+ }
key.offset--;
btrfs_release_path(path);
}
@@ -1480,7 +1506,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
if (!inode->i_nlink)
set_nlink(inode, 1);
else
- btrfs_inc_nlink(inode);
+ inc_nlink(inode);
ret = btrfs_update_inode(trans, root, inode);
} else if (ret == -EEXIST) {
ret = 0;
@@ -1823,7 +1849,7 @@ again:
dir_key->offset,
name, name_len, 0);
}
- if (IS_ERR_OR_NULL(log_di)) {
+ if (!log_di || (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
btrfs_dir_item_key_to_cpu(eb, di, &location);
btrfs_release_path(path);
btrfs_release_path(log_path);
@@ -1841,7 +1867,7 @@ again:
goto out;
}
- btrfs_inc_nlink(inode);
+ inc_nlink(inode);
ret = btrfs_unlink_inode(trans, root, dir, inode,
name, name_len);
if (!ret)
@@ -1860,6 +1886,9 @@ again:
goto again;
ret = 0;
goto out;
+ } else if (IS_ERR(log_di)) {
+ kfree(name);
+ return PTR_ERR(log_di);
}
btrfs_release_path(log_path);
kfree(name);
@@ -2118,8 +2147,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
WARN_ON(*level >= BTRFS_MAX_LEVEL);
cur = path->nodes[*level];
- if (btrfs_header_level(cur) != *level)
- WARN_ON(1);
+ WARN_ON(btrfs_header_level(cur) != *level);
if (path->slots[*level] >=
btrfs_header_nritems(cur))
@@ -2151,11 +2179,13 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
return ret;
}
- btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
- clean_tree_block(trans, root, next);
- btrfs_wait_tree_block_writeback(next);
- btrfs_tree_unlock(next);
+ if (trans) {
+ btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
+ clean_tree_block(trans, root, next);
+ btrfs_wait_tree_block_writeback(next);
+ btrfs_tree_unlock(next);
+ }
WARN_ON(root_owner !=
BTRFS_TREE_LOG_OBJECTID);
@@ -2227,11 +2257,13 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
next = path->nodes[*level];
- btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
- clean_tree_block(trans, root, next);
- btrfs_wait_tree_block_writeback(next);
- btrfs_tree_unlock(next);
+ if (trans) {
+ btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
+ clean_tree_block(trans, root, next);
+ btrfs_wait_tree_block_writeback(next);
+ btrfs_tree_unlock(next);
+ }
WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
ret = btrfs_free_and_pin_reserved_extent(root,
@@ -2301,11 +2333,13 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
next = path->nodes[orig_level];
- btrfs_tree_lock(next);
- btrfs_set_lock_blocking(next);
- clean_tree_block(trans, log, next);
- btrfs_wait_tree_block_writeback(next);
- btrfs_tree_unlock(next);
+ if (trans) {
+ btrfs_tree_lock(next);
+ btrfs_set_lock_blocking(next);
+ clean_tree_block(trans, log, next);
+ btrfs_wait_tree_block_writeback(next);
+ btrfs_tree_unlock(next);
+ }
WARN_ON(log->root_key.objectid !=
BTRFS_TREE_LOG_OBJECTID);
@@ -2341,8 +2375,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
return ret;
}
-static int wait_log_commit(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, unsigned long transid)
+static void wait_log_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, int transid)
{
DEFINE_WAIT(wait);
int index = transid % 2;
@@ -2357,36 +2391,63 @@ static int wait_log_commit(struct btrfs_trans_handle *trans,
&wait, TASK_UNINTERRUPTIBLE);
mutex_unlock(&root->log_mutex);
- if (root->fs_info->last_trans_log_full_commit !=
- trans->transid && root->log_transid < transid + 2 &&
+ if (root->log_transid_committed < transid &&
atomic_read(&root->log_commit[index]))
schedule();
finish_wait(&root->log_commit_wait[index], &wait);
mutex_lock(&root->log_mutex);
- } while (root->fs_info->last_trans_log_full_commit !=
- trans->transid && root->log_transid < transid + 2 &&
+ } while (root->log_transid_committed < transid &&
atomic_read(&root->log_commit[index]));
- return 0;
}
static void wait_for_writer(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
DEFINE_WAIT(wait);
- while (root->fs_info->last_trans_log_full_commit !=
- trans->transid && atomic_read(&root->log_writers)) {
+
+ while (atomic_read(&root->log_writers)) {
prepare_to_wait(&root->log_writer_wait,
&wait, TASK_UNINTERRUPTIBLE);
mutex_unlock(&root->log_mutex);
- if (root->fs_info->last_trans_log_full_commit !=
- trans->transid && atomic_read(&root->log_writers))
+ if (atomic_read(&root->log_writers))
schedule();
mutex_lock(&root->log_mutex);
finish_wait(&root->log_writer_wait, &wait);
}
}
+static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
+ struct btrfs_log_ctx *ctx)
+{
+ if (!ctx)
+ return;
+
+ mutex_lock(&root->log_mutex);
+ list_del_init(&ctx->list);
+ mutex_unlock(&root->log_mutex);
+}
+
+/*
+ * Invoked in log mutex context, or be sure there is no other task which
+ * can access the list.
+ */
+static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
+ int index, int error)
+{
+ struct btrfs_log_ctx *ctx;
+
+ if (!error) {
+ INIT_LIST_HEAD(&root->log_ctxs[index]);
+ return;
+ }
+
+ list_for_each_entry(ctx, &root->log_ctxs[index], list)
+ ctx->log_ret = error;
+
+ INIT_LIST_HEAD(&root->log_ctxs[index]);
+}
+
/*
* btrfs_sync_log does sends a given tree log down to the disk and
* updates the super blocks to record it. When this call is done,
@@ -2400,7 +2461,7 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
* that has happened.
*/
int btrfs_sync_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+ struct btrfs_root *root, struct btrfs_log_ctx *ctx)
{
int index1;
int index2;
@@ -2408,26 +2469,35 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
- unsigned long log_transid = 0;
+ int log_transid = 0;
+ struct btrfs_log_ctx root_log_ctx;
struct blk_plug plug;
mutex_lock(&root->log_mutex);
- log_transid = root->log_transid;
- index1 = root->log_transid % 2;
+ log_transid = ctx->log_transid;
+ if (root->log_transid_committed >= log_transid) {
+ mutex_unlock(&root->log_mutex);
+ return ctx->log_ret;
+ }
+
+ index1 = log_transid % 2;
if (atomic_read(&root->log_commit[index1])) {
- wait_log_commit(trans, root, root->log_transid);
+ wait_log_commit(trans, root, log_transid);
mutex_unlock(&root->log_mutex);
- return 0;
+ return ctx->log_ret;
}
+ ASSERT(log_transid == root->log_transid);
atomic_set(&root->log_commit[index1], 1);
/* wait for previous tree log sync to complete */
if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
- wait_log_commit(trans, root, root->log_transid - 1);
+ wait_log_commit(trans, root, log_transid - 1);
+
while (1) {
int batch = atomic_read(&root->log_batch);
/* when we're on an ssd, just kick the log commit out */
- if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
+ if (!btrfs_test_opt(root, SSD) &&
+ test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
mutex_unlock(&root->log_mutex);
schedule_timeout_uninterruptible(1);
mutex_lock(&root->log_mutex);
@@ -2438,7 +2508,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
/* bail out if we need to do a full commit */
- if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ if (btrfs_need_log_full_commit(root->fs_info, trans)) {
ret = -EAGAIN;
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&root->log_mutex);
@@ -2459,6 +2529,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
blk_finish_plug(&plug);
btrfs_abort_transaction(trans, root, ret);
btrfs_free_logged_extents(log, log_transid);
+ btrfs_set_log_full_commit(root->fs_info, trans);
mutex_unlock(&root->log_mutex);
goto out;
}
@@ -2468,7 +2539,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
root->log_transid++;
log->log_transid = root->log_transid;
root->log_start_pid = 0;
- smp_mb();
/*
* IO has been started, blocks of the log tree have WRITTEN flag set
* in their headers. new modifications of the log will be written to
@@ -2476,9 +2546,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
mutex_unlock(&root->log_mutex);
+ btrfs_init_log_ctx(&root_log_ctx);
+
mutex_lock(&log_root_tree->log_mutex);
atomic_inc(&log_root_tree->log_batch);
atomic_inc(&log_root_tree->log_writers);
+
+ index2 = log_root_tree->log_transid % 2;
+ list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
+ root_log_ctx.log_transid = log_root_tree->log_transid;
+
mutex_unlock(&log_root_tree->log_mutex);
ret = update_log_root(trans, log);
@@ -2491,13 +2568,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
if (ret) {
+ if (!list_empty(&root_log_ctx.list))
+ list_del_init(&root_log_ctx.list);
+
blk_finish_plug(&plug);
+ btrfs_set_log_full_commit(root->fs_info, trans);
+
if (ret != -ENOSPC) {
btrfs_abort_transaction(trans, root, ret);
mutex_unlock(&log_root_tree->log_mutex);
goto out;
}
- root->fs_info->last_trans_log_full_commit = trans->transid;
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2505,22 +2586,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out;
}
- index2 = log_root_tree->log_transid % 2;
+ if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
+ mutex_unlock(&log_root_tree->log_mutex);
+ ret = root_log_ctx.log_ret;
+ goto out;
+ }
+
+ index2 = root_log_ctx.log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
blk_finish_plug(&plug);
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
wait_log_commit(trans, log_root_tree,
- log_root_tree->log_transid);
+ root_log_ctx.log_transid);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
- ret = 0;
+ ret = root_log_ctx.log_ret;
goto out;
}
+ ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
atomic_set(&log_root_tree->log_commit[index2], 1);
if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
wait_log_commit(trans, log_root_tree,
- log_root_tree->log_transid - 1);
+ root_log_ctx.log_transid - 1);
}
wait_for_writer(trans, log_root_tree);
@@ -2529,7 +2617,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* now that we've moved on to the tree of log tree roots,
* check the full commit flag again
*/
- if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ if (btrfs_need_log_full_commit(root->fs_info, trans)) {
blk_finish_plug(&plug);
btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
btrfs_free_logged_extents(log, log_transid);
@@ -2543,6 +2631,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
EXTENT_DIRTY | EXTENT_NEW);
blk_finish_plug(&plug);
if (ret) {
+ btrfs_set_log_full_commit(root->fs_info, trans);
btrfs_abort_transaction(trans, root, ret);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
@@ -2560,8 +2649,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_header_level(log_root_tree->node));
log_root_tree->log_transid++;
- smp_mb();
-
mutex_unlock(&log_root_tree->log_mutex);
/*
@@ -2571,10 +2658,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* the running transaction open, so a full commit can't hop
* in and cause problems either.
*/
- btrfs_scrub_pause_super(root);
ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
- btrfs_scrub_continue_super(root);
if (ret) {
+ btrfs_set_log_full_commit(root->fs_info, trans);
btrfs_abort_transaction(trans, root, ret);
goto out_wake_log_root;
}
@@ -2585,13 +2671,28 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&root->log_mutex);
out_wake_log_root:
+ /*
+ * We needn't get log_mutex here because we are sure all
+ * the other tasks are blocked.
+ */
+ btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
+
+ mutex_lock(&log_root_tree->log_mutex);
+ log_root_tree->log_transid_committed++;
atomic_set(&log_root_tree->log_commit[index2], 0);
- smp_mb();
+ mutex_unlock(&log_root_tree->log_mutex);
+
if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
wake_up(&log_root_tree->log_commit_wait[index2]);
out:
+ /* See above. */
+ btrfs_remove_all_log_ctxs(root, index1, ret);
+
+ mutex_lock(&root->log_mutex);
+ root->log_transid_committed++;
atomic_set(&root->log_commit[index1], 0);
- smp_mb();
+ mutex_unlock(&root->log_mutex);
+
if (waitqueue_active(&root->log_commit_wait[index1]))
wake_up(&root->log_commit_wait[index1]);
return ret;
@@ -2608,13 +2709,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
.process_func = process_one_buffer
};
- if (trans) {
- ret = walk_log_tree(trans, log, &wc);
-
- /* I don't think this can happen but just in case */
- if (ret)
- btrfs_abort_transaction(trans, log, ret);
- }
+ ret = walk_log_tree(trans, log, &wc);
+ /* I don't think this can happen but just in case */
+ if (ret)
+ btrfs_abort_transaction(trans, log, ret);
while (1) {
ret = find_first_extent_bit(&log->dirty_log_pages,
@@ -2780,7 +2878,7 @@ fail:
out_unlock:
mutex_unlock(&BTRFS_I(dir)->log_mutex);
if (ret == -ENOSPC) {
- root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
ret = 0;
} else if (ret < 0)
btrfs_abort_transaction(trans, root, ret);
@@ -2813,7 +2911,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
dirid, &index);
mutex_unlock(&BTRFS_I(inode)->log_mutex);
if (ret == -ENOSPC) {
- root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
ret = 0;
} else if (ret < 0 && ret != -ENOENT)
btrfs_abort_transaction(trans, root, ret);
@@ -2867,7 +2965,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
u64 min_offset, u64 *last_offset_ret)
{
struct btrfs_key min_key;
- struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
struct extent_buffer *src;
int err = 0;
@@ -2879,9 +2976,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
u64 ino = btrfs_ino(inode);
log = root->log_root;
- max_key.objectid = ino;
- max_key.offset = (u64)-1;
- max_key.type = key_type;
min_key.objectid = ino;
min_key.type = key_type;
@@ -2889,8 +2983,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
path->keep_locks = 1;
- ret = btrfs_search_forward(root, &min_key, &max_key,
- path, trans->transid);
+ ret = btrfs_search_forward(root, &min_key, path, trans->transid);
/*
* we didn't find anything from this transaction, see if there
@@ -2943,10 +3036,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
/* find the first key from this transaction again */
ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
- if (ret != 0) {
- WARN_ON(1);
+ if (WARN_ON(ret != 0))
goto done;
- }
/*
* we have a block from this transaction, log every item in it
@@ -3172,11 +3263,10 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
struct inode *inode)
{
struct btrfs_inode_item *inode_item;
- struct btrfs_key key;
int ret;
- memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
- ret = btrfs_insert_empty_item(trans, log, path, &key,
+ ret = btrfs_insert_empty_item(trans, log, path,
+ &BTRFS_I(inode)->location,
sizeof(*inode_item));
if (ret && ret != -EEXIST)
return ret;
@@ -3190,7 +3280,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
static noinline int copy_items(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_path *dst_path,
- struct extent_buffer *src,
+ struct btrfs_path *src_path, u64 *last_extent,
int start_slot, int nr, int inode_only)
{
unsigned long src_offset;
@@ -3198,6 +3288,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
struct btrfs_file_extent_item *extent;
struct btrfs_inode_item *inode_item;
+ struct extent_buffer *src = src_path->nodes[0];
+ struct btrfs_key first_key, last_key, key;
int ret;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
@@ -3205,6 +3297,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
int i;
struct list_head ordered_sums;
int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+ bool has_extents = false;
+ bool need_find_last_extent = (*last_extent == 0);
+ bool done = false;
INIT_LIST_HEAD(&ordered_sums);
@@ -3213,6 +3308,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
if (!ins_data)
return -ENOMEM;
+ first_key.objectid = (u64)-1;
+
ins_sizes = (u32 *)ins_data;
ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
@@ -3233,6 +3330,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
src_offset = btrfs_item_ptr_offset(src, start_slot + i);
+ if ((i == (nr - 1)))
+ last_key = ins_keys[i];
+
if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
inode_item = btrfs_item_ptr(dst_path->nodes[0],
dst_path->slots[0],
@@ -3244,6 +3344,21 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
src_offset, ins_sizes[i]);
}
+ /*
+ * We set need_find_last_extent here in case we know we were
+ * processing other items and then walk into the first extent in
+ * the inode. If we don't hit an extent then nothing changes,
+ * we'll do the last search the next time around.
+ */
+ if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
+ has_extents = true;
+ if (need_find_last_extent &&
+ first_key.objectid == (u64)-1)
+ first_key = ins_keys[i];
+ } else {
+ need_find_last_extent = false;
+ }
+
/* take a reference on file data extents so that truncates
* or deletes of this inode don't have to relog the inode
* again
@@ -3308,6 +3423,128 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
list_del(&sums->list);
kfree(sums);
}
+
+ if (!has_extents)
+ return ret;
+
+ /*
+ * Because we use btrfs_search_forward we could skip leaves that were
+ * not modified and then assume *last_extent is valid when it really
+ * isn't. So back up to the previous leaf and read the end of the last
+ * extent before we go and fill in holes.
+ */
+ if (need_find_last_extent) {
+ u64 len;
+
+ ret = btrfs_prev_leaf(BTRFS_I(inode)->root, src_path);
+ if (ret < 0)
+ return ret;
+ if (ret)
+ goto fill_holes;
+ if (src_path->slots[0])
+ src_path->slots[0]--;
+ src = src_path->nodes[0];
+ btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
+ if (key.objectid != btrfs_ino(inode) ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ goto fill_holes;
+ extent = btrfs_item_ptr(src, src_path->slots[0],
+ struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(src, extent) ==
+ BTRFS_FILE_EXTENT_INLINE) {
+ len = btrfs_file_extent_inline_len(src,
+ src_path->slots[0],
+ extent);
+ *last_extent = ALIGN(key.offset + len,
+ log->sectorsize);
+ } else {
+ len = btrfs_file_extent_num_bytes(src, extent);
+ *last_extent = key.offset + len;
+ }
+ }
+fill_holes:
+ /* So we did prev_leaf, now we need to move to the next leaf, but a few
+ * things could have happened
+ *
+ * 1) A merge could have happened, so we could currently be on a leaf
+ * that holds what we were copying in the first place.
+ * 2) A split could have happened, and now not all of the items we want
+ * are on the same leaf.
+ *
+ * So we need to adjust how we search for holes, we need to drop the
+ * path and re-search for the first extent key we found, and then walk
+ * forward until we hit the last one we copied.
+ */
+ if (need_find_last_extent) {
+ /* btrfs_prev_leaf could return 1 without releasing the path */
+ btrfs_release_path(src_path);
+ ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &first_key,
+ src_path, 0, 0);
+ if (ret < 0)
+ return ret;
+ ASSERT(ret == 0);
+ src = src_path->nodes[0];
+ i = src_path->slots[0];
+ } else {
+ i = start_slot;
+ }
+
+ /*
+ * Ok so here we need to go through and fill in any holes we may have
+ * to make sure that holes are punched for those areas in case they had
+ * extents previously.
+ */
+ while (!done) {
+ u64 offset, len;
+ u64 extent_end;
+
+ if (i >= btrfs_header_nritems(src_path->nodes[0])) {
+ ret = btrfs_next_leaf(BTRFS_I(inode)->root, src_path);
+ if (ret < 0)
+ return ret;
+ ASSERT(ret == 0);
+ src = src_path->nodes[0];
+ i = 0;
+ }
+
+ btrfs_item_key_to_cpu(src, &key, i);
+ if (!btrfs_comp_cpu_keys(&key, &last_key))
+ done = true;
+ if (key.objectid != btrfs_ino(inode) ||
+ key.type != BTRFS_EXTENT_DATA_KEY) {
+ i++;
+ continue;
+ }
+ extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
+ if (btrfs_file_extent_type(src, extent) ==
+ BTRFS_FILE_EXTENT_INLINE) {
+ len = btrfs_file_extent_inline_len(src, i, extent);
+ extent_end = ALIGN(key.offset + len, log->sectorsize);
+ } else {
+ len = btrfs_file_extent_num_bytes(src, extent);
+ extent_end = key.offset + len;
+ }
+ i++;
+
+ if (*last_extent == key.offset) {
+ *last_extent = extent_end;
+ continue;
+ }
+ offset = *last_extent;
+ len = key.offset - *last_extent;
+ ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
+ offset, 0, 0, len, 0, len, 0,
+ 0, 0);
+ if (ret)
+ break;
+ *last_extent = offset + len;
+ }
+ /*
+ * Need to let the callers know we dropped the path so they should
+ * re-search.
+ */
+ if (!ret && need_find_last_extent)
+ ret = 1;
return ret;
}
@@ -3327,7 +3564,8 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
static int log_one_extent(struct btrfs_trans_handle *trans,
struct inode *inode, struct btrfs_root *root,
- struct extent_map *em, struct btrfs_path *path)
+ struct extent_map *em, struct btrfs_path *path,
+ struct list_head *logged_list)
{
struct btrfs_root *log = root->log_root;
struct btrfs_file_extent_item *fi;
@@ -3343,23 +3581,28 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
u64 extent_offset = em->start - em->orig_start;
u64 block_len;
int ret;
- int index = log->log_transid % 2;
bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
-
- ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
- em->start + em->len, NULL, 0);
- if (ret)
- return ret;
+ int extent_inserted = 0;
INIT_LIST_HEAD(&ordered_sums);
btrfs_init_map_token(&token);
- key.objectid = btrfs_ino(inode);
- key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = em->start;
- ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
+ ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
+ em->start + em->len, NULL, 0, 1,
+ sizeof(*fi), &extent_inserted);
if (ret)
return ret;
+
+ if (!extent_inserted) {
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = em->start;
+
+ ret = btrfs_insert_empty_item(trans, log, path, &key,
+ sizeof(*fi));
+ if (ret)
+ return ret;
+ }
leaf = path->nodes[0];
fi = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
@@ -3375,7 +3618,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
btrfs_set_token_file_extent_type(leaf, fi,
BTRFS_FILE_EXTENT_REG,
&token);
- if (em->block_start == 0)
+ if (em->block_start == EXTENT_MAP_HOLE)
skip_csum = true;
}
@@ -3417,26 +3660,16 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
if (skip_csum)
return 0;
- if (em->compress_type) {
- csum_offset = 0;
- csum_len = block_len;
- }
-
/*
* First check and see if our csums are on our outstanding ordered
* extents.
*/
-again:
- spin_lock_irq(&log->log_extents_lock[index]);
- list_for_each_entry(ordered, &log->logged_list[index], log_list) {
+ list_for_each_entry(ordered, logged_list, log_list) {
struct btrfs_ordered_sum *sum;
if (!mod_len)
break;
- if (ordered->inode != inode)
- continue;
-
if (ordered->file_offset + ordered->len <= mod_start ||
mod_start + mod_len <= ordered->file_offset)
continue;
@@ -3479,34 +3712,32 @@ again:
if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
&ordered->flags))
continue;
- atomic_inc(&ordered->refs);
- spin_unlock_irq(&log->log_extents_lock[index]);
- /*
- * we've dropped the lock, we must either break or
- * start over after this.
- */
- wait_event(ordered->wait, ordered->csum_bytes_left == 0);
+ if (ordered->csum_bytes_left) {
+ btrfs_start_ordered_extent(inode, ordered, 0);
+ wait_event(ordered->wait,
+ ordered->csum_bytes_left == 0);
+ }
list_for_each_entry(sum, &ordered->list, list) {
ret = btrfs_csum_file_blocks(trans, log, sum);
- if (ret) {
- btrfs_put_ordered_extent(ordered);
+ if (ret)
goto unlocked;
- }
}
- btrfs_put_ordered_extent(ordered);
- goto again;
}
- spin_unlock_irq(&log->log_extents_lock[index]);
unlocked:
if (!mod_len || ret)
return ret;
- csum_offset = mod_start - em->start;
- csum_len = mod_len;
+ if (em->compress_type) {
+ csum_offset = 0;
+ csum_len = block_len;
+ } else {
+ csum_offset = mod_start - em->start;
+ csum_len = mod_len;
+ }
/* block start is already adjusted for the file extent offset. */
ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
@@ -3532,7 +3763,8 @@ unlocked:
static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode,
- struct btrfs_path *path)
+ struct btrfs_path *path,
+ struct list_head *logged_list)
{
struct extent_map *em, *n;
struct list_head extents;
@@ -3590,7 +3822,7 @@ process:
write_unlock(&tree->lock);
- ret = log_one_extent(trans, inode, root, em, path);
+ ret = log_one_extent(trans, inode, root, em, path, logged_list);
write_lock(&tree->lock);
clear_em_logging(tree, em);
free_extent_map(em);
@@ -3626,6 +3858,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
struct extent_buffer *src = NULL;
+ LIST_HEAD(logged_list);
+ u64 last_extent = 0;
int err = 0;
int ret;
int nritems;
@@ -3673,7 +3907,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
mutex_lock(&BTRFS_I(inode)->log_mutex);
- btrfs_get_logged_extents(log, inode);
+ btrfs_get_logged_extents(inode, &logged_list);
/*
* a brute force approach to making sure we get the most uptodate
@@ -3693,7 +3927,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
ret = btrfs_truncate_inode_items(trans, log,
inode, 0, 0);
} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
- &BTRFS_I(inode)->runtime_flags)) {
+ &BTRFS_I(inode)->runtime_flags) ||
+ inode_only == LOG_INODE_EXISTS) {
if (inode_only == LOG_INODE_ALL)
fast_search = true;
max_key.type = BTRFS_XATTR_ITEM_KEY;
@@ -3719,7 +3954,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
while (1) {
ins_nr = 0;
- ret = btrfs_search_forward(root, &min_key, &max_key,
+ ret = btrfs_search_forward(root, &min_key,
path, trans->transid);
if (ret != 0)
break;
@@ -3740,11 +3975,15 @@ again:
goto next_slot;
}
- ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
- ins_nr, inode_only);
- if (ret) {
+ ret = copy_items(trans, inode, dst_path, path, &last_extent,
+ ins_start_slot, ins_nr, inode_only);
+ if (ret < 0) {
err = ret;
goto out_unlock;
+ } if (ret) {
+ ins_nr = 0;
+ btrfs_release_path(path);
+ continue;
}
ins_nr = 1;
ins_start_slot = path->slots[0];
@@ -3758,33 +3997,35 @@ next_slot:
goto again;
}
if (ins_nr) {
- ret = copy_items(trans, inode, dst_path, src,
- ins_start_slot,
+ ret = copy_items(trans, inode, dst_path, path,
+ &last_extent, ins_start_slot,
ins_nr, inode_only);
- if (ret) {
+ if (ret < 0) {
err = ret;
goto out_unlock;
}
+ ret = 0;
ins_nr = 0;
}
btrfs_release_path(path);
- if (min_key.offset < (u64)-1)
+ if (min_key.offset < (u64)-1) {
min_key.offset++;
- else if (min_key.type < (u8)-1)
+ } else if (min_key.type < max_key.type) {
min_key.type++;
- else if (min_key.objectid < (u64)-1)
- min_key.objectid++;
- else
+ min_key.offset = 0;
+ } else {
break;
+ }
}
if (ins_nr) {
- ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
- ins_nr, inode_only);
- if (ret) {
+ ret = copy_items(trans, inode, dst_path, path, &last_extent,
+ ins_start_slot, ins_nr, inode_only);
+ if (ret < 0) {
err = ret;
goto out_unlock;
}
+ ret = 0;
ins_nr = 0;
}
@@ -3792,12 +4033,13 @@ log_extents:
btrfs_release_path(path);
btrfs_release_path(dst_path);
if (fast_search) {
- ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
+ ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
+ &logged_list);
if (ret) {
err = ret;
goto out_unlock;
}
- } else {
+ } else if (inode_only == LOG_INODE_ALL) {
struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em, *n;
@@ -3817,8 +4059,10 @@ log_extents:
BTRFS_I(inode)->logged_trans = trans->transid;
BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
out_unlock:
- if (err)
- btrfs_free_logged_extents(log, log->log_transid);
+ if (unlikely(err))
+ btrfs_put_logged_extents(&logged_list);
+ else
+ btrfs_submit_logged_extents(&logged_list, log);
mutex_unlock(&BTRFS_I(inode)->log_mutex);
btrfs_free_path(path);
@@ -3878,8 +4122,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
* make sure any commits to the log are forced
* to be full commits
*/
- root->fs_info->last_trans_log_full_commit =
- trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
ret = 1;
break;
}
@@ -3909,7 +4152,8 @@ out:
*/
static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
- struct dentry *parent, int exists_only)
+ struct dentry *parent, int exists_only,
+ struct btrfs_log_ctx *ctx)
{
int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
struct super_block *sb;
@@ -3924,6 +4168,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
+ /*
+ * The prev transaction commit doesn't complete, we need do
+ * full commit by ourselves.
+ */
if (root->fs_info->last_trans_log_full_commit >
root->fs_info->last_trans_committed) {
ret = 1;
@@ -3946,9 +4194,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
- ret = start_log_trans(trans, root);
+ ret = start_log_trans(trans, root, ctx);
if (ret)
- goto end_trans;
+ goto end_no_trans;
ret = btrfs_log_inode(trans, root, inode, inode_only);
if (ret)
@@ -3993,9 +4241,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
end_trans:
dput(old_parent);
if (ret < 0) {
- root->fs_info->last_trans_log_full_commit = trans->transid;
+ btrfs_set_log_full_commit(root->fs_info, trans);
ret = 1;
}
+
+ if (ret)
+ btrfs_remove_log_ctx(root, ctx);
btrfs_end_log_trans(root);
end_no_trans:
return ret;
@@ -4008,12 +4259,14 @@ end_no_trans:
* data on disk.
*/
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry)
+ struct btrfs_root *root, struct dentry *dentry,
+ struct btrfs_log_ctx *ctx)
{
struct dentry *parent = dget_parent(dentry);
int ret;
- ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
+ ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
+ 0, ctx);
dput(parent);
return ret;
@@ -4250,6 +4503,6 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
root->fs_info->last_trans_committed))
return 0;
- return btrfs_log_inode_parent(trans, root, inode, parent, 1);
+ return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 1d4ae0d15a7..7f5b41bd537 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -19,17 +19,47 @@
#ifndef __TREE_LOG_
#define __TREE_LOG_
+#include "ctree.h"
+#include "transaction.h"
+
/* return value for btrfs_log_dentry_safe that means we don't need to log it at all */
#define BTRFS_NO_LOG_SYNC 256
+struct btrfs_log_ctx {
+ int log_ret;
+ int log_transid;
+ struct list_head list;
+};
+
+static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
+{
+ ctx->log_ret = 0;
+ ctx->log_transid = 0;
+ INIT_LIST_HEAD(&ctx->list);
+}
+
+static inline void btrfs_set_log_full_commit(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans)
+{
+ ACCESS_ONCE(fs_info->last_trans_log_full_commit) = trans->transid;
+}
+
+static inline int btrfs_need_log_full_commit(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans)
+{
+ return ACCESS_ONCE(fs_info->last_trans_log_full_commit) ==
+ trans->transid;
+}
+
int btrfs_sync_log(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
+ struct btrfs_root *root, struct btrfs_log_ctx *ctx);
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry);
+ struct btrfs_root *root, struct dentry *dentry,
+ struct btrfs_log_ctx *ctx);
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c
index b0a523b2c60..840a38b2778 100644
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -5,8 +5,8 @@
*/
#include <linux/slab.h>
-#include <linux/export.h>
#include "ulist.h"
+#include "ctree.h"
/*
* ulist is a generic data structure to hold a collection of unique u64
@@ -14,10 +14,6 @@
* enumerating it.
* It is possible to store an auxiliary value along with the key.
*
- * The implementation is preliminary and can probably be sped up
- * significantly. A first step would be to store the values in an rbtree
- * as soon as ULIST_SIZE is exceeded.
- *
* A sample usage for ulists is the enumeration of directed graphs without
* visiting a node twice. The pseudo-code could look like this:
*
@@ -50,12 +46,10 @@
*/
void ulist_init(struct ulist *ulist)
{
- ulist->nnodes = 0;
- ulist->nodes = ulist->int_nodes;
- ulist->nodes_alloced = ULIST_SIZE;
+ INIT_LIST_HEAD(&ulist->nodes);
ulist->root = RB_ROOT;
+ ulist->nnodes = 0;
}
-EXPORT_SYMBOL(ulist_init);
/**
* ulist_fini - free up additionally allocated memory for the ulist
@@ -64,18 +58,17 @@ EXPORT_SYMBOL(ulist_init);
* This is useful in cases where the base 'struct ulist' has been statically
* allocated.
*/
-void ulist_fini(struct ulist *ulist)
+static void ulist_fini(struct ulist *ulist)
{
- /*
- * The first ULIST_SIZE elements are stored inline in struct ulist.
- * Only if more elements are alocated they need to be freed.
- */
- if (ulist->nodes_alloced > ULIST_SIZE)
- kfree(ulist->nodes);
- ulist->nodes_alloced = 0; /* in case ulist_fini is called twice */
+ struct ulist_node *node;
+ struct ulist_node *next;
+
+ list_for_each_entry_safe(node, next, &ulist->nodes, list) {
+ kfree(node);
+ }
ulist->root = RB_ROOT;
+ INIT_LIST_HEAD(&ulist->nodes);
}
-EXPORT_SYMBOL(ulist_fini);
/**
* ulist_reinit - prepare a ulist for reuse
@@ -89,7 +82,6 @@ void ulist_reinit(struct ulist *ulist)
ulist_fini(ulist);
ulist_init(ulist);
}
-EXPORT_SYMBOL(ulist_reinit);
/**
* ulist_alloc - dynamically allocate a ulist
@@ -108,7 +100,6 @@ struct ulist *ulist_alloc(gfp_t gfp_mask)
return ulist;
}
-EXPORT_SYMBOL(ulist_alloc);
/**
* ulist_free - free dynamically allocated ulist
@@ -123,7 +114,6 @@ void ulist_free(struct ulist *ulist)
ulist_fini(ulist);
kfree(ulist);
}
-EXPORT_SYMBOL(ulist_free);
static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val)
{
@@ -192,63 +182,32 @@ int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask)
int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
u64 *old_aux, gfp_t gfp_mask)
{
- int ret = 0;
- struct ulist_node *node = NULL;
+ int ret;
+ struct ulist_node *node;
+
node = ulist_rbtree_search(ulist, val);
if (node) {
if (old_aux)
*old_aux = node->aux;
return 0;
}
+ node = kmalloc(sizeof(*node), gfp_mask);
+ if (!node)
+ return -ENOMEM;
- if (ulist->nnodes >= ulist->nodes_alloced) {
- u64 new_alloced = ulist->nodes_alloced + 128;
- struct ulist_node *new_nodes;
- void *old = NULL;
- int i;
-
- for (i = 0; i < ulist->nnodes; i++)
- rb_erase(&ulist->nodes[i].rb_node, &ulist->root);
-
- /*
- * if nodes_alloced == ULIST_SIZE no memory has been allocated
- * yet, so pass NULL to krealloc
- */
- if (ulist->nodes_alloced > ULIST_SIZE)
- old = ulist->nodes;
+ node->val = val;
+ node->aux = aux;
+#ifdef CONFIG_BTRFS_DEBUG
+ node->seqnum = ulist->nnodes;
+#endif
- new_nodes = krealloc(old, sizeof(*new_nodes) * new_alloced,
- gfp_mask);
- if (!new_nodes)
- return -ENOMEM;
-
- if (!old)
- memcpy(new_nodes, ulist->int_nodes,
- sizeof(ulist->int_nodes));
-
- ulist->nodes = new_nodes;
- ulist->nodes_alloced = new_alloced;
-
- /*
- * krealloc actually uses memcpy, which does not copy rb_node
- * pointers, so we have to do it ourselves. Otherwise we may
- * be bitten by crashes.
- */
- for (i = 0; i < ulist->nnodes; i++) {
- ret = ulist_rbtree_insert(ulist, &ulist->nodes[i]);
- if (ret < 0)
- return ret;
- }
- }
- ulist->nodes[ulist->nnodes].val = val;
- ulist->nodes[ulist->nnodes].aux = aux;
- ret = ulist_rbtree_insert(ulist, &ulist->nodes[ulist->nnodes]);
- BUG_ON(ret);
- ++ulist->nnodes;
+ ret = ulist_rbtree_insert(ulist, node);
+ ASSERT(!ret);
+ list_add_tail(&node->list, &ulist->nodes);
+ ulist->nnodes++;
return 1;
}
-EXPORT_SYMBOL(ulist_add);
/**
* ulist_next - iterate ulist
@@ -268,11 +227,25 @@ EXPORT_SYMBOL(ulist_add);
*/
struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
{
- if (ulist->nnodes == 0)
+ struct ulist_node *node;
+
+ if (list_empty(&ulist->nodes))
return NULL;
- if (uiter->i < 0 || uiter->i >= ulist->nnodes)
+ if (uiter->cur_list && uiter->cur_list->next == &ulist->nodes)
return NULL;
-
- return &ulist->nodes[uiter->i++];
+ if (uiter->cur_list) {
+ uiter->cur_list = uiter->cur_list->next;
+ } else {
+ uiter->cur_list = ulist->nodes.next;
+#ifdef CONFIG_BTRFS_DEBUG
+ uiter->i = 0;
+#endif
+ }
+ node = list_entry(uiter->cur_list, struct ulist_node, list);
+#ifdef CONFIG_BTRFS_DEBUG
+ ASSERT(node->seqnum == uiter->i);
+ ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes);
+ uiter->i++;
+#endif
+ return node;
}
-EXPORT_SYMBOL(ulist_next);
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index fb36731074b..7f78cbf5cf4 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -17,18 +17,12 @@
* enumerating it.
* It is possible to store an auxiliary value along with the key.
*
- * The implementation is preliminary and can probably be sped up
- * significantly. A first step would be to store the values in an rbtree
- * as soon as ULIST_SIZE is exceeded.
*/
-
-/*
- * number of elements statically allocated inside struct ulist
- */
-#define ULIST_SIZE 16
-
struct ulist_iterator {
+#ifdef CONFIG_BTRFS_DEBUG
int i;
+#endif
+ struct list_head *cur_list; /* hint to start search */
};
/*
@@ -37,6 +31,12 @@ struct ulist_iterator {
struct ulist_node {
u64 val; /* value to store */
u64 aux; /* auxiliary value saved along with the val */
+
+#ifdef CONFIG_BTRFS_DEBUG
+ int seqnum; /* sequence number this node is added */
+#endif
+
+ struct list_head list; /* used to link node */
struct rb_node rb_node; /* used to speed up search */
};
@@ -46,28 +46,11 @@ struct ulist {
*/
unsigned long nnodes;
- /*
- * number of nodes we already have room for
- */
- unsigned long nodes_alloced;
-
- /*
- * pointer to the array storing the elements. The first ULIST_SIZE
- * elements are stored inline. In this case the it points to int_nodes.
- * After exceeding ULIST_SIZE, dynamic memory is allocated.
- */
- struct ulist_node *nodes;
-
+ struct list_head nodes;
struct rb_root root;
-
- /*
- * inline storage space for the first ULIST_SIZE entries
- */
- struct ulist_node int_nodes[ULIST_SIZE];
};
void ulist_init(struct ulist *ulist);
-void ulist_fini(struct ulist *ulist);
void ulist_reinit(struct ulist *ulist);
struct ulist *ulist_alloc(gfp_t gfp_mask);
void ulist_free(struct ulist *ulist);
@@ -77,6 +60,6 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
struct ulist_node *ulist_next(struct ulist *ulist,
struct ulist_iterator *uiter);
-#define ULIST_ITER_INIT(uiter) ((uiter)->i = 0)
+#define ULIST_ITER_INIT(uiter) ((uiter)->cur_list = NULL)
#endif
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index dd0dea3766f..f6a4c03ee7d 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -69,7 +69,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid,
ret = -ENOENT;
if (!IS_ALIGNED(item_size, sizeof(u64))) {
- pr_warn("btrfs: uuid item with illegal size %lu!\n",
+ btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!",
(unsigned long)item_size);
goto out;
}
@@ -137,7 +137,8 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
offset = btrfs_item_ptr_offset(eb, slot);
offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le);
} else if (ret < 0) {
- pr_warn("btrfs: insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!\n",
+ btrfs_warn(uuid_root->fs_info, "insert uuid item failed %d "
+ "(0x%016llx, 0x%016llx) type %u!",
ret, (unsigned long long)key.objectid,
(unsigned long long)key.offset, type);
goto out;
@@ -183,7 +184,7 @@ int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1);
if (ret < 0) {
- pr_warn("btrfs: error %d while searching for uuid item!\n",
+ btrfs_warn(uuid_root->fs_info, "error %d while searching for uuid item!",
ret);
goto out;
}
@@ -197,7 +198,7 @@ int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
offset = btrfs_item_ptr_offset(eb, slot);
item_size = btrfs_item_size_nr(eb, slot);
if (!IS_ALIGNED(item_size, sizeof(u64))) {
- pr_warn("btrfs: uuid item with illegal size %lu!\n",
+ btrfs_warn(uuid_root->fs_info, "uuid item with illegal size %lu!",
(unsigned long)item_size);
ret = -ENOENT;
goto out;
@@ -260,7 +261,6 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
{
struct btrfs_root *root = fs_info->uuid_root;
struct btrfs_key key;
- struct btrfs_key max_key;
struct btrfs_path *path;
int ret = 0;
struct extent_buffer *leaf;
@@ -277,13 +277,10 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
key.objectid = 0;
key.type = 0;
key.offset = 0;
- max_key.objectid = (u64)-1;
- max_key.type = (u8)-1;
- max_key.offset = (u64)-1;
again_search_slot:
path->keep_locks = 1;
- ret = btrfs_search_forward(root, &key, &max_key, path, 0);
+ ret = btrfs_search_forward(root, &key, path, 0);
if (ret) {
if (ret > 0)
ret = 0;
@@ -303,7 +300,7 @@ again_search_slot:
offset = btrfs_item_ptr_offset(leaf, slot);
item_size = btrfs_item_size_nr(leaf, slot);
if (!IS_ALIGNED(item_size, sizeof(u64))) {
- pr_warn("btrfs: uuid item with illegal size %lu!\n",
+ btrfs_warn(fs_info, "uuid item with illegal size %lu!",
(unsigned long)item_size);
goto skip;
}
@@ -353,6 +350,6 @@ skip:
out:
btrfs_free_path(path);
if (ret)
- pr_warn("btrfs: btrfs_uuid_tree_iterate failed %d\n", ret);
+ btrfs_warn(fs_info, "btrfs_uuid_tree_iterate failed %d", ret);
return 0;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 043b215769c..6cb82f62cb7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -28,7 +28,6 @@
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <asm/div64.h>
-#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
@@ -41,6 +40,7 @@
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
+#include "sysfs.h"
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -126,7 +126,7 @@ static void btrfs_kobject_uevent(struct block_device *bdev,
ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
if (ret)
- pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n",
+ pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
action,
kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
&disk_to_dev(bdev->bd_disk)->kobj);
@@ -201,7 +201,7 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
if (IS_ERR(*bdev)) {
ret = PTR_ERR(*bdev);
- printk(KERN_INFO "btrfs: open %s failed\n", device_path);
+ printk(KERN_INFO "BTRFS: open %s failed\n", device_path);
goto error;
}
@@ -416,7 +416,8 @@ loop_lock:
device->running_pending = 1;
spin_unlock(&device->io_lock);
- btrfs_requeue_work(&device->work);
+ btrfs_queue_work(fs_info->submit_workers,
+ &device->work);
goto done;
}
/* unplug every 64 requests just for good measure */
@@ -448,6 +449,14 @@ static void pending_bios_fn(struct btrfs_work *work)
run_scheduled_bios(device);
}
+/*
+ * Add new device to list of registered devices
+ *
+ * Returns:
+ * 1 - first time device is seen
+ * 0 - device already known
+ * < 0 - error
+ */
static noinline int device_list_add(const char *path,
struct btrfs_super_block *disk_super,
u64 devid, struct btrfs_fs_devices **fs_devices_ret)
@@ -455,6 +464,7 @@ static noinline int device_list_add(const char *path,
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices;
struct rcu_string *name;
+ int ret = 0;
u64 found_transid = btrfs_super_generation(disk_super);
fs_devices = find_fsid(disk_super->fsid);
@@ -495,6 +505,7 @@ static noinline int device_list_add(const char *path,
fs_devices->num_devices++;
mutex_unlock(&fs_devices->device_list_mutex);
+ ret = 1;
device->fs_devices = fs_devices;
} else if (!device->name || strcmp(device->name->str, path)) {
name = rcu_string_strdup(path, GFP_NOFS);
@@ -513,7 +524,8 @@ static noinline int device_list_add(const char *path,
fs_devices->latest_trans = found_transid;
}
*fs_devices_ret = fs_devices;
- return 0;
+
+ return ret;
}
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
@@ -543,12 +555,14 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
* This is ok to do without rcu read locked because we hold the
* uuid mutex so nothing we touch in here is going to disappear.
*/
- name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
- if (!name) {
- kfree(device);
- goto error;
+ if (orig_dev->name) {
+ name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
+ if (!name) {
+ kfree(device);
+ goto error;
+ }
+ rcu_assign_pointer(device->name, name);
}
- rcu_assign_pointer(device->name, name);
list_add(&device->dev_list, &fs_devices->devices);
device->fs_devices = fs_devices;
@@ -666,7 +680,8 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
if (device->bdev)
fs_devices->open_devices--;
- if (device->writeable && !device->is_tgtdev_for_dev_replace) {
+ if (device->writeable &&
+ device->devid != BTRFS_DEV_REPLACE_DEVID) {
list_del_init(&device->dev_alloc_list);
fs_devices->rw_devices--;
}
@@ -909,17 +924,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
transid = btrfs_super_generation(disk_super);
total_devices = btrfs_super_num_devices(disk_super);
- if (disk_super->label[0]) {
- if (disk_super->label[BTRFS_LABEL_SIZE - 1])
- disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
- printk(KERN_INFO "btrfs: device label %s ", disk_super->label);
- } else {
- printk(KERN_INFO "btrfs: device fsid %pU ", disk_super->fsid);
- }
-
- printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
-
ret = device_list_add(path, disk_super, devid, fs_devices_ret);
+ if (ret > 0) {
+ if (disk_super->label[0]) {
+ if (disk_super->label[BTRFS_LABEL_SIZE - 1])
+ disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
+ printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
+ } else {
+ printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
+ }
+
+ printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
+ ret = 0;
+ }
if (!ret && fs_devices_ret)
(*fs_devices_ret)->total_devices = total_devices;
@@ -1438,6 +1455,22 @@ out:
return ret;
}
+/*
+ * Function to update ctime/mtime for a given device path.
+ * Mainly used for ctime/mtime based probe like libblkid.
+ */
+static void update_dev_time(char *path_name)
+{
+ struct file *filp;
+
+ filp = filp_open(path_name, O_RDWR, 0);
+ if (!filp)
+ return;
+ file_update_time(filp);
+ filp_close(filp, NULL);
+ return;
+}
+
static int btrfs_rm_dev_item(struct btrfs_root *root,
struct btrfs_device *device)
{
@@ -1647,8 +1680,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (device->bdev == root->fs_info->fs_devices->latest_bdev)
root->fs_info->fs_devices->latest_bdev = next_device->bdev;
- if (device->bdev)
+ if (device->bdev) {
device->fs_devices->open_devices--;
+ /* remove sysfs entry */
+ btrfs_kobj_rm_device(root->fs_info, device);
+ }
call_rcu(&device->rcu, free_device);
@@ -1660,11 +1696,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
struct btrfs_fs_devices *fs_devices;
fs_devices = root->fs_info->fs_devices;
while (fs_devices) {
- if (fs_devices->seed == cur_devices)
+ if (fs_devices->seed == cur_devices) {
+ fs_devices->seed = cur_devices->seed;
break;
+ }
fs_devices = fs_devices->seed;
}
- fs_devices->seed = cur_devices->seed;
cur_devices->seed = NULL;
lock_chunks(root);
__btrfs_close_devices(cur_devices);
@@ -1680,20 +1717,55 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
* remove it from the devices list and zero out the old super
*/
if (clear_super && disk_super) {
+ u64 bytenr;
+ int i;
+
/* make sure this device isn't detected as part of
* the FS anymore
*/
memset(&disk_super->magic, 0, sizeof(disk_super->magic));
set_buffer_dirty(bh);
sync_dirty_buffer(bh);
+
+ /* clear the mirror copies of super block on the disk
+ * being removed, 0th copy is been taken care above and
+ * the below would take of the rest
+ */
+ for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
+ bytenr = btrfs_sb_offset(i);
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+ i_size_read(bdev->bd_inode))
+ break;
+
+ brelse(bh);
+ bh = __bread(bdev, bytenr / 4096,
+ BTRFS_SUPER_INFO_SIZE);
+ if (!bh)
+ continue;
+
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+
+ if (btrfs_super_bytenr(disk_super) != bytenr ||
+ btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
+ continue;
+ }
+ memset(&disk_super->magic, 0,
+ sizeof(disk_super->magic));
+ set_buffer_dirty(bh);
+ sync_dirty_buffer(bh);
+ }
}
ret = 0;
- /* Notify udev that device has changed */
- if (bdev)
+ if (bdev) {
+ /* Notify udev that device has changed */
btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+ /* Update ctime/mtime for device path for libblkid */
+ update_dev_time(device_path);
+ }
+
error_brelse:
brelse(bh);
if (bdev)
@@ -1813,7 +1885,7 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
}
if (!*device) {
- pr_err("btrfs: no missing device found\n");
+ btrfs_err(root->fs_info, "no missing device found");
return -ENOENT;
}
@@ -1869,7 +1941,6 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
fs_devices->seeding = 0;
fs_devices->num_devices = 0;
fs_devices->open_devices = 0;
- fs_devices->total_devices = 0;
fs_devices->seed = seed_devices;
generate_random_uuid(fs_devices->fsid);
@@ -2041,6 +2112,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
device->in_fs_metadata = 1;
device->is_tgtdev_for_dev_replace = 0;
device->mode = FMODE_EXCL;
+ device->dev_stats_valid = 1;
set_blocksize(device->bdev, 4096);
if (seeding_dev) {
@@ -2077,9 +2149,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
btrfs_set_super_num_devices(root->fs_info->super_copy,
total_bytes + 1);
+
+ /* add sysfs device entry */
+ btrfs_kobj_add_device(root->fs_info, device);
+
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (seeding_dev) {
+ char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
ret = init_first_rw_device(trans, root, device);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
@@ -2090,6 +2167,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
btrfs_abort_transaction(trans, root, ret);
goto error_trans;
}
+
+ /* Sprouting would change fsid of the mounted root,
+ * so rename the fsid on the sysfs
+ */
+ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
+ root->fs_info->fsid);
+ if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
+ goto error_trans;
} else {
ret = btrfs_add_device(trans, root, device);
if (ret) {
@@ -2131,12 +2216,15 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
ret = btrfs_commit_transaction(trans, root);
}
+ /* Update ctime/mtime for libblkid */
+ update_dev_time(device_path);
return ret;
error_trans:
unlock_chunks(root);
btrfs_end_transaction(trans, root);
rcu_string_free(device->name);
+ btrfs_kobj_rm_device(root->fs_info, device);
kfree(device);
error:
blkdev_put(bdev, FMODE_EXCL);
@@ -2208,6 +2296,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
device->in_fs_metadata = 1;
device->is_tgtdev_for_dev_replace = 1;
device->mode = FMODE_EXCL;
+ device->dev_stats_valid = 1;
set_blocksize(device->bdev, 4096);
device->fs_devices = fs_info->fs_devices;
list_add(&device->dev_list, &fs_info->fs_devices->devices);
@@ -2474,9 +2563,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
remove_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
- kfree(map);
- em->bdev = NULL;
-
/* once for the tree */
free_extent_map(em);
/* once for us */
@@ -2550,8 +2636,7 @@ again:
failed = 0;
retried = true;
goto again;
- } else if (failed && retried) {
- WARN_ON(1);
+ } else if (WARN_ON(failed && retried)) {
ret = -ENOSPC;
}
error:
@@ -2907,6 +2992,16 @@ static int should_balance_chunk(struct btrfs_root *root,
return 0;
}
+ /*
+ * limited by count, must be the last filter
+ */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
+ if (bargs->limit == 0)
+ return 0;
+ else
+ bargs->limit--;
+ }
+
return 1;
}
@@ -2929,6 +3024,9 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
int ret;
int enospc_errors = 0;
bool counting = true;
+ u64 limit_data = bctl->data.limit;
+ u64 limit_meta = bctl->meta.limit;
+ u64 limit_sys = bctl->sys.limit;
/* step one make some room on all the devices */
devices = &fs_info->fs_devices->devices;
@@ -2967,6 +3065,11 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
memset(&bctl->stat, 0, sizeof(bctl->stat));
spin_unlock(&fs_info->balance_lock);
again:
+ if (!counting) {
+ bctl->data.limit = limit_data;
+ bctl->meta.limit = limit_meta;
+ bctl->sys.limit = limit_sys;
+ }
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
@@ -3051,7 +3154,7 @@ loop:
error:
btrfs_free_path(path);
if (enospc_errors) {
- printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
+ btrfs_info(fs_info, "%d enospc errors during balance",
enospc_errors);
if (!ret)
ret = -ENOSPC;
@@ -3137,8 +3240,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
!(bctl->flags & BTRFS_BALANCE_METADATA) ||
memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
- printk(KERN_ERR "btrfs: with mixed groups data and "
- "metadata balance options must be the same\n");
+ btrfs_err(fs_info, "with mixed groups data and "
+ "metadata balance options must be the same");
ret = -EINVAL;
goto out;
}
@@ -3164,8 +3267,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(!alloc_profile_is_valid(bctl->data.target, 1) ||
(bctl->data.target & ~allowed))) {
- printk(KERN_ERR "btrfs: unable to start balance with target "
- "data profile %llu\n",
+ btrfs_err(fs_info, "unable to start balance with target "
+ "data profile %llu",
bctl->data.target);
ret = -EINVAL;
goto out;
@@ -3173,8 +3276,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(!alloc_profile_is_valid(bctl->meta.target, 1) ||
(bctl->meta.target & ~allowed))) {
- printk(KERN_ERR "btrfs: unable to start balance with target "
- "metadata profile %llu\n",
+ btrfs_err(fs_info,
+ "unable to start balance with target metadata profile %llu",
bctl->meta.target);
ret = -EINVAL;
goto out;
@@ -3182,8 +3285,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(!alloc_profile_is_valid(bctl->sys.target, 1) ||
(bctl->sys.target & ~allowed))) {
- printk(KERN_ERR "btrfs: unable to start balance with target "
- "system profile %llu\n",
+ btrfs_err(fs_info,
+ "unable to start balance with target system profile %llu",
bctl->sys.target);
ret = -EINVAL;
goto out;
@@ -3192,7 +3295,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
/* allow dup'ed data chunks only in mixed mode */
if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
(bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
- printk(KERN_ERR "btrfs: dup for data is not allowed\n");
+ btrfs_err(fs_info, "dup for data is not allowed");
ret = -EINVAL;
goto out;
}
@@ -3212,11 +3315,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
(fs_info->avail_metadata_alloc_bits & allowed) &&
!(bctl->meta.target & allowed))) {
if (bctl->flags & BTRFS_BALANCE_FORCE) {
- printk(KERN_INFO "btrfs: force reducing metadata "
- "integrity\n");
+ btrfs_info(fs_info, "force reducing metadata integrity");
} else {
- printk(KERN_ERR "btrfs: balance will reduce metadata "
- "integrity, use force if you want this\n");
+ btrfs_err(fs_info, "balance will reduce metadata "
+ "integrity, use force if you want this");
ret = -EINVAL;
goto out;
}
@@ -3302,7 +3404,7 @@ static int balance_kthread(void *data)
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl) {
- printk(KERN_INFO "btrfs: continuing balance\n");
+ btrfs_info(fs_info, "continuing balance");
ret = btrfs_balance(fs_info->balance_ctl, NULL);
}
@@ -3324,7 +3426,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->balance_lock);
if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
- printk(KERN_INFO "btrfs: force skipping balance\n");
+ btrfs_info(fs_info, "force skipping balance");
return 0;
}
@@ -3423,6 +3525,9 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
{
+ if (fs_info->sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
mutex_lock(&fs_info->balance_mutex);
if (!fs_info->balance_ctl) {
mutex_unlock(&fs_info->balance_mutex);
@@ -3488,7 +3593,7 @@ static int btrfs_uuid_scan_kthread(void *data)
path->keep_locks = 1;
while (1) {
- ret = btrfs_search_forward(root, &key, &max_key, path, 0);
+ ret = btrfs_search_forward(root, &key, path, 0);
if (ret) {
if (ret > 0)
ret = 0;
@@ -3539,7 +3644,7 @@ update_tree:
BTRFS_UUID_KEY_SUBVOL,
key.objectid);
if (ret < 0) {
- pr_warn("btrfs: uuid_tree_add failed %d\n",
+ btrfs_warn(fs_info, "uuid_tree_add failed %d",
ret);
break;
}
@@ -3551,7 +3656,7 @@ update_tree:
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
key.objectid);
if (ret < 0) {
- pr_warn("btrfs: uuid_tree_add failed %d\n",
+ btrfs_warn(fs_info, "uuid_tree_add failed %d",
ret);
break;
}
@@ -3586,7 +3691,7 @@ out:
if (trans && !IS_ERR(trans))
btrfs_end_transaction(trans, fs_info->uuid_root);
if (ret)
- pr_warn("btrfs: btrfs_uuid_scan_kthread failed %d\n", ret);
+ btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
else
fs_info->update_uuid_tree_gen = 1;
up(&fs_info->uuid_tree_rescan_sem);
@@ -3650,7 +3755,7 @@ static int btrfs_uuid_rescan_kthread(void *data)
*/
ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
if (ret < 0) {
- pr_warn("btrfs: iterating uuid_tree failed %d\n", ret);
+ btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
up(&fs_info->uuid_tree_rescan_sem);
return ret;
}
@@ -3691,7 +3796,7 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
if (IS_ERR(task)) {
/* fs_info->update_uuid_tree_gen remains 0 in all error case */
- pr_warn("btrfs: failed to start uuid_scan task\n");
+ btrfs_warn(fs_info, "failed to start uuid_scan task");
up(&fs_info->uuid_tree_rescan_sem);
return PTR_ERR(task);
}
@@ -3707,7 +3812,7 @@ int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
if (IS_ERR(task)) {
/* fs_info->update_uuid_tree_gen remains 0 in all error case */
- pr_warn("btrfs: failed to start uuid_rescan task\n");
+ btrfs_warn(fs_info, "failed to start uuid_rescan task");
up(&fs_info->uuid_tree_rescan_sem);
return PTR_ERR(task);
}
@@ -3864,7 +3969,8 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,
u8 *ptr;
array_size = btrfs_super_sys_array_size(super_copy);
- if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
+ if (array_size + item_size + sizeof(disk_key)
+ > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
return -EFBIG;
ptr = super_copy->sys_chunk_array + array_size;
@@ -3969,6 +4075,16 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
btrfs_set_fs_incompat(info, RAID56);
}
+#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \
+ - sizeof(struct btrfs_item) \
+ - sizeof(struct btrfs_chunk)) \
+ / sizeof(struct btrfs_stripe) + 1)
+
+#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE \
+ - 2 * sizeof(struct btrfs_disk_key) \
+ - 2 * sizeof(struct btrfs_chunk)) \
+ / sizeof(struct btrfs_stripe) + 1)
+
static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 start,
u64 type)
@@ -4018,6 +4134,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (type & BTRFS_BLOCK_GROUP_DATA) {
max_stripe_size = 1024 * 1024 * 1024;
max_chunk_size = 10 * max_stripe_size;
+ if (!devs_max)
+ devs_max = BTRFS_MAX_DEVS(info->chunk_root);
} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
/* for larger filesystems, use larger metadata chunks */
if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
@@ -4025,11 +4143,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
else
max_stripe_size = 256 * 1024 * 1024;
max_chunk_size = max_stripe_size;
+ if (!devs_max)
+ devs_max = BTRFS_MAX_DEVS(info->chunk_root);
} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
max_stripe_size = 32 * 1024 * 1024;
max_chunk_size = 2 * max_stripe_size;
+ if (!devs_max)
+ devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
} else {
- printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
+ btrfs_err(info, "invalid chunk type 0x%llx requested",
type);
BUG_ON(1);
}
@@ -4061,7 +4183,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
if (!device->writeable) {
WARN(1, KERN_ERR
- "btrfs: read-only device in alloc_list\n");
+ "BTRFS: read-only device in alloc_list\n");
continue;
}
@@ -4196,9 +4318,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
em = alloc_extent_map();
if (!em) {
+ kfree(map);
ret = -ENOMEM;
goto error;
}
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->bdev = (struct block_device *)map;
em->start = start;
em->len = num_bytes;
@@ -4241,7 +4365,6 @@ error_del_extent:
/* One for the tree reference */
free_extent_map(em);
error:
- kfree(map);
kfree(devices_info);
return ret;
}
@@ -4277,7 +4400,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
if (em->start != chunk_offset || em->len != chunk_size) {
btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
- " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset,
+ " %Lu-%Lu, found %Lu-%Lu", chunk_offset,
chunk_size, em->start, em->len);
free_extent_map(em);
return -EINVAL;
@@ -4453,7 +4576,6 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
write_unlock(&tree->map_tree.lock);
if (!em)
break;
- kfree(em->bdev);
/* once for us */
free_extent_map(em);
/* once for the tree */
@@ -4479,15 +4601,16 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
* and exit, so return 1 so the callers don't try to use other copies.
*/
if (!em) {
- btrfs_crit(fs_info, "No mapping for %Lu-%Lu\n", logical,
+ btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical,
logical+len);
return 1;
}
if (em->start > logical || em->start + em->len < logical) {
btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got "
- "%Lu-%Lu\n", logical, logical+len, em->start,
+ "%Lu-%Lu", logical, logical+len, em->start,
em->start + em->len);
+ free_extent_map(em);
return 1;
}
@@ -4666,8 +4789,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
if (em->start > logical || em->start + em->len < logical) {
btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
- "found %Lu-%Lu\n", logical, em->start,
+ "found %Lu-%Lu", logical, em->start,
em->start + em->len);
+ free_extent_map(em);
return -EINVAL;
}
@@ -4895,7 +5019,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
num_stripes = map->num_stripes;
max_errors = nr_parity_stripes(map);
- raid_map = kmalloc(sizeof(u64) * num_stripes,
+ raid_map = kmalloc_array(num_stripes, sizeof(u64),
GFP_NOFS);
if (!raid_map) {
ret = -ENOMEM;
@@ -5187,13 +5311,13 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
read_unlock(&em_tree->lock);
if (!em) {
- printk(KERN_ERR "btrfs: couldn't find em for chunk %Lu\n",
+ printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n",
chunk_start);
return -EIO;
}
if (em->start != chunk_start) {
- printk(KERN_ERR "btrfs: bad chunk start, em=%Lu, wanted=%Lu\n",
+ printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n",
em->start, chunk_start);
free_extent_map(em);
return -EIO;
@@ -5255,9 +5379,19 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
return 0;
}
+static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
+{
+ if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED))
+ bio_endio_nodec(bio, err);
+ else
+ bio_endio(bio, err);
+ kfree(bbio);
+}
+
static void btrfs_end_bio(struct bio *bio, int err)
{
struct btrfs_bio *bbio = bio->bi_private;
+ struct btrfs_device *dev = bbio->stripes[0].dev;
int is_orig_bio = 0;
if (err) {
@@ -5265,7 +5399,6 @@ static void btrfs_end_bio(struct bio *bio, int err)
if (err == -EIO || err == -EREMOTEIO) {
unsigned int stripe_index =
btrfs_io_bio(bio)->stripe_index;
- struct btrfs_device *dev;
BUG_ON(stripe_index >= bbio->num_stripes);
dev = bbio->stripes[stripe_index].dev;
@@ -5287,11 +5420,14 @@ static void btrfs_end_bio(struct bio *bio, int err)
if (bio == bbio->orig_bio)
is_orig_bio = 1;
+ btrfs_bio_counter_dec(bbio->fs_info);
+
if (atomic_dec_and_test(&bbio->stripes_pending)) {
if (!is_orig_bio) {
bio_put(bio);
bio = bbio->orig_bio;
}
+
bio->bi_private = bbio->private;
bio->bi_end_io = bbio->end_io;
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
@@ -5308,21 +5444,13 @@ static void btrfs_end_bio(struct bio *bio, int err)
set_bit(BIO_UPTODATE, &bio->bi_flags);
err = 0;
}
- kfree(bbio);
- bio_endio(bio, err);
+ btrfs_end_bbio(bbio, bio, err);
} else if (!is_orig_bio) {
bio_put(bio);
}
}
-struct async_sched {
- struct bio *bio;
- int rw;
- struct btrfs_fs_info *info;
- struct btrfs_work work;
-};
-
/*
* see run_scheduled_bios for a description of why bios are collected for
* async submit.
@@ -5379,8 +5507,8 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
spin_unlock(&device->io_lock);
if (should_queue)
- btrfs_queue_worker(&root->fs_info->submit_workers,
- &device->work);
+ btrfs_queue_work(root->fs_info->submit_workers,
+ &device->work);
}
static int bio_size_ok(struct block_device *bdev, struct bio *bio,
@@ -5388,17 +5516,15 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio,
{
struct bio_vec *prev;
struct request_queue *q = bdev_get_queue(bdev);
- unsigned short max_sectors = queue_max_sectors(q);
+ unsigned int max_sectors = queue_max_sectors(q);
struct bvec_merge_data bvm = {
.bi_bdev = bdev,
.bi_sector = sector,
.bi_rw = bio->bi_rw,
};
- if (bio->bi_vcnt == 0) {
- WARN_ON(1);
+ if (WARN_ON(bio->bi_vcnt == 0))
return 1;
- }
prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (bio_sectors(bio) > max_sectors)
@@ -5407,7 +5533,7 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio,
if (!q->merge_bvec_fn)
return 1;
- bvm.bi_size = bio->bi_size - prev->bv_len;
+ bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len;
if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
return 0;
return 1;
@@ -5422,7 +5548,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
bio->bi_private = bbio;
btrfs_io_bio(bio)->stripe_index = dev_nr;
bio->bi_end_io = btrfs_end_bio;
- bio->bi_sector = physical >> 9;
+ bio->bi_iter.bi_sector = physical >> 9;
#ifdef DEBUG
{
struct rcu_string *name;
@@ -5437,6 +5563,9 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
}
#endif
bio->bi_bdev = dev->bdev;
+
+ btrfs_bio_counter_inc_noblocked(root->fs_info);
+
if (async)
btrfs_schedule_bio(root, dev, rw, bio);
else
@@ -5460,7 +5589,7 @@ again:
while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
bvec->bv_offset) < bvec->bv_len) {
- u64 len = bio->bi_size;
+ u64 len = bio->bi_iter.bi_size;
atomic_inc(&bbio->stripes_pending);
submit_stripe_bio(root, bbio, bio, physical, dev_nr,
@@ -5479,12 +5608,15 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
{
atomic_inc(&bbio->error);
if (atomic_dec_and_test(&bbio->stripes_pending)) {
+ /* Shoud be the original bio. */
+ WARN_ON(bio != bbio->orig_bio);
+
bio->bi_private = bbio->private;
bio->bi_end_io = bbio->end_io;
btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
- bio->bi_sector = logical >> 9;
- kfree(bbio);
- bio_endio(bio, -EIO);
+ bio->bi_iter.bi_sector = logical >> 9;
+
+ btrfs_end_bbio(bbio, bio, -EIO);
}
}
@@ -5493,7 +5625,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
{
struct btrfs_device *dev;
struct bio *first_bio = bio;
- u64 logical = (u64)bio->bi_sector << 9;
+ u64 logical = (u64)bio->bi_iter.bi_sector << 9;
u64 length = 0;
u64 map_length;
u64 *raid_map = NULL;
@@ -5502,31 +5634,41 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
int total_devs = 1;
struct btrfs_bio *bbio = NULL;
- length = bio->bi_size;
+ length = bio->bi_iter.bi_size;
map_length = length;
+ btrfs_bio_counter_inc_blocked(root->fs_info);
ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
mirror_num, &raid_map);
- if (ret) /* -ENOMEM */
+ if (ret) {
+ btrfs_bio_counter_dec(root->fs_info);
return ret;
+ }
total_devs = bbio->num_stripes;
bbio->orig_bio = first_bio;
bbio->private = first_bio->bi_private;
bbio->end_io = first_bio->bi_end_io;
+ bbio->fs_info = root->fs_info;
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
if (raid_map) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
if (rw & WRITE) {
- return raid56_parity_write(root, bio, bbio,
- raid_map, map_length);
+ ret = raid56_parity_write(root, bio, bbio,
+ raid_map, map_length);
} else {
- return raid56_parity_recover(root, bio, bbio,
- raid_map, map_length,
- mirror_num);
+ ret = raid56_parity_recover(root, bio, bbio,
+ raid_map, map_length,
+ mirror_num);
}
+ /*
+ * FIXME, replace dosen't support raid56 yet, please fix
+ * it in the future.
+ */
+ btrfs_bio_counter_dec(root->fs_info);
+ return ret;
}
if (map_length < length) {
@@ -5561,6 +5703,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
BUG_ON(!bio); /* -ENOMEM */
} else {
bio = first_bio;
+ bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
}
submit_stripe_bio(root, bbio, bio,
@@ -5568,6 +5711,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
async_submit);
dev_nr++;
}
+ btrfs_bio_counter_dec(root->fs_info);
return 0;
}
@@ -5631,10 +5775,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
struct btrfs_device *dev;
u64 tmp;
- if (!devid && !fs_info) {
- WARN_ON(1);
+ if (WARN_ON(!devid && !fs_info))
return ERR_PTR(-EINVAL);
- }
dev = __alloc_device();
if (IS_ERR(dev))
@@ -5658,7 +5800,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
else
generate_random_uuid(dev->uuid);
- dev->work.func = pending_bios_fn;
+ btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL);
return dev;
}
@@ -5703,6 +5845,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
return -ENOMEM;
}
+ set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
em->bdev = (struct block_device *)map;
em->start = logical;
em->len = length;
@@ -5727,7 +5870,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
uuid, NULL);
if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
- kfree(map);
free_extent_map(em);
return -EIO;
}
@@ -5735,7 +5877,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
map->stripes[i].dev =
add_missing_dev(root, devid, uuid);
if (!map->stripes[i].dev) {
- kfree(map);
free_extent_map(em);
return -EIO;
}
@@ -6027,10 +6168,14 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
- mutex_lock(&fs_devices->device_list_mutex);
- list_for_each_entry(device, &fs_devices->devices, dev_list)
- device->dev_root = fs_info->dev_root;
- mutex_unlock(&fs_devices->device_list_mutex);
+ while (fs_devices) {
+ mutex_lock(&fs_devices->device_list_mutex);
+ list_for_each_entry(device, &fs_devices->devices, dev_list)
+ device->dev_root = fs_info->dev_root;
+ mutex_unlock(&fs_devices->device_list_mutex);
+
+ fs_devices = fs_devices->seed;
+ }
}
static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
@@ -6121,7 +6266,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
BUG_ON(!path);
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
- printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n",
+ printk_in_rcu(KERN_WARNING "BTRFS: "
+ "error %d while searching for dev_stats item for device %s!\n",
ret, rcu_str_deref(device->name));
goto out;
}
@@ -6131,7 +6277,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
/* need to delete old one and insert a new one */
ret = btrfs_del_item(trans, dev_root, path);
if (ret != 0) {
- printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n",
+ printk_in_rcu(KERN_WARNING "BTRFS: "
+ "delete too small dev_stats item for device %s failed %d!\n",
rcu_str_deref(device->name), ret);
goto out;
}
@@ -6144,7 +6291,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, dev_root, path,
&key, sizeof(*ptr));
if (ret < 0) {
- printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n",
+ printk_in_rcu(KERN_WARNING "BTRFS: "
+ "insert dev_stats item for device %s failed %d!\n",
rcu_str_deref(device->name), ret);
goto out;
}
@@ -6197,16 +6345,14 @@ static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
if (!dev->dev_stats_valid)
return;
- printk_ratelimited_in_rcu(KERN_ERR
- "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+ printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
+ "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
rcu_str_deref(dev->name),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
- btrfs_dev_stat_read(dev,
- BTRFS_DEV_STAT_CORRUPTION_ERRS),
- btrfs_dev_stat_read(dev,
- BTRFS_DEV_STAT_GENERATION_ERRS));
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
+ btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}
static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
@@ -6219,7 +6365,8 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
if (i == BTRFS_DEV_STAT_VALUES_MAX)
return; /* all values == 0, suppress message */
- printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+ printk_in_rcu(KERN_INFO "BTRFS: "
+ "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
rcu_str_deref(dev->name),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
@@ -6240,12 +6387,10 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
mutex_unlock(&fs_devices->device_list_mutex);
if (!dev) {
- printk(KERN_WARNING
- "btrfs: get dev_stats failed, device not found\n");
+ btrfs_warn(root->fs_info, "get dev_stats failed, device not found");
return -ENODEV;
} else if (!dev->dev_stats_valid) {
- printk(KERN_WARNING
- "btrfs: get dev_stats failed, not yet valid\n");
+ btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid");
return -ENODEV;
} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index b72f540c8b2..2aaa00c4781 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -43,9 +43,8 @@ struct btrfs_device {
/* WRITE_SYNC bios */
struct btrfs_pending_bios pending_sync_bios;
- int running_pending;
u64 generation;
-
+ int running_pending;
int writeable;
int in_fs_metadata;
int missing;
@@ -53,11 +52,11 @@ struct btrfs_device {
int is_tgtdev_for_dev_replace;
spinlock_t io_lock;
+ /* the mode sent to blkdev_get */
+ fmode_t mode;
struct block_device *bdev;
- /* the mode sent to blkdev_get */
- fmode_t mode;
struct rcu_string *name;
@@ -78,16 +77,21 @@ struct btrfs_device {
/* optimal io width for this device */
u32 io_width;
+ /* type and info about this device */
+ u64 type;
/* minimal io size for this device */
u32 sector_size;
- /* type and info about this device */
- u64 type;
/* physical drive uuid (or lvm uuid) */
u8 uuid[BTRFS_UUID_SIZE];
+ /* for sending down flush barriers */
+ int nobarriers;
+ struct bio *flush_bio;
+ struct completion flush_wait;
+
/* per-device scrub information */
struct scrub_ctx *scrub_device;
@@ -103,10 +107,6 @@ struct btrfs_device {
struct radix_tree_root reada_zones;
struct radix_tree_root reada_extents;
- /* for sending down flush barriers */
- struct bio *flush_bio;
- struct completion flush_wait;
- int nobarriers;
/* disk I/O failure stats. For detailed description refer to
* enum btrfs_dev_stat_values in ioctl.h */
@@ -132,7 +132,9 @@ struct btrfs_fs_devices {
/* all of the devices in the FS, protected by a mutex
* so we can safely walk it to write out the supers without
- * worrying about add/remove by the multi-device code
+ * worrying about add/remove by the multi-device code.
+ * Scrubbing super can kick off supers writing by holding
+ * this mutex lock.
*/
struct mutex device_list_mutex;
struct list_head devices;
@@ -188,10 +190,14 @@ struct btrfs_bio_stripe {
struct btrfs_bio;
typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
+#define BTRFS_BIO_ORIG_BIO_SUBMITTED 0x1
+
struct btrfs_bio {
atomic_t stripes_pending;
+ struct btrfs_fs_info *fs_info;
bio_end_io_t *end_io;
struct bio *orig_bio;
+ unsigned long flags;
void *private;
atomic_t error;
int max_errors;
@@ -252,6 +258,7 @@ struct map_lookup {
#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2)
#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3)
#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4)
+#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5)
/*
* Profile changing flags. When SOFT is set we won't relocate chunk if
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 05740b9789e..ad8328d797e 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -22,11 +22,13 @@
#include <linux/rwsem.h>
#include <linux/xattr.h>
#include <linux/security.h>
+#include <linux/posix_acl_xattr.h>
#include "ctree.h"
#include "btrfs_inode.h"
#include "transaction.h"
#include "xattr.h"
#include "disk-io.h"
+#include "props.h"
ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
@@ -313,8 +315,8 @@ err:
*/
const struct xattr_handler *btrfs_xattr_handlers[] = {
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
- &btrfs_xattr_acl_access_handler,
- &btrfs_xattr_acl_default_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
NULL,
};
@@ -331,7 +333,8 @@ static bool btrfs_is_valid_xattr(const char *name)
XATTR_SECURITY_PREFIX_LEN) ||
!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
- !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+ !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) ||
+ !strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN);
}
ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
@@ -373,6 +376,10 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
if (!btrfs_is_valid_xattr(name))
return -EOPNOTSUPP;
+ if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
+ return btrfs_set_prop(dentry->d_inode, name,
+ value, size, flags);
+
if (size == 0)
value = ""; /* empty EA, do not remove */
@@ -402,6 +409,10 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
if (!btrfs_is_valid_xattr(name))
return -EOPNOTSUPP;
+ if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
+ return btrfs_set_prop(dentry->d_inode, name,
+ NULL, 0, XATTR_REPLACE);
+
return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0,
XATTR_REPLACE);
}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index b3cc8039134..5049608d138 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -21,8 +21,6 @@
#include <linux/xattr.h>
-extern const struct xattr_handler btrfs_xattr_acl_access_handler;
-extern const struct xattr_handler btrfs_xattr_acl_default_handler;
extern const struct xattr_handler *btrfs_xattr_handlers[];
extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 9acb846c3e7..b67d8fc8127 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -97,8 +97,8 @@ static int zlib_compress_pages(struct list_head *ws,
*total_in = 0;
if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
- printk(KERN_WARNING "btrfs: deflateInit failed\n");
- ret = -1;
+ printk(KERN_WARNING "BTRFS: deflateInit failed\n");
+ ret = -EIO;
goto out;
}
@@ -110,7 +110,7 @@ static int zlib_compress_pages(struct list_head *ws,
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (out_page == NULL) {
- ret = -1;
+ ret = -ENOMEM;
goto out;
}
cpage_out = kmap(out_page);
@@ -125,10 +125,10 @@ static int zlib_compress_pages(struct list_head *ws,
while (workspace->def_strm.total_in < len) {
ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
if (ret != Z_OK) {
- printk(KERN_DEBUG "btrfs: deflate in loop returned %d\n",
+ printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
ret);
zlib_deflateEnd(&workspace->def_strm);
- ret = -1;
+ ret = -EIO;
goto out;
}
@@ -136,7 +136,7 @@ static int zlib_compress_pages(struct list_head *ws,
if (workspace->def_strm.total_in > 8192 &&
workspace->def_strm.total_in <
workspace->def_strm.total_out) {
- ret = -1;
+ ret = -E2BIG;
goto out;
}
/* we need another page for writing out. Test this
@@ -147,12 +147,12 @@ static int zlib_compress_pages(struct list_head *ws,
kunmap(out_page);
if (nr_pages == nr_dest_pages) {
out_page = NULL;
- ret = -1;
+ ret = -E2BIG;
goto out;
}
out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
if (out_page == NULL) {
- ret = -1;
+ ret = -ENOMEM;
goto out;
}
cpage_out = kmap(out_page);
@@ -188,12 +188,12 @@ static int zlib_compress_pages(struct list_head *ws,
zlib_deflateEnd(&workspace->def_strm);
if (ret != Z_STREAM_END) {
- ret = -1;
+ ret = -EIO;
goto out;
}
if (workspace->def_strm.total_out >= workspace->def_strm.total_in) {
- ret = -1;
+ ret = -E2BIG;
goto out;
}
@@ -252,8 +252,8 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
}
if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
- printk(KERN_WARNING "btrfs: inflateInit failed\n");
- return -1;
+ printk(KERN_WARNING "BTRFS: inflateInit failed\n");
+ return -EIO;
}
while (workspace->inf_strm.total_in < srclen) {
ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
@@ -295,7 +295,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
}
}
if (ret != Z_STREAM_END)
- ret = -1;
+ ret = -EIO;
else
ret = 0;
done:
@@ -336,8 +336,8 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
}
if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
- printk(KERN_WARNING "btrfs: inflateInit failed\n");
- return -1;
+ printk(KERN_WARNING "BTRFS: inflateInit failed\n");
+ return -EIO;
}
while (bytes_left > 0) {
@@ -354,7 +354,7 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
total_out = workspace->inf_strm.total_out;
if (total_out == buf_start) {
- ret = -1;
+ ret = -EIO;
break;
}
@@ -382,7 +382,7 @@ next:
}
if (ret != Z_STREAM_END && bytes_left != 0)
- ret = -1;
+ ret = -EIO;
else
ret = 0;
diff --git a/fs/buffer.c b/fs/buffer.c
index 4d7433534f5..eba6e4f621c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -77,7 +77,7 @@ EXPORT_SYMBOL(__lock_buffer);
void unlock_buffer(struct buffer_head *bh)
{
clear_bit_unlock(BH_Lock, &bh->b_state);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&bh->b_state, BH_Lock);
}
EXPORT_SYMBOL(unlock_buffer);
@@ -227,7 +227,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
int all_mapped = 1;
index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
- page = find_get_page(bd_mapping, index);
+ page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
if (!page)
goto out;
@@ -654,14 +654,16 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
static void __set_page_dirty(struct page *page,
struct address_space *mapping, int warn)
{
- spin_lock_irq(&mapping->tree_lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&mapping->tree_lock, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
- spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
@@ -1005,9 +1007,19 @@ grow_dev_page(struct block_device *bdev, sector_t block,
struct buffer_head *bh;
sector_t end_block;
int ret = 0; /* Will call free_more_memory() */
+ gfp_t gfp_mask;
+
+ gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
+ gfp_mask |= __GFP_MOVABLE;
+ /*
+ * XXX: __getblk_slow() can not really deal with failure and
+ * will endlessly loop on improvised global reclaim. Prefer
+ * looping in the allocator rather than here, at least that
+ * code knows what it's doing.
+ */
+ gfp_mask |= __GFP_NOFAIL;
- page = find_or_create_page(inode->i_mapping, index,
- (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
+ page = find_or_create_page(inode->i_mapping, index, gfp_mask);
if (!page)
return ret;
@@ -1302,7 +1314,7 @@ static void bh_lru_install(struct buffer_head *bh)
}
while (out < BH_LRU_SIZE)
bhs[out++] = NULL;
- memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
+ memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
}
bh_lru_unlock();
@@ -1354,12 +1366,13 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
if (bh == NULL) {
+ /* __find_get_block_slow will mark the page accessed */
bh = __find_get_block_slow(bdev, block);
if (bh)
bh_lru_install(bh);
- }
- if (bh)
+ } else
touch_buffer(bh);
+
return bh;
}
EXPORT_SYMBOL(__find_get_block);
@@ -1471,16 +1484,27 @@ EXPORT_SYMBOL(set_bh_page);
/*
* Called when truncating a buffer on a page completely.
*/
+
+/* Bits that are cleared during an invalidate */
+#define BUFFER_FLAGS_DISCARD \
+ (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
+ 1 << BH_Delay | 1 << BH_Unwritten)
+
static void discard_buffer(struct buffer_head * bh)
{
+ unsigned long b_state, b_state_old;
+
lock_buffer(bh);
clear_buffer_dirty(bh);
bh->b_bdev = NULL;
- clear_buffer_mapped(bh);
- clear_buffer_req(bh);
- clear_buffer_new(bh);
- clear_buffer_delay(bh);
- clear_buffer_unwritten(bh);
+ b_state = bh->b_state;
+ for (;;) {
+ b_state_old = cmpxchg(&bh->b_state, b_state,
+ (b_state & ~BUFFER_FLAGS_DISCARD));
+ if (b_state_old == b_state)
+ break;
+ b_state = b_state_old;
+ }
unlock_buffer(bh);
}
@@ -2102,8 +2126,8 @@ EXPORT_SYMBOL(generic_write_end);
* Returns true if all buffers which correspond to a file portion
* we want to read are uptodate.
*/
-int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
- unsigned long from)
+int block_is_partially_uptodate(struct page *page, unsigned long from,
+ unsigned long count)
{
unsigned block_start, block_end, blocksize;
unsigned to;
@@ -2115,7 +2139,7 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
head = page_buffers(page);
blocksize = head->b_size;
- to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
+ to = min_t(unsigned, PAGE_CACHE_SIZE - from, count);
to = from + to;
if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
return 0;
@@ -2867,10 +2891,9 @@ EXPORT_SYMBOL(block_truncate_page);
/*
* The generic ->writepage function for buffer-backed address_spaces
- * this form passes in the end_io handler used to finish the IO.
*/
-int block_write_full_page_endio(struct page *page, get_block_t *get_block,
- struct writeback_control *wbc, bh_end_io_t *handler)
+int block_write_full_page(struct page *page, get_block_t *get_block,
+ struct writeback_control *wbc)
{
struct inode * const inode = page->mapping->host;
loff_t i_size = i_size_read(inode);
@@ -2880,7 +2903,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
/* Is the page fully inside i_size? */
if (page->index < end_index)
return __block_write_full_page(inode, page, get_block, wbc,
- handler);
+ end_buffer_async_write);
/* Is the page fully outside i_size? (truncate in progress) */
offset = i_size & (PAGE_CACHE_SIZE-1);
@@ -2903,18 +2926,8 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block,
* writes to that region are not written out to the file."
*/
zero_user_segment(page, offset, PAGE_CACHE_SIZE);
- return __block_write_full_page(inode, page, get_block, wbc, handler);
-}
-EXPORT_SYMBOL(block_write_full_page_endio);
-
-/*
- * The generic ->writepage function for buffer-backed address_spaces
- */
-int block_write_full_page(struct page *page, get_block_t *get_block,
- struct writeback_control *wbc)
-{
- return block_write_full_page_endio(page, get_block, wbc,
- end_buffer_async_write);
+ return __block_write_full_page(inode, page, get_block, wbc,
+ end_buffer_async_write);
}
EXPORT_SYMBOL(block_write_full_page);
@@ -2972,11 +2985,11 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
* let it through, and the IO layer will turn it into
* an EIO.
*/
- if (unlikely(bio->bi_sector >= maxsector))
+ if (unlikely(bio->bi_iter.bi_sector >= maxsector))
return;
- maxsector -= bio->bi_sector;
- bytes = bio->bi_size;
+ maxsector -= bio->bi_iter.bi_sector;
+ bytes = bio->bi_iter.bi_size;
if (likely((bytes >> 9) <= maxsector))
return;
@@ -2984,7 +2997,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
bytes = maxsector << 9;
/* Truncate the bio.. */
- bio->bi_size = bytes;
+ bio->bi_iter.bi_size = bytes;
bio->bi_io_vec[0].bv_len = bytes;
/* ..and clear the end of the buffer for reads */
@@ -3019,14 +3032,14 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
*/
bio = bio_alloc(GFP_NOIO, 1);
- bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
bio->bi_io_vec[0].bv_page = bh->b_page;
bio->bi_io_vec[0].bv_len = bh->b_size;
bio->bi_io_vec[0].bv_offset = bh_offset(bh);
bio->bi_vcnt = 1;
- bio->bi_size = bh->b_size;
+ bio->bi_iter.bi_size = bh->b_size;
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
@@ -3076,7 +3089,7 @@ EXPORT_SYMBOL(submit_bh);
* until the buffer gets unlocked).
*
* ll_rw_block sets b_end_io to simple completion handler that marks
- * the buffer up-to-date (if approriate), unlocks the buffer and wakes
+ * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
* any waiters.
*
* All of the buffers must be for the same device, and must also be a
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index 622f4696e48..d749731dc0e 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -50,18 +50,18 @@ int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
cache->brun_percent < 100);
if (*args) {
- kerror("'bind' command doesn't take an argument");
+ pr_err("'bind' command doesn't take an argument");
return -EINVAL;
}
if (!cache->rootdirname) {
- kerror("No cache directory specified");
+ pr_err("No cache directory specified");
return -EINVAL;
}
/* don't permit already bound caches to be re-bound */
if (test_bit(CACHEFILES_READY, &cache->flags)) {
- kerror("Cache already bound");
+ pr_err("Cache already bound");
return -EBUSY;
}
@@ -124,7 +124,6 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
/* check parameters */
ret = -EOPNOTSUPP;
if (!root->d_inode ||
- !root->d_inode->i_op ||
!root->d_inode->i_op->lookup ||
!root->d_inode->i_op->mkdir ||
!root->d_inode->i_op->setxattr ||
@@ -229,9 +228,7 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
set_bit(CACHEFILES_READY, &cache->flags);
dput(root);
- printk(KERN_INFO "CacheFiles:"
- " File cache on %s registered\n",
- cache->cache.identifier);
+ pr_info("File cache on %s registered\n", cache->cache.identifier);
/* check how much space the cache has */
cachefiles_has_space(cache, 0, 0);
@@ -251,7 +248,7 @@ error_open_root:
kmem_cache_free(cachefiles_object_jar, fsdef);
error_root_object:
cachefiles_end_secure(cache, saved_cred);
- kerror("Failed to register: %d", ret);
+ pr_err("Failed to register: %d", ret);
return ret;
}
@@ -263,9 +260,8 @@ void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
_enter("");
if (test_bit(CACHEFILES_READY, &cache->flags)) {
- printk(KERN_INFO "CacheFiles:"
- " File cache on %s unregistering\n",
- cache->cache.identifier);
+ pr_info("File cache on %s unregistering\n",
+ cache->cache.identifier);
fscache_withdraw_cache(&cache->cache);
}
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index 0a1467b1551..b078d3081d6 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -315,8 +315,7 @@ static unsigned int cachefiles_daemon_poll(struct file *file,
static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
char *args)
{
- kerror("Free space limits must be in range"
- " 0%%<=stop<cull<run<100%%");
+ pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%");
return -EINVAL;
}
@@ -476,12 +475,12 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
_enter(",%s", args);
if (!*args) {
- kerror("Empty directory specified");
+ pr_err("Empty directory specified");
return -EINVAL;
}
if (cache->rootdirname) {
- kerror("Second cache directory specified");
+ pr_err("Second cache directory specified");
return -EEXIST;
}
@@ -504,12 +503,12 @@ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
_enter(",%s", args);
if (!*args) {
- kerror("Empty security context specified");
+ pr_err("Empty security context specified");
return -EINVAL;
}
if (cache->secctx) {
- kerror("Second security context specified");
+ pr_err("Second security context specified");
return -EINVAL;
}
@@ -532,7 +531,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
_enter(",%s", args);
if (!*args) {
- kerror("Empty tag specified");
+ pr_err("Empty tag specified");
return -EINVAL;
}
@@ -563,12 +562,12 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
goto inval;
if (!test_bit(CACHEFILES_READY, &cache->flags)) {
- kerror("cull applied to unready cache");
+ pr_err("cull applied to unready cache");
return -EIO;
}
if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
- kerror("cull applied to dead cache");
+ pr_err("cull applied to dead cache");
return -EIO;
}
@@ -588,11 +587,11 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
notdir:
path_put(&path);
- kerror("cull command requires dirfd to be a directory");
+ pr_err("cull command requires dirfd to be a directory");
return -ENOTDIR;
inval:
- kerror("cull command requires dirfd and filename");
+ pr_err("cull command requires dirfd and filename");
return -EINVAL;
}
@@ -615,7 +614,7 @@ static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
return 0;
inval:
- kerror("debug command requires mask");
+ pr_err("debug command requires mask");
return -EINVAL;
}
@@ -635,12 +634,12 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
goto inval;
if (!test_bit(CACHEFILES_READY, &cache->flags)) {
- kerror("inuse applied to unready cache");
+ pr_err("inuse applied to unready cache");
return -EIO;
}
if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
- kerror("inuse applied to dead cache");
+ pr_err("inuse applied to dead cache");
return -EIO;
}
@@ -660,11 +659,11 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
notdir:
path_put(&path);
- kerror("inuse command requires dirfd to be a directory");
+ pr_err("inuse command requires dirfd to be a directory");
return -ENOTDIR;
inval:
- kerror("inuse command requires dirfd and filename");
+ pr_err("inuse command requires dirfd and filename");
return -EINVAL;
}
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 43eb5592cde..584743d456c 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -146,8 +146,7 @@ static int cachefiles_lookup_object(struct fscache_object *_object)
if (ret < 0 && ret != -ETIMEDOUT) {
if (ret != -ENOBUFS)
- printk(KERN_WARNING
- "CacheFiles: Lookup failed error %d\n", ret);
+ pr_warn("Lookup failed error %d\n", ret);
fscache_object_lookup_error(&object->fscache);
}
@@ -270,7 +269,7 @@ static void cachefiles_drop_object(struct fscache_object *_object)
#endif
/* delete retired objects */
- if (test_bit(FSCACHE_COOKIE_RETIRED, &object->fscache.cookie->flags) &&
+ if (test_bit(FSCACHE_OBJECT_RETIRED, &object->fscache.flags) &&
_object != cache->cache.fsdef
) {
_debug("- retire object OBJ%x", object->fscache.debug_id);
@@ -449,14 +448,14 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
_debug("discard tail %llx", oi_size);
newattrs.ia_valid = ATTR_SIZE;
newattrs.ia_size = oi_size & PAGE_MASK;
- ret = notify_change(object->backer, &newattrs);
+ ret = notify_change(object->backer, &newattrs, NULL);
if (ret < 0)
goto truncate_failed;
}
newattrs.ia_valid = ATTR_SIZE;
newattrs.ia_size = ni_size;
- ret = notify_change(object->backer, &newattrs);
+ ret = notify_change(object->backer, &newattrs, NULL);
truncate_failed:
mutex_unlock(&object->backer->d_inode->i_mutex);
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 5349473df1b..3d50998abf5 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -9,6 +9,13 @@
* 2 of the Licence, or (at your option) any later version.
*/
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) "CacheFiles: " fmt
+
+
#include <linux/fscache-cache.h>
#include <linux/timer.h>
#include <linux/wait.h>
@@ -245,11 +252,10 @@ extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
/*
* error handling
*/
-#define kerror(FMT, ...) printk(KERN_ERR "CacheFiles: "FMT"\n", ##__VA_ARGS__)
#define cachefiles_io_error(___cache, FMT, ...) \
do { \
- kerror("I/O Error: " FMT, ##__VA_ARGS__); \
+ pr_err("I/O Error: " FMT, ##__VA_ARGS__); \
fscache_io_error(&(___cache)->cache); \
set_bit(CACHEFILES_DEAD, &(___cache)->flags); \
} while (0)
@@ -310,8 +316,8 @@ do { \
#define ASSERT(X) \
do { \
if (unlikely(!(X))) { \
- printk(KERN_ERR "\n"); \
- printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
BUG(); \
} \
} while (0)
@@ -319,9 +325,9 @@ do { \
#define ASSERTCMP(X, OP, Y) \
do { \
if (unlikely(!((X) OP (Y)))) { \
- printk(KERN_ERR "\n"); \
- printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
- printk(KERN_ERR "%lx " #OP " %lx is false\n", \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ pr_err("%lx " #OP " %lx is false\n", \
(unsigned long)(X), (unsigned long)(Y)); \
BUG(); \
} \
@@ -330,8 +336,8 @@ do { \
#define ASSERTIF(C, X) \
do { \
if (unlikely((C) && !(X))) { \
- printk(KERN_ERR "\n"); \
- printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
BUG(); \
} \
} while (0)
@@ -339,9 +345,9 @@ do { \
#define ASSERTIFCMP(C, X, OP, Y) \
do { \
if (unlikely((C) && !((X) OP (Y)))) { \
- printk(KERN_ERR "\n"); \
- printk(KERN_ERR "CacheFiles: Assertion failed\n"); \
- printk(KERN_ERR "%lx " #OP " %lx is false\n", \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ pr_err("%lx " #OP " %lx is false\n", \
(unsigned long)(X), (unsigned long)(Y)); \
BUG(); \
} \
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
index 4bfa8cf43bf..180edfb45f6 100644
--- a/fs/cachefiles/main.c
+++ b/fs/cachefiles/main.c
@@ -68,8 +68,7 @@ static int __init cachefiles_init(void)
SLAB_HWCACHE_ALIGN,
cachefiles_object_init_once);
if (!cachefiles_object_jar) {
- printk(KERN_NOTICE
- "CacheFiles: Failed to allocate an object jar\n");
+ pr_notice("Failed to allocate an object jar\n");
goto error_object_jar;
}
@@ -77,7 +76,7 @@ static int __init cachefiles_init(void)
if (ret < 0)
goto error_proc;
- printk(KERN_INFO "CacheFiles: Loaded\n");
+ pr_info("Loaded\n");
return 0;
error_proc:
@@ -85,7 +84,7 @@ error_proc:
error_object_jar:
misc_deregister(&cachefiles_dev);
error_dev:
- kerror("failed to register: %d", ret);
+ pr_err("failed to register: %d", ret);
return ret;
}
@@ -96,7 +95,7 @@ fs_initcall(cachefiles_init);
*/
static void __exit cachefiles_exit(void)
{
- printk(KERN_INFO "CacheFiles: Unloading\n");
+ pr_info("Unloading\n");
cachefiles_proc_cleanup();
kmem_cache_destroy(cachefiles_object_jar);
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index f4a08d7fa2f..5bf2b41e66d 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -35,22 +35,21 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
struct fscache_cookie *cookie;
unsigned keylen, loop;
- printk(KERN_ERR "%sobject: OBJ%x\n",
- prefix, object->fscache.debug_id);
- printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
+ pr_err("%sobject: OBJ%x\n", prefix, object->fscache.debug_id);
+ pr_err("%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n",
prefix, object->fscache.state->name,
object->fscache.flags, work_busy(&object->fscache.work),
object->fscache.events, object->fscache.event_mask);
- printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
+ pr_err("%sops=%u inp=%u exc=%u\n",
prefix, object->fscache.n_ops, object->fscache.n_in_progress,
object->fscache.n_exclusive);
- printk(KERN_ERR "%sparent=%p\n",
+ pr_err("%sparent=%p\n",
prefix, object->fscache.parent);
spin_lock(&object->fscache.lock);
cookie = object->fscache.cookie;
if (cookie) {
- printk(KERN_ERR "%scookie=%p [pr=%p nd=%p fl=%lx]\n",
+ pr_err("%scookie=%p [pr=%p nd=%p fl=%lx]\n",
prefix,
object->fscache.cookie,
object->fscache.cookie->parent,
@@ -62,16 +61,16 @@ void __cachefiles_printk_object(struct cachefiles_object *object,
else
keylen = 0;
} else {
- printk(KERN_ERR "%scookie=NULL\n", prefix);
+ pr_err("%scookie=NULL\n", prefix);
keylen = 0;
}
spin_unlock(&object->fscache.lock);
if (keylen) {
- printk(KERN_ERR "%skey=[%u] '", prefix, keylen);
+ pr_err("%skey=[%u] '", prefix, keylen);
for (loop = 0; loop < keylen; loop++)
- printk("%02x", keybuf[loop]);
- printk("'\n");
+ pr_cont("%02x", keybuf[loop]);
+ pr_cont("'\n");
}
}
@@ -131,13 +130,11 @@ found_dentry:
dentry);
if (fscache_object_is_live(&object->fscache)) {
- printk(KERN_ERR "\n");
- printk(KERN_ERR "CacheFiles: Error:"
- " Can't preemptively bury live object\n");
+ pr_err("\n");
+ pr_err("Error: Can't preemptively bury live object\n");
cachefiles_printk_object(object, NULL);
} else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) {
- printk(KERN_ERR "CacheFiles: Error:"
- " Object already preemptively buried\n");
+ pr_err("Error: Object already preemptively buried\n");
}
write_unlock(&cache->active_lock);
@@ -160,7 +157,7 @@ try_again:
write_lock(&cache->active_lock);
if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
- printk(KERN_ERR "CacheFiles: Error: Object already active\n");
+ pr_err("Error: Object already active\n");
cachefiles_printk_object(object, NULL);
BUG();
}
@@ -193,9 +190,8 @@ try_again:
* need to wait for it to be destroyed */
wait_for_old_object:
if (fscache_object_is_live(&object->fscache)) {
- printk(KERN_ERR "\n");
- printk(KERN_ERR "CacheFiles: Error:"
- " Unexpected object collision\n");
+ pr_err("\n");
+ pr_err("Error: Unexpected object collision\n");
cachefiles_printk_object(object, xobject);
BUG();
}
@@ -241,9 +237,8 @@ wait_for_old_object:
}
if (timeout <= 0) {
- printk(KERN_ERR "\n");
- printk(KERN_ERR "CacheFiles: Error: Overlong"
- " wait for old active object to go away\n");
+ pr_err("\n");
+ pr_err("Error: Overlong wait for old active object to go away\n");
cachefiles_printk_object(object, xobject);
goto requeue;
}
@@ -294,7 +289,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
if (ret < 0) {
cachefiles_io_error(cache, "Unlink security error");
} else {
- ret = vfs_unlink(dir->d_inode, rep);
+ ret = vfs_unlink(dir->d_inode, rep, NULL);
if (preemptive)
cachefiles_mark_object_buried(cache, rep);
@@ -391,12 +386,12 @@ try_again:
path.dentry = dir;
path_to_graveyard.mnt = cache->mnt;
path_to_graveyard.dentry = cache->graveyard;
- ret = security_path_rename(&path, rep, &path_to_graveyard, grave);
+ ret = security_path_rename(&path, rep, &path_to_graveyard, grave, 0);
if (ret < 0) {
cachefiles_io_error(cache, "Rename security error %d", ret);
} else {
ret = vfs_rename(dir->d_inode, rep,
- cache->graveyard->d_inode, grave);
+ cache->graveyard->d_inode, grave, NULL, 0);
if (ret != 0 && ret != -ENOMEM)
cachefiles_io_error(cache,
"Rename failed with error %d", ret);
@@ -548,7 +543,7 @@ lookup_again:
next, next->d_inode, next->d_inode->i_ino);
} else if (!S_ISDIR(next->d_inode->i_mode)) {
- kerror("inode %lu is not a directory",
+ pr_err("inode %lu is not a directory",
next->d_inode->i_ino);
ret = -ENOBUFS;
goto error;
@@ -579,7 +574,7 @@ lookup_again:
} else if (!S_ISDIR(next->d_inode->i_mode) &&
!S_ISREG(next->d_inode->i_mode)
) {
- kerror("inode %lu is not a file or directory",
+ pr_err("inode %lu is not a file or directory",
next->d_inode->i_ino);
ret = -ENOBUFS;
goto error;
@@ -773,14 +768,13 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
ASSERT(subdir->d_inode);
if (!S_ISDIR(subdir->d_inode->i_mode)) {
- kerror("%s is not a directory", dirname);
+ pr_err("%s is not a directory", dirname);
ret = -EIO;
goto check_error;
}
ret = -EPERM;
- if (!subdir->d_inode->i_op ||
- !subdir->d_inode->i_op->setxattr ||
+ if (!subdir->d_inode->i_op->setxattr ||
!subdir->d_inode->i_op->getxattr ||
!subdir->d_inode->i_op->lookup ||
!subdir->d_inode->i_op->mkdir ||
@@ -801,13 +795,13 @@ check_error:
mkdir_error:
mutex_unlock(&dir->d_inode->i_mutex);
dput(subdir);
- kerror("mkdir %s failed with error %d", dirname, ret);
+ pr_err("mkdir %s failed with error %d", dirname, ret);
return ERR_PTR(ret);
lookup_error:
mutex_unlock(&dir->d_inode->i_mutex);
ret = PTR_ERR(subdir);
- kerror("Lookup %s failed with error %d", dirname, ret);
+ pr_err("Lookup %s failed with error %d", dirname, ret);
return ERR_PTR(ret);
nomem_d_alloc:
@@ -897,7 +891,7 @@ lookup_error:
if (ret == -EIO) {
cachefiles_io_error(cache, "Lookup failed");
} else if (ret != -ENOMEM) {
- kerror("Internal error: %d", ret);
+ pr_err("Internal error: %d", ret);
ret = -EIO;
}
@@ -956,7 +950,7 @@ error:
}
if (ret != -ENOMEM) {
- kerror("Internal error: %d", ret);
+ pr_err("Internal error: %d", ret);
ret = -EIO;
}
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index ebaff368120..4b1fb5ca65b 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -265,24 +265,22 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
goto nomem_monitor;
}
- ret = add_to_page_cache(newpage, bmapping,
- netpage->index, cachefiles_gfp);
+ ret = add_to_page_cache_lru(newpage, bmapping,
+ netpage->index, cachefiles_gfp);
if (ret == 0)
goto installed_new_backing_page;
if (ret != -EEXIST)
goto nomem_page;
}
- /* we've installed a new backing page, so now we need to add it
- * to the LRU list and start it reading */
+ /* we've installed a new backing page, so now we need to start
+ * it reading */
installed_new_backing_page:
_debug("- new %p", newpage);
backpage = newpage;
newpage = NULL;
- lru_cache_add_file(backpage);
-
read_backing_page:
ret = bmapping->a_ops->readpage(NULL, backpage);
if (ret < 0)
@@ -510,24 +508,23 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
goto nomem;
}
- ret = add_to_page_cache(newpage, bmapping,
- netpage->index, cachefiles_gfp);
+ ret = add_to_page_cache_lru(newpage, bmapping,
+ netpage->index,
+ cachefiles_gfp);
if (ret == 0)
goto installed_new_backing_page;
if (ret != -EEXIST)
goto nomem;
}
- /* we've installed a new backing page, so now we need to add it
- * to the LRU list and start it reading */
+ /* we've installed a new backing page, so now we need
+ * to start it reading */
installed_new_backing_page:
_debug("- new %p", newpage);
backpage = newpage;
newpage = NULL;
- lru_cache_add_file(backpage);
-
reread_backing_page:
ret = bmapping->a_ops->readpage(NULL, backpage);
if (ret < 0)
@@ -538,8 +535,8 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
monitor_backing_page:
_debug("- monitor add");
- ret = add_to_page_cache(netpage, op->mapping, netpage->index,
- cachefiles_gfp);
+ ret = add_to_page_cache_lru(netpage, op->mapping,
+ netpage->index, cachefiles_gfp);
if (ret < 0) {
if (ret == -EEXIST) {
page_cache_release(netpage);
@@ -549,8 +546,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
goto nomem;
}
- lru_cache_add_file(netpage);
-
/* install a monitor */
page_cache_get(netpage);
monitor->netfs_page = netpage;
@@ -613,8 +608,8 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
backing_page_already_uptodate:
_debug("- uptodate");
- ret = add_to_page_cache(netpage, op->mapping, netpage->index,
- cachefiles_gfp);
+ ret = add_to_page_cache_lru(netpage, op->mapping,
+ netpage->index, cachefiles_gfp);
if (ret < 0) {
if (ret == -EEXIST) {
page_cache_release(netpage);
@@ -631,8 +626,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
fscache_mark_page_cached(op, netpage);
- lru_cache_add_file(netpage);
-
/* the netpage is unlocked and marked up to date here */
fscache_end_io(op, netpage, 0);
page_cache_release(netpage);
diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c
index 039b5011d83..396c18ea276 100644
--- a/fs/cachefiles/security.c
+++ b/fs/cachefiles/security.c
@@ -34,9 +34,7 @@ int cachefiles_get_security_ID(struct cachefiles_cache *cache)
ret = set_security_override_from_ctx(new, cache->secctx);
if (ret < 0) {
put_cred(new);
- printk(KERN_ERR "CacheFiles:"
- " Security denies permission to nominate"
- " security context: error %d\n",
+ pr_err("Security denies permission to nominate security context: error %d\n",
ret);
goto error;
}
@@ -59,16 +57,14 @@ static int cachefiles_check_cache_dir(struct cachefiles_cache *cache,
ret = security_inode_mkdir(root->d_inode, root, 0);
if (ret < 0) {
- printk(KERN_ERR "CacheFiles:"
- " Security denies permission to make dirs: error %d",
+ pr_err("Security denies permission to make dirs: error %d",
ret);
return ret;
}
ret = security_inode_create(root->d_inode, root, 0);
if (ret < 0)
- printk(KERN_ERR "CacheFiles:"
- " Security denies permission to create files: error %d",
+ pr_err("Security denies permission to create files: error %d",
ret);
return ret;
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 12b0eef8418..1ad51ffbb27 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -51,7 +51,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
}
if (ret != -EEXIST) {
- kerror("Can't set xattr on %*.*s [%lu] (err %d)",
+ pr_err("Can't set xattr on %*.*s [%lu] (err %d)",
dentry->d_name.len, dentry->d_name.len,
dentry->d_name.name, dentry->d_inode->i_ino,
-ret);
@@ -64,7 +64,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
if (ret == -ERANGE)
goto bad_type_length;
- kerror("Can't read xattr on %*.*s [%lu] (err %d)",
+ pr_err("Can't read xattr on %*.*s [%lu] (err %d)",
dentry->d_name.len, dentry->d_name.len,
dentry->d_name.name, dentry->d_inode->i_ino,
-ret);
@@ -85,14 +85,14 @@ error:
return ret;
bad_type_length:
- kerror("Cache object %lu type xattr length incorrect",
+ pr_err("Cache object %lu type xattr length incorrect",
dentry->d_inode->i_ino);
ret = -EIO;
goto error;
bad_type:
xtype[2] = 0;
- kerror("Cache object %*.*s [%lu] type %s not %s",
+ pr_err("Cache object %*.*s [%lu] type %s not %s",
dentry->d_name.len, dentry->d_name.len,
dentry->d_name.name, dentry->d_inode->i_ino,
xtype, type);
@@ -293,7 +293,7 @@ error:
return ret;
bad_type_length:
- kerror("Cache object %lu xattr length incorrect",
+ pr_err("Cache object %lu xattr length incorrect",
dentry->d_inode->i_ino);
ret = -EIO;
goto error;
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index ac9a2ef5bb9..264e9bf83ff 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -25,3 +25,16 @@ config CEPH_FSCACHE
caching support for Ceph clients using FS-Cache
endif
+
+config CEPH_FS_POSIX_ACL
+ bool "Ceph POSIX Access Control Lists"
+ depends on CEPH_FS
+ select FS_POSIX_ACL
+ help
+ POSIX Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
+
+ To learn more about Access Control Lists, visit the POSIX ACLs for
+ Linux website <http://acl.bestbits.at/>.
+
+ If you don't know what Access Control Lists are, say N
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 32e30106a2f..85a4230b9bf 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -10,3 +10,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
debugfs.o
ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
+ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
new file mode 100644
index 00000000000..469f2e8657e
--- /dev/null
+++ b/fs/ceph/acl.c
@@ -0,0 +1,194 @@
+/*
+ * linux/fs/ceph/acl.c
+ *
+ * Copyright (C) 2013 Guangliang Zhao, <lucienchao@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/ceph/ceph_debug.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/posix_acl.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "super.h"
+
+static inline void ceph_set_cached_acl(struct inode *inode,
+ int type, struct posix_acl *acl)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+
+ spin_lock(&ci->i_ceph_lock);
+ if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
+ set_cached_acl(inode, type, acl);
+ spin_unlock(&ci->i_ceph_lock);
+}
+
+static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode,
+ int type)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct posix_acl *acl = ACL_NOT_CACHED;
+
+ spin_lock(&ci->i_ceph_lock);
+ if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
+ acl = get_cached_acl(inode, type);
+ spin_unlock(&ci->i_ceph_lock);
+
+ return acl;
+}
+
+struct posix_acl *ceph_get_acl(struct inode *inode, int type)
+{
+ int size;
+ const char *name;
+ char *value = NULL;
+ struct posix_acl *acl;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = POSIX_ACL_XATTR_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ name = POSIX_ACL_XATTR_DEFAULT;
+ break;
+ default:
+ BUG();
+ }
+
+ size = __ceph_getxattr(inode, name, "", 0);
+ if (size > 0) {
+ value = kzalloc(size, GFP_NOFS);
+ if (!value)
+ return ERR_PTR(-ENOMEM);
+ size = __ceph_getxattr(inode, name, value, size);
+ }
+
+ if (size > 0)
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
+ else if (size == -ERANGE || size == -ENODATA || size == 0)
+ acl = NULL;
+ else
+ acl = ERR_PTR(-EIO);
+
+ kfree(value);
+
+ if (!IS_ERR(acl))
+ ceph_set_cached_acl(inode, type, acl);
+
+ return acl;
+}
+
+int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+ int ret = 0, size = 0;
+ const char *name = NULL;
+ char *value = NULL;
+ struct iattr newattrs;
+ umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
+ struct dentry *dentry;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = POSIX_ACL_XATTR_ACCESS;
+ if (acl) {
+ ret = posix_acl_equiv_mode(acl, &new_mode);
+ if (ret < 0)
+ goto out;
+ if (ret == 0)
+ acl = NULL;
+ }
+ break;
+ case ACL_TYPE_DEFAULT:
+ if (!S_ISDIR(inode->i_mode)) {
+ ret = acl ? -EINVAL : 0;
+ goto out;
+ }
+ name = POSIX_ACL_XATTR_DEFAULT;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (acl) {
+ size = posix_acl_xattr_size(acl->a_count);
+ value = kmalloc(size, GFP_NOFS);
+ if (!value) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+ if (ret < 0)
+ goto out_free;
+ }
+
+ dentry = d_find_alias(inode);
+ if (new_mode != old_mode) {
+ newattrs.ia_mode = new_mode;
+ newattrs.ia_valid = ATTR_MODE;
+ ret = ceph_setattr(dentry, &newattrs);
+ if (ret)
+ goto out_dput;
+ }
+
+ ret = __ceph_setxattr(dentry, name, value, size, 0);
+ if (ret) {
+ if (new_mode != old_mode) {
+ newattrs.ia_mode = old_mode;
+ newattrs.ia_valid = ATTR_MODE;
+ ceph_setattr(dentry, &newattrs);
+ }
+ goto out_dput;
+ }
+
+ ceph_set_cached_acl(inode, type, acl);
+
+out_dput:
+ dput(dentry);
+out_free:
+ kfree(value);
+out:
+ return ret;
+}
+
+int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir)
+{
+ struct posix_acl *default_acl, *acl;
+ int error;
+
+ error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (error)
+ return error;
+
+ if (!default_acl && !acl)
+ cache_no_acl(inode);
+
+ if (default_acl) {
+ error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ posix_acl_release(default_acl);
+ }
+ if (acl) {
+ if (!error)
+ error = ceph_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ posix_acl_release(acl);
+ }
+ return error;
+}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6df8bd48142..90b3954d48e 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -209,15 +209,17 @@ static int readpage_nounlock(struct file *filp, struct page *page)
err = 0;
if (err < 0) {
SetPageError(page);
+ ceph_fscache_readpage_cancel(inode, page);
goto out;
- } else if (err < PAGE_CACHE_SIZE) {
+ }
+ if (err < PAGE_CACHE_SIZE)
/* zero fill remainder of page */
zero_user_segment(page, err, PAGE_CACHE_SIZE);
- }
- SetPageUptodate(page);
+ else
+ flush_dcache_page(page);
- if (err == 0)
- ceph_readpage_to_fscache(inode, page);
+ SetPageUptodate(page);
+ ceph_readpage_to_fscache(inode, page);
out:
return err < 0 ? err : 0;
@@ -252,6 +254,8 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
for (i = 0; i < num_pages; i++) {
struct page *page = osd_data->pages[i];
+ if (rc < 0)
+ goto unlock;
if (bytes < (int)PAGE_CACHE_SIZE) {
/* zero (remainder of) page */
int s = bytes < 0 ? 0 : bytes;
@@ -262,6 +266,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
flush_dcache_page(page);
SetPageUptodate(page);
ceph_readpage_to_fscache(inode, page);
+unlock:
unlock_page(page);
page_cache_release(page);
bytes -= PAGE_CACHE_SIZE;
@@ -686,7 +691,7 @@ static int ceph_writepages_start(struct address_space *mapping,
(wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
- pr_warning("writepage_start %p on forced umount\n", inode);
+ pr_warn("writepage_start %p on forced umount\n", inode);
return -EIO; /* we're in a forced umount, don't write! */
}
if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
@@ -1179,8 +1184,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
* never get called.
*/
static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
- const struct iovec *iov,
- loff_t pos, unsigned long nr_segs)
+ struct iov_iter *iter,
+ loff_t pos)
{
WARN_ON(1);
return -EINVAL;
@@ -1203,6 +1208,41 @@ const struct address_space_operations ceph_aops = {
/*
* vm ops
*/
+static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_file_info *fi = vma->vm_file->private_data;
+ loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT;
+ int want, got, ret;
+
+ dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
+ inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE);
+ if (fi->fmode & CEPH_FILE_MODE_LAZY)
+ want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
+ else
+ want = CEPH_CAP_FILE_CACHE;
+ while (1) {
+ got = 0;
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
+ if (ret == 0)
+ break;
+ if (ret != -ERESTARTSYS) {
+ WARN_ON(1);
+ return VM_FAULT_SIGBUS;
+ }
+ }
+ dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
+ inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got));
+
+ ret = filemap_fault(vma, vmf);
+
+ dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n",
+ inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret);
+ ceph_put_cap_refs(ci, got);
+
+ return ret;
+}
/*
* Reuse write_begin here for simplicity.
@@ -1210,23 +1250,41 @@ const struct address_space_operations ceph_aops = {
static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct inode *inode = file_inode(vma->vm_file);
- struct page *page = vmf->page;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_file_info *fi = vma->vm_file->private_data;
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct page *page = vmf->page;
loff_t off = page_offset(page);
- loff_t size, len;
- int ret;
+ loff_t size = i_size_read(inode);
+ size_t len;
+ int want, got, ret;
- /* Update time before taking page lock */
- file_update_time(vma->vm_file);
-
- size = i_size_read(inode);
if (off + PAGE_CACHE_SIZE <= size)
len = PAGE_CACHE_SIZE;
else
len = size & ~PAGE_CACHE_MASK;
- dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
- off, len, page, page->index);
+ dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
+ inode, ceph_vinop(inode), off, len, size);
+ if (fi->fmode & CEPH_FILE_MODE_LAZY)
+ want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
+ else
+ want = CEPH_CAP_FILE_BUFFER;
+ while (1) {
+ got = 0;
+ ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len);
+ if (ret == 0)
+ break;
+ if (ret != -ERESTARTSYS) {
+ WARN_ON(1);
+ return VM_FAULT_SIGBUS;
+ }
+ }
+ dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
+ inode, off, len, ceph_cap_string(got));
+
+ /* Update time before taking page lock */
+ file_update_time(vma->vm_file);
lock_page(page);
@@ -1248,14 +1306,26 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = VM_FAULT_SIGBUS;
}
out:
- dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
- if (ret != VM_FAULT_LOCKED)
+ if (ret != VM_FAULT_LOCKED) {
unlock_page(page);
+ } else {
+ int dirty;
+ spin_lock(&ci->i_ceph_lock);
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
+ spin_unlock(&ci->i_ceph_lock);
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ }
+
+ dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n",
+ inode, off, len, ceph_cap_string(got), ret);
+ ceph_put_cap_refs(ci, got);
+
return ret;
}
static struct vm_operations_struct ceph_vmops = {
- .fault = filemap_fault,
+ .fault = ceph_filemap_fault,
.page_mkwrite = ceph_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 6bfe65e0b03..834f9f3723f 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -68,7 +68,7 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
{
fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
&ceph_fscache_fsid_object_def,
- fsc);
+ fsc, true);
if (fsc->fscache == NULL) {
pr_err("Unable to resgister fsid: %p fscache cookie", fsc);
@@ -204,7 +204,8 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
ci->fscache = fscache_acquire_cookie(fsc->fscache,
&ceph_fscache_inode_object_def,
- ci);
+ ci, true);
+ fscache_check_consistency(ci->fscache);
done:
mutex_unlock(&inode->i_mutex);
@@ -324,6 +325,9 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ if (!PageFsCache(page))
+ return;
+
fscache_wait_on_page_write(ci->fscache, page);
fscache_uncache_page(ci->fscache, page);
}
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index ba949408a33..5ac591bd012 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -48,6 +48,12 @@ void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
void ceph_queue_revalidate(struct inode *inode);
+static inline void ceph_fscache_update_objectsize(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ fscache_attr_changed(ci->fscache);
+}
+
static inline void ceph_fscache_invalidate(struct inode *inode)
{
fscache_invalidate(ceph_inode(inode)->fscache);
@@ -67,6 +73,14 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
return fscache_maybe_release_page(ci->fscache, page, gfp);
}
+static inline void ceph_fscache_readpage_cancel(struct inode *inode,
+ struct page *page)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ if (fscache_cookie_valid(ci->fscache) && PageFsCache(page))
+ __fscache_uncache_page(ci->fscache, page);
+}
+
static inline void ceph_fscache_readpages_cancel(struct inode *inode,
struct list_head *pages)
{
@@ -127,6 +141,10 @@ static inline void ceph_readpage_to_fscache(struct inode *inode,
{
}
+static inline void ceph_fscache_update_objectsize(struct inode *inode)
+{
+}
+
static inline void ceph_fscache_invalidate(struct inode *inode)
{
}
@@ -145,6 +163,11 @@ static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
return 1;
}
+static inline void ceph_fscache_readpage_cancel(struct inode *inode,
+ struct page *page)
+{
+}
+
static inline void ceph_fscache_readpages_cancel(struct inode *inode,
struct list_head *pages)
{
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 13976c33332..1fde164b74b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -221,8 +221,8 @@ int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
return 0;
}
-static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
- struct ceph_cap_reservation *ctx)
+struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
+ struct ceph_cap_reservation *ctx)
{
struct ceph_cap *cap = NULL;
@@ -508,15 +508,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
* it is < 0. (This is so we can atomically add the cap and add an
* open file reference to it.)
*/
-int ceph_add_cap(struct inode *inode,
- struct ceph_mds_session *session, u64 cap_id,
- int fmode, unsigned issued, unsigned wanted,
- unsigned seq, unsigned mseq, u64 realmino, int flags,
- struct ceph_cap_reservation *caps_reservation)
+void ceph_add_cap(struct inode *inode,
+ struct ceph_mds_session *session, u64 cap_id,
+ int fmode, unsigned issued, unsigned wanted,
+ unsigned seq, unsigned mseq, u64 realmino, int flags,
+ struct ceph_cap **new_cap)
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_cap *new_cap = NULL;
struct ceph_cap *cap;
int mds = session->s_mds;
int actual_wanted;
@@ -531,20 +530,10 @@ int ceph_add_cap(struct inode *inode,
if (fmode >= 0)
wanted |= ceph_caps_for_mode(fmode);
-retry:
- spin_lock(&ci->i_ceph_lock);
cap = __get_cap_for_mds(ci, mds);
if (!cap) {
- if (new_cap) {
- cap = new_cap;
- new_cap = NULL;
- } else {
- spin_unlock(&ci->i_ceph_lock);
- new_cap = get_cap(mdsc, caps_reservation);
- if (new_cap == NULL)
- return -ENOMEM;
- goto retry;
- }
+ cap = *new_cap;
+ *new_cap = NULL;
cap->issued = 0;
cap->implemented = 0;
@@ -555,21 +544,31 @@ retry:
cap->ci = ci;
__insert_cap_node(ci, cap);
- /* clear out old exporting info? (i.e. on cap import) */
- if (ci->i_cap_exporting_mds == mds) {
- ci->i_cap_exporting_issued = 0;
- ci->i_cap_exporting_mseq = 0;
- ci->i_cap_exporting_mds = -1;
- }
-
/* add to session cap list */
cap->session = session;
spin_lock(&session->s_cap_lock);
list_add_tail(&cap->session_caps, &session->s_caps);
session->s_nr_caps++;
spin_unlock(&session->s_cap_lock);
- } else if (new_cap)
- ceph_put_cap(mdsc, new_cap);
+ } else {
+ /*
+ * auth mds of the inode changed. we received the cap export
+ * message, but still haven't received the cap import message.
+ * handle_cap_export() updated the new auth MDS' cap.
+ *
+ * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
+ * a message that was send before the cap import message. So
+ * don't remove caps.
+ */
+ if (ceph_seq_cmp(seq, cap->seq) <= 0) {
+ WARN_ON(cap != ci->i_auth_cap);
+ WARN_ON(cap->cap_id != cap_id);
+ seq = cap->seq;
+ mseq = cap->mseq;
+ issued |= cap->issued;
+ flags |= CEPH_CAP_FLAG_AUTH;
+ }
+ }
if (!ci->i_snap_realm) {
/*
@@ -609,17 +608,12 @@ retry:
if (flags & CEPH_CAP_FLAG_AUTH) {
if (ci->i_auth_cap == NULL ||
- ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0)
+ ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
ci->i_auth_cap = cap;
- } else if (ci->i_auth_cap == cap) {
- ci->i_auth_cap = NULL;
- spin_lock(&mdsc->cap_dirty_lock);
- if (!list_empty(&ci->i_dirty_item)) {
- dout(" moving %p to cap_dirty_migrating\n", inode);
- list_move(&ci->i_dirty_item,
- &mdsc->cap_dirty_migrating);
+ cap->mds_wanted = wanted;
}
- spin_unlock(&mdsc->cap_dirty_lock);
+ } else {
+ WARN_ON(ci->i_auth_cap == cap);
}
dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
@@ -628,7 +622,7 @@ retry:
cap->cap_id = cap_id;
cap->issued = issued;
cap->implemented |= issued;
- if (mseq > cap->mseq)
+ if (ceph_seq_cmp(mseq, cap->mseq) > 0)
cap->mds_wanted = wanted;
else
cap->mds_wanted |= wanted;
@@ -639,9 +633,6 @@ retry:
if (fmode >= 0)
__ceph_get_fmode(ci, fmode);
- spin_unlock(&ci->i_ceph_lock);
- wake_up_all(&ci->i_cap_wq);
- return 0;
}
/*
@@ -676,7 +667,7 @@ static int __cap_is_valid(struct ceph_cap *cap)
*/
int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
{
- int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
+ int have = ci->i_snap_caps;
struct ceph_cap *cap;
struct rb_node *p;
@@ -816,7 +807,7 @@ int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
cap = rb_entry(p, struct ceph_cap, ci_node);
- if (cap != ocap && __cap_is_valid(cap) &&
+ if (cap != ocap &&
(cap->implemented & ~cap->issued & mask))
return 1;
}
@@ -878,7 +869,10 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
cap = rb_entry(p, struct ceph_cap, ci_node);
if (!__cap_is_valid(cap))
continue;
- mds_wanted |= cap->mds_wanted;
+ if (cap == ci->i_auth_cap)
+ mds_wanted |= cap->mds_wanted;
+ else
+ mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
}
return mds_wanted;
}
@@ -888,7 +882,19 @@ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
*/
static int __ceph_is_any_caps(struct ceph_inode_info *ci)
{
- return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
+ return !RB_EMPTY_ROOT(&ci->i_caps);
+}
+
+int ceph_is_any_caps(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+
+ spin_lock(&ci->i_ceph_lock);
+ ret = __ceph_is_any_caps(ci);
+ spin_unlock(&ci->i_ceph_lock);
+
+ return ret;
}
/*
@@ -897,7 +903,7 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci)
* caller should hold i_ceph_lock.
* caller will not hold session s_mutex if called from destroy_inode.
*/
-void __ceph_remove_cap(struct ceph_cap *cap)
+void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
{
struct ceph_mds_session *session = cap->session;
struct ceph_inode_info *ci = cap->ci;
@@ -909,6 +915,16 @@ void __ceph_remove_cap(struct ceph_cap *cap)
/* remove from session list */
spin_lock(&session->s_cap_lock);
+ /*
+ * s_cap_reconnect is protected by s_cap_lock. no one changes
+ * s_cap_gen while session is in the reconnect state.
+ */
+ if (queue_release &&
+ (!session->s_cap_reconnect ||
+ cap->cap_gen == session->s_cap_gen))
+ __queue_cap_release(session, ci->i_vino.ino, cap->cap_id,
+ cap->mseq, cap->issue_seq);
+
if (session->s_cap_iterator == cap) {
/* not yet, we are iterating over this very cap */
dout("__ceph_remove_cap delaying %p removal from session %p\n",
@@ -1023,7 +1039,6 @@ void __queue_cap_release(struct ceph_mds_session *session,
struct ceph_mds_cap_release *head;
struct ceph_mds_cap_item *item;
- spin_lock(&session->s_cap_lock);
BUG_ON(!session->s_num_cap_releases);
msg = list_first_entry(&session->s_cap_releases,
struct ceph_msg, list_head);
@@ -1052,7 +1067,6 @@ void __queue_cap_release(struct ceph_mds_session *session,
(int)CEPH_CAPS_PER_RELEASE,
(int)msg->front.iov_len);
}
- spin_unlock(&session->s_cap_lock);
}
/*
@@ -1067,12 +1081,8 @@ void ceph_queue_caps_release(struct inode *inode)
p = rb_first(&ci->i_caps);
while (p) {
struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
- struct ceph_mds_session *session = cap->session;
-
- __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
- cap->mseq, cap->issue_seq);
p = rb_next(p);
- __ceph_remove_cap(cap);
+ __ceph_remove_cap(cap, true);
}
}
@@ -1379,13 +1389,10 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
ci->i_snap_realm->cached_context);
dout(" inode %p now dirty snapc %p auth cap %p\n",
&ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
+ WARN_ON(!ci->i_auth_cap);
BUG_ON(!list_empty(&ci->i_dirty_item));
spin_lock(&mdsc->cap_dirty_lock);
- if (ci->i_auth_cap)
- list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
- else
- list_add(&ci->i_dirty_item,
- &mdsc->cap_dirty_migrating);
+ list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
spin_unlock(&mdsc->cap_dirty_lock);
if (ci->i_flushing_caps == 0) {
ihold(inode);
@@ -1731,13 +1738,12 @@ ack:
/*
* Try to flush dirty caps back to the auth mds.
*/
-static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
- unsigned *flush_tid)
+static int try_flush_caps(struct inode *inode, unsigned *flush_tid)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
- int unlock_session = session ? 0 : 1;
int flushing = 0;
+ struct ceph_mds_session *session = NULL;
retry:
spin_lock(&ci->i_ceph_lock);
@@ -1751,13 +1757,14 @@ retry:
int want = __ceph_caps_wanted(ci);
int delayed;
- if (!session) {
+ if (!session || session != cap->session) {
spin_unlock(&ci->i_ceph_lock);
+ if (session)
+ mutex_unlock(&session->s_mutex);
session = cap->session;
mutex_lock(&session->s_mutex);
goto retry;
}
- BUG_ON(session != cap->session);
if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
goto out;
@@ -1776,7 +1783,7 @@ retry:
out:
spin_unlock(&ci->i_ceph_lock);
out_unlocked:
- if (session && unlock_session)
+ if (session)
mutex_unlock(&session->s_mutex);
return flushing;
}
@@ -1861,7 +1868,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
return ret;
mutex_lock(&inode->i_mutex);
- dirty = try_flush_caps(inode, NULL, &flush_tid);
+ dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
/*
@@ -1896,7 +1903,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
dout("write_inode %p wait=%d\n", inode, wait);
if (wait) {
- dirty = try_flush_caps(inode, NULL, &flush_tid);
+ dirty = try_flush_caps(inode, &flush_tid);
if (dirty)
err = wait_event_interruptible(ci->i_cap_wq,
caps_are_flushed(inode, flush_tid));
@@ -2346,11 +2353,11 @@ static void invalidate_aliases(struct inode *inode)
d_prune_aliases(inode);
/*
* For non-directory inode, d_find_alias() only returns
- * connected dentry. After calling d_invalidate(), the
- * dentry become disconnected.
+ * hashed dentry. After calling d_invalidate(), the
+ * dentry becomes unhashed.
*
* For directory inode, d_find_alias() can return
- * disconnected dentry. But directory inode should have
+ * unhashed dentry. But directory inode should have
* one alias at most.
*/
while ((dn = d_find_alias(inode))) {
@@ -2372,38 +2379,52 @@ static void invalidate_aliases(struct inode *inode)
* actually be a revocation if it specifies a smaller cap set.)
*
* caller holds s_mutex and i_ceph_lock, we drop both.
- *
- * return value:
- * 0 - ok
- * 1 - check_caps on auth cap only (writeback)
- * 2 - check_caps (ack revoke)
*/
-static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
+static void handle_cap_grant(struct ceph_mds_client *mdsc,
+ struct inode *inode, struct ceph_mds_caps *grant,
+ void *snaptrace, int snaptrace_len,
+ struct ceph_buffer *xattr_buf,
struct ceph_mds_session *session,
- struct ceph_cap *cap,
- struct ceph_buffer *xattr_buf)
- __releases(ci->i_ceph_lock)
+ struct ceph_cap *cap, int issued)
+ __releases(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
int mds = session->s_mds;
int seq = le32_to_cpu(grant->seq);
int newcaps = le32_to_cpu(grant->caps);
- int issued, implemented, used, wanted, dirty;
+ int used, wanted, dirty;
u64 size = le64_to_cpu(grant->size);
u64 max_size = le64_to_cpu(grant->max_size);
struct timespec mtime, atime, ctime;
int check_caps = 0;
- int wake = 0;
- int writeback = 0;
- int queue_invalidate = 0;
- int deleted_inode = 0;
- int queue_revalidate = 0;
+ bool wake = 0;
+ bool writeback = 0;
+ bool queue_trunc = 0;
+ bool queue_invalidate = 0;
+ bool queue_revalidate = 0;
+ bool deleted_inode = 0;
dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
inode, cap, mds, seq, ceph_cap_string(newcaps));
dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
inode->i_size);
+
+ /*
+ * auth mds of the inode changed. we received the cap export message,
+ * but still haven't received the cap import message. handle_cap_export
+ * updated the new auth MDS' cap.
+ *
+ * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
+ * that was sent before the cap import message. So don't remove caps.
+ */
+ if (ceph_seq_cmp(seq, cap->seq) <= 0) {
+ WARN_ON(cap != ci->i_auth_cap);
+ WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
+ seq = cap->seq;
+ newcaps |= cap->issued;
+ }
+
/*
* If CACHE is being revoked, and we have no dirty buffers,
* try to invalidate (once). (If there are dirty buffers, we
@@ -2425,15 +2446,13 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
}
/* side effects now are allowed */
-
- issued = __ceph_caps_issued(ci, &implemented);
- issued |= implemented | __ceph_caps_dirty(ci);
-
cap->cap_gen = session->s_cap_gen;
+ cap->seq = seq;
__check_cap_issue(ci, cap, newcaps);
- if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+ if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
+ (issued & CEPH_CAP_AUTH_EXCL) == 0) {
inode->i_mode = le32_to_cpu(grant->mode);
inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
@@ -2442,7 +2461,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
from_kgid(&init_user_ns, inode->i_gid));
}
- if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
+ if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
+ (issued & CEPH_CAP_LINK_EXCL) == 0) {
set_nlink(inode, le32_to_cpu(grant->nlink));
if (inode->i_nlink == 0 &&
(newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
@@ -2460,6 +2480,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
ceph_buffer_put(ci->i_xattrs.blob);
ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
ci->i_xattrs.version = version;
+ ceph_forget_all_cached_acls(inode);
}
}
@@ -2468,26 +2489,35 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1)
queue_revalidate = 1;
- /* size/ctime/mtime/atime? */
- ceph_fill_file_size(inode, issued,
- le32_to_cpu(grant->truncate_seq),
- le64_to_cpu(grant->truncate_size), size);
- ceph_decode_timespec(&mtime, &grant->mtime);
- ceph_decode_timespec(&atime, &grant->atime);
- ceph_decode_timespec(&ctime, &grant->ctime);
- ceph_fill_file_time(inode, issued,
- le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
- &atime);
-
- /* max size increase? */
- if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
- dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
- ci->i_max_size = max_size;
- if (max_size >= ci->i_wanted_max_size) {
- ci->i_wanted_max_size = 0; /* reset */
- ci->i_requested_max_size = 0;
+ if (newcaps & CEPH_CAP_ANY_RD) {
+ /* ctime/mtime/atime? */
+ ceph_decode_timespec(&mtime, &grant->mtime);
+ ceph_decode_timespec(&atime, &grant->atime);
+ ceph_decode_timespec(&ctime, &grant->ctime);
+ ceph_fill_file_time(inode, issued,
+ le32_to_cpu(grant->time_warp_seq),
+ &ctime, &mtime, &atime);
+ }
+
+ if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
+ /* file layout may have changed */
+ ci->i_layout = grant->layout;
+ /* size/truncate_seq? */
+ queue_trunc = ceph_fill_file_size(inode, issued,
+ le32_to_cpu(grant->truncate_seq),
+ le64_to_cpu(grant->truncate_size),
+ size);
+ /* max size increase? */
+ if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
+ dout("max_size %lld -> %llu\n",
+ ci->i_max_size, max_size);
+ ci->i_max_size = max_size;
+ if (max_size >= ci->i_wanted_max_size) {
+ ci->i_wanted_max_size = 0; /* reset */
+ ci->i_requested_max_size = 0;
+ }
+ wake = 1;
}
- wake = 1;
}
/* check cap bits */
@@ -2507,11 +2537,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
check_caps = 1;
}
- cap->seq = seq;
-
- /* file layout may have changed */
- ci->i_layout = grant->layout;
-
/* revocation, grant, or no-op? */
if (cap->issued & ~newcaps) {
int revoking = cap->issued & ~newcaps;
@@ -2553,6 +2578,23 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
spin_unlock(&ci->i_ceph_lock);
+ if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
+ down_write(&mdsc->snap_rwsem);
+ ceph_update_snap_trace(mdsc, snaptrace,
+ snaptrace + snaptrace_len, false);
+ downgrade_write(&mdsc->snap_rwsem);
+ kick_flushing_inode_caps(mdsc, session, inode);
+ up_read(&mdsc->snap_rwsem);
+ if (newcaps & ~issued)
+ wake = 1;
+ }
+
+ if (queue_trunc) {
+ ceph_queue_vmtruncate(inode);
+ ceph_queue_revalidate(inode);
+ } else if (queue_revalidate)
+ ceph_queue_revalidate(inode);
+
if (writeback)
/*
* queue inode for writeback: we can't actually call
@@ -2564,8 +2606,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
ceph_queue_invalidate(inode);
if (deleted_inode)
invalidate_aliases(inode);
- if (queue_revalidate)
- ceph_queue_revalidate(inode);
if (wake)
wake_up_all(&ci->i_cap_wq);
@@ -2737,123 +2777,200 @@ static void handle_cap_trunc(struct inode *inode,
* caller holds s_mutex
*/
static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
- struct ceph_mds_session *session,
- int *open_target_sessions)
+ struct ceph_mds_cap_peer *ph,
+ struct ceph_mds_session *session)
{
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_session *tsession = NULL;
+ struct ceph_cap *cap, *tcap, *new_cap = NULL;
struct ceph_inode_info *ci = ceph_inode(inode);
- int mds = session->s_mds;
+ u64 t_cap_id;
unsigned mseq = le32_to_cpu(ex->migrate_seq);
- struct ceph_cap *cap = NULL, *t;
- struct rb_node *p;
- int remember = 1;
+ unsigned t_seq, t_mseq;
+ int target, issued;
+ int mds = session->s_mds;
- dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
- inode, ci, mds, mseq);
+ if (ph) {
+ t_cap_id = le64_to_cpu(ph->cap_id);
+ t_seq = le32_to_cpu(ph->seq);
+ t_mseq = le32_to_cpu(ph->mseq);
+ target = le32_to_cpu(ph->mds);
+ } else {
+ t_cap_id = t_seq = t_mseq = 0;
+ target = -1;
+ }
+ dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
+ inode, ci, mds, mseq, target);
+retry:
spin_lock(&ci->i_ceph_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id))
+ goto out_unlock;
- /* make sure we haven't seen a higher mseq */
- for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
- t = rb_entry(p, struct ceph_cap, ci_node);
- if (ceph_seq_cmp(t->mseq, mseq) > 0) {
- dout(" higher mseq on cap from mds%d\n",
- t->session->s_mds);
- remember = 0;
- }
- if (t->session->s_mds == mds)
- cap = t;
+ if (target < 0) {
+ __ceph_remove_cap(cap, false);
+ goto out_unlock;
}
- if (cap) {
- if (remember) {
- /* make note */
- ci->i_cap_exporting_mds = mds;
- ci->i_cap_exporting_mseq = mseq;
- ci->i_cap_exporting_issued = cap->issued;
-
- /*
- * make sure we have open sessions with all possible
- * export targets, so that we get the matching IMPORT
- */
- *open_target_sessions = 1;
+ /*
+ * now we know we haven't received the cap import message yet
+ * because the exported cap still exist.
+ */
- /*
- * we can't flush dirty caps that we've seen the
- * EXPORT but no IMPORT for
- */
- spin_lock(&mdsc->cap_dirty_lock);
- if (!list_empty(&ci->i_dirty_item)) {
- dout(" moving %p to cap_dirty_migrating\n",
- inode);
- list_move(&ci->i_dirty_item,
- &mdsc->cap_dirty_migrating);
+ issued = cap->issued;
+ WARN_ON(issued != cap->implemented);
+
+ tcap = __get_cap_for_mds(ci, target);
+ if (tcap) {
+ /* already have caps from the target */
+ if (tcap->cap_id != t_cap_id ||
+ ceph_seq_cmp(tcap->seq, t_seq) < 0) {
+ dout(" updating import cap %p mds%d\n", tcap, target);
+ tcap->cap_id = t_cap_id;
+ tcap->seq = t_seq - 1;
+ tcap->issue_seq = t_seq - 1;
+ tcap->mseq = t_mseq;
+ tcap->issued |= issued;
+ tcap->implemented |= issued;
+ if (cap == ci->i_auth_cap)
+ ci->i_auth_cap = tcap;
+ if (ci->i_flushing_caps && ci->i_auth_cap == tcap) {
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_move_tail(&ci->i_flushing_item,
+ &tcap->session->s_cap_flushing);
+ spin_unlock(&mdsc->cap_dirty_lock);
}
- spin_unlock(&mdsc->cap_dirty_lock);
}
- __ceph_remove_cap(cap);
+ __ceph_remove_cap(cap, false);
+ goto out_unlock;
+ } else if (tsession) {
+ /* add placeholder for the export tagert */
+ int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
+ ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0,
+ t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
+
+ __ceph_remove_cap(cap, false);
+ goto out_unlock;
}
- /* else, we already released it */
spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&session->s_mutex);
+
+ /* open target session */
+ tsession = ceph_mdsc_open_export_target_session(mdsc, target);
+ if (!IS_ERR(tsession)) {
+ if (mds > target) {
+ mutex_lock(&session->s_mutex);
+ mutex_lock_nested(&tsession->s_mutex,
+ SINGLE_DEPTH_NESTING);
+ } else {
+ mutex_lock(&tsession->s_mutex);
+ mutex_lock_nested(&session->s_mutex,
+ SINGLE_DEPTH_NESTING);
+ }
+ ceph_add_cap_releases(mdsc, tsession);
+ new_cap = ceph_get_cap(mdsc, NULL);
+ } else {
+ WARN_ON(1);
+ tsession = NULL;
+ target = -1;
+ }
+ goto retry;
+
+out_unlock:
+ spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&session->s_mutex);
+ if (tsession) {
+ mutex_unlock(&tsession->s_mutex);
+ ceph_put_mds_session(tsession);
+ }
+ if (new_cap)
+ ceph_put_cap(mdsc, new_cap);
}
/*
- * Handle cap IMPORT. If there are temp bits from an older EXPORT,
- * clean them up.
+ * Handle cap IMPORT.
*
- * caller holds s_mutex.
+ * caller holds s_mutex. acquires i_ceph_lock
*/
static void handle_cap_import(struct ceph_mds_client *mdsc,
struct inode *inode, struct ceph_mds_caps *im,
+ struct ceph_mds_cap_peer *ph,
struct ceph_mds_session *session,
- void *snaptrace, int snaptrace_len)
+ struct ceph_cap **target_cap, int *old_issued)
+ __acquires(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_cap *cap, *ocap, *new_cap = NULL;
int mds = session->s_mds;
- unsigned issued = le32_to_cpu(im->caps);
+ int issued;
+ unsigned caps = le32_to_cpu(im->caps);
unsigned wanted = le32_to_cpu(im->wanted);
unsigned seq = le32_to_cpu(im->seq);
unsigned mseq = le32_to_cpu(im->migrate_seq);
u64 realmino = le64_to_cpu(im->realm);
u64 cap_id = le64_to_cpu(im->cap_id);
+ u64 p_cap_id;
+ int peer;
- if (ci->i_cap_exporting_mds >= 0 &&
- ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
- dout("handle_cap_import inode %p ci %p mds%d mseq %d"
- " - cleared exporting from mds%d\n",
- inode, ci, mds, mseq,
- ci->i_cap_exporting_mds);
- ci->i_cap_exporting_issued = 0;
- ci->i_cap_exporting_mseq = 0;
- ci->i_cap_exporting_mds = -1;
+ if (ph) {
+ p_cap_id = le64_to_cpu(ph->cap_id);
+ peer = le32_to_cpu(ph->mds);
+ } else {
+ p_cap_id = 0;
+ peer = -1;
+ }
- spin_lock(&mdsc->cap_dirty_lock);
- if (!list_empty(&ci->i_dirty_item)) {
- dout(" moving %p back to cap_dirty\n", inode);
- list_move(&ci->i_dirty_item, &mdsc->cap_dirty);
+ dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
+ inode, ci, mds, mseq, peer);
+
+retry:
+ spin_lock(&ci->i_ceph_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (!cap) {
+ if (!new_cap) {
+ spin_unlock(&ci->i_ceph_lock);
+ new_cap = ceph_get_cap(mdsc, NULL);
+ goto retry;
}
- spin_unlock(&mdsc->cap_dirty_lock);
+ cap = new_cap;
} else {
- dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
- inode, ci, mds, mseq);
+ if (new_cap) {
+ ceph_put_cap(mdsc, new_cap);
+ new_cap = NULL;
+ }
}
- down_write(&mdsc->snap_rwsem);
- ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
- false);
- downgrade_write(&mdsc->snap_rwsem);
- ceph_add_cap(inode, session, cap_id, -1,
- issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
- NULL /* no caps context */);
- kick_flushing_inode_caps(mdsc, session, inode);
- up_read(&mdsc->snap_rwsem);
+ __ceph_caps_issued(ci, &issued);
+ issued |= __ceph_caps_dirty(ci);
+
+ ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq,
+ realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
+
+ ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
+ if (ocap && ocap->cap_id == p_cap_id) {
+ dout(" remove export cap %p mds%d flags %d\n",
+ ocap, peer, ph->flags);
+ if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
+ (ocap->seq != le32_to_cpu(ph->seq) ||
+ ocap->mseq != le32_to_cpu(ph->mseq))) {
+ pr_err("handle_cap_import: mismatched seq/mseq: "
+ "ino (%llx.%llx) mds%d seq %d mseq %d "
+ "importer mds%d has peer seq %d mseq %d\n",
+ ceph_vinop(inode), peer, ocap->seq,
+ ocap->mseq, mds, le32_to_cpu(ph->seq),
+ le32_to_cpu(ph->mseq));
+ }
+ __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
+ }
/* make sure we re-request max_size, if necessary */
- spin_lock(&ci->i_ceph_lock);
- ci->i_wanted_max_size = 0; /* reset */
+ ci->i_wanted_max_size = 0;
ci->i_requested_max_size = 0;
- spin_unlock(&ci->i_ceph_lock);
+
+ *old_issued = issued;
+ *target_cap = cap;
}
/*
@@ -2871,8 +2988,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_inode_info *ci;
struct ceph_cap *cap;
struct ceph_mds_caps *h;
+ struct ceph_mds_cap_peer *peer = NULL;
int mds = session->s_mds;
- int op;
+ int op, issued;
u32 seq, mseq;
struct ceph_vino vino;
u64 cap_id;
@@ -2881,12 +2999,13 @@ void ceph_handle_caps(struct ceph_mds_session *session,
void *snaptrace;
size_t snaptrace_len;
void *flock;
+ void *end;
u32 flock_len;
- int open_target_sessions = 0;
dout("handle_caps from mds%d\n", mds);
/* decode */
+ end = msg->front.iov_base + msg->front.iov_len;
tid = le64_to_cpu(msg->hdr.tid);
if (msg->front.iov_len < sizeof(*h))
goto bad;
@@ -2904,17 +3023,28 @@ void ceph_handle_caps(struct ceph_mds_session *session,
snaptrace_len = le32_to_cpu(h->snap_trace_len);
if (le16_to_cpu(msg->hdr.version) >= 2) {
- void *p, *end;
-
- p = snaptrace + snaptrace_len;
- end = msg->front.iov_base + msg->front.iov_len;
+ void *p = snaptrace + snaptrace_len;
ceph_decode_32_safe(&p, end, flock_len, bad);
+ if (p + flock_len > end)
+ goto bad;
flock = p;
} else {
flock = NULL;
flock_len = 0;
}
+ if (le16_to_cpu(msg->hdr.version) >= 3) {
+ if (op == CEPH_CAP_OP_IMPORT) {
+ void *p = flock + flock_len;
+ if (p + sizeof(*peer) > end)
+ goto bad;
+ peer = p;
+ } else if (op == CEPH_CAP_OP_EXPORT) {
+ /* recorded in unused fields */
+ peer = (void *)&h->size;
+ }
+ }
+
mutex_lock(&session->s_mutex);
session->s_seq++;
dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
@@ -2931,9 +3061,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
if (!inode) {
dout(" i don't have ino %llx\n", vino.ino);
- if (op == CEPH_CAP_OP_IMPORT)
+ if (op == CEPH_CAP_OP_IMPORT) {
+ spin_lock(&session->s_cap_lock);
__queue_cap_release(session, vino.ino, cap_id,
mseq, seq);
+ spin_unlock(&session->s_cap_lock);
+ }
goto flush_cap_releases;
}
@@ -2944,12 +3077,15 @@ void ceph_handle_caps(struct ceph_mds_session *session,
goto done;
case CEPH_CAP_OP_EXPORT:
- handle_cap_export(inode, h, session, &open_target_sessions);
- goto done;
+ handle_cap_export(inode, h, peer, session);
+ goto done_unlocked;
case CEPH_CAP_OP_IMPORT:
- handle_cap_import(mdsc, inode, h, session,
- snaptrace, snaptrace_len);
+ handle_cap_import(mdsc, inode, h, peer, session,
+ &cap, &issued);
+ handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len,
+ msg->middle, session, cap, issued);
+ goto done_unlocked;
}
/* the rest require a cap */
@@ -2966,8 +3102,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
switch (op) {
case CEPH_CAP_OP_REVOKE:
case CEPH_CAP_OP_GRANT:
- case CEPH_CAP_OP_IMPORT:
- handle_cap_grant(inode, h, session, cap, msg->middle);
+ __ceph_caps_issued(ci, &issued);
+ issued |= __ceph_caps_dirty(ci);
+ handle_cap_grant(mdsc, inode, h, NULL, 0, msg->middle,
+ session, cap, issued);
goto done_unlocked;
case CEPH_CAP_OP_FLUSH_ACK:
@@ -3000,8 +3138,6 @@ done:
done_unlocked:
if (inode)
iput(inode);
- if (open_target_sessions)
- ceph_mdsc_open_export_target_sessions(mdsc, session);
return;
bad:
@@ -3143,7 +3279,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
rel->seq = cpu_to_le32(cap->seq);
rel->issue_seq = cpu_to_le32(cap->issue_seq),
rel->mseq = cpu_to_le32(cap->mseq);
- rel->caps = cpu_to_le32(cap->issued);
+ rel->caps = cpu_to_le32(cap->implemented);
rel->wanted = cpu_to_le32(cap->mds_wanted);
rel->dname_len = 0;
rel->dname_seq = 0;
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 6d59006bfa2..5a743ac141a 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -71,9 +71,9 @@ static int mdsc_show(struct seq_file *s, void *p)
seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
if (req->r_got_unsafe)
- seq_printf(s, "\t(unsafe)");
+ seq_puts(s, "\t(unsafe)");
else
- seq_printf(s, "\t");
+ seq_puts(s, "\t");
if (req->r_inode) {
seq_printf(s, " #%llx", ceph_ino(req->r_inode));
@@ -93,6 +93,8 @@ static int mdsc_show(struct seq_file *s, void *p)
} else if (req->r_path1) {
seq_printf(s, " #%llx/%s", req->r_ino1.ino,
req->r_path1);
+ } else {
+ seq_printf(s, " #%llx", req->r_ino1.ino);
}
if (req->r_old_dentry) {
@@ -102,7 +104,8 @@ static int mdsc_show(struct seq_file *s, void *p)
path = NULL;
spin_lock(&req->r_old_dentry->d_lock);
seq_printf(s, " #%llx/%.*s (%s)",
- ceph_ino(req->r_old_dentry_dir),
+ req->r_old_dentry_dir ?
+ ceph_ino(req->r_old_dentry_dir) : 0,
req->r_old_dentry->d_name.len,
req->r_old_dentry->d_name.name,
path ? path : "");
@@ -116,7 +119,7 @@ static int mdsc_show(struct seq_file *s, void *p)
seq_printf(s, " %s", req->r_path2);
}
- seq_printf(s, "\n");
+ seq_puts(s, "\n");
}
mutex_unlock(&mdsc->mutex);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 868b61d56ca..c29d6ae6887 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -100,6 +100,14 @@ static unsigned fpos_off(loff_t p)
return p & 0xffffffff;
}
+static int fpos_cmp(loff_t l, loff_t r)
+{
+ int v = ceph_frag_compare(fpos_frag(l), fpos_frag(r));
+ if (v)
+ return v;
+ return (int)(fpos_off(l) - fpos_off(r));
+}
+
/*
* When possible, we try to satisfy a readdir by peeking at the
* dcache. We make this work by carefully ordering dentries on
@@ -111,7 +119,8 @@ static unsigned fpos_off(loff_t p)
* defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
* the MDS if/when the directory is modified).
*/
-static int __dcache_readdir(struct file *file, struct dir_context *ctx)
+static int __dcache_readdir(struct file *file, struct dir_context *ctx,
+ u32 shared_gen)
{
struct ceph_file_info *fi = file->private_data;
struct dentry *parent = file->f_dentry;
@@ -125,14 +134,14 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx)
last = fi->dentry;
fi->dentry = NULL;
- dout("__dcache_readdir %p at %llu (last %p)\n", dir, ctx->pos,
- last);
+ dout("__dcache_readdir %p v%u at %llu (last %p)\n",
+ dir, shared_gen, ctx->pos, last);
spin_lock(&parent->d_lock);
/* start at beginning? */
if (ctx->pos == 2 || last == NULL ||
- ctx->pos < ceph_dentry(last)->offset) {
+ fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) {
if (list_empty(&parent->d_subdirs))
goto out_unlock;
p = parent->d_subdirs.prev;
@@ -153,10 +162,11 @@ more:
goto out_unlock;
}
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- if (!d_unhashed(dentry) && dentry->d_inode &&
+ if (di->lease_shared_gen == shared_gen &&
+ !d_unhashed(dentry) && dentry->d_inode &&
ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
- ctx->pos <= di->offset)
+ fpos_cmp(ctx->pos, di->offset) <= 0)
break;
dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
dentry->d_name.len, dentry->d_name.name, di->offset,
@@ -172,9 +182,16 @@ more:
spin_unlock(&dentry->d_lock);
spin_unlock(&parent->d_lock);
+ /* make sure a dentry wasn't dropped while we didn't have parent lock */
+ if (!ceph_dir_is_complete(dir)) {
+ dout(" lost dir complete on %p; falling back to mds\n", dir);
+ dput(dentry);
+ err = -EAGAIN;
+ goto out;
+ }
+
dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos,
dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
- ctx->pos = di->offset;
if (!dir_emit(ctx, dentry->d_name.name,
dentry->d_name.len,
ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
@@ -182,25 +199,18 @@ more:
if (last) {
/* remember our position */
fi->dentry = last;
- fi->next_offset = di->offset;
+ fi->next_offset = fpos_off(di->offset);
}
dput(dentry);
return 0;
}
+ ctx->pos = di->offset + 1;
+
if (last)
dput(last);
last = dentry;
- ctx->pos++;
-
- /* make sure a dentry wasn't dropped while we didn't have parent lock */
- if (!ceph_dir_is_complete(dir)) {
- dout(" lost dir complete on %p; falling back to mds\n", dir);
- err = -EAGAIN;
- goto out;
- }
-
spin_lock(&parent->d_lock);
p = p->prev; /* advance to next dentry */
goto more;
@@ -244,8 +254,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
int err;
u32 ftype;
struct ceph_mds_reply_info_parsed *rinfo;
- const int max_entries = fsc->mount_options->max_readdir;
- const int max_bytes = fsc->mount_options->max_readdir_bytes;
dout("readdir %p file %p frag %u off %u\n", inode, file, frag, off);
if (fi->flags & CEPH_F_ATEND)
@@ -283,10 +291,13 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
ceph_snap(inode) != CEPH_SNAPDIR &&
__ceph_dir_is_complete(ci) &&
__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+ u32 shared_gen = ci->i_shared_gen;
spin_unlock(&ci->i_ceph_lock);
- err = __dcache_readdir(file, ctx);
+ err = __dcache_readdir(file, ctx, shared_gen);
if (err != -EAGAIN)
return err;
+ frag = fpos_frag(ctx->pos);
+ off = fpos_off(ctx->pos);
} else {
spin_unlock(&ci->i_ceph_lock);
}
@@ -314,14 +325,16 @@ more:
fi->last_readdir = NULL;
}
- /* requery frag tree, as the frag topology may have changed */
- frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
-
dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
ceph_vinop(inode), frag, fi->last_name);
req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
+ err = ceph_alloc_readdir_reply_buffer(req, inode);
+ if (err) {
+ ceph_mdsc_put_request(req);
+ return err;
+ }
req->r_inode = inode;
ihold(inode);
req->r_dentry = dget(file->f_dentry);
@@ -332,9 +345,6 @@ more:
req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
req->r_readdir_offset = fi->next_offset;
req->r_args.readdir.frag = cpu_to_le32(frag);
- req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
- req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
- req->r_num_caps = max_entries + 1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err < 0) {
ceph_mdsc_put_request(req);
@@ -352,6 +362,16 @@ more:
}
/* note next offset and last dentry name */
+ rinfo = &req->r_reply_info;
+ if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
+ frag = le32_to_cpu(rinfo->dir_dir->frag);
+ if (ceph_frag_is_leftmost(frag))
+ fi->next_offset = 2;
+ else
+ fi->next_offset = 0;
+ off = fi->next_offset;
+ }
+ fi->frag = frag;
fi->offset = fi->next_offset;
fi->last_readdir = req;
@@ -363,7 +383,6 @@ more:
else
fi->next_offset = 0;
} else {
- rinfo = &req->r_reply_info;
err = note_last_dentry(fi,
rinfo->dir_dname[rinfo->dir_nr-1],
rinfo->dir_dname_len[rinfo->dir_nr-1]);
@@ -429,7 +448,6 @@ more:
if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
dout(" marking %p complete\n", inode);
__ceph_dir_set_complete(ci, fi->dir_release_count);
- ci->i_max_offset = ctx->pos;
}
spin_unlock(&ci->i_ceph_lock);
@@ -437,7 +455,7 @@ more:
return 0;
}
-static void reset_readdir(struct ceph_file_info *fi)
+static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
{
if (fi->last_readdir) {
ceph_mdsc_put_request(fi->last_readdir);
@@ -445,7 +463,10 @@ static void reset_readdir(struct ceph_file_info *fi)
}
kfree(fi->last_name);
fi->last_name = NULL;
- fi->next_offset = 2; /* compensate for . and .. */
+ if (ceph_frag_is_leftmost(frag))
+ fi->next_offset = 2; /* compensate for . and .. */
+ else
+ fi->next_offset = 0;
if (fi->dentry) {
dput(fi->dentry);
fi->dentry = NULL;
@@ -457,7 +478,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct ceph_file_info *fi = file->private_data;
struct inode *inode = file->f_mapping->host;
- loff_t old_offset = offset;
+ loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
loff_t retval;
mutex_lock(&inode->i_mutex);
@@ -474,7 +495,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
goto out;
}
- if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
+ if (offset >= 0) {
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
@@ -487,14 +508,14 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
* seek to new frag, or seek prior to current chunk.
*/
if (offset == 0 ||
- fpos_frag(offset) != fpos_frag(old_offset) ||
+ fpos_frag(offset) != fi->frag ||
fpos_off(offset) < fi->offset) {
dout("dir_llseek dropping %p content\n", file);
- reset_readdir(fi);
+ reset_readdir(fi, fpos_frag(offset));
}
/* bump dir_release_count if we did a forward seek */
- if (offset > old_offset)
+ if (fpos_cmp(offset, old_offset) > 0)
fi->dir_release_count--;
}
out:
@@ -684,7 +705,10 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
if (!err && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
ceph_mdsc_put_request(req);
- if (err)
+
+ if (!err)
+ ceph_init_acl(dentry, dentry->d_inode, dir);
+ else
d_drop(dentry);
return err;
}
@@ -722,7 +746,9 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
if (!err && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
ceph_mdsc_put_request(req);
- if (err)
+ if (!err)
+ ceph_init_acl(dentry, dentry->d_inode, dir);
+ else
d_drop(dentry);
return err;
}
@@ -763,7 +789,9 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
err = ceph_handle_notrace_create(dir, dentry);
ceph_mdsc_put_request(req);
out:
- if (err < 0)
+ if (!err)
+ ceph_init_acl(dentry, dentry->d_inode, dir);
+ else
d_drop(dentry);
return err;
}
@@ -788,8 +816,7 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
}
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
- req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
- req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
+ req->r_old_dentry = dget(old_dentry);
req->r_locked_dir = dir;
req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -887,10 +914,11 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
+ ihold(old_dir);
req->r_dentry = dget(new_dentry);
req->r_num_caps = 2;
req->r_old_dentry = dget(old_dentry);
- req->r_old_dentry_dir = ceph_get_dentry_parent_inode(old_dentry);
+ req->r_old_dentry_dir = old_dir;
req->r_locked_dir = new_dir;
req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
@@ -908,14 +936,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
* to do it here.
*/
- /* d_move screws up d_subdirs order */
- ceph_dir_clear_complete(new_dir);
-
d_move(old_dentry, new_dentry);
/* ensure target dentry is invalidated, despite
rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(new_dentry);
+
+ /* d_move screws up sibling dentries' offsets */
+ ceph_dir_clear_complete(old_dir);
+ ceph_dir_clear_complete(new_dir);
+
}
ceph_mdsc_put_request(req);
return err;
@@ -1028,14 +1058,19 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
valid = 1;
} else if (dentry_lease_is_valid(dentry) ||
dir_lease_is_valid(dir, dentry)) {
- valid = 1;
+ if (dentry->d_inode)
+ valid = ceph_is_any_caps(dentry->d_inode);
+ else
+ valid = 1;
}
dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
- if (valid)
+ if (valid) {
ceph_dentry_lru_touch(dentry);
- else
+ } else {
+ ceph_dir_clear_complete(dir);
d_drop(dentry);
+ }
iput(dir);
return valid;
}
@@ -1284,6 +1319,8 @@ const struct inode_operations ceph_dir_iops = {
.getxattr = ceph_getxattr,
.listxattr = ceph_listxattr,
.removexattr = ceph_removexattr,
+ .get_acl = ceph_get_acl,
+ .set_acl = ceph_set_acl,
.mknod = ceph_mknod,
.symlink = ceph_symlink,
.mkdir = ceph_mkdir,
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 16796be53ca..8d7d782f438 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -8,23 +8,6 @@
#include "mds_client.h"
/*
- * NFS export support
- *
- * NFS re-export of a ceph mount is, at present, only semireliable.
- * The basic issue is that the Ceph architectures doesn't lend itself
- * well to generating filehandles that will remain valid forever.
- *
- * So, we do our best. If you're lucky, your inode will be in the
- * client's cache. If it's not, and you have a connectable fh, then
- * the MDS server may be able to find it for you. Otherwise, you get
- * ESTALE.
- *
- * There are ways to this more reliable, but in the non-connectable fh
- * case, we won't every work perfectly, and in the connectable case,
- * some changes are needed on the MDS side to work better.
- */
-
-/*
* Basic fh
*/
struct ceph_nfs_fh {
@@ -32,22 +15,12 @@ struct ceph_nfs_fh {
} __attribute__ ((packed));
/*
- * Larger 'connectable' fh that includes parent ino and name hash.
- * Use this whenever possible, as it works more reliably.
+ * Larger fh that includes parent ino.
*/
struct ceph_nfs_confh {
u64 ino, parent_ino;
- u32 parent_name_hash;
} __attribute__ ((packed));
-/*
- * The presence of @parent_inode here tells us whether NFS wants a
- * connectable file handle. However, we want to make a connectionable
- * file handle unconditionally so that the MDS gets as much of a hint
- * as possible. That means we only use @parent_dentry to indicate
- * whether nfsd wants a connectable fh, and whether we should indicate
- * failure from a too-small @max_len.
- */
static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
struct inode *parent_inode)
{
@@ -56,54 +29,36 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
struct ceph_nfs_confh *cfh = (void *)rawfh;
int connected_handle_length = sizeof(*cfh)/4;
int handle_length = sizeof(*fh)/4;
- struct dentry *dentry;
- struct dentry *parent;
/* don't re-export snaps */
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EINVAL;
- dentry = d_find_alias(inode);
+ if (parent_inode && (*max_len < connected_handle_length)) {
+ *max_len = connected_handle_length;
+ return FILEID_INVALID;
+ } else if (*max_len < handle_length) {
+ *max_len = handle_length;
+ return FILEID_INVALID;
+ }
- /* if we found an alias, generate a connectable fh */
- if (*max_len >= connected_handle_length && dentry) {
- dout("encode_fh %p connectable\n", dentry);
- spin_lock(&dentry->d_lock);
- parent = dentry->d_parent;
+ if (parent_inode) {
+ dout("encode_fh %llx with parent %llx\n",
+ ceph_ino(inode), ceph_ino(parent_inode));
cfh->ino = ceph_ino(inode);
- cfh->parent_ino = ceph_ino(parent->d_inode);
- cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode,
- dentry);
+ cfh->parent_ino = ceph_ino(parent_inode);
*max_len = connected_handle_length;
- type = 2;
- spin_unlock(&dentry->d_lock);
- } else if (*max_len >= handle_length) {
- if (parent_inode) {
- /* nfsd wants connectable */
- *max_len = connected_handle_length;
- type = FILEID_INVALID;
- } else {
- dout("encode_fh %p\n", dentry);
- fh->ino = ceph_ino(inode);
- *max_len = handle_length;
- type = 1;
- }
+ type = FILEID_INO32_GEN_PARENT;
} else {
+ dout("encode_fh %llx\n", ceph_ino(inode));
+ fh->ino = ceph_ino(inode);
*max_len = handle_length;
- type = FILEID_INVALID;
+ type = FILEID_INO32_GEN;
}
- if (dentry)
- dput(dentry);
return type;
}
-/*
- * convert regular fh to dentry
- *
- * FIXME: we should try harder by querying the mds for the ino.
- */
-static struct dentry *__fh_to_dentry(struct super_block *sb,
- struct ceph_nfs_fh *fh, int fh_len)
+static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
struct inode *inode;
@@ -111,11 +66,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
struct ceph_vino vino;
int err;
- if (fh_len < sizeof(*fh) / 4)
- return ERR_PTR(-ESTALE);
-
- dout("__fh_to_dentry %llx\n", fh->ino);
- vino.ino = fh->ino;
+ vino.ino = ino;
vino.snap = CEPH_NOSNAP;
inode = ceph_find_inode(sb, vino);
if (!inode) {
@@ -139,139 +90,161 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
dentry = d_obtain_alias(inode);
if (IS_ERR(dentry)) {
- pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
- fh->ino, inode);
iput(inode);
return dentry;
}
err = ceph_init_dentry(dentry);
if (err < 0) {
- iput(inode);
+ dput(dentry);
return ERR_PTR(err);
}
- dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
+ dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry);
return dentry;
}
/*
- * convert connectable fh to dentry
+ * convert regular fh to dentry
*/
-static struct dentry *__cfh_to_dentry(struct super_block *sb,
- struct ceph_nfs_confh *cfh, int fh_len)
+static struct dentry *ceph_fh_to_dentry(struct super_block *sb,
+ struct fid *fid,
+ int fh_len, int fh_type)
+{
+ struct ceph_nfs_fh *fh = (void *)fid->raw;
+
+ if (fh_type != FILEID_INO32_GEN &&
+ fh_type != FILEID_INO32_GEN_PARENT)
+ return NULL;
+ if (fh_len < sizeof(*fh) / 4)
+ return NULL;
+
+ dout("fh_to_dentry %llx\n", fh->ino);
+ return __fh_to_dentry(sb, fh->ino);
+}
+
+static struct dentry *__get_parent(struct super_block *sb,
+ struct dentry *child, u64 ino)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
+ struct ceph_mds_request *req;
struct inode *inode;
struct dentry *dentry;
- struct ceph_vino vino;
int err;
- if (fh_len < sizeof(*cfh) / 4)
- return ERR_PTR(-ESTALE);
-
- dout("__cfh_to_dentry %llx (%llx/%x)\n",
- cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
-
- vino.ino = cfh->ino;
- vino.snap = CEPH_NOSNAP;
- inode = ceph_find_inode(sb, vino);
- if (!inode) {
- struct ceph_mds_request *req;
-
- req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
- USE_ANY_MDS);
- if (IS_ERR(req))
- return ERR_CAST(req);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT,
+ USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_CAST(req);
- req->r_ino1 = vino;
- req->r_ino2.ino = cfh->parent_ino;
- req->r_ino2.snap = CEPH_NOSNAP;
- req->r_path2 = kmalloc(16, GFP_NOFS);
- snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
- req->r_num_caps = 1;
- err = ceph_mdsc_do_request(mdsc, NULL, req);
- inode = req->r_target_inode;
- if (inode)
- ihold(inode);
- ceph_mdsc_put_request(req);
- if (!inode)
- return ERR_PTR(err ? err : -ESTALE);
+ if (child) {
+ req->r_inode = child->d_inode;
+ ihold(child->d_inode);
+ } else {
+ req->r_ino1 = (struct ceph_vino) {
+ .ino = ino,
+ .snap = CEPH_NOSNAP,
+ };
}
+ req->r_num_caps = 1;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ inode = req->r_target_inode;
+ if (inode)
+ ihold(inode);
+ ceph_mdsc_put_request(req);
+ if (!inode)
+ return ERR_PTR(-ENOENT);
dentry = d_obtain_alias(inode);
if (IS_ERR(dentry)) {
- pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
- cfh->ino, inode);
iput(inode);
return dentry;
}
err = ceph_init_dentry(dentry);
if (err < 0) {
- iput(inode);
+ dput(dentry);
return ERR_PTR(err);
}
- dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
+ dout("__get_parent ino %llx parent %p ino %llx.%llx\n",
+ child ? ceph_ino(child->d_inode) : ino,
+ dentry, ceph_vinop(inode));
return dentry;
}
-static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
- int fh_len, int fh_type)
+static struct dentry *ceph_get_parent(struct dentry *child)
{
- if (fh_type == 1)
- return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw,
- fh_len);
- else
- return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw,
- fh_len);
+ /* don't re-export snaps */
+ if (ceph_snap(child->d_inode) != CEPH_NOSNAP)
+ return ERR_PTR(-EINVAL);
+
+ dout("get_parent %p ino %llx.%llx\n",
+ child, ceph_vinop(child->d_inode));
+ return __get_parent(child->d_sb, child, 0);
}
/*
- * get parent, if possible.
- *
- * FIXME: we could do better by querying the mds to discover the
- * parent.
+ * convert regular fh to parent
*/
static struct dentry *ceph_fh_to_parent(struct super_block *sb,
- struct fid *fid,
+ struct fid *fid,
int fh_len, int fh_type)
{
struct ceph_nfs_confh *cfh = (void *)fid->raw;
- struct ceph_vino vino;
- struct inode *inode;
struct dentry *dentry;
- int err;
- if (fh_type == 1)
- return ERR_PTR(-ESTALE);
+ if (fh_type != FILEID_INO32_GEN_PARENT)
+ return NULL;
if (fh_len < sizeof(*cfh) / 4)
- return ERR_PTR(-ESTALE);
+ return NULL;
- pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
- cfh->parent_name_hash);
+ dout("fh_to_parent %llx\n", cfh->parent_ino);
+ dentry = __get_parent(sb, NULL, cfh->ino);
+ if (IS_ERR(dentry) && PTR_ERR(dentry) == -ENOENT)
+ dentry = __fh_to_dentry(sb, cfh->parent_ino);
+ return dentry;
+}
- vino.ino = cfh->ino;
- vino.snap = CEPH_NOSNAP;
- inode = ceph_find_inode(sb, vino);
- if (!inode)
- return ERR_PTR(-ESTALE);
+static int ceph_get_name(struct dentry *parent, char *name,
+ struct dentry *child)
+{
+ struct ceph_mds_client *mdsc;
+ struct ceph_mds_request *req;
+ int err;
- dentry = d_obtain_alias(inode);
- if (IS_ERR(dentry)) {
- pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
- cfh->ino, inode);
- iput(inode);
- return dentry;
- }
- err = ceph_init_dentry(dentry);
- if (err < 0) {
- iput(inode);
- return ERR_PTR(err);
+ mdsc = ceph_inode_to_client(child->d_inode)->mdsc;
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME,
+ USE_ANY_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ mutex_lock(&parent->d_inode->i_mutex);
+
+ req->r_inode = child->d_inode;
+ ihold(child->d_inode);
+ req->r_ino2 = ceph_vino(parent->d_inode);
+ req->r_locked_dir = parent->d_inode;
+ req->r_num_caps = 2;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+
+ mutex_unlock(&parent->d_inode->i_mutex);
+
+ if (!err) {
+ struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ memcpy(name, rinfo->dname, rinfo->dname_len);
+ name[rinfo->dname_len] = 0;
+ dout("get_name %p ino %llx.%llx name %s\n",
+ child, ceph_vinop(child->d_inode), name);
+ } else {
+ dout("get_name %p ino %llx.%llx err %d\n",
+ child, ceph_vinop(child->d_inode), err);
}
- dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry);
- return dentry;
+
+ ceph_mdsc_put_request(req);
+ return err;
}
const struct export_operations ceph_export_ops = {
.encode_fh = ceph_encode_fh,
.fh_to_dentry = ceph_fh_to_dentry,
.fh_to_parent = ceph_fh_to_parent,
+ .get_parent = ceph_get_parent,
+ .get_name = ceph_get_name,
};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3de89829e2a..302085100c2 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -210,7 +210,7 @@ int ceph_open(struct inode *inode, struct file *file)
ihold(inode);
req->r_num_caps = 1;
- if (flags & (O_CREAT|O_TRUNC))
+ if (flags & O_CREAT)
parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
err = ceph_mdsc_do_request(mdsc, parent_inode, req);
iput(parent_inode);
@@ -286,12 +286,14 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
} else {
dout("atomic_open finish_open on dn %p\n", dn);
if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
+ ceph_init_acl(dentry, dentry->d_inode, dir);
*opened |= FILE_CREATED;
}
err = finish_open(file, dentry, ceph_open, opened);
}
-
out_err:
+ if (!req->r_err && req->r_target_inode)
+ ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
ceph_mdsc_put_request(req);
dout("atomic_open result=%d\n", err);
return err;
@@ -408,51 +410,82 @@ more:
*
* If the read spans object boundary, just do multiple reads.
*/
-static ssize_t ceph_sync_read(struct file *file, char __user *data,
- unsigned len, loff_t *poff, int *checkeof)
+static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
+ int *checkeof)
{
+ struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct page **pages;
- u64 off = *poff;
+ u64 off = iocb->ki_pos;
int num_pages, ret;
+ size_t len = iov_iter_count(i);
- dout("sync_read on file %p %llu~%u %s\n", file, off, len,
+ dout("sync_read on file %p %llu~%u %s\n", file, off,
+ (unsigned)len,
(file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
-
- if (file->f_flags & O_DIRECT) {
- num_pages = calc_pages_for((unsigned long)data, len);
- pages = ceph_get_direct_page_vector(data, num_pages, true);
- } else {
- num_pages = calc_pages_for(off, len);
- pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
- }
- if (IS_ERR(pages))
- return PTR_ERR(pages);
-
/*
* flush any page cache pages in this range. this
* will make concurrent normal and sync io slow,
* but it will at least behave sensibly when they are
* in sequence.
*/
- ret = filemap_write_and_wait(inode->i_mapping);
+ ret = filemap_write_and_wait_range(inode->i_mapping, off,
+ off + len);
if (ret < 0)
- goto done;
+ return ret;
- ret = striped_read(inode, off, len, pages, num_pages, checkeof,
- file->f_flags & O_DIRECT,
- (unsigned long)data & ~PAGE_MASK);
+ if (file->f_flags & O_DIRECT) {
+ while (iov_iter_count(i)) {
+ size_t start;
+ ssize_t n;
- if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
- ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
- if (ret >= 0)
- *poff = off + ret;
+ n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start);
+ if (n < 0)
+ return n;
-done:
- if (file->f_flags & O_DIRECT)
- ceph_put_page_vector(pages, num_pages, true);
- else
+ num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
+
+ ret = striped_read(inode, off, n,
+ pages, num_pages, checkeof,
+ 1, start);
+
+ ceph_put_page_vector(pages, num_pages, true);
+
+ if (ret <= 0)
+ break;
+ off += ret;
+ iov_iter_advance(i, ret);
+ if (ret < n)
+ break;
+ }
+ } else {
+ num_pages = calc_pages_for(off, len);
+ pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+ ret = striped_read(inode, off, len, pages,
+ num_pages, checkeof, 0, 0);
+ if (ret > 0) {
+ int l, k = 0;
+ size_t left = ret;
+
+ while (left) {
+ int copy = min_t(size_t, PAGE_SIZE, left);
+ l = copy_page_to_iter(pages[k++], 0, copy, i);
+ off += l;
+ left -= l;
+ if (l < copy)
+ break;
+ }
+ }
ceph_release_page_vector(pages, num_pages);
+ }
+
+ if (off > iocb->ki_pos) {
+ ret = off - iocb->ki_pos;
+ iocb->ki_pos = off;
+ }
+
dout("sync_read result %d\n", ret);
return ret;
}
@@ -489,148 +522,249 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
}
}
+
/*
- * Synchronous write, straight from __user pointer or user pages (if
- * O_DIRECT).
+ * Synchronous write, straight from __user pointer or user pages.
*
* If write spans object boundary, just do multiple writes. (For a
* correct atomic write, we should e.g. take write locks on all
* objects, rollback on failure, etc.)
*/
-static ssize_t ceph_sync_write(struct file *file, const char __user *data,
- size_t left, loff_t pos, loff_t *ppos)
+static ssize_t
+ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
+ struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
struct ceph_snap_context *snapc;
struct ceph_vino vino;
struct ceph_osd_request *req;
- int num_ops = 1;
struct page **pages;
int num_pages;
- u64 len;
int written = 0;
int flags;
int check_caps = 0;
- int page_align, io_align;
- unsigned long buf_align;
int ret;
struct timespec mtime = CURRENT_TIME;
- bool own_pages = false;
+ loff_t pos = iocb->ki_pos;
+ size_t count = iov_iter_count(from);
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
return -EROFS;
- dout("sync_write on file %p %lld~%u %s\n", file, pos,
- (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
+ dout("sync_direct_write on file %p %lld~%u\n", file, pos,
+ (unsigned)count);
- ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
+ ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
if (ret < 0)
return ret;
ret = invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_CACHE_SHIFT,
- (pos + left) >> PAGE_CACHE_SHIFT);
+ (pos + count) >> PAGE_CACHE_SHIFT);
if (ret < 0)
dout("invalidate_inode_pages2_range returned %d\n", ret);
flags = CEPH_OSD_FLAG_ORDERSNAP |
CEPH_OSD_FLAG_ONDISK |
CEPH_OSD_FLAG_WRITE;
- if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
- flags |= CEPH_OSD_FLAG_ACK;
- else
- num_ops++; /* Also include a 'startsync' command. */
-
- /*
- * we may need to do multiple writes here if we span an object
- * boundary. this isn't atomic, unfortunately. :(
- */
-more:
- io_align = pos & ~PAGE_MASK;
- buf_align = (unsigned long)data & ~PAGE_MASK;
- len = left;
- snapc = ci->i_snap_realm->cached_context;
- vino = ceph_vino(inode);
- req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
- vino, pos, &len, num_ops,
- CEPH_OSD_OP_WRITE, flags, snapc,
- ci->i_truncate_seq, ci->i_truncate_size,
- false);
- if (IS_ERR(req))
- return PTR_ERR(req);
+ while (iov_iter_count(from) > 0) {
+ u64 len = iov_iter_single_seg_count(from);
+ size_t start;
+ ssize_t n;
+
+ snapc = ci->i_snap_realm->cached_context;
+ vino = ceph_vino(inode);
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ vino, pos, &len,
+ 2,/*include a 'startsync' command*/
+ CEPH_OSD_OP_WRITE, flags, snapc,
+ ci->i_truncate_seq,
+ ci->i_truncate_size,
+ false);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ break;
+ }
- /* write from beginning of first page, regardless of io alignment */
- page_align = file->f_flags & O_DIRECT ? buf_align : io_align;
- num_pages = calc_pages_for(page_align, len);
- if (file->f_flags & O_DIRECT) {
- pages = ceph_get_direct_page_vector(data, num_pages, false);
- if (IS_ERR(pages)) {
- ret = PTR_ERR(pages);
- goto out;
+ n = iov_iter_get_pages_alloc(from, &pages, len, &start);
+ if (unlikely(n < 0)) {
+ ret = n;
+ ceph_osdc_put_request(req);
+ break;
}
+ num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
/*
* throw out any page cache pages in this range. this
* may block.
*/
truncate_inode_pages_range(inode->i_mapping, pos,
- (pos+len) | (PAGE_CACHE_SIZE-1));
- } else {
+ (pos+n) | (PAGE_CACHE_SIZE-1));
+ osd_req_op_extent_osd_data_pages(req, 0, pages, n, start,
+ false, false);
+
+ /* BUG_ON(vino.snap != CEPH_NOSNAP); */
+ ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
+
+ ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+ if (!ret)
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+
+ ceph_put_page_vector(pages, num_pages, false);
+
+ ceph_osdc_put_request(req);
+ if (ret)
+ break;
+ pos += n;
+ written += n;
+ iov_iter_advance(from, n);
+
+ if (pos > i_size_read(inode)) {
+ check_caps = ceph_inode_set_size(inode, pos);
+ if (check_caps)
+ ceph_check_caps(ceph_inode(inode),
+ CHECK_CAPS_AUTHONLY,
+ NULL);
+ }
+ }
+
+ if (ret != -EOLDSNAPC && written > 0) {
+ iocb->ki_pos = pos;
+ ret = written;
+ }
+ return ret;
+}
+
+
+/*
+ * Synchronous write, straight from __user pointer or user pages.
+ *
+ * If write spans object boundary, just do multiple writes. (For a
+ * correct atomic write, we should e.g. take write locks on all
+ * objects, rollback on failure, etc.)
+ */
+static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_snap_context *snapc;
+ struct ceph_vino vino;
+ struct ceph_osd_request *req;
+ struct page **pages;
+ u64 len;
+ int num_pages;
+ int written = 0;
+ int flags;
+ int check_caps = 0;
+ int ret;
+ struct timespec mtime = CURRENT_TIME;
+ loff_t pos = iocb->ki_pos;
+ size_t count = iov_iter_count(from);
+
+ if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
+ return -EROFS;
+
+ dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count);
+
+ ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
+ if (ret < 0)
+ return ret;
+
+ ret = invalidate_inode_pages2_range(inode->i_mapping,
+ pos >> PAGE_CACHE_SHIFT,
+ (pos + count) >> PAGE_CACHE_SHIFT);
+ if (ret < 0)
+ dout("invalidate_inode_pages2_range returned %d\n", ret);
+
+ flags = CEPH_OSD_FLAG_ORDERSNAP |
+ CEPH_OSD_FLAG_ONDISK |
+ CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_ACK;
+
+ while ((len = iov_iter_count(from)) > 0) {
+ size_t left;
+ int n;
+
+ snapc = ci->i_snap_realm->cached_context;
+ vino = ceph_vino(inode);
+ req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
+ vino, pos, &len, 1,
+ CEPH_OSD_OP_WRITE, flags, snapc,
+ ci->i_truncate_seq,
+ ci->i_truncate_size,
+ false);
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ break;
+ }
+
+ /*
+ * write from beginning of first page,
+ * regardless of io alignment
+ */
+ num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
if (IS_ERR(pages)) {
ret = PTR_ERR(pages);
goto out;
}
- ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
+
+ left = len;
+ for (n = 0; n < num_pages; n++) {
+ size_t plen = min_t(size_t, left, PAGE_SIZE);
+ ret = copy_page_from_iter(pages[n], 0, plen, from);
+ if (ret != plen) {
+ ret = -EFAULT;
+ break;
+ }
+ left -= ret;
+ }
+
if (ret < 0) {
ceph_release_page_vector(pages, num_pages);
goto out;
}
- if ((file->f_flags & O_SYNC) == 0) {
- /* get a second commit callback */
- req->r_unsafe_callback = ceph_sync_write_unsafe;
- req->r_inode = inode;
- own_pages = true;
- }
- }
- osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
- false, own_pages);
+ /* get a second commit callback */
+ req->r_unsafe_callback = ceph_sync_write_unsafe;
+ req->r_inode = inode;
- /* BUG_ON(vino.snap != CEPH_NOSNAP); */
- ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
+ osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
+ false, true);
- ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
- if (!ret)
- ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+ /* BUG_ON(vino.snap != CEPH_NOSNAP); */
+ ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
- if (file->f_flags & O_DIRECT)
- ceph_put_page_vector(pages, num_pages, false);
- else if (file->f_flags & O_SYNC)
- ceph_release_page_vector(pages, num_pages);
+ ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+ if (!ret)
+ ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
out:
- ceph_osdc_put_request(req);
- if (ret == 0) {
- pos += len;
- written += len;
- left -= len;
- data += len;
- if (left)
- goto more;
+ ceph_osdc_put_request(req);
+ if (ret == 0) {
+ pos += len;
+ written += len;
+
+ if (pos > i_size_read(inode)) {
+ check_caps = ceph_inode_set_size(inode, pos);
+ if (check_caps)
+ ceph_check_caps(ceph_inode(inode),
+ CHECK_CAPS_AUTHONLY,
+ NULL);
+ }
+ } else
+ break;
+ }
+ if (ret != -EOLDSNAPC && written > 0) {
ret = written;
- *ppos = pos;
- if (pos > i_size_read(inode))
- check_caps = ceph_inode_set_size(inode, pos);
- if (check_caps)
- ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
- NULL);
- } else if (ret != -EOLDSNAPC && written > 0) {
- ret = written;
+ iocb->ki_pos = pos;
}
return ret;
}
@@ -642,60 +776,69 @@ out:
*
* Hmm, the sync read case isn't actually async... should it be?
*/
-static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *filp = iocb->ki_filp;
struct ceph_file_info *fi = filp->private_data;
- loff_t *ppos = &iocb->ki_pos;
- size_t len = iov->iov_len;
+ size_t len = iocb->ki_nbytes;
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
- void __user *base = iov->iov_base;
ssize_t ret;
int want, got = 0;
int checkeof = 0, read = 0;
- dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
- inode, ceph_vinop(inode), pos, (unsigned)len, inode);
again:
+ dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
+ inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode);
+
if (fi->fmode & CEPH_FILE_MODE_LAZY)
want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
else
want = CEPH_CAP_FILE_CACHE;
ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
if (ret < 0)
- goto out;
- dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)len,
- ceph_cap_string(got));
+ return ret;
if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
(iocb->ki_filp->f_flags & O_DIRECT) ||
- (fi->flags & CEPH_F_SYNC))
+ (fi->flags & CEPH_F_SYNC)) {
+
+ dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
+ ceph_cap_string(got));
+
/* hmm, this isn't really async... */
- ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
- else
- ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
+ ret = ceph_sync_read(iocb, to, &checkeof);
+ } else {
+ dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
+ inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
+ ceph_cap_string(got));
-out:
+ ret = generic_file_read_iter(iocb, to);
+ }
dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
ceph_put_cap_refs(ci, got);
if (checkeof && ret >= 0) {
- int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
+ int statret = ceph_do_getattr(inode,
+ CEPH_STAT_CAP_SIZE);
/* hit EOF or hole? */
- if (statret == 0 && *ppos < inode->i_size) {
- dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size);
+ if (statret == 0 && iocb->ki_pos < inode->i_size &&
+ ret < len) {
+ dout("sync_read hit hole, ppos %lld < size %lld"
+ ", reading more\n", iocb->ki_pos,
+ inode->i_size);
+
+ iov_iter_advance(to, ret);
read += ret;
- base += ret;
len -= ret;
checkeof = 0;
goto again;
}
}
+
if (ret >= 0)
ret += read;
@@ -712,8 +855,7 @@ out:
*
* If we are near ENOSPC, write synchronously.
*/
-static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct ceph_file_info *fi = file->private_data;
@@ -721,18 +863,15 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc;
- ssize_t count, written = 0;
+ ssize_t count = iov_iter_count(from), written = 0;
int err, want, got;
+ loff_t pos = iocb->ki_pos;
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
mutex_lock(&inode->i_mutex);
- err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
- if (err)
- goto out;
-
/* We can write back this queue in page reclaim */
current->backing_dev_info = file->f_mapping->backing_dev_info;
@@ -742,6 +881,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (count == 0)
goto out;
+ iov_iter_truncate(from, count);
err = file_remove_suid(file);
if (err)
@@ -772,20 +912,27 @@ retry_snap:
inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
- (iocb->ki_filp->f_flags & O_DIRECT) ||
- (fi->flags & CEPH_F_SYNC)) {
+ (file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
+ struct iov_iter data;
mutex_unlock(&inode->i_mutex);
- written = ceph_sync_write(file, iov->iov_base, count,
- pos, &iocb->ki_pos);
+ /* we might need to revert back to that point */
+ data = *from;
+ if (file->f_flags & O_DIRECT)
+ written = ceph_sync_direct_write(iocb, &data);
+ else
+ written = ceph_sync_write(iocb, &data);
if (written == -EOLDSNAPC) {
dout("aio_write %p %llx.%llx %llu~%u"
"got EOLDSNAPC, retrying\n",
inode, ceph_vinop(inode),
- pos, (unsigned)iov->iov_len);
+ pos, (unsigned)count);
mutex_lock(&inode->i_mutex);
goto retry_snap;
}
+ if (written > 0)
+ iov_iter_advance(from, written);
} else {
+ loff_t old_size = inode->i_size;
/*
* No need to acquire the i_truncate_mutex. Because
* the MDS revokes Fwb caps before sending truncate
@@ -793,9 +940,11 @@ retry_snap:
* are pending vmtruncate. So write and vmtruncate
* can not run at the same time
*/
- written = generic_file_buffered_write(iocb, iov, nr_segs,
- pos, &iocb->ki_pos,
- count, 0);
+ written = generic_perform_write(file, from, pos);
+ if (likely(written >= 0))
+ iocb->ki_pos = pos + written;
+ if (inode->i_size > old_size)
+ ceph_fscache_update_objectsize(inode);
mutex_unlock(&inode->i_mutex);
}
@@ -809,7 +958,7 @@ retry_snap:
}
dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
- inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
+ inode, ceph_vinop(inode), pos, (unsigned)count,
ceph_cap_string(got));
ceph_put_cap_refs(ci, got);
@@ -1018,7 +1167,7 @@ static long ceph_fallocate(struct file *file, int mode,
loff_t offset, loff_t length)
{
struct ceph_file_info *fi = file->private_data;
- struct inode *inode = file->f_dentry->d_inode;
+ struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_inode_to_client(inode)->client->osdc;
@@ -1031,9 +1180,6 @@ static long ceph_fallocate(struct file *file, int mode,
if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP;
- if (IS_SWAPFILE(inode))
- return -ETXTBSY;
-
mutex_lock(&inode->i_mutex);
if (ceph_snap(inode) != CEPH_NOSNAP) {
@@ -1089,16 +1235,16 @@ const struct file_operations ceph_file_fops = {
.open = ceph_open,
.release = ceph_release,
.llseek = ceph_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = ceph_aio_read,
- .aio_write = ceph_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = ceph_read_iter,
+ .write_iter = ceph_write_iter,
.mmap = ceph_mmap,
.fsync = ceph_fsync,
.lock = ceph_lock,
.flock = ceph_flock,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
.unlocked_ioctl = ceph_ioctl,
.compat_ioctl = ceph_ioctl,
.fallocate = ceph_fallocate,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 8549a48115f..04c89c266ce 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -9,6 +9,8 @@
#include <linux/namei.h>
#include <linux/writeback.h>
#include <linux/vmalloc.h>
+#include <linux/posix_acl.h>
+#include <linux/random.h>
#include "super.h"
#include "mds_client.h"
@@ -95,6 +97,8 @@ const struct inode_operations ceph_file_iops = {
.getxattr = ceph_getxattr,
.listxattr = ceph_listxattr,
.removexattr = ceph_removexattr,
+ .get_acl = ceph_get_acl,
+ .set_acl = ceph_set_acl,
};
@@ -176,9 +180,8 @@ struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
* specified, copy the frag delegation info to the caller if
* it is present.
*/
-u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
- struct ceph_inode_frag *pfrag,
- int *found)
+static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+ struct ceph_inode_frag *pfrag, int *found)
{
u32 t = ceph_frag_make(0, 0);
struct ceph_inode_frag *frag;
@@ -188,7 +191,6 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
if (found)
*found = 0;
- mutex_lock(&ci->i_fragtree_mutex);
while (1) {
WARN_ON(!ceph_frag_contains_value(t, v));
frag = __ceph_find_frag(ci, t);
@@ -217,10 +219,19 @@ u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
}
dout("choose_frag(%x) = %x\n", v, t);
- mutex_unlock(&ci->i_fragtree_mutex);
return t;
}
+u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+ struct ceph_inode_frag *pfrag, int *found)
+{
+ u32 ret;
+ mutex_lock(&ci->i_fragtree_mutex);
+ ret = __ceph_choose_frag(ci, v, pfrag, found);
+ mutex_unlock(&ci->i_fragtree_mutex);
+ return ret;
+}
+
/*
* Process dirfrag (delegation) info from the mds. Include leaf
* fragment in tree ONLY if ndist > 0. Otherwise, only
@@ -234,11 +245,17 @@ static int ceph_fill_dirfrag(struct inode *inode,
u32 id = le32_to_cpu(dirinfo->frag);
int mds = le32_to_cpu(dirinfo->auth);
int ndist = le32_to_cpu(dirinfo->ndist);
+ int diri_auth = -1;
int i;
int err = 0;
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_auth_cap)
+ diri_auth = ci->i_auth_cap->mds;
+ spin_unlock(&ci->i_ceph_lock);
+
mutex_lock(&ci->i_fragtree_mutex);
- if (ndist == 0) {
+ if (ndist == 0 && mds == diri_auth) {
/* no delegation info needed. */
frag = __ceph_find_frag(ci, id);
if (!frag)
@@ -283,6 +300,75 @@ out:
return err;
}
+static int ceph_fill_fragtree(struct inode *inode,
+ struct ceph_frag_tree_head *fragtree,
+ struct ceph_mds_reply_dirfrag *dirinfo)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_inode_frag *frag;
+ struct rb_node *rb_node;
+ int i;
+ u32 id, nsplits;
+ bool update = false;
+
+ mutex_lock(&ci->i_fragtree_mutex);
+ nsplits = le32_to_cpu(fragtree->nsplits);
+ if (nsplits) {
+ i = prandom_u32() % nsplits;
+ id = le32_to_cpu(fragtree->splits[i].frag);
+ if (!__ceph_find_frag(ci, id))
+ update = true;
+ } else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) {
+ rb_node = rb_first(&ci->i_fragtree);
+ frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+ if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node))
+ update = true;
+ }
+ if (!update && dirinfo) {
+ id = le32_to_cpu(dirinfo->frag);
+ if (id != __ceph_choose_frag(ci, id, NULL, NULL))
+ update = true;
+ }
+ if (!update)
+ goto out_unlock;
+
+ dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode));
+ rb_node = rb_first(&ci->i_fragtree);
+ for (i = 0; i < nsplits; i++) {
+ id = le32_to_cpu(fragtree->splits[i].frag);
+ frag = NULL;
+ while (rb_node) {
+ frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+ if (ceph_frag_compare(frag->frag, id) >= 0) {
+ if (frag->frag != id)
+ frag = NULL;
+ else
+ rb_node = rb_next(rb_node);
+ break;
+ }
+ rb_node = rb_next(rb_node);
+ rb_erase(&frag->node, &ci->i_fragtree);
+ kfree(frag);
+ frag = NULL;
+ }
+ if (!frag) {
+ frag = __get_or_create_frag(ci, id);
+ if (IS_ERR(frag))
+ continue;
+ }
+ frag->split_by = le32_to_cpu(fragtree->splits[i].by);
+ dout(" frag %x split by %d\n", frag->frag, frag->split_by);
+ }
+ while (rb_node) {
+ frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+ rb_node = rb_next(rb_node);
+ rb_erase(&frag->node, &ci->i_fragtree);
+ kfree(frag);
+ }
+out_unlock:
+ mutex_unlock(&ci->i_fragtree_mutex);
+ return 0;
+}
/*
* initialize a newly allocated inode.
@@ -335,9 +421,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_hold_caps_min = 0;
ci->i_hold_caps_max = 0;
INIT_LIST_HEAD(&ci->i_cap_delay_list);
- ci->i_cap_exporting_mds = 0;
- ci->i_cap_exporting_mseq = 0;
- ci->i_cap_exporting_issued = 0;
INIT_LIST_HEAD(&ci->i_cap_snaps);
ci->i_head_snapc = NULL;
ci->i_snap_caps = 0;
@@ -406,7 +489,7 @@ void ceph_destroy_inode(struct inode *inode)
/*
* we may still have a snap_realm reference if there are stray
- * caps in i_cap_exporting_issued or i_snap_caps.
+ * caps in i_snap_caps.
*/
if (ci->i_snap_realm) {
struct ceph_mds_client *mdsc =
@@ -436,6 +519,16 @@ void ceph_destroy_inode(struct inode *inode)
call_rcu(&inode->i_rcu, ceph_i_callback);
}
+int ceph_drop_inode(struct inode *inode)
+{
+ /*
+ * Positve dentry and corresponding inode are always accompanied
+ * in MDS reply. So no need to keep inode in the cache after
+ * dropping all its aliases.
+ */
+ return 1;
+}
+
/*
* Helpers to fill in size, ctime, mtime, and atime. We have to be
* careful because either the client or MDS may have more up to date
@@ -571,20 +664,26 @@ static int fill_inode(struct inode *inode,
unsigned long ttl_from, int cap_fmode,
struct ceph_cap_reservation *caps_reservation)
{
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_reply_inode *info = iinfo->in;
struct ceph_inode_info *ci = ceph_inode(inode);
- int i;
- int issued = 0, implemented;
+ int issued = 0, implemented, new_issued;
struct timespec mtime, atime, ctime;
- u32 nsplits;
struct ceph_buffer *xattr_blob = NULL;
+ struct ceph_cap *new_cap = NULL;
int err = 0;
- int queue_trunc = 0;
+ bool wake = false;
+ bool queue_trunc = false;
+ bool new_version = false;
dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
inode, ceph_vinop(inode), le64_to_cpu(info->version),
ci->i_version);
+ /* prealloc new cap struct */
+ if (info->cap.caps && ceph_snap(inode) == CEPH_NOSNAP)
+ new_cap = ceph_get_cap(mdsc, caps_reservation);
+
/*
* prealloc xattr data, if it looks like we'll need it. only
* if len > 4 (meaning there are actually xattrs; the first 4
@@ -610,19 +709,23 @@ static int fill_inode(struct inode *inode,
* 3 2 skip
* 3 3 update
*/
- if (le64_to_cpu(info->version) > 0 &&
- (ci->i_version & ~1) >= le64_to_cpu(info->version))
- goto no_change;
-
+ if (ci->i_version == 0 ||
+ ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+ le64_to_cpu(info->version) > (ci->i_version & ~1)))
+ new_version = true;
+
issued = __ceph_caps_issued(ci, &implemented);
issued |= implemented | __ceph_caps_dirty(ci);
+ new_issued = ~issued & le32_to_cpu(info->cap.caps);
/* update inode */
ci->i_version = le64_to_cpu(info->version);
inode->i_version++;
inode->i_rdev = le32_to_cpu(info->rdev);
+ inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
- if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+ if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
+ (issued & CEPH_CAP_AUTH_EXCL) == 0) {
inode->i_mode = le32_to_cpu(info->mode);
inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid));
inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid));
@@ -631,31 +734,35 @@ static int fill_inode(struct inode *inode,
from_kgid(&init_user_ns, inode->i_gid));
}
- if ((issued & CEPH_CAP_LINK_EXCL) == 0)
+ if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
+ (issued & CEPH_CAP_LINK_EXCL) == 0)
set_nlink(inode, le32_to_cpu(info->nlink));
- /* be careful with mtime, atime, size */
- ceph_decode_timespec(&atime, &info->atime);
- ceph_decode_timespec(&mtime, &info->mtime);
- ceph_decode_timespec(&ctime, &info->ctime);
- queue_trunc = ceph_fill_file_size(inode, issued,
- le32_to_cpu(info->truncate_seq),
- le64_to_cpu(info->truncate_size),
- le64_to_cpu(info->size));
- ceph_fill_file_time(inode, issued,
- le32_to_cpu(info->time_warp_seq),
- &ctime, &mtime, &atime);
-
- /* only update max_size on auth cap */
- if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
- ci->i_max_size != le64_to_cpu(info->max_size)) {
- dout("max_size %lld -> %llu\n", ci->i_max_size,
- le64_to_cpu(info->max_size));
- ci->i_max_size = le64_to_cpu(info->max_size);
+ if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
+ /* be careful with mtime, atime, size */
+ ceph_decode_timespec(&atime, &info->atime);
+ ceph_decode_timespec(&mtime, &info->mtime);
+ ceph_decode_timespec(&ctime, &info->ctime);
+ ceph_fill_file_time(inode, issued,
+ le32_to_cpu(info->time_warp_seq),
+ &ctime, &mtime, &atime);
}
- ci->i_layout = info->layout;
- inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
+ if (new_version ||
+ (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
+ ci->i_layout = info->layout;
+ queue_trunc = ceph_fill_file_size(inode, issued,
+ le32_to_cpu(info->truncate_seq),
+ le64_to_cpu(info->truncate_size),
+ le64_to_cpu(info->size));
+ /* only update max_size on auth cap */
+ if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
+ ci->i_max_size != le64_to_cpu(info->max_size)) {
+ dout("max_size %lld -> %llu\n", ci->i_max_size,
+ le64_to_cpu(info->max_size));
+ ci->i_max_size = le64_to_cpu(info->max_size);
+ }
+ }
/* xattrs */
/* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
@@ -668,6 +775,7 @@ static int fill_inode(struct inode *inode,
memcpy(ci->i_xattrs.blob->vec.iov_base,
iinfo->xattr_data, iinfo->xattr_len);
ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
+ ceph_forget_all_cached_acls(inode);
xattr_blob = NULL;
}
@@ -738,29 +846,7 @@ static int fill_inode(struct inode *inode,
!__ceph_dir_is_complete(ci)) {
dout(" marking %p complete (empty)\n", inode);
__ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count));
- ci->i_max_offset = 2;
- }
-no_change:
- spin_unlock(&ci->i_ceph_lock);
-
- /* queue truncate if we saw i_size decrease */
- if (queue_trunc)
- ceph_queue_vmtruncate(inode);
-
- /* populate frag tree */
- /* FIXME: move me up, if/when version reflects fragtree changes */
- nsplits = le32_to_cpu(info->fragtree.nsplits);
- mutex_lock(&ci->i_fragtree_mutex);
- for (i = 0; i < nsplits; i++) {
- u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
- struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
-
- if (IS_ERR(frag))
- continue;
- frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
- dout(" frag %x split by %d\n", frag->frag, frag->split_by);
}
- mutex_unlock(&ci->i_fragtree_mutex);
/* were we issued a capability? */
if (info->cap.caps) {
@@ -773,30 +859,41 @@ no_change:
le32_to_cpu(info->cap.seq),
le32_to_cpu(info->cap.mseq),
le64_to_cpu(info->cap.realm),
- info->cap.flags,
- caps_reservation);
+ info->cap.flags, &new_cap);
+ wake = true;
} else {
- spin_lock(&ci->i_ceph_lock);
dout(" %p got snap_caps %s\n", inode,
ceph_cap_string(le32_to_cpu(info->cap.caps)));
ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
if (cap_fmode >= 0)
__ceph_get_fmode(ci, cap_fmode);
- spin_unlock(&ci->i_ceph_lock);
}
} else if (cap_fmode >= 0) {
- pr_warning("mds issued no caps on %llx.%llx\n",
+ pr_warn("mds issued no caps on %llx.%llx\n",
ceph_vinop(inode));
__ceph_get_fmode(ci, cap_fmode);
}
+ spin_unlock(&ci->i_ceph_lock);
+
+ if (wake)
+ wake_up_all(&ci->i_cap_wq);
+
+ /* queue truncate if we saw i_size decrease */
+ if (queue_trunc)
+ ceph_queue_vmtruncate(inode);
+
+ /* populate frag tree */
+ if (S_ISDIR(inode->i_mode))
+ ceph_fill_fragtree(inode, &info->fragtree, dirinfo);
/* update delegation info? */
if (dirinfo)
ceph_fill_dirfrag(inode, dirinfo);
err = 0;
-
out:
+ if (new_cap)
+ ceph_put_cap(mdsc, new_cap);
if (xattr_blob)
ceph_buffer_put(xattr_blob);
return err;
@@ -853,41 +950,6 @@ out_unlock:
}
/*
- * Set dentry's directory position based on the current dir's max, and
- * order it in d_subdirs, so that dcache_readdir behaves.
- *
- * Always called under directory's i_mutex.
- */
-static void ceph_set_dentry_offset(struct dentry *dn)
-{
- struct dentry *dir = dn->d_parent;
- struct inode *inode = dir->d_inode;
- struct ceph_inode_info *ci;
- struct ceph_dentry_info *di;
-
- BUG_ON(!inode);
-
- ci = ceph_inode(inode);
- di = ceph_dentry(dn);
-
- spin_lock(&ci->i_ceph_lock);
- if (!__ceph_dir_is_complete(ci)) {
- spin_unlock(&ci->i_ceph_lock);
- return;
- }
- di->offset = ceph_inode(inode)->i_max_offset++;
- spin_unlock(&ci->i_ceph_lock);
-
- spin_lock(&dir->d_lock);
- spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
- list_move(&dn->d_u.d_child, &dir->d_subdirs);
- dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
- dn->d_u.d_child.prev, dn->d_u.d_child.next);
- spin_unlock(&dn->d_lock);
- spin_unlock(&dir->d_lock);
-}
-
-/*
* splice a dentry to an inode.
* caller must hold directory i_mutex for this to be safe.
*
@@ -896,7 +958,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
* the caller) if we fail.
*/
static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
- bool *prehash, bool set_offset)
+ bool *prehash)
{
struct dentry *realdn;
@@ -928,8 +990,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
}
if ((!prehash || *prehash) && d_unhashed(dn))
d_rehash(dn);
- if (set_offset)
- ceph_set_dentry_offset(dn);
out:
return dn;
}
@@ -950,10 +1010,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
{
struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
struct inode *in = NULL;
- struct ceph_mds_reply_inode *ininfo;
struct ceph_vino vino;
struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
- int i = 0;
int err = 0;
dout("fill_trace %p is_dentry %d is_target %d\n", req,
@@ -1008,10 +1066,82 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
session, req->r_request_started, -1,
&req->r_caps_reservation);
if (err < 0)
- return err;
+ goto done;
} else {
WARN_ON_ONCE(1);
}
+
+ if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) {
+ struct qstr dname;
+ struct dentry *dn, *parent;
+
+ BUG_ON(!rinfo->head->is_target);
+ BUG_ON(req->r_dentry);
+
+ parent = d_find_any_alias(dir);
+ BUG_ON(!parent);
+
+ dname.name = rinfo->dname;
+ dname.len = rinfo->dname_len;
+ dname.hash = full_name_hash(dname.name, dname.len);
+ vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+ vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+retry_lookup:
+ dn = d_lookup(parent, &dname);
+ dout("d_lookup on parent=%p name=%.*s got %p\n",
+ parent, dname.len, dname.name, dn);
+
+ if (!dn) {
+ dn = d_alloc(parent, &dname);
+ dout("d_alloc %p '%.*s' = %p\n", parent,
+ dname.len, dname.name, dn);
+ if (dn == NULL) {
+ dput(parent);
+ err = -ENOMEM;
+ goto done;
+ }
+ err = ceph_init_dentry(dn);
+ if (err < 0) {
+ dput(dn);
+ dput(parent);
+ goto done;
+ }
+ } else if (dn->d_inode &&
+ (ceph_ino(dn->d_inode) != vino.ino ||
+ ceph_snap(dn->d_inode) != vino.snap)) {
+ dout(" dn %p points to wrong inode %p\n",
+ dn, dn->d_inode);
+ d_delete(dn);
+ dput(dn);
+ goto retry_lookup;
+ }
+
+ req->r_dentry = dn;
+ dput(parent);
+ }
+ }
+
+ if (rinfo->head->is_target) {
+ vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+ vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+
+ in = ceph_get_inode(sb, vino);
+ if (IS_ERR(in)) {
+ err = PTR_ERR(in);
+ goto done;
+ }
+ req->r_target_inode = in;
+
+ err = fill_inode(in, &rinfo->targeti, NULL,
+ session, req->r_request_started,
+ (!req->r_aborted && rinfo->head->result == 0) ?
+ req->r_fmode : -1,
+ &req->r_caps_reservation);
+ if (err < 0) {
+ pr_err("fill_inode badness %p %llx.%llx\n",
+ in, ceph_vinop(in));
+ goto done;
+ }
}
/*
@@ -1053,6 +1183,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
/* rename? */
if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
+ struct inode *olddir = req->r_old_dentry_dir;
+ BUG_ON(!olddir);
+
dout(" src %p '%.*s' dst %p '%.*s'\n",
req->r_old_dentry,
req->r_old_dentry->d_name.len,
@@ -1072,18 +1205,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(dn);
- /*
- * d_move() puts the renamed dentry at the end of
- * d_subdirs. We need to assign it an appropriate
- * directory offset so we can behave when dir is
- * complete.
- */
- ceph_set_dentry_offset(req->r_old_dentry);
+ /* d_move screws up sibling dentries' offsets */
+ ceph_dir_clear_complete(dir);
+ ceph_dir_clear_complete(olddir);
+
dout("dn %p gets new offset %lld\n", req->r_old_dentry,
ceph_dentry(req->r_old_dentry)->offset);
dn = req->r_old_dentry; /* use old_dentry */
- in = dn->d_inode;
}
/* null dentry? */
@@ -1105,99 +1234,46 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
}
/* attach proper inode */
- ininfo = rinfo->targeti.in;
- vino.ino = le64_to_cpu(ininfo->ino);
- vino.snap = le64_to_cpu(ininfo->snapid);
- in = dn->d_inode;
- if (!in) {
- in = ceph_get_inode(sb, vino);
- if (IS_ERR(in)) {
- pr_err("fill_trace bad get_inode "
- "%llx.%llx\n", vino.ino, vino.snap);
- err = PTR_ERR(in);
- d_drop(dn);
- goto done;
- }
- dn = splice_dentry(dn, in, &have_lease, true);
+ if (!dn->d_inode) {
+ ceph_dir_clear_complete(dir);
+ ihold(in);
+ dn = splice_dentry(dn, in, &have_lease);
if (IS_ERR(dn)) {
err = PTR_ERR(dn);
goto done;
}
req->r_dentry = dn; /* may have spliced */
- ihold(in);
- } else if (ceph_ino(in) == vino.ino &&
- ceph_snap(in) == vino.snap) {
- ihold(in);
- } else {
+ } else if (dn->d_inode && dn->d_inode != in) {
dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
- dn, in, ceph_ino(in), ceph_snap(in),
- vino.ino, vino.snap);
+ dn, dn->d_inode, ceph_vinop(dn->d_inode),
+ ceph_vinop(in));
have_lease = false;
- in = NULL;
}
if (have_lease)
update_dentry_lease(dn, rinfo->dlease, session,
req->r_request_started);
dout(" final dn %p\n", dn);
- i++;
- } else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
- req->r_op == CEPH_MDS_OP_MKSNAP) && !req->r_aborted) {
+ } else if (!req->r_aborted &&
+ (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
+ req->r_op == CEPH_MDS_OP_MKSNAP)) {
struct dentry *dn = req->r_dentry;
+ struct inode *dir = req->r_locked_dir;
/* fill out a snapdir LOOKUPSNAP dentry */
BUG_ON(!dn);
- BUG_ON(!req->r_locked_dir);
- BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
- ininfo = rinfo->targeti.in;
- vino.ino = le64_to_cpu(ininfo->ino);
- vino.snap = le64_to_cpu(ininfo->snapid);
- in = ceph_get_inode(sb, vino);
- if (IS_ERR(in)) {
- pr_err("fill_inode get_inode badness %llx.%llx\n",
- vino.ino, vino.snap);
- err = PTR_ERR(in);
- d_delete(dn);
- goto done;
- }
+ BUG_ON(!dir);
+ BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
dout(" linking snapped dir %p to dn %p\n", in, dn);
- dn = splice_dentry(dn, in, NULL, true);
+ ceph_dir_clear_complete(dir);
+ ihold(in);
+ dn = splice_dentry(dn, in, NULL);
if (IS_ERR(dn)) {
err = PTR_ERR(dn);
goto done;
}
req->r_dentry = dn; /* may have spliced */
- ihold(in);
- rinfo->head->is_dentry = 1; /* fool notrace handlers */
- }
-
- if (rinfo->head->is_target) {
- vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
- vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
-
- if (in == NULL || ceph_ino(in) != vino.ino ||
- ceph_snap(in) != vino.snap) {
- in = ceph_get_inode(sb, vino);
- if (IS_ERR(in)) {
- err = PTR_ERR(in);
- goto done;
- }
- }
- req->r_target_inode = in;
-
- err = fill_inode(in,
- &rinfo->targeti, NULL,
- session, req->r_request_started,
- (le32_to_cpu(rinfo->head->result) == 0) ?
- req->r_fmode : -1,
- &req->r_caps_reservation);
- if (err < 0) {
- pr_err("fill_inode badness %p %llx.%llx\n",
- in, ceph_vinop(in));
- goto done;
- }
}
-
done:
dout("fill_trace done err=%d\n", err);
return err;
@@ -1247,11 +1323,23 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct qstr dname;
struct dentry *dn;
struct inode *in;
- int err = 0, i;
+ int err = 0, ret, i;
struct inode *snapdir = NULL;
struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
- u64 frag = le32_to_cpu(rhead->args.readdir.frag);
struct ceph_dentry_info *di;
+ u64 r_readdir_offset = req->r_readdir_offset;
+ u32 frag = le32_to_cpu(rhead->args.readdir.frag);
+
+ if (rinfo->dir_dir &&
+ le32_to_cpu(rinfo->dir_dir->frag) != frag) {
+ dout("readdir_prepopulate got new frag %x -> %x\n",
+ frag, le32_to_cpu(rinfo->dir_dir->frag));
+ frag = le32_to_cpu(rinfo->dir_dir->frag);
+ if (ceph_frag_is_leftmost(frag))
+ r_readdir_offset = 2;
+ else
+ r_readdir_offset = 0;
+ }
if (req->r_aborted)
return readdir_prepopulate_inodes_only(req, session);
@@ -1268,6 +1356,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
}
+ /* FIXME: release caps/leases if error occurs */
for (i = 0; i < rinfo->dir_nr; i++) {
struct ceph_vino vino;
@@ -1292,9 +1381,10 @@ retry_lookup:
err = -ENOMEM;
goto out;
}
- err = ceph_init_dentry(dn);
- if (err < 0) {
+ ret = ceph_init_dentry(dn);
+ if (ret < 0) {
dput(dn);
+ err = ret;
goto out;
}
} else if (dn->d_inode &&
@@ -1314,9 +1404,6 @@ retry_lookup:
spin_unlock(&parent->d_lock);
}
- di = dn->d_fsdata;
- di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
-
/* inode */
if (dn->d_inode) {
in = dn->d_inode;
@@ -1329,26 +1416,39 @@ retry_lookup:
err = PTR_ERR(in);
goto out;
}
- dn = splice_dentry(dn, in, NULL, false);
- if (IS_ERR(dn))
- dn = NULL;
}
if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
req->r_request_started, -1,
&req->r_caps_reservation) < 0) {
pr_err("fill_inode badness on %p\n", in);
+ if (!dn->d_inode)
+ iput(in);
+ d_drop(dn);
goto next_item;
}
- if (dn)
- update_dentry_lease(dn, rinfo->dir_dlease[i],
- req->r_session,
- req->r_request_started);
+
+ if (!dn->d_inode) {
+ dn = splice_dentry(dn, in, NULL);
+ if (IS_ERR(dn)) {
+ err = PTR_ERR(dn);
+ dn = NULL;
+ goto next_item;
+ }
+ }
+
+ di = dn->d_fsdata;
+ di->offset = ceph_make_fpos(frag, i + r_readdir_offset);
+
+ update_dentry_lease(dn, rinfo->dir_dlease[i],
+ req->r_session,
+ req->r_request_started);
next_item:
if (dn)
dput(dn);
}
- req->r_did_prepopulate = true;
+ if (err == 0)
+ req->r_did_prepopulate = true;
out:
if (snapdir) {
@@ -1437,7 +1537,8 @@ static void ceph_invalidate_work(struct work_struct *work)
dout("invalidate_pages %p gen %d revoking %d\n", inode,
ci->i_rdcache_gen, ci->i_rdcache_revoking);
if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
- /* nevermind! */
+ if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
+ check = 1;
spin_unlock(&ci->i_ceph_lock);
mutex_unlock(&ci->i_truncate_mutex);
goto out;
@@ -1445,7 +1546,7 @@ static void ceph_invalidate_work(struct work_struct *work)
orig_gen = ci->i_rdcache_gen;
spin_unlock(&ci->i_ceph_lock);
- truncate_inode_pages(inode->i_mapping, 0);
+ truncate_pagecache(inode, 0);
spin_lock(&ci->i_ceph_lock);
if (orig_gen == ci->i_rdcache_gen &&
@@ -1458,13 +1559,14 @@ static void ceph_invalidate_work(struct work_struct *work)
dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
inode, orig_gen, ci->i_rdcache_gen,
ci->i_rdcache_revoking);
+ if (__ceph_caps_revoking_other(ci, NULL, CEPH_CAP_FILE_CACHE))
+ check = 1;
}
spin_unlock(&ci->i_ceph_lock);
mutex_unlock(&ci->i_truncate_mutex);
-
+out:
if (check)
ceph_check_caps(ci, 0, NULL);
-out:
iput(inode);
}
@@ -1547,7 +1649,7 @@ retry:
ci->i_truncate_pending, to);
spin_unlock(&ci->i_ceph_lock);
- truncate_inode_pages(inode->i_mapping, to);
+ truncate_pagecache(inode, to);
spin_lock(&ci->i_ceph_lock);
if (to == ci->i_truncate_size) {
@@ -1594,7 +1696,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct inode *parent_inode;
const unsigned int ia_valid = attr->ia_valid;
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
@@ -1658,6 +1759,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
dirtied |= CEPH_CAP_AUTH_EXCL;
} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
attr->ia_mode != inode->i_mode) {
+ inode->i_mode = attr->ia_mode;
req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
mask |= CEPH_SETATTR_MODE;
release |= CEPH_CAP_AUTH_SHARED;
@@ -1773,15 +1875,19 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (inode_dirty_flags)
__mark_inode_dirty(inode, inode_dirty_flags);
+ if (ia_valid & ATTR_MODE) {
+ err = posix_acl_chmod(inode, attr->ia_mode);
+ if (err)
+ goto out_put;
+ }
+
if (mask) {
req->r_inode = inode;
ihold(inode);
req->r_inode_drop = release;
req->r_args.setattr.mask = cpu_to_le32(mask);
req->r_num_caps = 1;
- parent_inode = ceph_get_dentry_parent_inode(dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
}
dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
ceph_cap_string(dirtied), mask);
@@ -1792,6 +1898,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
return err;
out:
spin_unlock(&ci->i_ceph_lock);
+out_put:
ceph_mdsc_put_request(req);
return err;
}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 669622fd1ae..a822a6e5829 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,9 +1,8 @@
+#include <linux/ceph/ceph_debug.h>
#include <linux/in.h>
#include "super.h"
#include "mds_client.h"
-#include <linux/ceph/ceph_debug.h>
-
#include "ioctl.h"
@@ -64,7 +63,6 @@ static long __validate_layout(struct ceph_mds_client *mdsc,
static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
- struct inode *parent_inode;
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
struct ceph_ioctl_layout l;
@@ -111,6 +109,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
return PTR_ERR(req);
req->r_inode = inode;
ihold(inode);
+ req->r_num_caps = 1;
+
req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
req->r_args.setlayout.layout.fl_stripe_unit =
@@ -121,9 +121,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
cpu_to_le32(l.object_size);
req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
- parent_inode = ceph_get_dentry_parent_inode(file->f_dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
return err;
}
@@ -157,6 +155,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
return PTR_ERR(req);
req->r_inode = inode;
ihold(inode);
+ req->r_num_caps = 1;
req->r_args.setlayout.layout.fl_stripe_unit =
cpu_to_le32(l.stripe_unit);
@@ -183,6 +182,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc;
+ struct ceph_object_locator oloc;
+ struct ceph_object_id oid;
u64 len = 1, olen;
u64 tmp;
struct ceph_pg pgid;
@@ -211,8 +212,10 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
ceph_ino(inode), dl.object_no);
- r = ceph_calc_ceph_pg(&pgid, dl.object_name, osdc->osdmap,
- ceph_file_layout_pg_pool(ci->i_layout));
+ oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
+ ceph_oid_set_name(&oid, dl.object_name);
+
+ r = ceph_oloc_oid_to_pg(osdc->osdmap, &oloc, &oid, &pgid);
if (r < 0) {
up_read(&osdc->map_sem);
return r;
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ae6d14e82b0..fbc39c47bac 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -2,11 +2,31 @@
#include <linux/file.h>
#include <linux/namei.h>
+#include <linux/random.h>
#include "super.h"
#include "mds_client.h"
#include <linux/ceph/pagelist.h>
+static u64 lock_secret;
+
+static inline u64 secure_addr(void *addr)
+{
+ u64 v = lock_secret ^ (u64)(unsigned long)addr;
+ /*
+ * Set the most significant bit, so that MDS knows the 'owner'
+ * is sufficient to identify the owner of lock. (old code uses
+ * both 'owner' and 'pid')
+ */
+ v |= (1ULL << 63);
+ return v;
+}
+
+void __init ceph_flock_init(void)
+{
+ get_random_bytes(&lock_secret, sizeof(lock_secret));
+}
+
/**
* Implement fcntl and flock locking functions.
*/
@@ -14,17 +34,18 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
int cmd, u8 wait, struct file_lock *fl)
{
struct inode *inode = file_inode(file);
- struct ceph_mds_client *mdsc =
- ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_mds_request *req;
int err;
u64 length = 0;
+ u64 owner;
req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
if (IS_ERR(req))
return PTR_ERR(req);
req->r_inode = inode;
ihold(inode);
+ req->r_num_caps = 1;
/* mds requires start and length rather than start and end */
if (LLONG_MAX == fl->fl_end)
@@ -32,25 +53,24 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
else
length = fl->fl_end - fl->fl_start + 1;
- dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
- "length: %llu, wait: %d, type: %d", (int)lock_type,
- (int)operation, (u64)fl->fl_pid, fl->fl_start,
- length, wait, fl->fl_type);
+ owner = secure_addr(fl->fl_owner);
+
+ dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
+ "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type,
+ (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length,
+ wait, fl->fl_type);
req->r_args.filelock_change.rule = lock_type;
req->r_args.filelock_change.type = cmd;
+ req->r_args.filelock_change.owner = cpu_to_le64(owner);
req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
- /* This should be adjusted, but I'm not sure if
- namespaces actually get id numbers*/
- req->r_args.filelock_change.pid_namespace =
- cpu_to_le64((u64)(unsigned long)fl->fl_nspid);
req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
req->r_args.filelock_change.length = cpu_to_le64(length);
req->r_args.filelock_change.wait = wait;
err = ceph_mdsc_do_request(mdsc, inode, req);
- if ( operation == CEPH_MDS_OP_GETFILELOCK){
+ if (operation == CEPH_MDS_OP_GETFILELOCK) {
fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid);
if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
fl->fl_type = F_RDLCK;
@@ -87,14 +107,19 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
u8 wait = 0;
u16 op = CEPH_MDS_OP_SETFILELOCK;
- fl->fl_nspid = get_pid(task_tgid(current));
- dout("ceph_lock, fl_pid:%d", fl->fl_pid);
+ if (!(fl->fl_flags & FL_POSIX))
+ return -ENOLCK;
+ /* No mandatory locks */
+ if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
+ return -ENOLCK;
+
+ dout("ceph_lock, fl_owner: %p", fl->fl_owner);
/* set wait bit as appropriate, then make command as Ceph expects it*/
- if (F_SETLKW == cmd)
- wait = 1;
- if (F_GETLK == cmd)
+ if (IS_GETLK(cmd))
op = CEPH_MDS_OP_GETFILELOCK;
+ else if (IS_SETLKW(cmd))
+ wait = 1;
if (F_RDLCK == fl->fl_type)
lock_cmd = CEPH_LOCK_SHARED;
@@ -105,7 +130,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl);
if (!err) {
- if ( op != CEPH_MDS_OP_GETFILELOCK ){
+ if (op != CEPH_MDS_OP_GETFILELOCK) {
dout("mds locked, locking locally");
err = posix_lock_file(file, fl, NULL);
if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
@@ -131,20 +156,22 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
{
u8 lock_cmd;
int err;
- u8 wait = 1;
-
- fl->fl_nspid = get_pid(task_tgid(current));
- dout("ceph_flock, fl_pid:%d", fl->fl_pid);
-
- /* set wait bit, then clear it out of cmd*/
- if (cmd & LOCK_NB)
- wait = 0;
- cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN);
- /* set command sequence that Ceph wants to see:
- shared lock, exclusive lock, or unlock */
- if (LOCK_SH == cmd)
+ u8 wait = 0;
+
+ if (!(fl->fl_flags & FL_FLOCK))
+ return -ENOLCK;
+ /* No mandatory locks */
+ if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK)
+ return -ENOLCK;
+
+ dout("ceph_flock, fl_file: %p", fl->fl_file);
+
+ if (IS_SETLKW(cmd))
+ wait = 1;
+
+ if (F_RDLCK == fl->fl_type)
lock_cmd = CEPH_LOCK_SHARED;
- else if (LOCK_EX == cmd)
+ else if (F_WRLCK == fl->fl_type)
lock_cmd = CEPH_LOCK_EXCL;
else
lock_cmd = CEPH_LOCK_UNLOCK;
@@ -280,13 +307,11 @@ int lock_to_ceph_filelock(struct file_lock *lock,
struct ceph_filelock *cephlock)
{
int err = 0;
-
cephlock->start = cpu_to_le64(lock->fl_start);
cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
cephlock->client = cpu_to_le64(0);
- cephlock->pid = cpu_to_le64(lock->fl_pid);
- cephlock->pid_namespace =
- cpu_to_le64((u64)(unsigned long)lock->fl_nspid);
+ cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
+ cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
switch (lock->fl_type) {
case F_RDLCK:
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index b7bda5d9611..92a2548278f 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3,6 +3,7 @@
#include <linux/fs.h>
#include <linux/wait.h>
#include <linux/slab.h>
+#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -43,6 +44,7 @@
*/
struct ceph_reconnect_state {
+ int nr_caps;
struct ceph_pagelist *pagelist;
bool flock;
};
@@ -62,7 +64,7 @@ static const struct ceph_connection_operations mds_con_ops;
*/
static int parse_reply_info_in(void **p, void *end,
struct ceph_mds_reply_info_in *info,
- int features)
+ u64 features)
{
int err = -EIO;
@@ -97,7 +99,7 @@ bad:
*/
static int parse_reply_info_trace(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- int features)
+ u64 features)
{
int err;
@@ -144,7 +146,7 @@ out_bad:
*/
static int parse_reply_info_dir(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- int features)
+ u64 features)
{
u32 num, i = 0;
int err;
@@ -164,21 +166,18 @@ static int parse_reply_info_dir(void **p, void *end,
if (num == 0)
goto done;
- /* alloc large array */
- info->dir_nr = num;
- info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
- sizeof(*info->dir_dname) +
- sizeof(*info->dir_dname_len) +
- sizeof(*info->dir_dlease),
- GFP_NOFS);
- if (info->dir_in == NULL) {
- err = -ENOMEM;
- goto out_bad;
- }
+ BUG_ON(!info->dir_in);
info->dir_dname = (void *)(info->dir_in + num);
info->dir_dname_len = (void *)(info->dir_dname + num);
info->dir_dlease = (void *)(info->dir_dname_len + num);
+ if ((unsigned long)(info->dir_dlease + num) >
+ (unsigned long)info->dir_in + info->dir_buf_size) {
+ pr_err("dir contents are larger than expected\n");
+ WARN_ON(1);
+ goto bad;
+ }
+ info->dir_nr = num;
while (num) {
/* dentry */
ceph_decode_need(p, end, sizeof(u32)*2, bad);
@@ -216,7 +215,7 @@ out_bad:
*/
static int parse_reply_info_filelock(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- int features)
+ u64 features)
{
if (*p + sizeof(*info->filelock_reply) > end)
goto bad;
@@ -237,7 +236,7 @@ bad:
*/
static int parse_reply_info_create(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- int features)
+ u64 features)
{
if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
if (*p == end) {
@@ -261,7 +260,7 @@ bad:
*/
static int parse_reply_info_extra(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
- int features)
+ u64 features)
{
if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
return parse_reply_info_filelock(p, end, info, features);
@@ -279,7 +278,7 @@ static int parse_reply_info_extra(void **p, void *end,
*/
static int parse_reply_info(struct ceph_msg *msg,
struct ceph_mds_reply_info_parsed *info,
- int features)
+ u64 features)
{
void *p, *end;
u32 len;
@@ -326,7 +325,9 @@ out_bad:
static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
{
- kfree(info->dir_in);
+ if (!info->dir_in)
+ return;
+ free_pages((unsigned long)info->dir_in, get_order(info->dir_buf_size));
}
@@ -443,6 +444,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
INIT_LIST_HEAD(&s->s_waiting);
INIT_LIST_HEAD(&s->s_unsafe);
s->s_num_cap_releases = 0;
+ s->s_cap_reconnect = 0;
s->s_cap_iterator = NULL;
INIT_LIST_HEAD(&s->s_cap_releases);
INIT_LIST_HEAD(&s->s_cap_releases_done);
@@ -510,12 +512,11 @@ void ceph_mdsc_release_request(struct kref *kref)
struct ceph_mds_request *req = container_of(kref,
struct ceph_mds_request,
r_kref);
+ destroy_reply_info(&req->r_reply_info);
if (req->r_request)
ceph_msg_put(req->r_request);
- if (req->r_reply) {
+ if (req->r_reply)
ceph_msg_put(req->r_reply);
- destroy_reply_info(&req->r_reply_info);
- }
if (req->r_inode) {
ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
iput(req->r_inode);
@@ -526,7 +527,9 @@ void ceph_mdsc_release_request(struct kref *kref)
iput(req->r_target_inode);
if (req->r_dentry)
dput(req->r_dentry);
- if (req->r_old_dentry) {
+ if (req->r_old_dentry)
+ dput(req->r_old_dentry);
+ if (req->r_old_dentry_dir) {
/*
* track (and drop pins for) r_old_dentry_dir
* separately, since r_old_dentry's d_parent may have
@@ -535,7 +538,6 @@ void ceph_mdsc_release_request(struct kref *kref)
*/
ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
- dput(req->r_old_dentry);
iput(req->r_old_dentry_dir);
}
kfree(req->r_path1);
@@ -642,6 +644,8 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
req->r_unsafe_dir = NULL;
}
+ complete_all(&req->r_safe_completion);
+
ceph_mdsc_put_request(req);
}
@@ -709,14 +713,15 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
struct dentry *dn = get_nonsnap_parent(parent);
inode = dn->d_inode;
dout("__choose_mds using nonsnap parent %p\n", inode);
- } else if (req->r_dentry->d_inode) {
+ } else {
/* dentry target */
inode = req->r_dentry->d_inode;
- } else {
- /* dir + name */
- inode = dir;
- hash = ceph_dentry_hash(dir, req->r_dentry);
- is_hash = true;
+ if (!inode || mode == USE_AUTH_MDS) {
+ /* dir + name */
+ inode = dir;
+ hash = ceph_dentry_hash(dir, req->r_dentry);
+ is_hash = true;
+ }
}
}
@@ -842,35 +847,56 @@ static int __open_session(struct ceph_mds_client *mdsc,
*
* called under mdsc->mutex
*/
+static struct ceph_mds_session *
+__open_export_target_session(struct ceph_mds_client *mdsc, int target)
+{
+ struct ceph_mds_session *session;
+
+ session = __ceph_lookup_mds_session(mdsc, target);
+ if (!session) {
+ session = register_session(mdsc, target);
+ if (IS_ERR(session))
+ return session;
+ }
+ if (session->s_state == CEPH_MDS_SESSION_NEW ||
+ session->s_state == CEPH_MDS_SESSION_CLOSING)
+ __open_session(mdsc, session);
+
+ return session;
+}
+
+struct ceph_mds_session *
+ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target)
+{
+ struct ceph_mds_session *session;
+
+ dout("open_export_target_session to mds%d\n", target);
+
+ mutex_lock(&mdsc->mutex);
+ session = __open_export_target_session(mdsc, target);
+ mutex_unlock(&mdsc->mutex);
+
+ return session;
+}
+
static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_mds_info *mi;
struct ceph_mds_session *ts;
int i, mds = session->s_mds;
- int target;
if (mds >= mdsc->mdsmap->m_max_mds)
return;
+
mi = &mdsc->mdsmap->m_info[mds];
dout("open_export_target_sessions for mds%d (%d targets)\n",
session->s_mds, mi->num_export_targets);
for (i = 0; i < mi->num_export_targets; i++) {
- target = mi->export_targets[i];
- ts = __ceph_lookup_mds_session(mdsc, target);
- if (!ts) {
- ts = register_session(mdsc, target);
- if (IS_ERR(ts))
- return;
- }
- if (session->s_state == CEPH_MDS_SESSION_NEW ||
- session->s_state == CEPH_MDS_SESSION_CLOSING)
- __open_session(mdsc, session);
- else
- dout(" mds%d target mds%d %p is %s\n", session->s_mds,
- i, ts, session_state_name(ts->s_state));
- ceph_put_mds_session(ts);
+ ts = __open_export_target_session(mdsc, mi->export_targets[i]);
+ if (!IS_ERR(ts))
+ ceph_put_mds_session(ts);
}
}
@@ -986,7 +1012,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
dout("removing cap %p, ci is %p, inode is %p\n",
cap, ci, &ci->vfs_inode);
spin_lock(&ci->i_ceph_lock);
- __ceph_remove_cap(cap);
+ __ceph_remove_cap(cap, false);
if (!__ceph_is_any_real_caps(ci)) {
struct ceph_mds_client *mdsc =
ceph_sb_to_client(inode->i_sb)->mdsc;
@@ -1132,6 +1158,21 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
return 0;
}
+static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session, u64 seq)
+{
+ struct ceph_msg *msg;
+
+ dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
+ session->s_mds, session_state_name(session->s_state), seq);
+ msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
+ if (!msg)
+ return -ENOMEM;
+ ceph_con_send(&session->s_con, msg);
+ return 0;
+}
+
+
/*
* Note new cap ttl, and any transition from stale -> not stale (fresh?).
*
@@ -1210,7 +1251,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
{
struct ceph_mds_session *session = arg;
struct ceph_inode_info *ci = ceph_inode(inode);
- int used, oissued, mine;
+ int used, wanted, oissued, mine;
if (session->s_trim_caps <= 0)
return -1;
@@ -1218,22 +1259,25 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
spin_lock(&ci->i_ceph_lock);
mine = cap->issued | cap->implemented;
used = __ceph_caps_used(ci);
+ wanted = __ceph_caps_file_wanted(ci);
oissued = __ceph_caps_issued_other(ci, cap);
- dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
+ dout("trim_caps_cb %p cap %p mine %s oissued %s used %s wanted %s\n",
inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
- ceph_cap_string(used));
- if (ci->i_dirty_caps)
- goto out; /* dirty caps */
- if ((used & ~oissued) & mine)
+ ceph_cap_string(used), ceph_cap_string(wanted));
+ if (cap == ci->i_auth_cap) {
+ if (ci->i_dirty_caps | ci->i_flushing_caps)
+ goto out;
+ if ((used | wanted) & CEPH_CAP_ANY_WR)
+ goto out;
+ }
+ if ((used | wanted) & ~oissued & mine)
goto out; /* we need these caps */
session->s_trim_caps--;
if (oissued) {
/* we aren't the only cap.. just remove us */
- __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
- cap->mseq, cap->issue_seq);
- __ceph_remove_cap(cap);
+ __ceph_remove_cap(cap, true);
} else {
/* try to drop referring dentries */
spin_unlock(&ci->i_ceph_lock);
@@ -1267,6 +1311,9 @@ static int trim_caps(struct ceph_mds_client *mdsc,
trim_caps - session->s_trim_caps);
session->s_trim_caps = 0;
}
+
+ ceph_add_cap_releases(mdsc, session);
+ ceph_send_cap_releases(mdsc, session);
return 0;
}
@@ -1416,17 +1463,19 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
unsigned num;
dout("discard_cap_releases mds%d\n", session->s_mds);
- spin_lock(&session->s_cap_lock);
- /* zero out the in-progress message */
- msg = list_first_entry(&session->s_cap_releases,
- struct ceph_msg, list_head);
- head = msg->front.iov_base;
- num = le32_to_cpu(head->num);
- dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
- head->num = cpu_to_le32(0);
- msg->front.iov_len = sizeof(*head);
- session->s_num_cap_releases += num;
+ if (!list_empty(&session->s_cap_releases)) {
+ /* zero out the in-progress message */
+ msg = list_first_entry(&session->s_cap_releases,
+ struct ceph_msg, list_head);
+ head = msg->front.iov_base;
+ num = le32_to_cpu(head->num);
+ dout("discard_cap_releases mds%d %p %u\n",
+ session->s_mds, msg, num);
+ head->num = cpu_to_le32(0);
+ msg->front.iov_len = sizeof(*head);
+ session->s_num_cap_releases += num;
+ }
/* requeue completed messages */
while (!list_empty(&session->s_cap_releases_done)) {
@@ -1443,14 +1492,49 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
msg->front.iov_len = sizeof(*head);
list_add(&msg->list_head, &session->s_cap_releases);
}
-
- spin_unlock(&session->s_cap_lock);
}
/*
* requests
*/
+int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
+ struct inode *dir)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
+ struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options;
+ size_t size = sizeof(*rinfo->dir_in) + sizeof(*rinfo->dir_dname_len) +
+ sizeof(*rinfo->dir_dname) + sizeof(*rinfo->dir_dlease);
+ int order, num_entries;
+
+ spin_lock(&ci->i_ceph_lock);
+ num_entries = ci->i_files + ci->i_subdirs;
+ spin_unlock(&ci->i_ceph_lock);
+ num_entries = max(num_entries, 1);
+ num_entries = min(num_entries, opt->max_readdir);
+
+ order = get_order(size * num_entries);
+ while (order >= 0) {
+ rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN,
+ order);
+ if (rinfo->dir_in)
+ break;
+ order--;
+ }
+ if (!rinfo->dir_in)
+ return -ENOMEM;
+
+ num_entries = (PAGE_SIZE << order) / size;
+ num_entries = min(num_entries, opt->max_readdir);
+
+ rinfo->dir_buf_size = PAGE_SIZE << order;
+ req->r_num_caps = num_entries + 1;
+ req->r_args.readdir.max_entries = cpu_to_le32(num_entries);
+ req->r_args.readdir.max_bytes = cpu_to_le32(opt->max_readdir_bytes);
+ return 0;
+}
+
/*
* Create an mds request.
*/
@@ -1474,6 +1558,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
init_completion(&req->r_safe_completion);
INIT_LIST_HEAD(&req->r_unsafe_item);
+ req->r_stamp = CURRENT_TIME;
+
req->r_op = op;
req->r_direct_mode = mode;
return req;
@@ -1699,7 +1785,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
}
len = sizeof(*head) +
- pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
+ pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
+ sizeof(struct timespec);
/* calculate (max) length for cap releases */
len += sizeof(struct ceph_mds_request_release) *
@@ -1716,6 +1803,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
goto out_free2;
}
+ msg->hdr.version = 2;
msg->hdr.tid = cpu_to_le64(req->r_tid);
head = msg->front.iov_base;
@@ -1752,6 +1840,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
head->num_releases = cpu_to_le16(releases);
+ /* time stamp */
+ ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp));
+
BUG_ON(p > end);
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
@@ -1875,8 +1966,11 @@ static int __do_request(struct ceph_mds_client *mdsc,
int mds = -1;
int err = -EAGAIN;
- if (req->r_err || req->r_got_result)
+ if (req->r_err || req->r_got_result) {
+ if (req->r_aborted)
+ __unregister_request(mdsc, req);
goto out;
+ }
if (req->r_timeout &&
time_after_eq(jiffies, req->r_started + req->r_timeout)) {
@@ -2009,7 +2103,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
if (req->r_locked_dir)
ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
- if (req->r_old_dentry)
+ if (req->r_old_dentry_dir)
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
@@ -2131,13 +2225,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
/* dup? */
if ((req->r_got_unsafe && !head->safe) ||
(req->r_got_safe && head->safe)) {
- pr_warning("got a dup %s reply on %llu from mds%d\n",
+ pr_warn("got a dup %s reply on %llu from mds%d\n",
head->safe ? "safe" : "unsafe", tid, mds);
mutex_unlock(&mdsc->mutex);
goto out;
}
if (req->r_got_safe && !head->safe) {
- pr_warning("got unsafe after safe on %llu from mds%d\n",
+ pr_warn("got unsafe after safe on %llu from mds%d\n",
tid, mds);
mutex_unlock(&mdsc->mutex);
goto out;
@@ -2154,26 +2248,16 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
*/
if (result == -ESTALE) {
dout("got ESTALE on request %llu", req->r_tid);
- if (!req->r_inode) {
- /* do nothing; not an authority problem */
- } else if (req->r_direct_mode != USE_AUTH_MDS) {
+ if (req->r_direct_mode != USE_AUTH_MDS) {
dout("not using auth, setting for that now");
req->r_direct_mode = USE_AUTH_MDS;
__do_request(mdsc, req);
mutex_unlock(&mdsc->mutex);
goto out;
} else {
- struct ceph_inode_info *ci = ceph_inode(req->r_inode);
- struct ceph_cap *cap = NULL;
-
- if (req->r_session)
- cap = ceph_get_cap_for_mds(ci,
- req->r_session->s_mds);
-
- dout("already using auth");
- if ((!cap || cap != ci->i_auth_cap) ||
- (cap->mseq != req->r_sent_on_mseq)) {
- dout("but cap changed, so resending");
+ int mds = __choose_mds(mdsc, req);
+ if (mds >= 0 && mds != req->r_session->s_mds) {
+ dout("but auth changed, so resending");
__do_request(mdsc, req);
mutex_unlock(&mdsc->mutex);
goto out;
@@ -2186,7 +2270,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
if (head->safe) {
req->r_got_safe = true;
__unregister_request(mdsc, req);
- complete_all(&req->r_safe_completion);
if (req->r_got_unsafe) {
/*
@@ -2238,8 +2321,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
if (err == 0) {
if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
- req->r_op == CEPH_MDS_OP_LSSNAP) &&
- rinfo->dir_nr)
+ req->r_op == CEPH_MDS_OP_LSSNAP))
ceph_readdir_prepopulate(req, req->r_session);
ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
}
@@ -2400,6 +2482,10 @@ static void handle_session(struct ceph_mds_session *session,
trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
break;
+ case CEPH_SESSION_FLUSHMSG:
+ send_flushmsg_ack(mdsc, session, seq);
+ break;
+
default:
pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
WARN_ON(1);
@@ -2490,6 +2576,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
cap->seq = 0; /* reset cap seq */
cap->issue_seq = 0; /* and issue_seq */
cap->mseq = 0; /* and migrate_seq */
+ cap->cap_gen = cap->session->s_cap_gen;
if (recon_state->flock) {
rec.v2.cap_id = cpu_to_le64(cap->cap_id);
@@ -2552,6 +2639,8 @@ encode_again:
} else {
err = ceph_pagelist_append(pagelist, &rec, reclen);
}
+
+ recon_state->nr_caps++;
out_free:
kfree(path);
out_dput:
@@ -2579,6 +2668,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
struct rb_node *p;
int mds = session->s_mds;
int err = -ENOMEM;
+ int s_nr_caps;
struct ceph_pagelist *pagelist;
struct ceph_reconnect_state recon_state;
@@ -2610,20 +2700,38 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
dout("session %p state %s\n", session,
session_state_name(session->s_state));
+ spin_lock(&session->s_gen_ttl_lock);
+ session->s_cap_gen++;
+ spin_unlock(&session->s_gen_ttl_lock);
+
+ spin_lock(&session->s_cap_lock);
+ /*
+ * notify __ceph_remove_cap() that we are composing cap reconnect.
+ * If a cap get released before being added to the cap reconnect,
+ * __ceph_remove_cap() should skip queuing cap release.
+ */
+ session->s_cap_reconnect = 1;
/* drop old cap expires; we're about to reestablish that state */
discard_cap_releases(mdsc, session);
+ spin_unlock(&session->s_cap_lock);
/* traverse this session's caps */
- err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
+ s_nr_caps = session->s_nr_caps;
+ err = ceph_pagelist_encode_32(pagelist, s_nr_caps);
if (err)
goto fail;
+ recon_state.nr_caps = 0;
recon_state.pagelist = pagelist;
recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
err = iterate_session_caps(session, encode_caps_cb, &recon_state);
if (err < 0)
goto fail;
+ spin_lock(&session->s_cap_lock);
+ session->s_cap_reconnect = 0;
+ spin_unlock(&session->s_cap_lock);
+
/*
* snaprealms. we provide mds with the ino, seq (version), and
* parent for all of our realms. If the mds has any newer info,
@@ -2646,11 +2754,18 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
if (recon_state.flock)
reply->hdr.version = cpu_to_le16(2);
- if (pagelist->length) {
- /* set up outbound data if we have any */
- reply->hdr.data_len = cpu_to_le32(pagelist->length);
- ceph_msg_data_add_pagelist(reply, pagelist);
+
+ /* raced with cap release? */
+ if (s_nr_caps != recon_state.nr_caps) {
+ struct page *page = list_first_entry(&pagelist->head,
+ struct page, lru);
+ __le32 *addr = kmap_atomic(page);
+ *addr = cpu_to_le32(recon_state.nr_caps);
+ kunmap_atomic(addr);
}
+
+ reply->hdr.data_len = cpu_to_le32(pagelist->length);
+ ceph_msg_data_add_pagelist(reply, pagelist);
ceph_con_send(&session->s_con, reply);
mutex_unlock(&session->s_mutex);
@@ -3417,7 +3532,7 @@ static void peer_reset(struct ceph_connection *con)
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
- pr_warning("mds%d closed our session\n", s->s_mds);
+ pr_warn("mds%d closed our session\n", s->s_mds);
send_mds_reconnect(mdsc, s);
}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index c2a19fbbe51..e00737cf523 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -67,6 +67,7 @@ struct ceph_mds_reply_info_parsed {
/* for readdir results */
struct {
struct ceph_mds_reply_dirfrag *dir_dir;
+ size_t dir_buf_size;
int dir_nr;
char **dir_dname;
u32 *dir_dname_len;
@@ -132,6 +133,7 @@ struct ceph_mds_session {
struct list_head s_caps; /* all caps issued by this session */
int s_nr_caps, s_trim_caps;
int s_num_cap_releases;
+ int s_cap_reconnect;
struct list_head s_cap_releases; /* waiting cap_release messages */
struct list_head s_cap_releases_done; /* ready to send */
struct ceph_cap *s_cap_iterator;
@@ -192,6 +194,7 @@ struct ceph_mds_request {
int r_fmode; /* file mode, if expecting cap */
kuid_t r_uid;
kgid_t r_gid;
+ struct timespec r_stamp;
/* for choosing which mds to send this request to */
int r_direct_mode;
@@ -345,7 +348,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
struct dentry *dn);
extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
-
+extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
+ struct inode *dir);
extern struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
@@ -382,6 +386,8 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
struct ceph_msg *msg);
+extern struct ceph_mds_session *
+ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target);
extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 132b64eeecd..261531e55e9 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -62,7 +62,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
ceph_decode_16_safe(p, end, version, bad);
if (version > 3) {
- pr_warning("got mdsmap version %d > 3, failing", version);
+ pr_warn("got mdsmap version %d > 3, failing", version);
goto bad;
}
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 89fa4a940a0..51cc23e4811 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -41,6 +41,8 @@ const char *ceph_session_op_name(int op)
case CEPH_SESSION_RENEWCAPS: return "renewcaps";
case CEPH_SESSION_STALE: return "stale";
case CEPH_SESSION_RECALL_STATE: return "recall_state";
+ case CEPH_SESSION_FLUSHMSG: return "flushmsg";
+ case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack";
}
return "???";
}
@@ -52,6 +54,7 @@ const char *ceph_mds_op_name(int op)
case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
+ case CEPH_MDS_OP_LOOKUPNAME: return "lookupname";
case CEPH_MDS_OP_GETATTR: return "getattr";
case CEPH_MDS_OP_SETXATTR: return "setxattr";
case CEPH_MDS_OP_SETATTR: return "setattr";
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 6a0951e4304..06150fd745a 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -144,7 +144,11 @@ enum {
Opt_ino32,
Opt_noino32,
Opt_fscache,
- Opt_nofscache
+ Opt_nofscache,
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ Opt_acl,
+#endif
+ Opt_noacl
};
static match_table_t fsopt_tokens = {
@@ -172,6 +176,10 @@ static match_table_t fsopt_tokens = {
{Opt_noino32, "noino32"},
{Opt_fscache, "fsc"},
{Opt_nofscache, "nofsc"},
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ {Opt_acl, "acl"},
+#endif
+ {Opt_noacl, "noacl"},
{-1, NULL}
};
@@ -271,6 +279,14 @@ static int parse_fsopt_token(char *c, void *private)
case Opt_nofscache:
fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
break;
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ case Opt_acl:
+ fsopt->sb_flags |= MS_POSIXACL;
+ break;
+#endif
+ case Opt_noacl:
+ fsopt->sb_flags &= ~MS_POSIXACL;
+ break;
default:
BUG_ON(token);
}
@@ -438,6 +454,13 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
else
seq_puts(m, ",nofsc");
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ if (fsopt->sb_flags & MS_POSIXACL)
+ seq_puts(m, ",acl");
+ else
+ seq_puts(m, ",noacl");
+#endif
+
if (fsopt->wsize)
seq_printf(m, ",wsize=%d", fsopt->wsize);
if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
@@ -490,10 +513,10 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
struct ceph_options *opt)
{
struct ceph_fs_client *fsc;
- const unsigned supported_features =
+ const u64 supported_features =
CEPH_FEATURE_FLOCK |
CEPH_FEATURE_DIRLAYOUTHASH;
- const unsigned required_features = 0;
+ const u64 required_features = 0;
int page_count;
size_t size;
int err = -ENOMEM;
@@ -686,6 +709,7 @@ static const struct super_operations ceph_super_ops = {
.alloc_inode = ceph_alloc_inode,
.destroy_inode = ceph_destroy_inode,
.write_inode = ceph_write_inode,
+ .drop_inode = ceph_drop_inode,
.sync_fs = ceph_sync_fs,
.put_super = ceph_put_super,
.show_options = ceph_show_options,
@@ -819,6 +843,7 @@ static int ceph_set_super(struct super_block *s, void *data)
s->s_flags = fsc->mount_options->sb_flags;
s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
+ s->s_xattr = ceph_xattr_handlers;
s->s_fs_info = fsc;
fsc->sb = s;
@@ -906,6 +931,10 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
struct ceph_options *opt = NULL;
dout("ceph_mount\n");
+
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ flags |= MS_POSIXACL;
+#endif
err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
if (err < 0) {
res = ERR_PTR(err);
@@ -997,6 +1026,7 @@ static int __init init_ceph(void)
if (ret)
goto out;
+ ceph_flock_init();
ceph_xattr_init();
ret = register_filesystem(&ceph_fs_type);
if (ret)
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 6014b0a3c40..12b20744e38 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -13,6 +13,7 @@
#include <linux/wait.h>
#include <linux/writeback.h>
#include <linux/slab.h>
+#include <linux/posix_acl.h>
#include <linux/ceph/libceph.h>
@@ -265,7 +266,6 @@ struct ceph_inode_info {
struct timespec i_rctime;
u64 i_rbytes, i_rfiles, i_rsubdirs;
u64 i_files, i_subdirs;
- u64 i_max_offset; /* largest readdir offset, set with complete dir */
struct rb_root i_fragtree;
struct mutex i_fragtree_mutex;
@@ -287,9 +287,6 @@ struct ceph_inode_info {
unsigned long i_hold_caps_min; /* jiffies */
unsigned long i_hold_caps_max; /* jiffies */
struct list_head i_cap_delay_list; /* for delayed cap release to mds */
- int i_cap_exporting_mds; /* to handle cap migration between */
- unsigned i_cap_exporting_mseq; /* mds's. */
- unsigned i_cap_exporting_issued;
struct ceph_cap_reservation i_cap_migration_resv;
struct list_head i_cap_snaps; /* snapped state pending flush to mds */
struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
@@ -335,7 +332,6 @@ struct ceph_inode_info {
u32 i_fscache_gen; /* sequence, for delayed fscache validate */
struct work_struct i_revalidate_work;
#endif
-
struct inode vfs_inode; /* at end */
};
@@ -529,6 +525,8 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
}
extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
+extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
+ struct ceph_cap *ocap, int mask);
extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
extern int __ceph_caps_used(struct ceph_inode_info *ci);
@@ -577,7 +575,7 @@ struct ceph_file_info {
/* readdir: position within a frag */
unsigned offset; /* offset of last chunk, adjusted for . and .. */
- u64 next_offset; /* offset of next chunk (last_name's + 1) */
+ unsigned next_offset; /* offset of next chunk (last_name's + 1) */
char *last_name; /* last entry in previous chunk */
struct dentry *dentry; /* next dentry (for dcache readdir) */
int dir_release_count;
@@ -691,6 +689,7 @@ extern const struct inode_operations ceph_file_iops;
extern struct inode *ceph_alloc_inode(struct super_block *sb);
extern void ceph_destroy_inode(struct inode *inode);
+extern int ceph_drop_inode(struct inode *inode);
extern struct inode *ceph_get_inode(struct super_block *sb,
struct ceph_vino vino);
@@ -724,6 +723,9 @@ extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
/* xattr.c */
extern int ceph_setxattr(struct dentry *, const char *, const void *,
size_t, int);
+int __ceph_setxattr(struct dentry *, const char *, const void *, size_t, int);
+ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
+int __ceph_removexattr(struct dentry *, const char *);
extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
extern int ceph_removexattr(struct dentry *, const char *);
@@ -732,24 +734,57 @@ extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
extern void __init ceph_xattr_init(void);
extern void ceph_xattr_exit(void);
+/* acl.c */
+extern const struct xattr_handler *ceph_xattr_handlers[];
+
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+
+struct posix_acl *ceph_get_acl(struct inode *, int);
+int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int ceph_init_acl(struct dentry *, struct inode *, struct inode *);
+
+static inline void ceph_forget_all_cached_acls(struct inode *inode)
+{
+ forget_all_cached_acls(inode);
+}
+
+#else
+
+#define ceph_get_acl NULL
+#define ceph_set_acl NULL
+
+static inline int ceph_init_acl(struct dentry *dentry, struct inode *inode,
+ struct inode *dir)
+{
+ return 0;
+}
+
+static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode)
+{
+ return 0;
+}
+
+static inline void ceph_forget_all_cached_acls(struct inode *inode)
+{
+}
+
+#endif
+
/* caps.c */
extern const char *ceph_cap_string(int c);
extern void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_msg *msg);
-extern int ceph_add_cap(struct inode *inode,
- struct ceph_mds_session *session, u64 cap_id,
- int fmode, unsigned issued, unsigned wanted,
- unsigned cap, unsigned seq, u64 realmino, int flags,
- struct ceph_cap_reservation *caps_reservation);
-extern void __ceph_remove_cap(struct ceph_cap *cap);
-static inline void ceph_remove_cap(struct ceph_cap *cap)
-{
- spin_lock(&cap->ci->i_ceph_lock);
- __ceph_remove_cap(cap);
- spin_unlock(&cap->ci->i_ceph_lock);
-}
+extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
+ struct ceph_cap_reservation *ctx);
+extern void ceph_add_cap(struct inode *inode,
+ struct ceph_mds_session *session, u64 cap_id,
+ int fmode, unsigned issued, unsigned wanted,
+ unsigned cap, unsigned seq, u64 realmino, int flags,
+ struct ceph_cap **new_cap);
+extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
extern void ceph_put_cap(struct ceph_mds_client *mdsc,
struct ceph_cap *cap);
+extern int ceph_is_any_caps(struct inode *inode);
extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino,
u64 cap_id, u32 migrate_seq, u32 issue_seq);
@@ -836,6 +871,7 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
extern const struct export_operations ceph_export_ops;
/* locks.c */
+extern __init void ceph_flock_init(void);
extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index be661d8f532..c9c2b887381 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -6,16 +6,33 @@
#include <linux/ceph/decode.h>
#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
#include <linux/slab.h>
#define XATTR_CEPH_PREFIX "ceph."
#define XATTR_CEPH_PREFIX_LEN (sizeof (XATTR_CEPH_PREFIX) - 1)
+static int __remove_xattr(struct ceph_inode_info *ci,
+ struct ceph_inode_xattr *xattr);
+
+/*
+ * List of handlers for synthetic system.* attributes. Other
+ * attributes are handled directly.
+ */
+const struct xattr_handler *ceph_xattr_handlers[] = {
+#ifdef CONFIG_CEPH_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+ NULL,
+};
+
static bool ceph_is_valid_xattr(const char *name)
{
return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
!strncmp(name, XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN) ||
+ !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
@@ -47,32 +64,48 @@ static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
}
static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
- size_t size)
+ size_t size)
{
int ret;
struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
const char *pool_name;
+ char buf[128];
dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
down_read(&osdc->map_sem);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
- if (pool_name)
- ret = snprintf(val, size,
- "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%s",
+ if (pool_name) {
+ size_t len = strlen(pool_name);
+ ret = snprintf(buf, sizeof(buf),
+ "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
(unsigned long long)ceph_file_layout_su(ci->i_layout),
(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
- pool_name);
- else
- ret = snprintf(val, size,
+ (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+ if (!size) {
+ ret += len;
+ } else if (ret + len > size) {
+ ret = -ERANGE;
+ } else {
+ memcpy(val, buf, ret);
+ memcpy(val + ret, pool_name, len);
+ ret += len;
+ }
+ } else {
+ ret = snprintf(buf, sizeof(buf),
"stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
(unsigned long long)ceph_file_layout_su(ci->i_layout),
(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
(unsigned long long)ceph_file_layout_object_size(ci->i_layout),
(unsigned long long)pool);
-
+ if (size) {
+ if (ret <= size)
+ memcpy(val, buf, ret);
+ else
+ ret = -ERANGE;
+ }
+ }
up_read(&osdc->map_sem);
return ret;
}
@@ -198,7 +231,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
.name_size = sizeof("ceph.dir.layout"),
.getxattr_cb = ceph_vxattrcb_layout,
.readonly = false,
- .hidden = false,
+ .hidden = true,
.exists_cb = ceph_vxattrcb_layout_exists,
},
XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
@@ -225,7 +258,7 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
.name_size = sizeof("ceph.file.layout"),
.getxattr_cb = ceph_vxattrcb_layout,
.readonly = false,
- .hidden = false,
+ .hidden = true,
.exists_cb = ceph_vxattrcb_layout_exists,
},
XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
@@ -305,8 +338,7 @@ static struct ceph_vxattr *ceph_match_vxattr(struct inode *inode,
static int __set_xattr(struct ceph_inode_info *ci,
const char *name, int name_len,
const char *val, int val_len,
- int dirty,
- int should_free_name, int should_free_val,
+ int flags, int update_xattr,
struct ceph_inode_xattr **newxattr)
{
struct rb_node **p;
@@ -335,12 +367,31 @@ static int __set_xattr(struct ceph_inode_info *ci,
xattr = NULL;
}
+ if (update_xattr) {
+ int err = 0;
+ if (xattr && (flags & XATTR_CREATE))
+ err = -EEXIST;
+ else if (!xattr && (flags & XATTR_REPLACE))
+ err = -ENODATA;
+ if (err) {
+ kfree(name);
+ kfree(val);
+ return err;
+ }
+ if (update_xattr < 0) {
+ if (xattr)
+ __remove_xattr(ci, xattr);
+ kfree(name);
+ return 0;
+ }
+ }
+
if (!xattr) {
new = 1;
xattr = *newxattr;
xattr->name = name;
xattr->name_len = name_len;
- xattr->should_free_name = should_free_name;
+ xattr->should_free_name = update_xattr;
ci->i_xattrs.count++;
dout("__set_xattr count=%d\n", ci->i_xattrs.count);
@@ -350,7 +401,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
if (xattr->should_free_val)
kfree((void *)xattr->val);
- if (should_free_name) {
+ if (update_xattr) {
kfree((void *)name);
name = xattr->name;
}
@@ -365,8 +416,8 @@ static int __set_xattr(struct ceph_inode_info *ci,
xattr->val = "";
xattr->val_len = val_len;
- xattr->dirty = dirty;
- xattr->should_free_val = (val && should_free_val);
+ xattr->dirty = update_xattr;
+ xattr->should_free_val = (val && update_xattr);
if (new) {
rb_link_node(&xattr->node, parent, p);
@@ -428,7 +479,7 @@ static int __remove_xattr(struct ceph_inode_info *ci,
struct ceph_inode_xattr *xattr)
{
if (!xattr)
- return -EOPNOTSUPP;
+ return -ENODATA;
rb_erase(&xattr->node, &ci->i_xattrs.index);
@@ -574,7 +625,7 @@ start:
p += len;
err = __set_xattr(ci, name, namelen, val, len,
- 0, 0, 0, &xattrs[numattr]);
+ 0, 0, &xattrs[numattr]);
if (err < 0)
goto bad;
@@ -663,10 +714,9 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
}
}
-ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
+ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
size_t size)
{
- struct inode *inode = dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
int err;
struct ceph_inode_xattr *xattr;
@@ -675,7 +725,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
if (!ceph_is_valid_xattr(name))
return -ENODATA;
-
/* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name);
if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
@@ -725,6 +774,15 @@ out:
return err;
}
+ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
+ size_t size)
+{
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_getxattr(dentry, name, value, size);
+
+ return __ceph_getxattr(dentry->d_inode, name, value, size);
+}
+
ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
{
struct inode *inode = dentry->d_inode;
@@ -800,7 +858,6 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
struct inode *inode = dentry->d_inode;
struct ceph_inode_info *ci = ceph_inode(inode);
- struct inode *parent_inode;
struct ceph_mds_request *req;
struct ceph_mds_client *mdsc = fsc->mdsc;
int err;
@@ -829,6 +886,9 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
dout("setxattr value=%.*s\n", (int)size, value);
+ if (!value)
+ flags |= CEPH_XATTR_REMOVE;
+
/* do request */
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
USE_AUTH_MDS);
@@ -848,9 +908,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
req->r_data_len = size;
dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
- parent_inode = ceph_get_dentry_parent_inode(dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
@@ -863,15 +921,15 @@ out:
return err;
}
-int ceph_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+int __ceph_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
struct inode *inode = dentry->d_inode;
struct ceph_vxattr *vxattr;
struct ceph_inode_info *ci = ceph_inode(inode);
int issued;
int err;
- int dirty;
+ int dirty = 0;
int name_len = strlen(name);
int val_len = size;
char *newname = NULL;
@@ -879,9 +937,6 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
struct ceph_inode_xattr *xattr = NULL;
int required_blob_size;
- if (ceph_snap(inode) != CEPH_NOSNAP)
- return -EROFS;
-
if (!ceph_is_valid_xattr(name))
return -EOPNOTSUPP;
@@ -935,12 +990,14 @@ retry:
goto retry;
}
- err = __set_xattr(ci, newname, name_len, newval,
- val_len, 1, 1, 1, &xattr);
+ err = __set_xattr(ci, newname, name_len, newval, val_len,
+ flags, value ? 1 : -1, &xattr);
- dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
- ci->i_xattrs.dirty = true;
- inode->i_ctime = CURRENT_TIME;
+ if (!err) {
+ dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
+ ci->i_xattrs.dirty = true;
+ inode->i_ctime = CURRENT_TIME;
+ }
spin_unlock(&ci->i_ceph_lock);
if (dirty)
@@ -958,12 +1015,23 @@ out:
return err;
}
+int ceph_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_setxattr(dentry, name, value, size, flags);
+
+ return __ceph_setxattr(dentry, name, value, size, flags);
+}
+
static int ceph_send_removexattr(struct dentry *dentry, const char *name)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = dentry->d_inode;
- struct inode *parent_inode;
struct ceph_mds_request *req;
int err;
@@ -977,14 +1045,12 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name)
req->r_num_caps = 1;
req->r_path2 = kstrdup(name, GFP_NOFS);
- parent_inode = ceph_get_dentry_parent_inode(dentry);
- err = ceph_mdsc_do_request(mdsc, parent_inode, req);
- iput(parent_inode);
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
ceph_mdsc_put_request(req);
return err;
}
-int ceph_removexattr(struct dentry *dentry, const char *name)
+int __ceph_removexattr(struct dentry *dentry, const char *name)
{
struct inode *inode = dentry->d_inode;
struct ceph_vxattr *vxattr;
@@ -994,9 +1060,6 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
int required_blob_size;
int dirty;
- if (ceph_snap(inode) != CEPH_NOSNAP)
- return -EROFS;
-
if (!ceph_is_valid_xattr(name))
return -EOPNOTSUPP;
@@ -1053,3 +1116,13 @@ out:
return err;
}
+int ceph_removexattr(struct dentry *dentry, const char *name)
+{
+ if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
+ return -EROFS;
+
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_removexattr(dentry, name);
+
+ return __ceph_removexattr(dentry, name);
+}
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 94b5f60076d..f77f7702fab 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -576,7 +576,8 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data)
void __init chrdev_init(void)
{
cdev_map = kobj_map_init(base_probe, &chrdevs_lock);
- bdi_init(&directly_mappable_cdev_bdi);
+ if (bdi_init(&directly_mappable_cdev_bdi))
+ panic("Failed to init directly mappable cdev bdi");
}
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 37e4a72a7d1..9409fa10bd5 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -65,5 +65,6 @@ struct cifs_sb_info {
char *mountdata; /* options received at mount time or via DFS refs */
struct backing_dev_info bdi;
struct delayed_work prune_tlinks;
+ struct rcu_head rcu;
};
#endif /* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 0227b45ef00..15e9505aa35 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -290,7 +290,8 @@ int
cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
const struct nls_table *cp, int mapChars)
{
- int i, j, charlen;
+ int i, charlen;
+ int j = 0;
char src_char;
__le16 dst_char;
wchar_t tmp;
@@ -298,12 +299,11 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
if (!mapChars)
return cifs_strtoUTF16(target, source, PATH_MAX, cp);
- for (i = 0, j = 0; i < srclen; j++) {
+ for (i = 0; i < srclen; j++) {
src_char = source[i];
charlen = 1;
switch (src_char) {
case 0:
- put_unaligned(0, &target[j]);
goto ctoUTF16_out;
case ':':
dst_char = cpu_to_le16(UNI_COLON);
@@ -350,6 +350,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
}
ctoUTF16_out:
+ put_unaligned(0, &target[j]); /* Null terminate target unicode string */
return j;
}
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 51f5e0ee723..7ff866dbb89 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -865,8 +865,8 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
return rc;
}
-static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
- __u16 fid, u32 *pacllen)
+struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
+ const struct cifs_fid *cifsfid, u32 *pacllen)
{
struct cifs_ntsd *pntsd = NULL;
unsigned int xid;
@@ -877,7 +877,8 @@ static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
return ERR_CAST(tlink);
xid = get_xid();
- rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), fid, &pntsd, pacllen);
+ rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), cifsfid->netfid, &pntsd,
+ pacllen);
free_xid(xid);
cifs_put_tlink(tlink);
@@ -895,9 +896,10 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
int oplock = 0;
unsigned int xid;
int rc, create_options = 0;
- __u16 fid;
struct cifs_tcon *tcon;
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
if (IS_ERR(tlink))
return ERR_CAST(tlink);
@@ -908,12 +910,19 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
if (backup_cred(cifs_sb))
create_options |= CREATE_OPEN_BACKUP_INTENT;
- rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, READ_CONTROL,
- create_options, &fid, &oplock, NULL, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = READ_CONTROL;
+ oparms.create_options = create_options;
+ oparms.disposition = FILE_OPEN;
+ oparms.path = path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = CIFS_open(xid, &oparms, &oplock, NULL);
if (!rc) {
- rc = CIFSSMBGetCIFSACL(xid, tcon, fid, &pntsd, pacllen);
- CIFSSMBClose(xid, tcon, fid);
+ rc = CIFSSMBGetCIFSACL(xid, tcon, fid.netfid, &pntsd, pacllen);
+ CIFSSMBClose(xid, tcon, fid.netfid);
}
cifs_put_tlink(tlink);
@@ -938,7 +947,7 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
if (!open_file)
return get_cifs_acl_by_path(cifs_sb, path, pacllen);
- pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->fid.netfid, pacllen);
+ pntsd = get_cifs_acl_by_fid(cifs_sb, &open_file->fid, pacllen);
cifsFileInfo_put(open_file);
return pntsd;
}
@@ -950,10 +959,11 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
int oplock = 0;
unsigned int xid;
int rc, access_flags, create_options = 0;
- __u16 fid;
struct cifs_tcon *tcon;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
if (IS_ERR(tlink))
return PTR_ERR(tlink);
@@ -969,18 +979,25 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
else
access_flags = WRITE_DAC;
- rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, access_flags,
- create_options, &fid, &oplock, NULL, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = access_flags;
+ oparms.create_options = create_options;
+ oparms.disposition = FILE_OPEN;
+ oparms.path = path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = CIFS_open(xid, &oparms, &oplock, NULL);
if (rc) {
cifs_dbg(VFS, "Unable to open file to set ACL\n");
goto out;
}
- rc = CIFSSMBSetCIFSACL(xid, tcon, fid, pnntsd, acllen, aclflag);
+ rc = CIFSSMBSetCIFSACL(xid, tcon, fid.netfid, pnntsd, acllen, aclflag);
cifs_dbg(NOISY, "SetCIFSACL rc = %d\n", rc);
- CIFSSMBClose(xid, tcon, fid);
+ CIFSSMBClose(xid, tcon, fid.netfid);
out:
free_xid(xid);
cifs_put_tlink(tlink);
@@ -990,19 +1007,31 @@ out:
/* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
int
cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
- struct inode *inode, const char *path, const __u16 *pfid)
+ struct inode *inode, const char *path,
+ const struct cifs_fid *pfid)
{
struct cifs_ntsd *pntsd = NULL;
u32 acllen = 0;
int rc = 0;
+ struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+ struct cifs_tcon *tcon;
cifs_dbg(NOISY, "converting ACL to mode for %s\n", path);
- if (pfid)
- pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen);
- else
- pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen);
+ if (IS_ERR(tlink))
+ return PTR_ERR(tlink);
+ tcon = tlink_tcon(tlink);
+ if (pfid && (tcon->ses->server->ops->get_acl_by_fid))
+ pntsd = tcon->ses->server->ops->get_acl_by_fid(cifs_sb, pfid,
+ &acllen);
+ else if (tcon->ses->server->ops->get_acl)
+ pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path,
+ &acllen);
+ else {
+ cifs_put_tlink(tlink);
+ return -EOPNOTSUPP;
+ }
/* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
if (IS_ERR(pntsd)) {
rc = PTR_ERR(pntsd);
@@ -1014,6 +1043,8 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
cifs_dbg(VFS, "parse sec desc failed rc = %d\n", rc);
}
+ cifs_put_tlink(tlink);
+
return rc;
}
@@ -1027,15 +1058,30 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
__u32 secdesclen = 0;
struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */
struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
+ struct cifs_tcon *tcon;
+
+ if (IS_ERR(tlink))
+ return PTR_ERR(tlink);
+ tcon = tlink_tcon(tlink);
cifs_dbg(NOISY, "set ACL from mode for %s\n", path);
/* Get the security descriptor */
- pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
+
+ if (tcon->ses->server->ops->get_acl == NULL) {
+ cifs_put_tlink(tlink);
+ return -EOPNOTSUPP;
+ }
+
+ pntsd = tcon->ses->server->ops->get_acl(cifs_sb, inode, path,
+ &secdesclen);
if (IS_ERR(pntsd)) {
rc = PTR_ERR(pntsd);
cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc);
- goto out;
+ cifs_put_tlink(tlink);
+ return rc;
}
/*
@@ -1048,6 +1094,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
pnntsd = kmalloc(secdesclen, GFP_KERNEL);
if (!pnntsd) {
kfree(pntsd);
+ cifs_put_tlink(tlink);
return -ENOMEM;
}
@@ -1056,14 +1103,18 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc);
+ if (tcon->ses->server->ops->set_acl == NULL)
+ rc = -EOPNOTSUPP;
+
if (!rc) {
/* Set the security descriptor */
- rc = set_cifs_acl(pnntsd, secdesclen, inode, path, aclflag);
+ rc = tcon->ses->server->ops->set_acl(pnntsd, secdesclen, inode,
+ path, aclflag);
cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc);
}
+ cifs_put_tlink(tlink);
kfree(pnntsd);
kfree(pntsd);
-out:
return rc;
}
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index fc6f4f3a1a9..4934347321d 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -548,7 +548,13 @@ static int
CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
{
int rc;
- unsigned int offset = CIFS_SESS_KEY_SIZE + 8;
+ struct ntlmv2_resp *ntlmv2 = (struct ntlmv2_resp *)
+ (ses->auth_key.response + CIFS_SESS_KEY_SIZE);
+ unsigned int hash_len;
+
+ /* The MD5 hash starts at challenge_key.key */
+ hash_len = ses->auth_key.len - (CIFS_SESS_KEY_SIZE +
+ offsetof(struct ntlmv2_resp, challenge.key[0]));
if (!ses->server->secmech.sdeschmacmd5) {
cifs_dbg(VFS, "%s: can't generate ntlmv2 hash\n", __func__);
@@ -556,7 +562,7 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
}
rc = crypto_shash_setkey(ses->server->secmech.hmacmd5,
- ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+ ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
if (rc) {
cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n",
__func__);
@@ -570,20 +576,21 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash)
}
if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED)
- memcpy(ses->auth_key.response + offset,
- ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+ memcpy(ntlmv2->challenge.key,
+ ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
else
- memcpy(ses->auth_key.response + offset,
- ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
+ memcpy(ntlmv2->challenge.key,
+ ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE);
rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
- ses->auth_key.response + offset, ses->auth_key.len - offset);
+ ntlmv2->challenge.key, hash_len);
if (rc) {
cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
return rc;
}
+ /* Note that the MD5 digest over writes anon.challenge_key.key */
rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
- ses->auth_key.response + CIFS_SESS_KEY_SIZE);
+ ntlmv2->ntlmv2_hash);
if (rc)
cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
@@ -627,7 +634,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
int rc;
int baselen;
unsigned int tilen;
- struct ntlmv2_resp *buf;
+ struct ntlmv2_resp *ntlmv2;
char ntlmv2_hash[16];
unsigned char *tiblob = NULL; /* target info blob */
@@ -660,13 +667,14 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
}
ses->auth_key.len += baselen;
- buf = (struct ntlmv2_resp *)
+ ntlmv2 = (struct ntlmv2_resp *)
(ses->auth_key.response + CIFS_SESS_KEY_SIZE);
- buf->blob_signature = cpu_to_le32(0x00000101);
- buf->reserved = 0;
- buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
- get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
- buf->reserved2 = 0;
+ ntlmv2->blob_signature = cpu_to_le32(0x00000101);
+ ntlmv2->reserved = 0;
+ /* Must be within 5 minutes of the server */
+ ntlmv2->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+ get_random_bytes(&ntlmv2->client_chal, sizeof(ntlmv2->client_chal));
+ ntlmv2->reserved2 = 0;
memcpy(ses->auth_key.response + baselen, tiblob, tilen);
@@ -706,7 +714,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
}
rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
- ses->auth_key.response + CIFS_SESS_KEY_SIZE,
+ ntlmv2->ntlmv2_hash,
CIFS_HMAC_MD5_HASH_SIZE);
if (rc) {
cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 801975c34cf..88839806742 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -87,10 +87,6 @@ extern mempool_t *cifs_mid_poolp;
struct workqueue_struct *cifsiod_wq;
-#ifdef CONFIG_CIFS_SMB2
-__u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE];
-#endif
-
/*
* Bumps refcount for cifs super block.
* Note that it should be only called if a referece to VFS super block is
@@ -120,14 +116,16 @@ cifs_read_super(struct super_block *sb)
{
struct inode *inode;
struct cifs_sb_info *cifs_sb;
+ struct cifs_tcon *tcon;
int rc = 0;
cifs_sb = CIFS_SB(sb);
+ tcon = cifs_sb_master_tcon(cifs_sb);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL)
sb->s_flags |= MS_POSIXACL;
- if (cifs_sb_master_tcon(cifs_sb)->ses->capabilities & CAP_LARGE_FILES)
+ if (tcon->ses->capabilities & tcon->ses->server->vals->cap_large_files)
sb->s_maxbytes = MAX_LFS_FILESIZE;
else
sb->s_maxbytes = MAX_NON_LFS;
@@ -147,7 +145,7 @@ cifs_read_super(struct super_block *sb)
goto out_no_root;
}
- if (cifs_sb_master_tcon(cifs_sb)->nocase)
+ if (tcon->nocase)
sb->s_d_op = &cifs_ci_dentry_ops;
else
sb->s_d_op = &cifs_dentry_ops;
@@ -249,8 +247,9 @@ cifs_alloc_inode(struct super_block *sb)
* server, can not assume caching of file data or metadata.
*/
cifs_set_oplock_level(cifs_inode, 0);
- cifs_inode->delete_pending = false;
- cifs_inode->invalid_mapping = false;
+ cifs_inode->flags = 0;
+ spin_lock_init(&cifs_inode->writers_lock);
+ cifs_inode->writers = 0;
cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
cifs_inode->server_eof = 0;
cifs_inode->uniqueid = 0;
@@ -284,7 +283,7 @@ cifs_destroy_inode(struct inode *inode)
static void
cifs_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
cifs_fscache_release_inode_cookie(inode);
}
@@ -295,7 +294,7 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
struct sockaddr_in *sa = (struct sockaddr_in *) &server->dstaddr;
struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *) &server->dstaddr;
- seq_printf(s, ",addr=");
+ seq_puts(s, ",addr=");
switch (server->dstaddr.ss_family) {
case AF_INET:
@@ -307,7 +306,7 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
seq_printf(s, "%%%u", sa6->sin6_scope_id);
break;
default:
- seq_printf(s, "(unknown)");
+ seq_puts(s, "(unknown)");
}
}
@@ -317,45 +316,45 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
if (ses->sectype == Unspecified)
return;
- seq_printf(s, ",sec=");
+ seq_puts(s, ",sec=");
switch (ses->sectype) {
case LANMAN:
- seq_printf(s, "lanman");
+ seq_puts(s, "lanman");
break;
case NTLMv2:
- seq_printf(s, "ntlmv2");
+ seq_puts(s, "ntlmv2");
break;
case NTLM:
- seq_printf(s, "ntlm");
+ seq_puts(s, "ntlm");
break;
case Kerberos:
- seq_printf(s, "krb5");
+ seq_puts(s, "krb5");
break;
case RawNTLMSSP:
- seq_printf(s, "ntlmssp");
+ seq_puts(s, "ntlmssp");
break;
default:
/* shouldn't ever happen */
- seq_printf(s, "unknown");
+ seq_puts(s, "unknown");
break;
}
if (ses->sign)
- seq_printf(s, "i");
+ seq_puts(s, "i");
}
static void
cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb)
{
- seq_printf(s, ",cache=");
+ seq_puts(s, ",cache=");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
- seq_printf(s, "strict");
+ seq_puts(s, "strict");
else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
- seq_printf(s, "none");
+ seq_puts(s, "none");
else
- seq_printf(s, "loose");
+ seq_puts(s, "loose");
}
static void
@@ -388,7 +387,7 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
cifs_show_cache_flavor(s, cifs_sb);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
- seq_printf(s, ",multiuser");
+ seq_puts(s, ",multiuser");
else if (tcon->ses->user_name)
seq_printf(s, ",username=%s", tcon->ses->user_name);
@@ -414,16 +413,16 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",uid=%u",
from_kuid_munged(&init_user_ns, cifs_sb->mnt_uid));
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
- seq_printf(s, ",forceuid");
+ seq_puts(s, ",forceuid");
else
- seq_printf(s, ",noforceuid");
+ seq_puts(s, ",noforceuid");
seq_printf(s, ",gid=%u",
from_kgid_munged(&init_user_ns, cifs_sb->mnt_gid));
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
- seq_printf(s, ",forcegid");
+ seq_puts(s, ",forcegid");
else
- seq_printf(s, ",noforcegid");
+ seq_puts(s, ",noforcegid");
cifs_show_address(s, tcon->ses->server);
@@ -435,47 +434,47 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
cifs_show_nls(s, cifs_sb->local_nls);
if (tcon->seal)
- seq_printf(s, ",seal");
+ seq_puts(s, ",seal");
if (tcon->nocase)
- seq_printf(s, ",nocase");
+ seq_puts(s, ",nocase");
if (tcon->retry)
- seq_printf(s, ",hard");
+ seq_puts(s, ",hard");
if (tcon->unix_ext)
- seq_printf(s, ",unix");
+ seq_puts(s, ",unix");
else
- seq_printf(s, ",nounix");
+ seq_puts(s, ",nounix");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
- seq_printf(s, ",posixpaths");
+ seq_puts(s, ",posixpaths");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
- seq_printf(s, ",setuids");
+ seq_puts(s, ",setuids");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
- seq_printf(s, ",serverino");
+ seq_puts(s, ",serverino");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
- seq_printf(s, ",rwpidforward");
+ seq_puts(s, ",rwpidforward");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL)
- seq_printf(s, ",forcemand");
+ seq_puts(s, ",forcemand");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
- seq_printf(s, ",nouser_xattr");
+ seq_puts(s, ",nouser_xattr");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
- seq_printf(s, ",mapchars");
+ seq_puts(s, ",mapchars");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
- seq_printf(s, ",sfu");
+ seq_puts(s, ",sfu");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
- seq_printf(s, ",nobrl");
+ seq_puts(s, ",nobrl");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
- seq_printf(s, ",cifsacl");
+ seq_puts(s, ",cifsacl");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
- seq_printf(s, ",dynperm");
+ seq_puts(s, ",dynperm");
if (root->d_sb->s_flags & MS_POSIXACL)
- seq_printf(s, ",acl");
+ seq_puts(s, ",acl");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
- seq_printf(s, ",mfsymlinks");
+ seq_puts(s, ",mfsymlinks");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE)
- seq_printf(s, ",fsc");
+ seq_puts(s, ",fsc");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)
- seq_printf(s, ",nostrictsync");
+ seq_puts(s, ",nostrictsync");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
- seq_printf(s, ",noperm");
+ seq_puts(s, ",noperm");
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID)
seq_printf(s, ",backupuid=%u",
from_kuid_munged(&init_user_ns,
@@ -539,6 +538,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root)
static int cifs_remount(struct super_block *sb, int *flags, char *data)
{
+ sync_filesystem(sb);
*flags |= MS_NODIRATIME;
return 0;
}
@@ -725,23 +725,42 @@ out_nls:
goto out;
}
-static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t
+cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ ssize_t rc;
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ rc = cifs_revalidate_mapping(inode);
+ if (rc)
+ return rc;
+
+ return generic_file_read_iter(iocb, iter);
+}
+
+static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
+ struct cifsInodeInfo *cinode = CIFS_I(inode);
ssize_t written;
int rc;
- written = generic_file_aio_write(iocb, iov, nr_segs, pos);
+ written = cifs_get_writer(cinode);
+ if (written)
+ return written;
+
+ written = generic_file_write_iter(iocb, from);
if (CIFS_CACHE_WRITE(CIFS_I(inode)))
- return written;
+ goto out;
rc = filemap_fdatawrite(inode->i_mapping);
if (rc)
- cifs_dbg(FYI, "cifs_file_aio_write: %d rc on %p inode\n",
+ cifs_dbg(FYI, "cifs_file_write_iter: %d rc on %p inode\n",
rc, inode);
+out:
+ cifs_put_writer(cinode);
return written;
}
@@ -847,7 +866,6 @@ const struct inode_operations cifs_file_inode_ops = {
/* revalidate:cifs_revalidate, */
.setattr = cifs_setattr,
.getattr = cifs_getattr, /* do we need this anymore? */
- .rename = cifs_rename,
.permission = cifs_permission,
#ifdef CONFIG_CIFS_XATTR
.setxattr = cifs_setxattr,
@@ -874,10 +892,10 @@ const struct inode_operations cifs_symlink_inode_ops = {
};
const struct file_operations cifs_file_ops = {
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = cifs_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = cifs_loose_read_iter,
+ .write_iter = cifs_file_write_iter,
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
@@ -893,10 +911,10 @@ const struct file_operations cifs_file_ops = {
};
const struct file_operations cifs_file_strict_ops = {
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = cifs_strict_readv,
- .aio_write = cifs_strict_writev,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = cifs_strict_readv,
+ .write_iter = cifs_strict_writev,
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
@@ -913,10 +931,10 @@ const struct file_operations cifs_file_strict_ops = {
const struct file_operations cifs_file_direct_ops = {
/* BB reevaluate whether they can be done with directio, no cache */
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = cifs_user_readv,
- .aio_write = cifs_user_writev,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = cifs_user_readv,
+ .write_iter = cifs_user_writev,
.open = cifs_open,
.release = cifs_close,
.lock = cifs_lock,
@@ -932,10 +950,10 @@ const struct file_operations cifs_file_direct_ops = {
};
const struct file_operations cifs_file_nobrl_ops = {
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = cifs_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = cifs_loose_read_iter,
+ .write_iter = cifs_file_write_iter,
.open = cifs_open,
.release = cifs_close,
.fsync = cifs_fsync,
@@ -950,10 +968,10 @@ const struct file_operations cifs_file_nobrl_ops = {
};
const struct file_operations cifs_file_strict_nobrl_ops = {
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = cifs_strict_readv,
- .aio_write = cifs_strict_writev,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = cifs_strict_readv,
+ .write_iter = cifs_strict_writev,
.open = cifs_open,
.release = cifs_close,
.fsync = cifs_strict_fsync,
@@ -969,10 +987,10 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
const struct file_operations cifs_file_direct_nobrl_ops = {
/* BB reevaluate whether they can be done with directio, no cache */
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = cifs_user_readv,
- .aio_write = cifs_user_writev,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = cifs_user_readv,
+ .write_iter = cifs_user_writev,
.open = cifs_open,
.release = cifs_close,
.fsync = cifs_fsync,
@@ -1003,7 +1021,7 @@ cifs_init_once(void *inode)
init_rwsem(&cifsi->lock_sem);
}
-static int
+static int __init
cifs_init_inodecache(void)
{
cifs_inode_cachep = kmem_cache_create("cifs_inode_cache",
@@ -1178,10 +1196,6 @@ init_cifs(void)
spin_lock_init(&cifs_file_list_lock);
spin_lock_init(&GlobalMid_Lock);
-#ifdef CONFIG_CIFS_SMB2
- get_random_bytes(cifs_client_guid, SMB2_CLIENT_GUID_SIZE);
-#endif
-
if (cifs_max_pending < 2) {
cifs_max_pending = 2;
cifs_dbg(FYI, "cifs_max_pending set to min of 2\n");
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 26a754f49ba..70f178a7c75 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -22,20 +22,28 @@
#ifndef _CIFSFS_H
#define _CIFSFS_H
+#include <linux/hash.h>
+
#define ROOT_I 2
/*
* ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down
- * so that it will fit.
+ * so that it will fit. We use hash_64 to convert the value to 31 bits, and
+ * then add 1, to ensure that we don't end up with a 0 as the value.
*/
+#if BITS_PER_LONG == 64
static inline ino_t
cifs_uniqueid_to_ino_t(u64 fileid)
{
- ino_t ino = (ino_t) fileid;
- if (sizeof(ino_t) < sizeof(u64))
- ino ^= fileid >> (sizeof(u64)-sizeof(ino_t)) * 8;
- return ino;
+ return (ino_t)fileid;
}
+#else
+static inline ino_t
+cifs_uniqueid_to_ino_t(u64 fileid)
+{
+ return (ino_t)hash_64(fileid, (sizeof(ino_t) * 8) - 1) + 1;
+}
+#endif
extern struct file_system_type cifs_fs_type;
extern const struct address_space_operations cifs_addr_ops;
@@ -67,6 +75,8 @@ extern int cifs_revalidate_dentry_attr(struct dentry *);
extern int cifs_revalidate_file(struct file *filp);
extern int cifs_revalidate_dentry(struct dentry *);
extern int cifs_invalidate_mapping(struct inode *inode);
+extern int cifs_revalidate_mapping(struct inode *inode);
+extern int cifs_zap_mapping(struct inode *inode);
extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
extern int cifs_setattr(struct dentry *, struct iattr *);
@@ -85,14 +95,10 @@ extern const struct file_operations cifs_file_strict_nobrl_ops;
extern int cifs_open(struct inode *inode, struct file *file);
extern int cifs_close(struct inode *inode, struct file *file);
extern int cifs_closedir(struct inode *inode, struct file *file);
-extern ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos);
-extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos);
-extern ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos);
-extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos);
+extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to);
+extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
+extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
+extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
extern int cifs_lock(struct file *, int, struct file_lock *);
extern int cifs_fsync(struct file *, loff_t, loff_t, int);
extern int cifs_strict_fsync(struct file *, loff_t, loff_t, int);
@@ -130,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.02"
+#define CIFS_VERSION "2.03"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 52b6f6c26bf..de6aed8c78e 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -228,6 +228,8 @@ struct smb_version_operations {
/* verify the message */
int (*check_message)(char *, unsigned int);
bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
+ void (*downgrade_oplock)(struct TCP_Server_Info *,
+ struct cifsInodeInfo *, bool);
/* process transaction2 response */
bool (*check_trans2)(struct mid_q_entry *, struct TCP_Server_Info *,
char *, int);
@@ -261,7 +263,7 @@ struct smb_version_operations {
/* query path data from the server */
int (*query_path_info)(const unsigned int, struct cifs_tcon *,
struct cifs_sb_info *, const char *,
- FILE_ALL_INFO *, bool *);
+ FILE_ALL_INFO *, bool *, bool *);
/* query file data from the server */
int (*query_file_info)(const unsigned int, struct cifs_tcon *,
struct cifs_fid *, FILE_ALL_INFO *);
@@ -278,6 +280,8 @@ struct smb_version_operations {
/* set attributes */
int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *,
const unsigned int);
+ int (*set_compression)(const unsigned int, struct cifs_tcon *,
+ struct cifsFileInfo *);
/* check if we can send an echo or nor */
bool (*can_echo)(struct TCP_Server_Info *);
/* send echo request */
@@ -321,7 +325,8 @@ struct smb_version_operations {
/* async read from the server */
int (*async_readv)(struct cifs_readdata *);
/* async write to the server */
- int (*async_writev)(struct cifs_writedata *);
+ int (*async_writev)(struct cifs_writedata *,
+ void (*release)(struct kref *));
/* sync read from the server */
int (*sync_read)(const unsigned int, struct cifsFileInfo *,
struct cifs_io_parms *, unsigned int *, char **,
@@ -368,8 +373,12 @@ struct smb_version_operations {
void (*new_lease_key)(struct cifs_fid *);
int (*generate_signingkey)(struct cifs_ses *);
int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *);
- int (*query_mf_symlink)(const unsigned char *, char *, unsigned int *,
- struct cifs_sb_info *, unsigned int);
+ int (*query_mf_symlink)(unsigned int, struct cifs_tcon *,
+ struct cifs_sb_info *, const unsigned char *,
+ char *, unsigned int *);
+ int (*create_mf_symlink)(unsigned int, struct cifs_tcon *,
+ struct cifs_sb_info *, const unsigned char *,
+ char *, unsigned int *);
/* if we can do cache read operations */
bool (*is_read_op)(__u32);
/* set oplock level for the inode */
@@ -379,6 +388,22 @@ struct smb_version_operations {
char * (*create_lease_buf)(u8 *, u8);
/* parse lease context buffer and return oplock/epoch info */
__u8 (*parse_lease_buf)(void *, unsigned int *);
+ int (*clone_range)(const unsigned int, struct cifsFileInfo *src_file,
+ struct cifsFileInfo *target_file, u64 src_off, u64 len,
+ u64 dest_off);
+ int (*validate_negotiate)(const unsigned int, struct cifs_tcon *);
+ ssize_t (*query_all_EAs)(const unsigned int, struct cifs_tcon *,
+ const unsigned char *, const unsigned char *, char *,
+ size_t, const struct nls_table *, int);
+ int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *,
+ const char *, const void *, const __u16,
+ const struct nls_table *, int);
+ struct cifs_ntsd * (*get_acl)(struct cifs_sb_info *, struct inode *,
+ const char *, u32 *);
+ struct cifs_ntsd * (*get_acl_by_fid)(struct cifs_sb_info *,
+ const struct cifs_fid *, u32 *);
+ int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *,
+ int);
};
struct smb_version_values {
@@ -490,7 +515,7 @@ struct cifs_mnt_data {
static inline unsigned int
get_rfc1002_length(void *buf)
{
- return be32_to_cpu(*((__be32 *)buf));
+ return be32_to_cpu(*((__be32 *)buf)) & 0xffffff;
}
static inline void
@@ -534,6 +559,7 @@ struct TCP_Server_Info {
int echo_credits; /* echo reserved slots */
int oplock_credits; /* oplock break reserved slots */
bool echoes:1; /* enable echoes */
+ __u8 client_guid[SMB2_CLIENT_GUID_SIZE]; /* Client GUID */
#endif
u16 dialect; /* dialect index that server chose */
bool oplocks:1; /* enable oplocks */
@@ -620,11 +646,34 @@ set_credits(struct TCP_Server_Info *server, const int val)
}
static inline __u64
-get_next_mid(struct TCP_Server_Info *server)
+get_next_mid64(struct TCP_Server_Info *server)
{
return server->ops->get_next_mid(server);
}
+static inline __le16
+get_next_mid(struct TCP_Server_Info *server)
+{
+ __u16 mid = get_next_mid64(server);
+ /*
+ * The value in the SMB header should be little endian for easy
+ * on-the-wire decoding.
+ */
+ return cpu_to_le16(mid);
+}
+
+static inline __u16
+get_mid(const struct smb_hdr *smb)
+{
+ return le16_to_cpu(smb->Mid);
+}
+
+static inline bool
+compare_mid(__u16 mid, const struct smb_hdr *smb)
+{
+ return mid == le16_to_cpu(smb->Mid);
+}
+
/*
* When the server supports very large reads and writes via POSIX extensions,
* we can allow up to 2^24-1, minus the size of a READ/WRITE_AND_X header, not
@@ -828,6 +877,11 @@ struct cifs_tcon {
__u32 maximal_access;
__u32 vol_serial_number;
__le64 vol_create_time;
+ __u32 ss_flags; /* sector size flags */
+ __u32 perf_sector_size; /* best sector size for perf */
+ __u32 max_chunks;
+ __u32 max_bytes_chunk;
+ __u32 max_bytes_copy;
#endif /* CONFIG_CIFS_SMB2 */
#ifdef CONFIG_CIFS_FSCACHE
u64 resource_id; /* server resource id */
@@ -1020,7 +1074,7 @@ struct cifs_writedata {
unsigned int pagesz;
unsigned int tailsz;
unsigned int nr_pages;
- struct page *pages[1];
+ struct page *pages[];
};
/*
@@ -1060,8 +1114,15 @@ struct cifsInodeInfo {
__u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
unsigned int oplock; /* oplock/lease level we have */
unsigned int epoch; /* used to track lease state changes */
- bool delete_pending; /* DELETE_ON_CLOSE is set */
- bool invalid_mapping; /* pagecache is invalid */
+#define CIFS_INODE_PENDING_OPLOCK_BREAK (0) /* oplock break in progress */
+#define CIFS_INODE_PENDING_WRITERS (1) /* Writes in progress */
+#define CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2 (2) /* Downgrade oplock to L2 */
+#define CIFS_INO_DELETE_PENDING (3) /* delete pending on server */
+#define CIFS_INO_INVALID_MAPPING (4) /* pagecache is invalid */
+#define CIFS_INO_LOCK (5) /* lock bit for synchronization */
+ unsigned long flags;
+ spinlock_t writers_lock;
+ unsigned int writers; /* Number of writers on this inode */
unsigned long time; /* jiffies of last update of inode */
u64 server_eof; /* current file size on server -- protected by i_lock */
u64 uniqueid; /* server inode number */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index a630475e421..33df36ef9d5 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -428,7 +428,7 @@ struct smb_hdr {
__u16 Tid;
__le16 Pid;
__u16 Uid;
- __u16 Mid;
+ __le16 Mid;
__u8 WordCount;
} __attribute__((packed));
@@ -697,7 +697,13 @@ struct ntlmssp2_name {
} __attribute__((packed));
struct ntlmv2_resp {
- char ntlmv2_hash[CIFS_ENCPWD_SIZE];
+ union {
+ char ntlmv2_hash[CIFS_ENCPWD_SIZE];
+ struct {
+ __u8 reserved[8];
+ __u8 key[CIFS_SERVER_CHALLENGE_SIZE];
+ } __attribute__((packed)) challenge;
+ } __attribute__((packed));
__le32 blob_signature;
__u32 reserved;
__le64 time;
@@ -1352,6 +1358,35 @@ typedef struct smb_com_transaction_ioctl_req {
__u8 Data[1];
} __attribute__((packed)) TRANSACT_IOCTL_REQ;
+typedef struct smb_com_transaction_compr_ioctl_req {
+ struct smb_hdr hdr; /* wct = 23 */
+ __u8 MaxSetupCount;
+ __u16 Reserved;
+ __le32 TotalParameterCount;
+ __le32 TotalDataCount;
+ __le32 MaxParameterCount;
+ __le32 MaxDataCount;
+ __le32 ParameterCount;
+ __le32 ParameterOffset;
+ __le32 DataCount;
+ __le32 DataOffset;
+ __u8 SetupCount; /* four setup words follow subcommand */
+ /* SNIA spec incorrectly included spurious pad here */
+ __le16 SubCommand; /* 2 = IOCTL/FSCTL */
+ __le32 FunctionCode;
+ __u16 Fid;
+ __u8 IsFsctl; /* 1 = File System Control 0 = device control (IOCTL) */
+ __u8 IsRootFlag; /* 1 = apply command to root of share (must be DFS) */
+ __le16 ByteCount;
+ __u8 Pad[3];
+ __le16 compression_state; /* See below for valid flags */
+} __attribute__((packed)) TRANSACT_COMPR_IOCTL_REQ;
+
+/* compression state flags */
+#define COMPRESSION_FORMAT_NONE 0x0000
+#define COMPRESSION_FORMAT_DEFAULT 0x0001
+#define COMPRESSION_FORMAT_LZNT1 0x0002
+
typedef struct smb_com_transaction_ioctl_rsp {
struct smb_hdr hdr; /* wct = 19 */
__u8 Reserved[3];
@@ -1491,15 +1526,30 @@ struct file_notify_information {
__u8 FileName[0];
} __attribute__((packed));
-struct reparse_data {
- __u32 ReparseTag;
- __u16 ReparseDataLength;
+/* For IO_REPARSE_TAG_SYMLINK */
+struct reparse_symlink_data {
+ __le32 ReparseTag;
+ __le16 ReparseDataLength;
__u16 Reserved;
- __u16 SubstituteNameOffset;
- __u16 SubstituteNameLength;
- __u16 PrintNameOffset;
- __u16 PrintNameLength;
- __u32 Flags;
+ __le16 SubstituteNameOffset;
+ __le16 SubstituteNameLength;
+ __le16 PrintNameOffset;
+ __le16 PrintNameLength;
+ __le32 Flags;
+ char PathBuffer[0];
+} __attribute__((packed));
+
+/* For IO_REPARSE_TAG_NFS */
+#define NFS_SPECFILE_LNK 0x00000000014B4E4C
+#define NFS_SPECFILE_CHR 0x0000000000524843
+#define NFS_SPECFILE_BLK 0x00000000004B4C42
+#define NFS_SPECFILE_FIFO 0x000000004F464946
+#define NFS_SPECFILE_SOCK 0x000000004B434F53
+struct reparse_posix_data {
+ __le32 ReparseTag;
+ __le16 ReparseDataLength;
+ __u16 Reserved;
+ __le64 InodeType; /* LNK, FIFO, CHR etc. */
char PathBuffer[0];
} __attribute__((packed));
@@ -2200,6 +2250,9 @@ typedef struct {
__le32 DeviceCharacteristics;
} __attribute__((packed)) FILE_SYSTEM_DEVICE_INFO; /* device info level 0x104 */
+/* minimum includes first three fields, and empty FS Name */
+#define MIN_FS_ATTR_INFO_SIZE 12
+
typedef struct {
__le32 Attributes;
__le32 MaxPathNameComponentLength;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index b5ec2a268f5..ca7980a1e30 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -127,6 +127,9 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
int offset);
extern void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
+extern int cifs_get_writer(struct cifsInodeInfo *cinode);
+extern void cifs_put_writer(struct cifsInodeInfo *cinode);
+extern void cifs_done_oplock_break(struct cifsInodeInfo *cinode);
extern int cifs_unlock_range(struct cifsFileInfo *cfile,
struct file_lock *flock, const unsigned int xid);
extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile);
@@ -151,7 +154,7 @@ extern struct inode *cifs_iget(struct super_block *sb,
extern int cifs_get_inode_info(struct inode **inode, const char *full_path,
FILE_ALL_INFO *data, struct super_block *sb,
- int xid, const __u16 *fid);
+ int xid, const struct cifs_fid *fid);
extern int cifs_get_inode_info_unix(struct inode **pinode,
const unsigned char *search_path,
struct super_block *sb, unsigned int xid);
@@ -162,11 +165,13 @@ extern int cifs_rename_pending_delete(const char *full_path,
const unsigned int xid);
extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb,
struct cifs_fattr *fattr, struct inode *inode,
- const char *path, const __u16 *pfid);
+ const char *path, const struct cifs_fid *pfid);
extern int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64,
kuid_t, kgid_t);
extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *,
const char *, u32 *);
+extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *,
+ const struct cifs_fid *, u32 *);
extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *,
const char *, int);
@@ -360,11 +365,10 @@ extern int CIFSSMBUnixQuerySymLink(const unsigned int xid,
extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
__u16 fid, char **symlinkinfo,
const struct nls_table *nls_codepage);
-extern int CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon,
- const char *fileName, const int disposition,
- const int access_flags, const int omode,
- __u16 *netfid, int *pOplock, FILE_ALL_INFO *,
- const struct nls_table *nls_codepage, int remap);
+extern int CIFSSMB_set_compression(const unsigned int xid,
+ struct cifs_tcon *tcon, __u16 fid);
+extern int CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms,
+ int *oplock, FILE_ALL_INFO *buf);
extern int SMBLegacyOpen(const unsigned int xid, struct cifs_tcon *tcon,
const char *fileName, const int disposition,
const int access_flags, const int omode,
@@ -474,10 +478,11 @@ extern int CIFSSMBSetPosixACL(const unsigned int xid, struct cifs_tcon *tcon,
extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
-extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
-extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
- const unsigned char *path,
- struct cifs_sb_info *cifs_sb, unsigned int xid);
+extern bool couldbe_mf_symlink(const struct cifs_fattr *fattr);
+extern int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ struct cifs_fattr *fattr,
+ const unsigned char *path);
extern int mdfour(unsigned char *, unsigned char *, int);
extern int E_md4hash(const unsigned char *passwd, unsigned char *p16,
const struct nls_table *codepage);
@@ -488,12 +493,18 @@ void cifs_readdata_release(struct kref *refcount);
int cifs_async_readv(struct cifs_readdata *rdata);
int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid);
-int cifs_async_writev(struct cifs_writedata *wdata);
+int cifs_async_writev(struct cifs_writedata *wdata,
+ void (*release)(struct kref *kref));
void cifs_writev_complete(struct work_struct *work);
struct cifs_writedata *cifs_writedata_alloc(unsigned int nr_pages,
work_func_t complete);
void cifs_writedata_release(struct kref *refcount);
-int open_query_close_cifs_symlink(const unsigned char *path, char *pbuf,
- unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb,
- unsigned int xid);
+int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const unsigned char *path, char *pbuf,
+ unsigned int *pbytes_read);
+int cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const unsigned char *path, char *pbuf,
+ unsigned int *pbytes_written);
#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 4baf35949b5..6ce4e0954b9 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1273,104 +1273,124 @@ OldOpenRetry:
}
int
-CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon,
- const char *fileName, const int openDisposition,
- const int access_flags, const int create_options, __u16 *netfid,
- int *pOplock, FILE_ALL_INFO *pfile_info,
- const struct nls_table *nls_codepage, int remap)
+CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, int *oplock,
+ FILE_ALL_INFO *buf)
{
int rc = -EACCES;
- OPEN_REQ *pSMB = NULL;
- OPEN_RSP *pSMBr = NULL;
+ OPEN_REQ *req = NULL;
+ OPEN_RSP *rsp = NULL;
int bytes_returned;
int name_len;
__u16 count;
+ struct cifs_sb_info *cifs_sb = oparms->cifs_sb;
+ struct cifs_tcon *tcon = oparms->tcon;
+ int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
+ const struct nls_table *nls = cifs_sb->local_nls;
+ int create_options = oparms->create_options;
+ int desired_access = oparms->desired_access;
+ int disposition = oparms->disposition;
+ const char *path = oparms->path;
openRetry:
- rc = smb_init(SMB_COM_NT_CREATE_ANDX, 24, tcon, (void **) &pSMB,
- (void **) &pSMBr);
+ rc = smb_init(SMB_COM_NT_CREATE_ANDX, 24, tcon, (void **)&req,
+ (void **)&rsp);
if (rc)
return rc;
- pSMB->AndXCommand = 0xFF; /* none */
+ /* no commands go after this */
+ req->AndXCommand = 0xFF;
- if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
- count = 1; /* account for one byte pad to word boundary */
- name_len =
- cifsConvertToUTF16((__le16 *) (pSMB->fileName + 1),
- fileName, PATH_MAX, nls_codepage, remap);
- name_len++; /* trailing null */
+ if (req->hdr.Flags2 & SMBFLG2_UNICODE) {
+ /* account for one byte pad to word boundary */
+ count = 1;
+ name_len = cifsConvertToUTF16((__le16 *)(req->fileName + 1),
+ path, PATH_MAX, nls, remap);
+ /* trailing null */
+ name_len++;
name_len *= 2;
- pSMB->NameLength = cpu_to_le16(name_len);
- } else { /* BB improve check for buffer overruns BB */
- count = 0; /* no pad */
- name_len = strnlen(fileName, PATH_MAX);
- name_len++; /* trailing null */
- pSMB->NameLength = cpu_to_le16(name_len);
- strncpy(pSMB->fileName, fileName, name_len);
+ req->NameLength = cpu_to_le16(name_len);
+ } else {
+ /* BB improve check for buffer overruns BB */
+ /* no pad */
+ count = 0;
+ name_len = strnlen(path, PATH_MAX);
+ /* trailing null */
+ name_len++;
+ req->NameLength = cpu_to_le16(name_len);
+ strncpy(req->fileName, path, name_len);
}
- if (*pOplock & REQ_OPLOCK)
- pSMB->OpenFlags = cpu_to_le32(REQ_OPLOCK);
- else if (*pOplock & REQ_BATCHOPLOCK)
- pSMB->OpenFlags = cpu_to_le32(REQ_BATCHOPLOCK);
- pSMB->DesiredAccess = cpu_to_le32(access_flags);
- pSMB->AllocationSize = 0;
- /* set file as system file if special file such
- as fifo and server expecting SFU style and
- no Unix extensions */
+
+ if (*oplock & REQ_OPLOCK)
+ req->OpenFlags = cpu_to_le32(REQ_OPLOCK);
+ else if (*oplock & REQ_BATCHOPLOCK)
+ req->OpenFlags = cpu_to_le32(REQ_BATCHOPLOCK);
+
+ req->DesiredAccess = cpu_to_le32(desired_access);
+ req->AllocationSize = 0;
+
+ /*
+ * Set file as system file if special file such as fifo and server
+ * expecting SFU style and no Unix extensions.
+ */
if (create_options & CREATE_OPTION_SPECIAL)
- pSMB->FileAttributes = cpu_to_le32(ATTR_SYSTEM);
+ req->FileAttributes = cpu_to_le32(ATTR_SYSTEM);
else
- pSMB->FileAttributes = cpu_to_le32(ATTR_NORMAL);
+ req->FileAttributes = cpu_to_le32(ATTR_NORMAL);
- /* XP does not handle ATTR_POSIX_SEMANTICS */
- /* but it helps speed up case sensitive checks for other
- servers such as Samba */
+ /*
+ * XP does not handle ATTR_POSIX_SEMANTICS but it helps speed up case
+ * sensitive checks for other servers such as Samba.
+ */
if (tcon->ses->capabilities & CAP_UNIX)
- pSMB->FileAttributes |= cpu_to_le32(ATTR_POSIX_SEMANTICS);
+ req->FileAttributes |= cpu_to_le32(ATTR_POSIX_SEMANTICS);
if (create_options & CREATE_OPTION_READONLY)
- pSMB->FileAttributes |= cpu_to_le32(ATTR_READONLY);
+ req->FileAttributes |= cpu_to_le32(ATTR_READONLY);
+
+ req->ShareAccess = cpu_to_le32(FILE_SHARE_ALL);
+ req->CreateDisposition = cpu_to_le32(disposition);
+ req->CreateOptions = cpu_to_le32(create_options & CREATE_OPTIONS_MASK);
- pSMB->ShareAccess = cpu_to_le32(FILE_SHARE_ALL);
- pSMB->CreateDisposition = cpu_to_le32(openDisposition);
- pSMB->CreateOptions = cpu_to_le32(create_options & CREATE_OPTIONS_MASK);
/* BB Expirement with various impersonation levels and verify */
- pSMB->ImpersonationLevel = cpu_to_le32(SECURITY_IMPERSONATION);
- pSMB->SecurityFlags =
- SECURITY_CONTEXT_TRACKING | SECURITY_EFFECTIVE_ONLY;
+ req->ImpersonationLevel = cpu_to_le32(SECURITY_IMPERSONATION);
+ req->SecurityFlags = SECURITY_CONTEXT_TRACKING|SECURITY_EFFECTIVE_ONLY;
count += name_len;
- inc_rfc1001_len(pSMB, count);
+ inc_rfc1001_len(req, count);
- pSMB->ByteCount = cpu_to_le16(count);
- /* long_op set to 1 to allow for oplock break timeouts */
- rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
- (struct smb_hdr *)pSMBr, &bytes_returned, 0);
+ req->ByteCount = cpu_to_le16(count);
+ rc = SendReceive(xid, tcon->ses, (struct smb_hdr *)req,
+ (struct smb_hdr *)rsp, &bytes_returned, 0);
cifs_stats_inc(&tcon->stats.cifs_stats.num_opens);
if (rc) {
cifs_dbg(FYI, "Error in Open = %d\n", rc);
- } else {
- *pOplock = pSMBr->OplockLevel; /* 1 byte no need to le_to_cpu */
- *netfid = pSMBr->Fid; /* cifs fid stays in le */
- /* Let caller know file was created so we can set the mode. */
- /* Do we care about the CreateAction in any other cases? */
- if (cpu_to_le32(FILE_CREATE) == pSMBr->CreateAction)
- *pOplock |= CIFS_CREATE_ACTION;
- if (pfile_info) {
- memcpy((char *)pfile_info, (char *)&pSMBr->CreationTime,
- 36 /* CreationTime to Attributes */);
- /* the file_info buf is endian converted by caller */
- pfile_info->AllocationSize = pSMBr->AllocationSize;
- pfile_info->EndOfFile = pSMBr->EndOfFile;
- pfile_info->NumberOfLinks = cpu_to_le32(1);
- pfile_info->DeletePending = 0;
- }
+ cifs_buf_release(req);
+ if (rc == -EAGAIN)
+ goto openRetry;
+ return rc;
}
- cifs_buf_release(pSMB);
- if (rc == -EAGAIN)
- goto openRetry;
+ /* 1 byte no need to le_to_cpu */
+ *oplock = rsp->OplockLevel;
+ /* cifs fid stays in le */
+ oparms->fid->netfid = rsp->Fid;
+
+ /* Let caller know file was created so we can set the mode. */
+ /* Do we care about the CreateAction in any other cases? */
+ if (cpu_to_le32(FILE_CREATE) == rsp->CreateAction)
+ *oplock |= CIFS_CREATE_ACTION;
+
+ if (buf) {
+ /* copy from CreationTime to Attributes */
+ memcpy((char *)buf, (char *)&rsp->CreationTime, 36);
+ /* the file_info buf is endian converted by caller */
+ buf->AllocationSize = rsp->AllocationSize;
+ buf->EndOfFile = rsp->EndOfFile;
+ buf->NumberOfLinks = cpu_to_le32(1);
+ buf->DeletePending = 0;
+ }
+
+ cifs_buf_release(req);
return rc;
}
@@ -1890,7 +1910,7 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
do {
server = tlink_tcon(wdata->cfile->tlink)->ses->server;
- rc = server->ops->async_writev(wdata);
+ rc = server->ops->async_writev(wdata, cifs_writedata_release);
} while (rc == -EAGAIN);
for (i = 0; i < wdata->nr_pages; i++) {
@@ -1942,15 +1962,9 @@ cifs_writedata_alloc(unsigned int nr_pages, work_func_t complete)
{
struct cifs_writedata *wdata;
- /* this would overflow */
- if (nr_pages == 0) {
- cifs_dbg(VFS, "%s: called with nr_pages == 0!\n", __func__);
- return NULL;
- }
-
/* writedata + number of page pointers */
wdata = kzalloc(sizeof(*wdata) +
- sizeof(struct page *) * (nr_pages - 1), GFP_NOFS);
+ sizeof(struct page *) * nr_pages, GFP_NOFS);
if (wdata != NULL) {
kref_init(&wdata->refcount);
INIT_LIST_HEAD(&wdata->list);
@@ -2011,7 +2025,8 @@ cifs_writev_callback(struct mid_q_entry *mid)
/* cifs_async_writev - send an async write, and set up mid to handle result */
int
-cifs_async_writev(struct cifs_writedata *wdata)
+cifs_async_writev(struct cifs_writedata *wdata,
+ void (*release)(struct kref *kref))
{
int rc = -EACCES;
WRITE_REQ *smb = NULL;
@@ -2085,7 +2100,7 @@ cifs_async_writev(struct cifs_writedata *wdata)
if (rc == 0)
cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
else
- kref_put(&wdata->refcount, cifs_writedata_release);
+ kref_put(&wdata->refcount, release);
async_writev_out:
cifs_small_buf_release(smb);
@@ -3088,7 +3103,8 @@ CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
bool is_unicode;
unsigned int sub_len;
char *sub_start;
- struct reparse_data *reparse_buf;
+ struct reparse_symlink_data *reparse_buf;
+ struct reparse_posix_data *posix_buf;
__u32 data_offset, data_count;
char *end_of_smb;
@@ -3137,20 +3153,47 @@ CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
goto qreparse_out;
}
end_of_smb = 2 + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount;
- reparse_buf = (struct reparse_data *)
+ reparse_buf = (struct reparse_symlink_data *)
((char *)&pSMBr->hdr.Protocol + data_offset);
if ((char *)reparse_buf >= end_of_smb) {
rc = -EIO;
goto qreparse_out;
}
- if ((reparse_buf->PathBuffer + reparse_buf->PrintNameOffset +
- reparse_buf->PrintNameLength) > end_of_smb) {
+ if (reparse_buf->ReparseTag == cpu_to_le32(IO_REPARSE_TAG_NFS)) {
+ cifs_dbg(FYI, "NFS style reparse tag\n");
+ posix_buf = (struct reparse_posix_data *)reparse_buf;
+
+ if (posix_buf->InodeType != cpu_to_le64(NFS_SPECFILE_LNK)) {
+ cifs_dbg(FYI, "unsupported file type 0x%llx\n",
+ le64_to_cpu(posix_buf->InodeType));
+ rc = -EOPNOTSUPP;
+ goto qreparse_out;
+ }
+ is_unicode = true;
+ sub_len = le16_to_cpu(reparse_buf->ReparseDataLength);
+ if (posix_buf->PathBuffer + sub_len > end_of_smb) {
+ cifs_dbg(FYI, "reparse buf beyond SMB\n");
+ rc = -EIO;
+ goto qreparse_out;
+ }
+ *symlinkinfo = cifs_strndup_from_utf16(posix_buf->PathBuffer,
+ sub_len, is_unicode, nls_codepage);
+ goto qreparse_out;
+ } else if (reparse_buf->ReparseTag !=
+ cpu_to_le32(IO_REPARSE_TAG_SYMLINK)) {
+ rc = -EOPNOTSUPP;
+ goto qreparse_out;
+ }
+
+ /* Reparse tag is NTFS symlink */
+ sub_start = le16_to_cpu(reparse_buf->SubstituteNameOffset) +
+ reparse_buf->PathBuffer;
+ sub_len = le16_to_cpu(reparse_buf->SubstituteNameLength);
+ if (sub_start + sub_len > end_of_smb) {
cifs_dbg(FYI, "reparse buf beyond SMB\n");
rc = -EIO;
goto qreparse_out;
}
- sub_start = reparse_buf->SubstituteNameOffset + reparse_buf->PathBuffer;
- sub_len = reparse_buf->SubstituteNameLength;
if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
is_unicode = true;
else
@@ -3171,6 +3214,60 @@ qreparse_out:
return rc;
}
+int
+CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+ __u16 fid)
+{
+ int rc = 0;
+ int bytes_returned;
+ struct smb_com_transaction_compr_ioctl_req *pSMB;
+ struct smb_com_transaction_ioctl_rsp *pSMBr;
+
+ cifs_dbg(FYI, "Set compression for %u\n", fid);
+ rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
+ (void **) &pSMBr);
+ if (rc)
+ return rc;
+
+ pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
+
+ pSMB->TotalParameterCount = 0;
+ pSMB->TotalDataCount = __constant_cpu_to_le32(2);
+ pSMB->MaxParameterCount = 0;
+ pSMB->MaxDataCount = 0;
+ pSMB->MaxSetupCount = 4;
+ pSMB->Reserved = 0;
+ pSMB->ParameterOffset = 0;
+ pSMB->DataCount = __constant_cpu_to_le32(2);
+ pSMB->DataOffset =
+ cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req,
+ compression_state) - 4); /* 84 */
+ pSMB->SetupCount = 4;
+ pSMB->SubCommand = __constant_cpu_to_le16(NT_TRANSACT_IOCTL);
+ pSMB->ParameterCount = 0;
+ pSMB->FunctionCode = __constant_cpu_to_le32(FSCTL_SET_COMPRESSION);
+ pSMB->IsFsctl = 1; /* FSCTL */
+ pSMB->IsRootFlag = 0;
+ pSMB->Fid = fid; /* file handle always le */
+ /* 3 byte pad, followed by 2 byte compress state */
+ pSMB->ByteCount = __constant_cpu_to_le16(5);
+ inc_rfc1001_len(pSMB, 5);
+
+ rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
+ (struct smb_hdr *) pSMBr, &bytes_returned, 0);
+ if (rc)
+ cifs_dbg(FYI, "Send error in SetCompression = %d\n", rc);
+
+ cifs_buf_release(pSMB);
+
+ /*
+ * Note: On -EAGAIN error only caller can retry on handle based calls
+ * since file handle passed in no longer valid.
+ */
+ return rc;
+}
+
+
#ifdef CONFIG_CIFS_POSIX
/*Convert an Access Control Entry from wire format to local POSIX xattr format*/
@@ -3287,11 +3384,13 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL,
return 0;
}
cifs_acl->version = cpu_to_le16(1);
- if (acl_type == ACL_TYPE_ACCESS)
+ if (acl_type == ACL_TYPE_ACCESS) {
cifs_acl->access_entry_count = cpu_to_le16(count);
- else if (acl_type == ACL_TYPE_DEFAULT)
+ cifs_acl->default_entry_count = __constant_cpu_to_le16(0xFFFF);
+ } else if (acl_type == ACL_TYPE_DEFAULT) {
cifs_acl->default_entry_count = cpu_to_le16(count);
- else {
+ cifs_acl->access_entry_count = __constant_cpu_to_le16(0xFFFF);
+ } else {
cifs_dbg(FYI, "unknown ACL type %d\n", acl_type);
return 0;
}
@@ -3926,7 +4025,7 @@ QFileInfoRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
+ cifs_dbg(FYI, "Send error in QFileInfo = %d", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -4095,7 +4194,7 @@ UnixQFileInfoRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
+ cifs_dbg(FYI, "Send error in UnixQFileInfo = %d", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -4179,7 +4278,7 @@ UnixQPathInfoRetry:
rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
- cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
+ cifs_dbg(FYI, "Send error in UnixQPathInfo = %d", rc);
} else { /* decode response */
rc = validate_t2((struct smb_t2_rsp *)pSMBr);
@@ -6098,6 +6197,9 @@ QAllEAsRetry:
cifs_dbg(FYI, "ea length %d\n", list_len);
if (list_len <= 8) {
cifs_dbg(FYI, "empty EA list returned from server\n");
+ /* didn't find the named attribute */
+ if (ea_name)
+ rc = -ENODATA;
goto QAllEAsOut;
}
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index a279ffc0bc2..20d75b8ddb2 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -2144,6 +2144,9 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
sizeof(tcp_ses->srcaddr));
memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr,
sizeof(tcp_ses->dstaddr));
+#ifdef CONFIG_CIFS_SMB2
+ get_random_bytes(tcp_ses->client_guid, SMB2_CLIENT_GUID_SIZE);
+#endif
/*
* at this point we are the only ones with the pointer
* to the struct since the kernel thread not created yet
@@ -2225,7 +2228,7 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
vol->username ? vol->username : "",
CIFS_MAX_USERNAME_LEN))
return 0;
- if (strlen(vol->username) != 0 &&
+ if ((vol->username && strlen(vol->username) != 0) &&
ses->password != NULL &&
strncmp(ses->password,
vol->password ? vol->password : "",
@@ -2242,6 +2245,8 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+ if (ses->status == CifsExiting)
+ continue;
if (!match_session(ses, vol))
continue;
++ses->ses_count;
@@ -2255,24 +2260,37 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
static void
cifs_put_smb_ses(struct cifs_ses *ses)
{
- unsigned int xid;
+ unsigned int rc, xid;
struct TCP_Server_Info *server = ses->server;
cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count);
+
spin_lock(&cifs_tcp_ses_lock);
+ if (ses->status == CifsExiting) {
+ spin_unlock(&cifs_tcp_ses_lock);
+ return;
+ }
if (--ses->ses_count > 0) {
spin_unlock(&cifs_tcp_ses_lock);
return;
}
-
- list_del_init(&ses->smb_ses_list);
+ if (ses->status == CifsGood)
+ ses->status = CifsExiting;
spin_unlock(&cifs_tcp_ses_lock);
- if (ses->status == CifsGood && server->ops->logoff) {
+ if (ses->status == CifsExiting && server->ops->logoff) {
xid = get_xid();
- server->ops->logoff(xid, ses);
+ rc = server->ops->logoff(xid, ses);
+ if (rc)
+ cifs_dbg(VFS, "%s: Session Logoff failure rc=%d\n",
+ __func__, rc);
_free_xid(xid);
}
+
+ spin_lock(&cifs_tcp_ses_lock);
+ list_del_init(&ses->smb_ses_list);
+ spin_unlock(&cifs_tcp_ses_lock);
+
sesInfoFree(ses);
cifs_put_tcp_session(server);
}
@@ -3755,6 +3773,13 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
return rc;
}
+static void delayed_free(struct rcu_head *p)
+{
+ struct cifs_sb_info *sbi = container_of(p, struct cifs_sb_info, rcu);
+ unload_nls(sbi->local_nls);
+ kfree(sbi);
+}
+
void
cifs_umount(struct cifs_sb_info *cifs_sb)
{
@@ -3779,8 +3804,7 @@ cifs_umount(struct cifs_sb_info *cifs_sb)
bdi_destroy(&cifs_sb->bdi);
kfree(cifs_sb->mountdata);
- unload_nls(cifs_sb->local_nls);
- kfree(cifs_sb);
+ call_rcu(&cifs_sb->rcu, delayed_free);
}
int
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 5384c2a640c..3db0c5fd9a1 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -193,7 +193,7 @@ check_name(struct dentry *direntry)
static int
cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
struct tcon_link *tlink, unsigned oflags, umode_t mode,
- __u32 *oplock, struct cifs_fid *fid, int *created)
+ __u32 *oplock, struct cifs_fid *fid)
{
int rc = -ENOENT;
int create_options = CREATE_NOT_DIR;
@@ -349,7 +349,6 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
.device = 0,
};
- *created |= FILE_CREATED;
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
args.uid = current_fsuid();
if (inode->i_mode & S_ISGID)
@@ -379,7 +378,7 @@ cifs_create_get_file_info:
xid);
else {
rc = cifs_get_inode_info(&newinode, full_path, buf, inode->i_sb,
- xid, &fid->netfid);
+ xid, fid);
if (newinode) {
if (server->ops->set_lease_key)
server->ops->set_lease_key(newinode, fid);
@@ -480,13 +479,16 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
cifs_add_pending_open(&fid, tlink, &open);
rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode,
- &oplock, &fid, opened);
+ &oplock, &fid);
if (rc) {
cifs_del_pending_open(&open);
goto out;
}
+ if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
+ *opened |= FILE_CREATED;
+
rc = finish_open(file, direntry, generic_file_open, opened);
if (rc) {
if (server->ops->close)
@@ -529,7 +531,6 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
struct TCP_Server_Info *server;
struct cifs_fid fid;
__u32 oplock;
- int created = FILE_CREATED;
cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p\n",
inode, direntry->d_name.name, direntry);
@@ -546,7 +547,7 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
server->ops->new_lease_key(&fid);
rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode,
- &oplock, &fid, &created);
+ &oplock, &fid);
if (!rc && server->ops->close)
server->ops->close(xid, tcon, &fid);
@@ -564,12 +565,13 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
struct cifs_sb_info *cifs_sb;
struct tcon_link *tlink;
- struct cifs_tcon *pTcon;
+ struct cifs_tcon *tcon;
struct cifs_io_parms io_parms;
char *full_path = NULL;
struct inode *newinode = NULL;
int oplock = 0;
- u16 fileHandle;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
FILE_ALL_INFO *buf = NULL;
unsigned int bytes_written;
struct win_dev *pdev;
@@ -582,7 +584,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
if (IS_ERR(tlink))
return PTR_ERR(tlink);
- pTcon = tlink_tcon(tlink);
+ tcon = tlink_tcon(tlink);
xid = get_xid();
@@ -592,7 +594,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
goto mknod_out;
}
- if (pTcon->unix_ext) {
+ if (tcon->unix_ext) {
struct cifs_unix_set_info_args args = {
.mode = mode & ~current_umask(),
.ctime = NO_CHANGE_64,
@@ -607,7 +609,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
args.uid = INVALID_UID; /* no change */
args.gid = INVALID_GID; /* no change */
}
- rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, &args,
+ rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -639,42 +641,44 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
if (backup_cred(cifs_sb))
create_options |= CREATE_OPEN_BACKUP_INTENT;
- rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE,
- GENERIC_WRITE, create_options,
- &fileHandle, &oplock, buf, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = GENERIC_WRITE;
+ oparms.create_options = create_options;
+ oparms.disposition = FILE_CREATE;
+ oparms.path = full_path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = CIFS_open(xid, &oparms, &oplock, buf);
if (rc)
goto mknod_out;
- /* BB Do not bother to decode buf since no local inode yet to put
- * timestamps in, but we can reuse it safely */
+ /*
+ * BB Do not bother to decode buf since no local inode yet to put
+ * timestamps in, but we can reuse it safely.
+ */
pdev = (struct win_dev *)buf;
- io_parms.netfid = fileHandle;
+ io_parms.netfid = fid.netfid;
io_parms.pid = current->tgid;
- io_parms.tcon = pTcon;
+ io_parms.tcon = tcon;
io_parms.offset = 0;
io_parms.length = sizeof(struct win_dev);
if (S_ISCHR(mode)) {
memcpy(pdev->type, "IntxCHR", 8);
- pdev->major =
- cpu_to_le64(MAJOR(device_number));
- pdev->minor =
- cpu_to_le64(MINOR(device_number));
- rc = CIFSSMBWrite(xid, &io_parms,
- &bytes_written, (char *)pdev,
- NULL, 0);
+ pdev->major = cpu_to_le64(MAJOR(device_number));
+ pdev->minor = cpu_to_le64(MINOR(device_number));
+ rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, (char *)pdev,
+ NULL, 0);
} else if (S_ISBLK(mode)) {
memcpy(pdev->type, "IntxBLK", 8);
- pdev->major =
- cpu_to_le64(MAJOR(device_number));
- pdev->minor =
- cpu_to_le64(MINOR(device_number));
- rc = CIFSSMBWrite(xid, &io_parms,
- &bytes_written, (char *)pdev,
- NULL, 0);
+ pdev->major = cpu_to_le64(MAJOR(device_number));
+ pdev->minor = cpu_to_le64(MINOR(device_number));
+ rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, (char *)pdev,
+ NULL, 0);
} /* else if (S_ISFIFO) */
- CIFSSMBClose(xid, pTcon, fileHandle);
+ CIFSSMBClose(xid, tcon, fid.netfid);
d_drop(direntry);
/* FIXME: add code here to set EAs */
@@ -756,7 +760,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
/* if it was once a directory (but how can we tell?) we could do
shrink_dcache_parent(direntry); */
} else if (rc != -EACCES) {
- cifs_dbg(VFS, "Unexpected lookup error %d\n", rc);
+ cifs_dbg(FYI, "Unexpected lookup error %d\n", rc);
/* We special case check for Access Denied - since that
is a common return code */
}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 7ddddf2e250..e90a1e9aa62 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -244,7 +244,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
xid);
else
rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb,
- xid, &fid->netfid);
+ xid, fid);
out:
kfree(buf);
@@ -335,7 +335,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
spin_unlock(&cifs_file_list_lock);
if (fid->purge_cache)
- cifs_invalidate_mapping(inode);
+ cifs_zap_mapping(inode);
file->private_data = cfile;
return cfile;
@@ -392,7 +392,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
* again and get at least level II oplock.
*/
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
- CIFS_I(inode)->invalid_mapping = true;
+ set_bit(CIFS_INO_INVALID_MAPPING, &cifsi->flags);
cifs_set_oplock_level(cifsi, 0);
}
spin_unlock(&cifs_file_list_lock);
@@ -678,7 +678,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
/*
* Can not refresh inode by passing in file_info buf to be returned by
- * CIFSSMBOpen and then calling get_inode_info with returned buf since
+ * ops->open and then calling get_inode_info with returned buf since
* file might have write behind data that needs to be flushed and server
* version of file size can be stale. If we knew for sure that inode was
* not dirty locally we could do this.
@@ -1529,7 +1529,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
*/
if (!CIFS_CACHE_WRITE(CIFS_I(inode)) &&
CIFS_CACHE_READ(CIFS_I(inode))) {
- cifs_invalidate_mapping(inode);
+ cifs_zap_mapping(inode);
cifs_dbg(FYI, "Set no oplock for inode=%p due to mand locks\n",
inode);
CIFS_I(inode)->oplock = 0;
@@ -2043,7 +2043,8 @@ retry:
}
wdata->pid = wdata->cfile->pid;
server = tlink_tcon(wdata->cfile->tlink)->ses->server;
- rc = server->ops->async_writev(wdata);
+ rc = server->ops->async_writev(wdata,
+ cifs_writedata_release);
} while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN);
for (i = 0; i < nr_pages; ++i)
@@ -2217,7 +2218,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
file->f_path.dentry->d_name.name, datasync);
if (!CIFS_CACHE_READ(CIFS_I(inode))) {
- rc = cifs_invalidate_mapping(inode);
+ rc = cifs_zap_mapping(inode);
if (rc) {
cifs_dbg(FYI, "rc: %d during invalidate phase\n", rc);
rc = 0; /* don't care about it in fsync */
@@ -2331,9 +2332,20 @@ size_t get_numpages(const size_t wsize, const size_t len, size_t *cur_len)
}
static void
-cifs_uncached_writev_complete(struct work_struct *work)
+cifs_uncached_writedata_release(struct kref *refcount)
{
int i;
+ struct cifs_writedata *wdata = container_of(refcount,
+ struct cifs_writedata, refcount);
+
+ for (i = 0; i < wdata->nr_pages; i++)
+ put_page(wdata->pages[i]);
+ cifs_writedata_release(refcount);
+}
+
+static void
+cifs_uncached_writev_complete(struct work_struct *work)
+{
struct cifs_writedata *wdata = container_of(work,
struct cifs_writedata, work);
struct inode *inode = wdata->cfile->dentry->d_inode;
@@ -2347,12 +2359,7 @@ cifs_uncached_writev_complete(struct work_struct *work)
complete(&wdata->done);
- if (wdata->result != -EAGAIN) {
- for (i = 0; i < wdata->nr_pages; i++)
- put_page(wdata->pages[i]);
- }
-
- kref_put(&wdata->refcount, cifs_writedata_release);
+ kref_put(&wdata->refcount, cifs_uncached_writedata_release);
}
/* attempt to send write to server, retry on any -EAGAIN errors */
@@ -2370,21 +2377,20 @@ cifs_uncached_retry_writev(struct cifs_writedata *wdata)
if (rc != 0)
continue;
}
- rc = server->ops->async_writev(wdata);
+ rc = server->ops->async_writev(wdata,
+ cifs_uncached_writedata_release);
} while (rc == -EAGAIN);
return rc;
}
static ssize_t
-cifs_iovec_write(struct file *file, const struct iovec *iov,
- unsigned long nr_segs, loff_t *poffset)
+cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset)
{
unsigned long nr_pages, i;
- size_t copied, len, cur_len;
+ size_t bytes, copied, len, cur_len;
ssize_t total_written = 0;
loff_t offset;
- struct iov_iter it;
struct cifsFileInfo *open_file;
struct cifs_tcon *tcon;
struct cifs_sb_info *cifs_sb;
@@ -2393,14 +2399,16 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
int rc;
pid_t pid;
- len = iov_length(iov, nr_segs);
- if (!len)
- return 0;
-
+ len = iov_iter_count(from);
rc = generic_write_checks(file, poffset, &len, 0);
if (rc)
return rc;
+ if (!len)
+ return 0;
+
+ iov_iter_truncate(from, len);
+
INIT_LIST_HEAD(&wdata_list);
cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
open_file = file->private_data;
@@ -2416,7 +2424,6 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
else
pid = current->tgid;
- iov_iter_init(&it, iov, nr_segs, len, 0);
do {
size_t save_len;
@@ -2436,14 +2443,44 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
save_len = cur_len;
for (i = 0; i < nr_pages; i++) {
- copied = min_t(const size_t, cur_len, PAGE_SIZE);
- copied = iov_iter_copy_from_user(wdata->pages[i], &it,
- 0, copied);
+ bytes = min_t(size_t, cur_len, PAGE_SIZE);
+ copied = copy_page_from_iter(wdata->pages[i], 0, bytes,
+ from);
cur_len -= copied;
- iov_iter_advance(&it, copied);
+ /*
+ * If we didn't copy as much as we expected, then that
+ * may mean we trod into an unmapped area. Stop copying
+ * at that point. On the next pass through the big
+ * loop, we'll likely end up getting a zero-length
+ * write and bailing out of it.
+ */
+ if (copied < bytes)
+ break;
}
cur_len = save_len - cur_len;
+ /*
+ * If we have no data to send, then that probably means that
+ * the copy above failed altogether. That's most likely because
+ * the address in the iovec was bogus. Set the rc to -EFAULT,
+ * free anything we allocated and bail out.
+ */
+ if (!cur_len) {
+ for (i = 0; i < nr_pages; i++)
+ put_page(wdata->pages[i]);
+ kfree(wdata);
+ rc = -EFAULT;
+ break;
+ }
+
+ /*
+ * i + 1 now represents the number of pages we actually used in
+ * the copy phase above. Bring nr_pages down to that, and free
+ * any pages that we didn't use.
+ */
+ for ( ; nr_pages > i + 1; nr_pages--)
+ put_page(wdata->pages[nr_pages - 1]);
+
wdata->sync_mode = WB_SYNC_ALL;
wdata->nr_pages = nr_pages;
wdata->offset = (__u64)offset;
@@ -2454,7 +2491,8 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
rc = cifs_uncached_retry_writev(wdata);
if (rc) {
- kref_put(&wdata->refcount, cifs_writedata_release);
+ kref_put(&wdata->refcount,
+ cifs_uncached_writedata_release);
break;
}
@@ -2496,7 +2534,7 @@ restart_loop:
}
}
list_del_init(&wdata->list);
- kref_put(&wdata->refcount, cifs_writedata_release);
+ kref_put(&wdata->refcount, cifs_uncached_writedata_release);
}
if (total_written > 0)
@@ -2506,11 +2544,11 @@ restart_loop:
return total_written ? total_written : (ssize_t)rc;
}
-ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
{
ssize_t written;
struct inode *inode;
+ loff_t pos = iocb->ki_pos;
inode = file_inode(iocb->ki_filp);
@@ -2520,9 +2558,9 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
* write request.
*/
- written = cifs_iovec_write(iocb->ki_filp, iov, nr_segs, &pos);
+ written = cifs_iovec_write(iocb->ki_filp, from, &pos);
if (written > 0) {
- CIFS_I(inode)->invalid_mapping = true;
+ set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(inode)->flags);
iocb->ki_pos = pos;
}
@@ -2530,8 +2568,7 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
}
static ssize_t
-cifs_writev(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+cifs_writev(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
@@ -2539,38 +2576,38 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
struct cifsInodeInfo *cinode = CIFS_I(inode);
struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
ssize_t rc = -EACCES;
-
- BUG_ON(iocb->ki_pos != pos);
+ loff_t lock_pos = iocb->ki_pos;
/*
* We need to hold the sem to be sure nobody modifies lock list
* with a brlock that prevents writing.
*/
down_read(&cinode->lock_sem);
- if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs),
+ mutex_lock(&inode->i_mutex);
+ if (file->f_flags & O_APPEND)
+ lock_pos = i_size_read(inode);
+ if (!cifs_find_lock_conflict(cfile, lock_pos, iov_iter_count(from),
server->vals->exclusive_lock_type, NULL,
CIFS_WRITE_OP)) {
- mutex_lock(&inode->i_mutex);
- rc = __generic_file_aio_write(iocb, iov, nr_segs,
- &iocb->ki_pos);
+ rc = __generic_file_write_iter(iocb, from);
mutex_unlock(&inode->i_mutex);
- }
- if (rc > 0) {
- ssize_t err;
+ if (rc > 0) {
+ ssize_t err;
- err = generic_write_sync(file, pos, rc);
- if (err < 0 && rc > 0)
- rc = err;
+ err = generic_write_sync(file, iocb->ki_pos - rc, rc);
+ if (err < 0)
+ rc = err;
+ }
+ } else {
+ mutex_unlock(&inode->i_mutex);
}
-
up_read(&cinode->lock_sem);
return rc;
}
ssize_t
-cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
struct cifsInodeInfo *cinode = CIFS_I(inode);
@@ -2580,12 +2617,19 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
ssize_t written;
+ written = cifs_get_writer(cinode);
+ if (written)
+ return written;
+
if (CIFS_CACHE_WRITE(cinode)) {
if (cap_unix(tcon->ses) &&
(CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))
- && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
- return generic_file_aio_write(iocb, iov, nr_segs, pos);
- return cifs_writev(iocb, iov, nr_segs, pos);
+ && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) {
+ written = generic_file_write_iter(iocb, from);
+ goto out;
+ }
+ written = cifs_writev(iocb, from);
+ goto out;
}
/*
* For non-oplocked files in strict cache mode we need to write the data
@@ -2593,18 +2637,20 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
* affected pages because it may cause a error with mandatory locks on
* these pages but not on the region from pos to ppos+len-1.
*/
- written = cifs_user_writev(iocb, iov, nr_segs, pos);
+ written = cifs_user_writev(iocb, from);
if (written > 0 && CIFS_CACHE_READ(cinode)) {
/*
* Windows 7 server can delay breaking level2 oplock if a write
* request comes - break it on the client to prevent reading
* an old data.
*/
- cifs_invalidate_mapping(inode);
+ cifs_zap_mapping(inode);
cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n",
inode);
cinode->oplock = 0;
}
+out:
+ cifs_put_writer(cinode);
return written;
}
@@ -2699,56 +2745,27 @@ cifs_retry_async_readv(struct cifs_readdata *rdata)
/**
* cifs_readdata_to_iov - copy data from pages in response to an iovec
* @rdata: the readdata response with list of pages holding data
- * @iov: vector in which we should copy the data
- * @nr_segs: number of segments in vector
- * @offset: offset into file of the first iovec
- * @copied: used to return the amount of data copied to the iov
+ * @iter: destination for our data
*
* This function copies data from a list of pages in a readdata response into
* an array of iovecs. It will first calculate where the data should go
* based on the info in the readdata and then copy the data into that spot.
*/
-static ssize_t
-cifs_readdata_to_iov(struct cifs_readdata *rdata, const struct iovec *iov,
- unsigned long nr_segs, loff_t offset, ssize_t *copied)
+static int
+cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
{
- int rc = 0;
- struct iov_iter ii;
- size_t pos = rdata->offset - offset;
- ssize_t remaining = rdata->bytes;
- unsigned char *pdata;
+ size_t remaining = rdata->bytes;
unsigned int i;
- /* set up iov_iter and advance to the correct offset */
- iov_iter_init(&ii, iov, nr_segs, iov_length(iov, nr_segs), 0);
- iov_iter_advance(&ii, pos);
-
- *copied = 0;
for (i = 0; i < rdata->nr_pages; i++) {
- ssize_t copy;
struct page *page = rdata->pages[i];
-
- /* copy a whole page or whatever's left */
- copy = min_t(ssize_t, remaining, PAGE_SIZE);
-
- /* ...but limit it to whatever space is left in the iov */
- copy = min_t(ssize_t, copy, iov_iter_count(&ii));
-
- /* go while there's data to be copied and no errors */
- if (copy && !rc) {
- pdata = kmap(page);
- rc = memcpy_toiovecend(ii.iov, pdata, ii.iov_offset,
- (int)copy);
- kunmap(page);
- if (!rc) {
- *copied += copy;
- remaining -= copy;
- iov_iter_advance(&ii, copy);
- }
- }
+ size_t copy = min_t(size_t, remaining, PAGE_SIZE);
+ size_t written = copy_page_to_iter(page, 0, copy, iter);
+ remaining -= written;
+ if (written < copy && iov_iter_count(iter) > 0)
+ break;
}
-
- return rc;
+ return remaining ? -EFAULT : 0;
}
static void
@@ -2809,14 +2826,13 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
return total_read > 0 ? total_read : result;
}
-static ssize_t
-cifs_iovec_read(struct file *file, const struct iovec *iov,
- unsigned long nr_segs, loff_t *poffset)
+ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
{
+ struct file *file = iocb->ki_filp;
ssize_t rc;
size_t len, cur_len;
ssize_t total_read = 0;
- loff_t offset = *poffset;
+ loff_t offset = iocb->ki_pos;
unsigned int npages;
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *tcon;
@@ -2825,10 +2841,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
struct list_head rdata_list;
pid_t pid;
- if (!nr_segs)
- return 0;
-
- len = iov_length(iov, nr_segs);
+ len = iov_iter_count(to);
if (!len)
return 0;
@@ -2857,7 +2870,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
cifs_uncached_readv_complete);
if (!rdata) {
rc = -ENOMEM;
- goto error;
+ break;
}
rc = cifs_read_allocate_pages(rdata, npages);
@@ -2889,60 +2902,48 @@ error:
if (!list_empty(&rdata_list))
rc = 0;
+ len = iov_iter_count(to);
/* the loop below should proceed in the order of increasing offsets */
-restart_loop:
list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
+ again:
if (!rc) {
- ssize_t copied;
-
/* FIXME: freezable sleep too? */
rc = wait_for_completion_killable(&rdata->done);
if (rc)
rc = -EINTR;
- else if (rdata->result)
+ else if (rdata->result) {
rc = rdata->result;
- else {
- rc = cifs_readdata_to_iov(rdata, iov,
- nr_segs, *poffset,
- &copied);
- total_read += copied;
+ /* resend call if it's a retryable error */
+ if (rc == -EAGAIN) {
+ rc = cifs_retry_async_readv(rdata);
+ goto again;
+ }
+ } else {
+ rc = cifs_readdata_to_iov(rdata, to);
}
- /* resend call if it's a retryable error */
- if (rc == -EAGAIN) {
- rc = cifs_retry_async_readv(rdata);
- goto restart_loop;
- }
}
list_del_init(&rdata->list);
kref_put(&rdata->refcount, cifs_uncached_readdata_release);
}
+ total_read = len - iov_iter_count(to);
+
cifs_stats_bytes_read(tcon, total_read);
- *poffset += total_read;
/* mask nodata case */
if (rc == -ENODATA)
rc = 0;
- return total_read ? total_read : rc;
-}
-
-ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
-{
- ssize_t read;
-
- read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos);
- if (read > 0)
- iocb->ki_pos = pos;
-
- return read;
+ if (total_read) {
+ iocb->ki_pos += total_read;
+ return total_read;
+ }
+ return rc;
}
ssize_t
-cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
struct cifsInodeInfo *cinode = CIFS_I(inode);
@@ -2961,22 +2962,22 @@ cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
* pos+len-1.
*/
if (!CIFS_CACHE_READ(cinode))
- return cifs_user_readv(iocb, iov, nr_segs, pos);
+ return cifs_user_readv(iocb, to);
if (cap_unix(tcon->ses) &&
(CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
- return generic_file_aio_read(iocb, iov, nr_segs, pos);
+ return generic_file_read_iter(iocb, to);
/*
* We need to hold the sem to be sure nobody modifies lock list
* with a brlock that prevents reading.
*/
down_read(&cinode->lock_sem);
- if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs),
+ if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(to),
tcon->ses->server->vals->shared_lock_type,
NULL, CIFS_READ_OP))
- rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
+ rc = generic_file_read_iter(iocb, to);
up_read(&cinode->lock_sem);
return rc;
}
@@ -3085,6 +3086,7 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
static struct vm_operations_struct cifs_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = cifs_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
@@ -3097,7 +3099,7 @@ int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
xid = get_xid();
if (!CIFS_CACHE_READ(CIFS_I(inode))) {
- rc = cifs_invalidate_mapping(inode);
+ rc = cifs_zap_mapping(inode);
if (rc)
return rc;
}
@@ -3616,6 +3618,13 @@ static int cifs_launder_page(struct page *page)
return rc;
}
+static int
+cifs_pending_writers_wait(void *unused)
+{
+ schedule();
+ return 0;
+}
+
void cifs_oplock_break(struct work_struct *work)
{
struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
@@ -3623,8 +3632,15 @@ void cifs_oplock_break(struct work_struct *work)
struct inode *inode = cfile->dentry->d_inode;
struct cifsInodeInfo *cinode = CIFS_I(inode);
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ struct TCP_Server_Info *server = tcon->ses->server;
int rc = 0;
+ wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
+ cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE);
+
+ server->ops->downgrade_oplock(server, cinode,
+ test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags));
+
if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) &&
cifs_has_mand_locks(cinode)) {
cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n",
@@ -3641,7 +3657,7 @@ void cifs_oplock_break(struct work_struct *work)
if (!CIFS_CACHE_READ(cinode)) {
rc = filemap_fdatawait(inode->i_mapping);
mapping_set_error(inode->i_mapping, rc);
- cifs_invalidate_mapping(inode);
+ cifs_zap_mapping(inode);
}
cifs_dbg(FYI, "Oplock flush inode %p rc %d\n", inode, rc);
}
@@ -3661,8 +3677,30 @@ void cifs_oplock_break(struct work_struct *work)
cinode);
cifs_dbg(FYI, "Oplock release rc = %d\n", rc);
}
+ cifs_done_oplock_break(cinode);
}
+/*
+ * The presence of cifs_direct_io() in the address space ops vector
+ * allowes open() O_DIRECT flags which would have failed otherwise.
+ *
+ * In the non-cached mode (mount with cache=none), we shunt off direct read and write requests
+ * so this method should never be called.
+ *
+ * Direct IO is not yet supported in the cached mode.
+ */
+static ssize_t
+cifs_direct_io(int rw, struct kiocb *iocb, struct iov_iter *iter,
+ loff_t pos)
+{
+ /*
+ * FIXME
+ * Eventually need to support direct IO for non forcedirectio mounts
+ */
+ return -EINVAL;
+}
+
+
const struct address_space_operations cifs_addr_ops = {
.readpage = cifs_readpage,
.readpages = cifs_readpages,
@@ -3672,6 +3710,7 @@ const struct address_space_operations cifs_addr_ops = {
.write_end = cifs_write_end,
.set_page_dirty = __set_page_dirty_nobuffers,
.releasepage = cifs_release_page,
+ .direct_IO = cifs_direct_io,
.invalidatepage = cifs_invalidate_page,
.launder_page = cifs_launder_page,
};
diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c
index b3258f35e88..8d4b7bc8ae9 100644
--- a/fs/cifs/fscache.c
+++ b/fs/cifs/fscache.c
@@ -27,7 +27,7 @@ void cifs_fscache_get_client_cookie(struct TCP_Server_Info *server)
{
server->fscache =
fscache_acquire_cookie(cifs_fscache_netfs.primary_index,
- &cifs_fscache_server_index_def, server);
+ &cifs_fscache_server_index_def, server, true);
cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
__func__, server, server->fscache);
}
@@ -46,7 +46,7 @@ void cifs_fscache_get_super_cookie(struct cifs_tcon *tcon)
tcon->fscache =
fscache_acquire_cookie(server->fscache,
- &cifs_fscache_super_index_def, tcon);
+ &cifs_fscache_super_index_def, tcon, true);
cifs_dbg(FYI, "%s: (0x%p/0x%p)\n",
__func__, server->fscache, tcon->fscache);
}
@@ -69,7 +69,7 @@ static void cifs_fscache_enable_inode_cookie(struct inode *inode)
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_FSCACHE) {
cifsi->fscache = fscache_acquire_cookie(tcon->fscache,
- &cifs_fscache_inode_object_def, cifsi);
+ &cifs_fscache_inode_object_def, cifsi, true);
cifs_dbg(FYI, "%s: got FH cookie (0x%p/0x%p)\n",
__func__, tcon->fscache, cifsi->fscache);
}
@@ -119,7 +119,7 @@ void cifs_fscache_reset_inode_cookie(struct inode *inode)
cifsi->fscache = fscache_acquire_cookie(
cifs_sb_master_tcon(cifs_sb)->fscache,
&cifs_fscache_inode_object_def,
- cifsi);
+ cifsi, true);
cifs_dbg(FYI, "%s: new cookie 0x%p oldcookie 0x%p\n",
__func__, cifsi->fscache, old);
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 867b7cdc794..a174605f6af 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -22,6 +22,7 @@
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
+#include <linux/freezer.h>
#include <asm/div64.h>
#include "cifsfs.h"
#include "cifspdu.h"
@@ -117,7 +118,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
cifs_dbg(FYI, "%s: invalidating inode %llu mapping\n",
__func__, cifs_i->uniqueid);
- cifs_i->invalid_mapping = true;
+ set_bit(CIFS_INO_INVALID_MAPPING, &cifs_i->flags);
}
/*
@@ -177,7 +178,10 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
else
cifs_i->time = jiffies;
- cifs_i->delete_pending = fattr->cf_flags & CIFS_FATTR_DELETE_PENDING;
+ if (fattr->cf_flags & CIFS_FATTR_DELETE_PENDING)
+ set_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags);
+ else
+ clear_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags);
cifs_i->server_eof = fattr->cf_eof;
/*
@@ -383,9 +387,10 @@ int cifs_get_inode_info_unix(struct inode **pinode,
/* check for Minshall+French symlinks */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
- int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+ int tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr,
+ full_path);
if (tmprc)
- cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc);
+ cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc);
}
if (*pinode == NULL) {
@@ -403,18 +408,20 @@ int cifs_get_inode_info_unix(struct inode **pinode,
}
static int
-cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
+cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
struct cifs_sb_info *cifs_sb, unsigned int xid)
{
int rc;
int oplock = 0;
- __u16 netfid;
struct tcon_link *tlink;
struct cifs_tcon *tcon;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
struct cifs_io_parms io_parms;
char buf[24];
unsigned int bytes_read;
char *pbuf;
+ int buf_type = CIFS_NO_BUFFER;
pbuf = buf;
@@ -435,62 +442,69 @@ cifs_sfu_type(struct cifs_fattr *fattr, const unsigned char *path,
return PTR_ERR(tlink);
tcon = tlink_tcon(tlink);
- rc = CIFSSMBOpen(xid, tcon, path, FILE_OPEN, GENERIC_READ,
- CREATE_NOT_DIR, &netfid, &oplock, NULL,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc == 0) {
- int buf_type = CIFS_NO_BUFFER;
- /* Read header */
- io_parms.netfid = netfid;
- io_parms.pid = current->tgid;
- io_parms.tcon = tcon;
- io_parms.offset = 0;
- io_parms.length = 24;
- rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf,
- &buf_type);
- if ((rc == 0) && (bytes_read >= 8)) {
- if (memcmp("IntxBLK", pbuf, 8) == 0) {
- cifs_dbg(FYI, "Block device\n");
- fattr->cf_mode |= S_IFBLK;
- fattr->cf_dtype = DT_BLK;
- if (bytes_read == 24) {
- /* we have enough to decode dev num */
- __u64 mjr; /* major */
- __u64 mnr; /* minor */
- mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
- mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
- fattr->cf_rdev = MKDEV(mjr, mnr);
- }
- } else if (memcmp("IntxCHR", pbuf, 8) == 0) {
- cifs_dbg(FYI, "Char device\n");
- fattr->cf_mode |= S_IFCHR;
- fattr->cf_dtype = DT_CHR;
- if (bytes_read == 24) {
- /* we have enough to decode dev num */
- __u64 mjr; /* major */
- __u64 mnr; /* minor */
- mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
- mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
- fattr->cf_rdev = MKDEV(mjr, mnr);
- }
- } else if (memcmp("IntxLNK", pbuf, 7) == 0) {
- cifs_dbg(FYI, "Symlink\n");
- fattr->cf_mode |= S_IFLNK;
- fattr->cf_dtype = DT_LNK;
- } else {
- fattr->cf_mode |= S_IFREG; /* file? */
- fattr->cf_dtype = DT_REG;
- rc = -EOPNOTSUPP;
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = GENERIC_READ;
+ oparms.create_options = CREATE_NOT_DIR;
+ oparms.disposition = FILE_OPEN;
+ oparms.path = path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = CIFS_open(xid, &oparms, &oplock, NULL);
+ if (rc) {
+ cifs_put_tlink(tlink);
+ return rc;
+ }
+
+ /* Read header */
+ io_parms.netfid = fid.netfid;
+ io_parms.pid = current->tgid;
+ io_parms.tcon = tcon;
+ io_parms.offset = 0;
+ io_parms.length = 24;
+
+ rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type);
+ if ((rc == 0) && (bytes_read >= 8)) {
+ if (memcmp("IntxBLK", pbuf, 8) == 0) {
+ cifs_dbg(FYI, "Block device\n");
+ fattr->cf_mode |= S_IFBLK;
+ fattr->cf_dtype = DT_BLK;
+ if (bytes_read == 24) {
+ /* we have enough to decode dev num */
+ __u64 mjr; /* major */
+ __u64 mnr; /* minor */
+ mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
+ mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
+ fattr->cf_rdev = MKDEV(mjr, mnr);
}
+ } else if (memcmp("IntxCHR", pbuf, 8) == 0) {
+ cifs_dbg(FYI, "Char device\n");
+ fattr->cf_mode |= S_IFCHR;
+ fattr->cf_dtype = DT_CHR;
+ if (bytes_read == 24) {
+ /* we have enough to decode dev num */
+ __u64 mjr; /* major */
+ __u64 mnr; /* minor */
+ mjr = le64_to_cpu(*(__le64 *)(pbuf+8));
+ mnr = le64_to_cpu(*(__le64 *)(pbuf+16));
+ fattr->cf_rdev = MKDEV(mjr, mnr);
+ }
+ } else if (memcmp("IntxLNK", pbuf, 7) == 0) {
+ cifs_dbg(FYI, "Symlink\n");
+ fattr->cf_mode |= S_IFLNK;
+ fattr->cf_dtype = DT_LNK;
} else {
- fattr->cf_mode |= S_IFREG; /* then it is a file */
+ fattr->cf_mode |= S_IFREG; /* file? */
fattr->cf_dtype = DT_REG;
- rc = -EOPNOTSUPP; /* or some unknown SFU type */
+ rc = -EOPNOTSUPP;
}
- CIFSSMBClose(xid, tcon, netfid);
+ } else {
+ fattr->cf_mode |= S_IFREG; /* then it is a file */
+ fattr->cf_dtype = DT_REG;
+ rc = -EOPNOTSUPP; /* or some unknown SFU type */
}
+ CIFSSMBClose(xid, tcon, fid.netfid);
cifs_put_tlink(tlink);
return rc;
}
@@ -517,10 +531,15 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
return PTR_ERR(tlink);
tcon = tlink_tcon(tlink);
- rc = CIFSSMBQAllEAs(xid, tcon, path, "SETFILEBITS",
- ea_value, 4 /* size of buf */, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (tcon->ses->server->ops->query_all_EAs == NULL) {
+ cifs_put_tlink(tlink);
+ return -EOPNOTSUPP;
+ }
+
+ rc = tcon->ses->server->ops->query_all_EAs(xid, tcon, path,
+ "SETFILEBITS", ea_value, 4 /* size of buf */,
+ cifs_sb->local_nls,
+ cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
cifs_put_tlink(tlink);
if (rc < 0)
return (int)rc;
@@ -542,7 +561,8 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
/* Fill a cifs_fattr struct with info from FILE_ALL_INFO */
static void
cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
- struct cifs_sb_info *cifs_sb, bool adjust_tz)
+ struct cifs_sb_info *cifs_sb, bool adjust_tz,
+ bool symlink)
{
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
@@ -569,7 +589,11 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
fattr->cf_createtime = le64_to_cpu(info->CreationTime);
fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
- if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
+
+ if (symlink) {
+ fattr->cf_mode = S_IFLNK;
+ fattr->cf_dtype = DT_LNK;
+ } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
fattr->cf_dtype = DT_DIR;
/*
@@ -578,10 +602,6 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
*/
if (!tcon->unix_ext)
fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
- } else if (fattr->cf_cifsattrs & ATTR_REPARSE) {
- fattr->cf_mode = S_IFLNK;
- fattr->cf_dtype = DT_LNK;
- fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
} else {
fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
fattr->cf_dtype = DT_REG;
@@ -626,7 +646,8 @@ cifs_get_file_info(struct file *filp)
rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data);
switch (rc) {
case 0:
- cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
+ cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false,
+ false);
break;
case -EREMOTE:
cifs_create_dfs_fattr(&fattr, inode->i_sb);
@@ -660,7 +681,7 @@ cgfi_exit:
int
cifs_get_inode_info(struct inode **inode, const char *full_path,
FILE_ALL_INFO *data, struct super_block *sb, int xid,
- const __u16 *fid)
+ const struct cifs_fid *fid)
{
bool validinum = false;
__u16 srchflgs;
@@ -673,6 +694,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
bool adjust_tz = false;
struct cifs_fattr fattr;
struct cifs_search_info *srchinf = NULL;
+ bool symlink = false;
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
@@ -702,12 +724,12 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
}
data = (FILE_ALL_INFO *)buf;
rc = server->ops->query_path_info(xid, tcon, cifs_sb, full_path,
- data, &adjust_tz);
+ data, &adjust_tz, &symlink);
}
if (!rc) {
- cifs_all_info_to_fattr(&fattr, (FILE_ALL_INFO *)data, cifs_sb,
- adjust_tz);
+ cifs_all_info_to_fattr(&fattr, data, cifs_sb, adjust_tz,
+ symlink);
} else if (rc == -EREMOTE) {
cifs_create_dfs_fattr(&fattr, sb);
rc = 0;
@@ -796,9 +818,10 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
/* check for Minshall+French symlinks */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
- tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+ tmprc = check_mf_symlink(xid, tcon, cifs_sb, &fattr,
+ full_path);
if (tmprc)
- cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc);
+ cifs_dbg(FYI, "check_mf_symlink: %d\n", tmprc);
}
if (!*inode) {
@@ -1027,7 +1050,8 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
{
int oplock = 0;
int rc;
- __u16 netfid;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
struct inode *inode = dentry->d_inode;
struct cifsInodeInfo *cifsInode = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -1050,10 +1074,16 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
goto out;
}
- rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
- DELETE|FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR,
- &netfid, &oplock, NULL, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = DELETE | FILE_WRITE_ATTRIBUTES;
+ oparms.create_options = CREATE_NOT_DIR;
+ oparms.disposition = FILE_OPEN;
+ oparms.path = full_path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = CIFS_open(xid, &oparms, &oplock, NULL);
if (rc != 0)
goto out;
@@ -1074,7 +1104,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
goto out_close;
}
info_buf->Attributes = cpu_to_le32(dosattr);
- rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid,
+ rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, fid.netfid,
current->tgid);
/* although we would like to mark the file hidden
if that fails we will still try to rename it */
@@ -1085,7 +1115,8 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
}
/* rename the file */
- rc = CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls,
+ rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, NULL,
+ cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
if (rc != 0) {
@@ -1094,8 +1125,8 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
}
/* try to set DELETE_ON_CLOSE */
- if (!cifsInode->delete_pending) {
- rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid,
+ if (!test_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags)) {
+ rc = CIFSSMBSetFileDisposition(xid, tcon, true, fid.netfid,
current->tgid);
/*
* some samba versions return -ENOENT when we try to set the
@@ -1111,11 +1142,11 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
rc = -EBUSY;
goto undo_rename;
}
- cifsInode->delete_pending = true;
+ set_bit(CIFS_INO_DELETE_PENDING, &cifsInode->flags);
}
out_close:
- CIFSSMBClose(xid, tcon, netfid);
+ CIFSSMBClose(xid, tcon, fid.netfid);
out:
kfree(info_buf);
cifs_put_tlink(tlink);
@@ -1127,13 +1158,13 @@ out:
* them anyway.
*/
undo_rename:
- CIFSSMBRenameOpenFile(xid, tcon, netfid, dentry->d_name.name,
+ CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, dentry->d_name.name,
cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
undo_setattr:
if (dosattr != origattr) {
info_buf->Attributes = cpu_to_le32(origattr);
- if (!CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid,
+ if (!CIFSSMBSetFileInfo(xid, tcon, info_buf, fid.netfid,
current->tgid))
cifsInode->cifsAttrs = origattr;
}
@@ -1544,7 +1575,8 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
struct tcon_link *tlink;
struct cifs_tcon *tcon;
struct TCP_Server_Info *server;
- __u16 srcfid;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
int oplock, rc;
tlink = cifs_sb_tlink(cifs_sb);
@@ -1571,17 +1603,23 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
if (to_dentry->d_parent != from_dentry->d_parent)
goto do_rename_exit;
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
/* open the file to be renamed -- we need DELETE perms */
- rc = CIFSSMBOpen(xid, tcon, from_path, FILE_OPEN, DELETE,
- CREATE_NOT_DIR, &srcfid, &oplock, NULL,
- cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ oparms.desired_access = DELETE;
+ oparms.create_options = CREATE_NOT_DIR;
+ oparms.disposition = FILE_OPEN;
+ oparms.path = from_path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = CIFS_open(xid, &oparms, &oplock, NULL);
if (rc == 0) {
- rc = CIFSSMBRenameOpenFile(xid, tcon, srcfid,
+ rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid,
(const char *) to_dentry->d_name.name,
cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
- CIFSSMBClose(xid, tcon, srcfid);
+ CIFSSMBClose(xid, tcon, fid.netfid);
}
do_rename_exit:
cifs_put_tlink(tlink);
@@ -1703,6 +1741,9 @@ cifs_inode_needs_reval(struct inode *inode)
if (cifs_i->time == 0)
return true;
+ if (!cifs_sb->actimeo)
+ return true;
+
if (!time_in_range(jiffies, cifs_i->time,
cifs_i->time + cifs_sb->actimeo))
return true;
@@ -1722,23 +1763,62 @@ int
cifs_invalidate_mapping(struct inode *inode)
{
int rc = 0;
- struct cifsInodeInfo *cifs_i = CIFS_I(inode);
-
- cifs_i->invalid_mapping = false;
if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
rc = invalidate_inode_pages2(inode->i_mapping);
- if (rc) {
+ if (rc)
cifs_dbg(VFS, "%s: could not invalidate inode %p\n",
__func__, inode);
- cifs_i->invalid_mapping = true;
- }
}
cifs_fscache_reset_inode_cookie(inode);
return rc;
}
+/**
+ * cifs_wait_bit_killable - helper for functions that are sleeping on bit locks
+ * @word: long word containing the bit lock
+ */
+static int
+cifs_wait_bit_killable(void *word)
+{
+ if (fatal_signal_pending(current))
+ return -ERESTARTSYS;
+ freezable_schedule_unsafe();
+ return 0;
+}
+
+int
+cifs_revalidate_mapping(struct inode *inode)
+{
+ int rc;
+ unsigned long *flags = &CIFS_I(inode)->flags;
+
+ rc = wait_on_bit_lock(flags, CIFS_INO_LOCK, cifs_wait_bit_killable,
+ TASK_KILLABLE);
+ if (rc)
+ return rc;
+
+ if (test_and_clear_bit(CIFS_INO_INVALID_MAPPING, flags)) {
+ rc = cifs_invalidate_mapping(inode);
+ if (rc)
+ set_bit(CIFS_INO_INVALID_MAPPING, flags);
+ }
+
+ clear_bit_unlock(CIFS_INO_LOCK, flags);
+ smp_mb__after_atomic();
+ wake_up_bit(flags, CIFS_INO_LOCK);
+
+ return rc;
+}
+
+int
+cifs_zap_mapping(struct inode *inode)
+{
+ set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(inode)->flags);
+ return cifs_revalidate_mapping(inode);
+}
+
int cifs_revalidate_file_attr(struct file *filp)
{
int rc = 0;
@@ -1805,9 +1885,7 @@ int cifs_revalidate_file(struct file *filp)
if (rc)
return rc;
- if (CIFS_I(inode)->invalid_mapping)
- rc = cifs_invalidate_mapping(inode);
- return rc;
+ return cifs_revalidate_mapping(inode);
}
/* revalidate a dentry's inode attributes */
@@ -1820,9 +1898,7 @@ int cifs_revalidate_dentry(struct dentry *dentry)
if (rc)
return rc;
- if (CIFS_I(inode)->invalid_mapping)
- rc = cifs_invalidate_mapping(inode);
- return rc;
+ return cifs_revalidate_mapping(inode);
}
int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 3e084558585..45cb59bcc79 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -3,7 +3,7 @@
*
* vfs operations that deal with io control
*
- * Copyright (C) International Business Machines Corp., 2005,2007
+ * Copyright (C) International Business Machines Corp., 2005,2013
* Author(s): Steve French (sfrench@us.ibm.com)
*
* This library is free software; you can redistribute it and/or modify
@@ -22,25 +22,132 @@
*/
#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
#include "cifspdu.h"
#include "cifsglob.h"
#include "cifsproto.h"
#include "cifs_debug.h"
#include "cifsfs.h"
+#define CIFS_IOCTL_MAGIC 0xCF
+#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int)
+
+static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
+ unsigned long srcfd, u64 off, u64 len, u64 destoff)
+{
+ int rc;
+ struct cifsFileInfo *smb_file_target = dst_file->private_data;
+ struct inode *target_inode = file_inode(dst_file);
+ struct cifs_tcon *target_tcon;
+ struct fd src_file;
+ struct cifsFileInfo *smb_file_src;
+ struct inode *src_inode;
+ struct cifs_tcon *src_tcon;
+
+ cifs_dbg(FYI, "ioctl clone range\n");
+ /* the destination must be opened for writing */
+ if (!(dst_file->f_mode & FMODE_WRITE)) {
+ cifs_dbg(FYI, "file target not open for write\n");
+ return -EINVAL;
+ }
+
+ /* check if target volume is readonly and take reference */
+ rc = mnt_want_write_file(dst_file);
+ if (rc) {
+ cifs_dbg(FYI, "mnt_want_write failed with rc %d\n", rc);
+ return rc;
+ }
+
+ src_file = fdget(srcfd);
+ if (!src_file.file) {
+ rc = -EBADF;
+ goto out_drop_write;
+ }
+
+ if ((!src_file.file->private_data) || (!dst_file->private_data)) {
+ rc = -EBADF;
+ cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
+ goto out_fput;
+ }
+
+ rc = -EXDEV;
+ smb_file_target = dst_file->private_data;
+ smb_file_src = src_file.file->private_data;
+ src_tcon = tlink_tcon(smb_file_src->tlink);
+ target_tcon = tlink_tcon(smb_file_target->tlink);
+
+ /* check if source and target are on same tree connection */
+ if (src_tcon != target_tcon) {
+ cifs_dbg(VFS, "file copy src and target on different volume\n");
+ goto out_fput;
+ }
+
+ src_inode = file_inode(src_file.file);
+
+ /*
+ * Note: cifs case is easier than btrfs since server responsible for
+ * checks for proper open modes and file type and if it wants
+ * server could even support copy of range where source = target
+ */
+
+ /* so we do not deadlock racing two ioctls on same files */
+ if (target_inode < src_inode) {
+ mutex_lock_nested(&target_inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD);
+ } else {
+ mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&target_inode->i_mutex, I_MUTEX_CHILD);
+ }
+
+ /* determine range to clone */
+ rc = -EINVAL;
+ if (off + len > src_inode->i_size || off + len < off)
+ goto out_unlock;
+ if (len == 0)
+ len = src_inode->i_size - off;
+
+ cifs_dbg(FYI, "about to flush pages\n");
+ /* should we flush first and last page first */
+ truncate_inode_pages_range(&target_inode->i_data, destoff,
+ PAGE_CACHE_ALIGN(destoff + len)-1);
+
+ if (target_tcon->ses->server->ops->clone_range)
+ rc = target_tcon->ses->server->ops->clone_range(xid,
+ smb_file_src, smb_file_target, off, len, destoff);
+
+ /* force revalidate of size and timestamps of target file now
+ that target is updated on the server */
+ CIFS_I(target_inode)->time = 0;
+out_unlock:
+ /* although unlocking in the reverse order from locking is not
+ strictly necessary here it is a little cleaner to be consistent */
+ if (target_inode < src_inode) {
+ mutex_unlock(&src_inode->i_mutex);
+ mutex_unlock(&target_inode->i_mutex);
+ } else {
+ mutex_unlock(&target_inode->i_mutex);
+ mutex_unlock(&src_inode->i_mutex);
+ }
+out_fput:
+ fdput(src_file);
+out_drop_write:
+ mnt_drop_write_file(dst_file);
+ return rc;
+}
+
long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
{
struct inode *inode = file_inode(filep);
int rc = -ENOTTY; /* strange error - but the precedent */
unsigned int xid;
struct cifs_sb_info *cifs_sb;
-#ifdef CONFIG_CIFS_POSIX
struct cifsFileInfo *pSMBFile = filep->private_data;
struct cifs_tcon *tcon;
__u64 ExtAttrBits = 0;
- __u64 ExtAttrMask = 0;
__u64 caps;
-#endif /* CONFIG_CIFS_POSIX */
xid = get_xid();
@@ -49,13 +156,14 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
cifs_sb = CIFS_SB(inode->i_sb);
switch (command) {
-#ifdef CONFIG_CIFS_POSIX
case FS_IOC_GETFLAGS:
if (pSMBFile == NULL)
break;
tcon = tlink_tcon(pSMBFile->tlink);
caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
+#ifdef CONFIG_CIFS_POSIX
if (CIFS_UNIX_EXTATTR_CAP & caps) {
+ __u64 ExtAttrMask = 0;
rc = CIFSGetExtAttr(xid, tcon,
pSMBFile->fid.netfid,
&ExtAttrBits, &ExtAttrMask);
@@ -63,29 +171,53 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
rc = put_user(ExtAttrBits &
FS_FL_USER_VISIBLE,
(int __user *)arg);
+ if (rc != EOPNOTSUPP)
+ break;
+ }
+#endif /* CONFIG_CIFS_POSIX */
+ rc = 0;
+ if (CIFS_I(inode)->cifsAttrs & ATTR_COMPRESSED) {
+ /* add in the compressed bit */
+ ExtAttrBits = FS_COMPR_FL;
+ rc = put_user(ExtAttrBits & FS_FL_USER_VISIBLE,
+ (int __user *)arg);
}
break;
-
case FS_IOC_SETFLAGS:
if (pSMBFile == NULL)
break;
tcon = tlink_tcon(pSMBFile->tlink);
caps = le64_to_cpu(tcon->fsUnixInfo.Capability);
- if (CIFS_UNIX_EXTATTR_CAP & caps) {
- if (get_user(ExtAttrBits, (int __user *)arg)) {
- rc = -EFAULT;
- break;
- }
- /*
- * rc = CIFSGetExtAttr(xid, tcon,
- * pSMBFile->fid.netfid,
- * extAttrBits,
- * &ExtAttrMask);
- */
+
+ if (get_user(ExtAttrBits, (int __user *)arg)) {
+ rc = -EFAULT;
+ break;
+ }
+
+ /*
+ * if (CIFS_UNIX_EXTATTR_CAP & caps)
+ * rc = CIFSSetExtAttr(xid, tcon,
+ * pSMBFile->fid.netfid,
+ * extAttrBits,
+ * &ExtAttrMask);
+ * if (rc != EOPNOTSUPP)
+ * break;
+ */
+
+ /* Currently only flag we can set is compressed flag */
+ if ((ExtAttrBits & FS_COMPR_FL) == 0)
+ break;
+
+ /* Try to set compress flag */
+ if (tcon->ses->server->ops->set_compression) {
+ rc = tcon->ses->server->ops->set_compression(
+ xid, tcon, pSMBFile);
+ cifs_dbg(FYI, "set compress flag rc %d\n", rc);
}
- cifs_dbg(FYI, "set flags not implemented yet\n");
break;
-#endif /* CONFIG_CIFS_POSIX */
+ case CIFS_IOC_COPYCHUNK_FILE:
+ rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0);
+ break;
default:
cifs_dbg(FYI, "unsupported ioctl\n");
break;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index cc0234710dd..68559fd557f 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -29,6 +29,10 @@
#include "cifs_debug.h"
#include "cifs_fs_sb.h"
+/*
+ * M-F Symlink Functions - Begin
+ */
+
#define CIFS_MF_SYMLINK_LEN_OFFSET (4+1)
#define CIFS_MF_SYMLINK_MD5_OFFSET (CIFS_MF_SYMLINK_LEN_OFFSET+(4+1))
#define CIFS_MF_SYMLINK_LINK_OFFSET (CIFS_MF_SYMLINK_MD5_OFFSET+(32+1))
@@ -91,10 +95,8 @@ symlink_hash_err:
}
static int
-CIFSParseMFSymlink(const u8 *buf,
- unsigned int buf_len,
- unsigned int *_link_len,
- char **_link_str)
+parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len,
+ char **_link_str)
{
int rc;
unsigned int link_len;
@@ -137,7 +139,7 @@ CIFSParseMFSymlink(const u8 *buf,
}
static int
-CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
+format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str)
{
int rc;
unsigned int link_len;
@@ -180,214 +182,114 @@ CIFSFormatMFSymlink(u8 *buf, unsigned int buf_len, const char *link_str)
return 0;
}
+bool
+couldbe_mf_symlink(const struct cifs_fattr *fattr)
+{
+ if (!S_ISREG(fattr->cf_mode))
+ /* it's not a symlink */
+ return false;
+
+ if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE)
+ /* it's not a symlink */
+ return false;
+
+ return true;
+}
+
static int
-CIFSCreateMFSymLink(const unsigned int xid, struct cifs_tcon *tcon,
- const char *fromName, const char *toName,
- struct cifs_sb_info *cifs_sb)
+create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const char *fromName,
+ const char *toName)
{
int rc;
- int oplock = 0;
- int remap;
- int create_options = CREATE_NOT_DIR;
- __u16 netfid = 0;
u8 *buf;
unsigned int bytes_written = 0;
- struct cifs_io_parms io_parms;
- struct nls_table *nls_codepage;
-
- nls_codepage = cifs_sb->local_nls;
- remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR;
buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
if (!buf)
return -ENOMEM;
- rc = CIFSFormatMFSymlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName);
- if (rc != 0) {
- kfree(buf);
- return rc;
- }
-
- if (backup_cred(cifs_sb))
- create_options |= CREATE_OPEN_BACKUP_INTENT;
-
- rc = CIFSSMBOpen(xid, tcon, fromName, FILE_CREATE, GENERIC_WRITE,
- create_options, &netfid, &oplock, NULL,
- nls_codepage, remap);
- if (rc != 0) {
- kfree(buf);
- return rc;
- }
-
- io_parms.netfid = netfid;
- io_parms.pid = current->tgid;
- io_parms.tcon = tcon;
- io_parms.offset = 0;
- io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
+ rc = format_mf_symlink(buf, CIFS_MF_SYMLINK_FILE_SIZE, toName);
+ if (rc)
+ goto out;
- rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, buf, NULL, 0);
- CIFSSMBClose(xid, tcon, netfid);
- kfree(buf);
- if (rc != 0)
- return rc;
+ rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, cifs_sb,
+ fromName, buf, &bytes_written);
+ if (rc)
+ goto out;
if (bytes_written != CIFS_MF_SYMLINK_FILE_SIZE)
- return -EIO;
-
- return 0;
+ rc = -EIO;
+out:
+ kfree(buf);
+ return rc;
}
static int
-CIFSQueryMFSymLink(const unsigned int xid, struct cifs_tcon *tcon,
- const unsigned char *searchName, char **symlinkinfo,
- const struct nls_table *nls_codepage, int remap)
+query_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const unsigned char *path,
+ char **symlinkinfo)
{
int rc;
- int oplock = 0;
- __u16 netfid = 0;
- u8 *buf;
- char *pbuf;
- unsigned int bytes_read = 0;
- int buf_type = CIFS_NO_BUFFER;
+ u8 *buf = NULL;
unsigned int link_len = 0;
- struct cifs_io_parms io_parms;
- FILE_ALL_INFO file_info;
-
- rc = CIFSSMBOpen(xid, tcon, searchName, FILE_OPEN, GENERIC_READ,
- CREATE_NOT_DIR, &netfid, &oplock, &file_info,
- nls_codepage, remap);
- if (rc != 0)
- return rc;
-
- if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
- CIFSSMBClose(xid, tcon, netfid);
- /* it's not a symlink */
- return -EINVAL;
- }
+ unsigned int bytes_read = 0;
buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
if (!buf)
return -ENOMEM;
- pbuf = buf;
- io_parms.netfid = netfid;
- io_parms.pid = current->tgid;
- io_parms.tcon = tcon;
- io_parms.offset = 0;
- io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
-
- rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type);
- CIFSSMBClose(xid, tcon, netfid);
- if (rc != 0) {
- kfree(buf);
- return rc;
- }
-
- rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, symlinkinfo);
- kfree(buf);
- if (rc != 0)
- return rc;
-
- return 0;
-}
-
-bool
-CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr)
-{
- if (!(fattr->cf_mode & S_IFREG))
- /* it's not a symlink */
- return false;
-
- if (fattr->cf_eof != CIFS_MF_SYMLINK_FILE_SIZE)
- /* it's not a symlink */
- return false;
-
- return true;
-}
-
-int
-open_query_close_cifs_symlink(const unsigned char *path, char *pbuf,
- unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb,
- unsigned int xid)
-{
- int rc;
- int oplock = 0;
- __u16 netfid = 0;
- struct tcon_link *tlink;
- struct cifs_tcon *ptcon;
- struct cifs_io_parms io_parms;
- int buf_type = CIFS_NO_BUFFER;
- FILE_ALL_INFO file_info;
- tlink = cifs_sb_tlink(cifs_sb);
- if (IS_ERR(tlink))
- return PTR_ERR(tlink);
- ptcon = tlink_tcon(tlink);
+ if (tcon->ses->server->ops->query_mf_symlink)
+ rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon,
+ cifs_sb, path, buf, &bytes_read);
+ else
+ rc = -ENOSYS;
- rc = CIFSSMBOpen(xid, ptcon, path, FILE_OPEN, GENERIC_READ,
- CREATE_NOT_DIR, &netfid, &oplock, &file_info,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc != 0) {
- cifs_put_tlink(tlink);
- return rc;
- }
+ if (rc)
+ goto out;
- if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
- CIFSSMBClose(xid, ptcon, netfid);
- cifs_put_tlink(tlink);
- /* it's not a symlink */
- return rc;
+ if (bytes_read == 0) { /* not a symlink */
+ rc = -EINVAL;
+ goto out;
}
- io_parms.netfid = netfid;
- io_parms.pid = current->tgid;
- io_parms.tcon = ptcon;
- io_parms.offset = 0;
- io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
-
- rc = CIFSSMBRead(xid, &io_parms, pbytes_read, &pbuf, &buf_type);
- CIFSSMBClose(xid, ptcon, netfid);
- cifs_put_tlink(tlink);
+ rc = parse_mf_symlink(buf, bytes_read, &link_len, symlinkinfo);
+out:
+ kfree(buf);
return rc;
}
-
int
-CIFSCheckMFSymlink(struct cifs_fattr *fattr,
- const unsigned char *path,
- struct cifs_sb_info *cifs_sb, unsigned int xid)
+check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
+ const unsigned char *path)
{
- int rc = 0;
+ int rc;
u8 *buf = NULL;
unsigned int link_len = 0;
unsigned int bytes_read = 0;
- struct cifs_tcon *ptcon;
- if (!CIFSCouldBeMFSymlink(fattr))
+ if (!couldbe_mf_symlink(fattr))
/* it's not a symlink */
return 0;
buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
- if (!buf) {
- rc = -ENOMEM;
- goto out;
- }
+ if (!buf)
+ return -ENOMEM;
- ptcon = tlink_tcon(cifs_sb_tlink(cifs_sb));
- if ((ptcon->ses) && (ptcon->ses->server->ops->query_mf_symlink))
- rc = ptcon->ses->server->ops->query_mf_symlink(path, buf,
- &bytes_read, cifs_sb, xid);
+ if (tcon->ses->server->ops->query_mf_symlink)
+ rc = tcon->ses->server->ops->query_mf_symlink(xid, tcon,
+ cifs_sb, path, buf, &bytes_read);
else
- goto out;
+ rc = -ENOSYS;
- if (rc != 0)
+ if (rc)
goto out;
if (bytes_read == 0) /* not a symlink */
goto out;
- rc = CIFSParseMFSymlink(buf, bytes_read, &link_len, NULL);
+ rc = parse_mf_symlink(buf, bytes_read, &link_len, NULL);
if (rc == -EINVAL) {
/* it's not a symlink */
rc = 0;
@@ -407,6 +309,95 @@ out:
return rc;
}
+/*
+ * SMB 1.0 Protocol specific functions
+ */
+
+int
+cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const unsigned char *path,
+ char *pbuf, unsigned int *pbytes_read)
+{
+ int rc;
+ int oplock = 0;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+ struct cifs_io_parms io_parms;
+ int buf_type = CIFS_NO_BUFFER;
+ FILE_ALL_INFO file_info;
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = GENERIC_READ;
+ oparms.create_options = CREATE_NOT_DIR;
+ oparms.disposition = FILE_OPEN;
+ oparms.path = path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = CIFS_open(xid, &oparms, &oplock, &file_info);
+ if (rc)
+ return rc;
+
+ if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE))
+ /* it's not a symlink */
+ goto out;
+
+ io_parms.netfid = fid.netfid;
+ io_parms.pid = current->tgid;
+ io_parms.tcon = tcon;
+ io_parms.offset = 0;
+ io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
+
+ rc = CIFSSMBRead(xid, &io_parms, pbytes_read, &pbuf, &buf_type);
+out:
+ CIFSSMBClose(xid, tcon, fid.netfid);
+ return rc;
+}
+
+int
+cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, const unsigned char *path,
+ char *pbuf, unsigned int *pbytes_written)
+{
+ int rc;
+ int oplock = 0;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+ struct cifs_io_parms io_parms;
+ int create_options = CREATE_NOT_DIR;
+
+ if (backup_cred(cifs_sb))
+ create_options |= CREATE_OPEN_BACKUP_INTENT;
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = GENERIC_WRITE;
+ oparms.create_options = create_options;
+ oparms.disposition = FILE_CREATE;
+ oparms.path = path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = CIFS_open(xid, &oparms, &oplock, NULL);
+ if (rc)
+ return rc;
+
+ io_parms.netfid = fid.netfid;
+ io_parms.pid = current->tgid;
+ io_parms.tcon = tcon;
+ io_parms.offset = 0;
+ io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
+
+ rc = CIFSSMBWrite(xid, &io_parms, pbytes_written, pbuf, NULL, 0);
+ CIFSSMBClose(xid, tcon, fid.netfid);
+ return rc;
+}
+
+/*
+ * M-F Symlink Functions - End
+ */
+
int
cifs_hardlink(struct dentry *old_file, struct inode *inode,
struct dentry *direntry)
@@ -442,8 +433,10 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
CIFS_MOUNT_MAP_SPECIAL_CHR);
else {
server = tcon->ses->server;
- if (!server->ops->create_hardlink)
- return -ENOSYS;
+ if (!server->ops->create_hardlink) {
+ rc = -ENOSYS;
+ goto cifs_hl_exit;
+ }
rc = server->ops->create_hardlink(xid, tcon, from_name, to_name,
cifs_sb);
if ((rc == -EIO) || (rc == -EINVAL))
@@ -534,15 +527,10 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
* and fallback to UNIX Extensions Symlinks.
*/
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
- rc = CIFSQueryMFSymLink(xid, tcon, full_path, &target_path,
- cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ rc = query_mf_symlink(xid, tcon, cifs_sb, full_path,
+ &target_path);
- if ((rc != 0) && cap_unix(tcon->ses))
- rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
- cifs_sb->local_nls);
- else if (rc != 0 && server->ops->query_symlink)
+ if (rc != 0 && server->ops->query_symlink)
rc = server->ops->query_symlink(xid, tcon, full_path,
&target_path, cifs_sb);
@@ -591,8 +579,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname)
/* BB what if DFS and this volume is on different share? BB */
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
- rc = CIFSCreateMFSymLink(xid, pTcon, full_path, symname,
- cifs_sb);
+ rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname);
else if (pTcon->unix_ext)
rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
cifs_sb->local_nls);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 138a011633f..3b0c62e622d 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -278,7 +278,7 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
}
static int
-check_smb_hdr(struct smb_hdr *smb, __u16 mid)
+check_smb_hdr(struct smb_hdr *smb)
{
/* does it have the right SMB "signature" ? */
if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) {
@@ -287,13 +287,6 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid)
return 1;
}
- /* Make sure that message ids match */
- if (mid != smb->Mid) {
- cifs_dbg(VFS, "Mids do not match. received=%u expected=%u\n",
- smb->Mid, mid);
- return 1;
- }
-
/* if it's a response then accept */
if (smb->Flags & SMBFLG_RESPONSE)
return 0;
@@ -302,7 +295,8 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid)
if (smb->Command == SMB_COM_LOCKING_ANDX)
return 0;
- cifs_dbg(VFS, "Server sent request, not response. mid=%u\n", smb->Mid);
+ cifs_dbg(VFS, "Server sent request, not response. mid=%u\n",
+ get_mid(smb));
return 1;
}
@@ -310,7 +304,6 @@ int
checkSMB(char *buf, unsigned int total_read)
{
struct smb_hdr *smb = (struct smb_hdr *)buf;
- __u16 mid = smb->Mid;
__u32 rfclen = be32_to_cpu(smb->smb_buf_length);
__u32 clc_len; /* calculated length */
cifs_dbg(FYI, "checkSMB Length: 0x%x, smb_buf_length: 0x%x\n",
@@ -348,7 +341,7 @@ checkSMB(char *buf, unsigned int total_read)
}
/* otherwise, there is enough to get to the BCC */
- if (check_smb_hdr(smb, mid))
+ if (check_smb_hdr(smb))
return -EIO;
clc_len = smbCalcSize(smb);
@@ -359,6 +352,7 @@ checkSMB(char *buf, unsigned int total_read)
}
if (4 + rfclen != clc_len) {
+ __u16 mid = get_mid(smb);
/* check if bcc wrapped around for large read responses */
if ((rfclen > 64 * 1024) && (rfclen > clc_len)) {
/* check if lengths match mod 64K */
@@ -366,11 +360,11 @@ checkSMB(char *buf, unsigned int total_read)
return 0; /* bcc wrapped */
}
cifs_dbg(FYI, "Calculated size %u vs length %u mismatch for mid=%u\n",
- clc_len, 4 + rfclen, smb->Mid);
+ clc_len, 4 + rfclen, mid);
if (4 + rfclen < clc_len) {
cifs_dbg(VFS, "RFC1001 size %u smaller than SMB for mid=%u\n",
- rfclen, smb->Mid);
+ rfclen, mid);
return -EIO;
} else if (rfclen > clc_len + 512) {
/*
@@ -383,7 +377,7 @@ checkSMB(char *buf, unsigned int total_read)
* data to 512 bytes.
*/
cifs_dbg(VFS, "RFC1001 size %u more than 512 bytes larger than SMB for mid=%u\n",
- rfclen, smb->Mid);
+ rfclen, mid);
return -EIO;
}
}
@@ -472,8 +466,22 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
cifs_dbg(FYI, "file id match, oplock break\n");
pCifsInode = CIFS_I(netfile->dentry->d_inode);
- cifs_set_oplock_level(pCifsInode,
- pSMB->OplockLevel ? OPLOCK_READ : 0);
+ set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
+ &pCifsInode->flags);
+
+ /*
+ * Set flag if the server downgrades the oplock
+ * to L2 else clear.
+ */
+ if (pSMB->OplockLevel)
+ set_bit(
+ CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &pCifsInode->flags);
+ else
+ clear_bit(
+ CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &pCifsInode->flags);
+
queue_work(cifsiod_wq,
&netfile->oplock_break);
netfile->oplock_break_cancelled = false;
@@ -557,6 +565,62 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
cinode->oplock = 0;
}
+static int
+cifs_oplock_break_wait(void *unused)
+{
+ schedule();
+ return signal_pending(current) ? -ERESTARTSYS : 0;
+}
+
+/*
+ * We wait for oplock breaks to be processed before we attempt to perform
+ * writes.
+ */
+int cifs_get_writer(struct cifsInodeInfo *cinode)
+{
+ int rc;
+
+start:
+ rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK,
+ cifs_oplock_break_wait, TASK_KILLABLE);
+ if (rc)
+ return rc;
+
+ spin_lock(&cinode->writers_lock);
+ if (!cinode->writers)
+ set_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
+ cinode->writers++;
+ /* Check to see if we have started servicing an oplock break */
+ if (test_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags)) {
+ cinode->writers--;
+ if (cinode->writers == 0) {
+ clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
+ wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS);
+ }
+ spin_unlock(&cinode->writers_lock);
+ goto start;
+ }
+ spin_unlock(&cinode->writers_lock);
+ return 0;
+}
+
+void cifs_put_writer(struct cifsInodeInfo *cinode)
+{
+ spin_lock(&cinode->writers_lock);
+ cinode->writers--;
+ if (cinode->writers == 0) {
+ clear_bit(CIFS_INODE_PENDING_WRITERS, &cinode->flags);
+ wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS);
+ }
+ spin_unlock(&cinode->writers_lock);
+}
+
+void cifs_done_oplock_break(struct cifsInodeInfo *cinode)
+{
+ clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags);
+ wake_up_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK);
+}
+
bool
backup_cred(struct cifs_sb_info *cifs_sb)
{
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index af847e1cf1c..6834b9c3bec 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -51,7 +51,7 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = {
{ERRnoaccess, -EACCES},
{ERRbadfid, -EBADF},
{ERRbadmcb, -EIO},
- {ERRnomem, -ENOMEM},
+ {ERRnomem, -EREMOTEIO},
{ERRbadmem, -EFAULT},
{ERRbadenv, -EFAULT},
{ERRbadformat, -EINVAL},
@@ -780,7 +780,9 @@ static const struct {
ERRDOS, ERRnoaccess, 0xc0000290}, {
ERRDOS, ERRbadfunc, 0xc000029c}, {
ERRDOS, ERRsymlink, NT_STATUS_STOPPED_ON_SYMLINK}, {
- ERRDOS, ERRinvlevel, 0x007c0001}, };
+ ERRDOS, ERRinvlevel, 0x007c0001}, {
+ 0, 0, 0 }
+};
/*****************************************************************************
Print an error message from the status code
@@ -793,8 +795,8 @@ cifs_print_status(__u32 status_code)
while (nt_errs[idx].nt_errstr != NULL) {
if (((nt_errs[idx].nt_errcode) & 0xFFFFFF) ==
(status_code & 0xFFFFFF)) {
- printk(KERN_NOTICE "Status code returned 0x%08x %s\n",
- status_code, nt_errs[idx].nt_errstr);
+ pr_notice("Status code returned 0x%08x %s\n",
+ status_code, nt_errs[idx].nt_errstr);
}
idx++;
}
@@ -939,8 +941,9 @@ cifs_UnixTimeToNT(struct timespec t)
return (u64) t.tv_sec * 10000000 + t.tv_nsec/100 + NTFS_TIME_OFFSET;
}
-static int total_days_of_prev_months[] =
-{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334};
+static const int total_days_of_prev_months[] = {
+ 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334
+};
struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
{
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 53a75f3d017..b15862e0f68 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -134,22 +134,6 @@ out:
dput(dentry);
}
-/*
- * Is it possible that this directory might turn out to be a DFS referral
- * once we go to try and use it?
- */
-static bool
-cifs_dfs_is_possible(struct cifs_sb_info *cifs_sb)
-{
-#ifdef CONFIG_CIFS_DFS_UPCALL
- struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
-
- if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
- return true;
-#endif
- return false;
-}
-
static void
cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
{
@@ -159,27 +143,19 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
fattr->cf_dtype = DT_DIR;
- /*
- * Windows CIFS servers generally make DFS referrals look
- * like directories in FIND_* responses with the reparse
- * attribute flag also set (since DFS junctions are
- * reparse points). We must revalidate at least these
- * directory inodes before trying to use them (if
- * they are DFS we will get PATH_NOT_COVERED back
- * when queried directly and can then try to connect
- * to the DFS target)
- */
- if (cifs_dfs_is_possible(cifs_sb) &&
- (fattr->cf_cifsattrs & ATTR_REPARSE))
- fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
- } else if (fattr->cf_cifsattrs & ATTR_REPARSE) {
- fattr->cf_mode = S_IFLNK;
- fattr->cf_dtype = DT_LNK;
} else {
fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
fattr->cf_dtype = DT_REG;
}
+ /*
+ * We need to revalidate it further to make a decision about whether it
+ * is a symbolic link, DFS referral or a reparse point with a direct
+ * access like junctions, deduplicated files, NFS symlinks.
+ */
+ if (fattr->cf_cifsattrs & ATTR_REPARSE)
+ fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
+
/* non-unix readdir doesn't provide nlink */
fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
@@ -773,7 +749,7 @@ static int cifs_filldir(char *find_entry, struct file *file,
}
if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) &&
- CIFSCouldBeMFSymlink(&fattr))
+ couldbe_mf_symlink(&fattr))
/*
* trying to get the type and mode can be slow,
* so just call those regular files for now, and mark
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 352358de1d7..e87387dbf39 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -500,9 +500,9 @@ select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
return NTLMv2;
if (global_secflags & CIFSSEC_MAY_NTLM)
return NTLM;
- /* Fallthrough */
default:
- return Unspecified;
+ /* Fallthrough to attempt LANMAN authentication next */
+ break;
}
case CIFS_NEGFLAVOR_LANMAN:
switch (requested) {
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 8233b174de3..d1fdfa84870 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -67,7 +67,7 @@ send_nt_cancel(struct TCP_Server_Info *server, void *buf,
mutex_unlock(&server->srv_mutex);
cifs_dbg(FYI, "issued NT_CANCEL for mid %u, rc = %d\n",
- in_buf->Mid, rc);
+ get_mid(in_buf), rc);
return rc;
}
@@ -101,7 +101,7 @@ cifs_find_mid(struct TCP_Server_Info *server, char *buffer)
spin_lock(&GlobalMid_Lock);
list_for_each_entry(mid, &server->pending_mid_q, qhead) {
- if (mid->mid == buf->Mid &&
+ if (compare_mid(mid->mid, buf) &&
mid->mid_state == MID_REQUEST_SUBMITTED &&
le16_to_cpu(mid->command) == buf->Command) {
spin_unlock(&GlobalMid_Lock);
@@ -372,6 +372,16 @@ coalesce_t2(char *second_buf, struct smb_hdr *target_hdr)
return 0;
}
+static void
+cifs_downgrade_oplock(struct TCP_Server_Info *server,
+ struct cifsInodeInfo *cinode, bool set_level2)
+{
+ if (set_level2)
+ cifs_set_oplock_level(cinode, OPLOCK_READ);
+ else
+ cifs_set_oplock_level(cinode, 0);
+}
+
static bool
cifs_check_trans2(struct mid_q_entry *mid, struct TCP_Server_Info *server,
char *buf, int malformed)
@@ -534,10 +544,12 @@ cifs_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
static int
cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
- FILE_ALL_INFO *data, bool *adjustTZ)
+ FILE_ALL_INFO *data, bool *adjustTZ, bool *symlink)
{
int rc;
+ *symlink = false;
+
/* could do find first instead but this returns more info */
rc = CIFSSMBQPathInfo(xid, tcon, full_path, data, 0 /* not legacy */,
cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
@@ -554,6 +566,30 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
CIFS_MOUNT_MAP_SPECIAL_CHR);
*adjustTZ = true;
}
+
+ if (!rc && (le32_to_cpu(data->Attributes) & ATTR_REPARSE)) {
+ int tmprc;
+ int oplock = 0;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.create_options = 0;
+ oparms.disposition = FILE_OPEN;
+ oparms.path = full_path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ /* Need to check if this is a symbolic link or not */
+ tmprc = CIFS_open(xid, &oparms, &oplock, NULL);
+ if (tmprc == -EOPNOTSUPP)
+ *symlink = true;
+ else
+ CIFSSMBClose(xid, tcon, fid.netfid);
+ }
+
return rc;
}
@@ -686,12 +722,7 @@ cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
oparms->cifs_sb->local_nls,
oparms->cifs_sb->mnt_cifs_flags
& CIFS_MOUNT_MAP_SPECIAL_CHR);
- return CIFSSMBOpen(xid, oparms->tcon, oparms->path,
- oparms->disposition, oparms->desired_access,
- oparms->create_options, &oparms->fid->netfid, oplock,
- buf, oparms->cifs_sb->local_nls,
- oparms->cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ return CIFS_open(xid, oparms, oplock, buf);
}
static void
@@ -742,8 +773,9 @@ smb_set_file_info(struct inode *inode, const char *full_path,
{
int oplock = 0;
int rc;
- __u16 netfid;
__u32 netpid;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
struct cifsFileInfo *open_file;
struct cifsInodeInfo *cinode = CIFS_I(inode);
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -753,7 +785,7 @@ smb_set_file_info(struct inode *inode, const char *full_path,
/* if the file is already open for write, just use that fileid */
open_file = find_writable_file(cinode, true);
if (open_file) {
- netfid = open_file->fid.netfid;
+ fid.netfid = open_file->fid.netfid;
netpid = open_file->pid;
tcon = tlink_tcon(open_file->tlink);
goto set_via_filehandle;
@@ -777,12 +809,17 @@ smb_set_file_info(struct inode *inode, const char *full_path,
goto out;
}
- cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n");
- rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
- SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, CREATE_NOT_DIR,
- &netfid, &oplock, NULL, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = SYNCHRONIZE | FILE_WRITE_ATTRIBUTES;
+ oparms.create_options = CREATE_NOT_DIR;
+ oparms.disposition = FILE_OPEN;
+ oparms.path = full_path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+ cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n");
+ rc = CIFS_open(xid, &oparms, &oplock, NULL);
if (rc != 0) {
if (rc == -EIO)
rc = -EINVAL;
@@ -792,12 +829,12 @@ smb_set_file_info(struct inode *inode, const char *full_path,
netpid = current->tgid;
set_via_filehandle:
- rc = CIFSSMBSetFileInfo(xid, tcon, buf, netfid, netpid);
+ rc = CIFSSMBSetFileInfo(xid, tcon, buf, fid.netfid, netpid);
if (!rc)
cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
if (open_file == NULL)
- CIFSSMBClose(xid, tcon, netfid);
+ CIFSSMBClose(xid, tcon, fid.netfid);
else
cifsFileInfo_put(open_file);
out:
@@ -807,6 +844,13 @@ out:
}
static int
+cifs_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifsFileInfo *cfile)
+{
+ return CIFSSMB_set_compression(xid, tcon, cfile->fid.netfid);
+}
+
+static int
cifs_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
const char *path, struct cifs_sb_info *cifs_sb,
struct cifs_fid *fid, __u16 search_flags,
@@ -882,33 +926,80 @@ cifs_mand_lock(const unsigned int xid, struct cifsFileInfo *cfile, __u64 offset,
}
static int
+cifs_unix_dfs_readlink(const unsigned int xid, struct cifs_tcon *tcon,
+ const unsigned char *searchName, char **symlinkinfo,
+ const struct nls_table *nls_codepage)
+{
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ int rc;
+ unsigned int num_referrals = 0;
+ struct dfs_info3_param *referrals = NULL;
+
+ rc = get_dfs_path(xid, tcon->ses, searchName, nls_codepage,
+ &num_referrals, &referrals, 0);
+
+ if (!rc && num_referrals > 0) {
+ *symlinkinfo = kstrndup(referrals->node_name,
+ strlen(referrals->node_name),
+ GFP_KERNEL);
+ if (!*symlinkinfo)
+ rc = -ENOMEM;
+ free_dfs_info_array(referrals, num_referrals);
+ }
+ return rc;
+#else /* No DFS support */
+ return -EREMOTE;
+#endif
+}
+
+static int
cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
const char *full_path, char **target_path,
struct cifs_sb_info *cifs_sb)
{
int rc;
int oplock = 0;
- __u16 netfid;
+ struct cifs_fid fid;
+ struct cifs_open_parms oparms;
cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
- rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
- FILE_READ_ATTRIBUTES, OPEN_REPARSE_POINT, &netfid,
- &oplock, NULL, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ /* Check for unix extensions */
+ if (cap_unix(tcon->ses)) {
+ rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path,
+ cifs_sb->local_nls);
+ if (rc == -EREMOTE)
+ rc = cifs_unix_dfs_readlink(xid, tcon, full_path,
+ target_path,
+ cifs_sb->local_nls);
+
+ goto out;
+ }
+
+ oparms.tcon = tcon;
+ oparms.cifs_sb = cifs_sb;
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.create_options = OPEN_REPARSE_POINT;
+ oparms.disposition = FILE_OPEN;
+ oparms.path = full_path;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = CIFS_open(xid, &oparms, &oplock, NULL);
if (rc)
- return rc;
+ goto out;
- rc = CIFSSMBQuerySymLink(xid, tcon, netfid, target_path,
+ rc = CIFSSMBQuerySymLink(xid, tcon, fid.netfid, target_path,
cifs_sb->local_nls);
- if (rc) {
- CIFSSMBClose(xid, tcon, netfid);
- return rc;
- }
+ if (rc)
+ goto out_close;
convert_delimiter(*target_path, '/');
- CIFSSMBClose(xid, tcon, netfid);
- cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
+out_close:
+ CIFSSMBClose(xid, tcon, fid.netfid);
+out:
+ if (!rc)
+ cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
return rc;
}
@@ -938,6 +1029,7 @@ struct smb_version_operations smb1_operations = {
.clear_stats = cifs_clear_stats,
.print_stats = cifs_print_stats,
.is_oplock_break = is_valid_oplock_break,
+ .downgrade_oplock = cifs_downgrade_oplock,
.check_trans2 = cifs_check_trans2,
.need_neg = cifs_need_neg,
.negotiate = cifs_negotiate,
@@ -956,6 +1048,7 @@ struct smb_version_operations smb1_operations = {
.set_path_size = CIFSSMBSetEOF,
.set_file_size = CIFSSMBSetFileSize,
.set_file_info = smb_set_file_info,
+ .set_compression = cifs_set_compression,
.echo = CIFSSMBEcho,
.mkdir = CIFSSMBMkDir,
.mkdir_setinfo = cifs_mkdir_setinfo,
@@ -982,8 +1075,18 @@ struct smb_version_operations smb1_operations = {
.mand_lock = cifs_mand_lock,
.mand_unlock_range = cifs_unlock_range,
.push_mand_locks = cifs_push_mandatory_locks,
- .query_mf_symlink = open_query_close_cifs_symlink,
+ .query_mf_symlink = cifs_query_mf_symlink,
+ .create_mf_symlink = cifs_create_mf_symlink,
.is_read_op = cifs_is_read_op,
+#ifdef CONFIG_CIFS_XATTR
+ .query_all_EAs = CIFSSMBQAllEAs,
+ .set_EA = CIFSSMBSetEA,
+#endif /* CIFS_XATTR */
+#ifdef CONFIG_CIFS_ACL
+ .get_acl = get_cifs_acl,
+ .get_acl_by_fid = get_cifs_acl_by_fid,
+ .set_acl = set_cifs_acl,
+#endif /* CIFS_ACL */
};
struct smb_version_values smb1_values = {
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index c38350851b0..bc0bb9c34f7 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -57,4 +57,7 @@
#define SMB2_CMACAES_SIZE (16)
#define SMB3_SIGNKEY_SIZE (16)
+/* Maximum buffer size value we can send with 1 credit */
+#define SMB2_MAX_BUFFER_SIZE 65536
+
#endif /* _SMB2_GLOB_H */
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 78ff88c467b..84c012a6aba 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -123,12 +123,13 @@ move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src)
int
smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
- FILE_ALL_INFO *data, bool *adjust_tz)
+ FILE_ALL_INFO *data, bool *adjust_tz, bool *symlink)
{
int rc;
struct smb2_file_all_info *smb2_data;
*adjust_tz = false;
+ *symlink = false;
smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2,
GFP_KERNEL);
@@ -136,9 +137,16 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
return -ENOMEM;
rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path,
- FILE_READ_ATTRIBUTES, FILE_OPEN,
- OPEN_REPARSE_POINT, smb2_data,
- SMB2_OP_QUERY_INFO);
+ FILE_READ_ATTRIBUTES, FILE_OPEN, 0,
+ smb2_data, SMB2_OP_QUERY_INFO);
+ if (rc == -EOPNOTSUPP) {
+ *symlink = true;
+ /* Failed on a symbolic link - query a reparse point info */
+ rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path,
+ FILE_READ_ATTRIBUTES, FILE_OPEN,
+ OPEN_REPARSE_POINT, smb2_data,
+ SMB2_OP_QUERY_INFO);
+ }
if (rc)
goto out;
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 7c2f45c06fc..94bd4fbb13d 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -306,7 +306,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_NONEXISTENT_SECTOR, -EIO, "STATUS_NONEXISTENT_SECTOR"},
{STATUS_MORE_PROCESSING_REQUIRED, -EIO,
"STATUS_MORE_PROCESSING_REQUIRED"},
- {STATUS_NO_MEMORY, -ENOMEM, "STATUS_NO_MEMORY"},
+ {STATUS_NO_MEMORY, -EREMOTEIO, "STATUS_NO_MEMORY"},
{STATUS_CONFLICTING_ADDRESSES, -EADDRINUSE,
"STATUS_CONFLICTING_ADDRESSES"},
{STATUS_NOT_MAPPED_VIEW, -EIO, "STATUS_NOT_MAPPED_VIEW"},
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index fb3966265b6..b8021fde987 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -575,9 +575,21 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
else
cfile->oplock_break_cancelled = false;
- server->ops->set_oplock_level(cinode,
- rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0,
- 0, NULL);
+ set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK,
+ &cinode->flags);
+
+ /*
+ * Set flag if the server downgrades the oplock
+ * to L2 else clear.
+ */
+ if (rsp->OplockLevel)
+ set_bit(
+ CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &cinode->flags);
+ else
+ clear_bit(
+ CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
+ &cinode->flags);
queue_work(cifsiod_wq, &cfile->oplock_break);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 861b3321414..787844bde38 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -182,11 +182,8 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
/* start with specified wsize, or default */
wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE;
wsize = min_t(unsigned int, wsize, server->max_write);
- /*
- * limit write size to 2 ** 16, because we don't support multicredit
- * requests now.
- */
- wsize = min_t(unsigned int, wsize, 2 << 15);
+ /* set it to the maximum buffer size value we can send with 1 credit */
+ wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
return wsize;
}
@@ -200,15 +197,100 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
/* start with specified rsize, or default */
rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE;
rsize = min_t(unsigned int, rsize, server->max_read);
- /*
- * limit write size to 2 ** 16, because we don't support multicredit
- * requests now.
- */
- rsize = min_t(unsigned int, rsize, 2 << 15);
+ /* set it to the maximum buffer size value we can send with 1 credit */
+ rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
return rsize;
}
+#ifdef CONFIG_CIFS_STATS2
+static int
+SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
+{
+ int rc;
+ unsigned int ret_data_len = 0;
+ struct network_interface_info_ioctl_rsp *out_buf;
+
+ rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
+ FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */,
+ NULL /* no data input */, 0 /* no data input */,
+ (char **)&out_buf, &ret_data_len);
+
+ if ((rc == 0) && (ret_data_len > 0)) {
+ /* Dump info on first interface */
+ cifs_dbg(FYI, "Adapter Capability 0x%x\t",
+ le32_to_cpu(out_buf->Capability));
+ cifs_dbg(FYI, "Link Speed %lld\n",
+ le64_to_cpu(out_buf->LinkSpeed));
+ } else
+ cifs_dbg(VFS, "error %d on ioctl to get interface list\n", rc);
+
+ return rc;
+}
+#endif /* STATS2 */
+
+static void
+smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
+{
+ int rc;
+ __le16 srch_path = 0; /* Null - open root of share */
+ u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct cifs_open_parms oparms;
+ struct cifs_fid fid;
+
+ oparms.tcon = tcon;
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.disposition = FILE_OPEN;
+ oparms.create_options = 0;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL);
+ if (rc)
+ return;
+
+#ifdef CONFIG_CIFS_STATS2
+ SMB3_request_interfaces(xid, tcon);
+#endif /* STATS2 */
+
+ SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+ FS_ATTRIBUTE_INFORMATION);
+ SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+ FS_DEVICE_INFORMATION);
+ SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+ FS_SECTOR_SIZE_INFORMATION); /* SMB3 specific */
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+ return;
+}
+
+static void
+smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
+{
+ int rc;
+ __le16 srch_path = 0; /* Null - open root of share */
+ u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+ struct cifs_open_parms oparms;
+ struct cifs_fid fid;
+
+ oparms.tcon = tcon;
+ oparms.desired_access = FILE_READ_ATTRIBUTES;
+ oparms.disposition = FILE_OPEN;
+ oparms.create_options = 0;
+ oparms.fid = &fid;
+ oparms.reconnect = false;
+
+ rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL);
+ if (rc)
+ return;
+
+ SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+ FS_ATTRIBUTE_INFORMATION);
+ SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid,
+ FS_DEVICE_INFORMATION);
+ SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
+ return;
+}
+
static int
smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path)
@@ -304,7 +386,19 @@ smb2_dump_share_caps(struct seq_file *m, struct cifs_tcon *tcon)
seq_puts(m, " ASYMMETRIC,");
if (tcon->capabilities == 0)
seq_puts(m, " None");
+ if (tcon->ss_flags & SSINFO_FLAGS_ALIGNED_DEVICE)
+ seq_puts(m, " Aligned,");
+ if (tcon->ss_flags & SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE)
+ seq_puts(m, " Partition Aligned,");
+ if (tcon->ss_flags & SSINFO_FLAGS_NO_SEEK_PENALTY)
+ seq_puts(m, " SSD,");
+ if (tcon->ss_flags & SSINFO_FLAGS_TRIM_ENABLED)
+ seq_puts(m, " TRIM-support,");
+
seq_printf(m, "\tShare Flags: 0x%x", tcon->share_flags);
+ if (tcon->perf_sector_size)
+ seq_printf(m, "\tOptimal sector size: 0x%x",
+ tcon->perf_sector_size);
}
static void
@@ -394,6 +488,157 @@ smb2_close_file(const unsigned int xid, struct cifs_tcon *tcon,
}
static int
+SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid,
+ struct copychunk_ioctl *pcchunk)
+{
+ int rc;
+ unsigned int ret_data_len;
+ struct resume_key_req *res_key;
+
+ rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
+ FSCTL_SRV_REQUEST_RESUME_KEY, true /* is_fsctl */,
+ NULL, 0 /* no input */,
+ (char **)&res_key, &ret_data_len);
+
+ if (rc) {
+ cifs_dbg(VFS, "refcpy ioctl error %d getting resume key\n", rc);
+ goto req_res_key_exit;
+ }
+ if (ret_data_len < sizeof(struct resume_key_req)) {
+ cifs_dbg(VFS, "Invalid refcopy resume key length\n");
+ rc = -EINVAL;
+ goto req_res_key_exit;
+ }
+ memcpy(pcchunk->SourceKey, res_key->ResumeKey, COPY_CHUNK_RES_KEY_SIZE);
+
+req_res_key_exit:
+ kfree(res_key);
+ return rc;
+}
+
+static int
+smb2_clone_range(const unsigned int xid,
+ struct cifsFileInfo *srcfile,
+ struct cifsFileInfo *trgtfile, u64 src_off,
+ u64 len, u64 dest_off)
+{
+ int rc;
+ unsigned int ret_data_len;
+ struct copychunk_ioctl *pcchunk;
+ struct copychunk_ioctl_rsp *retbuf = NULL;
+ struct cifs_tcon *tcon;
+ int chunks_copied = 0;
+ bool chunk_sizes_updated = false;
+
+ pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL);
+
+ if (pcchunk == NULL)
+ return -ENOMEM;
+
+ cifs_dbg(FYI, "in smb2_clone_range - about to call request res key\n");
+ /* Request a key from the server to identify the source of the copy */
+ rc = SMB2_request_res_key(xid, tlink_tcon(srcfile->tlink),
+ srcfile->fid.persistent_fid,
+ srcfile->fid.volatile_fid, pcchunk);
+
+ /* Note: request_res_key sets res_key null only if rc !=0 */
+ if (rc)
+ goto cchunk_out;
+
+ /* For now array only one chunk long, will make more flexible later */
+ pcchunk->ChunkCount = __constant_cpu_to_le32(1);
+ pcchunk->Reserved = 0;
+ pcchunk->Reserved2 = 0;
+
+ tcon = tlink_tcon(trgtfile->tlink);
+
+ while (len > 0) {
+ pcchunk->SourceOffset = cpu_to_le64(src_off);
+ pcchunk->TargetOffset = cpu_to_le64(dest_off);
+ pcchunk->Length =
+ cpu_to_le32(min_t(u32, len, tcon->max_bytes_chunk));
+
+ /* Request server copy to target from src identified by key */
+ rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
+ trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE,
+ true /* is_fsctl */, (char *)pcchunk,
+ sizeof(struct copychunk_ioctl), (char **)&retbuf,
+ &ret_data_len);
+ if (rc == 0) {
+ if (ret_data_len !=
+ sizeof(struct copychunk_ioctl_rsp)) {
+ cifs_dbg(VFS, "invalid cchunk response size\n");
+ rc = -EIO;
+ goto cchunk_out;
+ }
+ if (retbuf->TotalBytesWritten == 0) {
+ cifs_dbg(FYI, "no bytes copied\n");
+ rc = -EIO;
+ goto cchunk_out;
+ }
+ /*
+ * Check if server claimed to write more than we asked
+ */
+ if (le32_to_cpu(retbuf->TotalBytesWritten) >
+ le32_to_cpu(pcchunk->Length)) {
+ cifs_dbg(VFS, "invalid copy chunk response\n");
+ rc = -EIO;
+ goto cchunk_out;
+ }
+ if (le32_to_cpu(retbuf->ChunksWritten) != 1) {
+ cifs_dbg(VFS, "invalid num chunks written\n");
+ rc = -EIO;
+ goto cchunk_out;
+ }
+ chunks_copied++;
+
+ src_off += le32_to_cpu(retbuf->TotalBytesWritten);
+ dest_off += le32_to_cpu(retbuf->TotalBytesWritten);
+ len -= le32_to_cpu(retbuf->TotalBytesWritten);
+
+ cifs_dbg(FYI, "Chunks %d PartialChunk %d Total %d\n",
+ le32_to_cpu(retbuf->ChunksWritten),
+ le32_to_cpu(retbuf->ChunkBytesWritten),
+ le32_to_cpu(retbuf->TotalBytesWritten));
+ } else if (rc == -EINVAL) {
+ if (ret_data_len != sizeof(struct copychunk_ioctl_rsp))
+ goto cchunk_out;
+
+ cifs_dbg(FYI, "MaxChunks %d BytesChunk %d MaxCopy %d\n",
+ le32_to_cpu(retbuf->ChunksWritten),
+ le32_to_cpu(retbuf->ChunkBytesWritten),
+ le32_to_cpu(retbuf->TotalBytesWritten));
+
+ /*
+ * Check if this is the first request using these sizes,
+ * (ie check if copy succeed once with original sizes
+ * and check if the server gave us different sizes after
+ * we already updated max sizes on previous request).
+ * if not then why is the server returning an error now
+ */
+ if ((chunks_copied != 0) || chunk_sizes_updated)
+ goto cchunk_out;
+
+ /* Check that server is not asking us to grow size */
+ if (le32_to_cpu(retbuf->ChunkBytesWritten) <
+ tcon->max_bytes_chunk)
+ tcon->max_bytes_chunk =
+ le32_to_cpu(retbuf->ChunkBytesWritten);
+ else
+ goto cchunk_out; /* server gave us bogus size */
+
+ /* No need to change MaxChunks since already set to 1 */
+ chunk_sizes_updated = true;
+ }
+ }
+
+cchunk_out:
+ kfree(pcchunk);
+ return rc;
+}
+
+static int
smb2_flush_file(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_fid *fid)
{
@@ -446,6 +691,14 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
}
static int
+smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifsFileInfo *cfile)
+{
+ return SMB2_set_compression(xid, tcon, cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid);
+}
+
+static int
smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
const char *path, struct cifs_sb_info *cifs_sb,
struct cifs_fid *fid, __u16 search_flags,
@@ -652,6 +905,17 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
}
static void
+smb2_downgrade_oplock(struct TCP_Server_Info *server,
+ struct cifsInodeInfo *cinode, bool set_level2)
+{
+ if (set_level2)
+ server->ops->set_oplock_level(cinode, SMB2_OPLOCK_LEVEL_II,
+ 0, NULL);
+ else
+ server->ops->set_oplock_level(cinode, 0, 0, NULL);
+}
+
+static void
smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
unsigned int epoch, bool *purge_cache)
{
@@ -783,6 +1047,7 @@ smb2_create_lease_buf(u8 *lease_key, u8 oplock)
buf->ccontext.NameOffset = cpu_to_le16(offsetof
(struct create_lease, Name));
buf->ccontext.NameLength = cpu_to_le16(4);
+ /* SMB2_CREATE_REQUEST_LEASE is "RqLs" */
buf->Name[0] = 'R';
buf->Name[1] = 'q';
buf->Name[2] = 'L';
@@ -809,6 +1074,7 @@ smb3_create_lease_buf(u8 *lease_key, u8 oplock)
buf->ccontext.NameOffset = cpu_to_le16(offsetof
(struct create_lease_v2, Name));
buf->ccontext.NameLength = cpu_to_le16(4);
+ /* SMB2_CREATE_REQUEST_LEASE is "RqLs" */
buf->Name[0] = 'R';
buf->Name[1] = 'q';
buf->Name[2] = 'L';
@@ -857,6 +1123,7 @@ struct smb_version_operations smb20_operations = {
.clear_stats = smb2_clear_stats,
.print_stats = smb2_print_stats,
.is_oplock_break = smb2_is_valid_oplock_break,
+ .downgrade_oplock = smb2_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb2_negotiate_wsize,
@@ -865,6 +1132,7 @@ struct smb_version_operations smb20_operations = {
.logoff = SMB2_logoff,
.tree_connect = SMB2_tcon,
.tree_disconnect = SMB2_tdis,
+ .qfs_tcon = smb2_qfs_tcon,
.is_path_accessible = smb2_is_path_accessible,
.can_echo = smb2_can_echo,
.echo = SMB2_echo,
@@ -874,6 +1142,7 @@ struct smb_version_operations smb20_operations = {
.set_path_size = smb2_set_path_size,
.set_file_size = smb2_set_file_size,
.set_file_info = smb2_set_file_info,
+ .set_compression = smb2_set_compression,
.mkdir = smb2_mkdir,
.mkdir_setinfo = smb2_mkdir_setinfo,
.rmdir = smb2_rmdir,
@@ -907,6 +1176,7 @@ struct smb_version_operations smb20_operations = {
.set_oplock_level = smb2_set_oplock_level,
.create_lease_buf = smb2_create_lease_buf,
.parse_lease_buf = smb2_parse_lease_buf,
+ .clone_range = smb2_clone_range,
};
struct smb_version_operations smb21_operations = {
@@ -928,6 +1198,7 @@ struct smb_version_operations smb21_operations = {
.clear_stats = smb2_clear_stats,
.print_stats = smb2_print_stats,
.is_oplock_break = smb2_is_valid_oplock_break,
+ .downgrade_oplock = smb2_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb2_negotiate_wsize,
@@ -936,6 +1207,7 @@ struct smb_version_operations smb21_operations = {
.logoff = SMB2_logoff,
.tree_connect = SMB2_tcon,
.tree_disconnect = SMB2_tdis,
+ .qfs_tcon = smb2_qfs_tcon,
.is_path_accessible = smb2_is_path_accessible,
.can_echo = smb2_can_echo,
.echo = SMB2_echo,
@@ -945,6 +1217,7 @@ struct smb_version_operations smb21_operations = {
.set_path_size = smb2_set_path_size,
.set_file_size = smb2_set_file_size,
.set_file_info = smb2_set_file_info,
+ .set_compression = smb2_set_compression,
.mkdir = smb2_mkdir,
.mkdir_setinfo = smb2_mkdir_setinfo,
.rmdir = smb2_rmdir,
@@ -978,6 +1251,7 @@ struct smb_version_operations smb21_operations = {
.set_oplock_level = smb21_set_oplock_level,
.create_lease_buf = smb2_create_lease_buf,
.parse_lease_buf = smb2_parse_lease_buf,
+ .clone_range = smb2_clone_range,
};
struct smb_version_operations smb30_operations = {
@@ -1000,6 +1274,7 @@ struct smb_version_operations smb30_operations = {
.print_stats = smb2_print_stats,
.dump_share_caps = smb2_dump_share_caps,
.is_oplock_break = smb2_is_valid_oplock_break,
+ .downgrade_oplock = smb2_downgrade_oplock,
.need_neg = smb2_need_neg,
.negotiate = smb2_negotiate,
.negotiate_wsize = smb2_negotiate_wsize,
@@ -1008,6 +1283,7 @@ struct smb_version_operations smb30_operations = {
.logoff = SMB2_logoff,
.tree_connect = SMB2_tcon,
.tree_disconnect = SMB2_tdis,
+ .qfs_tcon = smb3_qfs_tcon,
.is_path_accessible = smb2_is_path_accessible,
.can_echo = smb2_can_echo,
.echo = SMB2_echo,
@@ -1017,6 +1293,7 @@ struct smb_version_operations smb30_operations = {
.set_path_size = smb2_set_path_size,
.set_file_size = smb2_set_file_size,
.set_file_info = smb2_set_file_info,
+ .set_compression = smb2_set_compression,
.mkdir = smb2_mkdir,
.mkdir_setinfo = smb2_mkdir_setinfo,
.rmdir = smb2_rmdir,
@@ -1051,6 +1328,8 @@ struct smb_version_operations smb30_operations = {
.set_oplock_level = smb3_set_oplock_level,
.create_lease_buf = smb3_create_lease_buf,
.parse_lease_buf = smb3_parse_lease_buf,
+ .clone_range = smb2_clone_range,
+ .validate_negotiate = smb3_validate_negotiate,
};
struct smb_version_values smb20_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index eba0efde66d..b0b260dbb19 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -375,7 +375,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
req->Capabilities = cpu_to_le32(ses->server->vals->req_capabilities);
- memcpy(req->ClientGUID, cifs_client_guid, SMB2_CLIENT_GUID_SIZE);
+ /* ClientGUID must be zero for SMB2.02 dialect */
+ if (ses->server->vals->protocol_id == SMB20_PROT_ID)
+ memset(req->ClientGUID, 0, SMB2_CLIENT_GUID_SIZE);
+ else
+ memcpy(req->ClientGUID, server->client_guid,
+ SMB2_CLIENT_GUID_SIZE);
iov[0].iov_base = (char *)req;
/* 4 for rfc1002 length field */
@@ -413,7 +418,9 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
/* SMB2 only has an extended negflavor */
server->negflavor = CIFS_NEGFLAVOR_EXTENDED;
- server->maxBuf = le32_to_cpu(rsp->MaxTransactSize);
+ /* set it to the maximum buffer size value we can send with 1 credit */
+ server->maxBuf = min_t(unsigned int, le32_to_cpu(rsp->MaxTransactSize),
+ SMB2_MAX_BUFFER_SIZE);
server->max_read = le32_to_cpu(rsp->MaxReadSize);
server->max_write = le32_to_cpu(rsp->MaxWriteSize);
/* BB Do we need to validate the SecurityMode? */
@@ -454,6 +461,82 @@ neg_exit:
return rc;
}
+int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
+{
+ int rc = 0;
+ struct validate_negotiate_info_req vneg_inbuf;
+ struct validate_negotiate_info_rsp *pneg_rsp;
+ u32 rsplen;
+
+ cifs_dbg(FYI, "validate negotiate\n");
+
+ /*
+ * validation ioctl must be signed, so no point sending this if we
+ * can not sign it. We could eventually change this to selectively
+ * sign just this, the first and only signed request on a connection.
+ * This is good enough for now since a user who wants better security
+ * would also enable signing on the mount. Having validation of
+ * negotiate info for signed connections helps reduce attack vectors
+ */
+ if (tcon->ses->server->sign == false)
+ return 0; /* validation requires signing */
+
+ vneg_inbuf.Capabilities =
+ cpu_to_le32(tcon->ses->server->vals->req_capabilities);
+ memcpy(vneg_inbuf.Guid, tcon->ses->server->client_guid,
+ SMB2_CLIENT_GUID_SIZE);
+
+ if (tcon->ses->sign)
+ vneg_inbuf.SecurityMode =
+ cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED);
+ else if (global_secflags & CIFSSEC_MAY_SIGN)
+ vneg_inbuf.SecurityMode =
+ cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED);
+ else
+ vneg_inbuf.SecurityMode = 0;
+
+ vneg_inbuf.DialectCount = cpu_to_le16(1);
+ vneg_inbuf.Dialects[0] =
+ cpu_to_le16(tcon->ses->server->vals->protocol_id);
+
+ rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
+ FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */,
+ (char *)&vneg_inbuf, sizeof(struct validate_negotiate_info_req),
+ (char **)&pneg_rsp, &rsplen);
+
+ if (rc != 0) {
+ cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc);
+ return -EIO;
+ }
+
+ if (rsplen != sizeof(struct validate_negotiate_info_rsp)) {
+ cifs_dbg(VFS, "invalid size of protocol negotiate response\n");
+ return -EIO;
+ }
+
+ /* check validate negotiate info response matches what we got earlier */
+ if (pneg_rsp->Dialect !=
+ cpu_to_le16(tcon->ses->server->vals->protocol_id))
+ goto vneg_out;
+
+ if (pneg_rsp->SecurityMode != cpu_to_le16(tcon->ses->server->sec_mode))
+ goto vneg_out;
+
+ /* do not validate server guid because not saved at negprot time yet */
+
+ if ((le32_to_cpu(pneg_rsp->Capabilities) | SMB2_NT_FIND |
+ SMB2_LARGE_FILES) != tcon->ses->server->capabilities)
+ goto vneg_out;
+
+ /* validate negotiate successful */
+ cifs_dbg(FYI, "validate negotiate info successful\n");
+ return 0;
+
+vneg_out:
+ cifs_dbg(VFS, "protocol revalidation - security settings mismatch\n");
+ return -EIO;
+}
+
int
SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
const struct nls_table *nls_cp)
@@ -630,6 +713,8 @@ ssetup_ntlmssp_authenticate:
goto ssetup_exit;
ses->session_flags = le16_to_cpu(rsp->SessionFlags);
+ if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
+ cifs_dbg(VFS, "SMB3 encryption not supported yet\n");
ssetup_exit:
free_rsp_buf(resp_buftype, rsp);
@@ -687,6 +772,10 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
else
return -EIO;
+ /* no need to send SMB logoff if uid already closed due to reconnect */
+ if (ses->need_reconnect)
+ goto smb2_session_already_dead;
+
rc = small_smb2_init(SMB2_LOGOFF, NULL, (void **) &req);
if (rc)
return rc;
@@ -701,6 +790,8 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
* No tcon so can't do
* cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]);
*/
+
+smb2_session_already_dead:
return rc;
}
@@ -711,6 +802,14 @@ static inline void cifs_stats_fail_inc(struct cifs_tcon *tcon, uint16_t code)
#define MAX_SHARENAME_LENGTH (255 /* server */ + 80 /* share */ + 1 /* NULL */)
+/* These are similar values to what Windows uses */
+static inline void init_copy_chunk_defaults(struct cifs_tcon *tcon)
+{
+ tcon->max_chunks = 256;
+ tcon->max_bytes_chunk = 1048576;
+ tcon->max_bytes_copy = 16777216;
+}
+
int
SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
struct cifs_tcon *tcon, const struct nls_table *cp)
@@ -812,7 +911,9 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
((tcon->share_flags & SHI1005_FLAGS_DFS) == 0))
cifs_dbg(VFS, "DFS capability contradicts DFS flag\n");
-
+ init_copy_chunk_defaults(tcon);
+ if (tcon->ses->server->ops->validate_negotiate)
+ rc = tcon->ses->server->ops->validate_negotiate(xid, tcon);
tcon_exit:
free_rsp_buf(resp_buftype, rsp);
kfree(unc_path);
@@ -871,6 +972,7 @@ create_durable_buf(void)
buf->ccontext.NameOffset = cpu_to_le16(offsetof
(struct create_durable, Name));
buf->ccontext.NameLength = cpu_to_le16(4);
+ /* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DHnQ" */
buf->Name[0] = 'D';
buf->Name[1] = 'H';
buf->Name[2] = 'n';
@@ -895,6 +997,7 @@ create_reconnect_durable_buf(struct cifs_fid *fid)
buf->ccontext.NameLength = cpu_to_le16(4);
buf->Data.Fid.PersistentFileId = fid->persistent_fid;
buf->Data.Fid.VolatileFileId = fid->volatile_fid;
+ /* SMB2_CREATE_DURABLE_HANDLE_RECONNECT is "DHnC" */
buf->Name[0] = 'D';
buf->Name[1] = 'H';
buf->Name[2] = 'n';
@@ -994,6 +1097,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
int rc = 0;
unsigned int num_iovecs = 2;
__u32 file_attributes = 0;
+ char *dhc_buf = NULL, *lc_buf = NULL;
cifs_dbg(FYI, "create/open\n");
@@ -1060,6 +1164,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
kfree(copy_path);
return rc;
}
+ lc_buf = iov[num_iovecs-1].iov_base;
}
if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) {
@@ -1074,9 +1179,10 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
if (rc) {
cifs_small_buf_release(req);
kfree(copy_path);
- kfree(iov[num_iovecs-1].iov_base);
+ kfree(lc_buf);
return rc;
}
+ dhc_buf = iov[num_iovecs-1].iov_base;
}
rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
@@ -1108,6 +1214,8 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
*oplock = rsp->OplockLevel;
creat_exit:
kfree(copy_path);
+ kfree(lc_buf);
+ kfree(dhc_buf);
free_rsp_buf(resp_buftype, rsp);
return rc;
}
@@ -1131,6 +1239,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
cifs_dbg(FYI, "SMB2 IOCTL\n");
+ *out_data = NULL;
/* zero out returned data len, in case of error */
if (plen)
*plen = 0;
@@ -1176,19 +1285,38 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
req->Flags = 0;
iov[0].iov_base = (char *)req;
- /* 4 for rfc1002 length field */
- iov[0].iov_len = get_rfc1002_length(req) + 4;
- if (indatalen)
- inc_rfc1001_len(req, indatalen);
+ /*
+ * If no input data, the size of ioctl struct in
+ * protocol spec still includes a 1 byte data buffer,
+ * but if input data passed to ioctl, we do not
+ * want to double count this, so we do not send
+ * the dummy one byte of data in iovec[0] if sending
+ * input data (in iovec[1]). We also must add 4 bytes
+ * in first iovec to allow for rfc1002 length field.
+ */
+
+ if (indatalen) {
+ iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
+ inc_rfc1001_len(req, indatalen - 1);
+ } else
+ iov[0].iov_len = get_rfc1002_length(req) + 4;
+
rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base;
- if (rc != 0) {
+ if ((rc != 0) && (rc != -EINVAL)) {
if (tcon)
cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
goto ioctl_exit;
+ } else if (rc == -EINVAL) {
+ if ((opcode != FSCTL_SRV_COPYCHUNK_WRITE) &&
+ (opcode != FSCTL_SRV_COPYCHUNK)) {
+ if (tcon)
+ cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
+ goto ioctl_exit;
+ }
}
/* check if caller wants to look at return data or just return rc */
@@ -1228,6 +1356,31 @@ ioctl_exit:
return rc;
}
+/*
+ * Individual callers to ioctl worker function follow
+ */
+
+int
+SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid)
+{
+ int rc;
+ struct compress_ioctl fsctl_input;
+ char *ret_data = NULL;
+
+ fsctl_input.CompressionState =
+ __constant_cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
+
+ rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
+ FSCTL_SET_COMPRESSION, true /* is_fsctl */,
+ (char *)&fsctl_input /* data input */,
+ 2 /* in data len */, &ret_data /* out data */, NULL);
+
+ cifs_dbg(FYI, "set compression rc %d\n", rc);
+
+ return rc;
+}
+
int
SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid)
@@ -1750,7 +1903,8 @@ smb2_writev_callback(struct mid_q_entry *mid)
/* smb2_async_writev - send an async write, and set up mid to handle result */
int
-smb2_async_writev(struct cifs_writedata *wdata)
+smb2_async_writev(struct cifs_writedata *wdata,
+ void (*release)(struct kref *kref))
{
int rc = -EACCES;
struct smb2_write_req *req = NULL;
@@ -1798,7 +1952,7 @@ smb2_async_writev(struct cifs_writedata *wdata)
smb2_writev_callback, wdata, 0);
if (rc) {
- kref_put(&wdata->refcount, cifs_writedata_release);
+ kref_put(&wdata->refcount, release);
cifs_stats_fail_inc(tcon, SMB2_WRITE_HE);
}
@@ -2098,11 +2252,9 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
rc = SendReceive2(xid, ses, iov, num, &resp_buftype, 0);
rsp = (struct smb2_set_info_rsp *)iov[0].iov_base;
- if (rc != 0) {
+ if (rc != 0)
cifs_stats_fail_inc(tcon, SMB2_SET_INFO_HE);
- goto out;
- }
-out:
+
free_rsp_buf(resp_buftype, rsp);
kfree(iov);
return rc;
@@ -2293,7 +2445,7 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0);
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
- goto qinf_exit;
+ goto qfsinf_exit;
}
rsp = (struct smb2_query_info_rsp *)iov.iov_base;
@@ -2305,7 +2457,70 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
if (!rc)
copy_fs_info_to_kstatfs(info, fsdata);
-qinf_exit:
+qfsinf_exit:
+ free_rsp_buf(resp_buftype, iov.iov_base);
+ return rc;
+}
+
+int
+SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid, int level)
+{
+ struct smb2_query_info_rsp *rsp = NULL;
+ struct kvec iov;
+ int rc = 0;
+ int resp_buftype, max_len, min_len;
+ struct cifs_ses *ses = tcon->ses;
+ unsigned int rsp_len, offset;
+
+ if (level == FS_DEVICE_INFORMATION) {
+ max_len = sizeof(FILE_SYSTEM_DEVICE_INFO);
+ min_len = sizeof(FILE_SYSTEM_DEVICE_INFO);
+ } else if (level == FS_ATTRIBUTE_INFORMATION) {
+ max_len = sizeof(FILE_SYSTEM_ATTRIBUTE_INFO);
+ min_len = MIN_FS_ATTR_INFO_SIZE;
+ } else if (level == FS_SECTOR_SIZE_INFORMATION) {
+ max_len = sizeof(struct smb3_fs_ss_info);
+ min_len = sizeof(struct smb3_fs_ss_info);
+ } else {
+ cifs_dbg(FYI, "Invalid qfsinfo level %d\n", level);
+ return -EINVAL;
+ }
+
+ rc = build_qfs_info_req(&iov, tcon, level, max_len,
+ persistent_fid, volatile_fid);
+ if (rc)
+ return rc;
+
+ rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0);
+ if (rc) {
+ cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
+ goto qfsattr_exit;
+ }
+ rsp = (struct smb2_query_info_rsp *)iov.iov_base;
+
+ rsp_len = le32_to_cpu(rsp->OutputBufferLength);
+ offset = le16_to_cpu(rsp->OutputBufferOffset);
+ rc = validate_buf(offset, rsp_len, &rsp->hdr, min_len);
+ if (rc)
+ goto qfsattr_exit;
+
+ if (level == FS_ATTRIBUTE_INFORMATION)
+ memcpy(&tcon->fsAttrInfo, 4 /* RFC1001 len */ + offset
+ + (char *)&rsp->hdr, min_t(unsigned int,
+ rsp_len, max_len));
+ else if (level == FS_DEVICE_INFORMATION)
+ memcpy(&tcon->fsDevInfo, 4 /* RFC1001 len */ + offset
+ + (char *)&rsp->hdr, sizeof(FILE_SYSTEM_DEVICE_INFO));
+ else if (level == FS_SECTOR_SIZE_INFORMATION) {
+ struct smb3_fs_ss_info *ss_info = (struct smb3_fs_ss_info *)
+ (4 /* RFC1001 len */ + offset + (char *)&rsp->hdr);
+ tcon->ss_flags = le32_to_cpu(ss_info->Flags);
+ tcon->perf_sector_size =
+ le32_to_cpu(ss_info->PhysicalBytesPerSectorForPerf);
+ }
+
+qfsattr_exit:
free_rsp_buf(resp_buftype, iov.iov_base);
return rc;
}
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index b83d0118a75..69f3595d395 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -122,6 +122,23 @@ struct smb2_pdu {
__le16 StructureSize2; /* size of wct area (varies, request specific) */
} __packed;
+struct smb2_transform_hdr {
+ __be32 smb2_buf_length; /* big endian on wire */
+ /* length is only two or three bytes - with
+ one or two byte type preceding it that MBZ */
+ __u8 ProtocolId[4]; /* 0xFD 'S' 'M' 'B' */
+ __u8 Signature[16];
+ __u8 Nonce[11];
+ __u8 Reserved[5];
+ __le32 OriginalMessageSize;
+ __u16 Reserved1;
+ __le16 EncryptionAlgorithm;
+ __u64 SessionId;
+} __packed;
+
+/* Encryption Algorithms */
+#define SMB2_ENCRYPTION_AES128_CCM __constant_cpu_to_le16(0x0001)
+
/*
* SMB2 flag definitions
*/
@@ -166,8 +183,6 @@ struct smb2_symlink_err_rsp {
#define SMB2_CLIENT_GUID_SIZE 16
-extern __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE];
-
struct smb2_negotiate_req {
struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 36 */
@@ -237,6 +252,7 @@ struct smb2_sess_setup_req {
/* Currently defined SessionFlags */
#define SMB2_SESSION_FLAG_IS_GUEST 0x0001
#define SMB2_SESSION_FLAG_IS_NULL 0x0002
+#define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004
struct smb2_sess_setup_rsp {
struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 9 */
@@ -419,11 +435,15 @@ struct smb2_tree_disconnect_rsp {
#define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */
#define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ"
#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC"
-#define SMB2_CREATE_ALLOCATION_SIZE "AlSi"
+#define SMB2_CREATE_ALLOCATION_SIZE "AISi"
#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc"
#define SMB2_CREATE_TIMEWARP_REQUEST "TWrp"
#define SMB2_CREATE_QUERY_ON_DISK_ID "QFid"
#define SMB2_CREATE_REQUEST_LEASE "RqLs"
+#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q"
+#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C"
+#define SMB2_CREATE_APP_INSTANCE_ID 0x45BCA66AEFA7F74A9008FA462E144D74
+#define SVHDX_OPEN_DEVICE_CONTEXT 0x83CE6F1AD851E0986E34401CC9BCFCE9
struct smb2_create_req {
struct smb2_hdr hdr;
@@ -534,9 +554,16 @@ struct create_durable {
} Data;
} __packed;
+#define COPY_CHUNK_RES_KEY_SIZE 24
+struct resume_key_req {
+ char ResumeKey[COPY_CHUNK_RES_KEY_SIZE];
+ __le32 ContextLength; /* MBZ */
+ char Context[0]; /* ignored, Windows sets to 4 bytes of zero */
+} __packed;
+
/* this goes in the ioctl buffer when doing a copychunk request */
struct copychunk_ioctl {
- char SourceKey[24];
+ char SourceKey[COPY_CHUNK_RES_KEY_SIZE];
__le32 ChunkCount; /* we are only sending 1 */
__le32 Reserved;
/* array will only be one chunk long for us */
@@ -546,13 +573,25 @@ struct copychunk_ioctl {
__u32 Reserved2;
} __packed;
-/* Response and Request are the same format */
-struct validate_negotiate_info {
+struct copychunk_ioctl_rsp {
+ __le32 ChunksWritten;
+ __le32 ChunkBytesWritten;
+ __le32 TotalBytesWritten;
+} __packed;
+
+struct validate_negotiate_info_req {
__le32 Capabilities;
__u8 Guid[SMB2_CLIENT_GUID_SIZE];
__le16 SecurityMode;
__le16 DialectCount;
- __le16 Dialect[1];
+ __le16 Dialects[1]; /* dialect (someday maybe list) client asked for */
+} __packed;
+
+struct validate_negotiate_info_rsp {
+ __le32 Capabilities;
+ __u8 Guid[SMB2_CLIENT_GUID_SIZE];
+ __le16 SecurityMode;
+ __le16 Dialect; /* Dialect in use for the connection */
} __packed;
#define RSS_CAPABLE 0x00000001
@@ -569,6 +608,10 @@ struct network_interface_info_ioctl_rsp {
#define NO_FILE_ID 0xFFFFFFFFFFFFFFFFULL /* general ioctls to srv not to file */
+struct compress_ioctl {
+ __le16 CompressionState; /* See cifspdu.h for possible flag values */
+} __packed;
+
struct smb2_ioctl_req {
struct smb2_hdr hdr;
__le16 StructureSize; /* Must be 57 */
@@ -584,7 +627,7 @@ struct smb2_ioctl_req {
__le32 MaxOutputResponse;
__le32 Flags;
__u32 Reserved2;
- char Buffer[0];
+ __u8 Buffer[0];
} __packed;
struct smb2_ioctl_rsp {
@@ -870,14 +913,16 @@ struct smb2_lease_ack {
/* File System Information Classes */
#define FS_VOLUME_INFORMATION 1 /* Query */
-#define FS_LABEL_INFORMATION 2 /* Set */
+#define FS_LABEL_INFORMATION 2 /* Local only */
#define FS_SIZE_INFORMATION 3 /* Query */
#define FS_DEVICE_INFORMATION 4 /* Query */
#define FS_ATTRIBUTE_INFORMATION 5 /* Query */
#define FS_CONTROL_INFORMATION 6 /* Query, Set */
#define FS_FULL_SIZE_INFORMATION 7 /* Query */
#define FS_OBJECT_ID_INFORMATION 8 /* Query, Set */
-#define FS_DRIVER_PATH_INFORMATION 9 /* Query */
+#define FS_DRIVER_PATH_INFORMATION 9 /* Local only */
+#define FS_VOLUME_FLAGS_INFORMATION 10 /* Local only */
+#define FS_SECTOR_SIZE_INFORMATION 11 /* SMB3 or later. Query */
struct smb2_fs_full_size_info {
__le64 TotalAllocationUnits;
@@ -887,6 +932,22 @@ struct smb2_fs_full_size_info {
__le32 BytesPerSector;
} __packed;
+#define SSINFO_FLAGS_ALIGNED_DEVICE 0x00000001
+#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002
+#define SSINFO_FLAGS_NO_SEEK_PENALTY 0x00000004
+#define SSINFO_FLAGS_TRIM_ENABLED 0x00000008
+
+/* sector size info struct */
+struct smb3_fs_ss_info {
+ __le32 LogicalBytesPerSector;
+ __le32 PhysicalBytesPerSectorForAtomicity;
+ __le32 PhysicalBytesPerSectorForPerf;
+ __le32 FileSystemEffectivePhysicalBytesPerSectorForAtomicity;
+ __le32 Flags;
+ __le32 ByteOffsetForSectorAlignment;
+ __le32 ByteOffsetForPartitionAlignment;
+} __packed;
+
/* partial list of QUERY INFO levels */
#define FILE_DIRECTORY_INFORMATION 1
#define FILE_FULL_DIRECTORY_INFORMATION 2
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index e3fb4801ee9..0ce48db20a6 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -61,7 +61,7 @@ extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst,
extern int smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
const char *full_path, FILE_ALL_INFO *data,
- bool *adjust_tz);
+ bool *adjust_tz, bool *symlink);
extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
const char *full_path, __u64 size,
struct cifs_sb_info *cifs_sb, bool set_alloc);
@@ -123,7 +123,8 @@ extern int SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon,
extern int smb2_async_readv(struct cifs_readdata *rdata);
extern int SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
unsigned int *nbytes, char **buf, int *buf_type);
-extern int smb2_async_writev(struct cifs_writedata *wdata);
+extern int smb2_async_writev(struct cifs_writedata *wdata,
+ void (*release)(struct kref *kref));
extern int SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
unsigned int *nbytes, struct kvec *iov, int n_vec);
extern int SMB2_echo(struct TCP_Server_Info *server);
@@ -142,12 +143,16 @@ extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon,
extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid,
FILE_BASIC_INFO *buf);
+extern int SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_fid, u64 volatile_fid);
extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
const u64 persistent_fid, const u64 volatile_fid,
const __u8 oplock_level);
extern int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_file_id, u64 volatile_file_id,
struct kstatfs *FSData);
+extern int SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
+ u64 persistent_file_id, u64 volatile_file_id, int lvl);
extern int SMB2_lock(const unsigned int xid, struct cifs_tcon *tcon,
const __u64 persist_fid, const __u64 volatile_fid,
const __u32 pid, const __u64 length, const __u64 offset,
@@ -158,5 +163,6 @@ extern int smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
struct smb2_lock_element *buf);
extern int SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
__u8 *lease_key, const __le32 lease_state);
+extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *);
#endif /* _SMB2PROTO_H */
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 340abca3aa5..59c748ce872 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -466,7 +466,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
static inline void
smb2_seq_num_into_buf(struct TCP_Server_Info *server, struct smb2_hdr *hdr)
{
- hdr->MessageId = get_next_mid(server);
+ hdr->MessageId = get_next_mid64(server);
}
static struct mid_q_entry *
@@ -516,13 +516,19 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_hdr *buf,
return -EAGAIN;
}
- if (ses->status != CifsGood) {
- /* check if SMB2 session is bad because we are setting it up */
+ if (ses->status == CifsNew) {
if ((buf->Command != SMB2_SESSION_SETUP) &&
(buf->Command != SMB2_NEGOTIATE))
return -EAGAIN;
/* else ok - we are setting up session */
}
+
+ if (ses->status == CifsExiting) {
+ if (buf->Command != SMB2_LOGOFF)
+ return -EAGAIN;
+ /* else ok - we are shutting down the session */
+ }
+
*mid = smb2_mid_entry_alloc(buf, ses->server);
if (*mid == NULL)
return -ENOMEM;
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index d952ee48f4d..0e538b5c962 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -90,16 +90,30 @@
#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */
#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
-#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204 /* BB add struct */
+#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204
/* Perform server-side data movement */
#define FSCTL_SRV_COPYCHUNK 0x001440F2
#define FSCTL_SRV_COPYCHUNK_WRITE 0x001480F2
#define FSCTL_QUERY_NETWORK_INTERFACE_INFO 0x001401FC /* BB add struct */
#define FSCTL_SRV_READ_HASH 0x001441BB /* BB add struct */
+/* See FSCC 2.1.2.5 */
#define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003
#define IO_REPARSE_TAG_HSM 0xC0000004
#define IO_REPARSE_TAG_SIS 0x80000007
+#define IO_REPARSE_TAG_HSM2 0x80000006
+#define IO_REPARSE_TAG_DRIVER_EXTENDER 0x80000005
+/* Used by the DFS filter. See MS-DFSC */
+#define IO_REPARSE_TAG_DFS 0x8000000A
+/* Used by the DFS filter See MS-DFSC */
+#define IO_REPARSE_TAG_DFSR 0x80000012
+#define IO_REPARSE_TAG_FILTER_MANAGER 0x8000000B
+/* See section MS-FSCC 2.1.2.4 */
+#define IO_REPARSE_TAG_SYMLINK 0xA000000C
+#define IO_REPARSE_TAG_DEDUP 0x80000013
+#define IO_REPARSE_APPXSTREAM 0xC0000014
+/* NFS symlinks, Win 8/SMB3 and later */
+#define IO_REPARSE_TAG_NFS 0x80000014
/* fsctl flags */
/* If Flags is set to this value, the request is an FSCTL not ioctl request */
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 6fdcb1b4a10..18cd5650a5f 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -58,7 +58,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
return temp;
else {
memset(temp, 0, sizeof(struct mid_q_entry));
- temp->mid = smb_buffer->Mid; /* always LE */
+ temp->mid = get_mid(smb_buffer);
temp->pid = current->pid;
temp->command = cpu_to_le16(smb_buffer->Command);
cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command);
@@ -270,6 +270,26 @@ cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx,
iov->iov_len = rqst->rq_pagesz;
}
+static unsigned long
+rqst_len(struct smb_rqst *rqst)
+{
+ unsigned int i;
+ struct kvec *iov = rqst->rq_iov;
+ unsigned long buflen = 0;
+
+ /* total up iov array first */
+ for (i = 0; i < rqst->rq_nvec; i++)
+ buflen += iov[i].iov_len;
+
+ /* add in the page array if there is one */
+ if (rqst->rq_npages) {
+ buflen += rqst->rq_pagesz * (rqst->rq_npages - 1);
+ buflen += rqst->rq_tailsz;
+ }
+
+ return buflen;
+}
+
static int
smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
{
@@ -277,6 +297,7 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
struct kvec *iov = rqst->rq_iov;
int n_vec = rqst->rq_nvec;
unsigned int smb_buf_length = get_rfc1002_length(iov[0].iov_base);
+ unsigned long send_length;
unsigned int i;
size_t total_len = 0, sent;
struct socket *ssocket = server->ssocket;
@@ -285,6 +306,14 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
if (ssocket == NULL)
return -ENOTSOCK;
+ /* sanity check send length */
+ send_length = rqst_len(rqst);
+ if (send_length != smb_buf_length + 4) {
+ WARN(1, "Send length mismatch(send_length=%lu smb_buf_length=%u)\n",
+ send_length, smb_buf_length);
+ return -EIO;
+ }
+
cifs_dbg(FYI, "Sending smb: smb_len=%u\n", smb_buf_length);
dump_smb(iov[0].iov_base, iov[0].iov_len);
@@ -410,8 +439,13 @@ static int
wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
const int optype)
{
- return wait_for_free_credits(server, timeout,
- server->ops->get_credits_field(server, optype));
+ int *val;
+
+ val = server->ops->get_credits_field(server, optype);
+ /* Since an echo is already inflight, no need to wait to send another */
+ if (*val <= 0 && optype == CIFS_ECHO_OP)
+ return -EAGAIN;
+ return wait_for_free_credits(server, timeout, val);
}
static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
@@ -426,13 +460,20 @@ static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
return -EAGAIN;
}
- if (ses->status != CifsGood) {
- /* check if SMB session is bad because we are setting it up */
+ if (ses->status == CifsNew) {
if ((in_buf->Command != SMB_COM_SESSION_SETUP_ANDX) &&
(in_buf->Command != SMB_COM_NEGOTIATE))
return -EAGAIN;
/* else ok - we are setting up session */
}
+
+ if (ses->status == CifsExiting) {
+ /* check if SMB session is bad because we are setting it up */
+ if (in_buf->Command != SMB_COM_LOGOFF_ANDX)
+ return -EAGAIN;
+ /* else ok - we are shutting down session */
+ }
+
*ppmidQ = AllocMidQEntry(in_buf, ses->server);
if (*ppmidQ == NULL)
return -ENOMEM;
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 09afda4cc58..5ac836a86b1 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -82,9 +82,11 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
goto remove_ea_exit;
ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
- rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, NULL,
- (__u16)0, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (pTcon->ses->server->ops->set_EA)
+ rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
+ full_path, ea_name, NULL, (__u16)0,
+ cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
}
remove_ea_exit:
kfree(full_path);
@@ -149,18 +151,22 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
cifs_dbg(FYI, "attempt to set cifs inode metadata\n");
ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
- rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
- (__u16)value_size, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (pTcon->ses->server->ops->set_EA)
+ rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
+ full_path, ea_name, ea_value, (__u16)value_size,
+ cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
} else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)
== 0) {
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto set_ea_exit;
ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
- rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value,
- (__u16)value_size, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (pTcon->ses->server->ops->set_EA)
+ rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
+ full_path, ea_name, ea_value, (__u16)value_size,
+ cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
} else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
#ifdef CONFIG_CIFS_ACL
@@ -170,8 +176,12 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
rc = -ENOMEM;
} else {
memcpy(pacl, ea_value, value_size);
- rc = set_cifs_acl(pacl, value_size,
- direntry->d_inode, full_path, CIFS_ACL_DACL);
+ if (pTcon->ses->server->ops->set_acl)
+ rc = pTcon->ses->server->ops->set_acl(pacl,
+ value_size, direntry->d_inode,
+ full_path, CIFS_ACL_DACL);
+ else
+ rc = -EOPNOTSUPP;
if (rc == 0) /* force revalidate of the inode */
CIFS_I(direntry->d_inode)->time = 0;
kfree(pacl);
@@ -272,17 +282,21 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
/* revalidate/getattr then populate from inode */
} /* BB add else when above is implemented */
ea_name += XATTR_USER_PREFIX_LEN; /* skip past user. prefix */
- rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
- buf_size, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (pTcon->ses->server->ops->query_all_EAs)
+ rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
+ full_path, ea_name, ea_value, buf_size,
+ cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
} else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
goto get_ea_exit;
ea_name += XATTR_OS2_PREFIX_LEN; /* skip past os2. prefix */
- rc = CIFSSMBQAllEAs(xid, pTcon, full_path, ea_name, ea_value,
- buf_size, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (pTcon->ses->server->ops->query_all_EAs)
+ rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
+ full_path, ea_name, ea_value, buf_size,
+ cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
} else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
strlen(POSIX_ACL_XATTR_ACCESS)) == 0) {
#ifdef CONFIG_CIFS_POSIX
@@ -313,8 +327,11 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
u32 acllen;
struct cifs_ntsd *pacl;
- pacl = get_cifs_acl(cifs_sb, direntry->d_inode,
- full_path, &acllen);
+ if (pTcon->ses->server->ops->get_acl == NULL)
+ goto get_ea_exit; /* rc already EOPNOTSUPP */
+
+ pacl = pTcon->ses->server->ops->get_acl(cifs_sb,
+ direntry->d_inode, full_path, &acllen);
if (IS_ERR(pacl)) {
rc = PTR_ERR(pacl);
cifs_dbg(VFS, "%s: error %zd getting sec desc\n",
@@ -400,11 +417,12 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
/* if proc/fs/cifs/streamstoxattr is set then
search server for EAs or streams to
returns as xattrs */
- rc = CIFSSMBQAllEAs(xid, pTcon, full_path, NULL, data,
- buf_size, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ if (pTcon->ses->server->ops->query_all_EAs)
+ rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
+ full_path, NULL, data, buf_size,
+ cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_MAP_SPECIAL_CHR);
list_ea_exit:
kfree(full_path);
free_xid(xid);
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 911cf30d057..7740b1c871c 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -101,7 +101,7 @@ struct inode *coda_cnode_make(struct CodaFid *fid, struct super_block *sb)
inode = coda_iget(sb, fid, &attr);
if (IS_ERR(inode))
- printk("coda_cnode_make: coda_iget failed\n");
+ pr_warn("%s: coda_iget failed\n", __func__);
return inode;
}
@@ -137,7 +137,7 @@ struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb)
unsigned long hash = coda_f2i(fid);
if ( !sb ) {
- printk("coda_fid_to_inode: no sb!\n");
+ pr_warn("%s: no sb!\n", __func__);
return NULL;
}
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index b7143cf783a..381c993b142 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -10,7 +10,7 @@ extern int coda_hard;
extern int coda_fake_statfs;
void coda_destroy_inodecache(void);
-int coda_init_inodecache(void);
+int __init coda_init_inodecache(void);
int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync);
void coda_sysctl_init(void);
void coda_sysctl_clean(void);
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index cc0ea9fe5ec..d42b725b1d2 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -12,6 +12,12 @@
#ifndef _LINUX_CODA_FS
#define _LINUX_CODA_FS
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/param.h>
#include <linux/mm.h>
@@ -40,7 +46,7 @@ extern const struct file_operations coda_ioctl_operations;
int coda_open(struct inode *i, struct file *f);
int coda_release(struct inode *i, struct file *f);
int coda_permission(struct inode *inode, int mask);
-int coda_revalidate_inode(struct dentry *);
+int coda_revalidate_inode(struct inode *);
int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *);
int coda_setattr(struct dentry *, struct iattr *);
@@ -63,7 +69,7 @@ void coda_sysctl_clean(void);
else \
ptr = (cast)vzalloc((unsigned long) size); \
if (!ptr) \
- printk("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \
+ pr_warn("kernel malloc returns 0 at %s:%d\n", __FILE__, __LINE__); \
} while (0)
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 190effc6a6f..cd8a63238b1 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -102,7 +102,7 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsig
int type = 0;
if (length > CODA_MAXNAMLEN) {
- printk(KERN_ERR "name too long: lookup, %s (%*s)\n",
+ pr_err("name too long: lookup, %s (%*s)\n",
coda_i2s(dir), (int)length, name);
return ERR_PTR(-ENAMETOOLONG);
}
@@ -387,9 +387,6 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
host_file = cfi->cfi_container;
- if (!host_file->f_op)
- return -ENOTDIR;
-
if (host_file->f_op->iterate) {
struct inode *host_inode = file_inode(host_file);
mutex_lock(&host_inode->i_mutex);
@@ -456,23 +453,23 @@ static int coda_venus_readdir(struct file *coda_file, struct dir_context *ctx)
ret = kernel_read(host_file, ctx->pos - 2, (char *)vdir,
sizeof(*vdir));
if (ret < 0) {
- printk(KERN_ERR "coda readdir: read dir %s failed %d\n",
- coda_f2s(&cii->c_fid), ret);
+ pr_err("%s: read dir %s failed %d\n",
+ __func__, coda_f2s(&cii->c_fid), ret);
break;
}
if (ret == 0) break; /* end of directory file reached */
/* catch truncated reads */
if (ret < vdir_size || ret < vdir_size + vdir->d_namlen) {
- printk(KERN_ERR "coda readdir: short read on %s\n",
- coda_f2s(&cii->c_fid));
+ pr_err("%s: short read on %s\n",
+ __func__, coda_f2s(&cii->c_fid));
ret = -EBADF;
break;
}
/* validate whether the directory file actually makes sense */
if (vdir->d_reclen < vdir_size + vdir->d_namlen) {
- printk(KERN_ERR "coda readdir: invalid dir %s\n",
- coda_f2s(&cii->c_fid));
+ pr_err("%s: invalid dir %s\n",
+ __func__, coda_f2s(&cii->c_fid));
ret = -EBADF;
break;
}
@@ -566,13 +563,12 @@ static int coda_dentry_delete(const struct dentry * dentry)
* cache manager Venus issues a downcall to the kernel when this
* happens
*/
-int coda_revalidate_inode(struct dentry *dentry)
+int coda_revalidate_inode(struct inode *inode)
{
struct coda_vattr attr;
int error;
int old_mode;
ino_t old_ino;
- struct inode *inode = dentry->d_inode;
struct coda_inode_info *cii = ITOC(inode);
if (!cii->c_flags)
@@ -593,8 +589,8 @@ int coda_revalidate_inode(struct dentry *dentry)
coda_vattr_to_iattr(inode, &attr);
if ((old_mode & S_IFMT) != (inode->i_mode & S_IFMT)) {
- printk("Coda: inode %ld, fid %s changed type!\n",
- inode->i_ino, coda_f2s(&(cii->c_fid)));
+ pr_warn("inode %ld, fid %s changed type!\n",
+ inode->i_ino, coda_f2s(&(cii->c_fid)));
}
/* the following can happen when a local fid is replaced
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 380b798f844..9e83b779021 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -36,7 +36,7 @@ coda_file_read(struct file *coda_file, char __user *buf, size_t count, loff_t *p
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
host_file = cfi->cfi_container;
- if (!host_file->f_op || !host_file->f_op->read)
+ if (!host_file->f_op->read)
return -EINVAL;
return host_file->f_op->read(host_file, buf, count, ppos);
@@ -75,7 +75,7 @@ coda_file_write(struct file *coda_file, const char __user *buf, size_t count, lo
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
host_file = cfi->cfi_container;
- if (!host_file->f_op || !host_file->f_op->write)
+ if (!host_file->f_op->write)
return -EINVAL;
host_inode = file_inode(host_file);
@@ -105,7 +105,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma)
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
host_file = cfi->cfi_container;
- if (!host_file->f_op || !host_file->f_op->mmap)
+ if (!host_file->f_op->mmap)
return -ENODEV;
coda_inode = file_inode(coda_file);
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 4dcc0d81a7a..fe3afb2de88 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -73,7 +73,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-int coda_init_inodecache(void)
+int __init coda_init_inodecache(void)
{
coda_inode_cachep = kmem_cache_create("coda_inode_cache",
sizeof(struct coda_inode_info),
@@ -96,6 +96,7 @@ void coda_destroy_inodecache(void)
static int coda_remount(struct super_block *sb, int *flags, char *data)
{
+ sync_filesystem(sb);
*flags |= MS_NOATIME;
return 0;
}
@@ -118,12 +119,12 @@ static int get_device_index(struct coda_mount_data *data)
int idx;
if (data == NULL) {
- printk("coda_read_super: Bad mount data\n");
+ pr_warn("%s: Bad mount data\n", __func__);
return -1;
}
if (data->version != CODA_MOUNT_VERSION) {
- printk("coda_read_super: Bad mount version\n");
+ pr_warn("%s: Bad mount version\n", __func__);
return -1;
}
@@ -140,13 +141,13 @@ static int get_device_index(struct coda_mount_data *data)
fdput(f);
if (idx < 0 || idx >= MAX_CODADEVS) {
- printk("coda_read_super: Bad minor number\n");
+ pr_warn("%s: Bad minor number\n", __func__);
return -1;
}
return idx;
Ebadf:
- printk("coda_read_super: Bad file\n");
+ pr_warn("%s: Bad file\n", __func__);
return -1;
}
@@ -167,19 +168,19 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
if(idx == -1)
idx = 0;
- printk(KERN_INFO "coda_read_super: device index: %i\n", idx);
+ pr_info("%s: device index: %i\n", __func__, idx);
vc = &coda_comms[idx];
mutex_lock(&vc->vc_mutex);
if (!vc->vc_inuse) {
- printk("coda_read_super: No pseudo device\n");
+ pr_warn("%s: No pseudo device\n", __func__);
error = -EINVAL;
goto unlock_out;
}
if (vc->vc_sb) {
- printk("coda_read_super: Device already mounted\n");
+ pr_warn("%s: Device already mounted\n", __func__);
error = -EBUSY;
goto unlock_out;
}
@@ -203,22 +204,23 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
/* get root fid from Venus: this needs the root inode */
error = venus_rootfid(sb, &fid);
if ( error ) {
- printk("coda_read_super: coda_get_rootfid failed with %d\n",
- error);
+ pr_warn("%s: coda_get_rootfid failed with %d\n",
+ __func__, error);
goto error;
}
- printk("coda_read_super: rootfid is %s\n", coda_f2s(&fid));
+ pr_info("%s: rootfid is %s\n", __func__, coda_f2s(&fid));
/* make root inode */
root = coda_cnode_make(&fid, sb);
if (IS_ERR(root)) {
error = PTR_ERR(root);
- printk("Failure of coda_cnode_make for root: error %d\n", error);
+ pr_warn("Failure of coda_cnode_make for root: error %d\n",
+ error);
goto error;
}
- printk("coda_read_super: rootinode is %ld dev %s\n",
- root->i_ino, root->i_sb->s_id);
+ pr_info("%s: rootinode is %ld dev %s\n",
+ __func__, root->i_ino, root->i_sb->s_id);
sb->s_root = d_make_root(root);
if (!sb->s_root) {
error = -EINVAL;
@@ -245,19 +247,19 @@ static void coda_put_super(struct super_block *sb)
sb->s_fs_info = NULL;
mutex_unlock(&vcp->vc_mutex);
- printk("Coda: Bye bye.\n");
+ pr_info("Bye bye.\n");
}
static void coda_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
coda_cache_clear_inode(inode);
}
int coda_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
- int err = coda_revalidate_inode(dentry);
+ int err = coda_revalidate_inode(dentry->d_inode);
if (!err)
generic_fillattr(dentry->d_inode, stat);
return err;
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index ebc2bae6c28..5c1e4242368 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -114,14 +114,14 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
int size = sizeof(*dcbuf);
if ( nbytes < sizeof(struct coda_out_hdr) ) {
- printk("coda_downcall opc %d uniq %d, not enough!\n",
- hdr.opcode, hdr.unique);
+ pr_warn("coda_downcall opc %d uniq %d, not enough!\n",
+ hdr.opcode, hdr.unique);
count = nbytes;
goto out;
}
if ( nbytes > size ) {
- printk("Coda: downcall opc %d, uniq %d, too much!",
- hdr.opcode, hdr.unique);
+ pr_warn("downcall opc %d, uniq %d, too much!",
+ hdr.opcode, hdr.unique);
nbytes = size;
}
CODA_ALLOC(dcbuf, union outputArgs *, nbytes);
@@ -136,7 +136,8 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
CODA_FREE(dcbuf, nbytes);
if (error) {
- printk("psdev_write: coda_downcall error: %d\n", error);
+ pr_warn("%s: coda_downcall error: %d\n",
+ __func__, error);
retval = error;
goto out;
}
@@ -157,16 +158,17 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf,
mutex_unlock(&vcp->vc_mutex);
if (!req) {
- printk("psdev_write: msg (%d, %d) not found\n",
- hdr.opcode, hdr.unique);
+ pr_warn("%s: msg (%d, %d) not found\n",
+ __func__, hdr.opcode, hdr.unique);
retval = -ESRCH;
goto out;
}
/* move data into response buffer. */
if (req->uc_outSize < nbytes) {
- printk("psdev_write: too much cnt: %d, cnt: %ld, opc: %d, uniq: %d.\n",
- req->uc_outSize, (long)nbytes, hdr.opcode, hdr.unique);
+ pr_warn("%s: too much cnt: %d, cnt: %ld, opc: %d, uniq: %d.\n",
+ __func__, req->uc_outSize, (long)nbytes,
+ hdr.opcode, hdr.unique);
nbytes = req->uc_outSize; /* don't have more space! */
}
if (copy_from_user(req->uc_data, buf, nbytes)) {
@@ -240,8 +242,8 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
/* Move the input args into userspace */
count = req->uc_inSize;
if (nbytes < req->uc_inSize) {
- printk ("psdev_read: Venus read %ld bytes of %d in message\n",
- (long)nbytes, req->uc_inSize);
+ pr_warn("%s: Venus read %ld bytes of %d in message\n",
+ __func__, (long)nbytes, req->uc_inSize);
count = nbytes;
}
@@ -305,7 +307,7 @@ static int coda_psdev_release(struct inode * inode, struct file * file)
struct upc_req *req, *tmp;
if (!vcp || !vcp->vc_inuse ) {
- printk("psdev_release: Not open.\n");
+ pr_warn("%s: Not open.\n", __func__);
return -1;
}
@@ -354,8 +356,8 @@ static int init_coda_psdev(void)
{
int i, err = 0;
if (register_chrdev(CODA_PSDEV_MAJOR, "coda", &coda_psdev_fops)) {
- printk(KERN_ERR "coda_psdev: unable to get major %d\n",
- CODA_PSDEV_MAJOR);
+ pr_err("%s: unable to get major %d\n",
+ __func__, CODA_PSDEV_MAJOR);
return -EIO;
}
coda_psdev_class = class_create(THIS_MODULE, "coda");
@@ -393,13 +395,13 @@ static int __init init_coda(void)
goto out2;
status = init_coda_psdev();
if ( status ) {
- printk("Problem (%d) in init_coda_psdev\n", status);
+ pr_warn("Problem (%d) in init_coda_psdev\n", status);
goto out1;
}
status = register_filesystem(&coda_fs_type);
if (status) {
- printk("coda: failed to register filesystem!\n");
+ pr_warn("failed to register filesystem!\n");
goto out;
}
return 0;
@@ -420,9 +422,8 @@ static void __exit exit_coda(void)
int err, i;
err = unregister_filesystem(&coda_fs_type);
- if ( err != 0 ) {
- printk("coda: failed to unregister filesystem\n");
- }
+ if (err != 0)
+ pr_warn("failed to unregister filesystem\n");
for (i = 0; i < MAX_CODADEVS; i++)
device_destroy(coda_psdev_class, MKDEV(CODA_PSDEV_MAJOR, i));
class_destroy(coda_psdev_class);
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index af56ad56a89..34218a8a28c 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -14,7 +14,7 @@
#ifdef CONFIG_SYSCTL
static struct ctl_table_header *fs_table_header;
-static ctl_table coda_table[] = {
+static struct ctl_table coda_table[] = {
{
.procname = "timeout",
.data = &coda_timeout,
@@ -39,7 +39,7 @@ static ctl_table coda_table[] = {
{}
};
-static ctl_table fs_table[] = {
+static struct ctl_table fs_table[] = {
{
.procname = "coda",
.mode = 0555,
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 3a731976dc5..21fcf8dcb9c 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -508,8 +508,8 @@ int venus_pioctl(struct super_block *sb, struct CodaFid *fid,
inp->coda_ioctl.data = (char *)(INSIZE(ioctl));
/* get the data out of user space */
- if ( copy_from_user((char*)inp + (long)inp->coda_ioctl.data,
- data->vi.in, data->vi.in_size) ) {
+ if (copy_from_user((char *)inp + (long)inp->coda_ioctl.data,
+ data->vi.in, data->vi.in_size)) {
error = -EINVAL;
goto exit;
}
@@ -518,8 +518,8 @@ int venus_pioctl(struct super_block *sb, struct CodaFid *fid,
&outsize, inp);
if (error) {
- printk("coda_pioctl: Venus returns: %d for %s\n",
- error, coda_f2s(fid));
+ pr_warn("%s: Venus returns: %d for %s\n",
+ __func__, error, coda_f2s(fid));
goto exit;
}
@@ -675,7 +675,7 @@ static int coda_upcall(struct venus_comm *vcp,
mutex_lock(&vcp->vc_mutex);
if (!vcp->vc_inuse) {
- printk(KERN_NOTICE "coda: Venus dead, not sending upcall\n");
+ pr_notice("Venus dead, not sending upcall\n");
error = -ENXIO;
goto exit;
}
@@ -725,7 +725,7 @@ static int coda_upcall(struct venus_comm *vcp,
error = -EINTR;
if ((req->uc_flags & CODA_REQ_ABORT) || !signal_pending(current)) {
- printk(KERN_WARNING "coda: Unexpected interruption.\n");
+ pr_warn("Unexpected interruption.\n");
goto exit;
}
@@ -735,7 +735,7 @@ static int coda_upcall(struct venus_comm *vcp,
/* Venus saw the upcall, make sure we can send interrupt signal */
if (!vcp->vc_inuse) {
- printk(KERN_INFO "coda: Venus dead, not sending signal.\n");
+ pr_info("Venus dead, not sending signal.\n");
goto exit;
}
diff --git a/fs/compat.c b/fs/compat.c
index 6af20de2c1a..66d3d3c6b4b 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -72,8 +72,8 @@ int compat_printk(const char *fmt, ...)
* Not all architectures have sys_utime, so implement this in terms
* of sys_utimes.
*/
-asmlinkage long compat_sys_utime(const char __user *filename,
- struct compat_utimbuf __user *t)
+COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
+ struct compat_utimbuf __user *, t)
{
struct timespec tv[2];
@@ -87,13 +87,13 @@ asmlinkage long compat_sys_utime(const char __user *filename,
return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
}
-asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filename, struct compat_timespec __user *t, int flags)
+COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags)
{
struct timespec tv[2];
if (t) {
- if (get_compat_timespec(&tv[0], &t[0]) ||
- get_compat_timespec(&tv[1], &t[1]))
+ if (compat_get_timespec(&tv[0], &t[0]) ||
+ compat_get_timespec(&tv[1], &t[1]))
return -EFAULT;
if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT)
@@ -102,7 +102,7 @@ asmlinkage long compat_sys_utimensat(unsigned int dfd, const char __user *filena
return do_utimes(dfd, filename, t ? tv : NULL, flags);
}
-asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filename, struct compat_timeval __user *t)
+COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t)
{
struct timespec tv[2];
@@ -121,7 +121,7 @@ asmlinkage long compat_sys_futimesat(unsigned int dfd, const char __user *filena
return do_utimes(dfd, filename, t ? tv : NULL, 0);
}
-asmlinkage long compat_sys_utimes(const char __user *filename, struct compat_timeval __user *t)
+COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t)
{
return compat_sys_futimesat(AT_FDCWD, filename, t);
}
@@ -159,8 +159,8 @@ static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
-asmlinkage long compat_sys_newstat(const char __user * filename,
- struct compat_stat __user *statbuf)
+COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename,
+ struct compat_stat __user *, statbuf)
{
struct kstat stat;
int error;
@@ -171,8 +171,8 @@ asmlinkage long compat_sys_newstat(const char __user * filename,
return cp_compat_stat(&stat, statbuf);
}
-asmlinkage long compat_sys_newlstat(const char __user * filename,
- struct compat_stat __user *statbuf)
+COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename,
+ struct compat_stat __user *, statbuf)
{
struct kstat stat;
int error;
@@ -184,9 +184,9 @@ asmlinkage long compat_sys_newlstat(const char __user * filename,
}
#ifndef __ARCH_WANT_STAT64
-asmlinkage long compat_sys_newfstatat(unsigned int dfd,
- const char __user *filename,
- struct compat_stat __user *statbuf, int flag)
+COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd,
+ const char __user *, filename,
+ struct compat_stat __user *, statbuf, int, flag)
{
struct kstat stat;
int error;
@@ -198,8 +198,8 @@ asmlinkage long compat_sys_newfstatat(unsigned int dfd,
}
#endif
-asmlinkage long compat_sys_newfstat(unsigned int fd,
- struct compat_stat __user * statbuf)
+COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd,
+ struct compat_stat __user *, statbuf)
{
struct kstat stat;
int error = vfs_fstat(fd, &stat);
@@ -247,7 +247,7 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
* The following statfs calls are copies of code from fs/statfs.c and
* should be checked against those from time to time
*/
-asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
+COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf)
{
struct kstatfs tmp;
int error = user_statfs(pathname, &tmp);
@@ -256,7 +256,7 @@ asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_sta
return error;
}
-asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf)
+COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf)
{
struct kstatfs tmp;
int error = fd_statfs(fd, &tmp);
@@ -298,7 +298,7 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
return 0;
}
-asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf)
+COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf)
{
struct kstatfs tmp;
int error;
@@ -312,7 +312,7 @@ asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t s
return error;
}
-asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf)
+COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf)
{
struct kstatfs tmp;
int error;
@@ -331,7 +331,7 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
* Given how simple this syscall is that apporach is more maintainable
* than the various conversion hacks.
*/
-asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u)
+COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u)
{
struct compat_ustat tmp;
struct kstatfs sbuf;
@@ -399,12 +399,28 @@ static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *u
}
#endif
-asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
- unsigned long arg)
+static unsigned int
+convert_fcntl_cmd(unsigned int cmd)
+{
+ switch (cmd) {
+ case F_GETLK64:
+ return F_GETLK;
+ case F_SETLK64:
+ return F_SETLK;
+ case F_SETLKW64:
+ return F_SETLKW;
+ }
+
+ return cmd;
+}
+
+COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
+ compat_ulong_t, arg)
{
mm_segment_t old_fs;
struct flock f;
long ret;
+ unsigned int conv_cmd;
switch (cmd) {
case F_GETLK:
@@ -441,16 +457,18 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
case F_GETLK64:
case F_SETLK64:
case F_SETLKW64:
+ case F_OFD_GETLK:
+ case F_OFD_SETLK:
+ case F_OFD_SETLKW:
ret = get_compat_flock64(&f, compat_ptr(arg));
if (ret != 0)
break;
old_fs = get_fs();
set_fs(KERNEL_DS);
- ret = sys_fcntl(fd, (cmd == F_GETLK64) ? F_GETLK :
- ((cmd == F_SETLK64) ? F_SETLK : F_SETLKW),
- (unsigned long)&f);
+ conv_cmd = convert_fcntl_cmd(cmd);
+ ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f);
set_fs(old_fs);
- if (cmd == F_GETLK64 && ret == 0) {
+ if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) {
/* need to return lock information - see above for commentary */
if (f.l_start > COMPAT_LOFF_T_MAX)
ret = -EOVERFLOW;
@@ -468,16 +486,22 @@ asmlinkage long compat_sys_fcntl64(unsigned int fd, unsigned int cmd,
return ret;
}
-asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd,
- unsigned long arg)
+COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
+ compat_ulong_t, arg)
{
- if ((cmd == F_GETLK64) || (cmd == F_SETLK64) || (cmd == F_SETLKW64))
+ switch (cmd) {
+ case F_GETLK64:
+ case F_SETLK64:
+ case F_SETLKW64:
+ case F_OFD_GETLK:
+ case F_OFD_SETLK:
+ case F_OFD_SETLKW:
return -EINVAL;
+ }
return compat_sys_fcntl64(fd, cmd, arg);
}
-asmlinkage long
-compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p)
+COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_reqs, u32 __user *, ctx32p)
{
long ret;
aio_context_t ctx64;
@@ -496,32 +520,24 @@ compat_sys_io_setup(unsigned nr_reqs, u32 __user *ctx32p)
return ret;
}
-asmlinkage long
-compat_sys_io_getevents(aio_context_t ctx_id,
- unsigned long min_nr,
- unsigned long nr,
- struct io_event __user *events,
- struct compat_timespec __user *timeout)
+COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
+ compat_long_t, min_nr,
+ compat_long_t, nr,
+ struct io_event __user *, events,
+ struct compat_timespec __user *, timeout)
{
- long ret;
struct timespec t;
struct timespec __user *ut = NULL;
- ret = -EFAULT;
- if (unlikely(!access_ok(VERIFY_WRITE, events,
- nr * sizeof(struct io_event))))
- goto out;
if (timeout) {
- if (get_compat_timespec(&t, timeout))
- goto out;
+ if (compat_get_timespec(&t, timeout))
+ return -EFAULT;
ut = compat_alloc_user_space(sizeof(*ut));
if (copy_to_user(ut, &t, sizeof(t)) )
- goto out;
+ return -EFAULT;
}
- ret = sys_io_getevents(ctx_id, min_nr, nr, events, ut);
-out:
- return ret;
+ return sys_io_getevents(ctx_id, min_nr, nr, events, ut);
}
/* A write operation does a read from user space and vice versa */
@@ -617,8 +633,8 @@ copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *))
-asmlinkage long
-compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
+COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
+ int, nr, u32 __user *, iocb)
{
struct iocb __user * __user *iocb64;
long ret;
@@ -770,10 +786,10 @@ static int do_nfs4_super_data_conv(void *raw_data)
#define NCPFS_NAME "ncpfs"
#define NFS4_NAME "nfs4"
-asmlinkage long compat_sys_mount(const char __user * dev_name,
- const char __user * dir_name,
- const char __user * type, unsigned long flags,
- const void __user * data)
+COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
+ const char __user *, dir_name,
+ const char __user *, type, compat_ulong_t, flags,
+ const void __user *, data)
{
char *kernel_type;
unsigned long data_page;
@@ -869,8 +885,8 @@ efault:
return -EFAULT;
}
-asmlinkage long compat_sys_old_readdir(unsigned int fd,
- struct compat_old_linux_dirent __user *dirent, unsigned int count)
+COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
+ struct compat_old_linux_dirent __user *, dirent, unsigned int, count)
{
int error;
struct fd f = fdget(fd);
@@ -948,8 +964,8 @@ efault:
return -EFAULT;
}
-asmlinkage long compat_sys_getdents(unsigned int fd,
- struct compat_linux_dirent __user *dirent, unsigned int count)
+COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
+ struct compat_linux_dirent __user *, dirent, unsigned int, count)
{
struct fd f;
struct compat_linux_dirent __user * lastdirent;
@@ -981,7 +997,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
return error;
}
-#ifndef __ARCH_OMIT_COMPAT_SYS_GETDENTS64
+#ifdef __ARCH_WANT_COMPAT_SYS_GETDENTS64
struct compat_getdents_callback64 {
struct dir_context ctx;
@@ -1033,8 +1049,8 @@ efault:
return -EFAULT;
}
-asmlinkage long compat_sys_getdents64(unsigned int fd,
- struct linux_dirent64 __user * dirent, unsigned int count)
+COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd,
+ struct linux_dirent64 __user *, dirent, unsigned int, count)
{
struct fd f;
struct linux_dirent64 __user * lastdirent;
@@ -1066,7 +1082,7 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
fdput(f);
return error;
}
-#endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */
+#endif /* __ARCH_WANT_COMPAT_SYS_GETDENTS64 */
/*
* Exactly like fs/open.c:sys_open(), except that it doesn't set the
@@ -1287,9 +1303,9 @@ out_nofds:
return ret;
}
-asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp,
- compat_ulong_t __user *outp, compat_ulong_t __user *exp,
- struct compat_timeval __user *tvp)
+COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
+ compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
+ struct compat_timeval __user *, tvp)
{
struct timespec end_time, *to = NULL;
struct compat_timeval tv;
@@ -1320,7 +1336,7 @@ struct compat_sel_arg_struct {
compat_uptr_t tvp;
};
-asmlinkage long compat_sys_old_select(struct compat_sel_arg_struct __user *arg)
+COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
{
struct compat_sel_arg_struct a;
@@ -1381,9 +1397,9 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
return ret;
}
-asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp,
- compat_ulong_t __user *outp, compat_ulong_t __user *exp,
- struct compat_timespec __user *tsp, void __user *sig)
+COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
+ compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
+ struct compat_timespec __user *, tsp, void __user *, sig)
{
compat_size_t sigsetsize = 0;
compat_uptr_t up = 0;
@@ -1400,9 +1416,9 @@ asmlinkage long compat_sys_pselect6(int n, compat_ulong_t __user *inp,
sigsetsize);
}
-asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds,
- unsigned int nfds, struct compat_timespec __user *tsp,
- const compat_sigset_t __user *sigmask, compat_size_t sigsetsize)
+COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
+ unsigned int, nfds, struct compat_timespec __user *, tsp,
+ const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
compat_sigset_t ss32;
sigset_t ksigmask, sigsaved;
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index a81147e2e4e..4d24d17bcfc 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -88,6 +88,11 @@ static void cputime_to_compat_timeval(const cputime_t cputime,
#define ELF_HWCAP COMPAT_ELF_HWCAP
#endif
+#ifdef COMPAT_ELF_HWCAP2
+#undef ELF_HWCAP2
+#define ELF_HWCAP2 COMPAT_ELF_HWCAP2
+#endif
+
#ifdef COMPAT_ARCH_DLINFO
#undef ARCH_DLINFO
#define ARCH_DLINFO COMPAT_ARCH_DLINFO
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 5d19acfa7c6..e8228904727 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -680,7 +680,8 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
struct i2c_msg __user *tmsgs;
struct i2c_msg32 __user *umsgs;
compat_caddr_t datap;
- int nmsgs, i;
+ u32 nmsgs;
+ int i;
if (get_user(nmsgs, &udata->nmsgs))
return -EFAULT;
@@ -1537,9 +1538,10 @@ static int compat_ioctl_check_table(unsigned int xcmd)
return ioctl_pointer[i] == xcmd;
}
-asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
- unsigned long arg)
+COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
+ compat_ulong_t, arg32)
{
+ unsigned long arg = arg32;
struct fd f = fdget(fd);
int error = -EBADF;
if (!f.file)
@@ -1583,13 +1585,13 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
/*FALL THROUGH*/
default:
- if (f.file->f_op && f.file->f_op->compat_ioctl) {
+ if (f.file->f_op->compat_ioctl) {
error = f.file->f_op->compat_ioctl(f.file, cmd, arg);
if (error != -ENOIOCTLCMD)
goto out_fput;
}
- if (!f.file->f_op || !f.file->f_op->unlocked_ioctl)
+ if (!f.file->f_op->unlocked_ioctl)
goto do_ioctl;
break;
}
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index b5f0a3b91f1..bd4a3c16709 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -24,6 +24,12 @@
* configfs Copyright (C) 2005 Oracle. All rights reserved.
*/
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 277bd1be21f..668dcabc569 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -56,29 +56,28 @@ static void configfs_d_iput(struct dentry * dentry,
struct configfs_dirent *sd = dentry->d_fsdata;
if (sd) {
- BUG_ON(sd->s_dentry != dentry);
/* Coordinate with configfs_readdir */
spin_lock(&configfs_dirent_lock);
- sd->s_dentry = NULL;
+ /* Coordinate with configfs_attach_attr where will increase
+ * sd->s_count and update sd->s_dentry to new allocated one.
+ * Only set sd->dentry to null when this dentry is the only
+ * sd owner.
+ * If not do so, configfs_d_iput may run just after
+ * configfs_attach_attr and set sd->s_dentry to null
+ * even it's still in use.
+ */
+ if (atomic_read(&sd->s_count) <= 2)
+ sd->s_dentry = NULL;
+
spin_unlock(&configfs_dirent_lock);
configfs_put(sd);
}
iput(inode);
}
-/*
- * We _must_ delete our dentries on last dput, as the chain-to-parent
- * behavior is required to clear the parents of default_groups.
- */
-static int configfs_d_delete(const struct dentry *dentry)
-{
- return 1;
-}
-
const struct dentry_operations configfs_dentry_ops = {
.d_iput = configfs_d_iput,
- /* simple_delete_dentry() isn't exported */
- .d_delete = configfs_d_delete,
+ .d_delete = always_delete_dentry,
};
#ifdef CONFIG_LOCKDEP
@@ -426,8 +425,11 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
struct configfs_attribute * attr = sd->s_element;
int error;
+ spin_lock(&configfs_dirent_lock);
dentry->d_fsdata = configfs_get(sd);
sd->s_dentry = dentry;
+ spin_unlock(&configfs_dirent_lock);
+
error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG,
configfs_init_file);
if (error) {
@@ -938,9 +940,9 @@ static void client_drop_item(struct config_item *parent_item,
#ifdef DEBUG
static void configfs_dump_one(struct configfs_dirent *sd, int level)
{
- printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd));
+ pr_info("%*s\"%s\":\n", level, " ", configfs_get_name(sd));
-#define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type);
+#define type_print(_type) if (sd->s_type & _type) pr_info("%*s %s\n", level, " ", #_type);
type_print(CONFIGFS_ROOT);
type_print(CONFIGFS_DIR);
type_print(CONFIGFS_ITEM_ATTR);
@@ -1697,7 +1699,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
struct dentry *root = dentry->d_sb->s_root;
if (dentry->d_parent != root) {
- printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n");
+ pr_err("Tried to unregister non-subsystem!\n");
return;
}
@@ -1707,7 +1709,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
mutex_lock(&configfs_symlink_mutex);
spin_lock(&configfs_dirent_lock);
if (configfs_detach_prep(dentry, NULL)) {
- printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n");
+ pr_err("Tried to unregister non-empty subsystem!\n");
}
spin_unlock(&configfs_dirent_lock);
mutex_unlock(&configfs_symlink_mutex);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index a9d35b0e06c..5946ad98053 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -168,9 +168,8 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd,
* In practice the maximum level of locking depth is
* already reached. Just inform about possible reasons.
*/
- printk(KERN_INFO "configfs: Too many levels of inodes"
- " for the locking correctness validator.\n");
- printk(KERN_INFO "Spurious warnings may appear.\n");
+ pr_info("Too many levels of inodes for the locking correctness validator.\n");
+ pr_info("Spurious warnings may appear.\n");
}
}
}
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 50cee7f9110..e65f9ffbb99 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -19,7 +19,7 @@
* Boston, MA 021110-1307, USA.
*
* Based on kobject:
- * kobject is Copyright (c) 2002-2003 Patrick Mochel
+ * kobject is Copyright (c) 2002-2003 Patrick Mochel
*
* configfs Copyright (C) 2005 Oracle. All rights reserved.
*
@@ -35,9 +35,9 @@
#include <linux/configfs.h>
-static inline struct config_item * to_item(struct list_head * entry)
+static inline struct config_item *to_item(struct list_head *entry)
{
- return container_of(entry,struct config_item,ci_entry);
+ return container_of(entry, struct config_item, ci_entry);
}
/* Evil kernel */
@@ -47,34 +47,35 @@ static void config_item_release(struct kref *kref);
* config_item_init - initialize item.
* @item: item in question.
*/
-void config_item_init(struct config_item * item)
+void config_item_init(struct config_item *item)
{
kref_init(&item->ci_kref);
INIT_LIST_HEAD(&item->ci_entry);
}
+EXPORT_SYMBOL(config_item_init);
/**
* config_item_set_name - Set the name of an item
* @item: item.
- * @name: name.
+ * @fmt: The vsnprintf()'s format string.
*
* If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a
* dynamically allocated string that @item->ci_name points to.
* Otherwise, use the static @item->ci_namebuf array.
*/
-int config_item_set_name(struct config_item * item, const char * fmt, ...)
+int config_item_set_name(struct config_item *item, const char *fmt, ...)
{
int error = 0;
int limit = CONFIGFS_ITEM_NAME_LEN;
int need;
va_list args;
- char * name;
+ char *name;
/*
* First, try the static array
*/
- va_start(args,fmt);
- need = vsnprintf(item->ci_namebuf,limit,fmt,args);
+ va_start(args, fmt);
+ need = vsnprintf(item->ci_namebuf, limit, fmt, args);
va_end(args);
if (need < limit)
name = item->ci_namebuf;
@@ -83,13 +84,13 @@ int config_item_set_name(struct config_item * item, const char * fmt, ...)
* Need more space? Allocate it and try again
*/
limit = need + 1;
- name = kmalloc(limit,GFP_KERNEL);
+ name = kmalloc(limit, GFP_KERNEL);
if (!name) {
error = -ENOMEM;
goto Done;
}
- va_start(args,fmt);
- need = vsnprintf(name,limit,fmt,args);
+ va_start(args, fmt);
+ need = vsnprintf(name, limit, fmt, args);
va_end(args);
/* Still? Give up. */
@@ -109,7 +110,6 @@ int config_item_set_name(struct config_item * item, const char * fmt, ...)
Done:
return error;
}
-
EXPORT_SYMBOL(config_item_set_name);
void config_item_init_type_name(struct config_item *item,
@@ -131,20 +131,21 @@ void config_group_init_type_name(struct config_group *group, const char *name,
}
EXPORT_SYMBOL(config_group_init_type_name);
-struct config_item * config_item_get(struct config_item * item)
+struct config_item *config_item_get(struct config_item *item)
{
if (item)
kref_get(&item->ci_kref);
return item;
}
+EXPORT_SYMBOL(config_item_get);
-static void config_item_cleanup(struct config_item * item)
+static void config_item_cleanup(struct config_item *item)
{
- struct config_item_type * t = item->ci_type;
- struct config_group * s = item->ci_group;
- struct config_item * parent = item->ci_parent;
+ struct config_item_type *t = item->ci_type;
+ struct config_group *s = item->ci_group;
+ struct config_item *parent = item->ci_parent;
- pr_debug("config_item %s: cleaning up\n",config_item_name(item));
+ pr_debug("config_item %s: cleaning up\n", config_item_name(item));
if (item->ci_name != item->ci_namebuf)
kfree(item->ci_name);
item->ci_name = NULL;
@@ -167,21 +168,23 @@ static void config_item_release(struct kref *kref)
*
* Decrement the refcount, and if 0, call config_item_cleanup().
*/
-void config_item_put(struct config_item * item)
+void config_item_put(struct config_item *item)
{
if (item)
kref_put(&item->ci_kref, config_item_release);
}
+EXPORT_SYMBOL(config_item_put);
/**
* config_group_init - initialize a group for use
- * @k: group
+ * @group: config_group
*/
void config_group_init(struct config_group *group)
{
config_item_init(&group->cg_item);
INIT_LIST_HEAD(&group->cg_children);
}
+EXPORT_SYMBOL(config_group_init);
/**
* config_group_find_item - search for item in group.
@@ -195,11 +198,11 @@ void config_group_init(struct config_group *group)
struct config_item *config_group_find_item(struct config_group *group,
const char *name)
{
- struct list_head * entry;
- struct config_item * ret = NULL;
+ struct list_head *entry;
+ struct config_item *ret = NULL;
- list_for_each(entry,&group->cg_children) {
- struct config_item * item = to_item(entry);
+ list_for_each(entry, &group->cg_children) {
+ struct config_item *item = to_item(entry);
if (config_item_name(item) &&
!strcmp(config_item_name(item), name)) {
ret = config_item_get(item);
@@ -208,9 +211,4 @@ struct config_item *config_group_find_item(struct config_group *group,
}
return ret;
}
-
-EXPORT_SYMBOL(config_item_init);
-EXPORT_SYMBOL(config_group_init);
-EXPORT_SYMBOL(config_item_get);
-EXPORT_SYMBOL(config_item_put);
EXPORT_SYMBOL(config_group_find_item);
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 7f26c3cf75a..f6c28583339 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -85,7 +85,7 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent)
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
} else {
- pr_debug("configfs: could not get root inode\n");
+ pr_debug("could not get root inode\n");
return -ENOMEM;
}
@@ -155,7 +155,7 @@ static int __init configfs_init(void)
return 0;
out4:
- printk(KERN_ERR "configfs: Unable to register filesystem!\n");
+ pr_err("Unable to register filesystem!\n");
configfs_inode_exit();
out3:
kobject_put(config_kobj);
diff --git a/fs/coredump.c b/fs/coredump.c
index 9bdeca12ae0..a93f7e6ea4c 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -40,7 +40,6 @@
#include <trace/events/task.h>
#include "internal.h"
-#include "coredump.h"
#include <trace/events/sched.h>
@@ -74,10 +73,15 @@ static int expand_corename(struct core_name *cn, int size)
static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg)
{
int free, need;
+ va_list arg_copy;
again:
free = cn->size - cn->used;
- need = vsnprintf(cn->corename + cn->used, free, fmt, arg);
+
+ va_copy(arg_copy, arg);
+ need = vsnprintf(cn->corename + cn->used, free, fmt, arg_copy);
+ va_end(arg_copy);
+
if (need < free) {
cn->used += need;
return 0;
@@ -302,7 +306,7 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
if (unlikely(nr < 0))
return nr;
- tsk->flags = PF_DUMPCORE;
+ tsk->flags |= PF_DUMPCORE;
if (atomic_read(&mm->mm_users) == nr + 1)
goto done;
/*
@@ -485,7 +489,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
return err;
}
-void do_coredump(siginfo_t *siginfo)
+void do_coredump(const siginfo_t *siginfo)
{
struct core_state core_state;
struct core_name cn;
@@ -645,7 +649,7 @@ void do_coredump(siginfo_t *siginfo)
*/
if (!uid_eq(inode->i_uid, current_fsuid()))
goto close_fail;
- if (!cprm.file->f_op || !cprm.file->f_op->write)
+ if (!cprm.file->f_op->write)
goto close_fail;
if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
goto close_fail;
@@ -685,40 +689,55 @@ fail:
* do on a core-file: use only these functions to write out all the
* necessary info.
*/
-int dump_write(struct file *file, const void *addr, int nr)
+int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
{
- return !dump_interrupted() &&
- access_ok(VERIFY_READ, addr, nr) &&
- file->f_op->write(file, addr, nr, &file->f_pos) == nr;
+ struct file *file = cprm->file;
+ loff_t pos = file->f_pos;
+ ssize_t n;
+ if (cprm->written + nr > cprm->limit)
+ return 0;
+ while (nr) {
+ if (dump_interrupted())
+ return 0;
+ n = __kernel_write(file, addr, nr, &pos);
+ if (n <= 0)
+ return 0;
+ file->f_pos = pos;
+ cprm->written += n;
+ nr -= n;
+ }
+ return 1;
}
-EXPORT_SYMBOL(dump_write);
+EXPORT_SYMBOL(dump_emit);
-int dump_seek(struct file *file, loff_t off)
+int dump_skip(struct coredump_params *cprm, size_t nr)
{
- int ret = 1;
-
+ static char zeroes[PAGE_SIZE];
+ struct file *file = cprm->file;
if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
+ if (cprm->written + nr > cprm->limit)
+ return 0;
if (dump_interrupted() ||
- file->f_op->llseek(file, off, SEEK_CUR) < 0)
+ file->f_op->llseek(file, nr, SEEK_CUR) < 0)
return 0;
+ cprm->written += nr;
+ return 1;
} else {
- char *buf = (char *)get_zeroed_page(GFP_KERNEL);
-
- if (!buf)
- return 0;
- while (off > 0) {
- unsigned long n = off;
-
- if (n > PAGE_SIZE)
- n = PAGE_SIZE;
- if (!dump_write(file, buf, n)) {
- ret = 0;
- break;
- }
- off -= n;
+ while (nr > PAGE_SIZE) {
+ if (!dump_emit(cprm, zeroes, PAGE_SIZE))
+ return 0;
+ nr -= PAGE_SIZE;
}
- free_page((unsigned long)buf);
+ return dump_emit(cprm, zeroes, nr);
}
- return ret;
}
-EXPORT_SYMBOL(dump_seek);
+EXPORT_SYMBOL(dump_skip);
+
+int dump_align(struct coredump_params *cprm, int align)
+{
+ unsigned mod = cprm->written & (align - 1);
+ if (align & (align - 1))
+ return 0;
+ return mod ? dump_skip(cprm, align - mod) : 1;
+}
+EXPORT_SYMBOL(dump_align);
diff --git a/fs/coredump.h b/fs/coredump.h
deleted file mode 100644
index e39ff072110..00000000000
--- a/fs/coredump.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef _FS_COREDUMP_H
-#define _FS_COREDUMP_H
-
-extern int __get_dumpable(unsigned long mm_flags);
-
-#endif
diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig
index cd06466f365..11b29d491b7 100644
--- a/fs/cramfs/Kconfig
+++ b/fs/cramfs/Kconfig
@@ -1,5 +1,5 @@
config CRAMFS
- tristate "Compressed ROM file system support (cramfs)"
+ tristate "Compressed ROM file system support (cramfs) (OBSOLETE)"
depends on BLOCK
select ZLIB_INFLATE
help
@@ -16,4 +16,7 @@ config CRAMFS
cramfs. Note that the root file system (the one containing the
directory /) cannot be compiled as a module.
+ This filesystem is obsoleted by SquashFS, which is much better
+ in terms of performance and features.
+
If unsure, say N.
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index e501ac3a49f..ddcfe590b8a 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -17,14 +17,30 @@
#include <linux/init.h>
#include <linux/string.h>
#include <linux/blkdev.h>
-#include <linux/cramfs_fs.h>
#include <linux/slab.h>
-#include <linux/cramfs_fs_sb.h>
#include <linux/vfs.h>
#include <linux/mutex.h>
-
+#include <uapi/linux/cramfs_fs.h>
#include <asm/uaccess.h>
+#include "internal.h"
+
+/*
+ * cramfs super-block data in memory
+ */
+struct cramfs_sb_info {
+ unsigned long magic;
+ unsigned long size;
+ unsigned long blocks;
+ unsigned long files;
+ unsigned long flags;
+};
+
+static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
static const struct super_operations cramfs_ops;
static const struct inode_operations cramfs_dir_inode_operations;
static const struct file_operations cramfs_directory_operations;
@@ -179,8 +195,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
struct page *page = NULL;
if (blocknr + i < devsize) {
- page = read_mapping_page_async(mapping, blocknr + i,
- NULL);
+ page = read_mapping_page(mapping, blocknr + i, NULL);
/* synchronous error? */
if (IS_ERR(page))
page = NULL;
@@ -219,14 +234,16 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
return read_buffers[buffer] + offset;
}
-static void cramfs_put_super(struct super_block *sb)
+static void cramfs_kill_sb(struct super_block *sb)
{
- kfree(sb->s_fs_info);
- sb->s_fs_info = NULL;
+ struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
+ kill_block_super(sb);
+ kfree(sbi);
}
static int cramfs_remount(struct super_block *sb, int *flags, char *data)
{
+ sync_filesystem(sb);
*flags |= MS_RDONLY;
return 0;
}
@@ -261,7 +278,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
if (super.magic == CRAMFS_MAGIC_WEND) {
if (!silent)
printk(KERN_ERR "cramfs: wrong endianness\n");
- goto out;
+ return -EINVAL;
}
/* check at 512 byte offset */
@@ -273,20 +290,20 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
printk(KERN_ERR "cramfs: wrong endianness\n");
else if (!silent)
printk(KERN_ERR "cramfs: wrong magic\n");
- goto out;
+ return -EINVAL;
}
}
/* get feature flags first */
if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) {
printk(KERN_ERR "cramfs: unsupported filesystem features\n");
- goto out;
+ return -EINVAL;
}
/* Check that the root inode is in a sane state */
if (!S_ISDIR(super.root.mode)) {
printk(KERN_ERR "cramfs: root is not a directory\n");
- goto out;
+ return -EINVAL;
}
/* correct strange, hard-coded permissions of mkcramfs */
super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
@@ -310,22 +327,18 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
(root_offset != 512 + sizeof(struct cramfs_super))))
{
printk(KERN_ERR "cramfs: bad root offset %lu\n", root_offset);
- goto out;
+ return -EINVAL;
}
/* Set it all up.. */
sb->s_op = &cramfs_ops;
root = get_cramfs_inode(sb, &super.root, 0);
if (IS_ERR(root))
- goto out;
+ return PTR_ERR(root);
sb->s_root = d_make_root(root);
if (!sb->s_root)
- goto out;
+ return -ENOMEM;
return 0;
-out:
- kfree(sbi);
- sb->s_fs_info = NULL;
- return -EINVAL;
}
static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -550,7 +563,6 @@ static const struct inode_operations cramfs_dir_inode_operations = {
};
static const struct super_operations cramfs_ops = {
- .put_super = cramfs_put_super,
.remount_fs = cramfs_remount,
.statfs = cramfs_statfs,
};
@@ -565,7 +577,7 @@ static struct file_system_type cramfs_fs_type = {
.owner = THIS_MODULE,
.name = "cramfs",
.mount = cramfs_mount,
- .kill_sb = kill_block_super,
+ .kill_sb = cramfs_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("cramfs");
diff --git a/fs/cramfs/internal.h b/fs/cramfs/internal.h
new file mode 100644
index 00000000000..349d7127215
--- /dev/null
+++ b/fs/cramfs/internal.h
@@ -0,0 +1,4 @@
+/* Uncompression interfaces to the underlying zlib */
+int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen);
+int cramfs_uncompress_init(void);
+void cramfs_uncompress_exit(void);
diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c
index 023329800d2..1760c1b84d9 100644
--- a/fs/cramfs/uncompress.c
+++ b/fs/cramfs/uncompress.c
@@ -19,7 +19,7 @@
#include <linux/errno.h>
#include <linux/vmalloc.h>
#include <linux/zlib.h>
-#include <linux/cramfs_fs.h>
+#include "internal.h"
static z_stream stream;
static int initialized;
diff --git a/fs/dcache.c b/fs/dcache.c
index 41000305d71..06f65857a85 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -88,35 +88,6 @@ EXPORT_SYMBOL(rename_lock);
static struct kmem_cache *dentry_cache __read_mostly;
-/**
- * read_seqbegin_or_lock - begin a sequence number check or locking block
- * @lock: sequence lock
- * @seq : sequence number to be checked
- *
- * First try it once optimistically without taking the lock. If that fails,
- * take the lock. The sequence number is also used as a marker for deciding
- * whether to be a reader (even) or writer (odd).
- * N.B. seq must be initialized to an even number to begin with.
- */
-static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
-{
- if (!(*seq & 1)) /* Even */
- *seq = read_seqbegin(lock);
- else /* Odd */
- read_seqlock_excl(lock);
-}
-
-static inline int need_seqretry(seqlock_t *lock, int seq)
-{
- return !(seq & 1) && read_seqretry(lock, seq);
-}
-
-static inline void done_seqretry(seqlock_t *lock, int seq)
-{
- if (seq & 1)
- read_sequnlock_excl(lock);
-}
-
/*
* This is the single most critical data structure when it comes
* to the dcache: the hashtable for lookups. Somebody should try
@@ -125,8 +96,6 @@ static inline void done_seqretry(seqlock_t *lock, int seq)
* This hash-function tries to avoid losing too many bits of hash
* information, yet avoid using a prime hash-size or similar.
*/
-#define D_HASHBITS d_hash_shift
-#define D_HASHMASK d_hash_mask
static unsigned int d_hash_mask __read_mostly;
static unsigned int d_hash_shift __read_mostly;
@@ -137,8 +106,8 @@ static inline struct hlist_bl_head *d_hash(const struct dentry *parent,
unsigned int hash)
{
hash += (unsigned long) parent / L1_CACHE_BYTES;
- hash = hash + (hash >> D_HASHBITS);
- return dentry_hashtable + (hash & D_HASHMASK);
+ hash = hash + (hash >> d_hash_shift);
+ return dentry_hashtable + (hash & d_hash_mask);
}
/* Statistics gathering. */
@@ -181,7 +150,7 @@ static long get_nr_dentry_unused(void)
return sum < 0 ? 0 : sum;
}
-int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
+int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer,
size_t *lenp, loff_t *ppos)
{
dentry_stat.nr_dentry = get_nr_dentry();
@@ -223,7 +192,7 @@ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char
if (!tcount)
return 0;
}
- mask = ~(~0ul << tcount*8);
+ mask = bytemask_from_count(tcount);
return unlikely(!!((a ^ b) & mask));
}
@@ -277,16 +246,8 @@ static void __d_free(struct rcu_head *head)
kmem_cache_free(dentry_cache, dentry);
}
-/*
- * no locks, please.
- */
-static void d_free(struct dentry *dentry)
+static void dentry_free(struct dentry *dentry)
{
- BUG_ON((int)dentry->d_lockref.count > 0);
- this_cpu_dec(nr_dentry);
- if (dentry->d_op && dentry->d_op->d_release)
- dentry->d_op->d_release(dentry);
-
/* if dentry was never visible to RCU, immediate free is OK */
if (!(dentry->d_flags & DCACHE_RCUACCESS))
__d_free(&dentry->d_u.d_rcu);
@@ -343,6 +304,7 @@ static void dentry_unlink_inode(struct dentry * dentry)
__releases(dentry->d_inode->i_lock)
{
struct inode *inode = dentry->d_inode;
+ __d_clear_type(dentry);
dentry->d_inode = NULL;
hlist_del_init(&dentry->d_alias);
dentry_rcuwalk_barrier(dentry);
@@ -433,77 +395,6 @@ static void dentry_lru_add(struct dentry *dentry)
d_lru_add(dentry);
}
-/*
- * Remove a dentry with references from the LRU.
- *
- * If we are on the shrink list, then we can get to try_prune_one_dentry() and
- * lose our last reference through the parent walk. In this case, we need to
- * remove ourselves from the shrink list, not the LRU.
- */
-static void dentry_lru_del(struct dentry *dentry)
-{
- if (dentry->d_flags & DCACHE_LRU_LIST) {
- if (dentry->d_flags & DCACHE_SHRINK_LIST)
- return d_shrink_del(dentry);
- d_lru_del(dentry);
- }
-}
-
-/**
- * d_kill - kill dentry and return parent
- * @dentry: dentry to kill
- * @parent: parent dentry
- *
- * The dentry must already be unhashed and removed from the LRU.
- *
- * If this is the root of the dentry tree, return NULL.
- *
- * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by
- * d_kill.
- */
-static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
- __releases(dentry->d_lock)
- __releases(parent->d_lock)
- __releases(dentry->d_inode->i_lock)
-{
- list_del(&dentry->d_u.d_child);
- /*
- * Inform try_to_ascend() that we are no longer attached to the
- * dentry tree
- */
- dentry->d_flags |= DCACHE_DENTRY_KILLED;
- if (parent)
- spin_unlock(&parent->d_lock);
- dentry_iput(dentry);
- /*
- * dentry_iput drops the locks, at which point nobody (except
- * transient RCU lookups) can reach this dentry.
- */
- d_free(dentry);
- return parent;
-}
-
-/*
- * Unhash a dentry without inserting an RCU walk barrier or checking that
- * dentry->d_lock is locked. The caller must take care of that, if
- * appropriate.
- */
-static void __d_shrink(struct dentry *dentry)
-{
- if (!d_unhashed(dentry)) {
- struct hlist_bl_head *b;
- if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
- b = &dentry->d_sb->s_anon;
- else
- b = d_hash(dentry->d_parent, dentry->d_name.hash);
-
- hlist_bl_lock(b);
- __hlist_bl_del(&dentry->d_hash);
- dentry->d_hash.pprev = NULL;
- hlist_bl_unlock(b);
- }
-}
-
/**
* d_drop - drop a dentry
* @dentry: dentry to drop
@@ -522,7 +413,21 @@ static void __d_shrink(struct dentry *dentry)
void __d_drop(struct dentry *dentry)
{
if (!d_unhashed(dentry)) {
- __d_shrink(dentry);
+ struct hlist_bl_head *b;
+ /*
+ * Hashed dentries are normally on the dentry hashtable,
+ * with the exception of those newly allocated by
+ * d_obtain_alias, which are always IS_ROOT:
+ */
+ if (unlikely(IS_ROOT(dentry)))
+ b = &dentry->d_sb->s_anon;
+ else
+ b = d_hash(dentry->d_parent, dentry->d_name.hash);
+
+ hlist_bl_lock(b);
+ __hlist_bl_del(&dentry->d_hash);
+ dentry->d_hash.pprev = NULL;
+ hlist_bl_unlock(b);
dentry_rcuwalk_barrier(dentry);
}
}
@@ -536,37 +441,12 @@ void d_drop(struct dentry *dentry)
}
EXPORT_SYMBOL(d_drop);
-/*
- * Finish off a dentry we've decided to kill.
- * dentry->d_lock must be held, returns with it unlocked.
- * If ref is non-zero, then decrement the refcount too.
- * Returns dentry requiring refcount drop, or NULL if we're done.
- */
-static inline struct dentry *
-dentry_kill(struct dentry *dentry, int unlock_on_failure)
- __releases(dentry->d_lock)
+static void __dentry_kill(struct dentry *dentry)
{
- struct inode *inode;
- struct dentry *parent;
-
- inode = dentry->d_inode;
- if (inode && !spin_trylock(&inode->i_lock)) {
-relock:
- if (unlock_on_failure) {
- spin_unlock(&dentry->d_lock);
- cpu_relax();
- }
- return dentry; /* try again with same dentry */
- }
- if (IS_ROOT(dentry))
- parent = NULL;
- else
+ struct dentry *parent = NULL;
+ bool can_free = true;
+ if (!IS_ROOT(dentry))
parent = dentry->d_parent;
- if (parent && !spin_trylock(&parent->d_lock)) {
- if (inode)
- spin_unlock(&inode->i_lock);
- goto relock;
- }
/*
* The dentry is now unrecoverably dead to the world.
@@ -580,10 +460,105 @@ relock:
if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry))
dentry->d_op->d_prune(dentry);
- dentry_lru_del(dentry);
+ if (dentry->d_flags & DCACHE_LRU_LIST) {
+ if (!(dentry->d_flags & DCACHE_SHRINK_LIST))
+ d_lru_del(dentry);
+ }
/* if it was on the hash then remove it */
__d_drop(dentry);
- return d_kill(dentry, parent);
+ list_del(&dentry->d_u.d_child);
+ /*
+ * Inform d_walk() that we are no longer attached to the
+ * dentry tree
+ */
+ dentry->d_flags |= DCACHE_DENTRY_KILLED;
+ if (parent)
+ spin_unlock(&parent->d_lock);
+ dentry_iput(dentry);
+ /*
+ * dentry_iput drops the locks, at which point nobody (except
+ * transient RCU lookups) can reach this dentry.
+ */
+ BUG_ON((int)dentry->d_lockref.count > 0);
+ this_cpu_dec(nr_dentry);
+ if (dentry->d_op && dentry->d_op->d_release)
+ dentry->d_op->d_release(dentry);
+
+ spin_lock(&dentry->d_lock);
+ if (dentry->d_flags & DCACHE_SHRINK_LIST) {
+ dentry->d_flags |= DCACHE_MAY_FREE;
+ can_free = false;
+ }
+ spin_unlock(&dentry->d_lock);
+ if (likely(can_free))
+ dentry_free(dentry);
+}
+
+/*
+ * Finish off a dentry we've decided to kill.
+ * dentry->d_lock must be held, returns with it unlocked.
+ * If ref is non-zero, then decrement the refcount too.
+ * Returns dentry requiring refcount drop, or NULL if we're done.
+ */
+static struct dentry *dentry_kill(struct dentry *dentry)
+ __releases(dentry->d_lock)
+{
+ struct inode *inode = dentry->d_inode;
+ struct dentry *parent = NULL;
+
+ if (inode && unlikely(!spin_trylock(&inode->i_lock)))
+ goto failed;
+
+ if (!IS_ROOT(dentry)) {
+ parent = dentry->d_parent;
+ if (unlikely(!spin_trylock(&parent->d_lock))) {
+ if (inode)
+ spin_unlock(&inode->i_lock);
+ goto failed;
+ }
+ }
+
+ __dentry_kill(dentry);
+ return parent;
+
+failed:
+ spin_unlock(&dentry->d_lock);
+ cpu_relax();
+ return dentry; /* try again with same dentry */
+}
+
+static inline struct dentry *lock_parent(struct dentry *dentry)
+{
+ struct dentry *parent = dentry->d_parent;
+ if (IS_ROOT(dentry))
+ return NULL;
+ if (unlikely((int)dentry->d_lockref.count < 0))
+ return NULL;
+ if (likely(spin_trylock(&parent->d_lock)))
+ return parent;
+ rcu_read_lock();
+ spin_unlock(&dentry->d_lock);
+again:
+ parent = ACCESS_ONCE(dentry->d_parent);
+ spin_lock(&parent->d_lock);
+ /*
+ * We can't blindly lock dentry until we are sure
+ * that we won't violate the locking order.
+ * Any changes of dentry->d_parent must have
+ * been done with parent->d_lock held, so
+ * spin_lock() above is enough of a barrier
+ * for checking if it's still our child.
+ */
+ if (unlikely(parent != dentry->d_parent)) {
+ spin_unlock(&parent->d_lock);
+ goto again;
+ }
+ rcu_read_unlock();
+ if (parent != dentry)
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ else
+ parent = NULL;
+ return parent;
}
/*
@@ -630,7 +605,8 @@ repeat:
goto kill_it;
}
- dentry->d_flags |= DCACHE_REFERENCED;
+ if (!(dentry->d_flags & DCACHE_REFERENCED))
+ dentry->d_flags |= DCACHE_REFERENCED;
dentry_lru_add(dentry);
dentry->d_lockref.count--;
@@ -638,7 +614,7 @@ repeat:
return;
kill_it:
- dentry = dentry_kill(dentry, 1);
+ dentry = dentry_kill(dentry);
if (dentry)
goto repeat;
}
@@ -851,64 +827,15 @@ restart:
}
EXPORT_SYMBOL(d_prune_aliases);
-/*
- * Try to throw away a dentry - free the inode, dput the parent.
- * Requires dentry->d_lock is held, and dentry->d_count == 0.
- * Releases dentry->d_lock.
- *
- * This may fail if locks cannot be acquired no problem, just try again.
- */
-static struct dentry * try_prune_one_dentry(struct dentry *dentry)
- __releases(dentry->d_lock)
-{
- struct dentry *parent;
-
- parent = dentry_kill(dentry, 0);
- /*
- * If dentry_kill returns NULL, we have nothing more to do.
- * if it returns the same dentry, trylocks failed. In either
- * case, just loop again.
- *
- * Otherwise, we need to prune ancestors too. This is necessary
- * to prevent quadratic behavior of shrink_dcache_parent(), but
- * is also expected to be beneficial in reducing dentry cache
- * fragmentation.
- */
- if (!parent)
- return NULL;
- if (parent == dentry)
- return dentry;
-
- /* Prune ancestors. */
- dentry = parent;
- while (dentry) {
- if (lockref_put_or_lock(&dentry->d_lockref))
- return NULL;
- dentry = dentry_kill(dentry, 1);
- }
- return NULL;
-}
-
static void shrink_dentry_list(struct list_head *list)
{
- struct dentry *dentry;
-
- rcu_read_lock();
- for (;;) {
- dentry = list_entry_rcu(list->prev, struct dentry, d_lru);
- if (&dentry->d_lru == list)
- break; /* empty */
+ struct dentry *dentry, *parent;
- /*
- * Get the dentry lock, and re-verify that the dentry is
- * this on the shrinking list. If it is, we know that
- * DCACHE_SHRINK_LIST and DCACHE_LRU_LIST are set.
- */
+ while (!list_empty(list)) {
+ struct inode *inode;
+ dentry = list_entry(list->prev, struct dentry, d_lru);
spin_lock(&dentry->d_lock);
- if (dentry != list_entry(list->prev, struct dentry, d_lru)) {
- spin_unlock(&dentry->d_lock);
- continue;
- }
+ parent = lock_parent(dentry);
/*
* The dispose list is isolated and dentries are not accounted
@@ -921,30 +848,63 @@ static void shrink_dentry_list(struct list_head *list)
* We found an inuse dentry which was not removed from
* the LRU because of laziness during lookup. Do not free it.
*/
- if (dentry->d_lockref.count) {
+ if ((int)dentry->d_lockref.count > 0) {
spin_unlock(&dentry->d_lock);
+ if (parent)
+ spin_unlock(&parent->d_lock);
continue;
}
- rcu_read_unlock();
- /*
- * If 'try_to_prune()' returns a dentry, it will
- * be the same one we passed in, and d_lock will
- * have been held the whole time, so it will not
- * have been added to any other lists. We failed
- * to get the inode lock.
- *
- * We just add it back to the shrink list.
- */
- dentry = try_prune_one_dentry(dentry);
- rcu_read_lock();
- if (dentry) {
+ if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) {
+ bool can_free = dentry->d_flags & DCACHE_MAY_FREE;
+ spin_unlock(&dentry->d_lock);
+ if (parent)
+ spin_unlock(&parent->d_lock);
+ if (can_free)
+ dentry_free(dentry);
+ continue;
+ }
+
+ inode = dentry->d_inode;
+ if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
d_shrink_add(dentry, list);
spin_unlock(&dentry->d_lock);
+ if (parent)
+ spin_unlock(&parent->d_lock);
+ continue;
+ }
+
+ __dentry_kill(dentry);
+
+ /*
+ * We need to prune ancestors too. This is necessary to prevent
+ * quadratic behavior of shrink_dcache_parent(), but is also
+ * expected to be beneficial in reducing dentry cache
+ * fragmentation.
+ */
+ dentry = parent;
+ while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) {
+ parent = lock_parent(dentry);
+ if (dentry->d_lockref.count != 1) {
+ dentry->d_lockref.count--;
+ spin_unlock(&dentry->d_lock);
+ if (parent)
+ spin_unlock(&parent->d_lock);
+ break;
+ }
+ inode = dentry->d_inode; /* can't be NULL */
+ if (unlikely(!spin_trylock(&inode->i_lock))) {
+ spin_unlock(&dentry->d_lock);
+ if (parent)
+ spin_unlock(&parent->d_lock);
+ cpu_relax();
+ continue;
+ }
+ __dentry_kill(dentry);
+ dentry = parent;
}
}
- rcu_read_unlock();
}
static enum lru_status
@@ -1074,144 +1034,6 @@ void shrink_dcache_sb(struct super_block *sb)
}
EXPORT_SYMBOL(shrink_dcache_sb);
-/*
- * destroy a single subtree of dentries for unmount
- * - see the comments on shrink_dcache_for_umount() for a description of the
- * locking
- */
-static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
-{
- struct dentry *parent;
-
- BUG_ON(!IS_ROOT(dentry));
-
- for (;;) {
- /* descend to the first leaf in the current subtree */
- while (!list_empty(&dentry->d_subdirs))
- dentry = list_entry(dentry->d_subdirs.next,
- struct dentry, d_u.d_child);
-
- /* consume the dentries from this leaf up through its parents
- * until we find one with children or run out altogether */
- do {
- struct inode *inode;
-
- /*
- * inform the fs that this dentry is about to be
- * unhashed and destroyed.
- */
- if ((dentry->d_flags & DCACHE_OP_PRUNE) &&
- !d_unhashed(dentry))
- dentry->d_op->d_prune(dentry);
-
- dentry_lru_del(dentry);
- __d_shrink(dentry);
-
- if (dentry->d_lockref.count != 0) {
- printk(KERN_ERR
- "BUG: Dentry %p{i=%lx,n=%s}"
- " still in use (%d)"
- " [unmount of %s %s]\n",
- dentry,
- dentry->d_inode ?
- dentry->d_inode->i_ino : 0UL,
- dentry->d_name.name,
- dentry->d_lockref.count,
- dentry->d_sb->s_type->name,
- dentry->d_sb->s_id);
- BUG();
- }
-
- if (IS_ROOT(dentry)) {
- parent = NULL;
- list_del(&dentry->d_u.d_child);
- } else {
- parent = dentry->d_parent;
- parent->d_lockref.count--;
- list_del(&dentry->d_u.d_child);
- }
-
- inode = dentry->d_inode;
- if (inode) {
- dentry->d_inode = NULL;
- hlist_del_init(&dentry->d_alias);
- if (dentry->d_op && dentry->d_op->d_iput)
- dentry->d_op->d_iput(dentry, inode);
- else
- iput(inode);
- }
-
- d_free(dentry);
-
- /* finished when we fall off the top of the tree,
- * otherwise we ascend to the parent and move to the
- * next sibling if there is one */
- if (!parent)
- return;
- dentry = parent;
- } while (list_empty(&dentry->d_subdirs));
-
- dentry = list_entry(dentry->d_subdirs.next,
- struct dentry, d_u.d_child);
- }
-}
-
-/*
- * destroy the dentries attached to a superblock on unmounting
- * - we don't need to use dentry->d_lock because:
- * - the superblock is detached from all mountings and open files, so the
- * dentry trees will not be rearranged by the VFS
- * - s_umount is write-locked, so the memory pressure shrinker will ignore
- * any dentries belonging to this superblock that it comes across
- * - the filesystem itself is no longer permitted to rearrange the dentries
- * in this superblock
- */
-void shrink_dcache_for_umount(struct super_block *sb)
-{
- struct dentry *dentry;
-
- if (down_read_trylock(&sb->s_umount))
- BUG();
-
- dentry = sb->s_root;
- sb->s_root = NULL;
- dentry->d_lockref.count--;
- shrink_dcache_for_umount_subtree(dentry);
-
- while (!hlist_bl_empty(&sb->s_anon)) {
- dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash);
- shrink_dcache_for_umount_subtree(dentry);
- }
-}
-
-/*
- * This tries to ascend one level of parenthood, but
- * we can race with renaming, so we need to re-check
- * the parenthood after dropping the lock and check
- * that the sequence number still matches.
- */
-static struct dentry *try_to_ascend(struct dentry *old, unsigned seq)
-{
- struct dentry *new = old->d_parent;
-
- rcu_read_lock();
- spin_unlock(&old->d_lock);
- spin_lock(&new->d_lock);
-
- /*
- * might go back up the wrong parent if we have had a rename
- * or deletion
- */
- if (new != old->d_parent ||
- (old->d_flags & DCACHE_DENTRY_KILLED) ||
- need_seqretry(&rename_lock, seq)) {
- spin_unlock(&new->d_lock);
- new = NULL;
- }
- rcu_read_unlock();
- return new;
-}
-
/**
* enum d_walk_ret - action to talke during tree walk
* @D_WALK_CONTINUE: contrinue walk
@@ -1300,9 +1122,24 @@ resume:
*/
if (this_parent != parent) {
struct dentry *child = this_parent;
- this_parent = try_to_ascend(this_parent, seq);
- if (!this_parent)
+ this_parent = child->d_parent;
+
+ rcu_read_lock();
+ spin_unlock(&child->d_lock);
+ spin_lock(&this_parent->d_lock);
+
+ /*
+ * might go back up the wrong parent if we have had a rename
+ * or deletion
+ */
+ if (this_parent != child->d_parent ||
+ (child->d_flags & DCACHE_DENTRY_KILLED) ||
+ need_seqretry(&rename_lock, seq)) {
+ spin_unlock(&this_parent->d_lock);
+ rcu_read_unlock();
goto rename_retry;
+ }
+ rcu_read_unlock();
next = child->d_u.d_child.next;
goto resume;
}
@@ -1331,14 +1168,6 @@ rename_retry:
* list is non-empty and continue searching.
*/
-/**
- * have_submounts - check for mounts over a dentry
- * @parent: dentry to check.
- *
- * Return true if the parent or its subdirectories contain
- * a mount point
- */
-
static enum d_walk_ret check_mount(void *data, struct dentry *dentry)
{
int *ret = data;
@@ -1349,6 +1178,13 @@ static enum d_walk_ret check_mount(void *data, struct dentry *dentry)
return D_WALK_CONTINUE;
}
+/**
+ * have_submounts - check for mounts over a dentry
+ * @parent: dentry to check.
+ *
+ * Return true if the parent or its subdirectories contain
+ * a mount point
+ */
int have_submounts(struct dentry *parent)
{
int ret = 0;
@@ -1421,34 +1257,23 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
if (data->start == dentry)
goto out;
- /*
- * move only zero ref count dentries to the dispose list.
- *
- * Those which are presently on the shrink list, being processed
- * by shrink_dentry_list(), shouldn't be moved. Otherwise the
- * loop in shrink_dcache_parent() might not make any progress
- * and loop forever.
- */
- if (dentry->d_lockref.count) {
- dentry_lru_del(dentry);
- } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
- /*
- * We can't use d_lru_shrink_move() because we
- * need to get the global LRU lock and do the
- * LRU accounting.
- */
- d_lru_del(dentry);
- d_shrink_add(dentry, &data->dispose);
+ if (dentry->d_flags & DCACHE_SHRINK_LIST) {
data->found++;
- ret = D_WALK_NORETRY;
+ } else {
+ if (dentry->d_flags & DCACHE_LRU_LIST)
+ d_lru_del(dentry);
+ if (!dentry->d_lockref.count) {
+ d_shrink_add(dentry, &data->dispose);
+ data->found++;
+ }
}
/*
* We can return to the caller if we have found some (this
* ensures forward progress). We'll be coming back to find
* the rest.
*/
- if (data->found && need_resched())
- ret = D_WALK_QUIT;
+ if (!list_empty(&data->dispose))
+ ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
out:
return ret;
}
@@ -1478,6 +1303,56 @@ void shrink_dcache_parent(struct dentry *parent)
}
EXPORT_SYMBOL(shrink_dcache_parent);
+static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
+{
+ /* it has busy descendents; complain about those instead */
+ if (!list_empty(&dentry->d_subdirs))
+ return D_WALK_CONTINUE;
+
+ /* root with refcount 1 is fine */
+ if (dentry == _data && dentry->d_lockref.count == 1)
+ return D_WALK_CONTINUE;
+
+ printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} "
+ " still in use (%d) [unmount of %s %s]\n",
+ dentry,
+ dentry->d_inode ?
+ dentry->d_inode->i_ino : 0UL,
+ dentry,
+ dentry->d_lockref.count,
+ dentry->d_sb->s_type->name,
+ dentry->d_sb->s_id);
+ WARN_ON(1);
+ return D_WALK_CONTINUE;
+}
+
+static void do_one_tree(struct dentry *dentry)
+{
+ shrink_dcache_parent(dentry);
+ d_walk(dentry, dentry, umount_check, NULL);
+ d_drop(dentry);
+ dput(dentry);
+}
+
+/*
+ * destroy the dentries attached to a superblock on unmounting
+ */
+void shrink_dcache_for_umount(struct super_block *sb)
+{
+ struct dentry *dentry;
+
+ WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked");
+
+ dentry = sb->s_root;
+ sb->s_root = NULL;
+ do_one_tree(dentry);
+
+ while (!hlist_bl_empty(&sb->s_anon)) {
+ dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash));
+ do_one_tree(dentry);
+ }
+}
+
static enum d_walk_ret check_and_collect(void *_data, struct dentry *dentry)
{
struct select_data *data = _data;
@@ -1638,12 +1513,17 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
}
EXPORT_SYMBOL(d_alloc);
+/**
+ * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems)
+ * @sb: the superblock
+ * @name: qstr of the name
+ *
+ * For a filesystem that just pins its dentries in memory and never
+ * performs lookups at all, return an unhashed IS_ROOT dentry.
+ */
struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
{
- struct dentry *dentry = __d_alloc(sb, name);
- if (dentry)
- dentry->d_flags |= DCACHE_DISCONNECTED;
- return dentry;
+ return __d_alloc(sb, name);
}
EXPORT_SYMBOL(d_alloc_pseudo);
@@ -1685,14 +1565,41 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
}
EXPORT_SYMBOL(d_set_d_op);
+static unsigned d_flags_for_inode(struct inode *inode)
+{
+ unsigned add_flags = DCACHE_FILE_TYPE;
+
+ if (!inode)
+ return DCACHE_MISS_TYPE;
+
+ if (S_ISDIR(inode->i_mode)) {
+ add_flags = DCACHE_DIRECTORY_TYPE;
+ if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) {
+ if (unlikely(!inode->i_op->lookup))
+ add_flags = DCACHE_AUTODIR_TYPE;
+ else
+ inode->i_opflags |= IOP_LOOKUP;
+ }
+ } else if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
+ if (unlikely(inode->i_op->follow_link))
+ add_flags = DCACHE_SYMLINK_TYPE;
+ else
+ inode->i_opflags |= IOP_NOFOLLOW;
+ }
+
+ if (unlikely(IS_AUTOMOUNT(inode)))
+ add_flags |= DCACHE_NEED_AUTOMOUNT;
+ return add_flags;
+}
+
static void __d_instantiate(struct dentry *dentry, struct inode *inode)
{
+ unsigned add_flags = d_flags_for_inode(inode);
+
spin_lock(&dentry->d_lock);
- if (inode) {
- if (unlikely(IS_AUTOMOUNT(inode)))
- dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
+ __d_set_type(dentry, add_flags);
+ if (inode)
hlist_add_head(&dentry->d_alias, &inode->i_dentry);
- }
dentry->d_inode = inode;
dentry_rcuwalk_barrier(dentry);
spin_unlock(&dentry->d_lock);
@@ -1801,6 +1708,33 @@ struct dentry *d_instantiate_unique(struct dentry *entry, struct inode *inode)
EXPORT_SYMBOL(d_instantiate_unique);
+/**
+ * d_instantiate_no_diralias - instantiate a non-aliased dentry
+ * @entry: dentry to complete
+ * @inode: inode to attach to this dentry
+ *
+ * Fill in inode information in the entry. If a directory alias is found, then
+ * return an error (and drop inode). Together with d_materialise_unique() this
+ * guarantees that a directory inode may never have more than one alias.
+ */
+int d_instantiate_no_diralias(struct dentry *entry, struct inode *inode)
+{
+ BUG_ON(!hlist_unhashed(&entry->d_alias));
+
+ spin_lock(&inode->i_lock);
+ if (S_ISDIR(inode->i_mode) && !hlist_empty(&inode->i_dentry)) {
+ spin_unlock(&inode->i_lock);
+ iput(inode);
+ return -EBUSY;
+ }
+ __d_instantiate(entry, inode);
+ spin_unlock(&inode->i_lock);
+ security_d_instantiate(entry, inode);
+
+ return 0;
+}
+EXPORT_SYMBOL(d_instantiate_no_diralias);
+
struct dentry *d_make_root(struct inode *root_inode)
{
struct dentry *res = NULL;
@@ -1870,6 +1804,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
static const struct qstr anonstring = QSTR_INIT("/", 1);
struct dentry *tmp;
struct dentry *res;
+ unsigned add_flags;
if (!inode)
return ERR_PTR(-ESTALE);
@@ -1895,9 +1830,11 @@ struct dentry *d_obtain_alias(struct inode *inode)
}
/* attach a disconnected dentry */
+ add_flags = d_flags_for_inode(inode) | DCACHE_DISCONNECTED;
+
spin_lock(&tmp->d_lock);
tmp->d_inode = inode;
- tmp->d_flags |= DCACHE_DISCONNECTED;
+ tmp->d_flags |= add_flags;
hlist_add_head(&tmp->d_alias, &inode->i_dentry);
hlist_bl_lock(&tmp->d_sb->s_anon);
hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon);
@@ -2495,12 +2432,14 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
dentry->d_name.name = dentry->d_iname;
} else {
/*
- * Both are internal. Just copy target to dentry
+ * Both are internal.
*/
- memcpy(dentry->d_iname, target->d_name.name,
- target->d_name.len + 1);
- dentry->d_name.len = target->d_name.len;
- return;
+ unsigned int i;
+ BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
+ for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
+ swap(((long *) &dentry->d_iname)[i],
+ ((long *) &target->d_iname)[i]);
+ }
}
}
swap(dentry->d_name.len, target->d_name.len);
@@ -2557,13 +2496,15 @@ static void dentry_unlock_parents_for_move(struct dentry *dentry,
* __d_move - move a dentry
* @dentry: entry to move
* @target: new dentry
+ * @exchange: exchange the two dentries
*
* Update the dcache to reflect the move of a file name. Negative
* dcache entries should not be moved in this way. Caller must hold
* rename_lock, the i_mutex of the source and target directories,
* and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
*/
-static void __d_move(struct dentry * dentry, struct dentry * target)
+static void __d_move(struct dentry *dentry, struct dentry *target,
+ bool exchange)
{
if (!dentry->d_inode)
printk(KERN_WARNING "VFS: moving negative dcache entry\n");
@@ -2574,7 +2515,7 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
dentry_lock_for_move(dentry, target);
write_seqcount_begin(&dentry->d_seq);
- write_seqcount_begin(&target->d_seq);
+ write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);
/* __d_drop does write_seqcount_barrier, but they're OK to nest. */
@@ -2585,8 +2526,15 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
__d_drop(dentry);
__d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash));
- /* Unhash the target: dput() will then get rid of it */
+ /*
+ * Unhash the target (d_delete() is not usable here). If exchanging
+ * the two dentries, then rehash onto the other's hash queue.
+ */
__d_drop(target);
+ if (exchange) {
+ __d_rehash(target,
+ d_hash(dentry->d_parent, dentry->d_name.hash));
+ }
list_del(&dentry->d_u.d_child);
list_del(&target->d_u.d_child);
@@ -2613,6 +2561,8 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
write_seqcount_end(&dentry->d_seq);
dentry_unlock_parents_for_move(dentry, target);
+ if (exchange)
+ fsnotify_d_move(target);
spin_unlock(&target->d_lock);
fsnotify_d_move(dentry);
spin_unlock(&dentry->d_lock);
@@ -2630,11 +2580,30 @@ static void __d_move(struct dentry * dentry, struct dentry * target)
void d_move(struct dentry *dentry, struct dentry *target)
{
write_seqlock(&rename_lock);
- __d_move(dentry, target);
+ __d_move(dentry, target, false);
write_sequnlock(&rename_lock);
}
EXPORT_SYMBOL(d_move);
+/*
+ * d_exchange - exchange two dentries
+ * @dentry1: first dentry
+ * @dentry2: second dentry
+ */
+void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
+{
+ write_seqlock(&rename_lock);
+
+ WARN_ON(!dentry1->d_inode);
+ WARN_ON(!dentry2->d_inode);
+ WARN_ON(IS_ROOT(dentry1));
+ WARN_ON(IS_ROOT(dentry2));
+
+ __d_move(dentry1, dentry2, true);
+
+ write_sequnlock(&rename_lock);
+}
+
/**
* d_ancestor - search for an ancestor
* @p1: ancestor dentry
@@ -2682,7 +2651,7 @@ static struct dentry *__d_unalias(struct inode *inode,
m2 = &alias->d_parent->d_inode->i_mutex;
out_unalias:
if (likely(!d_mountpoint(alias))) {
- __d_move(alias, dentry);
+ __d_move(alias, dentry, false);
ret = alias;
}
out_err:
@@ -2706,7 +2675,7 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
dentry_lock_for_move(anon, dentry);
write_seqcount_begin(&dentry->d_seq);
- write_seqcount_begin(&anon->d_seq);
+ write_seqcount_begin_nested(&anon->d_seq, DENTRY_D_LOCK_NESTED);
dparent = dentry->d_parent;
@@ -2725,7 +2694,6 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
spin_unlock(&dentry->d_lock);
/* anon->d_lock still locked, returns locked */
- anon->d_flags &= ~DCACHE_DISCONNECTED;
}
/**
@@ -2846,9 +2814,9 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
u32 dlen = ACCESS_ONCE(name->len);
char *p;
- if (*buflen < dlen + 1)
- return -ENAMETOOLONG;
*buflen -= dlen + 1;
+ if (*buflen < 0)
+ return -ENAMETOOLONG;
p = *buffer -= dlen + 1;
*p++ = '/';
while (dlen--) {
@@ -2881,27 +2849,36 @@ static int prepend_path(const struct path *path,
const struct path *root,
char **buffer, int *buflen)
{
- struct dentry *dentry = path->dentry;
- struct vfsmount *vfsmnt = path->mnt;
- struct mount *mnt = real_mount(vfsmnt);
+ struct dentry *dentry;
+ struct vfsmount *vfsmnt;
+ struct mount *mnt;
int error = 0;
- unsigned seq = 0;
+ unsigned seq, m_seq = 0;
char *bptr;
int blen;
rcu_read_lock();
+restart_mnt:
+ read_seqbegin_or_lock(&mount_lock, &m_seq);
+ seq = 0;
+ rcu_read_lock();
restart:
bptr = *buffer;
blen = *buflen;
+ error = 0;
+ dentry = path->dentry;
+ vfsmnt = path->mnt;
+ mnt = real_mount(vfsmnt);
read_seqbegin_or_lock(&rename_lock, &seq);
while (dentry != root->dentry || vfsmnt != root->mnt) {
struct dentry * parent;
if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
+ struct mount *parent = ACCESS_ONCE(mnt->mnt_parent);
/* Global root? */
- if (mnt_has_parent(mnt)) {
- dentry = mnt->mnt_mountpoint;
- mnt = mnt->mnt_parent;
+ if (mnt != parent) {
+ dentry = ACCESS_ONCE(mnt->mnt_mountpoint);
+ mnt = parent;
vfsmnt = &mnt->mnt;
continue;
}
@@ -2936,6 +2913,14 @@ restart:
}
done_seqretry(&rename_lock, seq);
+ if (!(m_seq & 1))
+ rcu_read_unlock();
+ if (need_seqretry(&mount_lock, m_seq)) {
+ m_seq = 1;
+ goto restart_mnt;
+ }
+ done_seqretry(&mount_lock, m_seq);
+
if (error >= 0 && bptr == *buffer) {
if (--blen < 0)
error = -ENAMETOOLONG;
@@ -2971,9 +2956,7 @@ char *__d_path(const struct path *path,
int error;
prepend(&res, &buflen, "\0", 1);
- br_read_lock(&vfsmount_lock);
error = prepend_path(path, root, &res, &buflen);
- br_read_unlock(&vfsmount_lock);
if (error < 0)
return ERR_PTR(error);
@@ -2990,9 +2973,7 @@ char *d_absolute_path(const struct path *path,
int error;
prepend(&res, &buflen, "\0", 1);
- br_read_lock(&vfsmount_lock);
error = prepend_path(path, &root, &res, &buflen);
- br_read_unlock(&vfsmount_lock);
if (error > 1)
error = -EINVAL;
@@ -3061,15 +3042,18 @@ char *d_path(const struct path *path, char *buf, int buflen)
* thus don't need to be hashed. They also don't need a name until a
* user wants to identify the object in /proc/pid/fd/. The little hack
* below allows us to generate a name for these objects on demand:
+ *
+ * Some pseudo inodes are mountable. When they are mounted
+ * path->dentry == path->mnt->mnt_root. In that case don't call d_dname
+ * and instead have d_path return the mounted path.
*/
- if (path->dentry->d_op && path->dentry->d_op->d_dname)
+ if (path->dentry->d_op && path->dentry->d_op->d_dname &&
+ (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root))
return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
rcu_read_lock();
get_fs_root_rcu(current->fs, &root);
- br_read_lock(&vfsmount_lock);
error = path_with_deleted(path, &root, &res, &buflen);
- br_read_unlock(&vfsmount_lock);
rcu_read_unlock();
if (error < 0)
@@ -3109,30 +3093,33 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
end = ERR_PTR(-ENAMETOOLONG);
return end;
}
+EXPORT_SYMBOL(simple_dname);
/*
* Write full pathname from the root of the filesystem into the buffer.
*/
-static char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
+static char *__dentry_path(struct dentry *d, char *buf, int buflen)
{
+ struct dentry *dentry;
char *end, *retval;
int len, seq = 0;
int error = 0;
+ if (buflen < 2)
+ goto Elong;
+
rcu_read_lock();
restart:
+ dentry = d;
end = buf + buflen;
len = buflen;
prepend(&end, &len, "\0", 1);
- if (buflen < 1)
- goto Elong;
/* Get '/' right */
retval = end-1;
*retval = '/';
read_seqbegin_or_lock(&rename_lock, &seq);
while (!IS_ROOT(dentry)) {
struct dentry *parent = dentry->d_parent;
- int error;
prefetch(parent);
error = prepend_name(&end, &len, &dentry->d_name);
@@ -3224,7 +3211,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
get_fs_root_and_pwd_rcu(current->fs, &root, &pwd);
error = -ENOENT;
- br_read_lock(&vfsmount_lock);
if (!d_unlinked(pwd.dentry)) {
unsigned long len;
char *cwd = page + PATH_MAX;
@@ -3232,7 +3218,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
prepend(&cwd, &buflen, "\0", 1);
error = prepend_path(&pwd, &root, &cwd, &buflen);
- br_read_unlock(&vfsmount_lock);
rcu_read_unlock();
if (error < 0)
@@ -3253,7 +3238,6 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
error = -EFAULT;
}
} else {
- br_read_unlock(&vfsmount_lock);
rcu_read_unlock();
}
diff --git a/fs/dcookies.c b/fs/dcookies.c
index ab5954b5026..ac44a69fbea 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -204,7 +204,7 @@ out:
}
#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, size_t, len)
+COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, compat_size_t, len)
{
#ifdef __BIG_ENDIAN
return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index c7c83ff0f75..8c41b52da35 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -218,6 +218,7 @@ static int debugfs_remount(struct super_block *sb, int *flags, char *data)
int err;
struct debugfs_fs_info *fsi = sb->s_fs_info;
+ sync_filesystem(sb);
err = debugfs_parse_options(data, &fsi->mount_opts);
if (err)
goto fail;
@@ -358,7 +359,7 @@ exit:
* @name: a pointer to a string containing the name of the file to create.
* @mode: the permission that the file should have.
* @parent: a pointer to the parent dentry for this file. This should be a
- * directory dentry if set. If this paramater is NULL, then the
+ * directory dentry if set. If this parameter is NULL, then the
* file will be created in the root of the debugfs filesystem.
* @data: a pointer to something that the caller will want to get to later
* on. The inode.i_private pointer will point to this value on
@@ -400,7 +401,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file);
* @name: a pointer to a string containing the name of the directory to
* create.
* @parent: a pointer to the parent dentry for this file. This should be a
- * directory dentry if set. If this paramater is NULL, then the
+ * directory dentry if set. If this parameter is NULL, then the
* directory will be created in the root of the debugfs filesystem.
*
* This function creates a directory in debugfs with the given name.
@@ -425,7 +426,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir);
* @name: a pointer to a string containing the name of the symbolic link to
* create.
* @parent: a pointer to the parent dentry for this symbolic link. This
- * should be a directory dentry if set. If this paramater is NULL,
+ * should be a directory dentry if set. If this parameter is NULL,
* then the symbolic link will be created in the root of the debugfs
* filesystem.
* @target: a pointer to a string containing the path to the target of the
@@ -566,8 +567,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
mutex_lock(&parent->d_inode->i_mutex);
if (child != dentry) {
- next = list_entry(child->d_u.d_child.next, struct dentry,
- d_u.d_child);
+ next = list_next_entry(child, d_u.d_child);
goto up;
}
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 073d30b9d1a..cfe8466f7fe 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -10,6 +10,8 @@
*
* ------------------------------------------------------------------------- */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
@@ -148,10 +150,10 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode)
/*
* parse_mount_options():
- * Set @opts to mount options specified in @data. If an option is not
- * specified in @data, set it to its default value. The exception is
- * 'newinstance' option which can only be set/cleared on a mount (i.e.
- * cannot be changed during remount).
+ * Set @opts to mount options specified in @data. If an option is not
+ * specified in @data, set it to its default value. The exception is
+ * 'newinstance' option which can only be set/cleared on a mount (i.e.
+ * cannot be changed during remount).
*
* Note: @data may be NULL (in which case all options are set to default).
*/
@@ -225,7 +227,7 @@ static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
break;
#endif
default:
- printk(KERN_ERR "devpts: called with bogus options\n");
+ pr_err("called with bogus options\n");
return -EINVAL;
}
}
@@ -261,7 +263,7 @@ static int mknod_ptmx(struct super_block *sb)
dentry = d_alloc_name(root, "ptmx");
if (!dentry) {
- printk(KERN_NOTICE "Unable to alloc dentry for ptmx node\n");
+ pr_err("Unable to alloc dentry for ptmx node\n");
goto out;
}
@@ -270,7 +272,7 @@ static int mknod_ptmx(struct super_block *sb)
*/
inode = new_inode(sb);
if (!inode) {
- printk(KERN_ERR "Unable to alloc inode for ptmx node\n");
+ pr_err("Unable to alloc inode for ptmx node\n");
dput(dentry);
goto out;
}
@@ -303,7 +305,7 @@ static void update_ptmx_mode(struct pts_fs_info *fsi)
#else
static inline void update_ptmx_mode(struct pts_fs_info *fsi)
{
- return;
+ return;
}
#endif
@@ -313,6 +315,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
struct pts_fs_info *fsi = DEVPTS_SB(sb);
struct pts_mount_opts *opts = &fsi->mount_opts;
+ sync_filesystem(sb);
err = parse_mount_options(data, PARSE_REMOUNT, opts);
/*
@@ -332,9 +335,11 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root)
struct pts_mount_opts *opts = &fsi->mount_opts;
if (opts->setuid)
- seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, opts->uid));
+ seq_printf(seq, ",uid=%u",
+ from_kuid_munged(&init_user_ns, opts->uid));
if (opts->setgid)
- seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, opts->gid));
+ seq_printf(seq, ",gid=%u",
+ from_kgid_munged(&init_user_ns, opts->gid));
seq_printf(seq, ",mode=%03o", opts->mode);
#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode);
@@ -395,7 +400,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
if (s->s_root)
return 0;
- printk(KERN_ERR "devpts: get root dentry failed\n");
+ pr_err("get root dentry failed\n");
fail:
return -ENOMEM;
@@ -498,6 +503,7 @@ static void devpts_kill_sb(struct super_block *sb)
{
struct pts_fs_info *fsi = DEVPTS_SB(sb);
+ ida_destroy(&fsi->allocated_ptys);
kfree(fsi);
kill_litter_super(sb);
}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 0e04142d596..17e39b047de 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -71,13 +71,11 @@ struct dio_submit {
been performed at the start of a
write */
int pages_in_io; /* approximate total IO pages */
- size_t size; /* total request size (doesn't change)*/
sector_t block_in_file; /* Current offset into the underlying
file in dio_block units. */
unsigned blocks_available; /* At block_in_file. changes */
int reap_counter; /* rate limit reaping */
sector_t final_block_in_request;/* doesn't change */
- unsigned first_block_in_page; /* doesn't change, Used only once */
int boundary; /* prev block is at a boundary */
get_block_t *get_block; /* block mapping function */
dio_submit_t *submit_io; /* IO submition function */
@@ -98,19 +96,14 @@ struct dio_submit {
sector_t cur_page_block; /* Where it starts */
loff_t cur_page_fs_offset; /* Offset in file */
- /*
- * Page fetching state. These variables belong to dio_refill_pages().
- */
- int curr_page; /* changes */
- int total_pages; /* doesn't change */
- unsigned long curr_user_address;/* changes */
-
+ struct iov_iter *iter;
/*
* Page queue. These variables belong to dio_refill_pages() and
* dio_get_page().
*/
unsigned head; /* next page to process */
unsigned tail; /* last valid page + 1 */
+ size_t from, to;
};
/* dio_state communicated between submission path and end_io */
@@ -163,15 +156,10 @@ static inline unsigned dio_pages_present(struct dio_submit *sdio)
*/
static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
{
- int ret;
- int nr_pages;
+ ssize_t ret;
- nr_pages = min(sdio->total_pages - sdio->curr_page, DIO_PAGES);
- ret = get_user_pages_fast(
- sdio->curr_user_address, /* Where from? */
- nr_pages, /* How many pages? */
- dio->rw == READ, /* Write to memory? */
- &dio->pages[0]); /* Put results here */
+ ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES * PAGE_SIZE,
+ &sdio->from);
if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
struct page *page = ZERO_PAGE(0);
@@ -186,18 +174,19 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
dio->pages[0] = page;
sdio->head = 0;
sdio->tail = 1;
- ret = 0;
- goto out;
+ sdio->from = 0;
+ sdio->to = PAGE_SIZE;
+ return 0;
}
if (ret >= 0) {
- sdio->curr_user_address += ret * PAGE_SIZE;
- sdio->curr_page += ret;
+ iov_iter_advance(sdio->iter, ret);
+ ret += sdio->from;
sdio->head = 0;
- sdio->tail = ret;
- ret = 0;
+ sdio->tail = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
+ sdio->to = ((ret - 1) & (PAGE_SIZE - 1)) + 1;
+ return 0;
}
-out:
return ret;
}
@@ -208,7 +197,7 @@ out:
* L1 cache.
*/
static inline struct page *dio_get_page(struct dio *dio,
- struct dio_submit *sdio)
+ struct dio_submit *sdio)
{
if (dio_pages_present(sdio) == 0) {
int ret;
@@ -218,7 +207,7 @@ static inline struct page *dio_get_page(struct dio *dio,
return ERR_PTR(ret);
BUG_ON(dio_pages_present(sdio) == 0);
}
- return dio->pages[sdio->head++];
+ return dio->pages[sdio->head];
}
/**
@@ -375,7 +364,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
bio = bio_alloc(GFP_KERNEL, nr_vecs);
bio->bi_bdev = bdev;
- bio->bi_sector = first_sector;
+ bio->bi_iter.bi_sector = first_sector;
if (dio->is_async)
bio->bi_end_io = dio_bio_end_aio;
else
@@ -422,8 +411,8 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
*/
static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio)
{
- while (dio_pages_present(sdio))
- page_cache_release(dio_get_page(dio, sdio));
+ while (sdio->head < sdio->tail)
+ page_cache_release(dio->pages[sdio->head++]);
}
/*
@@ -664,7 +653,6 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
goto out;
sector = start_sector << (sdio->blkbits - 9);
nr_pages = min(sdio->pages_in_io, bio_get_nr_vecs(map_bh->b_bdev));
- nr_pages = min(nr_pages, BIO_MAX_PAGES);
BUG_ON(nr_pages <= 0);
dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
sdio->boundary = 0;
@@ -719,7 +707,7 @@ static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
if (sdio->bio) {
loff_t cur_offset = sdio->cur_page_fs_offset;
loff_t bio_next_offset = sdio->logical_offset_in_bio +
- sdio->bio->bi_size;
+ sdio->bio->bi_iter.bi_size;
/*
* See whether this new request is contiguous with the old.
@@ -913,23 +901,22 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
struct buffer_head *map_bh)
{
const unsigned blkbits = sdio->blkbits;
- const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
- struct page *page;
- unsigned block_in_page;
int ret = 0;
- /* The I/O can start at any block offset within the first page */
- block_in_page = sdio->first_block_in_page;
-
while (sdio->block_in_file < sdio->final_block_in_request) {
+ struct page *page;
+ size_t from, to;
+
page = dio_get_page(dio, sdio);
if (IS_ERR(page)) {
ret = PTR_ERR(page);
goto out;
}
+ from = sdio->head ? 0 : sdio->from;
+ to = (sdio->head == sdio->tail - 1) ? sdio->to : PAGE_SIZE;
+ sdio->head++;
- while (block_in_page < blocks_per_page) {
- unsigned offset_in_page = block_in_page << blkbits;
+ while (from < to) {
unsigned this_chunk_bytes; /* # of bytes mapped */
unsigned this_chunk_blocks; /* # of blocks */
unsigned u;
@@ -1000,10 +987,10 @@ do_holes:
page_cache_release(page);
goto out;
}
- zero_user(page, block_in_page << blkbits,
- 1 << blkbits);
+ zero_user(page, from, 1 << blkbits);
sdio->block_in_file++;
- block_in_page++;
+ from += 1 << blkbits;
+ dio->result += 1 << blkbits;
goto next_block;
}
@@ -1020,7 +1007,7 @@ do_holes:
* can add to this page
*/
this_chunk_blocks = sdio->blocks_available;
- u = (PAGE_SIZE - offset_in_page) >> blkbits;
+ u = (to - from) >> blkbits;
if (this_chunk_blocks > u)
this_chunk_blocks = u;
u = sdio->final_block_in_request - sdio->block_in_file;
@@ -1032,7 +1019,7 @@ do_holes:
if (this_chunk_blocks == sdio->blocks_available)
sdio->boundary = buffer_boundary(map_bh);
ret = submit_page_section(dio, sdio, page,
- offset_in_page,
+ from,
this_chunk_bytes,
sdio->next_block_for_io,
map_bh);
@@ -1043,7 +1030,8 @@ do_holes:
sdio->next_block_for_io += this_chunk_blocks;
sdio->block_in_file += this_chunk_blocks;
- block_in_page += this_chunk_blocks;
+ from += this_chunk_bytes;
+ dio->result += this_chunk_bytes;
sdio->blocks_available -= this_chunk_blocks;
next_block:
BUG_ON(sdio->block_in_file > sdio->final_block_in_request);
@@ -1053,7 +1041,6 @@ next_block:
/* Drop the ref which was taken in get_user_pages() */
page_cache_release(page);
- block_in_page = 0;
}
out:
return ret;
@@ -1108,24 +1095,21 @@ static inline int drop_refcount(struct dio *dio)
*/
static inline ssize_t
do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
- struct block_device *bdev, const struct iovec *iov, loff_t offset,
- unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+ struct block_device *bdev, struct iov_iter *iter, loff_t offset,
+ get_block_t get_block, dio_iodone_t end_io,
dio_submit_t submit_io, int flags)
{
- int seg;
- size_t size;
- unsigned long addr;
unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
unsigned blkbits = i_blkbits;
unsigned blocksize_mask = (1 << blkbits) - 1;
ssize_t retval = -EINVAL;
- loff_t end = offset;
+ size_t count = iov_iter_count(iter);
+ loff_t end = offset + count;
struct dio *dio;
struct dio_submit sdio = { 0, };
- unsigned long user_addr;
- size_t bytes;
struct buffer_head map_bh = { 0, };
struct blk_plug plug;
+ unsigned long align = offset | iov_iter_alignment(iter);
if (rw & WRITE)
rw = WRITE_ODIRECT;
@@ -1135,32 +1119,16 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
* the early prefetch in the caller enough time.
*/
- if (offset & blocksize_mask) {
+ if (align & blocksize_mask) {
if (bdev)
blkbits = blksize_bits(bdev_logical_block_size(bdev));
blocksize_mask = (1 << blkbits) - 1;
- if (offset & blocksize_mask)
+ if (align & blocksize_mask)
goto out;
}
- /* Check the memory alignment. Blocks cannot straddle pages */
- for (seg = 0; seg < nr_segs; seg++) {
- addr = (unsigned long)iov[seg].iov_base;
- size = iov[seg].iov_len;
- end += size;
- if (unlikely((addr & blocksize_mask) ||
- (size & blocksize_mask))) {
- if (bdev)
- blkbits = blksize_bits(
- bdev_logical_block_size(bdev));
- blocksize_mask = (1 << blkbits) - 1;
- if ((addr & blocksize_mask) || (size & blocksize_mask))
- goto out;
- }
- }
-
/* watch out for a 0 len io from a tricksy fs */
- if (rw == READ && end == offset)
+ if (rw == READ && !iov_iter_count(iter))
return 0;
dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
@@ -1194,13 +1162,19 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
}
/*
- * For file extending writes updating i_size before data
- * writeouts complete can expose uninitialized blocks. So
- * even for AIO, we need to wait for i/o to complete before
- * returning in this case.
+ * For file extending writes updating i_size before data writeouts
+ * complete can expose uninitialized blocks in dumb filesystems.
+ * In that case we need to wait for I/O completion even if asked
+ * for an asynchronous write.
*/
- dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
- (end > i_size_read(inode)));
+ if (is_sync_kiocb(iocb))
+ dio->is_async = false;
+ else if (!(dio->flags & DIO_ASYNC_EXTEND) &&
+ (rw & WRITE) && end > i_size_read(inode))
+ dio->is_async = false;
+ else
+ dio->is_async = true;
+
dio->inode = inode;
dio->rw = rw;
@@ -1244,6 +1218,10 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
spin_lock_init(&dio->bio_lock);
dio->refcount = 1;
+ sdio.iter = iter;
+ sdio.final_block_in_request =
+ (offset + iov_iter_count(iter)) >> blkbits;
+
/*
* In case of non-aligned buffers, we may need 2 more
* pages since we need to zero out first and last block.
@@ -1251,47 +1229,13 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
if (unlikely(sdio.blkfactor))
sdio.pages_in_io = 2;
- for (seg = 0; seg < nr_segs; seg++) {
- user_addr = (unsigned long)iov[seg].iov_base;
- sdio.pages_in_io +=
- ((user_addr + iov[seg].iov_len + PAGE_SIZE-1) /
- PAGE_SIZE - user_addr / PAGE_SIZE);
- }
+ sdio.pages_in_io += iov_iter_npages(iter, INT_MAX);
blk_start_plug(&plug);
- for (seg = 0; seg < nr_segs; seg++) {
- user_addr = (unsigned long)iov[seg].iov_base;
- sdio.size += bytes = iov[seg].iov_len;
-
- /* Index into the first page of the first block */
- sdio.first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
- sdio.final_block_in_request = sdio.block_in_file +
- (bytes >> blkbits);
- /* Page fetching state */
- sdio.head = 0;
- sdio.tail = 0;
- sdio.curr_page = 0;
-
- sdio.total_pages = 0;
- if (user_addr & (PAGE_SIZE-1)) {
- sdio.total_pages++;
- bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
- }
- sdio.total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
- sdio.curr_user_address = user_addr;
-
- retval = do_direct_IO(dio, &sdio, &map_bh);
-
- dio->result += iov[seg].iov_len -
- ((sdio.final_block_in_request - sdio.block_in_file) <<
- blkbits);
-
- if (retval) {
- dio_cleanup(dio, &sdio);
- break;
- }
- } /* end iovec loop */
+ retval = do_direct_IO(dio, &sdio, &map_bh);
+ if (retval)
+ dio_cleanup(dio, &sdio);
if (retval == -ENOTBLK) {
/*
@@ -1343,10 +1287,9 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
*/
BUG_ON(retval == -EIOCBQUEUED);
if (dio->is_async && retval == 0 && dio->result &&
- ((rw == READ) || (dio->result == sdio.size)))
+ (rw == READ || dio->result == count))
retval = -EIOCBQUEUED;
-
- if (retval != -EIOCBQUEUED)
+ else
dio_await_completion(dio);
if (drop_refcount(dio) == 0) {
@@ -1360,8 +1303,8 @@ out:
ssize_t
__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
- struct block_device *bdev, const struct iovec *iov, loff_t offset,
- unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+ struct block_device *bdev, struct iov_iter *iter, loff_t offset,
+ get_block_t get_block, dio_iodone_t end_io,
dio_submit_t submit_io, int flags)
{
/*
@@ -1376,9 +1319,8 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
prefetch(bdev->bd_queue);
prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
- return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
- nr_segs, get_block, end_io,
- submit_io, flags);
+ return do_blockdev_direct_IO(rw, iocb, inode, bdev, iter, offset,
+ get_block, end_io, submit_io, flags);
}
EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 0e90f0c91b9..dcea1e37a1b 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -14,6 +14,7 @@
#include "dlm_internal.h"
#include "lock.h"
#include "user.h"
+#include "ast.h"
static uint64_t dlm_cb_seq;
static DEFINE_SPINLOCK(dlm_cb_seq_spin);
@@ -308,6 +309,6 @@ void dlm_callback_resume(struct dlm_ls *ls)
mutex_unlock(&ls->ls_cb_mutex);
if (count)
- log_debug(ls, "dlm_callback_resume %d", count);
+ log_rinfo(ls, "dlm_callback_resume %d", count);
}
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 76feb4b60fa..d521bddf876 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -157,11 +157,13 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field,
const char *buf, size_t len)
{
unsigned int x;
+ int rc;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
-
- x = simple_strtoul(buf, NULL, 0);
+ rc = kstrtouint(buf, 0, &x);
+ if (rc)
+ return rc;
if (check_zero && !x)
return -EINVAL;
@@ -730,7 +732,10 @@ static ssize_t comm_nodeid_read(struct dlm_comm *cm, char *buf)
static ssize_t comm_nodeid_write(struct dlm_comm *cm, const char *buf,
size_t len)
{
- cm->nodeid = simple_strtol(buf, NULL, 0);
+ int rc = kstrtoint(buf, 0, &cm->nodeid);
+
+ if (rc)
+ return rc;
return len;
}
@@ -742,7 +747,10 @@ static ssize_t comm_local_read(struct dlm_comm *cm, char *buf)
static ssize_t comm_local_write(struct dlm_comm *cm, const char *buf,
size_t len)
{
- cm->local= simple_strtol(buf, NULL, 0);
+ int rc = kstrtoint(buf, 0, &cm->local);
+
+ if (rc)
+ return rc;
if (cm->local && !local_comm)
local_comm = cm;
return len;
@@ -846,7 +854,10 @@ static ssize_t node_nodeid_write(struct dlm_node *nd, const char *buf,
size_t len)
{
uint32_t seq = 0;
- nd->nodeid = simple_strtol(buf, NULL, 0);
+ int rc = kstrtoint(buf, 0, &nd->nodeid);
+
+ if (rc)
+ return rc;
dlm_comm_seq(nd->nodeid, &seq);
nd->comm_seq = seq;
return len;
@@ -860,7 +871,10 @@ static ssize_t node_weight_read(struct dlm_node *nd, char *buf)
static ssize_t node_weight_write(struct dlm_node *nd, const char *buf,
size_t len)
{
- nd->weight = simple_strtol(buf, NULL, 0);
+ int rc = kstrtoint(buf, 0, &nd->weight);
+
+ if (rc)
+ return rc;
return len;
}
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index b969deef9eb..8d77ba7b175 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -68,7 +68,7 @@ static int print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb,
if (lkb->lkb_wait_type)
seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
- return seq_printf(s, "\n");
+ return seq_puts(s, "\n");
}
static int print_format1(struct dlm_rsb *res, struct seq_file *s)
@@ -92,31 +92,31 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s)
}
if (res->res_nodeid > 0)
- rv = seq_printf(s, "\" \nLocal Copy, Master is node %d\n",
+ rv = seq_printf(s, "\"\nLocal Copy, Master is node %d\n",
res->res_nodeid);
else if (res->res_nodeid == 0)
- rv = seq_printf(s, "\" \nMaster Copy\n");
+ rv = seq_puts(s, "\"\nMaster Copy\n");
else if (res->res_nodeid == -1)
- rv = seq_printf(s, "\" \nLooking up master (lkid %x)\n",
+ rv = seq_printf(s, "\"\nLooking up master (lkid %x)\n",
res->res_first_lkid);
else
- rv = seq_printf(s, "\" \nInvalid master %d\n",
+ rv = seq_printf(s, "\"\nInvalid master %d\n",
res->res_nodeid);
if (rv)
goto out;
/* Print the LVB: */
if (res->res_lvbptr) {
- seq_printf(s, "LVB: ");
+ seq_puts(s, "LVB: ");
for (i = 0; i < lvblen; i++) {
if (i == lvblen / 2)
- seq_printf(s, "\n ");
+ seq_puts(s, "\n ");
seq_printf(s, "%02x ",
(unsigned char) res->res_lvbptr[i]);
}
if (rsb_flag(res, RSB_VALNOTVALID))
- seq_printf(s, " (INVALID)");
- rv = seq_printf(s, "\n");
+ seq_puts(s, " (INVALID)");
+ rv = seq_puts(s, "\n");
if (rv)
goto out;
}
@@ -133,21 +133,21 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s)
}
/* Print the locks attached to this resource */
- seq_printf(s, "Granted Queue\n");
+ seq_puts(s, "Granted Queue\n");
list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) {
rv = print_format1_lock(s, lkb, res);
if (rv)
goto out;
}
- seq_printf(s, "Conversion Queue\n");
+ seq_puts(s, "Conversion Queue\n");
list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) {
rv = print_format1_lock(s, lkb, res);
if (rv)
goto out;
}
- seq_printf(s, "Waiting Queue\n");
+ seq_puts(s, "Waiting Queue\n");
list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) {
rv = print_format1_lock(s, lkb, res);
if (rv)
@@ -157,13 +157,13 @@ static int print_format1(struct dlm_rsb *res, struct seq_file *s)
if (list_empty(&res->res_lookup))
goto out;
- seq_printf(s, "Lookup Queue\n");
+ seq_puts(s, "Lookup Queue\n");
list_for_each_entry(lkb, &res->res_lookup, lkb_rsb_lookup) {
rv = seq_printf(s, "%08x %s", lkb->lkb_id,
print_lockmode(lkb->lkb_rqmode));
if (lkb->lkb_wait_type)
seq_printf(s, " wait_type: %d", lkb->lkb_wait_type);
- rv = seq_printf(s, "\n");
+ rv = seq_puts(s, "\n");
}
out:
unlock_rsb(res);
@@ -300,7 +300,7 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s)
else
seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
}
- rv = seq_printf(s, "\n");
+ rv = seq_puts(s, "\n");
if (rv)
goto out;
@@ -311,7 +311,7 @@ static int print_format3(struct dlm_rsb *r, struct seq_file *s)
for (i = 0; i < lvblen; i++)
seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]);
- rv = seq_printf(s, "\n");
+ rv = seq_puts(s, "\n");
if (rv)
goto out;
@@ -377,7 +377,7 @@ static int print_format4(struct dlm_rsb *r, struct seq_file *s)
else
seq_printf(s, " %02x", (unsigned char)r->res_name[i]);
}
- rv = seq_printf(s, "\n");
+ rv = seq_puts(s, "\n");
out:
unlock_rsb(r);
return rv;
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 278a75cda44..d975851a7e1 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -68,7 +68,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
uint16_t namelen;
unsigned int count = 0, count_match = 0, count_bad = 0, count_add = 0;
- log_debug(ls, "dlm_recover_directory");
+ log_rinfo(ls, "dlm_recover_directory");
if (dlm_no_directory(ls))
goto out_status;
@@ -189,7 +189,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
error = 0;
dlm_set_recover_status(ls, DLM_RS_DIR);
- log_debug(ls, "dlm_recover_directory %u in %u new",
+ log_rinfo(ls, "dlm_recover_directory %u in %u new",
count, count_add);
out_free:
kfree(last_name);
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index e7665c31f7b..5eff6ea3e27 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -65,6 +65,8 @@ struct dlm_mhandle;
printk(KERN_ERR "dlm: "fmt"\n" , ##args)
#define log_error(ls, fmt, args...) \
printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
+#define log_rinfo(ls, fmt, args...) \
+ printk(KERN_INFO "dlm: %s: " fmt "\n", (ls)->ls_name , ##args);
#define log_debug(ls, fmt, args...) \
do { \
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index e223a911a83..83f3d552030 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -687,6 +687,7 @@ static int find_rsb_dir(struct dlm_ls *ls, char *name, int len,
log_error(ls, "find_rsb new from_other %d dir %d our %d %s",
from_nodeid, dir_nodeid, our_nodeid, r->res_name);
dlm_free_rsb(r);
+ r = NULL;
error = -ENOTBLK;
goto out_unlock;
}
@@ -5462,7 +5463,7 @@ void dlm_recover_purge(struct dlm_ls *ls)
up_write(&ls->ls_root_sem);
if (lkb_count)
- log_debug(ls, "dlm_recover_purge %u locks for %u nodes",
+ log_rinfo(ls, "dlm_recover_purge %u locks for %u nodes",
lkb_count, nodes_count);
}
@@ -5536,7 +5537,7 @@ void dlm_recover_grant(struct dlm_ls *ls)
}
if (lkb_count)
- log_debug(ls, "dlm_recover_grant %u locks on %u resources",
+ log_rinfo(ls, "dlm_recover_grant %u locks on %u resources",
lkb_count, rsb_count);
}
@@ -5695,7 +5696,7 @@ int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
put_rsb(r);
out:
if (error && error != -EEXIST)
- log_debug(ls, "dlm_recover_master_copy remote %d %x error %d",
+ log_rinfo(ls, "dlm_recover_master_copy remote %d %x error %d",
from_nodeid, remid, error);
rl->rl_result = cpu_to_le32(error);
return error;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 88556dc0458..f3e72787e7f 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -35,8 +35,11 @@ static struct task_struct * scand_task;
static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
{
ssize_t ret = len;
- int n = simple_strtol(buf, NULL, 0);
+ int n;
+ int rc = kstrtoint(buf, 0, &n);
+ if (rc)
+ return rc;
ls = dlm_find_lockspace_local(ls->ls_local_handle);
if (!ls)
return -EINVAL;
@@ -57,7 +60,10 @@ static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len)
static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len)
{
- ls->ls_uevent_result = simple_strtol(buf, NULL, 0);
+ int rc = kstrtoint(buf, 0, &ls->ls_uevent_result);
+
+ if (rc)
+ return rc;
set_bit(LSFL_UEVENT_WAIT, &ls->ls_flags);
wake_up(&ls->ls_uevent_wait);
return len;
@@ -70,7 +76,10 @@ static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf)
static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len)
{
- ls->ls_global_id = simple_strtoul(buf, NULL, 0);
+ int rc = kstrtouint(buf, 0, &ls->ls_global_id);
+
+ if (rc)
+ return rc;
return len;
}
@@ -81,7 +90,11 @@ static ssize_t dlm_nodir_show(struct dlm_ls *ls, char *buf)
static ssize_t dlm_nodir_store(struct dlm_ls *ls, const char *buf, size_t len)
{
- int val = simple_strtoul(buf, NULL, 0);
+ int val;
+ int rc = kstrtoint(buf, 0, &val);
+
+ if (rc)
+ return rc;
if (val == 1)
set_bit(LSFL_NODIR, &ls->ls_flags);
return len;
@@ -190,7 +203,7 @@ static int do_uevent(struct dlm_ls *ls, int in)
else
kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE);
- log_debug(ls, "%s the lockspace group...", in ? "joining" : "leaving");
+ log_rinfo(ls, "%s the lockspace group...", in ? "joining" : "leaving");
/* dlm_controld will see the uevent, do the necessary group management
and then write to sysfs to wake us */
@@ -198,7 +211,7 @@ static int do_uevent(struct dlm_ls *ls, int in)
error = wait_event_interruptible(ls->ls_uevent_wait,
test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags));
- log_debug(ls, "group event done %d %d", error, ls->ls_uevent_result);
+ log_rinfo(ls, "group event done %d %d", error, ls->ls_uevent_result);
if (error)
goto out;
@@ -640,7 +653,7 @@ static int new_lockspace(const char *name, const char *cluster,
dlm_create_debug_file(ls);
- log_debug(ls, "join complete");
+ log_rinfo(ls, "join complete");
*lockspace = ls;
return 0;
@@ -706,9 +719,7 @@ static int lkb_idr_is_local(int id, void *p, void *data)
{
struct dlm_lkb *lkb = p;
- if (!lkb->lkb_nodeid)
- return 1;
- return 0;
+ return lkb->lkb_nodeid == 0 && lkb->lkb_grmode != DLM_LOCK_IV;
}
static int lkb_idr_is_any(int id, void *p, void *data)
@@ -837,7 +848,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
dlm_clear_members(ls);
dlm_clear_members_gone(ls);
kfree(ls->ls_node_array);
- log_debug(ls, "release_lockspace final free");
+ log_rinfo(ls, "release_lockspace final free");
kobject_put(&ls->ls_kobj);
/* The ls structure will be freed when the kobject is done with */
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index d90909ec6aa..d08e079ea5d 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -424,7 +424,7 @@ int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len)
}
/* Data available on socket or listen socket received a connect */
-static void lowcomms_data_ready(struct sock *sk, int count_unused)
+static void lowcomms_data_ready(struct sock *sk)
{
struct connection *con = sock2con(sk);
if (con && !test_and_set_bit(CF_READ_PENDING, &con->flags))
@@ -617,6 +617,11 @@ static void retry_failed_sctp_send(struct connection *recv_con,
int nodeid = sn_send_failed->ssf_info.sinfo_ppid;
log_print("Retry sending %d bytes to node id %d", len, nodeid);
+
+ if (!nodeid) {
+ log_print("Shouldn't resend data via listening connection.");
+ return;
+ }
con = nodeid2con(nodeid, 0);
if (!con) {
@@ -649,6 +654,7 @@ static void process_sctp_notification(struct connection *con,
struct msghdr *msg, char *buf)
{
union sctp_notification *sn = (union sctp_notification *)buf;
+ struct linger linger;
switch (sn->sn_header.sn_type) {
case SCTP_SEND_FAILED:
@@ -713,11 +719,11 @@ static void process_sctp_notification(struct connection *con,
return;
/* Peel off a new sock */
- sctp_lock_sock(con->sock->sk);
+ lock_sock(con->sock->sk);
ret = sctp_do_peeloff(con->sock->sk,
sn->sn_assoc_change.sac_assoc_id,
&new_con->sock);
- sctp_release_sock(con->sock->sk);
+ release_sock(con->sock->sk);
if (ret < 0) {
log_print("Can't peel off a socket for "
"connection %d to node %d: err=%d",
@@ -727,6 +733,13 @@ static void process_sctp_notification(struct connection *con,
}
add_sock(new_con->sock, new_con);
+ linger.l_onoff = 1;
+ linger.l_linger = 0;
+ ret = kernel_setsockopt(new_con->sock, SOL_SOCKET, SO_LINGER,
+ (char *)&linger, sizeof(linger));
+ if (ret < 0)
+ log_print("set socket option SO_LINGER failed");
+
log_print("connecting to %d sctp association %d",
nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index 476557b5492..9c47f1c14a8 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -60,18 +60,15 @@ void dlm_slots_copy_out(struct dlm_ls *ls, struct dlm_rcom *rc)
#define SLOT_DEBUG_LINE 128
-static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots,
- struct rcom_slot *ro0, struct dlm_slot *array,
- int array_size)
+static void log_slots(struct dlm_ls *ls, uint32_t gen, int num_slots,
+ struct rcom_slot *ro0, struct dlm_slot *array,
+ int array_size)
{
char line[SLOT_DEBUG_LINE];
int len = SLOT_DEBUG_LINE - 1;
int pos = 0;
int ret, i;
- if (!dlm_config.ci_log_debug)
- return;
-
memset(line, 0, sizeof(line));
if (array) {
@@ -95,7 +92,7 @@ static void log_debug_slots(struct dlm_ls *ls, uint32_t gen, int num_slots,
}
}
- log_debug(ls, "generation %u slots %d%s", gen, num_slots, line);
+ log_rinfo(ls, "generation %u slots %d%s", gen, num_slots, line);
}
int dlm_slots_copy_in(struct dlm_ls *ls)
@@ -129,7 +126,7 @@ int dlm_slots_copy_in(struct dlm_ls *ls)
ro->ro_slot = le16_to_cpu(ro->ro_slot);
}
- log_debug_slots(ls, gen, num_slots, ro0, NULL, 0);
+ log_slots(ls, gen, num_slots, ro0, NULL, 0);
list_for_each_entry(memb, &ls->ls_nodes, list) {
for (i = 0, ro = ro0; i < num_slots; i++, ro++) {
@@ -274,7 +271,7 @@ int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size,
gen++;
- log_debug_slots(ls, gen, num, NULL, array, array_size);
+ log_slots(ls, gen, num, NULL, array, array_size);
max_slots = (dlm_config.ci_buffer_size - sizeof(struct dlm_rcom) -
sizeof(struct rcom_config)) / sizeof(struct rcom_slot);
@@ -447,7 +444,7 @@ static int ping_members(struct dlm_ls *ls)
break;
}
if (error)
- log_debug(ls, "ping_members aborted %d last nodeid %d",
+ log_rinfo(ls, "ping_members aborted %d last nodeid %d",
error, ls->ls_recover_nodeid);
return error;
}
@@ -539,7 +536,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
count as a negative change so the "neg" recovery steps will happen */
list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
- log_debug(ls, "prev removed member %d", memb->nodeid);
+ log_rinfo(ls, "prev removed member %d", memb->nodeid);
neg++;
}
@@ -551,10 +548,10 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
continue;
if (!node) {
- log_debug(ls, "remove member %d", memb->nodeid);
+ log_rinfo(ls, "remove member %d", memb->nodeid);
} else {
/* removed and re-added */
- log_debug(ls, "remove member %d comm_seq %u %u",
+ log_rinfo(ls, "remove member %d comm_seq %u %u",
memb->nodeid, memb->comm_seq, node->comm_seq);
}
@@ -571,7 +568,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
if (dlm_is_member(ls, node->nodeid))
continue;
dlm_add_member(ls, node);
- log_debug(ls, "add member %d", node->nodeid);
+ log_rinfo(ls, "add member %d", node->nodeid);
}
list_for_each_entry(memb, &ls->ls_nodes, list) {
@@ -591,7 +588,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
complete(&ls->ls_members_done);
}
- log_debug(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
+ log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
return error;
}
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 60a327863b1..e7cfbaf8d0e 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -74,14 +74,16 @@ static int user_cmd(struct sk_buff *skb, struct genl_info *info)
return 0;
}
-static struct genl_ops dlm_nl_ops = {
- .cmd = DLM_CMD_HELLO,
- .doit = user_cmd,
+static struct genl_ops dlm_nl_ops[] = {
+ {
+ .cmd = DLM_CMD_HELLO,
+ .doit = user_cmd,
+ },
};
int __init dlm_netlink_init(void)
{
- return genl_register_family_with_ops(&family, &dlm_nl_ops, 1);
+ return genl_register_family_with_ops(&family, dlm_nl_ops);
}
void dlm_netlink_exit(void)
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index a6bc63f6e31..eaea789bf97 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -526,7 +526,7 @@ int dlm_recover_masters(struct dlm_ls *ls)
int nodir = dlm_no_directory(ls);
int error;
- log_debug(ls, "dlm_recover_masters");
+ log_rinfo(ls, "dlm_recover_masters");
down_read(&ls->ls_root_sem);
list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
@@ -552,7 +552,7 @@ int dlm_recover_masters(struct dlm_ls *ls)
}
up_read(&ls->ls_root_sem);
- log_debug(ls, "dlm_recover_masters %u of %u", count, total);
+ log_rinfo(ls, "dlm_recover_masters %u of %u", count, total);
error = dlm_wait_function(ls, &recover_idr_empty);
out:
@@ -685,7 +685,7 @@ int dlm_recover_locks(struct dlm_ls *ls)
}
up_read(&ls->ls_root_sem);
- log_debug(ls, "dlm_recover_locks %d out", count);
+ log_rinfo(ls, "dlm_recover_locks %d out", count);
error = dlm_wait_function(ls, &recover_list_empty);
out:
@@ -883,7 +883,7 @@ void dlm_recover_rsbs(struct dlm_ls *ls)
up_read(&ls->ls_root_sem);
if (count)
- log_debug(ls, "dlm_recover_rsbs %d done", count);
+ log_rinfo(ls, "dlm_recover_rsbs %d done", count);
}
/* Create a single list of all root rsb's to be used during recovery */
@@ -950,6 +950,6 @@ void dlm_clear_toss(struct dlm_ls *ls)
}
if (count)
- log_debug(ls, "dlm_clear_toss %u done", count);
+ log_rinfo(ls, "dlm_clear_toss %u done", count);
}
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 32f9f8926ec..6859b4bf971 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -55,7 +55,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
unsigned long start;
int error, neg = 0;
- log_debug(ls, "dlm_recover %llu", (unsigned long long)rv->seq);
+ log_rinfo(ls, "dlm_recover %llu", (unsigned long long)rv->seq);
mutex_lock(&ls->ls_recoverd_active);
@@ -76,7 +76,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
error = dlm_recover_members(ls, rv, &neg);
if (error) {
- log_debug(ls, "dlm_recover_members error %d", error);
+ log_rinfo(ls, "dlm_recover_members error %d", error);
goto fail;
}
@@ -90,7 +90,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
error = dlm_recover_members_wait(ls);
if (error) {
- log_debug(ls, "dlm_recover_members_wait error %d", error);
+ log_rinfo(ls, "dlm_recover_members_wait error %d", error);
goto fail;
}
@@ -103,7 +103,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
error = dlm_recover_directory(ls);
if (error) {
- log_debug(ls, "dlm_recover_directory error %d", error);
+ log_rinfo(ls, "dlm_recover_directory error %d", error);
goto fail;
}
@@ -111,11 +111,11 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
error = dlm_recover_directory_wait(ls);
if (error) {
- log_debug(ls, "dlm_recover_directory_wait error %d", error);
+ log_rinfo(ls, "dlm_recover_directory_wait error %d", error);
goto fail;
}
- log_debug(ls, "dlm_recover_directory %u out %u messages",
+ log_rinfo(ls, "dlm_recover_directory %u out %u messages",
ls->ls_recover_dir_sent_res, ls->ls_recover_dir_sent_msg);
/*
@@ -144,7 +144,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
error = dlm_recover_masters(ls);
if (error) {
- log_debug(ls, "dlm_recover_masters error %d", error);
+ log_rinfo(ls, "dlm_recover_masters error %d", error);
goto fail;
}
@@ -154,7 +154,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
error = dlm_recover_locks(ls);
if (error) {
- log_debug(ls, "dlm_recover_locks error %d", error);
+ log_rinfo(ls, "dlm_recover_locks error %d", error);
goto fail;
}
@@ -162,11 +162,11 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
error = dlm_recover_locks_wait(ls);
if (error) {
- log_debug(ls, "dlm_recover_locks_wait error %d", error);
+ log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
goto fail;
}
- log_debug(ls, "dlm_recover_locks %u in",
+ log_rinfo(ls, "dlm_recover_locks %u in",
ls->ls_recover_locks_in);
/*
@@ -186,7 +186,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
error = dlm_recover_locks_wait(ls);
if (error) {
- log_debug(ls, "dlm_recover_locks_wait error %d", error);
+ log_rinfo(ls, "dlm_recover_locks_wait error %d", error);
goto fail;
}
}
@@ -205,7 +205,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
error = dlm_recover_done_wait(ls);
if (error) {
- log_debug(ls, "dlm_recover_done_wait error %d", error);
+ log_rinfo(ls, "dlm_recover_done_wait error %d", error);
goto fail;
}
@@ -217,25 +217,25 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
error = enable_locking(ls, rv->seq);
if (error) {
- log_debug(ls, "enable_locking error %d", error);
+ log_rinfo(ls, "enable_locking error %d", error);
goto fail;
}
error = dlm_process_requestqueue(ls);
if (error) {
- log_debug(ls, "dlm_process_requestqueue error %d", error);
+ log_rinfo(ls, "dlm_process_requestqueue error %d", error);
goto fail;
}
error = dlm_recover_waiters_post(ls);
if (error) {
- log_debug(ls, "dlm_recover_waiters_post error %d", error);
+ log_rinfo(ls, "dlm_recover_waiters_post error %d", error);
goto fail;
}
dlm_recover_grant(ls);
- log_debug(ls, "dlm_recover %llu generation %u done: %u ms",
+ log_rinfo(ls, "dlm_recover %llu generation %u done: %u ms",
(unsigned long long)rv->seq, ls->ls_generation,
jiffies_to_msecs(jiffies - start));
mutex_unlock(&ls->ls_recoverd_active);
@@ -245,7 +245,7 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
fail:
dlm_release_root_list(ls);
- log_debug(ls, "dlm_recover %llu error %d",
+ log_rinfo(ls, "dlm_recover %llu error %d",
(unsigned long long)rv->seq, error);
mutex_unlock(&ls->ls_recoverd_active);
return error;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 9fd702f5bfb..1de7294aad2 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -50,7 +50,7 @@ static void drop_slab(void)
} while (nr_objects > 10);
}
-int drop_caches_sysctl_handler(ctl_table *table, int write,
+int drop_caches_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
int ret;
@@ -59,10 +59,22 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
if (ret)
return ret;
if (write) {
- if (sysctl_drop_caches & 1)
+ static int stfu;
+
+ if (sysctl_drop_caches & 1) {
iterate_supers(drop_pagecache_sb, NULL);
- if (sysctl_drop_caches & 2)
+ count_vm_event(DROP_PAGECACHE);
+ }
+ if (sysctl_drop_caches & 2) {
drop_slab();
+ count_vm_event(DROP_SLAB);
+ }
+ if (!stfu) {
+ pr_info("%s (%d): drop_caches: %d\n",
+ current->comm, task_pid_nr(current),
+ sysctl_drop_caches);
+ }
+ stfu |= sysctl_drop_caches & 4;
}
return 0;
}
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index c88e355f763..2f6735dbf1a 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -392,7 +392,7 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
wait_for_completion(&ecr->completion);
rc = ecr->rc;
- INIT_COMPLETION(ecr->completion);
+ reinit_completion(&ecr->completion);
}
out:
ablkcipher_request_free(req);
@@ -408,7 +408,7 @@ static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
struct page *page)
{
return ecryptfs_lower_header_size(crypt_stat) +
- (page->index << PAGE_CACHE_SHIFT);
+ ((loff_t)page->index << PAGE_CACHE_SHIFT);
}
/**
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 992cf95830b..db0fad3269c 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -45,14 +45,13 @@
* The function to be used for directory reads is ecryptfs_read.
*/
static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ struct iov_iter *to)
{
ssize_t rc;
struct path *path;
struct file *file = iocb->ki_filp;
- rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
+ rc = generic_file_read_iter(iocb, to);
/*
* Even though this is a async interface, we need to wait
* for IO to finish to update atime
@@ -271,7 +270,7 @@ static int ecryptfs_flush(struct file *file, fl_owner_t td)
{
struct file *lower_file = ecryptfs_file_to_lower(file);
- if (lower_file->f_op && lower_file->f_op->flush) {
+ if (lower_file->f_op->flush) {
filemap_write_and_wait(file->f_mapping);
return lower_file->f_op->flush(lower_file, td);
}
@@ -305,7 +304,7 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
struct file *lower_file = NULL;
lower_file = ecryptfs_file_to_lower(file);
- if (lower_file->f_op && lower_file->f_op->fasync)
+ if (lower_file->f_op->fasync)
rc = lower_file->f_op->fasync(fd, lower_file, flag);
return rc;
}
@@ -313,12 +312,10 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
static long
ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
- struct file *lower_file = NULL;
+ struct file *lower_file = ecryptfs_file_to_lower(file);
long rc = -ENOTTY;
- if (ecryptfs_file_to_private(file))
- lower_file = ecryptfs_file_to_lower(file);
- if (lower_file && lower_file->f_op && lower_file->f_op->unlocked_ioctl)
+ if (lower_file->f_op->unlocked_ioctl)
rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
return rc;
}
@@ -327,12 +324,10 @@ ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
static long
ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
- struct file *lower_file = NULL;
+ struct file *lower_file = ecryptfs_file_to_lower(file);
long rc = -ENOIOCTLCMD;
- if (ecryptfs_file_to_private(file))
- lower_file = ecryptfs_file_to_lower(file);
- if (lower_file && lower_file->f_op && lower_file->f_op->compat_ioctl)
+ if (lower_file->f_op && lower_file->f_op->compat_ioctl)
rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
return rc;
}
@@ -356,10 +351,10 @@ const struct file_operations ecryptfs_dir_fops = {
const struct file_operations ecryptfs_main_fops = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = ecryptfs_read_update_atime,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = ecryptfs_read_update_atime,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.iterate = ecryptfs_readdir,
.unlocked_ioctl = ecryptfs_unlocked_ioctl,
#ifdef CONFIG_COMPAT
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 0f9b66eaa76..d4a9431ec73 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -153,7 +153,7 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
dget(lower_dentry);
lower_dir_dentry = lock_parent(lower_dentry);
- rc = vfs_unlink(lower_dir_inode, lower_dentry);
+ rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL);
if (rc) {
printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
goto out_unlock;
@@ -208,7 +208,7 @@ ecryptfs_do_create(struct inode *directory_inode,
inode = __ecryptfs_get_inode(lower_dentry->d_inode,
directory_inode->i_sb);
if (IS_ERR(inode)) {
- vfs_unlink(lower_dir_dentry->d_inode, lower_dentry);
+ vfs_unlink(lower_dir_dentry->d_inode, lower_dentry, NULL);
goto out_lock;
}
fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode);
@@ -475,7 +475,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
dget(lower_new_dentry);
lower_dir_dentry = lock_parent(lower_new_dentry);
rc = vfs_link(lower_old_dentry, lower_dir_dentry->d_inode,
- lower_new_dentry);
+ lower_new_dentry, NULL);
if (rc || !lower_new_dentry->d_inode)
goto out_lock;
rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
@@ -640,7 +640,8 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out_lock;
}
rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry,
- lower_new_dir_dentry->d_inode, lower_new_dentry);
+ lower_new_dir_dentry->d_inode, lower_new_dentry,
+ NULL, 0);
if (rc)
goto out_lock;
if (target_inode)
@@ -658,19 +659,17 @@ out_lock:
return rc;
}
-static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
- size_t *bufsiz)
+static char *ecryptfs_readlink_lower(struct dentry *dentry, size_t *bufsiz)
{
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
char *lower_buf;
+ char *buf;
mm_segment_t old_fs;
int rc;
lower_buf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!lower_buf) {
- rc = -ENOMEM;
- goto out;
- }
+ if (!lower_buf)
+ return ERR_PTR(-ENOMEM);
old_fs = get_fs();
set_fs(get_ds());
rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
@@ -679,21 +678,18 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
set_fs(old_fs);
if (rc < 0)
goto out;
- rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry->d_sb,
+ rc = ecryptfs_decode_and_decrypt_filename(&buf, bufsiz, dentry->d_sb,
lower_buf, rc);
out:
kfree(lower_buf);
- return rc;
+ return rc ? ERR_PTR(rc) : buf;
}
static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
{
- char *buf;
- size_t len = PATH_MAX;
- int rc;
-
- rc = ecryptfs_readlink_lower(dentry, &buf, &len);
- if (rc)
+ size_t len;
+ char *buf = ecryptfs_readlink_lower(dentry, &len);
+ if (IS_ERR(buf))
goto out;
fsstack_copy_attr_atime(dentry->d_inode,
ecryptfs_dentry_to_lower(dentry)->d_inode);
@@ -881,7 +877,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
mutex_lock(&lower_dentry->d_inode->i_mutex);
- rc = notify_change(lower_dentry, &lower_ia);
+ rc = notify_change(lower_dentry, &lower_ia, NULL);
mutex_unlock(&lower_dentry->d_inode->i_mutex);
}
return rc;
@@ -982,7 +978,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
lower_ia.ia_valid &= ~ATTR_MODE;
mutex_lock(&lower_dentry->d_inode->i_mutex);
- rc = notify_change(lower_dentry, &lower_ia);
+ rc = notify_change(lower_dentry, &lower_ia, NULL);
mutex_unlock(&lower_dentry->d_inode->i_mutex);
out:
fsstack_copy_attr_all(inode, lower_inode);
@@ -1002,10 +998,12 @@ static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry,
char *target;
size_t targetsiz;
- rc = ecryptfs_readlink_lower(dentry, &target, &targetsiz);
- if (!rc) {
+ target = ecryptfs_readlink_lower(dentry, &targetsiz);
+ if (!IS_ERR(target)) {
kfree(target);
stat->size = targetsiz;
+ } else {
+ rc = PTR_ERR(target);
}
}
return rc;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 7d52806c211..4725a07f003 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1149,7 +1149,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
struct ecryptfs_msg_ctx *msg_ctx;
struct ecryptfs_message *msg = NULL;
char *auth_tok_sig;
- char *payload;
+ char *payload = NULL;
size_t payload_len = 0;
int rc;
@@ -1203,6 +1203,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok,
}
out:
kfree(msg);
+ kfree(payload);
return rc;
}
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index e879cf8ff0b..afa1b81c341 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -132,7 +132,7 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
*/
static void ecryptfs_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
iput(ecryptfs_inode_to_lower(inode));
}
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 8dd524f3228..cdb2971192a 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -21,7 +21,7 @@ static ssize_t efivarfs_file_write(struct file *file,
u32 attributes;
struct inode *inode = file->f_mapping->host;
unsigned long datasize = count - sizeof(attributes);
- ssize_t bytes = 0;
+ ssize_t bytes;
bool set = false;
if (count < sizeof(attributes))
@@ -33,14 +33,9 @@ static ssize_t efivarfs_file_write(struct file *file,
if (attributes & ~(EFI_VARIABLE_MASK))
return -EINVAL;
- data = kmalloc(datasize, GFP_KERNEL);
- if (!data)
- return -ENOMEM;
-
- if (copy_from_user(data, userbuf + sizeof(attributes), datasize)) {
- bytes = -EFAULT;
- goto out;
- }
+ data = memdup_user(userbuf + sizeof(attributes), datasize);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
bytes = efivar_entry_set_get_size(var, attributes, &datasize,
data, &set);
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index a8766b880c0..0a48886e069 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -83,19 +83,10 @@ static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr)
return 0;
}
-/*
- * Retaining negative dentries for an in-memory filesystem just wastes
- * memory and lookup time: arrange for them to be deleted immediately.
- */
-static int efivarfs_delete_dentry(const struct dentry *dentry)
-{
- return 1;
-}
-
-static struct dentry_operations efivarfs_d_ops = {
+static const struct dentry_operations efivarfs_d_ops = {
.d_compare = efivarfs_d_compare,
.d_hash = efivarfs_d_hash,
- .d_delete = efivarfs_delete_dentry,
+ .d_delete = always_delete_dentry,
};
static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name)
diff --git a/fs/efs/dir.c b/fs/efs/dir.c
index b72307ccdf7..ce63b24f7c3 100644
--- a/fs/efs/dir.c
+++ b/fs/efs/dir.c
@@ -26,7 +26,8 @@ static int efs_readdir(struct file *file, struct dir_context *ctx)
int slot;
if (inode->i_size & (EFS_DIRBSIZE-1))
- printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n");
+ pr_warn("%s(): directory size not a multiple of EFS_DIRBSIZE\n",
+ __func__);
/* work out where this entry can be found */
block = ctx->pos >> EFS_DIRBSIZE_BITS;
@@ -43,14 +44,15 @@ static int efs_readdir(struct file *file, struct dir_context *ctx)
bh = sb_bread(inode->i_sb, efs_bmap(inode, block));
if (!bh) {
- printk(KERN_ERR "EFS: readdir(): failed to read dir block %d\n", block);
+ pr_err("%s(): failed to read dir block %d\n",
+ __func__, block);
break;
}
dirblock = (struct efs_dir *) bh->b_data;
if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) {
- printk(KERN_ERR "EFS: readdir(): invalid directory block\n");
+ pr_err("%s(): invalid directory block\n", __func__);
brelse(bh);
break;
}
@@ -69,10 +71,9 @@ static int efs_readdir(struct file *file, struct dir_context *ctx)
inodenum = be32_to_cpu(dirslot->inode);
namelen = dirslot->namelen;
nameptr = dirslot->name;
-
-#ifdef DEBUG
- printk(KERN_DEBUG "EFS: readdir(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", block, slot, dirblock->slots-1, inodenum, nameptr, namelen);
-#endif
+ pr_debug("%s(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n",
+ __func__, block, slot, dirblock->slots-1,
+ inodenum, nameptr, namelen);
if (!namelen)
continue;
/* found the next entry */
@@ -80,7 +81,8 @@ static int efs_readdir(struct file *file, struct dir_context *ctx)
/* sanity check */
if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) {
- printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot);
+ pr_warn("directory entry %d exceeds directory block\n",
+ slot);
continue;
}
diff --git a/fs/efs/efs.h b/fs/efs/efs.h
index 5528926ac7f..5bbf9612140 100644
--- a/fs/efs/efs.h
+++ b/fs/efs/efs.h
@@ -7,6 +7,12 @@
#ifndef _EFS_EFS_H_
#define _EFS_EFS_H_
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/fs.h>
#include <asm/uaccess.h>
diff --git a/fs/efs/file.c b/fs/efs/file.c
index 1ccb364ffa6..a37dcee4686 100644
--- a/fs/efs/file.c
+++ b/fs/efs/file.c
@@ -22,10 +22,8 @@ int efs_get_block(struct inode *inode, sector_t iblock,
/*
* i have no idea why this happens as often as it does
*/
- printk(KERN_WARNING "EFS: bmap(): block %d >= %ld (filesize %ld)\n",
- block,
- inode->i_blocks,
- inode->i_size);
+ pr_warn("%s(): block %d >= %ld (filesize %ld)\n",
+ __func__, block, inode->i_blocks, inode->i_size);
#endif
return 0;
}
@@ -38,7 +36,7 @@ int efs_get_block(struct inode *inode, sector_t iblock,
int efs_bmap(struct inode *inode, efs_block_t block) {
if (block < 0) {
- printk(KERN_WARNING "EFS: bmap(): block < 0\n");
+ pr_warn("%s(): block < 0\n", __func__);
return 0;
}
@@ -48,10 +46,8 @@ int efs_bmap(struct inode *inode, efs_block_t block) {
/*
* i have no idea why this happens as often as it does
*/
- printk(KERN_WARNING "EFS: bmap(): block %d >= %ld (filesize %ld)\n",
- block,
- inode->i_blocks,
- inode->i_size);
+ pr_warn("%s(): block %d >= %ld (filesize %ld)\n",
+ __func__, block, inode->i_blocks, inode->i_size);
#endif
return 0;
}
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index d15ccf20f1b..079d20306ee 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -89,7 +89,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
bh = sb_bread(inode->i_sb, block);
if (!bh) {
- printk(KERN_WARNING "EFS: bread() failed at block %d\n", block);
+ pr_warn("%s() failed at block %d\n", __func__, block);
goto read_inode_error;
}
@@ -130,19 +130,16 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
for(i = 0; i < EFS_DIRECTEXTENTS; i++) {
extent_copy(&(efs_inode->di_u.di_extents[i]), &(in->extents[i]));
if (i < in->numextents && in->extents[i].cooked.ex_magic != 0) {
- printk(KERN_WARNING "EFS: extent %d has bad magic number in inode %lu\n", i, inode->i_ino);
+ pr_warn("extent %d has bad magic number in inode %lu\n",
+ i, inode->i_ino);
brelse(bh);
goto read_inode_error;
}
}
brelse(bh);
-
-#ifdef DEBUG
- printk(KERN_DEBUG "EFS: efs_iget(): inode %lu, extents %d, mode %o\n",
- inode->i_ino, in->numextents, inode->i_mode);
-#endif
-
+ pr_debug("efs_iget(): inode %lu, extents %d, mode %o\n",
+ inode->i_ino, in->numextents, inode->i_mode);
switch (inode->i_mode & S_IFMT) {
case S_IFDIR:
inode->i_op = &efs_dir_inode_operations;
@@ -162,7 +159,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
init_special_inode(inode, inode->i_mode, device);
break;
default:
- printk(KERN_WARNING "EFS: unsupported inode mode %o\n", inode->i_mode);
+ pr_warn("unsupported inode mode %o\n", inode->i_mode);
goto read_inode_error;
break;
}
@@ -171,7 +168,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
return inode;
read_inode_error:
- printk(KERN_WARNING "EFS: failed to read inode %lu\n", inode->i_ino);
+ pr_warn("failed to read inode %lu\n", inode->i_ino);
iget_failed(inode);
return ERR_PTR(-EIO);
}
@@ -216,7 +213,7 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) {
/* if we only have one extent then nothing can be found */
if (in->numextents == 1) {
- printk(KERN_ERR "EFS: map_block() failed to map (1 extent)\n");
+ pr_err("%s() failed to map (1 extent)\n", __func__);
return 0;
}
@@ -234,13 +231,12 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) {
}
}
- printk(KERN_ERR "EFS: map_block() failed to map block %u (dir)\n", block);
+ pr_err("%s() failed to map block %u (dir)\n", __func__, block);
return 0;
}
-#ifdef DEBUG
- printk(KERN_DEBUG "EFS: map_block(): indirect search for logical block %u\n", block);
-#endif
+ pr_debug("%s(): indirect search for logical block %u\n",
+ __func__, block);
direxts = in->extents[0].cooked.ex_offset;
indexts = in->numextents;
@@ -262,7 +258,8 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) {
if (dirext == direxts) {
/* should never happen */
- printk(KERN_ERR "EFS: couldn't find direct extent for indirect extent %d (block %u)\n", cur, block);
+ pr_err("couldn't find direct extent for indirect extent %d (block %u)\n",
+ cur, block);
if (bh) brelse(bh);
return 0;
}
@@ -279,12 +276,12 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) {
bh = sb_bread(inode->i_sb, iblock);
if (!bh) {
- printk(KERN_ERR "EFS: bread() failed at block %d\n", iblock);
+ pr_err("%s() failed at block %d\n",
+ __func__, iblock);
return 0;
}
-#ifdef DEBUG
- printk(KERN_DEBUG "EFS: map_block(): read indirect extent block %d\n", iblock);
-#endif
+ pr_debug("%s(): read indirect extent block %d\n",
+ __func__, iblock);
first = 0;
lastblock = iblock;
}
@@ -294,7 +291,8 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) {
extent_copy(&(exts[ioffset]), &ext);
if (ext.cooked.ex_magic != 0) {
- printk(KERN_ERR "EFS: extent %d has bad magic number in block %d\n", cur, iblock);
+ pr_err("extent %d has bad magic number in block %d\n",
+ cur, iblock);
if (bh) brelse(bh);
return 0;
}
@@ -306,7 +304,7 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) {
}
}
if (bh) brelse(bh);
- printk(KERN_ERR "EFS: map_block() failed to map block %u (indir)\n", block);
+ pr_err("%s() failed to map block %u (indir)\n", __func__, block);
return 0;
}
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index 96f66d213a1..356c044e2cd 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -23,20 +23,22 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len)
efs_block_t block;
if (inode->i_size & (EFS_DIRBSIZE-1))
- printk(KERN_WARNING "EFS: WARNING: find_entry(): directory size not a multiple of EFS_DIRBSIZE\n");
+ pr_warn("%s(): directory size not a multiple of EFS_DIRBSIZE\n",
+ __func__);
for(block = 0; block < inode->i_blocks; block++) {
bh = sb_bread(inode->i_sb, efs_bmap(inode, block));
if (!bh) {
- printk(KERN_ERR "EFS: find_entry(): failed to read dir block %d\n", block);
+ pr_err("%s(): failed to read dir block %d\n",
+ __func__, block);
return 0;
}
dirblock = (struct efs_dir *) bh->b_data;
if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) {
- printk(KERN_ERR "EFS: find_entry(): invalid directory block\n");
+ pr_err("%s(): invalid directory block\n", __func__);
brelse(bh);
return(0);
}
diff --git a/fs/efs/super.c b/fs/efs/super.c
index c6f57a74a55..7fca462ea4e 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -26,11 +26,18 @@ static struct dentry *efs_mount(struct file_system_type *fs_type,
return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super);
}
+static void efs_kill_sb(struct super_block *s)
+{
+ struct efs_sb_info *sbi = SUPER_INFO(s);
+ kill_block_super(s);
+ kfree(sbi);
+}
+
static struct file_system_type efs_fs_type = {
.owner = THIS_MODULE,
.name = "efs",
.mount = efs_mount,
- .kill_sb = kill_block_super,
+ .kill_sb = efs_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("efs");
@@ -84,7 +91,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
efs_inode_cachep = kmem_cache_create("efs_inode_cache",
sizeof(struct efs_inode_info),
@@ -105,14 +112,9 @@ static void destroy_inodecache(void)
kmem_cache_destroy(efs_inode_cachep);
}
-static void efs_put_super(struct super_block *s)
-{
- kfree(s->s_fs_info);
- s->s_fs_info = NULL;
-}
-
static int efs_remount(struct super_block *sb, int *flags, char *data)
{
+ sync_filesystem(sb);
*flags |= MS_RDONLY;
return 0;
}
@@ -120,7 +122,6 @@ static int efs_remount(struct super_block *sb, int *flags, char *data)
static const struct super_operations efs_superblock_operations = {
.alloc_inode = efs_alloc_inode,
.destroy_inode = efs_destroy_inode,
- .put_super = efs_put_super,
.statfs = efs_statfs,
.remount_fs = efs_remount,
};
@@ -133,7 +134,7 @@ static const struct export_operations efs_export_ops = {
static int __init init_efs_fs(void) {
int err;
- printk("EFS: "EFS_VERSION" - http://aeschi.ch.eu.org/efs/\n");
+ pr_info(EFS_VERSION" - http://aeschi.ch.eu.org/efs/\n");
err = init_inodecache();
if (err)
goto out1;
@@ -178,12 +179,12 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) {
csum += be32_to_cpu(cs);
}
if (csum) {
- printk(KERN_INFO "EFS: SGI disklabel: checksum bad, label corrupted\n");
+ pr_warn("SGI disklabel: checksum bad, label corrupted\n");
return 0;
}
#ifdef DEBUG
- printk(KERN_DEBUG "EFS: bf: \"%16s\"\n", vh->vh_bootfile);
+ pr_debug("bf: \"%16s\"\n", vh->vh_bootfile);
for(i = 0; i < NVDIR; i++) {
int j;
@@ -195,9 +196,8 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) {
name[j] = (char) 0;
if (name[0]) {
- printk(KERN_DEBUG "EFS: vh: %8s block: 0x%08x size: 0x%08x\n",
- name,
- (int) be32_to_cpu(vh->vh_vd[i].vd_lbn),
+ pr_debug("vh: %8s block: 0x%08x size: 0x%08x\n",
+ name, (int) be32_to_cpu(vh->vh_vd[i].vd_lbn),
(int) be32_to_cpu(vh->vh_vd[i].vd_nbytes));
}
}
@@ -210,12 +210,11 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) {
}
#ifdef DEBUG
if (be32_to_cpu(vh->vh_pt[i].pt_nblks)) {
- printk(KERN_DEBUG "EFS: pt %2d: start: %08d size: %08d type: 0x%02x (%s)\n",
- i,
- (int) be32_to_cpu(vh->vh_pt[i].pt_firstlbn),
- (int) be32_to_cpu(vh->vh_pt[i].pt_nblks),
- pt_type,
- (pt_entry->pt_name) ? pt_entry->pt_name : "unknown");
+ pr_debug("pt %2d: start: %08d size: %08d type: 0x%02x (%s)\n",
+ i, (int)be32_to_cpu(vh->vh_pt[i].pt_firstlbn),
+ (int)be32_to_cpu(vh->vh_pt[i].pt_nblks),
+ pt_type, (pt_entry->pt_name) ?
+ pt_entry->pt_name : "unknown");
}
#endif
if (IS_EFS(pt_type)) {
@@ -225,11 +224,10 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) {
}
if (slice == -1) {
- printk(KERN_NOTICE "EFS: partition table contained no EFS partitions\n");
+ pr_notice("partition table contained no EFS partitions\n");
#ifdef DEBUG
} else {
- printk(KERN_INFO "EFS: using slice %d (type %s, offset 0x%x)\n",
- slice,
+ pr_info("using slice %d (type %s, offset 0x%x)\n", slice,
(pt_entry->pt_name) ? pt_entry->pt_name : "unknown",
sblock);
#endif
@@ -259,7 +257,6 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
struct efs_sb_info *sb;
struct buffer_head *bh;
struct inode *root;
- int ret = -EINVAL;
sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL);
if (!sb)
@@ -268,17 +265,17 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
s->s_magic = EFS_SUPER_MAGIC;
if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) {
- printk(KERN_ERR "EFS: device does not support %d byte blocks\n",
+ pr_err("device does not support %d byte blocks\n",
EFS_BLOCKSIZE);
- goto out_no_fs_ul;
+ return -EINVAL;
}
/* read the vh (volume header) block */
bh = sb_bread(s, 0);
if (!bh) {
- printk(KERN_ERR "EFS: cannot read volume header\n");
- goto out_no_fs_ul;
+ pr_err("cannot read volume header\n");
+ return -EINVAL;
}
/*
@@ -290,27 +287,28 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
brelse(bh);
if (sb->fs_start == -1) {
- goto out_no_fs_ul;
+ return -EINVAL;
}
bh = sb_bread(s, sb->fs_start + EFS_SUPER);
if (!bh) {
- printk(KERN_ERR "EFS: cannot read superblock\n");
- goto out_no_fs_ul;
+ pr_err("cannot read superblock\n");
+ return -EINVAL;
}
if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) {
#ifdef DEBUG
- printk(KERN_WARNING "EFS: invalid superblock at block %u\n", sb->fs_start + EFS_SUPER);
+ pr_warn("invalid superblock at block %u\n",
+ sb->fs_start + EFS_SUPER);
#endif
brelse(bh);
- goto out_no_fs_ul;
+ return -EINVAL;
}
brelse(bh);
if (!(s->s_flags & MS_RDONLY)) {
#ifdef DEBUG
- printk(KERN_INFO "EFS: forcing read-only mode\n");
+ pr_info("forcing read-only mode\n");
#endif
s->s_flags |= MS_RDONLY;
}
@@ -318,25 +316,17 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
s->s_export_op = &efs_export_ops;
root = efs_iget(s, EFS_ROOTINODE);
if (IS_ERR(root)) {
- printk(KERN_ERR "EFS: get root inode failed\n");
- ret = PTR_ERR(root);
- goto out_no_fs;
+ pr_err("get root inode failed\n");
+ return PTR_ERR(root);
}
s->s_root = d_make_root(root);
if (!(s->s_root)) {
- printk(KERN_ERR "EFS: get root dentry failed\n");
- ret = -ENOMEM;
- goto out_no_fs;
+ pr_err("get root dentry failed\n");
+ return -ENOMEM;
}
return 0;
-
-out_no_fs_ul:
-out_no_fs:
- s->s_fs_info = NULL;
- kfree(sb);
- return ret;
}
static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 35470d9b96e..d6a88e7812f 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -349,15 +349,12 @@ EXPORT_SYMBOL_GPL(eventfd_fget);
*/
struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
- struct file *file;
struct eventfd_ctx *ctx;
-
- file = eventfd_fget(fd);
- if (IS_ERR(file))
- return (struct eventfd_ctx *) file;
- ctx = eventfd_ctx_get(file->private_data);
- fput(file);
-
+ struct fd f = fdget(fd);
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+ ctx = eventfd_ctx_fileget(f.file);
+ fdput(f);
return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 473e09da7d0..b10b48c2a7a 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -34,7 +34,6 @@
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <linux/device.h>
-#include <linux/freezer.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
@@ -42,6 +41,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/compat.h>
+#include <linux/rculist.h>
/*
* LOCKING:
@@ -134,8 +134,12 @@ struct nested_calls {
* of these on a server and we do not want this to take another cache line.
*/
struct epitem {
- /* RB tree node used to link this structure to the eventpoll RB tree */
- struct rb_node rbn;
+ union {
+ /* RB tree node links this structure to the eventpoll RB tree */
+ struct rb_node rbn;
+ /* Used to free the struct epitem */
+ struct rcu_head rcu;
+ };
/* List header used to link this structure to the eventpoll ready list */
struct list_head rdllink;
@@ -289,7 +293,7 @@ static LIST_HEAD(tfile_check_list);
static long zero;
static long long_max = LONG_MAX;
-ctl_table epoll_table[] = {
+struct ctl_table epoll_table[] = {
{
.procname = "max_user_watches",
.data = &max_user_watches,
@@ -581,14 +585,14 @@ static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
* @sproc: Pointer to the scan callback.
* @priv: Private opaque data passed to the @sproc callback.
* @depth: The current depth of recursive f_op->poll calls.
+ * @ep_locked: caller already holds ep->mtx
*
* Returns: The same integer error code returned by the @sproc callback.
*/
static int ep_scan_ready_list(struct eventpoll *ep,
int (*sproc)(struct eventpoll *,
struct list_head *, void *),
- void *priv,
- int depth)
+ void *priv, int depth, bool ep_locked)
{
int error, pwake = 0;
unsigned long flags;
@@ -599,7 +603,9 @@ static int ep_scan_ready_list(struct eventpoll *ep,
* We need to lock this because we could be hit by
* eventpoll_release_file() and epoll_ctl().
*/
- mutex_lock_nested(&ep->mtx, depth);
+
+ if (!ep_locked)
+ mutex_lock_nested(&ep->mtx, depth);
/*
* Steal the ready list, and re-init the original one to the
@@ -663,7 +669,8 @@ static int ep_scan_ready_list(struct eventpoll *ep,
}
spin_unlock_irqrestore(&ep->lock, flags);
- mutex_unlock(&ep->mtx);
+ if (!ep_locked)
+ mutex_unlock(&ep->mtx);
/* We have to call this outside the lock */
if (pwake)
@@ -672,6 +679,12 @@ static int ep_scan_ready_list(struct eventpoll *ep,
return error;
}
+static void epi_rcu_free(struct rcu_head *head)
+{
+ struct epitem *epi = container_of(head, struct epitem, rcu);
+ kmem_cache_free(epi_cache, epi);
+}
+
/*
* Removes a "struct epitem" from the eventpoll RB tree and deallocates
* all the associated resources. Must be called with "mtx" held.
@@ -693,8 +706,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
/* Remove the current item from the list of epoll hooks */
spin_lock(&file->f_lock);
- if (ep_is_linked(&epi->fllink))
- list_del_init(&epi->fllink);
+ list_del_rcu(&epi->fllink);
spin_unlock(&file->f_lock);
rb_erase(&epi->rbn, &ep->rbr);
@@ -705,9 +717,14 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
spin_unlock_irqrestore(&ep->lock, flags);
wakeup_source_unregister(ep_wakeup_source(epi));
-
- /* At this point it is safe to free the eventpoll item */
- kmem_cache_free(epi_cache, epi);
+ /*
+ * At this point it is safe to free the eventpoll item. Use the union
+ * field epi->rcu, since we are trying to minimize the size of
+ * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
+ * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
+ * use of the rbn field.
+ */
+ call_rcu(&epi->rcu, epi_rcu_free);
atomic_long_dec(&ep->user->epoll_watches);
@@ -808,15 +825,34 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
return 0;
}
+static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
+ poll_table *pt);
+
+struct readyevents_arg {
+ struct eventpoll *ep;
+ bool locked;
+};
+
static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
{
- return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1);
+ struct readyevents_arg *arg = priv;
+
+ return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL,
+ call_nests + 1, arg->locked);
}
static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
{
int pollflags;
struct eventpoll *ep = file->private_data;
+ struct readyevents_arg arg;
+
+ /*
+ * During ep_insert() we already hold the ep->mtx for the tfile.
+ * Prevent re-aquisition.
+ */
+ arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc);
+ arg.ep = ep;
/* Insert inside our poll wait queue */
poll_wait(file, &ep->poll_wait, wait);
@@ -828,7 +864,7 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
* could re-enter here.
*/
pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
- ep_poll_readyevents_proc, ep, ep, current);
+ ep_poll_readyevents_proc, &arg, ep, current);
return pollflags != -1 ? pollflags : 0;
}
@@ -873,9 +909,8 @@ static const struct file_operations eventpoll_fops = {
*/
void eventpoll_release_file(struct file *file)
{
- struct list_head *lsthead = &file->f_ep_links;
struct eventpoll *ep;
- struct epitem *epi;
+ struct epitem *epi, *next;
/*
* We don't want to get "file->f_lock" because it is not
@@ -891,17 +926,12 @@ void eventpoll_release_file(struct file *file)
* Besides, ep_remove() acquires the lock, so we can't hold it here.
*/
mutex_lock(&epmutex);
-
- while (!list_empty(lsthead)) {
- epi = list_first_entry(lsthead, struct epitem, fllink);
-
+ list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
ep = epi->ep;
- list_del_init(&epi->fllink);
mutex_lock_nested(&ep->mtx, 0);
ep_remove(ep, epi);
mutex_unlock(&ep->mtx);
}
-
mutex_unlock(&epmutex);
}
@@ -1139,7 +1169,9 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
struct file *child_file;
struct epitem *epi;
- list_for_each_entry(epi, &file->f_ep_links, fllink) {
+ /* CTL_DEL can remove links here, but that can't increase our count */
+ rcu_read_lock();
+ list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
child_file = epi->ep->file;
if (is_file_epoll(child_file)) {
if (list_empty(&child_file->f_ep_links)) {
@@ -1161,6 +1193,7 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
"file is not an ep!\n");
}
}
+ rcu_read_unlock();
return error;
}
@@ -1232,7 +1265,7 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi)
* Must be called with "mtx" held.
*/
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
- struct file *tfile, int fd)
+ struct file *tfile, int fd, int full_check)
{
int error, revents, pwake = 0;
unsigned long flags;
@@ -1287,7 +1320,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
/* Add the current item to the list of active epoll hook for this file */
spin_lock(&tfile->f_lock);
- list_add_tail(&epi->fllink, &tfile->f_ep_links);
+ list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
spin_unlock(&tfile->f_lock);
/*
@@ -1298,7 +1331,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
/* now check if we've created too many backpaths */
error = -EINVAL;
- if (reverse_path_check())
+ if (full_check && reverse_path_check())
goto error_remove_epi;
/* We have to drop the new item inside our item list to keep track of it */
@@ -1328,8 +1361,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
error_remove_epi:
spin_lock(&tfile->f_lock);
- if (ep_is_linked(&epi->fllink))
- list_del_init(&epi->fllink);
+ list_del_rcu(&epi->fllink);
spin_unlock(&tfile->f_lock);
rb_erase(&epi->rbn, &ep->rbr);
@@ -1522,7 +1554,7 @@ static int ep_send_events(struct eventpoll *ep,
esed.maxevents = maxevents;
esed.events = events;
- return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
+ return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
}
static inline struct timespec ep_set_mstimeout(long ms)
@@ -1605,8 +1637,7 @@ fetch_events:
}
spin_unlock_irqrestore(&ep->lock, flags);
- if (!freezable_schedule_hrtimeout_range(to, slack,
- HRTIMER_MODE_ABS))
+ if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
timed_out = 1;
spin_lock_irqsave(&ep->lock, flags);
@@ -1793,11 +1824,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
{
int error;
- int did_lock_epmutex = 0;
+ int full_check = 0;
struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
struct epoll_event epds;
+ struct eventpoll *tep = NULL;
error = -EFAULT;
if (ep_op_has_event(op) &&
@@ -1816,12 +1848,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
/* The target file descriptor must support poll */
error = -EPERM;
- if (!tf.file->f_op || !tf.file->f_op->poll)
+ if (!tf.file->f_op->poll)
goto error_tgt_fput;
/* Check if EPOLLWAKEUP is allowed */
- if ((epds.events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
- epds.events &= ~EPOLLWAKEUP;
+ ep_take_care_of_epollwakeup(&epds);
/*
* We have to check that the file structure underneath the file descriptor
@@ -1846,27 +1877,37 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* and hang them on the tfile_check_list, so we can check that we
* haven't created too many possible wakeup paths.
*
- * We need to hold the epmutex across both ep_insert and ep_remove
- * b/c we want to make sure we are looking at a coherent view of
- * epoll network.
+ * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
+ * the epoll file descriptor is attaching directly to a wakeup source,
+ * unless the epoll file descriptor is nested. The purpose of taking the
+ * 'epmutex' on add is to prevent complex toplogies such as loops and
+ * deep wakeup paths from forming in parallel through multiple
+ * EPOLL_CTL_ADD operations.
*/
- if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
- mutex_lock(&epmutex);
- did_lock_epmutex = 1;
- }
+ mutex_lock_nested(&ep->mtx, 0);
if (op == EPOLL_CTL_ADD) {
- if (is_file_epoll(tf.file)) {
- error = -ELOOP;
- if (ep_loop_check(ep, tf.file) != 0) {
- clear_tfile_check_list();
- goto error_tgt_fput;
+ if (!list_empty(&f.file->f_ep_links) ||
+ is_file_epoll(tf.file)) {
+ full_check = 1;
+ mutex_unlock(&ep->mtx);
+ mutex_lock(&epmutex);
+ if (is_file_epoll(tf.file)) {
+ error = -ELOOP;
+ if (ep_loop_check(ep, tf.file) != 0) {
+ clear_tfile_check_list();
+ goto error_tgt_fput;
+ }
+ } else
+ list_add(&tf.file->f_tfile_llink,
+ &tfile_check_list);
+ mutex_lock_nested(&ep->mtx, 0);
+ if (is_file_epoll(tf.file)) {
+ tep = tf.file->private_data;
+ mutex_lock_nested(&tep->mtx, 1);
}
- } else
- list_add(&tf.file->f_tfile_llink, &tfile_check_list);
+ }
}
- mutex_lock_nested(&ep->mtx, 0);
-
/*
* Try to lookup the file inside our RB tree, Since we grabbed "mtx"
* above, we can be sure to be able to use the item looked up by
@@ -1879,10 +1920,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
case EPOLL_CTL_ADD:
if (!epi) {
epds.events |= POLLERR | POLLHUP;
- error = ep_insert(ep, &epds, tf.file, fd);
+ error = ep_insert(ep, &epds, tf.file, fd, full_check);
} else
error = -EEXIST;
- clear_tfile_check_list();
+ if (full_check)
+ clear_tfile_check_list();
break;
case EPOLL_CTL_DEL:
if (epi)
@@ -1898,10 +1940,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
error = -ENOENT;
break;
}
+ if (tep != NULL)
+ mutex_unlock(&tep->mtx);
mutex_unlock(&ep->mtx);
error_tgt_fput:
- if (did_lock_epmutex)
+ if (full_check)
mutex_unlock(&epmutex);
fdput(tf);
diff --git a/fs/exec.c b/fs/exec.c
index 8875dd10ae7..a3d33fe592d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -26,6 +26,7 @@
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/swap.h>
@@ -62,7 +63,6 @@
#include <trace/events/task.h>
#include "internal.h"
-#include "coredump.h"
#include <trace/events/sched.h>
@@ -98,6 +98,7 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
module_put(fmt->module);
}
+#ifdef CONFIG_USELIB
/*
* Note that a shared library must be both readable and executable due to
* security reasons.
@@ -106,6 +107,7 @@ static inline void put_binfmt(struct linux_binfmt * fmt)
*/
SYSCALL_DEFINE1(uselib, const char __user *, library)
{
+ struct linux_binfmt *fmt;
struct file *file;
struct filename *tmp = getname(library);
int error = PTR_ERR(tmp);
@@ -136,29 +138,27 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
fsnotify_open(file);
error = -ENOEXEC;
- if(file->f_op) {
- struct linux_binfmt * fmt;
- read_lock(&binfmt_lock);
- list_for_each_entry(fmt, &formats, lh) {
- if (!fmt->load_shlib)
- continue;
- if (!try_module_get(fmt->module))
- continue;
- read_unlock(&binfmt_lock);
- error = fmt->load_shlib(file);
- read_lock(&binfmt_lock);
- put_binfmt(fmt);
- if (error != -ENOEXEC)
- break;
- }
+ read_lock(&binfmt_lock);
+ list_for_each_entry(fmt, &formats, lh) {
+ if (!fmt->load_shlib)
+ continue;
+ if (!try_module_get(fmt->module))
+ continue;
read_unlock(&binfmt_lock);
+ error = fmt->load_shlib(file);
+ read_lock(&binfmt_lock);
+ put_binfmt(fmt);
+ if (error != -ENOEXEC)
+ break;
}
+ read_unlock(&binfmt_lock);
exit:
fput(file);
out:
return error;
}
+#endif /* #ifdef CONFIG_USELIB */
#ifdef CONFIG_MMU
/*
@@ -657,10 +657,10 @@ int setup_arg_pages(struct linux_binprm *bprm,
unsigned long rlim_stack;
#ifdef CONFIG_STACK_GROWSUP
- /* Limit stack size to 1GB */
+ /* Limit stack size */
stack_base = rlimit_max(RLIMIT_STACK);
- if (stack_base > (1 << 30))
- stack_base = 1 << 30;
+ if (stack_base > STACK_SIZE_MAX)
+ stack_base = STACK_SIZE_MAX;
/* Make sure we didn't let the argument array grow too large. */
if (vma->vm_end - vma->vm_start > stack_base)
@@ -751,11 +751,10 @@ EXPORT_SYMBOL(setup_arg_pages);
#endif /* CONFIG_MMU */
-struct file *open_exec(const char *name)
+static struct file *do_open_exec(struct filename *name)
{
struct file *file;
int err;
- struct filename tmp = { .name = name };
static const struct open_flags open_exec_flags = {
.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
.acc_mode = MAY_EXEC | MAY_OPEN,
@@ -763,7 +762,7 @@ struct file *open_exec(const char *name)
.lookup_flags = LOOKUP_FOLLOW,
};
- file = do_filp_open(AT_FDCWD, &tmp, &open_exec_flags);
+ file = do_filp_open(AT_FDCWD, name, &open_exec_flags);
if (IS_ERR(file))
goto out;
@@ -787,6 +786,12 @@ exit:
fput(file);
return ERR_PTR(err);
}
+
+struct file *open_exec(const char *name)
+{
+ struct filename tmp = { .name = name };
+ return do_open_exec(&tmp);
+}
EXPORT_SYMBOL(open_exec);
int kernel_read(struct file *file, loff_t offset,
@@ -808,7 +813,7 @@ EXPORT_SYMBOL(kernel_read);
ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
{
- ssize_t res = file->f_op->read(file, (void __user *)addr, len, &pos);
+ ssize_t res = vfs_read(file, (void __user *)addr, len, &pos);
if (res > 0)
flush_icache_range(addr, addr + len);
return res;
@@ -818,7 +823,7 @@ EXPORT_SYMBOL(read_code);
static int exec_mmap(struct mm_struct *mm)
{
struct task_struct *tsk;
- struct mm_struct * old_mm, *active_mm;
+ struct mm_struct *old_mm, *active_mm;
/* Notify parent that we're no longer interested in the old VM */
tsk = current;
@@ -844,8 +849,9 @@ static int exec_mmap(struct mm_struct *mm)
tsk->mm = mm;
tsk->active_mm = mm;
activate_mm(active_mm, mm);
+ tsk->mm->vmacache_seqnum = 0;
+ vmacache_flush(tsk);
task_unlock(tsk);
- arch_pick_mmap_layout(mm);
if (old_mm) {
up_read(&old_mm->mmap_sem);
BUG_ON(active_mm != old_mm);
@@ -1040,28 +1046,13 @@ EXPORT_SYMBOL_GPL(get_task_comm);
* so that a new one can be started
*/
-void set_task_comm(struct task_struct *tsk, char *buf)
+void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
{
task_lock(tsk);
trace_task_rename(tsk, buf);
strlcpy(tsk->comm, buf, sizeof(tsk->comm));
task_unlock(tsk);
- perf_event_comm(tsk);
-}
-
-static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len)
-{
- int i, ch;
-
- /* Copies the binary name from after last slash */
- for (i = 0; (ch = *(fn++)) != '\0';) {
- if (ch == '/')
- i = 0; /* overwrite what we wrote */
- else
- if (i < len - 1)
- tcomm[i++] = ch;
- }
- tcomm[i] = '\0';
+ perf_event_comm(tsk, exec);
}
int flush_old_exec(struct linux_binprm * bprm)
@@ -1077,8 +1068,6 @@ int flush_old_exec(struct linux_binprm * bprm)
goto out;
set_mm_exe_file(bprm->mm, bprm->file);
-
- filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm));
/*
* Release all of the old mmap stuff
*/
@@ -1090,8 +1079,8 @@ int flush_old_exec(struct linux_binprm * bprm)
bprm->mm = NULL; /* We're using it now */
set_fs(USER_DS);
- current->flags &=
- ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE);
+ current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD |
+ PF_NOFREEZE | PF_NO_SETAFFINITY);
flush_thread();
current->personality &= ~bprm->per_clear;
@@ -1121,7 +1110,8 @@ void setup_new_exec(struct linux_binprm * bprm)
else
set_dumpable(current->mm, suid_dumpable);
- set_task_comm(current, bprm->tcomm);
+ perf_event_exec();
+ __set_task_comm(current, kbasename(bprm->filename), true);
/* Set the new mm task size. We have to do that late because it may
* depend on TIF_32BIT which is only updated in flush_thread() on
@@ -1141,9 +1131,7 @@ void setup_new_exec(struct linux_binprm * bprm)
/* An exec changes our domain. We are no longer part of the thread
group */
-
current->self_exec_id++;
-
flush_signal_handlers(current, 0);
do_close_on_exec(current->files);
}
@@ -1168,13 +1156,17 @@ int prepare_bprm_creds(struct linux_binprm *bprm)
return -ENOMEM;
}
-void free_bprm(struct linux_binprm *bprm)
+static void free_bprm(struct linux_binprm *bprm)
{
free_arg_pages(bprm);
if (bprm->cred) {
mutex_unlock(&current->signal->cred_guard_mutex);
abort_creds(bprm->cred);
}
+ if (bprm->file) {
+ allow_write_access(bprm->file);
+ fput(bprm->file);
+ }
/* If a binfmt changed the interp, free it. */
if (bprm->interp != bprm->filename)
kfree(bprm->interp);
@@ -1226,11 +1218,10 @@ EXPORT_SYMBOL(install_exec_creds);
* - the caller must hold ->cred_guard_mutex to protect against
* PTRACE_ATTACH
*/
-static int check_unsafe_exec(struct linux_binprm *bprm)
+static void check_unsafe_exec(struct linux_binprm *bprm)
{
struct task_struct *p = current, *t;
unsigned n_fs;
- int res = 0;
if (p->ptrace) {
if (p->ptrace & PT_PTRACE_CAP)
@@ -1246,44 +1237,35 @@ static int check_unsafe_exec(struct linux_binprm *bprm)
if (current->no_new_privs)
bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
+ t = p;
n_fs = 1;
spin_lock(&p->fs->lock);
rcu_read_lock();
- for (t = next_thread(p); t != p; t = next_thread(t)) {
+ while_each_thread(p, t) {
if (t->fs == p->fs)
n_fs++;
}
rcu_read_unlock();
- if (p->fs->users > n_fs) {
+ if (p->fs->users > n_fs)
bprm->unsafe |= LSM_UNSAFE_SHARE;
- } else {
- res = -EAGAIN;
- if (!p->fs->in_exec) {
- p->fs->in_exec = 1;
- res = 1;
- }
- }
+ else
+ p->fs->in_exec = 1;
spin_unlock(&p->fs->lock);
-
- return res;
}
-/*
- * Fill the binprm structure from the inode.
+/*
+ * Fill the binprm structure from the inode.
* Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
*
* This may be called multiple times for binary chains (scripts for example).
*/
int prepare_binprm(struct linux_binprm *bprm)
{
- umode_t mode;
- struct inode * inode = file_inode(bprm->file);
+ struct inode *inode = file_inode(bprm->file);
+ umode_t mode = inode->i_mode;
int retval;
- mode = inode->i_mode;
- if (bprm->file->f_op == NULL)
- return -EACCES;
/* clear any previous set[ug]id data from a previous binary */
bprm->cred->euid = current_euid();
@@ -1385,10 +1367,6 @@ int search_binary_handler(struct linux_binprm *bprm)
if (retval)
return retval;
- retval = audit_bprm(bprm);
- if (retval)
- return retval;
-
retval = -ENOENT;
retry:
read_lock(&binfmt_lock);
@@ -1436,16 +1414,10 @@ static int exec_binprm(struct linux_binprm *bprm)
ret = search_binary_handler(bprm);
if (ret >= 0) {
+ audit_bprm(bprm);
trace_sched_process_exec(current, old_pid, bprm);
ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
- current->did_exec = 1;
proc_exec_connector(current);
-
- if (bprm->file) {
- allow_write_access(bprm->file);
- fput(bprm->file);
- bprm->file = NULL; /* to catch use-after-free */
- }
}
return ret;
@@ -1454,16 +1426,18 @@ static int exec_binprm(struct linux_binprm *bprm)
/*
* sys_execve() executes a new program.
*/
-static int do_execve_common(const char *filename,
+static int do_execve_common(struct filename *filename,
struct user_arg_ptr argv,
struct user_arg_ptr envp)
{
struct linux_binprm *bprm;
struct file *file;
struct files_struct *displaced;
- bool clear_in_exec;
int retval;
+ if (IS_ERR(filename))
+ return PTR_ERR(filename);
+
/*
* We move the actual failure in case of RLIMIT_NPROC excess from
* set*uid() to execve() because too many poorly written programs
@@ -1493,13 +1467,10 @@ static int do_execve_common(const char *filename,
if (retval)
goto out_free;
- retval = check_unsafe_exec(bprm);
- if (retval < 0)
- goto out_free;
- clear_in_exec = retval;
+ check_unsafe_exec(bprm);
current->in_execve = 1;
- file = open_exec(filename);
+ file = do_open_exec(filename);
retval = PTR_ERR(file);
if (IS_ERR(file))
goto out_unmark;
@@ -1507,12 +1478,11 @@ static int do_execve_common(const char *filename,
sched_exec();
bprm->file = file;
- bprm->filename = filename;
- bprm->interp = filename;
+ bprm->filename = bprm->interp = filename->name;
retval = bprm_mm_init(bprm);
if (retval)
- goto out_file;
+ goto out_unmark;
bprm->argc = count(argv, MAX_ARG_STRINGS);
if ((retval = bprm->argc) < 0)
@@ -1547,7 +1517,9 @@ static int do_execve_common(const char *filename,
current->fs->in_exec = 0;
current->in_execve = 0;
acct_update_integrals(current);
+ task_numa_free(current);
free_bprm(bprm);
+ putname(filename);
if (displaced)
put_files_struct(displaced);
return retval;
@@ -1558,15 +1530,8 @@ out:
mmput(bprm->mm);
}
-out_file:
- if (bprm->file) {
- allow_write_access(bprm->file);
- fput(bprm->file);
- }
-
out_unmark:
- if (clear_in_exec)
- current->fs->in_exec = 0;
+ current->fs->in_exec = 0;
current->in_execve = 0;
out_free:
@@ -1576,10 +1541,11 @@ out_files:
if (displaced)
reset_files_struct(displaced);
out_ret:
+ putname(filename);
return retval;
}
-int do_execve(const char *filename,
+int do_execve(struct filename *filename,
const char __user *const __user *__argv,
const char __user *const __user *__envp)
{
@@ -1589,7 +1555,7 @@ int do_execve(const char *filename,
}
#ifdef CONFIG_COMPAT
-static int compat_do_execve(const char *filename,
+static int compat_do_execve(struct filename *filename,
const compat_uptr_t __user *__argv,
const compat_uptr_t __user *__envp)
{
@@ -1616,61 +1582,22 @@ void set_binfmt(struct linux_binfmt *new)
if (new)
__module_get(new->module);
}
-
EXPORT_SYMBOL(set_binfmt);
/*
- * set_dumpable converts traditional three-value dumpable to two flags and
- * stores them into mm->flags. It modifies lower two bits of mm->flags, but
- * these bits are not changed atomically. So get_dumpable can observe the
- * intermediate state. To avoid doing unexpected behavior, get get_dumpable
- * return either old dumpable or new one by paying attention to the order of
- * modifying the bits.
- *
- * dumpable | mm->flags (binary)
- * old new | initial interim final
- * ---------+-----------------------
- * 0 1 | 00 01 01
- * 0 2 | 00 10(*) 11
- * 1 0 | 01 00 00
- * 1 2 | 01 11 11
- * 2 0 | 11 10(*) 00
- * 2 1 | 11 11 01
- *
- * (*) get_dumpable regards interim value of 10 as 11.
+ * set_dumpable stores three-value SUID_DUMP_* into mm->flags.
*/
void set_dumpable(struct mm_struct *mm, int value)
{
- switch (value) {
- case SUID_DUMP_DISABLE:
- clear_bit(MMF_DUMPABLE, &mm->flags);
- smp_wmb();
- clear_bit(MMF_DUMP_SECURELY, &mm->flags);
- break;
- case SUID_DUMP_USER:
- set_bit(MMF_DUMPABLE, &mm->flags);
- smp_wmb();
- clear_bit(MMF_DUMP_SECURELY, &mm->flags);
- break;
- case SUID_DUMP_ROOT:
- set_bit(MMF_DUMP_SECURELY, &mm->flags);
- smp_wmb();
- set_bit(MMF_DUMPABLE, &mm->flags);
- break;
- }
-}
+ unsigned long old, new;
-int __get_dumpable(unsigned long mm_flags)
-{
- int ret;
-
- ret = mm_flags & MMF_DUMPABLE_MASK;
- return (ret > SUID_DUMP_USER) ? SUID_DUMP_ROOT : ret;
-}
+ if (WARN_ON((unsigned)value > SUID_DUMP_ROOT))
+ return;
-int get_dumpable(struct mm_struct *mm)
-{
- return __get_dumpable(mm->flags);
+ do {
+ old = ACCESS_ONCE(mm->flags);
+ new = (old & ~MMF_DUMPABLE_MASK) | value;
+ } while (cmpxchg(&mm->flags, old, new) != old);
}
SYSCALL_DEFINE3(execve,
@@ -1678,25 +1605,13 @@ SYSCALL_DEFINE3(execve,
const char __user *const __user *, argv,
const char __user *const __user *, envp)
{
- struct filename *path = getname(filename);
- int error = PTR_ERR(path);
- if (!IS_ERR(path)) {
- error = do_execve(path->name, argv, envp);
- putname(path);
- }
- return error;
+ return do_execve(getname(filename), argv, envp);
}
#ifdef CONFIG_COMPAT
-asmlinkage long compat_sys_execve(const char __user * filename,
- const compat_uptr_t __user * argv,
- const compat_uptr_t __user * envp)
+COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
+ const compat_uptr_t __user *, argv,
+ const compat_uptr_t __user *, envp)
{
- struct filename *path = getname(filename);
- int error = PTR_ERR(path);
- if (!IS_ERR(path)) {
- error = compat_do_execve(path->name, argv, envp);
- putname(path);
- }
- return error;
+ return compat_do_execve(getname(filename), argv, envp);
}
#endif
diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore
index 1ca7fb7b6ba..2daf2329c28 100644
--- a/fs/exofs/Kconfig.ore
+++ b/fs/exofs/Kconfig.ore
@@ -9,4 +9,6 @@ config ORE
tristate
depends on EXOFS_FS || PNFS_OBJLAYOUT
select ASYNC_XOR
+ select RAID6_PQ
+ select ASYNC_PQ
default SCSI_OSD_ULD
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 491c6c078e7..71bf8e4fb5d 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -67,17 +67,17 @@ static int exofs_flush(struct file *file, fl_owner_t id)
const struct file_operations exofs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.open = generic_file_open,
.release = exofs_release_file,
.fsync = exofs_file_fsync,
.flush = exofs_flush,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
};
const struct inode_operations exofs_file_inode_operations = {
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index a52a5d23c30..3f9cafd7393 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -577,7 +577,7 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
if (offset >= i_size) {
*uptodate = true;
- EXOFS_DBGMSG("offset >= i_size index=0x%lx\n", index);
+ EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index);
return ZERO_PAGE(0);
}
@@ -596,10 +596,10 @@ static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
*uptodate = true;
else
*uptodate = PageUptodate(page);
- EXOFS_DBGMSG("index=0x%lx uptodate=%d\n", index, *uptodate);
+ EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate);
return page;
} else {
- EXOFS_DBGMSG("YES that_locked_page index=0x%lx\n",
+ EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n",
pcol->that_locked_page->index);
*uptodate = true;
return pcol->that_locked_page;
@@ -611,11 +611,11 @@ static void __r4w_put_page(void *priv, struct page *page)
struct page_collect *pcol = priv;
if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) {
- EXOFS_DBGMSG("index=0x%lx\n", page->index);
+ EXOFS_DBGMSG2("index=0x%lx\n", page->index);
page_cache_release(page);
return;
}
- EXOFS_DBGMSG("that_locked_page index=0x%lx\n",
+ EXOFS_DBGMSG2("that_locked_page index=0x%lx\n",
ZERO_PAGE(0) == page ? -1 : page->index);
}
@@ -961,6 +961,14 @@ static void exofs_invalidatepage(struct page *page, unsigned int offset,
WARN_ON(1);
}
+
+ /* TODO: Should be easy enough to do proprly */
+static ssize_t exofs_direct_IO(int rw, struct kiocb *iocb,
+ struct iov_iter *iter, loff_t offset)
+{
+ return 0;
+}
+
const struct address_space_operations exofs_aops = {
.readpage = exofs_readpage,
.readpages = exofs_readpages,
@@ -974,7 +982,7 @@ const struct address_space_operations exofs_aops = {
/* Not implemented Yet */
.bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
- .direct_IO = NULL, /* TODO: Should be trivial to do */
+ .direct_IO = exofs_direct_IO,
/* With these NULL has special meaning or default is not exported */
.get_xip_mem = NULL,
@@ -1010,7 +1018,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize)
if (likely(!ret))
truncate_setsize(inode, newsize);
- EXOFS_DBGMSG("(0x%lx) size=0x%llx ret=>%d\n",
+ EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n",
inode->i_ino, newsize, ret);
return ret;
}
@@ -1094,14 +1102,13 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
/* If object is lost on target we might as well enable it's
* delete.
*/
- if ((ret == -ENOENT) || (ret == -EINVAL))
- ret = 0;
+ ret = 0;
goto out;
}
ret = extract_attr_from_ios(ios, &attrs[0]);
if (ret) {
- EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
+ EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__);
goto out;
}
WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE);
@@ -1109,7 +1116,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
ret = extract_attr_from_ios(ios, &attrs[1]);
if (ret) {
- EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
+ EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__);
goto out;
}
if (attrs[1].len) {
@@ -1124,7 +1131,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
ret = extract_attr_from_ios(ios, &attrs[2]);
if (ret) {
- EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
+ EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__);
goto out;
}
if (attrs[2].len) {
@@ -1479,7 +1486,7 @@ void exofs_evict_inode(struct inode *inode)
struct ore_io_state *ios;
int ret;
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
/* TODO: should do better here */
if (inode->i_nlink || is_bad_inode(inode))
diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c
index b7442288860..cfc0205d62c 100644
--- a/fs/exofs/ore.c
+++ b/fs/exofs/ore.c
@@ -58,9 +58,12 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
layout->parity = 1;
break;
case PNFS_OSD_RAID_PQ:
+ layout->parity = 2;
+ break;
case PNFS_OSD_RAID_4:
default:
- ORE_ERR("Only RAID_0/5 for now\n");
+ ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n",
+ layout->raid_algorithm);
return -EINVAL;
}
if (0 != (layout->stripe_unit & ~PAGE_MASK)) {
@@ -103,7 +106,7 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
layout->max_io_length =
(BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) *
- layout->group_width;
+ (layout->group_width - layout->parity);
if (layout->parity) {
unsigned stripe_length =
(layout->group_width - layout->parity) *
@@ -112,6 +115,8 @@ int ore_verify_layout(unsigned total_comps, struct ore_layout *layout)
layout->max_io_length /= stripe_length;
layout->max_io_length *= stripe_length;
}
+ ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length);
+
return 0;
}
EXPORT_SYMBOL(ore_verify_layout);
@@ -286,7 +291,8 @@ int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc,
if (length) {
ore_calc_stripe_info(layout, offset, length, &ios->si);
ios->length = ios->si.length;
- ios->nr_pages = (ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
+ ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) +
+ ios->length + PAGE_SIZE - 1) / PAGE_SIZE;
if (layout->parity)
_ore_post_alloc_raid_stuff(ios);
}
@@ -430,8 +436,12 @@ int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error)
if (likely(!ret))
continue;
- if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) {
- /* start read offset passed endof file */
+ if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) &&
+ per_dev->bio) {
+ /* start read offset passed endof file.
+ * Note: if we do not have bio it means read-attributes
+ * In this case we should return error to caller.
+ */
_clear_bio(per_dev->bio);
ORE_DBGMSG("start read offset passed end of file "
"offset=0x%llx, length=0x%llx\n",
@@ -536,24 +546,28 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
u64 H = LmodS - G * T;
u32 N = div_u64(H, U);
+ u32 Nlast;
/* "H - (N * U)" is just "H % U" so it's bound to u32 */
u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width;
+ u32 first_dev = C - C % group_width;
div_u64_rem(file_offset, stripe_unit, &si->unit_off);
si->obj_offset = si->unit_off + (N * stripe_unit) +
(M * group_depth * stripe_unit);
+ si->cur_comp = C - first_dev;
+ si->cur_pg = si->unit_off / PAGE_SIZE;
if (parity) {
u32 LCMdP = lcm(group_width, parity) / parity;
/* R = N % LCMdP; */
u32 RxP = (N % LCMdP) * parity;
- u32 first_dev = C - C % group_width;
si->par_dev = (group_width + group_width - parity - RxP) %
group_width + first_dev;
- si->dev = (group_width + C - RxP) % group_width + first_dev;
+ si->dev = (group_width + group_width + C - RxP) %
+ group_width + first_dev;
si->bytes_in_stripe = U;
si->first_stripe_start = M * S + G * T + N * U;
} else {
@@ -568,6 +582,10 @@ void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset,
si->length = T - H;
if (si->length > length)
si->length = length;
+
+ Nlast = div_u64(H + si->length + U - 1, U);
+ si->maxdevUnits = Nlast - N;
+
si->M = M;
}
EXPORT_SYMBOL(ore_calc_stripe_info);
@@ -583,13 +601,16 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
int ret;
if (per_dev->bio == NULL) {
- unsigned pages_in_stripe = ios->layout->group_width *
- (ios->layout->stripe_unit / PAGE_SIZE);
- unsigned nr_pages = ios->nr_pages * ios->layout->group_width /
- (ios->layout->group_width -
- ios->layout->parity);
- unsigned bio_size = (nr_pages + pages_in_stripe) /
- ios->layout->group_width;
+ unsigned bio_size;
+
+ if (!ios->reading) {
+ bio_size = ios->si.maxdevUnits;
+ } else {
+ bio_size = (ios->si.maxdevUnits + 1) *
+ (ios->layout->group_width - ios->layout->parity) /
+ ios->layout->group_width;
+ }
+ bio_size *= (ios->layout->stripe_unit / PAGE_SIZE);
per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size);
if (unlikely(!per_dev->bio)) {
@@ -609,8 +630,12 @@ int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg,
added_len = bio_add_pc_page(q, per_dev->bio, pages[pg],
pglen, pgbase);
if (unlikely(pglen != added_len)) {
- ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=%u\n",
- per_dev->bio->bi_vcnt);
+ /* If bi_vcnt == bi_max then this is a SW BUG */
+ ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x "
+ "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n",
+ per_dev->bio->bi_vcnt,
+ per_dev->bio->bi_max_vecs,
+ BIO_MAX_PAGES_KMALLOC, cur_len);
ret = -ENOMEM;
goto out;
}
@@ -632,6 +657,43 @@ out: /* we fail the complete unit on an error eg don't advance
return ret;
}
+static int _add_parity_units(struct ore_io_state *ios,
+ struct ore_striping_info *si,
+ unsigned dev, unsigned first_dev,
+ unsigned mirrors_p1, unsigned devs_in_group,
+ unsigned cur_len)
+{
+ unsigned do_parity;
+ int ret = 0;
+
+ for (do_parity = ios->layout->parity; do_parity; --do_parity) {
+ struct ore_per_dev_state *per_dev;
+
+ per_dev = &ios->per_dev[dev - first_dev];
+ if (!per_dev->length && !per_dev->offset) {
+ /* Only/always the parity unit of the first
+ * stripe will be empty. So this is a chance to
+ * initialize the per_dev info.
+ */
+ per_dev->dev = dev;
+ per_dev->offset = si->obj_offset - si->unit_off;
+ }
+
+ ret = _ore_add_parity_unit(ios, si, per_dev, cur_len,
+ do_parity == 1);
+ if (unlikely(ret))
+ break;
+
+ if (do_parity != 1) {
+ dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
+ si->cur_comp = (si->cur_comp + 1) %
+ ios->layout->group_width;
+ }
+ }
+
+ return ret;
+}
+
static int _prepare_for_striping(struct ore_io_state *ios)
{
struct ore_striping_info *si = &ios->si;
@@ -641,7 +703,6 @@ static int _prepare_for_striping(struct ore_io_state *ios)
unsigned devs_in_group = group_width * mirrors_p1;
unsigned dev = si->dev;
unsigned first_dev = dev - (dev % devs_in_group);
- unsigned dev_order;
unsigned cur_pg = ios->pages_consumed;
u64 length = ios->length;
int ret = 0;
@@ -653,16 +714,13 @@ static int _prepare_for_striping(struct ore_io_state *ios)
BUG_ON(length > si->length);
- dev_order = _dev_order(devs_in_group, mirrors_p1, si->par_dev, dev);
- si->cur_comp = dev_order;
- si->cur_pg = si->unit_off / PAGE_SIZE;
-
while (length) {
- unsigned comp = dev - first_dev;
- struct ore_per_dev_state *per_dev = &ios->per_dev[comp];
+ struct ore_per_dev_state *per_dev =
+ &ios->per_dev[dev - first_dev];
unsigned cur_len, page_off = 0;
- if (!per_dev->length) {
+ if (!per_dev->length && !per_dev->offset) {
+ /* First time initialize the per_dev info. */
per_dev->dev = dev;
if (dev == si->dev) {
WARN_ON(dev == si->par_dev);
@@ -671,13 +729,7 @@ static int _prepare_for_striping(struct ore_io_state *ios)
page_off = si->unit_off & ~PAGE_MASK;
BUG_ON(page_off && (page_off != ios->pgbase));
} else {
- if (si->cur_comp > dev_order)
- per_dev->offset =
- si->obj_offset - si->unit_off;
- else /* si->cur_comp < dev_order */
- per_dev->offset =
- si->obj_offset + stripe_unit -
- si->unit_off;
+ per_dev->offset = si->obj_offset - si->unit_off;
cur_len = stripe_unit;
}
} else {
@@ -691,11 +743,9 @@ static int _prepare_for_striping(struct ore_io_state *ios)
if (unlikely(ret))
goto out;
- dev += mirrors_p1;
- dev = (dev % devs_in_group) + first_dev;
-
length -= cur_len;
+ dev = ((dev + mirrors_p1) % devs_in_group) + first_dev;
si->cur_comp = (si->cur_comp + 1) % group_width;
if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) {
if (!length && ios->sp2d) {
@@ -703,23 +753,16 @@ static int _prepare_for_striping(struct ore_io_state *ios)
* stripe. then operate on parity dev.
*/
dev = si->par_dev;
- }
- if (ios->sp2d)
- /* In writes cur_len just means if it's the
- * last one. See _ore_add_parity_unit.
- */
- cur_len = length;
- per_dev = &ios->per_dev[dev - first_dev];
- if (!per_dev->length) {
- /* Only/always the parity unit of the first
- * stripe will be empty. So this is a chance to
- * initialize the per_dev info.
- */
- per_dev->dev = dev;
- per_dev->offset = si->obj_offset - si->unit_off;
+ /* If last stripe operate on parity comp */
+ si->cur_comp = group_width - ios->layout->parity;
}
- ret = _ore_add_parity_unit(ios, si, per_dev, cur_len);
+ /* In writes cur_len just means if it's the
+ * last one. See _ore_add_parity_unit.
+ */
+ ret = _add_parity_units(ios, si, dev, first_dev,
+ mirrors_p1, devs_in_group,
+ ios->sp2d ? length : cur_len);
if (unlikely(ret))
goto out;
@@ -730,6 +773,8 @@ static int _prepare_for_striping(struct ore_io_state *ios)
/* Next stripe, start fresh */
si->cur_comp = 0;
si->cur_pg = 0;
+ si->obj_offset += cur_len;
+ si->unit_off = 0;
}
}
out:
@@ -1098,7 +1143,7 @@ int ore_truncate(struct ore_layout *layout, struct ore_components *oc,
size_attr->attr = g_attr_logical_length;
size_attr->attr.val_ptr = &size_attr->newsize;
- ORE_DBGMSG("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
+ ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n",
_LLU(oc->comps->obj.id), _LLU(obj_size), i);
ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1,
&size_attr->attr);
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index 7682b970d0f..7f20f25c232 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -21,12 +21,12 @@
#undef ORE_DBGMSG2
#define ORE_DBGMSG2 ORE_DBGMSG
-struct page *_raid_page_alloc(void)
+static struct page *_raid_page_alloc(void)
{
return alloc_page(GFP_KERNEL);
}
-void _raid_page_free(struct page *p)
+static void _raid_page_free(struct page *p)
{
__free_page(p);
}
@@ -218,22 +218,28 @@ static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d)
static void _gen_xor_unit(struct __stripe_pages_2d *sp2d)
{
unsigned p;
+ unsigned tx_flags = ASYNC_TX_ACK;
+
+ if (sp2d->parity == 1)
+ tx_flags |= ASYNC_TX_XOR_ZERO_DST;
+
for (p = 0; p < sp2d->pages_in_unit; p++) {
struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p];
if (!_1ps->write_count)
continue;
- init_async_submit(&_1ps->submit,
- ASYNC_TX_XOR_ZERO_DST | ASYNC_TX_ACK,
- NULL,
- NULL, NULL,
- (addr_conv_t *)_1ps->scribble);
-
- /* TODO: raid6 */
- _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], _1ps->pages,
- 0, sp2d->data_devs, PAGE_SIZE,
- &_1ps->submit);
+ init_async_submit(&_1ps->submit, tx_flags,
+ NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble);
+
+ if (sp2d->parity == 1)
+ _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs],
+ _1ps->pages, 0, sp2d->data_devs,
+ PAGE_SIZE, &_1ps->submit);
+ else /* parity == 2 */
+ _1ps->tx = async_gen_syndrome(_1ps->pages, 0,
+ sp2d->data_devs + sp2d->parity,
+ PAGE_SIZE, &_1ps->submit);
}
for (p = 0; p < sp2d->pages_in_unit; p++) {
@@ -404,9 +410,8 @@ static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset)
ore_calc_stripe_info(ios->layout, *offset, 0, &si);
- p = si.unit_off / PAGE_SIZE;
- c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1,
- ios->layout->mirrors_p1, si.par_dev, si.dev);
+ p = si.cur_pg;
+ c = si.cur_comp;
page = ios->sp2d->_1p_stripes[p].pages[c];
pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE);
@@ -534,9 +539,8 @@ static int _read_4_write_last_stripe(struct ore_io_state *ios)
goto read_it;
ore_calc_stripe_info(ios->layout, offset, 0, &read_si);
- p = read_si.unit_off / PAGE_SIZE;
- c = _dev_order(ios->layout->group_width * ios->layout->mirrors_p1,
- ios->layout->mirrors_p1, read_si.par_dev, read_si.dev);
+ p = read_si.cur_pg;
+ c = read_si.cur_comp;
if (min_p == sp2d->pages_in_unit) {
/* Didn't do it yet */
@@ -620,7 +624,7 @@ static int _read_4_write_execute(struct ore_io_state *ios)
int _ore_add_parity_unit(struct ore_io_state *ios,
struct ore_striping_info *si,
struct ore_per_dev_state *per_dev,
- unsigned cur_len)
+ unsigned cur_len, bool do_xor)
{
if (ios->reading) {
if (per_dev->cur_sg >= ios->sgs_per_dev) {
@@ -640,17 +644,16 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
si->cur_pg = _sp2d_min_pg(sp2d);
num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg;
- if (!cur_len) /* If last stripe operate on parity comp */
- si->cur_comp = sp2d->data_devs;
-
if (!per_dev->length) {
per_dev->offset += si->cur_pg * PAGE_SIZE;
/* If first stripe, Read in all read4write pages
* (if needed) before we calculate the first parity.
*/
- _read_4_write_first_stripe(ios);
+ if (do_xor)
+ _read_4_write_first_stripe(ios);
}
- if (!cur_len) /* If last stripe r4w pages of last stripe */
+ if (!cur_len && do_xor)
+ /* If last stripe r4w pages of last stripe */
_read_4_write_last_stripe(ios);
_read_4_write_execute(ios);
@@ -662,7 +665,7 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
++(ios->cur_par_page);
}
- BUG_ON(si->cur_comp != sp2d->data_devs);
+ BUG_ON(si->cur_comp < sp2d->data_devs);
BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit);
ret = _ore_add_stripe_unit(ios, &array_start, 0, pages,
@@ -670,9 +673,10 @@ int _ore_add_parity_unit(struct ore_io_state *ios,
if (unlikely(ret))
return ret;
- /* TODO: raid6 if (last_parity_dev) */
- _gen_xor_unit(sp2d);
- _sp2d_reset(sp2d, ios->r4w, ios->private);
+ if (do_xor) {
+ _gen_xor_unit(sp2d);
+ _sp2d_reset(sp2d, ios->r4w, ios->private);
+ }
}
return 0;
}
diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h
index 2ffd2c3c6e4..cf6375d8212 100644
--- a/fs/exofs/ore_raid.h
+++ b/fs/exofs/ore_raid.h
@@ -31,24 +31,6 @@
#define ORE_DBGMSG2(M...) do {} while (0)
/* #define ORE_DBGMSG2 ORE_DBGMSG */
-/* Calculate the component order in a stripe. eg the logical data unit
- * address within the stripe of @dev given the @par_dev of this stripe.
- */
-static inline unsigned _dev_order(unsigned devs_in_group, unsigned mirrors_p1,
- unsigned par_dev, unsigned dev)
-{
- unsigned first_dev = dev - dev % devs_in_group;
-
- dev -= first_dev;
- par_dev -= first_dev;
-
- if (devs_in_group == par_dev) /* The raid 0 case */
- return dev / mirrors_p1;
- /* raid4/5/6 case */
- return ((devs_in_group + dev - par_dev - mirrors_p1) % devs_in_group) /
- mirrors_p1;
-}
-
/* ios_raid.c stuff needed by ios.c */
int _ore_post_alloc_raid_stuff(struct ore_io_state *ios);
void _ore_free_raid_stuff(struct ore_io_state *ios);
@@ -56,7 +38,8 @@ void _ore_free_raid_stuff(struct ore_io_state *ios);
void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len,
bool not_last);
int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si,
- struct ore_per_dev_state *per_dev, unsigned cur_len);
+ struct ore_per_dev_state *per_dev, unsigned cur_len,
+ bool do_xor);
void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d,
struct ore_striping_info *si, struct page *page);
static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d,
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 9d976332873..ed73ed8ebbe 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -543,7 +543,7 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
return !(odi->systemid_len || odi->osdname_len);
}
-int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
+static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs,
struct exofs_dev **peds)
{
struct __alloc_ore_devs_and_exofs_devs {
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index a235f001688..b01fbfb51f4 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -69,145 +69,162 @@ find_acceptable_alias(struct dentry *result,
return NULL;
}
-/*
- * Find root of a disconnected subtree and return a reference to it.
- */
-static struct dentry *
-find_disconnected_root(struct dentry *dentry)
+static bool dentry_connected(struct dentry *dentry)
{
dget(dentry);
- while (!IS_ROOT(dentry)) {
+ while (dentry->d_flags & DCACHE_DISCONNECTED) {
struct dentry *parent = dget_parent(dentry);
- if (!(parent->d_flags & DCACHE_DISCONNECTED)) {
+ dput(dentry);
+ if (IS_ROOT(dentry)) {
dput(parent);
- break;
+ return false;
}
+ dentry = parent;
+ }
+ dput(dentry);
+ return true;
+}
+
+static void clear_disconnected(struct dentry *dentry)
+{
+ dget(dentry);
+ while (dentry->d_flags & DCACHE_DISCONNECTED) {
+ struct dentry *parent = dget_parent(dentry);
+
+ WARN_ON_ONCE(IS_ROOT(dentry));
+
+ spin_lock(&dentry->d_lock);
+ dentry->d_flags &= ~DCACHE_DISCONNECTED;
+ spin_unlock(&dentry->d_lock);
dput(dentry);
dentry = parent;
}
- return dentry;
+ dput(dentry);
+}
+
+/*
+ * Reconnect a directory dentry with its parent.
+ *
+ * This can return a dentry, or NULL, or an error.
+ *
+ * In the first case the returned dentry is the parent of the given
+ * dentry, and may itself need to be reconnected to its parent.
+ *
+ * In the NULL case, a concurrent VFS operation has either renamed or
+ * removed this directory. The concurrent operation has reconnected our
+ * dentry, so we no longer need to.
+ */
+static struct dentry *reconnect_one(struct vfsmount *mnt,
+ struct dentry *dentry, char *nbuf)
+{
+ struct dentry *parent;
+ struct dentry *tmp;
+ int err;
+
+ parent = ERR_PTR(-EACCES);
+ mutex_lock(&dentry->d_inode->i_mutex);
+ if (mnt->mnt_sb->s_export_op->get_parent)
+ parent = mnt->mnt_sb->s_export_op->get_parent(dentry);
+ mutex_unlock(&dentry->d_inode->i_mutex);
+
+ if (IS_ERR(parent)) {
+ dprintk("%s: get_parent of %ld failed, err %d\n",
+ __func__, dentry->d_inode->i_ino, PTR_ERR(parent));
+ return parent;
+ }
+
+ dprintk("%s: find name of %lu in %lu\n", __func__,
+ dentry->d_inode->i_ino, parent->d_inode->i_ino);
+ err = exportfs_get_name(mnt, parent, nbuf, dentry);
+ if (err == -ENOENT)
+ goto out_reconnected;
+ if (err)
+ goto out_err;
+ dprintk("%s: found name: %s\n", __func__, nbuf);
+ mutex_lock(&parent->d_inode->i_mutex);
+ tmp = lookup_one_len(nbuf, parent, strlen(nbuf));
+ mutex_unlock(&parent->d_inode->i_mutex);
+ if (IS_ERR(tmp)) {
+ dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp));
+ goto out_err;
+ }
+ if (tmp != dentry) {
+ dput(tmp);
+ goto out_reconnected;
+ }
+ dput(tmp);
+ if (IS_ROOT(dentry)) {
+ err = -ESTALE;
+ goto out_err;
+ }
+ return parent;
+
+out_err:
+ dput(parent);
+ return ERR_PTR(err);
+out_reconnected:
+ dput(parent);
+ /*
+ * Someone must have renamed our entry into another parent, in
+ * which case it has been reconnected by the rename.
+ *
+ * Or someone removed it entirely, in which case filehandle
+ * lookup will succeed but the directory is now IS_DEAD and
+ * subsequent operations on it will fail.
+ *
+ * Alternatively, maybe there was no race at all, and the
+ * filesystem is just corrupt and gave us a parent that doesn't
+ * actually contain any entry pointing to this inode. So,
+ * double check that this worked and return -ESTALE if not:
+ */
+ if (!dentry_connected(dentry))
+ return ERR_PTR(-ESTALE);
+ return NULL;
}
/*
* Make sure target_dir is fully connected to the dentry tree.
*
- * It may already be, as the flag isn't always updated when connection happens.
+ * On successful return, DCACHE_DISCONNECTED will be cleared on
+ * target_dir, and target_dir->d_parent->...->d_parent will reach the
+ * root of the filesystem.
+ *
+ * Whenever DCACHE_DISCONNECTED is unset, target_dir is fully connected.
+ * But the converse is not true: target_dir may have DCACHE_DISCONNECTED
+ * set but already be connected. In that case we'll verify the
+ * connection to root and then clear the flag.
+ *
+ * Note that target_dir could be removed by a concurrent operation. In
+ * that case reconnect_path may still succeed with target_dir fully
+ * connected, but further operations using the filehandle will fail when
+ * necessary (due to S_DEAD being set on the directory).
*/
static int
reconnect_path(struct vfsmount *mnt, struct dentry *target_dir, char *nbuf)
{
- int noprogress = 0;
- int err = -ESTALE;
+ struct dentry *dentry, *parent;
- /*
- * It is possible that a confused file system might not let us complete
- * the path to the root. For example, if get_parent returns a directory
- * in which we cannot find a name for the child. While this implies a
- * very sick filesystem we don't want it to cause knfsd to spin. Hence
- * the noprogress counter. If we go through the loop 10 times (2 is
- * probably enough) without getting anywhere, we just give up
- */
- while (target_dir->d_flags & DCACHE_DISCONNECTED && noprogress++ < 10) {
- struct dentry *pd = find_disconnected_root(target_dir);
-
- if (!IS_ROOT(pd)) {
- /* must have found a connected parent - great */
- spin_lock(&pd->d_lock);
- pd->d_flags &= ~DCACHE_DISCONNECTED;
- spin_unlock(&pd->d_lock);
- noprogress = 0;
- } else if (pd == mnt->mnt_sb->s_root) {
- printk(KERN_ERR "export: Eeek filesystem root is not connected, impossible\n");
- spin_lock(&pd->d_lock);
- pd->d_flags &= ~DCACHE_DISCONNECTED;
- spin_unlock(&pd->d_lock);
- noprogress = 0;
- } else {
- /*
- * We have hit the top of a disconnected path, try to
- * find parent and connect.
- *
- * Racing with some other process renaming a directory
- * isn't much of a problem here. If someone renames
- * the directory, it will end up properly connected,
- * which is what we want
- *
- * Getting the parent can't be supported generically,
- * the locking is too icky.
- *
- * Instead we just return EACCES. If server reboots
- * or inodes get flushed, you lose
- */
- struct dentry *ppd = ERR_PTR(-EACCES);
- struct dentry *npd;
-
- mutex_lock(&pd->d_inode->i_mutex);
- if (mnt->mnt_sb->s_export_op->get_parent)
- ppd = mnt->mnt_sb->s_export_op->get_parent(pd);
- mutex_unlock(&pd->d_inode->i_mutex);
-
- if (IS_ERR(ppd)) {
- err = PTR_ERR(ppd);
- dprintk("%s: get_parent of %ld failed, err %d\n",
- __func__, pd->d_inode->i_ino, err);
- dput(pd);
- break;
- }
+ dentry = dget(target_dir);
- dprintk("%s: find name of %lu in %lu\n", __func__,
- pd->d_inode->i_ino, ppd->d_inode->i_ino);
- err = exportfs_get_name(mnt, ppd, nbuf, pd);
- if (err) {
- dput(ppd);
- dput(pd);
- if (err == -ENOENT)
- /* some race between get_parent and
- * get_name? just try again
- */
- continue;
- break;
- }
- dprintk("%s: found name: %s\n", __func__, nbuf);
- mutex_lock(&ppd->d_inode->i_mutex);
- npd = lookup_one_len(nbuf, ppd, strlen(nbuf));
- mutex_unlock(&ppd->d_inode->i_mutex);
- if (IS_ERR(npd)) {
- err = PTR_ERR(npd);
- dprintk("%s: lookup failed: %d\n",
- __func__, err);
- dput(ppd);
- dput(pd);
- break;
- }
- /* we didn't really want npd, we really wanted
- * a side-effect of the lookup.
- * hopefully, npd == pd, though it isn't really
- * a problem if it isn't
- */
- if (npd == pd)
- noprogress = 0;
- else
- printk("%s: npd != pd\n", __func__);
- dput(npd);
- dput(ppd);
- if (IS_ROOT(pd)) {
- /* something went wrong, we have to give up */
- dput(pd);
- break;
- }
- }
- dput(pd);
- }
+ while (dentry->d_flags & DCACHE_DISCONNECTED) {
+ BUG_ON(dentry == mnt->mnt_sb->s_root);
- if (target_dir->d_flags & DCACHE_DISCONNECTED) {
- /* something went wrong - oh-well */
- if (!err)
- err = -ESTALE;
- return err;
- }
+ if (IS_ROOT(dentry))
+ parent = reconnect_one(mnt, dentry, nbuf);
+ else
+ parent = dget_parent(dentry);
+ if (!parent)
+ break;
+ dput(dentry);
+ if (IS_ERR(parent))
+ return PTR_ERR(parent);
+ dentry = parent;
+ }
+ dput(dentry);
+ clear_disconnected(target_dir);
return 0;
}
@@ -215,7 +232,7 @@ struct getdents_callback {
struct dir_context ctx;
char *name; /* name that was found. It already points to a
buffer NAME_MAX+1 is size */
- unsigned long ino; /* the inum we are looking for */
+ u64 ino; /* the inum we are looking for */
int found; /* inode matched? */
int sequence; /* sequence counter */
};
@@ -242,7 +259,7 @@ static int filldir_one(void * __buf, const char * name, int len,
/**
* get_name - default export_operations->get_name function
- * @dentry: the directory in which to find a name
+ * @path: the directory in which to find a name
* @name: a pointer to a %NAME_MAX+1 char buffer to store the name
* @child: the dentry for the child directory.
*
@@ -255,10 +272,14 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
struct inode *dir = path->dentry->d_inode;
int error;
struct file *file;
+ struct kstat stat;
+ struct path child_path = {
+ .mnt = path->mnt,
+ .dentry = child,
+ };
struct getdents_callback buffer = {
.ctx.actor = filldir_one,
.name = name,
- .ino = child->d_inode->i_ino
};
error = -ENOTDIR;
@@ -268,6 +289,16 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
if (!dir->i_fop)
goto out;
/*
+ * inode->i_ino is unsigned long, kstat->ino is u64, so the
+ * former would be insufficient on 32-bit hosts when the
+ * filesystem supports 64-bit inode numbers. So we need to
+ * actually call ->getattr, not just read i_ino:
+ */
+ error = vfs_getattr_nosec(&child_path, &stat);
+ if (error)
+ return error;
+ buffer.ino = stat.ino;
+ /*
* Open the directory ...
*/
file = dentry_open(path, O_RDONLY, cred);
@@ -306,7 +337,7 @@ out:
/**
* export_encode_fh - default export_operations->encode_fh function
* @inode: the object to encode
- * @fh: where to store the file handle fragment
+ * @fid: where to store the file handle fragment
* @max_len: maximum length to store there
* @parent: parent directory inode, if wanted
*
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 110b6b371a4..27695e6f4e4 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -4,7 +4,6 @@
* Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
*/
-#include <linux/capability.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/slab.h>
@@ -148,13 +147,6 @@ ext2_get_acl(struct inode *inode, int type)
struct posix_acl *acl;
int retval;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return NULL;
-
- acl = get_cached_acl(inode, type);
- if (acl != ACL_NOT_CACHED)
- return acl;
-
switch (type) {
case ACL_TYPE_ACCESS:
name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -189,19 +181,14 @@ ext2_get_acl(struct inode *inode, int type)
/*
* inode->i_mutex: down
*/
-static int
-ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+int
+ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
int name_index;
void *value = NULL;
size_t size = 0;
int error;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return 0;
-
switch(type) {
case ACL_TYPE_ACCESS:
name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -250,169 +237,21 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
int
ext2_init_acl(struct inode *inode, struct inode *dir)
{
- struct posix_acl *acl = NULL;
- int error = 0;
-
- if (!S_ISLNK(inode->i_mode)) {
- if (test_opt(dir->i_sb, POSIX_ACL)) {
- acl = ext2_get_acl(dir, ACL_TYPE_DEFAULT);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- }
- if (!acl)
- inode->i_mode &= ~current_umask();
- }
- if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
- if (S_ISDIR(inode->i_mode)) {
- error = ext2_set_acl(inode, ACL_TYPE_DEFAULT, acl);
- if (error)
- goto cleanup;
- }
- error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
- if (error < 0)
- return error;
- if (error > 0) {
- /* This is an extended ACL */
- error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl);
- }
- }
-cleanup:
- posix_acl_release(acl);
- return error;
-}
-
-/*
- * Does chmod for an inode that may have an Access Control List. The
- * inode->i_mode field must be updated to the desired value by the caller
- * before calling this function.
- * Returns 0 on success, or a negative error number.
- *
- * We change the ACL rather than storing some ACL entries in the file
- * mode permission bits (which would be more efficient), because that
- * would break once additional permissions (like ACL_APPEND, ACL_DELETE
- * for directories) are added. There are no more bits available in the
- * file mode.
- *
- * inode->i_mutex: down
- */
-int
-ext2_acl_chmod(struct inode *inode)
-{
- struct posix_acl *acl;
- int error;
+ struct posix_acl *default_acl, *acl;
+ int error;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return 0;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
- acl = ext2_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
- error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+ error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
if (error)
return error;
- error = ext2_set_acl(inode, ACL_TYPE_ACCESS, acl);
- posix_acl_release(acl);
- return error;
-}
-/*
- * Extended attribut handlers
- */
-static size_t
-ext2_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
-{
- const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-
- if (!test_opt(dentry->d_sb, POSIX_ACL))
- return 0;
- if (list && size <= list_size)
- memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
- return size;
-}
-
-static size_t
-ext2_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
-{
- const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-
- if (!test_opt(dentry->d_sb, POSIX_ACL))
- return 0;
- if (list && size <= list_size)
- memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
- return size;
-}
-
-static int
-ext2_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
- size_t size, int type)
-{
- struct posix_acl *acl;
- int error;
-
- if (strcmp(name, "") != 0)
- return -EINVAL;
- if (!test_opt(dentry->d_sb, POSIX_ACL))
- return -EOPNOTSUPP;
-
- acl = ext2_get_acl(dentry->d_inode, type);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
- error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- posix_acl_release(acl);
-
- return error;
-}
-
-static int
-ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags, int type)
-{
- struct posix_acl *acl;
- int error;
-
- if (strcmp(name, "") != 0)
- return -EINVAL;
- if (!test_opt(dentry->d_sb, POSIX_ACL))
- return -EOPNOTSUPP;
- if (!inode_owner_or_capable(dentry->d_inode))
- return -EPERM;
-
- if (value) {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- else if (acl) {
- error = posix_acl_valid(acl);
- if (error)
- goto release_and_out;
- }
- } else
- acl = NULL;
-
- error = ext2_set_acl(dentry->d_inode, type, acl);
-
-release_and_out:
- posix_acl_release(acl);
+ if (default_acl) {
+ error = ext2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ posix_acl_release(default_acl);
+ }
+ if (acl) {
+ if (!error)
+ error = ext2_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ posix_acl_release(acl);
+ }
return error;
}
-
-const struct xattr_handler ext2_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .list = ext2_xattr_list_acl_access,
- .get = ext2_xattr_get_acl,
- .set = ext2_xattr_set_acl,
-};
-
-const struct xattr_handler ext2_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .list = ext2_xattr_list_acl_default,
- .get = ext2_xattr_get_acl,
- .set = ext2_xattr_set_acl,
-};
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index 503bfb0ed79..44937f9fcf3 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -55,7 +55,7 @@ static inline int ext2_acl_count(size_t size)
/* acl.c */
extern struct posix_acl *ext2_get_acl(struct inode *inode, int type);
-extern int ext2_acl_chmod (struct inode *);
+extern int ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
extern int ext2_init_acl (struct inode *, struct inode *);
#else
@@ -63,12 +63,6 @@ extern int ext2_init_acl (struct inode *, struct inode *);
#define ext2_get_acl NULL
#define ext2_set_acl NULL
-static inline int
-ext2_acl_chmod (struct inode *inode)
-{
- return 0;
-}
-
static inline int ext2_init_acl (struct inode *inode, struct inode *dir)
{
return 0;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index a5b3a5db312..7c87b22a722 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -62,10 +62,10 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
*/
const struct file_operations ext2_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.unlocked_ioctl = ext2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext2_compat_ioctl,
@@ -75,7 +75,7 @@ const struct file_operations ext2_file_operations = {
.release = ext2_release_file,
.fsync = ext2_fsync,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
};
#ifdef CONFIG_EXT2_FS_XIP
@@ -103,5 +103,6 @@ const struct inode_operations ext2_file_inode_operations = {
#endif
.setattr = ext2_setattr,
.get_acl = ext2_get_acl,
+ .set_acl = ext2_set_acl,
.fiemap = ext2_fiemap,
};
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 7cadd823bb3..7d66fb0e4cc 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -284,7 +284,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
int best_ndir = inodes_per_group;
int best_group = -1;
- get_random_bytes(&group, sizeof(group));
+ group = prandom_u32();
parent_group = (unsigned)group % ngroups;
for (i = 0; i < ngroups; i++) {
group = (parent_group + i) % ngroups;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index c260de6d7b6..36d35c36311 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -78,7 +78,7 @@ void ext2_evict_inode(struct inode * inode)
dquot_drop(inode);
}
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
if (want_delete) {
sb_start_intwrite(inode->i_sb);
@@ -632,6 +632,8 @@ static int ext2_get_blocks(struct inode *inode,
int count = 0;
ext2_fsblk_t first_block = 0;
+ BUG_ON(maxblocks == 0);
+
depth = ext2_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
if (depth == 0)
@@ -848,18 +850,18 @@ static sector_t ext2_bmap(struct address_space *mapping, sector_t block)
}
static ssize_t
-ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
+ size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
- ext2_get_block);
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block);
if (ret < 0 && (rw & WRITE))
- ext2_write_failed(mapping, offset + iov_length(iov, nr_segs));
+ ext2_write_failed(mapping, offset + count);
return ret;
}
@@ -1564,7 +1566,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
}
setattr_copy(inode, iattr);
if (iattr->ia_valid & ATTR_MODE)
- error = ext2_acl_chmod(inode);
+ error = posix_acl_chmod(inode, inode->i_mode);
mark_inode_dirty(inode);
return error;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 256dd5f4c1c..c268d0af1db 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -421,6 +421,7 @@ const struct inode_operations ext2_dir_inode_operations = {
#endif
.setattr = ext2_setattr,
.get_acl = ext2_get_acl,
+ .set_acl = ext2_set_acl,
.tmpfile = ext2_tmpfile,
};
@@ -433,4 +434,5 @@ const struct inode_operations ext2_special_inode_operations = {
#endif
.setattr = ext2_setattr,
.get_acl = ext2_get_acl,
+ .set_acl = ext2_set_acl,
};
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 288534920fe..3750031cfa2 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -192,7 +192,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
ext2_inode_cachep = kmem_cache_create("ext2_inode_cache",
sizeof(struct ext2_inode_info),
@@ -1254,6 +1254,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
unsigned long old_sb_flags;
int err;
+ sync_filesystem(sb);
spin_lock(&sbi->s_lock);
/* Store the old options */
@@ -1493,6 +1494,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
sb->s_blocksize - offset : towrite;
tmp_bh.b_state = 0;
+ tmp_bh.b_size = sb->s_blocksize;
err = ext2_get_block(inode, blk, &tmp_bh, 1);
if (err < 0)
goto out;
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 2d7557db3ae..91426141c33 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -103,8 +103,8 @@ static struct mb_cache *ext2_xattr_cache;
static const struct xattr_handler *ext2_xattr_handler_map[] = {
[EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler,
#ifdef CONFIG_EXT2_FS_POSIX_ACL
- [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler,
- [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext2_xattr_acl_default_handler,
+ [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
+ [EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
#endif
[EXT2_XATTR_INDEX_TRUSTED] = &ext2_xattr_trusted_handler,
#ifdef CONFIG_EXT2_FS_SECURITY
@@ -116,8 +116,8 @@ const struct xattr_handler *ext2_xattr_handlers[] = {
&ext2_xattr_user_handler,
&ext2_xattr_trusted_handler,
#ifdef CONFIG_EXT2_FS_POSIX_ACL
- &ext2_xattr_acl_access_handler,
- &ext2_xattr_acl_default_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
#ifdef CONFIG_EXT2_FS_SECURITY
&ext2_xattr_security_handler,
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index 5e41cccff76..60edf298644 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -57,8 +57,6 @@ struct ext2_xattr_entry {
extern const struct xattr_handler ext2_xattr_user_handler;
extern const struct xattr_handler ext2_xattr_trusted_handler;
-extern const struct xattr_handler ext2_xattr_acl_access_handler;
-extern const struct xattr_handler ext2_xattr_acl_default_handler;
extern const struct xattr_handler ext2_xattr_security_handler;
extern ssize_t ext2_listxattr(struct dentry *, char *, size_t);
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index cfedb2cb0d8..c0ebc4db884 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -42,8 +42,8 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name,
value, size, flags);
}
-int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
- void *fs_info)
+static int ext2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
{
const struct xattr *xattr;
int err = 0;
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
index 1c3312858fc..e98171a11cf 100644
--- a/fs/ext2/xip.c
+++ b/fs/ext2/xip.c
@@ -35,6 +35,7 @@ __ext2_get_block(struct inode *inode, pgoff_t pgoff, int create,
int rc;
memset(&tmp, 0, sizeof(struct buffer_head));
+ tmp.b_size = 1 << inode->i_blkbits;
rc = ext2_get_block(inode, pgoff, &tmp, create);
*result = tmp.b_blocknr;
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index dbb5ad59a7f..8bbaf5bcf98 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -145,13 +145,6 @@ ext3_get_acl(struct inode *inode, int type)
struct posix_acl *acl;
int retval;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return NULL;
-
- acl = get_cached_acl(inode, type);
- if (acl != ACL_NOT_CACHED)
- return acl;
-
switch (type) {
case ACL_TYPE_ACCESS:
name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -190,7 +183,7 @@ ext3_get_acl(struct inode *inode, int type)
* inode->i_mutex: down unless called from ext3_new_inode
*/
static int
-ext3_set_acl(handle_t *handle, struct inode *inode, int type,
+__ext3_set_acl(handle_t *handle, struct inode *inode, int type,
struct posix_acl *acl)
{
int name_index;
@@ -198,9 +191,6 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
size_t size = 0;
int error;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
switch(type) {
case ACL_TYPE_ACCESS:
name_index = EXT3_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -243,204 +233,49 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type,
return error;
}
-/*
- * Initialize the ACLs of a new inode. Called from ext3_new_inode.
- *
- * dir->i_mutex: down
- * inode->i_mutex: up (access to inode is still exclusive)
- */
int
-ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
+ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- struct posix_acl *acl = NULL;
- int error = 0;
-
- if (!S_ISLNK(inode->i_mode)) {
- if (test_opt(dir->i_sb, POSIX_ACL)) {
- acl = ext3_get_acl(dir, ACL_TYPE_DEFAULT);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- }
- if (!acl)
- inode->i_mode &= ~current_umask();
- }
- if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
- if (S_ISDIR(inode->i_mode)) {
- error = ext3_set_acl(handle, inode,
- ACL_TYPE_DEFAULT, acl);
- if (error)
- goto cleanup;
- }
- error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
- if (error < 0)
- return error;
-
- if (error > 0) {
- /* This is an extended ACL */
- error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
- }
- }
-cleanup:
- posix_acl_release(acl);
- return error;
-}
-
-/*
- * Does chmod for an inode that may have an Access Control List. The
- * inode->i_mode field must be updated to the desired value by the caller
- * before calling this function.
- * Returns 0 on success, or a negative error number.
- *
- * We change the ACL rather than storing some ACL entries in the file
- * mode permission bits (which would be more efficient), because that
- * would break once additional permissions (like ACL_APPEND, ACL_DELETE
- * for directories) are added. There are no more bits available in the
- * file mode.
- *
- * inode->i_mutex: down
- */
-int
-ext3_acl_chmod(struct inode *inode)
-{
- struct posix_acl *acl;
handle_t *handle;
- int retries = 0;
- int error;
+ int error, retries = 0;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return 0;
- acl = ext3_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
- error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
- if (error)
- return error;
retry:
- handle = ext3_journal_start(inode,
- EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- ext3_std_error(inode->i_sb, error);
- goto out;
- }
- error = ext3_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
+ handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ error = __ext3_set_acl(handle, inode, type, acl);
ext3_journal_stop(handle);
- if (error == -ENOSPC &&
- ext3_should_retry_alloc(inode->i_sb, &retries))
+ if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
goto retry;
-out:
- posix_acl_release(acl);
return error;
}
/*
- * Extended attribute handlers
+ * Initialize the ACLs of a new inode. Called from ext3_new_inode.
+ *
+ * dir->i_mutex: down
+ * inode->i_mutex: up (access to inode is still exclusive)
*/
-static size_t
-ext3_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len,
- const char *name, size_t name_len, int type)
-{
- const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-
- if (!test_opt(dentry->d_sb, POSIX_ACL))
- return 0;
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
- return size;
-}
-
-static size_t
-ext3_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len,
- const char *name, size_t name_len, int type)
-{
- const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-
- if (!test_opt(dentry->d_sb, POSIX_ACL))
- return 0;
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
- return size;
-}
-
-static int
-ext3_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
- size_t size, int type)
+int
+ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
{
- struct posix_acl *acl;
+ struct posix_acl *default_acl, *acl;
int error;
- if (strcmp(name, "") != 0)
- return -EINVAL;
- if (!test_opt(dentry->d_sb, POSIX_ACL))
- return -EOPNOTSUPP;
-
- acl = ext3_get_acl(dentry->d_inode, type);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
- error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- posix_acl_release(acl);
-
- return error;
-}
-
-static int
-ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags, int type)
-{
- struct inode *inode = dentry->d_inode;
- handle_t *handle;
- struct posix_acl *acl;
- int error, retries = 0;
-
- if (strcmp(name, "") != 0)
- return -EINVAL;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return -EOPNOTSUPP;
- if (!inode_owner_or_capable(inode))
- return -EPERM;
-
- if (value) {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- else if (acl) {
- error = posix_acl_valid(acl);
- if (error)
- goto release_and_out;
- }
- } else
- acl = NULL;
-
-retry:
- handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- error = ext3_set_acl(handle, inode, type, acl);
- ext3_journal_stop(handle);
- if (error == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
+ error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (error)
+ return error;
-release_and_out:
- posix_acl_release(acl);
+ if (default_acl) {
+ error = __ext3_set_acl(handle, inode, ACL_TYPE_DEFAULT,
+ default_acl);
+ posix_acl_release(default_acl);
+ }
+ if (acl) {
+ if (!error)
+ error = __ext3_set_acl(handle, inode, ACL_TYPE_ACCESS,
+ acl);
+ posix_acl_release(acl);
+ }
return error;
}
-
-const struct xattr_handler ext3_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .list = ext3_xattr_list_acl_access,
- .get = ext3_xattr_get_acl,
- .set = ext3_xattr_set_acl,
-};
-
-const struct xattr_handler ext3_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .list = ext3_xattr_list_acl_default,
- .get = ext3_xattr_get_acl,
- .set = ext3_xattr_set_acl,
-};
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index dbc921e458c..ea1c69edab9 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -55,18 +55,13 @@ static inline int ext3_acl_count(size_t size)
/* acl.c */
extern struct posix_acl *ext3_get_acl(struct inode *inode, int type);
-extern int ext3_acl_chmod (struct inode *);
+extern int ext3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
extern int ext3_init_acl (handle_t *, struct inode *, struct inode *);
#else /* CONFIG_EXT3_FS_POSIX_ACL */
#include <linux/sched.h>
#define ext3_get_acl NULL
-
-static inline int
-ext3_acl_chmod(struct inode *inode)
-{
- return 0;
-}
+#define ext3_set_acl NULL
static inline int
ext3_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 22548f56197..158b5d4ce06 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -1727,10 +1727,7 @@ allocated:
percpu_counter_sub(&sbi->s_freeblocks_counter, num);
BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
- err = ext3_journal_dirty_metadata(handle, gdp_bh);
- if (!fatal)
- fatal = err;
-
+ fatal = ext3_journal_dirty_metadata(handle, gdp_bh);
if (fatal)
goto out;
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index bafdd48eefd..17742eed2c1 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -275,7 +275,7 @@ static inline loff_t ext3_get_htree_eof(struct file *filp)
* NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX)
* will be invalid once the directory was converted into a dx directory
*/
-loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence)
+static loff_t ext3_dir_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
int dx_dir = is_dx_dir(inode);
@@ -309,43 +309,17 @@ struct fname {
*/
static void free_rb_tree_fname(struct rb_root *root)
{
- struct rb_node *n = root->rb_node;
- struct rb_node *parent;
- struct fname *fname;
-
- while (n) {
- /* Do the node's children first */
- if (n->rb_left) {
- n = n->rb_left;
- continue;
- }
- if (n->rb_right) {
- n = n->rb_right;
- continue;
- }
- /*
- * The node has no children; free it, and then zero
- * out parent's link to it. Finally go to the
- * beginning of the loop and try to free the parent
- * node.
- */
- parent = rb_parent(n);
- fname = rb_entry(n, struct fname, rb_hash);
- while (fname) {
- struct fname * old = fname;
+ struct fname *fname, *next;
+
+ rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
+ do {
+ struct fname *old = fname;
fname = fname->next;
- kfree (old);
- }
- if (!parent)
- *root = RB_ROOT;
- else if (parent->rb_left == n)
- parent->rb_left = NULL;
- else if (parent->rb_right == n)
- parent->rb_right = NULL;
- n = parent;
- }
-}
+ kfree(old);
+ } while (fname);
+ *root = RB_ROOT;
+}
static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp,
loff_t pos)
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index 25cb413277e..a062fa1e1b1 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -50,10 +50,10 @@ static int ext3_release_file (struct inode * inode, struct file * filp)
const struct file_operations ext3_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.unlocked_ioctl = ext3_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext3_compat_ioctl,
@@ -63,7 +63,7 @@ const struct file_operations ext3_file_operations = {
.release = ext3_release_file,
.fsync = ext3_sync_file,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
};
const struct inode_operations ext3_file_inode_operations = {
@@ -75,6 +75,7 @@ const struct inode_operations ext3_file_inode_operations = {
.removexattr = generic_removexattr,
#endif
.get_acl = ext3_get_acl,
+ .set_acl = ext3_set_acl,
.fiemap = ext3_fiemap,
};
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 082afd78b10..a1b810230cc 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -215,7 +215,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
int best_ndir = inodes_per_group;
int best_group = -1;
- get_random_bytes(&group, sizeof(group));
+ group = prandom_u32();
parent_group = (unsigned)group % ngroups;
for (i = 0; i < ngroups; i++) {
group = (parent_group + i) % ngroups;
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2bd85486b87..2c6ccc49ba2 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -228,7 +228,7 @@ void ext3_evict_inode (struct inode *inode)
log_wait_commit(journal, commit_tid);
filemap_write_and_wait(&inode->i_data);
}
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
ext3_discard_reservation(inode);
rsv = ei->i_block_alloc_info;
@@ -1559,56 +1559,17 @@ static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
}
/*
- * Note that we always start a transaction even if we're not journalling
- * data. This is to preserve ordering: any hole instantiation within
- * __block_write_full_page -> ext3_get_block() should be journalled
- * along with the data so we don't crash and then get metadata which
+ * Note that whenever we need to map blocks we start a transaction even if
+ * we're not journalling data. This is to preserve ordering: any hole
+ * instantiation within __block_write_full_page -> ext3_get_block() should be
+ * journalled along with the data so we don't crash and then get metadata which
* refers to old data.
*
* In all journalling modes block_write_full_page() will start the I/O.
*
- * Problem:
- *
- * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
- * ext3_writepage()
- *
- * Similar for:
- *
- * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
- *
- * Same applies to ext3_get_block(). We will deadlock on various things like
- * lock_journal and i_truncate_mutex.
- *
- * Setting PF_MEMALLOC here doesn't work - too many internal memory
- * allocations fail.
- *
- * 16May01: If we're reentered then journal_current_handle() will be
- * non-zero. We simply *return*.
- *
- * 1 July 2001: @@@ FIXME:
- * In journalled data mode, a data buffer may be metadata against the
- * current transaction. But the same file is part of a shared mapping
- * and someone does a writepage() on it.
- *
- * We will move the buffer onto the async_data list, but *after* it has
- * been dirtied. So there's a small window where we have dirty data on
- * BJ_Metadata.
- *
- * Note that this only applies to the last partial page in the file. The
- * bit which block_write_full_page() uses prepare/commit for. (That's
- * broken code anyway: it's wrong for msync()).
- *
- * It's a rare case: affects the final partial page, for journalled data
- * where the file is subject to bith write() and writepage() in the same
- * transction. To fix it we'll need a custom block_write_full_page().
- * We'll probably need that anyway for journalling writepage() output.
- *
* We don't honour synchronous mounts for writepage(). That would be
* disastrous. Any write() or metadata operation will sync the fs for
* us.
- *
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
*/
static int ext3_ordered_writepage(struct page *page,
struct writeback_control *wbc)
@@ -1673,12 +1634,9 @@ static int ext3_ordered_writepage(struct page *page,
* block_write_full_page() succeeded. Otherwise they are unmapped,
* and generally junk.
*/
- if (ret == 0) {
- err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
+ if (ret == 0)
+ ret = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
NULL, journal_dirty_data_fn);
- if (!ret)
- ret = err;
- }
walk_page_buffers(handle, page_bufs, 0,
PAGE_CACHE_SIZE, NULL, bput_one);
err = ext3_journal_stop(handle);
@@ -1758,17 +1716,17 @@ static int ext3_journalled_writepage(struct page *page,
WARN_ON_ONCE(IS_RDONLY(inode) &&
!(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ERROR_FS));
- if (ext3_journal_current_handle())
- goto no_write;
-
trace_ext3_journalled_writepage(page);
- handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto no_write;
- }
-
if (!page_has_buffers(page) || PageChecked(page)) {
+ if (ext3_journal_current_handle())
+ goto no_write;
+
+ handle = ext3_journal_start(inode,
+ ext3_writepage_trans_blocks(inode));
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto no_write;
+ }
/*
* It's mmapped pagecache. Add buffers and journal it. There
* doesn't seem much point in redirtying the page here.
@@ -1791,17 +1749,18 @@ static int ext3_journalled_writepage(struct page *page,
atomic_set(&EXT3_I(inode)->i_datasync_tid,
handle->h_transaction->t_tid);
unlock_page(page);
+ err = ext3_journal_stop(handle);
+ if (!ret)
+ ret = err;
} else {
/*
- * It may be a page full of checkpoint-mode buffers. We don't
- * really know unless we go poke around in the buffer_heads.
- * But block_write_full_page will do the right thing.
+ * It is a page full of checkpoint-mode buffers. Go and write
+ * them. They should have been already mapped when they went
+ * to the journal so provide NULL get_block function to catch
+ * errors.
*/
- ret = block_write_full_page(page, ext3_get_block, wbc);
+ ret = block_write_full_page(page, NULL, wbc);
}
- err = ext3_journal_stop(handle);
- if (!ret)
- ret = err;
out:
return ret;
@@ -1862,8 +1821,7 @@ static int ext3_releasepage(struct page *page, gfp_t wait)
* VFS code falls back into buffered path in that case so we are safe.
*/
static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -1871,10 +1829,10 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
handle_t *handle;
ssize_t ret;
int orphan = 0;
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(iter);
int retries = 0;
- trace_ext3_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
+ trace_ext3_direct_IO_enter(inode, offset, count, rw);
if (rw == WRITE) {
loff_t final_size = offset + count;
@@ -1898,15 +1856,14 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
}
retry:
- ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
- ext3_get_block);
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext3_get_block);
/*
* In case of error extending write may have instantiated a few
* blocks outside i_size. Trim these off again.
*/
if (unlikely((rw & WRITE) && ret < 0)) {
loff_t isize = i_size_read(inode);
- loff_t end = offset + iov_length(iov, nr_segs);
+ loff_t end = offset + count;
if (end > isize)
ext3_truncate_failed_direct_write(inode);
@@ -1925,6 +1882,8 @@ retry:
* and pretend the write failed... */
ext3_truncate_failed_direct_write(inode);
ret = PTR_ERR(handle);
+ if (inode->i_nlink)
+ ext3_orphan_del(NULL, inode);
goto out;
}
if (inode->i_nlink)
@@ -1949,8 +1908,7 @@ retry:
ret = err;
}
out:
- trace_ext3_direct_IO_exit(inode, offset,
- iov_length(iov, nr_segs), rw, ret);
+ trace_ext3_direct_IO_exit(inode, offset, count, rw, ret);
return ret;
}
@@ -3212,21 +3170,20 @@ out_brelse:
*
* We are called from a few places:
*
- * - Within generic_file_write() for O_SYNC files.
+ * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
* Here, there will be no transaction running. We wait for any running
* transaction to commit.
*
- * - Within sys_sync(), kupdate and such.
- * We wait on commit, if tol to.
+ * - Within flush work (for sys_sync(), kupdate and such).
+ * We wait on commit, if told to.
*
- * - Within prune_icache() (PF_MEMALLOC == true)
- * Here we simply return. We can't afford to block kswapd on the
- * journal commit.
+ * - Within iput_final() -> write_inode_now()
+ * We wait on commit, if told to.
*
* In all cases it is actually safe for us to return without doing anything,
* because the inode has been copied into a raw inode buffer in
- * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
- * knfsd.
+ * ext3_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
+ * writeback.
*
* Note that we are absolutely dependent upon all inode dirtiers doing the
* right thing: they *must* call mark_inode_dirty() after dirtying info in
@@ -3238,13 +3195,13 @@ out_brelse:
* stuff();
* inode->i_size = expr;
*
- * is in error because a kswapd-driven write_inode() could occur while
- * `stuff()' is running, and the new i_size will be lost. Plus the inode
- * will no longer be on the superblock's dirty inode list.
+ * is in error because write_inode() could occur while `stuff()' is running,
+ * and the new i_size will be lost. Plus the inode will no longer be on the
+ * superblock's dirty inode list.
*/
int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- if (current->flags & PF_MEMALLOC)
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
return 0;
if (ext3_journal_current_handle()) {
@@ -3253,7 +3210,12 @@ int ext3_write_inode(struct inode *inode, struct writeback_control *wbc)
return -EIO;
}
- if (wbc->sync_mode != WB_SYNC_ALL)
+ /*
+ * No need to force transaction in WB_SYNC_NONE mode. Also
+ * ext3_sync_fs() will force the commit after everything is
+ * written.
+ */
+ if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
return 0;
return ext3_force_commit(inode->i_sb);
@@ -3365,7 +3327,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
mark_inode_dirty(inode);
if (ia_valid & ATTR_MODE)
- rc = ext3_acl_chmod(inode);
+ rc = posix_acl_chmod(inode, inode->i_mode);
err_out:
ext3_std_error(inode->i_sb, error);
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 1194b1f0f83..f197736dccf 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1783,7 +1783,7 @@ retry:
d_tmpfile(dentry, inode);
err = ext3_orphan_add(handle, inode);
if (err)
- goto err_drop_inode;
+ goto err_unlock_inode;
mark_inode_dirty(inode);
unlock_new_inode(inode);
}
@@ -1791,10 +1791,9 @@ retry:
if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
-err_drop_inode:
+err_unlock_inode:
ext3_journal_stop(handle);
unlock_new_inode(inode);
- iput(inode);
return err;
}
@@ -2570,6 +2569,7 @@ const struct inode_operations ext3_dir_inode_operations = {
.removexattr = generic_removexattr,
#endif
.get_acl = ext3_get_acl,
+ .set_acl = ext3_set_acl,
};
const struct inode_operations ext3_special_inode_operations = {
@@ -2581,4 +2581,5 @@ const struct inode_operations ext3_special_inode_operations = {
.removexattr = generic_removexattr,
#endif
.get_acl = ext3_get_acl,
+ .set_acl = ext3_set_acl,
};
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index c50c7619037..08cdfe5461e 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -527,7 +527,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
ext3_inode_cachep = kmem_cache_create("ext3_inode_cache",
sizeof(struct ext3_inode_info),
@@ -2649,6 +2649,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
int i;
#endif
+ sync_filesystem(sb);
+
/* Store the original options */
old_sb_flags = sb->s_flags;
old_opts.s_mount_opt = sbi->s_mount_opt;
@@ -2825,6 +2827,10 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
* bitmap, and an inode table.
*/
overhead += ngroups * (2 + sbi->s_itb_per_group);
+
+ /* Add the journal blocks as well */
+ overhead += sbi->s_journal->j_maxlen;
+
sbi->s_overhead_last = overhead;
smp_wmb();
sbi->s_blocks_last = le32_to_cpu(es->s_blocks_count);
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index b1fc96383e0..c6874be6d58 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -102,8 +102,8 @@ static struct mb_cache *ext3_xattr_cache;
static const struct xattr_handler *ext3_xattr_handler_map[] = {
[EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler,
#ifdef CONFIG_EXT3_FS_POSIX_ACL
- [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler,
- [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext3_xattr_acl_default_handler,
+ [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
+ [EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
#endif
[EXT3_XATTR_INDEX_TRUSTED] = &ext3_xattr_trusted_handler,
#ifdef CONFIG_EXT3_FS_SECURITY
@@ -115,8 +115,8 @@ const struct xattr_handler *ext3_xattr_handlers[] = {
&ext3_xattr_user_handler,
&ext3_xattr_trusted_handler,
#ifdef CONFIG_EXT3_FS_POSIX_ACL
- &ext3_xattr_acl_access_handler,
- &ext3_xattr_acl_default_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
#ifdef CONFIG_EXT3_FS_SECURITY
&ext3_xattr_security_handler,
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 2be4f69bfa6..32e93ebf803 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -60,8 +60,6 @@ struct ext3_xattr_entry {
extern const struct xattr_handler ext3_xattr_user_handler;
extern const struct xattr_handler ext3_xattr_trusted_handler;
-extern const struct xattr_handler ext3_xattr_acl_access_handler;
-extern const struct xattr_handler ext3_xattr_acl_default_handler;
extern const struct xattr_handler ext3_xattr_security_handler;
extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 3387664ad70..722c2bf9645 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -43,8 +43,9 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name,
name, value, size, flags);
}
-int ext3_initxattrs(struct inode *inode, const struct xattr *xattr_array,
- void *fs_info)
+static int ext3_initxattrs(struct inode *inode,
+ const struct xattr *xattr_array,
+ void *fs_info)
{
const struct xattr *xattr;
handle_t *handle = fs_info;
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 39a54a0e9fe..d40c8dbbb0d 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -152,13 +152,6 @@ ext4_get_acl(struct inode *inode, int type)
struct posix_acl *acl;
int retval;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return NULL;
-
- acl = get_cached_acl(inode, type);
- if (acl != ACL_NOT_CACHED)
- return acl;
-
switch (type) {
case ACL_TYPE_ACCESS:
name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -196,7 +189,7 @@ ext4_get_acl(struct inode *inode, int type)
* inode->i_mutex: down unless called from ext4_new_inode
*/
static int
-ext4_set_acl(handle_t *handle, struct inode *inode, int type,
+__ext4_set_acl(handle_t *handle, struct inode *inode, int type,
struct posix_acl *acl)
{
int name_index;
@@ -204,9 +197,6 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
size_t size = 0;
int error;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
switch (type) {
case ACL_TYPE_ACCESS:
name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -248,208 +238,51 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
return error;
}
-/*
- * Initialize the ACLs of a new inode. Called from ext4_new_inode.
- *
- * dir->i_mutex: down
- * inode->i_mutex: up (access to inode is still exclusive)
- */
int
-ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
+ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- struct posix_acl *acl = NULL;
- int error = 0;
-
- if (!S_ISLNK(inode->i_mode)) {
- if (test_opt(dir->i_sb, POSIX_ACL)) {
- acl = ext4_get_acl(dir, ACL_TYPE_DEFAULT);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- }
- if (!acl)
- inode->i_mode &= ~current_umask();
- }
- if (test_opt(inode->i_sb, POSIX_ACL) && acl) {
- if (S_ISDIR(inode->i_mode)) {
- error = ext4_set_acl(handle, inode,
- ACL_TYPE_DEFAULT, acl);
- if (error)
- goto cleanup;
- }
- error = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
- if (error < 0)
- return error;
-
- if (error > 0) {
- /* This is an extended ACL */
- error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
- }
- }
-cleanup:
- posix_acl_release(acl);
- return error;
-}
-
-/*
- * Does chmod for an inode that may have an Access Control List. The
- * inode->i_mode field must be updated to the desired value by the caller
- * before calling this function.
- * Returns 0 on success, or a negative error number.
- *
- * We change the ACL rather than storing some ACL entries in the file
- * mode permission bits (which would be more efficient), because that
- * would break once additional permissions (like ACL_APPEND, ACL_DELETE
- * for directories) are added. There are no more bits available in the
- * file mode.
- *
- * inode->i_mutex: down
- */
-int
-ext4_acl_chmod(struct inode *inode)
-{
- struct posix_acl *acl;
handle_t *handle;
- int retries = 0;
- int error;
-
+ int error, retries = 0;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return 0;
- acl = ext4_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
- error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
- if (error)
- return error;
retry:
handle = ext4_journal_start(inode, EXT4_HT_XATTR,
ext4_jbd2_credits_xattr(inode));
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- ext4_std_error(inode->i_sb, error);
- goto out;
- }
- error = ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ error = __ext4_set_acl(handle, inode, type, acl);
ext4_journal_stop(handle);
- if (error == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries))
+ if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
-out:
- posix_acl_release(acl);
return error;
}
/*
- * Extended attribute handlers
+ * Initialize the ACLs of a new inode. Called from ext4_new_inode.
+ *
+ * dir->i_mutex: down
+ * inode->i_mutex: up (access to inode is still exclusive)
*/
-static size_t
-ext4_xattr_list_acl_access(struct dentry *dentry, char *list, size_t list_len,
- const char *name, size_t name_len, int type)
-{
- const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-
- if (!test_opt(dentry->d_sb, POSIX_ACL))
- return 0;
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
- return size;
-}
-
-static size_t
-ext4_xattr_list_acl_default(struct dentry *dentry, char *list, size_t list_len,
- const char *name, size_t name_len, int type)
-{
- const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-
- if (!test_opt(dentry->d_sb, POSIX_ACL))
- return 0;
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
- return size;
-}
-
-static int
-ext4_xattr_get_acl(struct dentry *dentry, const char *name, void *buffer,
- size_t size, int type)
+int
+ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
{
- struct posix_acl *acl;
+ struct posix_acl *default_acl, *acl;
int error;
- if (strcmp(name, "") != 0)
- return -EINVAL;
- if (!test_opt(dentry->d_sb, POSIX_ACL))
- return -EOPNOTSUPP;
-
- acl = ext4_get_acl(dentry->d_inode, type);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
- error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- posix_acl_release(acl);
-
- return error;
-}
-
-static int
-ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags, int type)
-{
- struct inode *inode = dentry->d_inode;
- handle_t *handle;
- struct posix_acl *acl;
- int error, retries = 0;
-
- if (strcmp(name, "") != 0)
- return -EINVAL;
- if (!test_opt(inode->i_sb, POSIX_ACL))
- return -EOPNOTSUPP;
- if (!inode_owner_or_capable(inode))
- return -EPERM;
-
- if (value) {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- else if (acl) {
- error = posix_acl_valid(acl);
- if (error)
- goto release_and_out;
- }
- } else
- acl = NULL;
+ error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (error)
+ return error;
-retry:
- handle = ext4_journal_start(inode, EXT4_HT_XATTR,
- ext4_jbd2_credits_xattr(inode));
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- goto release_and_out;
+ if (default_acl) {
+ error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT,
+ default_acl);
+ posix_acl_release(default_acl);
+ }
+ if (acl) {
+ if (!error)
+ error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS,
+ acl);
+ posix_acl_release(acl);
}
- error = ext4_set_acl(handle, inode, type, acl);
- ext4_journal_stop(handle);
- if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
- goto retry;
-
-release_and_out:
- posix_acl_release(acl);
return error;
}
-
-const struct xattr_handler ext4_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .list = ext4_xattr_list_acl_access,
- .get = ext4_xattr_get_acl,
- .set = ext4_xattr_set_acl,
-};
-
-const struct xattr_handler ext4_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .list = ext4_xattr_list_acl_default,
- .get = ext4_xattr_get_acl,
- .set = ext4_xattr_set_acl,
-};
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 18cb39ed7c7..da2c79577d7 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -55,18 +55,13 @@ static inline int ext4_acl_count(size_t size)
/* acl.c */
struct posix_acl *ext4_get_acl(struct inode *inode, int type);
-extern int ext4_acl_chmod(struct inode *);
+int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type);
extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
#else /* CONFIG_EXT4_FS_POSIX_ACL */
#include <linux/sched.h>
#define ext4_get_acl NULL
-
-static inline int
-ext4_acl_chmod(struct inode *inode)
-{
- return 0;
-}
+#define ext4_set_acl NULL
static inline int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index dc5d572ebd6..fca382037dd 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -83,9 +83,9 @@ static inline int ext4_block_in_group(struct super_block *sb,
/* Return the number of clusters used for file system metadata; this
* represents the overhead needed by the file system.
*/
-unsigned ext4_num_overhead_clusters(struct super_block *sb,
- ext4_group_t block_group,
- struct ext4_group_desc *gdp)
+static unsigned ext4_num_overhead_clusters(struct super_block *sb,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp)
{
unsigned num_clusters;
int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c;
@@ -176,9 +176,10 @@ static unsigned int num_clusters_in_group(struct super_block *sb,
}
/* Initializes an uninitialized block bitmap */
-void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
- ext4_group_t block_group,
- struct ext4_group_desc *gdp)
+static void ext4_init_block_bitmap(struct super_block *sb,
+ struct buffer_head *bh,
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp)
{
unsigned int bit, bit_max;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -193,7 +194,16 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
ext4_error(sb, "Checksum bad for group %u", block_group);
grp = ext4_get_group_info(sb, block_group);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ int count;
+ count = ext4_free_inodes_count(sb, gdp);
+ percpu_counter_sub(&sbi->s_freeinodes_counter,
+ count);
+ }
set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
return;
}
@@ -307,6 +317,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
ext4_group_t block_group,
struct buffer_head *bh)
{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_grpblk_t offset;
ext4_grpblk_t next_zero_bit;
ext4_fsblk_t blk;
@@ -326,14 +337,14 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
/* check whether block bitmap block number is set */
blk = ext4_block_bitmap(sb, desc);
offset = blk - group_first_block;
- if (!ext4_test_bit(offset, bh->b_data))
+ if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
/* bad block bitmap */
return blk;
/* check whether the inode bitmap block number is set */
blk = ext4_inode_bitmap(sb, desc);
offset = blk - group_first_block;
- if (!ext4_test_bit(offset, bh->b_data))
+ if (!ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data))
/* bad block bitmap */
return blk;
@@ -341,21 +352,23 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
blk = ext4_inode_table(sb, desc);
offset = blk - group_first_block;
next_zero_bit = ext4_find_next_zero_bit(bh->b_data,
- offset + EXT4_SB(sb)->s_itb_per_group,
- offset);
- if (next_zero_bit < offset + EXT4_SB(sb)->s_itb_per_group)
+ EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group),
+ EXT4_B2C(sbi, offset));
+ if (next_zero_bit <
+ EXT4_B2C(sbi, offset + EXT4_SB(sb)->s_itb_per_group))
/* bad bitmap for inode tables */
return blk;
return 0;
}
-void ext4_validate_block_bitmap(struct super_block *sb,
- struct ext4_group_desc *desc,
- ext4_group_t block_group,
- struct buffer_head *bh)
+static void ext4_validate_block_bitmap(struct super_block *sb,
+ struct ext4_group_desc *desc,
+ ext4_group_t block_group,
+ struct buffer_head *bh)
{
ext4_fsblk_t blk;
struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
if (buffer_verified(bh))
return;
@@ -366,6 +379,9 @@ void ext4_validate_block_bitmap(struct super_block *sb,
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
block_group, blk);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
return;
}
@@ -373,6 +389,9 @@ void ext4_validate_block_bitmap(struct super_block *sb,
desc, bh))) {
ext4_unlock_group(sb, block_group);
ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
return;
}
@@ -640,6 +659,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
struct ext4_group_desc *gdp;
ext4_group_t i;
ext4_group_t ngroups = ext4_get_groups_count(sb);
+ struct ext4_group_info *grp;
#ifdef EXT4FS_DEBUG
struct ext4_super_block *es;
ext4_fsblk_t bitmap_count;
@@ -655,14 +675,18 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- desc_count += ext4_free_group_clusters(sb, gdp);
+ grp = NULL;
+ if (EXT4_SB(sb)->s_group_info)
+ grp = ext4_get_group_info(sb, i);
+ if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ desc_count += ext4_free_group_clusters(sb, gdp);
brelse(bitmap_bh);
bitmap_bh = ext4_read_block_bitmap(sb, i);
if (bitmap_bh == NULL)
continue;
x = ext4_count_free(bitmap_bh->b_data,
- EXT4_BLOCKS_PER_GROUP(sb) / 8);
+ EXT4_CLUSTERS_PER_GROUP(sb) / 8);
printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n",
i, ext4_free_group_clusters(sb, gdp), x);
bitmap_count += x;
@@ -679,7 +703,11 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
if (!gdp)
continue;
- desc_count += ext4_free_group_clusters(sb, gdp);
+ grp = NULL;
+ if (EXT4_SB(sb)->s_group_info)
+ grp = ext4_get_group_info(sb, i);
+ if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ desc_count += ext4_free_group_clusters(sb, gdp);
}
return desc_count;
@@ -699,16 +727,6 @@ static inline int test_root(ext4_group_t a, int b)
}
}
-static int ext4_group_sparse(ext4_group_t group)
-{
- if (group <= 1)
- return 1;
- if (!(group & 1))
- return 0;
- return (test_root(group, 7) || test_root(group, 5) ||
- test_root(group, 3));
-}
-
/**
* ext4_bg_has_super - number of blocks used by the superblock in group
* @sb: superblock for filesystem
@@ -719,11 +737,26 @@ static int ext4_group_sparse(ext4_group_t group)
*/
int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
{
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
- !ext4_group_sparse(group))
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ if (group == 0)
+ return 1;
+ if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_SPARSE_SUPER2)) {
+ if (group == le32_to_cpu(es->s_backup_bgs[0]) ||
+ group == le32_to_cpu(es->s_backup_bgs[1]))
+ return 1;
return 0;
- return 1;
+ }
+ if ((group <= 1) || !EXT4_HAS_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER))
+ return 1;
+ if (!(group & 1))
+ return 0;
+ if (test_root(group, 3) || (test_root(group, 5)) ||
+ test_root(group, 7))
+ return 1;
+
+ return 0;
}
static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 3f11656bd72..41eb9dcfac7 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -180,37 +180,12 @@ int ext4_setup_system_zone(struct super_block *sb)
/* Called when the filesystem is unmounted */
void ext4_release_system_zone(struct super_block *sb)
{
- struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node;
- struct rb_node *parent;
- struct ext4_system_zone *entry;
+ struct ext4_system_zone *entry, *n;
- while (n) {
- /* Do the node's children first */
- if (n->rb_left) {
- n = n->rb_left;
- continue;
- }
- if (n->rb_right) {
- n = n->rb_right;
- continue;
- }
- /*
- * The node has no children; free it, and then zero
- * out parent's link to it. Finally go to the
- * beginning of the loop and try to free the parent
- * node.
- */
- parent = rb_parent(n);
- entry = rb_entry(n, struct ext4_system_zone, node);
+ rbtree_postorder_for_each_entry_safe(entry, n,
+ &EXT4_SB(sb)->system_blks, node)
kmem_cache_free(ext4_system_zone_cachep, entry);
- if (!parent)
- EXT4_SB(sb)->system_blks = RB_ROOT;
- else if (parent->rb_left == n)
- parent->rb_left = NULL;
- else if (parent->rb_right == n)
- parent->rb_right = NULL;
- n = parent;
- }
+
EXT4_SB(sb)->system_blks = RB_ROOT;
}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 680bb338891..ef1bed66c14 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -105,7 +105,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
static int ext4_readdir(struct file *file, struct dir_context *ctx)
{
unsigned int offset;
- int i, stored;
+ int i;
struct ext4_dir_entry_2 *de;
int err;
struct inode *inode = file_inode(file);
@@ -133,7 +133,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
return ret;
}
- stored = 0;
offset = ctx->pos & (sb->s_blocksize - 1);
while (ctx->pos < inode->i_size) {
@@ -353,41 +352,16 @@ struct fname {
*/
static void free_rb_tree_fname(struct rb_root *root)
{
- struct rb_node *n = root->rb_node;
- struct rb_node *parent;
- struct fname *fname;
-
- while (n) {
- /* Do the node's children first */
- if (n->rb_left) {
- n = n->rb_left;
- continue;
- }
- if (n->rb_right) {
- n = n->rb_right;
- continue;
- }
- /*
- * The node has no children; free it, and then zero
- * out parent's link to it. Finally go to the
- * beginning of the loop and try to free the parent
- * node.
- */
- parent = rb_parent(n);
- fname = rb_entry(n, struct fname, rb_hash);
+ struct fname *fname, *next;
+
+ rbtree_postorder_for_each_entry_safe(fname, next, root, rb_hash)
while (fname) {
struct fname *old = fname;
fname = fname->next;
kfree(old);
}
- if (!parent)
- *root = RB_ROOT;
- else if (parent->rb_left == n)
- parent->rb_left = NULL;
- else if (parent->rb_right == n)
- parent->rb_right = NULL;
- n = parent;
- }
+
+ *root = RB_ROOT;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index af815ea9d7c..7cc5a0e2368 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,7 +29,9 @@
#include <linux/wait.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
+#include <linux/ratelimit.h>
#include <crypto/hash.h>
+#include <linux/falloc.h>
#ifdef __KERNEL__
#include <linux/compat.h>
#endif
@@ -156,7 +158,6 @@ struct ext4_allocation_request {
#define EXT4_MAP_MAPPED (1 << BH_Mapped)
#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
-#define EXT4_MAP_UNINIT (1 << BH_Uninit)
/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of
* ext4_map_blocks wants to know whether or not the underlying cluster has
* already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that
@@ -167,7 +168,7 @@ struct ext4_allocation_request {
#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster)
#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
- EXT4_MAP_UNINIT | EXT4_MAP_FROM_CLUSTER)
+ EXT4_MAP_FROM_CLUSTER)
struct ext4_map_blocks {
ext4_fsblk_t m_pblk;
@@ -182,7 +183,7 @@ struct ext4_map_blocks {
#define EXT4_IO_END_UNWRITTEN 0x0001
/*
- * For converting uninitialized extents on a work queue. 'handle' is used for
+ * For converting unwritten extents on a work queue. 'handle' is used for
* buffered writeback.
*/
typedef struct ext4_io_end {
@@ -267,6 +268,16 @@ struct ext4_io_submit {
/* Translate # of blks to # of clusters */
#define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \
(sbi)->s_cluster_bits)
+/* Mask out the low bits to get the starting block of the cluster */
+#define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \
+ ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
+#define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \
+ ~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
+/* Get the cluster offset */
+#define EXT4_PBLK_COFF(s, pblk) ((pblk) & \
+ ((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
+#define EXT4_LBLK_COFF(s, lblk) ((lblk) & \
+ ((ext4_lblk_t) (s)->s_cluster_ratio - 1))
/*
* Structure of a blocks group descriptor
@@ -525,26 +536,26 @@ enum {
/*
* Flags used by ext4_map_blocks()
*/
- /* Allocate any needed blocks and/or convert an unitialized
+ /* Allocate any needed blocks and/or convert an unwritten
extent to be an initialized ext4 */
#define EXT4_GET_BLOCKS_CREATE 0x0001
- /* Request the creation of an unitialized extent */
-#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002
-#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
+ /* Request the creation of an unwritten extent */
+#define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002
+#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\
EXT4_GET_BLOCKS_CREATE)
/* Caller is from the delayed allocation writeout path
* finally doing the actual allocation of delayed blocks */
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
/* caller is from the direct IO path, request to creation of an
- unitialized extents if not allocated, split the uninitialized
+ unwritten extents if not allocated, split the unwritten
extent if blocks has been preallocated already*/
#define EXT4_GET_BLOCKS_PRE_IO 0x0008
#define EXT4_GET_BLOCKS_CONVERT 0x0010
#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\
- EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
/* Convert extent to initialized after IO complete */
#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
- EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
/* Eventual metadata allocation (due to growing extent tree)
* should not fail, so try to use reserved blocks for that.*/
#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020
@@ -556,6 +567,8 @@ enum {
#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
/* Do not put hole in extent cache */
#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
+ /* Convert written extents to unwritten */
+#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400
/*
* The bit position of these flags must not overlap with any of the
@@ -760,6 +773,8 @@ do { \
if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \
(einode)->xtime.tv_sec = \
(signed)le32_to_cpu((raw_inode)->xtime); \
+ else \
+ (einode)->xtime.tv_sec = 0; \
if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \
ext4_decode_extra_time(&(einode)->xtime, \
raw_inode->xtime ## _extra); \
@@ -860,6 +875,8 @@ struct ext4_inode_info {
struct inode vfs_inode;
struct jbd2_inode *jinode;
+ spinlock_t i_raw_lock; /* protects updates to the raw inode */
+
/*
* File creation time. Its function is same as that of
* struct timespec i_{a,c,m}time in the generic inode.
@@ -985,6 +1002,8 @@ struct ext4_inode_info {
#define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group
size of blocksize * 8
blocks */
+#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated
+ file systems */
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
~EXT4_MOUNT_##opt
@@ -1141,7 +1160,8 @@ struct ext4_super_block {
__le32 s_usr_quota_inum; /* inode for tracking user quota */
__le32 s_grp_quota_inum; /* inode for tracking group quota */
__le32 s_overhead_clusters; /* overhead blocks/clusters in fs */
- __le32 s_reserved[108]; /* Padding to the end of the block */
+ __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */
+ __le32 s_reserved[106]; /* Padding to the end of the block */
__le32 s_checksum; /* crc32c(superblock) */
};
@@ -1313,7 +1333,13 @@ struct ext4_sb_info {
struct list_head s_es_lru;
unsigned long s_es_last_sorted;
struct percpu_counter s_extent_cache_cnt;
+ struct mb_cache *s_mb_cache;
spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
+
+ /* Ratelimit ext4 messages. */
+ struct ratelimit_state s_err_ratelimit_state;
+ struct ratelimit_state s_warning_ratelimit_state;
+ struct ratelimit_state s_msg_ratelimit_state;
};
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1396,7 +1422,18 @@ static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \
}
+/* Add these declarations here only so that these functions can be
+ * found by name. Otherwise, they are very hard to locate. */
+static inline int ext4_test_inode_flag(struct inode *inode, int bit);
+static inline void ext4_set_inode_flag(struct inode *inode, int bit);
+static inline void ext4_clear_inode_flag(struct inode *inode, int bit);
EXT4_INODE_BIT_FNS(flag, flags, 0)
+
+/* Add these declarations here only so that these functions can be
+ * found by name. Otherwise, they are very hard to locate. */
+static inline int ext4_test_inode_state(struct inode *inode, int bit);
+static inline void ext4_set_inode_state(struct inode *inode, int bit);
+static inline void ext4_clear_inode_state(struct inode *inode, int bit);
#if (BITS_PER_LONG < 64)
EXT4_INODE_BIT_FNS(state, state_flags, 0)
@@ -1470,6 +1507,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008
#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010
#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020
+#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200
#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
@@ -1918,10 +1956,6 @@ extern void ext4_get_group_no_and_offset(struct super_block *sb,
extern ext4_group_t ext4_get_group_number(struct super_block *sb,
ext4_fsblk_t block);
-extern void ext4_validate_block_bitmap(struct super_block *sb,
- struct ext4_group_desc *desc,
- ext4_group_t block_group,
- struct buffer_head *bh);
extern unsigned int ext4_block_group(struct super_block *sb,
ext4_fsblk_t blocknr);
extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
@@ -1950,16 +1984,9 @@ extern int ext4_wait_block_bitmap(struct super_block *sb,
struct buffer_head *bh);
extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
ext4_group_t block_group);
-extern void ext4_init_block_bitmap(struct super_block *sb,
- struct buffer_head *bh,
- ext4_group_t group,
- struct ext4_group_desc *desc);
extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
ext4_group_t block_group,
struct ext4_group_desc *gdp);
-extern unsigned ext4_num_overhead_clusters(struct super_block *sb,
- ext4_group_t block_group,
- struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
/* dir.c */
@@ -2102,10 +2129,6 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
-extern int ext4_block_truncate_page(handle_t *handle,
- struct address_space *mapping, loff_t from);
-extern int ext4_block_zero_page_range(handle_t *handle,
- struct address_space *mapping, loff_t from, loff_t length);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
@@ -2117,8 +2140,7 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs);
+ struct iov_iter *iter, loff_t offset);
extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
extern void ext4_ind_truncate(handle_t *, struct inode *inode);
@@ -2165,8 +2187,6 @@ extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
/* super.c */
extern int ext4_calculate_overhead(struct super_block *sb);
-extern int ext4_superblock_csum_verify(struct super_block *sb,
- struct ext4_super_block *es);
extern void ext4_superblock_csum_set(struct super_block *sb);
extern void *ext4_kvmalloc(size_t size, gfp_t flags);
extern void *ext4_kvzalloc(size_t size, gfp_t flags);
@@ -2433,23 +2453,6 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
up_write(&EXT4_I(inode)->i_data_sem);
}
-/*
- * Update i_disksize after writeback has been started. Races with truncate
- * are avoided by checking i_size under i_data_sem.
- */
-static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize)
-{
- loff_t i_size;
-
- down_write(&EXT4_I(inode)->i_data_sem);
- i_size = i_size_read(inode);
- if (newsize > i_size)
- newsize = i_size;
- if (newsize > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = newsize;
- up_write(&EXT4_I(inode)->i_data_sem);
-}
-
struct ext4_group_info {
unsigned long bb_state;
struct rb_root bb_free_root;
@@ -2555,19 +2558,11 @@ extern const struct file_operations ext4_dir_operations;
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
-extern void ext4_unwritten_wait(struct inode *inode);
/* inline.c */
extern int ext4_has_inline_data(struct inode *inode);
-extern int ext4_get_inline_size(struct inode *inode);
extern int ext4_get_max_inline_size(struct inode *inode);
extern int ext4_find_inline_data_nolock(struct inode *inode);
-extern void ext4_write_inline_data(struct inode *inode,
- struct ext4_iloc *iloc,
- void *buffer, loff_t pos,
- unsigned int len);
-extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
- unsigned int len);
extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
unsigned int len);
extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
@@ -2728,17 +2723,18 @@ extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
extern int ext4_ext_precache(struct inode *inode);
+extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
/* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first,
struct inode *second);
extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
struct inode *donor_inode);
-void ext4_inode_double_lock(struct inode *inode1, struct inode *inode2);
-void ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2);
extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
__u64 start_orig, __u64 start_donor,
__u64 len, __u64 *moved_len);
+extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
+ struct ext4_extent **extent);
/* page-io.c */
extern int __init ext4_init_pageio(void);
@@ -2754,23 +2750,20 @@ extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
int len,
- struct writeback_control *wbc);
+ struct writeback_control *wbc,
+ bool keep_towrite);
/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
-extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);
-extern int ext4_mmp_csum_verify(struct super_block *sb,
- struct mmp_struct *mmp);
/*
* Note that these flags will never ever appear in a buffer_head's state flag.
* See EXT4_MAP_... to see where this is used.
*/
enum ext4_state_bits {
- BH_Uninit /* blocks are allocated but uninitialized on disk */
- = BH_JBDPrivateStart,
- BH_AllocFromCluster, /* allocated blocks were part of already
+ BH_AllocFromCluster /* allocated blocks were part of already
* allocated cluster. */
+ = BH_JBDPrivateStart
};
/*
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 5074fe23f19..a867f5ca999 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -137,21 +137,21 @@ struct ext4_ext_path {
* EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an
* initialized extent. This is 2^15 and not (2^16 - 1), since we use the
* MSB of ee_len field in the extent datastructure to signify if this
- * particular extent is an initialized extent or an uninitialized (i.e.
+ * particular extent is an initialized extent or an unwritten (i.e.
* preallocated).
- * EXT_UNINIT_MAX_LEN is the maximum number of blocks we can have in an
- * uninitialized extent.
+ * EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an
+ * unwritten extent.
* If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an
- * uninitialized one. In other words, if MSB of ee_len is set, it is an
- * uninitialized extent with only one special scenario when ee_len = 0x8000.
- * In this case we can not have an uninitialized extent of zero length and
+ * unwritten one. In other words, if MSB of ee_len is set, it is an
+ * unwritten extent with only one special scenario when ee_len = 0x8000.
+ * In this case we can not have an unwritten extent of zero length and
* thus we make it as a special case of initialized extent with 0x8000 length.
* This way we get better extent-to-group alignment for initialized extents.
* Hence, the maximum number of blocks we can have in an *initialized*
- * extent is 2^15 (32768) and in an *uninitialized* extent is 2^15-1 (32767).
+ * extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767).
*/
#define EXT_INIT_MAX_LEN (1UL << 15)
-#define EXT_UNINIT_MAX_LEN (EXT_INIT_MAX_LEN - 1)
+#define EXT_UNWRITTEN_MAX_LEN (EXT_INIT_MAX_LEN - 1)
#define EXT_FIRST_EXTENT(__hdr__) \
@@ -187,14 +187,14 @@ static inline unsigned short ext_depth(struct inode *inode)
return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
}
-static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext)
+static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext)
{
- /* We can not have an uninitialized extent of zero length! */
+ /* We can not have an unwritten extent of zero length! */
BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0);
ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN);
}
-static inline int ext4_ext_is_uninitialized(struct ext4_extent *ext)
+static inline int ext4_ext_is_unwritten(struct ext4_extent *ext)
{
/* Extent with ee_len of 0x8000 is treated as an initialized extent */
return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN);
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 17ac112ab10..0074e0d23d6 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -122,9 +122,10 @@ handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
return handle;
}
-void ext4_journal_abort_handle(const char *caller, unsigned int line,
- const char *err_fn, struct buffer_head *bh,
- handle_t *handle, int err)
+static void ext4_journal_abort_handle(const char *caller, unsigned int line,
+ const char *err_fn,
+ struct buffer_head *bh,
+ handle_t *handle, int err)
{
char nbuf[16];
const char *errstr = ext4_decode_error(NULL, err, nbuf);
@@ -259,6 +260,25 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
if (WARN_ON_ONCE(err)) {
ext4_journal_abort_handle(where, line, __func__, bh,
handle, err);
+ if (inode == NULL) {
+ pr_err("EXT4: jbd2_journal_dirty_metadata "
+ "failed: handle type %u started at "
+ "line %u, credits %u/%u, errcode %d",
+ handle->h_type,
+ handle->h_line_no,
+ handle->h_requested_credits,
+ handle->h_buffer_credits, err);
+ return err;
+ }
+ ext4_error_inode(inode, where, line,
+ bh->b_blocknr,
+ "journal_dirty_metadata failed: "
+ "handle type %u started at line %u, "
+ "credits %u/%u, errcode %d",
+ handle->h_type,
+ handle->h_line_no,
+ handle->h_requested_credits,
+ handle->h_buffer_credits, err);
}
} else {
if (inode)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 81cfefa9dc0..17c00ff202f 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -231,10 +231,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
/*
* Wrapper functions with which ext4 calls into JBD.
*/
-void ext4_journal_abort_handle(const char *caller, unsigned int line,
- const char *err_fn,
- struct buffer_head *bh, handle_t *handle, int err);
-
int __ext4_journal_get_write_access(const char *where, unsigned int line,
handle_t *handle, struct buffer_head *bh);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 54d52afcdb1..4da228a0e6d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -37,7 +37,6 @@
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/slab.h>
-#include <linux/falloc.h>
#include <asm/uaccess.h>
#include <linux/fiemap.h>
#include "ext4_jbd2.h"
@@ -51,8 +50,8 @@
*/
#define EXT4_EXT_MAY_ZEROOUT 0x1 /* safe to zeroout if split fails \
due to ENOSPC */
-#define EXT4_EXT_MARK_UNINIT1 0x2 /* mark first half uninitialized */
-#define EXT4_EXT_MARK_UNINIT2 0x4 /* mark second half uninitialized */
+#define EXT4_EXT_MARK_UNWRIT1 0x2 /* mark first half unwritten */
+#define EXT4_EXT_MARK_UNWRIT2 0x4 /* mark second half unwritten */
#define EXT4_EXT_DATA_VALID1 0x8 /* first half contains valid data */
#define EXT4_EXT_DATA_VALID2 0x10 /* second half contains valid data */
@@ -144,6 +143,7 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode,
{
if (path->p_bh) {
/* path points to block */
+ BUFFER_TRACE(path->p_bh, "get_write_access");
return ext4_journal_get_write_access(handle, path->p_bh);
}
/* path points to leaf/index in inode body */
@@ -360,8 +360,10 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
{
ext4_fsblk_t block = ext4_ext_pblock(ext);
int len = ext4_ext_get_actual_len(ext);
+ ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
+ ext4_lblk_t last = lblock + len - 1;
- if (len == 0)
+ if (lblock > last)
return 0;
return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
}
@@ -387,11 +389,26 @@ static int ext4_valid_extent_entries(struct inode *inode,
if (depth == 0) {
/* leaf entries */
struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
+ struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+ ext4_fsblk_t pblock = 0;
+ ext4_lblk_t lblock = 0;
+ ext4_lblk_t prev = 0;
+ int len = 0;
while (entries) {
if (!ext4_valid_extent(inode, ext))
return 0;
+
+ /* Check for overlapping extents */
+ lblock = le32_to_cpu(ext->ee_block);
+ len = ext4_ext_get_actual_len(ext);
+ if ((lblock <= prev) && prev) {
+ pblock = ext4_ext_pblock(ext);
+ es->s_last_error_block = cpu_to_le64(pblock);
+ return 0;
+ }
ext++;
entries--;
+ prev = lblock + len - 1;
}
} else {
struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
@@ -508,7 +525,7 @@ __read_extent_tree_block(const char *function, unsigned int line,
lblk - prev, ~0,
EXTENT_STATUS_HOLE);
- if (ext4_ext_is_uninitialized(ex))
+ if (ext4_ext_is_unwritten(ex))
status = EXTENT_STATUS_UNWRITTEN;
ext4_es_cache_extent(inode, lblk, len,
ext4_ext_pblock(ex), status);
@@ -604,7 +621,7 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
} else if (path->p_ext) {
ext_debug(" %d:[%d]%d:%llu ",
le32_to_cpu(path->p_ext->ee_block),
- ext4_ext_is_uninitialized(path->p_ext),
+ ext4_ext_is_unwritten(path->p_ext),
ext4_ext_get_actual_len(path->p_ext),
ext4_ext_pblock(path->p_ext));
} else
@@ -630,7 +647,7 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path)
for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) {
ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block),
- ext4_ext_is_uninitialized(ex),
+ ext4_ext_is_unwritten(ex),
ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex));
}
ext_debug("\n");
@@ -661,7 +678,7 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n",
le32_to_cpu(ex->ee_block),
ext4_ext_pblock(ex),
- ext4_ext_is_uninitialized(ex),
+ ext4_ext_is_unwritten(ex),
ext4_ext_get_actual_len(ex),
newblock);
ex++;
@@ -786,7 +803,7 @@ ext4_ext_binsearch(struct inode *inode,
ext_debug(" -> %d:%llu:[%d]%d ",
le32_to_cpu(path->p_ext->ee_block),
ext4_ext_pblock(path->p_ext),
- ext4_ext_is_uninitialized(path->p_ext),
+ ext4_ext_is_unwritten(path->p_ext),
ext4_ext_get_actual_len(path->p_ext));
#ifdef CHECK_BINSEARCH
@@ -1666,22 +1683,17 @@ int
ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
struct ext4_extent *ex2)
{
- unsigned short ext1_ee_len, ext2_ee_len, max_len;
+ unsigned short ext1_ee_len, ext2_ee_len;
/*
* Make sure that both extents are initialized. We don't merge
- * uninitialized extents so that we can be sure that end_io code has
+ * unwritten extents so that we can be sure that end_io code has
* the extent that was written properly split out and conversion to
* initialized is trivial.
*/
- if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2))
+ if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2))
return 0;
- if (ext4_ext_is_uninitialized(ex1))
- max_len = EXT_UNINIT_MAX_LEN;
- else
- max_len = EXT_INIT_MAX_LEN;
-
ext1_ee_len = ext4_ext_get_actual_len(ex1);
ext2_ee_len = ext4_ext_get_actual_len(ex2);
@@ -1694,7 +1706,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
* as an RO_COMPAT feature, refuse to merge to extents if
* this can result in the top bit of ee_len being set.
*/
- if (ext1_ee_len + ext2_ee_len > max_len)
+ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
+ return 0;
+ if (ext4_ext_is_unwritten(ex1) &&
+ (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
+ atomic_read(&EXT4_I(inode)->i_unwritten) ||
+ (ext1_ee_len + ext2_ee_len > EXT_UNWRITTEN_MAX_LEN)))
return 0;
#ifdef AGGRESSIVE_TEST
if (ext1_ee_len >= 4)
@@ -1719,8 +1736,7 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
{
struct ext4_extent_header *eh;
unsigned int depth, len;
- int merge_done = 0;
- int uninitialized = 0;
+ int merge_done = 0, unwritten;
depth = ext_depth(inode);
BUG_ON(path[depth].p_hdr == NULL);
@@ -1730,12 +1746,11 @@ static int ext4_ext_try_to_merge_right(struct inode *inode,
if (!ext4_can_extents_be_merged(inode, ex, ex + 1))
break;
/* merge with next extent! */
- if (ext4_ext_is_uninitialized(ex))
- uninitialized = 1;
+ unwritten = ext4_ext_is_unwritten(ex);
ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ ext4_ext_get_actual_len(ex + 1));
- if (uninitialized)
- ext4_ext_mark_uninitialized(ex);
+ if (unwritten)
+ ext4_ext_mark_unwritten(ex);
if (ex + 1 < EXT_LAST_EXTENT(eh)) {
len = (EXT_LAST_EXTENT(eh) - ex - 1)
@@ -1844,8 +1859,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
depth = ext_depth(inode);
if (!path[depth].p_ext)
goto out;
- b2 = le32_to_cpu(path[depth].p_ext->ee_block);
- b2 &= ~(sbi->s_cluster_ratio - 1);
+ b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
/*
* get the next allocated block if the extent in the path
@@ -1855,7 +1869,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
b2 = ext4_ext_next_allocated_block(path);
if (b2 == EXT_MAX_BLOCKS)
goto out;
- b2 &= ~(sbi->s_cluster_ratio - 1);
+ b2 = EXT4_LBLK_CMASK(sbi, b2);
}
/* check for wrap through zero on extent logical start block*/
@@ -1890,8 +1904,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
struct ext4_ext_path *npath = NULL;
int depth, len, err;
ext4_lblk_t next;
- unsigned uninitialized = 0;
- int mb_flags = 0;
+ int mb_flags = 0, unwritten;
if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
@@ -1931,29 +1944,21 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
if (ext4_can_extents_be_merged(inode, ex, newext)) {
ext_debug("append [%d]%d block to %u:[%d]%d"
"(from %llu)\n",
- ext4_ext_is_uninitialized(newext),
+ ext4_ext_is_unwritten(newext),
ext4_ext_get_actual_len(newext),
le32_to_cpu(ex->ee_block),
- ext4_ext_is_uninitialized(ex),
+ ext4_ext_is_unwritten(ex),
ext4_ext_get_actual_len(ex),
ext4_ext_pblock(ex));
err = ext4_ext_get_access(handle, inode,
path + depth);
if (err)
return err;
-
- /*
- * ext4_can_extents_be_merged should have checked
- * that either both extents are uninitialized, or
- * both aren't. Thus we need to check only one of
- * them here.
- */
- if (ext4_ext_is_uninitialized(ex))
- uninitialized = 1;
+ unwritten = ext4_ext_is_unwritten(ex);
ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ ext4_ext_get_actual_len(newext));
- if (uninitialized)
- ext4_ext_mark_uninitialized(ex);
+ if (unwritten)
+ ext4_ext_mark_unwritten(ex);
eh = path[depth].p_hdr;
nearex = ex;
goto merge;
@@ -1965,10 +1970,10 @@ prepend:
ext_debug("prepend %u[%d]%d block to %u:[%d]%d"
"(from %llu)\n",
le32_to_cpu(newext->ee_block),
- ext4_ext_is_uninitialized(newext),
+ ext4_ext_is_unwritten(newext),
ext4_ext_get_actual_len(newext),
le32_to_cpu(ex->ee_block),
- ext4_ext_is_uninitialized(ex),
+ ext4_ext_is_unwritten(ex),
ext4_ext_get_actual_len(ex),
ext4_ext_pblock(ex));
err = ext4_ext_get_access(handle, inode,
@@ -1976,20 +1981,13 @@ prepend:
if (err)
return err;
- /*
- * ext4_can_extents_be_merged should have checked
- * that either both extents are uninitialized, or
- * both aren't. Thus we need to check only one of
- * them here.
- */
- if (ext4_ext_is_uninitialized(ex))
- uninitialized = 1;
+ unwritten = ext4_ext_is_unwritten(ex);
ex->ee_block = newext->ee_block;
ext4_ext_store_pblock(ex, ext4_ext_pblock(newext));
ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex)
+ ext4_ext_get_actual_len(newext));
- if (uninitialized)
- ext4_ext_mark_uninitialized(ex);
+ if (unwritten)
+ ext4_ext_mark_unwritten(ex);
eh = path[depth].p_hdr;
nearex = ex;
goto merge;
@@ -2049,7 +2047,7 @@ has_space:
ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
- ext4_ext_is_uninitialized(newext),
+ ext4_ext_is_unwritten(newext),
ext4_ext_get_actual_len(newext));
nearex = EXT_FIRST_EXTENT(eh);
} else {
@@ -2060,7 +2058,7 @@ has_space:
"nearest %p\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
- ext4_ext_is_uninitialized(newext),
+ ext4_ext_is_unwritten(newext),
ext4_ext_get_actual_len(newext),
nearex);
nearex++;
@@ -2071,7 +2069,7 @@ has_space:
"nearest %p\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
- ext4_ext_is_uninitialized(newext),
+ ext4_ext_is_unwritten(newext),
ext4_ext_get_actual_len(newext),
nearex);
}
@@ -2081,7 +2079,7 @@ has_space:
"move %d extents from 0x%p to 0x%p\n",
le32_to_cpu(newext->ee_block),
ext4_ext_pblock(newext),
- ext4_ext_is_uninitialized(newext),
+ ext4_ext_is_unwritten(newext),
ext4_ext_get_actual_len(newext),
len, nearex, nearex + 1);
memmove(nearex + 1, nearex,
@@ -2203,7 +2201,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
es.es_lblk = le32_to_cpu(ex->ee_block);
es.es_len = ext4_ext_get_actual_len(ex);
es.es_pblk = ext4_ext_pblock(ex);
- if (ext4_ext_is_uninitialized(ex))
+ if (ext4_ext_is_unwritten(ex))
flags |= FIEMAP_EXTENT_UNWRITTEN;
}
@@ -2535,7 +2533,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
* extent, we have to mark the cluster as used (store negative
* cluster number in partial_cluster).
*/
- unaligned = pblk & (sbi->s_cluster_ratio - 1);
+ unaligned = EXT4_PBLK_COFF(sbi, pblk);
if (unaligned && (ee_len == num) &&
(*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
*partial_cluster = EXT4_B2C(sbi, pblk);
@@ -2579,7 +2577,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
unsigned num;
ext4_lblk_t ex_ee_block;
unsigned short ex_ee_len;
- unsigned uninitialized = 0;
+ unsigned unwritten = 0;
struct ext4_extent *ex;
ext4_fsblk_t pblk;
@@ -2600,18 +2598,39 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
ex_ee_block = le32_to_cpu(ex->ee_block);
ex_ee_len = ext4_ext_get_actual_len(ex);
+ /*
+ * If we're starting with an extent other than the last one in the
+ * node, we need to see if it shares a cluster with the extent to
+ * the right (towards the end of the file). If its leftmost cluster
+ * is this extent's rightmost cluster and it is not cluster aligned,
+ * we'll mark it as a partial that is not to be deallocated.
+ */
+
+ if (ex != EXT_LAST_EXTENT(eh)) {
+ ext4_fsblk_t current_pblk, right_pblk;
+ long long current_cluster, right_cluster;
+
+ current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1;
+ current_cluster = (long long)EXT4_B2C(sbi, current_pblk);
+ right_pblk = ext4_ext_pblock(ex + 1);
+ right_cluster = (long long)EXT4_B2C(sbi, right_pblk);
+ if (current_cluster == right_cluster &&
+ EXT4_PBLK_COFF(sbi, right_pblk))
+ *partial_cluster = -right_cluster;
+ }
+
trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster);
while (ex >= EXT_FIRST_EXTENT(eh) &&
ex_ee_block + ex_ee_len > start) {
- if (ext4_ext_is_uninitialized(ex))
- uninitialized = 1;
+ if (ext4_ext_is_unwritten(ex))
+ unwritten = 1;
else
- uninitialized = 0;
+ unwritten = 0;
ext_debug("remove ext %u:[%d]%d\n", ex_ee_block,
- uninitialized, ex_ee_len);
+ unwritten, ex_ee_len);
path[depth].p_ext = ex;
a = ex_ee_block > start ? ex_ee_block : start;
@@ -2629,7 +2648,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
* accidentally freeing it later on
*/
pblk = ext4_ext_pblock(ex);
- if (pblk & (sbi->s_cluster_ratio - 1))
+ if (EXT4_PBLK_COFF(sbi, pblk))
*partial_cluster =
-((long long)EXT4_B2C(sbi, pblk));
ex--;
@@ -2683,11 +2702,11 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
ex->ee_len = cpu_to_le16(num);
/*
- * Do not mark uninitialized if all the blocks in the
+ * Do not mark unwritten if all the blocks in the
* extent have been removed.
*/
- if (uninitialized && num)
- ext4_ext_mark_uninitialized(ex);
+ if (unwritten && num)
+ ext4_ext_mark_unwritten(ex);
/*
* If the extent was completely released,
* we need to remove it from the leaf
@@ -2725,10 +2744,15 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
err = ext4_ext_correct_indexes(handle, inode, path);
/*
- * Free the partial cluster only if the current extent does not
- * reference it. Otherwise we might free used cluster.
+ * If there's a partial cluster and at least one extent remains in
+ * the leaf, free the partial cluster if it isn't shared with the
+ * current extent. If there's a partial cluster and no extents
+ * remain in the leaf, it can't be freed here. It can only be
+ * freed when it's possible to determine if it's not shared with
+ * any other extent - when the next leaf is processed or when space
+ * removal is complete.
*/
- if (*partial_cluster > 0 &&
+ if (*partial_cluster > 0 && eh->eh_entries &&
(EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
*partial_cluster)) {
int flags = get_default_free_blocks_flags(inode);
@@ -2831,9 +2855,9 @@ again:
end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
int split_flag = 0;
- if (ext4_ext_is_uninitialized(ex))
- split_flag = EXT4_EXT_MARK_UNINIT1 |
- EXT4_EXT_MARK_UNINIT2;
+ if (ext4_ext_is_unwritten(ex))
+ split_flag = EXT4_EXT_MARK_UNWRIT1 |
+ EXT4_EXT_MARK_UNWRIT2;
/*
* Split the extent in two so that 'end' is the last
@@ -3090,7 +3114,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
* @path: the path to the extent
* @split: the logical block where the extent is splitted.
* @split_flags: indicates if the extent could be zeroout if split fails, and
- * the states(init or uninit) of new extents.
+ * the states(init or unwritten) of new extents.
* @flags: flags used to insert new extent to extent tree.
*
*
@@ -3132,10 +3156,10 @@ static int ext4_split_extent_at(handle_t *handle,
newblock = split - ee_block + ext4_ext_pblock(ex);
BUG_ON(split < ee_block || split >= (ee_block + ee_len));
- BUG_ON(!ext4_ext_is_uninitialized(ex) &&
+ BUG_ON(!ext4_ext_is_unwritten(ex) &&
split_flag & (EXT4_EXT_MAY_ZEROOUT |
- EXT4_EXT_MARK_UNINIT1 |
- EXT4_EXT_MARK_UNINIT2));
+ EXT4_EXT_MARK_UNWRIT1 |
+ EXT4_EXT_MARK_UNWRIT2));
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -3147,8 +3171,8 @@ static int ext4_split_extent_at(handle_t *handle,
* then we just change the state of the extent, and splitting
* is not needed.
*/
- if (split_flag & EXT4_EXT_MARK_UNINIT2)
- ext4_ext_mark_uninitialized(ex);
+ if (split_flag & EXT4_EXT_MARK_UNWRIT2)
+ ext4_ext_mark_unwritten(ex);
else
ext4_ext_mark_initialized(ex);
@@ -3162,8 +3186,8 @@ static int ext4_split_extent_at(handle_t *handle,
/* case a */
memcpy(&orig_ex, ex, sizeof(orig_ex));
ex->ee_len = cpu_to_le16(split - ee_block);
- if (split_flag & EXT4_EXT_MARK_UNINIT1)
- ext4_ext_mark_uninitialized(ex);
+ if (split_flag & EXT4_EXT_MARK_UNWRIT1)
+ ext4_ext_mark_unwritten(ex);
/*
* path may lead to new leaf, not to original leaf any more
@@ -3177,8 +3201,8 @@ static int ext4_split_extent_at(handle_t *handle,
ex2->ee_block = cpu_to_le32(split);
ex2->ee_len = cpu_to_le16(ee_len - (split - ee_block));
ext4_ext_store_pblock(ex2, newblock);
- if (split_flag & EXT4_EXT_MARK_UNINIT2)
- ext4_ext_mark_uninitialized(ex2);
+ if (split_flag & EXT4_EXT_MARK_UNWRIT2)
+ ext4_ext_mark_unwritten(ex2);
err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) {
@@ -3255,7 +3279,7 @@ static int ext4_split_extent(handle_t *handle,
struct ext4_extent *ex;
unsigned int ee_len, depth;
int err = 0;
- int uninitialized;
+ int unwritten;
int split_flag1, flags1;
int allocated = map->m_len;
@@ -3263,14 +3287,14 @@ static int ext4_split_extent(handle_t *handle,
ex = path[depth].p_ext;
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
- uninitialized = ext4_ext_is_uninitialized(ex);
+ unwritten = ext4_ext_is_unwritten(ex);
if (map->m_lblk + map->m_len < ee_block + ee_len) {
split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT;
flags1 = flags | EXT4_GET_BLOCKS_PRE_IO;
- if (uninitialized)
- split_flag1 |= EXT4_EXT_MARK_UNINIT1 |
- EXT4_EXT_MARK_UNINIT2;
+ if (unwritten)
+ split_flag1 |= EXT4_EXT_MARK_UNWRIT1 |
+ EXT4_EXT_MARK_UNWRIT2;
if (split_flag & EXT4_EXT_DATA_VALID2)
split_flag1 |= EXT4_EXT_DATA_VALID1;
err = ext4_split_extent_at(handle, inode, path,
@@ -3290,15 +3314,20 @@ static int ext4_split_extent(handle_t *handle,
return PTR_ERR(path);
depth = ext_depth(inode);
ex = path[depth].p_ext;
- uninitialized = ext4_ext_is_uninitialized(ex);
+ if (!ex) {
+ EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+ (unsigned long) map->m_lblk);
+ return -EIO;
+ }
+ unwritten = ext4_ext_is_unwritten(ex);
split_flag1 = 0;
if (map->m_lblk >= ee_block) {
split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
- if (uninitialized) {
- split_flag1 |= EXT4_EXT_MARK_UNINIT1;
+ if (unwritten) {
+ split_flag1 |= EXT4_EXT_MARK_UNWRIT1;
split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT |
- EXT4_EXT_MARK_UNINIT2);
+ EXT4_EXT_MARK_UNWRIT2);
}
err = ext4_split_extent_at(handle, inode, path,
map->m_lblk, split_flag1, flags);
@@ -3313,16 +3342,16 @@ out:
/*
* This function is called by ext4_ext_map_blocks() if someone tries to write
- * to an uninitialized extent. It may result in splitting the uninitialized
+ * to an unwritten extent. It may result in splitting the unwritten
* extent into multiple extents (up to three - one initialized and two
- * uninitialized).
+ * unwritten).
* There are three possibilities:
* a> There is no split required: Entire extent should be initialized
* b> Splits in two extents: Write is happening at either end of the extent
* c> Splits in three extents: Somone is writing in middle of the extent
*
* Pre-conditions:
- * - The extent pointed to by 'path' is uninitialized.
+ * - The extent pointed to by 'path' is unwritten.
* - The extent pointed to by 'path' contains a superset
* of the logical span [map->m_lblk, map->m_lblk + map->m_len).
*
@@ -3368,12 +3397,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
/* Pre-conditions */
- BUG_ON(!ext4_ext_is_uninitialized(ex));
+ BUG_ON(!ext4_ext_is_unwritten(ex));
BUG_ON(!in_range(map->m_lblk, ee_block, ee_len));
/*
* Attempt to transfer newly initialized blocks from the currently
- * uninitialized extent to its neighbor. This is much cheaper
+ * unwritten extent to its neighbor. This is much cheaper
* than an insertion followed by a merge as those involve costly
* memmove() calls. Transferring to the left is the common case in
* steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
@@ -3409,7 +3438,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
* - C4: abut_ex can receive the additional blocks without
* overflowing the (initialized) length limit.
*/
- if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/
+ if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/
((prev_lblk + prev_len) == ee_block) && /*C2*/
((prev_pblk + prev_len) == ee_pblk) && /*C3*/
(prev_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
@@ -3424,7 +3453,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
ex->ee_block = cpu_to_le32(ee_block + map_len);
ext4_ext_store_pblock(ex, ee_pblk + map_len);
ex->ee_len = cpu_to_le16(ee_len - map_len);
- ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+ ext4_ext_mark_unwritten(ex); /* Restore the flag */
/* Extend abut_ex by 'map_len' blocks */
abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
@@ -3455,7 +3484,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
* - C4: abut_ex can receive the additional blocks without
* overflowing the (initialized) length limit.
*/
- if ((!ext4_ext_is_uninitialized(abut_ex)) && /*C1*/
+ if ((!ext4_ext_is_unwritten(abut_ex)) && /*C1*/
((map->m_lblk + map_len) == next_lblk) && /*C2*/
((ee_pblk + ee_len) == next_pblk) && /*C3*/
(next_len < (EXT_INIT_MAX_LEN - map_len))) { /*C4*/
@@ -3470,7 +3499,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
ex->ee_len = cpu_to_le16(ee_len - map_len);
- ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+ ext4_ext_mark_unwritten(ex); /* Restore the flag */
/* Extend abut_ex by 'map_len' blocks */
abut_ex->ee_len = cpu_to_le16(next_len + map_len);
@@ -3492,7 +3521,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
WARN_ON(map->m_lblk < ee_block);
/*
* It is safe to convert extent to initialized via explicit
- * zeroout only if extent is fully insde i_size or new_size.
+ * zeroout only if extent is fully inside i_size or new_size.
*/
split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
@@ -3575,26 +3604,28 @@ out:
/*
* This function is called by ext4_ext_map_blocks() from
* ext4_get_blocks_dio_write() when DIO to write
- * to an uninitialized extent.
+ * to an unwritten extent.
*
- * Writing to an uninitialized extent may result in splitting the uninitialized
- * extent into multiple initialized/uninitialized extents (up to three)
+ * Writing to an unwritten extent may result in splitting the unwritten
+ * extent into multiple initialized/unwritten extents (up to three)
* There are three possibilities:
- * a> There is no split required: Entire extent should be uninitialized
+ * a> There is no split required: Entire extent should be unwritten
* b> Splits in two extents: Write is happening at either end of the extent
* c> Splits in three extents: Somone is writing in middle of the extent
*
+ * This works the same way in the case of initialized -> unwritten conversion.
+ *
* One of more index blocks maybe needed if the extent tree grow after
- * the uninitialized extent split. To prevent ENOSPC occur at the IO
- * complete, we need to split the uninitialized extent before DIO submit
- * the IO. The uninitialized extent called at this time will be split
- * into three uninitialized extent(at most). After IO complete, the part
+ * the unwritten extent split. To prevent ENOSPC occur at the IO
+ * complete, we need to split the unwritten extent before DIO submit
+ * the IO. The unwritten extent called at this time will be split
+ * into three unwritten extent(at most). After IO complete, the part
* being filled will be convert to initialized by the end_io callback function
* via ext4_convert_unwritten_extents().
*
- * Returns the size of uninitialized extent to be written on success.
+ * Returns the size of unwritten extent to be written on success.
*/
-static int ext4_split_unwritten_extents(handle_t *handle,
+static int ext4_split_convert_extents(handle_t *handle,
struct inode *inode,
struct ext4_map_blocks *map,
struct ext4_ext_path *path,
@@ -3606,9 +3637,9 @@ static int ext4_split_unwritten_extents(handle_t *handle,
unsigned int ee_len;
int split_flag = 0, depth;
- ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
- "block %llu, max_blocks %u\n", inode->i_ino,
- (unsigned long long)map->m_lblk, map->m_len);
+ ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n",
+ __func__, inode->i_ino,
+ (unsigned long long)map->m_lblk, map->m_len);
eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
inode->i_sb->s_blocksize_bits;
@@ -3623,14 +3654,79 @@ static int ext4_split_unwritten_extents(handle_t *handle,
ee_block = le32_to_cpu(ex->ee_block);
ee_len = ext4_ext_get_actual_len(ex);
- split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
- split_flag |= EXT4_EXT_MARK_UNINIT2;
- if (flags & EXT4_GET_BLOCKS_CONVERT)
- split_flag |= EXT4_EXT_DATA_VALID2;
+ /* Convert to unwritten */
+ if (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN) {
+ split_flag |= EXT4_EXT_DATA_VALID1;
+ /* Convert to initialized */
+ } else if (flags & EXT4_GET_BLOCKS_CONVERT) {
+ split_flag |= ee_block + ee_len <= eof_block ?
+ EXT4_EXT_MAY_ZEROOUT : 0;
+ split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2);
+ }
flags |= EXT4_GET_BLOCKS_PRE_IO;
return ext4_split_extent(handle, inode, path, map, split_flag, flags);
}
+static int ext4_convert_initialized_extents(handle_t *handle,
+ struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path)
+{
+ struct ext4_extent *ex;
+ ext4_lblk_t ee_block;
+ unsigned int ee_len;
+ int depth;
+ int err = 0;
+
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ ee_block = le32_to_cpu(ex->ee_block);
+ ee_len = ext4_ext_get_actual_len(ex);
+
+ ext_debug("%s: inode %lu, logical"
+ "block %llu, max_blocks %u\n", __func__, inode->i_ino,
+ (unsigned long long)ee_block, ee_len);
+
+ if (ee_block != map->m_lblk || ee_len > map->m_len) {
+ err = ext4_split_convert_extents(handle, inode, map, path,
+ EXT4_GET_BLOCKS_CONVERT_UNWRITTEN);
+ if (err < 0)
+ goto out;
+ ext4_ext_drop_refs(path);
+ path = ext4_ext_find_extent(inode, map->m_lblk, path, 0);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
+ goto out;
+ }
+ depth = ext_depth(inode);
+ ex = path[depth].p_ext;
+ if (!ex) {
+ EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+ (unsigned long) map->m_lblk);
+ err = -EIO;
+ goto out;
+ }
+ }
+
+ err = ext4_ext_get_access(handle, inode, path + depth);
+ if (err)
+ goto out;
+ /* first mark the extent as unwritten */
+ ext4_ext_mark_unwritten(ex);
+
+ /* note: ext4_ext_correct_indexes() isn't needed here because
+ * borders are not changed
+ */
+ ext4_ext_try_to_merge(handle, inode, path, ex);
+
+ /* Mark modified extent as dirty */
+ err = ext4_ext_dirty(handle, inode, path + path->p_depth);
+out:
+ ext4_ext_show_leaf(inode, path);
+ return err;
+}
+
+
static int ext4_convert_unwritten_extents_endio(handle_t *handle,
struct inode *inode,
struct ext4_map_blocks *map,
@@ -3664,8 +3760,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle,
inode->i_ino, (unsigned long long)ee_block, ee_len,
(unsigned long long)map->m_lblk, map->m_len);
#endif
- err = ext4_split_unwritten_extents(handle, inode, map, path,
- EXT4_GET_BLOCKS_CONVERT);
+ err = ext4_split_convert_extents(handle, inode, map, path,
+ EXT4_GET_BLOCKS_CONVERT);
if (err < 0)
goto out;
ext4_ext_drop_refs(path);
@@ -3784,7 +3880,7 @@ int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_lblk_t lblk_start, lblk_end;
- lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
+ lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
@@ -3843,9 +3939,9 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
/* Check towards left side */
- c_offset = lblk_start & (sbi->s_cluster_ratio - 1);
+ c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
if (c_offset) {
- lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
+ lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
lblk_to = lblk_from + c_offset - 1;
if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
@@ -3853,7 +3949,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
}
/* Now check towards right. */
- c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1);
+ c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
if (allocated_clusters && c_offset) {
lblk_from = lblk_start + num_blks;
lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
@@ -3866,7 +3962,39 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
}
static int
-ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
+ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map,
+ struct ext4_ext_path *path, int flags,
+ unsigned int allocated, ext4_fsblk_t newblock)
+{
+ int ret = 0;
+ int err = 0;
+
+ /*
+ * Make sure that the extent is no bigger than we support with
+ * unwritten extent
+ */
+ if (map->m_len > EXT_UNWRITTEN_MAX_LEN)
+ map->m_len = EXT_UNWRITTEN_MAX_LEN / 2;
+
+ ret = ext4_convert_initialized_extents(handle, inode, map,
+ path);
+ if (ret >= 0) {
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+ err = check_eofblocks_fl(handle, inode, map->m_lblk,
+ path, map->m_len);
+ } else
+ err = ret;
+ map->m_flags |= EXT4_MAP_UNWRITTEN;
+ if (allocated > map->m_len)
+ allocated = map->m_len;
+ map->m_len = allocated;
+
+ return err ? err : allocated;
+}
+
+static int
+ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
struct ext4_ext_path *path, int flags,
unsigned int allocated, ext4_fsblk_t newblock)
@@ -3875,25 +4003,25 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
int err = 0;
ext4_io_end_t *io = ext4_inode_aio(inode);
- ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical "
+ ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
"block %llu, max_blocks %u, flags %x, allocated %u\n",
inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
flags, allocated);
ext4_ext_show_leaf(inode, path);
/*
- * When writing into uninitialized space, we should not fail to
+ * When writing into unwritten space, we should not fail to
* allocate metadata blocks for the new extent block if needed.
*/
flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;
- trace_ext4_ext_handle_uninitialized_extents(inode, map, flags,
+ trace_ext4_ext_handle_unwritten_extents(inode, map, flags,
allocated, newblock);
/* get_block() before submit the IO, split the extent */
- if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
- ret = ext4_split_unwritten_extents(handle, inode, map,
- path, flags);
+ if (flags & EXT4_GET_BLOCKS_PRE_IO) {
+ ret = ext4_split_convert_extents(handle, inode, map,
+ path, flags | EXT4_GET_BLOCKS_CONVERT);
if (ret <= 0)
goto out;
/*
@@ -3906,12 +4034,10 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
else
ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
map->m_flags |= EXT4_MAP_UNWRITTEN;
- if (ext4_should_dioread_nolock(inode))
- map->m_flags |= EXT4_MAP_UNINIT;
goto out;
}
/* IO end_io complete, convert the filled extent to written */
- if ((flags & EXT4_GET_BLOCKS_CONVERT)) {
+ if (flags & EXT4_GET_BLOCKS_CONVERT) {
ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
path);
if (ret >= 0) {
@@ -3921,6 +4047,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
} else
err = ret;
map->m_flags |= EXT4_MAP_MAPPED;
+ map->m_pblk = newblock;
if (allocated > map->m_len)
allocated = map->m_len;
map->m_len = allocated;
@@ -3931,7 +4058,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
* repeat fallocate creation request
* we already have an unwritten extent
*/
- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) {
+ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) {
map->m_flags |= EXT4_MAP_UNWRITTEN;
goto map_out;
}
@@ -4007,10 +4134,6 @@ out1:
map->m_pblk = newblock;
map->m_len = allocated;
out2:
- if (path) {
- ext4_ext_drop_refs(path);
- kfree(path);
- }
return err ? err : allocated;
}
@@ -4061,7 +4184,7 @@ static int get_implied_cluster_alloc(struct super_block *sb,
struct ext4_ext_path *path)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
+ ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
ext4_lblk_t ex_cluster_start, ex_cluster_end;
ext4_lblk_t rr_cluster_start;
ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
@@ -4079,8 +4202,7 @@ static int get_implied_cluster_alloc(struct super_block *sb,
(rr_cluster_start == ex_cluster_start)) {
if (rr_cluster_start == ex_cluster_end)
ee_start += ee_len - 1;
- map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) +
- c_offset;
+ map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
map->m_len = min(map->m_len,
(unsigned) sbi->s_cluster_ratio - c_offset);
/*
@@ -4143,7 +4265,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_extent newex, *ex, *ex2;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_fsblk_t newblock = 0;
- int free_on_err = 0, err = 0, depth;
+ int free_on_err = 0, err = 0, depth, ret;
unsigned int allocated = 0, offset = 0;
unsigned int allocated_clusters = 0;
struct ext4_allocation_request ar;
@@ -4185,8 +4307,9 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t ee_start = ext4_ext_pblock(ex);
unsigned short ee_len;
+
/*
- * Uninitialized extents are treated as holes, except that
+ * unwritten extents are treated as holes, except that
* we split out initialized portions during a write.
*/
ee_len = ext4_ext_get_actual_len(ex);
@@ -4201,13 +4324,27 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
ee_block, ee_len, newblock);
- if (!ext4_ext_is_uninitialized(ex))
+ /*
+ * If the extent is initialized check whether the
+ * caller wants to convert it to unwritten.
+ */
+ if ((!ext4_ext_is_unwritten(ex)) &&
+ (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
+ allocated = ext4_ext_convert_initialized_extent(
+ handle, inode, map, path, flags,
+ allocated, newblock);
+ goto out2;
+ } else if (!ext4_ext_is_unwritten(ex))
goto out;
- allocated = ext4_ext_handle_uninitialized_extents(
+ ret = ext4_ext_handle_unwritten_extents(
handle, inode, map, path, flags,
allocated, newblock);
- goto out3;
+ if (ret < 0)
+ err = ret;
+ else
+ allocated = ret;
+ goto out2;
}
}
@@ -4234,7 +4371,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
*/
map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
newex.ee_block = cpu_to_le32(map->m_lblk);
- cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
+ cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
/*
* If we are doing bigalloc, check to see if the extent returned
@@ -4272,15 +4409,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
/*
* See if request is beyond maximum number of blocks we can have in
* a single extent. For an initialized extent this limit is
- * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
- * EXT_UNINIT_MAX_LEN.
+ * EXT_INIT_MAX_LEN and for an unwritten extent this limit is
+ * EXT_UNWRITTEN_MAX_LEN.
*/
if (map->m_len > EXT_INIT_MAX_LEN &&
- !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
+ !(flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
map->m_len = EXT_INIT_MAX_LEN;
- else if (map->m_len > EXT_UNINIT_MAX_LEN &&
- (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
- map->m_len = EXT_UNINIT_MAX_LEN;
+ else if (map->m_len > EXT_UNWRITTEN_MAX_LEN &&
+ (flags & EXT4_GET_BLOCKS_UNWRIT_EXT))
+ map->m_len = EXT_UNWRITTEN_MAX_LEN;
/* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
newex.ee_len = cpu_to_le16(map->m_len);
@@ -4302,7 +4439,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
* needed so that future calls to get_implied_cluster_alloc()
* work correctly.
*/
- offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
+ offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
ar.goal -= offset;
ar.logical -= offset;
@@ -4328,21 +4465,19 @@ got_allocated_blocks:
/* try to insert new extent into found leaf and return */
ext4_ext_store_pblock(&newex, newblock + offset);
newex.ee_len = cpu_to_le16(ar.len);
- /* Mark uninitialized */
- if (flags & EXT4_GET_BLOCKS_UNINIT_EXT){
- ext4_ext_mark_uninitialized(&newex);
+ /* Mark unwritten */
+ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
+ ext4_ext_mark_unwritten(&newex);
map->m_flags |= EXT4_MAP_UNWRITTEN;
/*
* io_end structure was created for every IO write to an
- * uninitialized extent. To avoid unnecessary conversion,
+ * unwritten extent. To avoid unnecessary conversion,
* here we flag the IO that really needs the conversion.
* For non asycn direct IO case, flag the inode state
* that we need to perform conversion when IO is done.
*/
- if ((flags & EXT4_GET_BLOCKS_PRE_IO))
+ if (flags & EXT4_GET_BLOCKS_PRE_IO)
set_unwritten = 1;
- if (ext4_should_dioread_nolock(inode))
- map->m_flags |= EXT4_MAP_UNINIT;
}
err = 0;
@@ -4469,9 +4604,9 @@ got_allocated_blocks:
/*
* Cache the extent and update transaction to commit on fdatasync only
- * when it is _not_ an uninitialized extent.
+ * when it is _not_ an unwritten extent.
*/
- if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
+ if ((flags & EXT4_GET_BLOCKS_UNWRIT_EXT) == 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
else
ext4_update_inode_fsync_trans(handle, inode, 0);
@@ -4488,7 +4623,6 @@ out2:
kfree(path);
}
-out3:
trace_ext4_ext_map_blocks_exit(inode, flags, map,
err ? err : allocated);
ext4_es_lru_add(inode);
@@ -4529,34 +4663,210 @@ retry:
ext4_std_error(inode->i_sb, err);
}
-static void ext4_falloc_update_inode(struct inode *inode,
- int mode, loff_t new_size, int update_ctime)
+static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
+ ext4_lblk_t len, int flags, int mode)
+{
+ struct inode *inode = file_inode(file);
+ handle_t *handle;
+ int ret = 0;
+ int ret2 = 0;
+ int retries = 0;
+ struct ext4_map_blocks map;
+ unsigned int credits;
+
+ map.m_lblk = offset;
+ /*
+ * Don't normalize the request if it can fit in one extent so
+ * that it doesn't get unnecessarily split into multiple
+ * extents.
+ */
+ if (len <= EXT_UNWRITTEN_MAX_LEN)
+ flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
+
+ /*
+ * credits to insert 1 extent into extent tree
+ */
+ credits = ext4_chunk_trans_blocks(inode, len);
+
+retry:
+ while (ret >= 0 && ret < len) {
+ map.m_lblk = map.m_lblk + ret;
+ map.m_len = len = len - ret;
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
+ credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ break;
+ }
+ ret = ext4_map_blocks(handle, inode, &map, flags);
+ if (ret <= 0) {
+ ext4_debug("inode #%lu: block %u: len %u: "
+ "ext4_ext_map_blocks returned %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret);
+ ext4_mark_inode_dirty(handle, inode);
+ ret2 = ext4_journal_stop(handle);
+ break;
+ }
+ ret2 = ext4_journal_stop(handle);
+ if (ret2)
+ break;
+ }
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries)) {
+ ret = 0;
+ goto retry;
+ }
+
+ return ret > 0 ? ret2 : ret;
+}
+
+static long ext4_zero_range(struct file *file, loff_t offset,
+ loff_t len, int mode)
{
- struct timespec now;
+ struct inode *inode = file_inode(file);
+ handle_t *handle = NULL;
+ unsigned int max_blocks;
+ loff_t new_size = 0;
+ int ret = 0;
+ int flags;
+ int partial;
+ loff_t start, end;
+ ext4_lblk_t lblk;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned int blkbits = inode->i_blkbits;
+
+ trace_ext4_zero_range(inode, offset, len, mode);
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
- if (update_ctime) {
- now = current_fs_time(inode->i_sb);
- if (!timespec_equal(&inode->i_ctime, &now))
- inode->i_ctime = now;
+ /* Call ext4_force_commit to flush all data in case of data=journal. */
+ if (ext4_should_journal_data(inode)) {
+ ret = ext4_force_commit(inode->i_sb);
+ if (ret)
+ return ret;
}
+
/*
- * Update only when preallocation was requested beyond
- * the file size.
+ * Write out all dirty pages to avoid race conditions
+ * Then release them.
*/
- if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+ if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+ ret = filemap_write_and_wait_range(mapping, offset,
+ offset + len - 1);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Round up offset. This is not fallocate, we neet to zero out
+ * blocks, so convert interior block aligned part of the range to
+ * unwritten and possibly manually zero out unaligned parts of the
+ * range.
+ */
+ start = round_up(offset, 1 << blkbits);
+ end = round_down((offset + len), 1 << blkbits);
+
+ if (start < offset || end > offset + len)
+ return -EINVAL;
+ partial = (offset + len) & ((1 << blkbits) - 1);
+
+ lblk = start >> blkbits;
+ max_blocks = (end >> blkbits);
+ if (max_blocks < lblk)
+ max_blocks = 0;
+ else
+ max_blocks -= lblk;
+
+ flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
+ EXT4_GET_BLOCKS_CONVERT_UNWRITTEN;
+ if (mode & FALLOC_FL_KEEP_SIZE)
+ flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
+
+ mutex_lock(&inode->i_mutex);
+
+ /*
+ * Indirect files do not support unwritten extnets
+ */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ ret = -EOPNOTSUPP;
+ goto out_mutex;
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + len > i_size_read(inode)) {
+ new_size = offset + len;
+ ret = inode_newsize_ok(inode, new_size);
+ if (ret)
+ goto out_mutex;
+ /*
+ * If we have a partial block after EOF we have to allocate
+ * the entire block.
+ */
+ if (partial)
+ max_blocks += 1;
+ }
+
+ if (max_blocks > 0) {
+
+ /* Now release the pages and zero block aligned part of pages*/
+ truncate_pagecache_range(inode, start, end - 1);
+
+ /* Wait all existing dio workers, newcomers will block on i_mutex */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
+ /*
+ * Remove entire range from the extent status tree.
+ */
+ ret = ext4_es_remove_extent(inode, lblk, max_blocks);
+ if (ret)
+ goto out_dio;
+
+ ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags,
+ mode);
+ if (ret)
+ goto out_dio;
+ }
+
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, 4);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ ext4_std_error(inode->i_sb, ret);
+ goto out_dio;
+ }
+
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+
+ if (new_size) {
if (new_size > i_size_read(inode))
i_size_write(inode, new_size);
if (new_size > EXT4_I(inode)->i_disksize)
ext4_update_i_disksize(inode, new_size);
} else {
/*
- * Mark that we allocate beyond EOF so the subsequent truncate
- * can proceed even if the new size is the same as i_size.
- */
- if (new_size > i_size_read(inode))
+ * Mark that we allocate beyond EOF so the subsequent truncate
+ * can proceed even if the new size is the same as i_size.
+ */
+ if ((offset + len) > i_size_read(inode))
ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
}
+ ext4_mark_inode_dirty(handle, inode);
+
+ /* Zero out partial block at the edges of the range */
+ ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+
+ if (file->f_flags & O_SYNC)
+ ext4_handle_sync(handle);
+
+ ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
}
/*
@@ -4570,17 +4880,17 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
handle_t *handle;
- loff_t new_size;
+ loff_t new_size = 0;
unsigned int max_blocks;
int ret = 0;
- int ret2 = 0;
- int retries = 0;
int flags;
- struct ext4_map_blocks map;
- unsigned int credits, blkbits = inode->i_blkbits;
+ ext4_lblk_t lblk;
+ struct timespec tv;
+ unsigned int blkbits = inode->i_blkbits;
/* Return error if mode is not supported */
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
return -EOPNOTSUPP;
if (mode & FALLOC_FL_PUNCH_HOLE)
@@ -4597,83 +4907,69 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EOPNOTSUPP;
+ if (mode & FALLOC_FL_COLLAPSE_RANGE)
+ return ext4_collapse_range(inode, offset, len);
+
+ if (mode & FALLOC_FL_ZERO_RANGE)
+ return ext4_zero_range(file, offset, len, mode);
+
trace_ext4_fallocate_enter(inode, offset, len, mode);
- map.m_lblk = offset >> blkbits;
+ lblk = offset >> blkbits;
/*
* We can't just convert len to max_blocks because
* If blocksize = 4096 offset = 3072 and len = 2048
*/
max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
- - map.m_lblk;
- /*
- * credits to insert 1 extent into extent tree
- */
- credits = ext4_chunk_trans_blocks(inode, max_blocks);
- mutex_lock(&inode->i_mutex);
- ret = inode_newsize_ok(inode, (len + offset));
- if (ret) {
- mutex_unlock(&inode->i_mutex);
- trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
- return ret;
- }
- flags = EXT4_GET_BLOCKS_CREATE_UNINIT_EXT;
+ - lblk;
+
+ flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
if (mode & FALLOC_FL_KEEP_SIZE)
flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
- /*
- * Don't normalize the request if it can fit in one extent so
- * that it doesn't get unnecessarily split into multiple
- * extents.
- */
- if (len <= EXT_UNINIT_MAX_LEN << blkbits)
- flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
-retry:
- while (ret >= 0 && ret < max_blocks) {
- map.m_lblk = map.m_lblk + ret;
- map.m_len = max_blocks = max_blocks - ret;
- handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
- credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- break;
- }
- ret = ext4_map_blocks(handle, inode, &map, flags);
- if (ret <= 0) {
-#ifdef EXT4FS_DEBUG
- ext4_warning(inode->i_sb,
- "inode #%lu: block %u: len %u: "
- "ext4_ext_map_blocks returned %d",
- inode->i_ino, map.m_lblk,
- map.m_len, ret);
-#endif
- ext4_mark_inode_dirty(handle, inode);
- ret2 = ext4_journal_stop(handle);
- break;
- }
- if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
- blkbits) >> blkbits))
- new_size = offset + len;
- else
- new_size = ((loff_t) map.m_lblk + ret) << blkbits;
+ mutex_lock(&inode->i_mutex);
- ext4_falloc_update_inode(inode, mode, new_size,
- (map.m_flags & EXT4_MAP_NEW));
- ext4_mark_inode_dirty(handle, inode);
- if ((file->f_flags & O_SYNC) && ret >= max_blocks)
- ext4_handle_sync(handle);
- ret2 = ext4_journal_stop(handle);
- if (ret2)
- break;
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + len > i_size_read(inode)) {
+ new_size = offset + len;
+ ret = inode_newsize_ok(inode, new_size);
+ if (ret)
+ goto out;
}
- if (ret == -ENOSPC &&
- ext4_should_retry_alloc(inode->i_sb, &retries)) {
- ret = 0;
- goto retry;
+
+ ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode);
+ if (ret)
+ goto out;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle))
+ goto out;
+
+ tv = inode->i_ctime = ext4_current_time(inode);
+
+ if (new_size) {
+ if (new_size > i_size_read(inode)) {
+ i_size_write(inode, new_size);
+ inode->i_mtime = tv;
+ }
+ if (new_size > EXT4_I(inode)->i_disksize)
+ ext4_update_i_disksize(inode, new_size);
+ } else {
+ /*
+ * Mark that we allocate beyond EOF so the subsequent truncate
+ * can proceed even if the new size is the same as i_size.
+ */
+ if ((offset + len) > i_size_read(inode))
+ ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
}
+ ext4_mark_inode_dirty(handle, inode);
+ if (file->f_flags & O_SYNC)
+ ext4_handle_sync(handle);
+
+ ext4_journal_stop(handle);
+out:
mutex_unlock(&inode->i_mutex);
- trace_ext4_fallocate_exit(inode, offset, max_blocks,
- ret > 0 ? ret2 : ret);
- return ret > 0 ? ret2 : ret;
+ trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
+ return ret;
}
/*
@@ -4884,3 +5180,333 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
ext4_es_lru_add(inode);
return error;
}
+
+/*
+ * ext4_access_path:
+ * Function to access the path buffer for marking it dirty.
+ * It also checks if there are sufficient credits left in the journal handle
+ * to update path.
+ */
+static int
+ext4_access_path(handle_t *handle, struct inode *inode,
+ struct ext4_ext_path *path)
+{
+ int credits, err;
+
+ if (!ext4_handle_valid(handle))
+ return 0;
+
+ /*
+ * Check if need to extend journal credits
+ * 3 for leaf, sb, and inode plus 2 (bmap and group
+ * descriptor) for each block group; assume two block
+ * groups
+ */
+ if (handle->h_buffer_credits < 7) {
+ credits = ext4_writepage_trans_blocks(inode);
+ err = ext4_ext_truncate_extend_restart(handle, inode, credits);
+ /* EAGAIN is success */
+ if (err && err != -EAGAIN)
+ return err;
+ }
+
+ err = ext4_ext_get_access(handle, inode, path);
+ return err;
+}
+
+/*
+ * ext4_ext_shift_path_extents:
+ * Shift the extents of a path structure lying between path[depth].p_ext
+ * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
+ * from starting block for each extent.
+ */
+static int
+ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
+ struct inode *inode, handle_t *handle,
+ ext4_lblk_t *start)
+{
+ int depth, err = 0;
+ struct ext4_extent *ex_start, *ex_last;
+ bool update = 0;
+ depth = path->p_depth;
+
+ while (depth >= 0) {
+ if (depth == path->p_depth) {
+ ex_start = path[depth].p_ext;
+ if (!ex_start)
+ return -EIO;
+
+ ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
+ if (!ex_last)
+ return -EIO;
+
+ err = ext4_access_path(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
+ update = 1;
+
+ *start = le32_to_cpu(ex_last->ee_block) +
+ ext4_ext_get_actual_len(ex_last);
+
+ while (ex_start <= ex_last) {
+ le32_add_cpu(&ex_start->ee_block, -shift);
+ /* Try to merge to the left. */
+ if ((ex_start >
+ EXT_FIRST_EXTENT(path[depth].p_hdr)) &&
+ ext4_ext_try_to_merge_right(inode,
+ path, ex_start - 1))
+ ex_last--;
+ else
+ ex_start++;
+ }
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ if (--depth < 0 || !update)
+ break;
+ }
+
+ /* Update index too */
+ err = ext4_access_path(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
+ err = ext4_ext_dirty(handle, inode, path + depth);
+ if (err)
+ goto out;
+
+ /* we are done if current index is not a starting index */
+ if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr))
+ break;
+
+ depth--;
+ }
+
+out:
+ return err;
+}
+
+/*
+ * ext4_ext_shift_extents:
+ * All the extents which lies in the range from start to the last allocated
+ * block for the file are shifted downwards by shift blocks.
+ * On success, 0 is returned, error otherwise.
+ */
+static int
+ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
+ ext4_lblk_t start, ext4_lblk_t shift)
+{
+ struct ext4_ext_path *path;
+ int ret = 0, depth;
+ struct ext4_extent *extent;
+ ext4_lblk_t stop_block, current_block;
+ ext4_lblk_t ex_start, ex_end;
+
+ /* Let path point to the last extent */
+ path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+
+ depth = path->p_depth;
+ extent = path[depth].p_ext;
+ if (!extent) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ return ret;
+ }
+
+ stop_block = le32_to_cpu(extent->ee_block) +
+ ext4_ext_get_actual_len(extent);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ /* Nothing to shift, if hole is at the end of file */
+ if (start >= stop_block)
+ return ret;
+
+ /*
+ * Don't start shifting extents until we make sure the hole is big
+ * enough to accomodate the shift.
+ */
+ path = ext4_ext_find_extent(inode, start - 1, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ depth = path->p_depth;
+ extent = path[depth].p_ext;
+ if (extent) {
+ ex_start = le32_to_cpu(extent->ee_block);
+ ex_end = le32_to_cpu(extent->ee_block) +
+ ext4_ext_get_actual_len(extent);
+ } else {
+ ex_start = 0;
+ ex_end = 0;
+ }
+ ext4_ext_drop_refs(path);
+ kfree(path);
+
+ if ((start == ex_start && shift > ex_start) ||
+ (shift > start - ex_end))
+ return -EINVAL;
+
+ /* Its safe to start updating extents */
+ while (start < stop_block) {
+ path = ext4_ext_find_extent(inode, start, NULL, 0);
+ if (IS_ERR(path))
+ return PTR_ERR(path);
+ depth = path->p_depth;
+ extent = path[depth].p_ext;
+ if (!extent) {
+ EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
+ (unsigned long) start);
+ return -EIO;
+ }
+
+ current_block = le32_to_cpu(extent->ee_block);
+ if (start > current_block) {
+ /* Hole, move to the next extent */
+ ret = mext_next_extent(inode, path, &extent);
+ if (ret != 0) {
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ if (ret == 1)
+ ret = 0;
+ break;
+ }
+ }
+ ret = ext4_ext_shift_path_extents(path, shift, inode,
+ handle, &start);
+ ext4_ext_drop_refs(path);
+ kfree(path);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * ext4_collapse_range:
+ * This implements the fallocate's collapse range functionality for ext4
+ * Returns: 0 and non-zero on error.
+ */
+int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct super_block *sb = inode->i_sb;
+ ext4_lblk_t punch_start, punch_stop;
+ handle_t *handle;
+ unsigned int credits;
+ loff_t new_size, ioffset;
+ int ret;
+
+ /* Collapse range works only on fs block size aligned offsets. */
+ if (offset & (EXT4_BLOCK_SIZE(sb) - 1) ||
+ len & (EXT4_BLOCK_SIZE(sb) - 1))
+ return -EINVAL;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1)
+ return -EOPNOTSUPP;
+
+ trace_ext4_collapse_range(inode, offset, len);
+
+ punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+ punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb);
+
+ /* Call ext4_force_commit to flush all data in case of data=journal. */
+ if (ext4_should_journal_data(inode)) {
+ ret = ext4_force_commit(inode->i_sb);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * Need to round down offset to be aligned with page size boundary
+ * for page size > block size.
+ */
+ ioffset = round_down(offset, PAGE_SIZE);
+
+ /* Write out all dirty pages */
+ ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
+ LLONG_MAX);
+ if (ret)
+ return ret;
+
+ /* Take mutex lock */
+ mutex_lock(&inode->i_mutex);
+
+ /*
+ * There is no need to overlap collapse range with EOF, in which case
+ * it is effectively a truncate operation
+ */
+ if (offset + len >= i_size_read(inode)) {
+ ret = -EINVAL;
+ goto out_mutex;
+ }
+
+ /* Currently just for extent based files */
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ ret = -EOPNOTSUPP;
+ goto out_mutex;
+ }
+
+ truncate_pagecache(inode, ioffset);
+
+ /* Wait for existing dio to complete */
+ ext4_inode_block_unlocked_dio(inode);
+ inode_dio_wait(inode);
+
+ credits = ext4_writepage_trans_blocks(inode);
+ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_dio;
+ }
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_discard_preallocations(inode);
+
+ ret = ext4_es_remove_extent(inode, punch_start,
+ EXT_MAX_BLOCKS - punch_start);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
+
+ ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
+ ext4_discard_preallocations(inode);
+
+ ret = ext4_ext_shift_extents(inode, handle, punch_stop,
+ punch_stop - punch_start);
+ if (ret) {
+ up_write(&EXT4_I(inode)->i_data_sem);
+ goto out_stop;
+ }
+
+ new_size = i_size_read(inode) - len;
+ i_size_write(inode, new_size);
+ EXT4_I(inode)->i_disksize = new_size;
+
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+
+out_stop:
+ ext4_journal_stop(handle);
+out_dio:
+ ext4_inode_resume_unlocked_dio(inode);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
+ return ret;
+}
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 3981ff78395..0b7e28e7eaa 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -184,7 +184,7 @@ static void ext4_es_print_tree(struct inode *inode)
while (node) {
struct extent_status *es;
es = rb_entry(node, struct extent_status, rb_node);
- printk(KERN_DEBUG " [%u/%u) %llu %llx",
+ printk(KERN_DEBUG " [%u/%u) %llu %x",
es->es_lblk, es->es_len,
ext4_es_pblock(es), ext4_es_status(es));
node = rb_next(node);
@@ -344,8 +344,14 @@ static int ext4_es_can_be_merged(struct extent_status *es1,
if (ext4_es_status(es1) != ext4_es_status(es2))
return 0;
- if (((__u64) es1->es_len) + es2->es_len > 0xFFFFFFFFULL)
+ if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
+ pr_warn("ES assertion failed when merging extents. "
+ "The sum of lengths of es1 (%d) and es2 (%d) "
+ "is bigger than allowed file size (%d)\n",
+ es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
+ WARN_ON(1);
return 0;
+ }
if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
return 0;
@@ -433,7 +439,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
ee_start = ext4_ext_pblock(ex);
ee_len = ext4_ext_get_actual_len(ex);
- ee_status = ext4_ext_is_uninitialized(ex) ? 1 : 0;
+ ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
es_status = ext4_es_is_unwritten(es) ? 1 : 0;
/*
@@ -445,8 +451,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
pr_warn("ES insert assertion failed for "
"inode: %lu we can find an extent "
"at block [%d/%d/%llu/%c], but we "
- "want to add an delayed/hole extent "
- "[%d/%d/%llu/%llx]\n",
+ "want to add a delayed/hole extent "
+ "[%d/%d/%llu/%x]\n",
inode->i_ino, ee_block, ee_len,
ee_start, ee_status ? 'u' : 'w',
es->es_lblk, es->es_len,
@@ -486,8 +492,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
pr_warn("ES insert assertion failed for inode: %lu "
"can't find an extent at block %d but we want "
- "to add an written/unwritten extent "
- "[%d/%d/%llu/%llx]\n", inode->i_ino,
+ "to add a written/unwritten extent "
+ "[%d/%d/%llu/%x]\n", inode->i_ino,
es->es_lblk, es->es_lblk, es->es_len,
ext4_es_pblock(es), ext4_es_status(es));
}
@@ -524,7 +530,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
*/
pr_warn("ES insert assertion failed for inode: %lu "
"We can find blocks but we want to add a "
- "delayed/hole extent [%d/%d/%llu/%llx]\n",
+ "delayed/hole extent [%d/%d/%llu/%x]\n",
inode->i_ino, es->es_lblk, es->es_len,
ext4_es_pblock(es), ext4_es_status(es));
return;
@@ -554,7 +560,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
if (ext4_es_is_written(es)) {
pr_warn("ES insert assertion failed for inode: %lu "
"We can't find the block but we want to add "
- "an written extent [%d/%d/%llu/%llx]\n",
+ "a written extent [%d/%d/%llu/%x]\n",
inode->i_ino, es->es_lblk, es->es_len,
ext4_es_pblock(es), ext4_es_status(es));
return;
@@ -658,8 +664,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
newes.es_lblk = lblk;
newes.es_len = len;
- ext4_es_store_pblock(&newes, pblk);
- ext4_es_store_status(&newes, status);
+ ext4_es_store_pblock_status(&newes, pblk, status);
trace_ext4_es_insert_extent(inode, &newes);
ext4_es_insert_extent_check(inode, &newes);
@@ -699,8 +704,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
newes.es_lblk = lblk;
newes.es_len = len;
- ext4_es_store_pblock(&newes, pblk);
- ext4_es_store_status(&newes, status);
+ ext4_es_store_pblock_status(&newes, pblk, status);
trace_ext4_es_cache_extent(inode, &newes);
if (!len)
@@ -812,13 +816,13 @@ retry:
newes.es_lblk = end + 1;
newes.es_len = len2;
+ block = 0x7FDEADBEEFULL;
if (ext4_es_is_written(&orig_es) ||
- ext4_es_is_unwritten(&orig_es)) {
+ ext4_es_is_unwritten(&orig_es))
block = ext4_es_pblock(&orig_es) +
orig_es.es_len - len2;
- ext4_es_store_pblock(&newes, block);
- }
- ext4_es_store_status(&newes, ext4_es_status(&orig_es));
+ ext4_es_store_pblock_status(&newes, block,
+ ext4_es_status(&orig_es));
err = __es_insert_extent(inode, &newes);
if (err) {
es->es_lblk = orig_es.es_lblk;
@@ -962,10 +966,10 @@ retry:
continue;
}
- if (ei->i_es_lru_nr == 0 || ei == locked_ei)
+ if (ei->i_es_lru_nr == 0 || ei == locked_ei ||
+ !write_trylock(&ei->i_es_lock))
continue;
- write_lock(&ei->i_es_lock);
shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
if (ei->i_es_lru_nr == 0)
list_del_init(&ei->i_es_lru);
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 167f4ab8ecc..f1b62a41992 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -129,6 +129,15 @@ static inline void ext4_es_store_status(struct extent_status *es,
(es->es_pblk & ~ES_MASK));
}
+static inline void ext4_es_store_pblock_status(struct extent_status *es,
+ ext4_fsblk_t pb,
+ unsigned int status)
+{
+ es->es_pblk = (((ext4_fsblk_t)
+ (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) |
+ (pb & ~ES_MASK));
+}
+
extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_lru_add(struct inode *inode);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 3da21945ff1..8695f70af1e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -57,7 +57,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
return 0;
}
-void ext4_unwritten_wait(struct inode *inode)
+static void ext4_unwritten_wait(struct inode *inode)
{
wait_queue_head_t *wq = ext4_ioend_wq(inode);
@@ -74,132 +74,126 @@ void ext4_unwritten_wait(struct inode *inode)
* or one thread will zero the other's data, causing corruption.
*/
static int
-ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
{
struct super_block *sb = inode->i_sb;
int blockmask = sb->s_blocksize - 1;
- size_t count = iov_length(iov, nr_segs);
- loff_t final_size = pos + count;
- if (pos >= inode->i_size)
+ if (pos >= i_size_read(inode))
return 0;
- if ((pos & blockmask) || (final_size & blockmask))
+ if ((pos | iov_iter_alignment(from)) & blockmask)
return 1;
return 0;
}
static ssize_t
-ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct mutex *aio_mutex = NULL;
struct blk_plug plug;
- int unaligned_aio = 0;
- ssize_t ret;
+ int o_direct = file->f_flags & O_DIRECT;
int overwrite = 0;
- size_t length = iov_length(iov, nr_segs);
-
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
- !is_sync_kiocb(iocb))
- unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
+ size_t length = iov_iter_count(from);
+ ssize_t ret;
+ loff_t pos = iocb->ki_pos;
- /* Unaligned direct AIO must be serialized; see comment above */
- if (unaligned_aio) {
- mutex_lock(ext4_aio_mutex(inode));
+ /*
+ * Unaligned direct AIO must be serialized; see comment above
+ * In the case of O_APPEND, assume that we must always serialize
+ */
+ if (o_direct &&
+ ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+ !is_sync_kiocb(iocb) &&
+ (file->f_flags & O_APPEND ||
+ ext4_unaligned_aio(inode, from, pos))) {
+ aio_mutex = ext4_aio_mutex(inode);
+ mutex_lock(aio_mutex);
ext4_unwritten_wait(inode);
}
- BUG_ON(iocb->ki_pos != pos);
-
mutex_lock(&inode->i_mutex);
- blk_start_plug(&plug);
-
- iocb->private = &overwrite;
+ if (file->f_flags & O_APPEND)
+ iocb->ki_pos = pos = i_size_read(inode);
- /* check whether we do a DIO overwrite or not */
- if (ext4_should_dioread_nolock(inode) && !unaligned_aio &&
- !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
- struct ext4_map_blocks map;
- unsigned int blkbits = inode->i_blkbits;
- int err, len;
+ /*
+ * If we have encountered a bitmap-format file, the size limit
+ * is smaller than s_maxbytes, which is for extent-mapped files.
+ */
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- map.m_lblk = pos >> blkbits;
- map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
- - map.m_lblk;
- len = map.m_len;
+ if ((pos > sbi->s_bitmap_maxbytes) ||
+ (pos == sbi->s_bitmap_maxbytes && length > 0)) {
+ mutex_unlock(&inode->i_mutex);
+ ret = -EFBIG;
+ goto errout;
+ }
- err = ext4_map_blocks(NULL, inode, &map, 0);
- /*
- * 'err==len' means that all of blocks has been preallocated no
- * matter they are initialized or not. For excluding
- * uninitialized extents, we need to check m_flags. There are
- * two conditions that indicate for initialized extents.
- * 1) If we hit extent cache, EXT4_MAP_MAPPED flag is returned;
- * 2) If we do a real lookup, non-flags are returned.
- * So we should check these two conditions.
- */
- if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
- overwrite = 1;
+ if (pos + length > sbi->s_bitmap_maxbytes)
+ iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos);
}
- ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
- mutex_unlock(&inode->i_mutex);
-
- if (ret > 0) {
- ssize_t err;
-
- err = generic_write_sync(file, pos, ret);
- if (err < 0 && ret > 0)
- ret = err;
- }
- blk_finish_plug(&plug);
+ if (o_direct) {
+ blk_start_plug(&plug);
- if (unaligned_aio)
- mutex_unlock(ext4_aio_mutex(inode));
+ iocb->private = &overwrite;
- return ret;
-}
+ /* check whether we do a DIO overwrite or not */
+ if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
+ !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
+ struct ext4_map_blocks map;
+ unsigned int blkbits = inode->i_blkbits;
+ int err, len;
-static ssize_t
-ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
-{
- struct inode *inode = file_inode(iocb->ki_filp);
- ssize_t ret;
+ map.m_lblk = pos >> blkbits;
+ map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
+ - map.m_lblk;
+ len = map.m_len;
- /*
- * If we have encountered a bitmap-format file, the size limit
- * is smaller than s_maxbytes, which is for extent-mapped files.
- */
+ err = ext4_map_blocks(NULL, inode, &map, 0);
+ /*
+ * 'err==len' means that all of blocks has
+ * been preallocated no matter they are
+ * initialized or not. For excluding
+ * unwritten extents, we need to check
+ * m_flags. There are two conditions that
+ * indicate for initialized extents. 1) If we
+ * hit extent cache, EXT4_MAP_MAPPED flag is
+ * returned; 2) If we do a real lookup,
+ * non-flags are returned. So we should check
+ * these two conditions.
+ */
+ if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
+ overwrite = 1;
+ }
+ }
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
- size_t length = iov_length(iov, nr_segs);
+ ret = __generic_file_write_iter(iocb, from);
+ mutex_unlock(&inode->i_mutex);
- if ((pos > sbi->s_bitmap_maxbytes ||
- (pos == sbi->s_bitmap_maxbytes && length > 0)))
- return -EFBIG;
+ if (ret > 0) {
+ ssize_t err;
- if (pos + length > sbi->s_bitmap_maxbytes) {
- nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
- sbi->s_bitmap_maxbytes - pos);
- }
+ err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+ if (err < 0)
+ ret = err;
}
+ if (o_direct)
+ blk_finish_plug(&plug);
- if (unlikely(iocb->ki_filp->f_flags & O_DIRECT))
- ret = ext4_file_dio_write(iocb, iov, nr_segs, pos);
- else
- ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
-
+errout:
+ if (aio_mutex)
+ mutex_unlock(aio_mutex);
return ret;
}
static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
@@ -243,6 +237,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
if (err) {
ext4_journal_stop(handle);
@@ -592,10 +587,10 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = ext4_file_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = ext4_file_write_iter,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
@@ -605,7 +600,7 @@ const struct file_operations ext4_file_operations = {
.release = ext4_release_file,
.fsync = ext4_sync_file,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
};
@@ -617,6 +612,7 @@ const struct inode_operations ext4_file_inode_operations = {
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
.get_acl = ext4_get_acl,
+ .set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
};
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 137193ff389..5b87fc36aab 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -71,6 +71,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
struct ext4_group_desc *gdp)
{
struct ext4_group_info *grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
J_ASSERT_BH(bh, buffer_locked(bh));
/* If checksum is bad mark all blocks and inodes use to prevent
@@ -78,7 +79,16 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
ext4_error(sb, "Checksum bad for group %u", block_group);
grp = ext4_get_group_info(sb, block_group);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
+ if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ int count;
+ count = ext4_free_inodes_count(sb, gdp);
+ percpu_counter_sub(&sbi->s_freeinodes_counter,
+ count);
+ }
set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
return 0;
}
@@ -116,6 +126,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
struct buffer_head *bh = NULL;
ext4_fsblk_t bitmap_blk;
struct ext4_group_info *grp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
@@ -185,6 +196,12 @@ verify:
ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
"inode_bitmap = %llu", block_group, bitmap_blk);
grp = ext4_get_group_info(sb, block_group);
+ if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ int count;
+ count = ext4_free_inodes_count(sb, desc);
+ percpu_counter_sub(&sbi->s_freeinodes_counter,
+ count);
+ }
set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
return NULL;
}
@@ -321,6 +338,12 @@ out:
fatal = err;
} else {
ext4_error(sb, "bit already cleared for inode %lu", ino);
+ if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ int count;
+ count = ext4_free_inodes_count(sb, gdp);
+ percpu_counter_sub(&sbi->s_freeinodes_counter,
+ count);
+ }
set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
}
@@ -432,7 +455,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
ext4fs_dirhash(qstr->name, qstr->len, &hinfo);
grp = hinfo.hash;
} else
- get_random_bytes(&grp, sizeof(grp));
+ grp = prandom_u32();
parent_group = (unsigned)grp % ngroups;
for (i = 0; i < ngroups; i++) {
g = (parent_group + i) % ngroups;
@@ -851,6 +874,13 @@ got:
goto out;
}
+ BUFFER_TRACE(group_desc_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, group_desc_bh);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
+
/* We may have to initialize the block bitmap if it isn't already */
if (ext4_has_group_desc_csum(sb) &&
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
@@ -887,13 +917,6 @@ got:
}
}
- BUFFER_TRACE(group_desc_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, group_desc_bh);
- if (err) {
- ext4_std_error(sb, err);
- goto out;
- }
-
/* Update the relevant bg descriptor fields */
if (ext4_has_group_desc_csum(sb)) {
int free;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 594009f5f52..fd69da19482 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -389,7 +389,13 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
return 0;
failed:
for (; i >= 0; i--) {
- if (i != indirect_blks && branch[i].bh)
+ /*
+ * We want to ext4_forget() only freshly allocated indirect
+ * blocks. Buffer for new_blocks[i-1] is at branch[i].bh and
+ * buffer at branch[0].bh is indirect block / inode already
+ * existing before ext4_alloc_branch() was called.
+ */
+ if (i > 0 && i != indirect_blks && branch[i].bh)
ext4_forget(handle, 1, inode, branch[i].bh,
branch[i].bh->b_blocknr);
ext4_free_blocks(handle, inode, NULL, new_blocks[i],
@@ -639,8 +645,7 @@ out:
* VFS code falls back into buffered path in that case so we are safe.
*/
ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
@@ -648,7 +653,7 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
handle_t *handle;
ssize_t ret;
int orphan = 0;
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(iter);
int retries = 0;
if (rw == WRITE) {
@@ -687,18 +692,17 @@ retry:
goto locked;
}
ret = __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iov,
- offset, nr_segs,
+ inode->i_sb->s_bdev, iter, offset,
ext4_get_block, NULL, NULL, 0);
inode_dio_done(inode);
} else {
locked:
- ret = blockdev_direct_IO(rw, iocb, inode, iov,
- offset, nr_segs, ext4_get_block);
+ ret = blockdev_direct_IO(rw, iocb, inode, iter,
+ offset, ext4_get_block);
if (unlikely((rw & WRITE) && ret < 0)) {
loff_t isize = i_size_read(inode);
- loff_t end = offset + iov_length(iov, nr_segs);
+ loff_t end = offset + count;
if (end > isize)
ext4_truncate_failed_write(inode);
@@ -1312,16 +1316,24 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode,
blk = *i_data;
if (level > 0) {
ext4_lblk_t first2;
+ ext4_lblk_t count2;
+
bh = sb_bread(inode->i_sb, le32_to_cpu(blk));
if (!bh) {
EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk),
"Read failure");
return -EIO;
}
- first2 = (first > offset) ? first - offset : 0;
+ if (first > offset) {
+ first2 = first - offset;
+ count2 = count;
+ } else {
+ first2 = 0;
+ count2 = count - (offset - first);
+ }
ret = free_hole_blocks(handle, inode, bh,
(__le32 *)bh->b_data, level - 1,
- first2, count - offset,
+ first2, count2,
inode->i_sb->s_blocksize >> 2);
if (ret) {
brelse(bh);
@@ -1331,8 +1343,8 @@ static int free_hole_blocks(handle_t *handle, struct inode *inode,
if (level == 0 ||
(bh && all_zeroes((__le32 *)bh->b_data,
(__le32 *)bh->b_data + addr_per_block))) {
- ext4_free_data(handle, inode, parent_bh, &blk, &blk+1);
- *i_data = 0;
+ ext4_free_data(handle, inode, parent_bh,
+ i_data, i_data + 1);
}
brelse(bh);
bh = NULL;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index d9ecbf1113a..645205d8ada 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -22,7 +22,7 @@
#define EXT4_INLINE_DOTDOT_OFFSET 2
#define EXT4_INLINE_DOTDOT_SIZE 4
-int ext4_get_inline_size(struct inode *inode)
+static int ext4_get_inline_size(struct inode *inode)
{
if (EXT4_I(inode)->i_inline_off)
return EXT4_I(inode)->i_inline_size;
@@ -211,8 +211,8 @@ out:
* value since it is already handled by ext4_xattr_ibody_inline_set.
* That saves us one memcpy.
*/
-void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
- void *buffer, loff_t pos, unsigned int len)
+static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
+ void *buffer, loff_t pos, unsigned int len)
{
struct ext4_xattr_entry *entry;
struct ext4_xattr_ibody_header *header;
@@ -264,6 +264,7 @@ static int ext4_create_inline_data(handle_t *handle,
if (error)
return error;
+ BUFFER_TRACE(is.iloc.bh, "get_write_access");
error = ext4_journal_get_write_access(handle, is.iloc.bh);
if (error)
goto out;
@@ -347,6 +348,7 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
if (error == -ENODATA)
goto out;
+ BUFFER_TRACE(is.iloc.bh, "get_write_access");
error = ext4_journal_get_write_access(handle, is.iloc.bh);
if (error)
goto out;
@@ -373,8 +375,8 @@ out:
return error;
}
-int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
- unsigned int len)
+static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
+ unsigned int len)
{
int ret, size;
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -424,6 +426,7 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle,
if (error)
goto out;
+ BUFFER_TRACE(is.iloc.bh, "get_write_access");
error = ext4_journal_get_write_access(handle, is.iloc.bh);
if (error)
goto out;
@@ -849,15 +852,16 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
handle_t *handle;
struct page *page;
struct ext4_iloc iloc;
+ int retries;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
return ret;
+retry_journal:
handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- handle = NULL;
goto out;
}
@@ -867,7 +871,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
if (inline_size >= pos + len) {
ret = ext4_prepare_inline_data(handle, inode, pos + len);
if (ret && ret != -ENOSPC)
- goto out;
+ goto out_journal;
}
if (ret == -ENOSPC) {
@@ -875,6 +879,10 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
inode,
flags,
fsdata);
+ ext4_journal_stop(handle);
+ if (ret == -ENOSPC &&
+ ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry_journal;
goto out;
}
@@ -887,7 +895,7 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
page = grab_cache_page_write_begin(mapping, 0, flags);
if (!page) {
ret = -ENOMEM;
- goto out;
+ goto out_journal;
}
down_read(&EXT4_I(inode)->xattr_sem);
@@ -904,16 +912,15 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
up_read(&EXT4_I(inode)->xattr_sem);
*pagep = page;
- handle = NULL;
brelse(iloc.bh);
return 1;
out_release_page:
up_read(&EXT4_I(inode)->xattr_sem);
unlock_page(page);
page_cache_release(page);
+out_journal:
+ ext4_journal_stop(handle);
out:
- if (handle)
- ext4_journal_stop(handle);
brelse(iloc.bh);
return ret;
}
@@ -994,17 +1001,16 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
struct inode *dir = dentry->d_parent->d_inode;
const char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
- unsigned short reclen;
int err;
struct ext4_dir_entry_2 *de;
- reclen = EXT4_DIR_REC_LEN(namelen);
err = ext4_find_dest_de(dir, inode, iloc->bh,
inline_start, inline_size,
name, namelen, &de);
if (err)
return err;
+ BUFFER_TRACE(iloc->bh, "get_write_access");
err = ext4_journal_get_write_access(handle, iloc->bh);
if (err)
return err;
@@ -1442,6 +1448,7 @@ int ext4_read_inline_dir(struct file *file,
if (ret < 0)
goto out;
+ ret = 0;
sb = inode->i_sb;
parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
offset = ctx->pos;
@@ -1666,6 +1673,7 @@ int ext4_delete_inline_entry(handle_t *handle,
EXT4_MIN_INLINE_DATA_SIZE;
}
+ BUFFER_TRACE(bh, "get_write_access");
err = ext4_journal_get_write_access(handle, bh);
if (err)
goto out;
@@ -1838,7 +1846,6 @@ int ext4_try_to_evict_inline_data(handle_t *handle,
{
int error;
struct ext4_xattr_entry *entry;
- struct ext4_xattr_ibody_header *header;
struct ext4_inode *raw_inode;
struct ext4_iloc iloc;
@@ -1847,7 +1854,6 @@ int ext4_try_to_evict_inline_data(handle_t *handle,
return error;
raw_inode = ext4_raw_inode(&iloc);
- header = IHDR(inode, raw_inode);
entry = (struct ext4_xattr_entry *)((void *)raw_inode +
EXT4_I(inode)->i_inline_off);
if (EXT4_XATTR_LEN(entry->e_name_len) +
@@ -1925,9 +1931,11 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
}
/* Clear the content within i_blocks. */
- if (i_size < EXT4_MIN_INLINE_DATA_SIZE)
- memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0,
- EXT4_MIN_INLINE_DATA_SIZE - i_size);
+ if (i_size < EXT4_MIN_INLINE_DATA_SIZE) {
+ void *p = (void *) ext4_raw_inode(&is.iloc)->i_block;
+ memset(p + i_size, 0,
+ EXT4_MIN_INLINE_DATA_SIZE - i_size);
+ }
EXT4_I(inode)->i_inline_size = i_size <
EXT4_MIN_INLINE_DATA_SIZE ?
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0d424d7ac02..8a064734e6e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -38,6 +38,7 @@
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/aio.h>
+#include <linux/bitops.h>
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -144,8 +145,11 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
*/
static int ext4_inode_is_fast_symlink(struct inode *inode)
{
- int ea_blocks = EXT4_I(inode)->i_file_acl ?
- (inode->i_sb->s_blocksize >> 9) : 0;
+ int ea_blocks = EXT4_I(inode)->i_file_acl ?
+ EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0;
+
+ if (ext4_has_inline_data(inode))
+ return 0;
return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
}
@@ -214,7 +218,7 @@ void ext4_evict_inode(struct inode *inode)
jbd2_complete_transaction(journal, commit_tid);
filemap_write_and_wait(&inode->i_data);
}
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
goto no_delete;
@@ -225,7 +229,7 @@ void ext4_evict_inode(struct inode *inode)
if (ext4_should_order_data(inode))
ext4_begin_ordered_truncate(inode, 0);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
if (is_bad_inode(inode))
@@ -442,7 +446,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
* could be converted.
*/
if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
- down_read((&EXT4_I(inode)->i_data_sem));
+ down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
retval = ext4_ext_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -488,8 +492,8 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
* Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
* based files
*
- * On success, it returns the number of blocks being mapped or allocate.
- * if create==0 and the blocks are pre-allocated and uninitialized block,
+ * On success, it returns the number of blocks being mapped or allocated.
+ * if create==0 and the blocks are pre-allocated and unwritten block,
* the result buffer head is unmapped. If the create ==1, it will make sure
* the buffer head is mapped.
*
@@ -503,6 +507,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
{
struct extent_status es;
int retval;
+ int ret = 0;
#ifdef ES_AGGRESSIVE_TEST
struct ext4_map_blocks orig_map;
@@ -514,6 +519,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
"logical block %lu\n", inode->i_ino, flags, map->m_len,
(unsigned long) map->m_lblk);
+ /*
+ * ext4_map_blocks returns an int, and m_len is an unsigned int
+ */
+ if (unlikely(map->m_len > INT_MAX))
+ map->m_len = INT_MAX;
+
+ /* We can handle the block number less than EXT_MAX_BLOCKS */
+ if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
+ return -EIO;
+
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
ext4_es_lru_add(inode);
@@ -543,7 +558,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* file system block.
*/
if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
- down_read((&EXT4_I(inode)->i_data_sem));
+ down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
retval = ext4_ext_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -552,7 +567,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
EXT4_GET_BLOCKS_KEEP_SIZE);
}
if (retval > 0) {
- int ret;
unsigned int status;
if (unlikely(retval != map->m_len)) {
@@ -579,7 +593,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
found:
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
- int ret = check_block_validity(inode, map);
+ ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
}
@@ -596,7 +610,13 @@ found:
* with buffer head unmapped.
*/
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
- return retval;
+ /*
+ * If we need to convert extent to unwritten
+ * we continue and do the actual work in
+ * ext4_ext_map_blocks()
+ */
+ if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
+ return retval;
/*
* Here we clear m_flags because after allocating an new extent,
@@ -605,12 +625,12 @@ found:
map->m_flags &= ~EXT4_MAP_FLAGS;
/*
- * New blocks allocate and/or writing to uninitialized extent
+ * New blocks allocate and/or writing to unwritten extent
* will possibly result in updating i_data, so we take
* the write lock of i_data_sem, and call get_blocks()
* with create == 1 flag.
*/
- down_write((&EXT4_I(inode)->i_data_sem));
+ down_write(&EXT4_I(inode)->i_data_sem);
/*
* if the caller is from delayed allocation writeout path
@@ -652,7 +672,6 @@ found:
ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
if (retval > 0) {
- int ret;
unsigned int status;
if (unlikely(retval != map->m_len)) {
@@ -687,7 +706,7 @@ found:
has_zeroout:
up_write((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
- int ret = check_block_validity(inode, map);
+ ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
}
@@ -906,6 +925,7 @@ int do_journal_get_write_access(handle_t *handle,
*/
if (dirty)
clear_buffer_dirty(bh);
+ BUFFER_TRACE(bh, "get write access");
ret = ext4_journal_get_write_access(handle, bh);
if (!ret && dirty)
ret = ext4_handle_dirty_metadata(handle, NULL, bh);
@@ -1206,7 +1226,6 @@ static int ext4_journalled_write_end(struct file *file,
*/
static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
{
- int retries = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned int md_needed;
@@ -1218,7 +1237,6 @@ static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
* in order to allocate nrblocks
* worse case is one extent per block
*/
-repeat:
spin_lock(&ei->i_block_reservation_lock);
/*
* ext4_calc_metadata_amount() has side effects, which we have
@@ -1238,10 +1256,6 @@ repeat:
ei->i_da_metadata_calc_len = save_len;
ei->i_da_metadata_calc_last_lblock = save_last_lblock;
spin_unlock(&ei->i_block_reservation_lock);
- if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
- cond_resched();
- goto repeat;
- }
return -ENOSPC;
}
ei->i_reserved_meta_blocks += md_needed;
@@ -1255,7 +1269,6 @@ repeat:
*/
static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
{
- int retries = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned int md_needed;
@@ -1277,7 +1290,6 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
* in order to allocate nrblocks
* worse case is one extent per block
*/
-repeat:
spin_lock(&ei->i_block_reservation_lock);
/*
* ext4_calc_metadata_amount() has side effects, which we have
@@ -1297,10 +1309,6 @@ repeat:
ei->i_da_metadata_calc_len = save_len;
ei->i_da_metadata_calc_last_lblock = save_last_lblock;
spin_unlock(&ei->i_block_reservation_lock);
- if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
- cond_resched();
- goto repeat;
- }
dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
return -ENOSPC;
}
@@ -1536,7 +1544,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
ext4_es_lru_add(inode);
if (ext4_es_is_hole(&es)) {
retval = 0;
- down_read((&EXT4_I(inode)->i_data_sem));
+ down_read(&EXT4_I(inode)->i_data_sem);
goto add_delayed;
}
@@ -1573,7 +1581,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
* Try to see if we can get the block without requesting a new
* file system block.
*/
- down_read((&EXT4_I(inode)->i_data_sem));
+ down_read(&EXT4_I(inode)->i_data_sem);
if (ext4_has_inline_data(inode)) {
/*
* We will soon create blocks for this page, and let
@@ -1765,6 +1773,7 @@ static int __ext4_journalled_writepage(struct page *page,
BUG_ON(!ext4_handle_valid(handle));
if (inline_data) {
+ BUFFER_TRACE(inode_bh, "get write access");
ret = ext4_journal_get_write_access(handle, inode_bh);
err = ext4_handle_dirty_metadata(handle, inode, inode_bh);
@@ -1784,7 +1793,7 @@ static int __ext4_journalled_writepage(struct page *page,
ret = err;
if (!ext4_has_inline_data(inode))
- ext4_walk_page_buffers(handle, page_bufs, 0, len,
+ ext4_walk_page_buffers(NULL, page_bufs, 0, len,
NULL, bput_one);
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
out:
@@ -1842,6 +1851,7 @@ static int ext4_writepage(struct page *page,
struct buffer_head *page_bufs = NULL;
struct inode *inode = page->mapping->host;
struct ext4_io_submit io_submit;
+ bool keep_towrite = false;
trace_ext4_writepage(page);
size = i_size_read(inode);
@@ -1872,6 +1882,7 @@ static int ext4_writepage(struct page *page,
unlock_page(page);
return 0;
}
+ keep_towrite = true;
}
if (PageChecked(page) && ext4_should_journal_data(inode))
@@ -1888,7 +1899,7 @@ static int ext4_writepage(struct page *page,
unlock_page(page);
return -ENOMEM;
}
- ret = ext4_bio_write_page(&io_submit, page, len, wbc);
+ ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite);
ext4_io_submit(&io_submit);
/* Drop io_end reference we got from init */
ext4_put_io_end_defer(io_submit.io_end);
@@ -1907,7 +1918,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
else
len = PAGE_CACHE_SIZE;
clear_page_dirty_for_io(page);
- err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
+ err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);
if (!err)
mpd->wbc->nr_to_write--;
mpd->first_page++;
@@ -2028,7 +2039,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd,
* Scan buffers corresponding to changed extent (we expect corresponding pages
* to be already locked) and update buffer state according to new extent state.
* We map delalloc buffers to their physical location, clear unwritten bits,
- * and mark buffers as uninit when we perform writes to uninitialized extents
+ * and mark buffers as uninit when we perform writes to unwritten extents
* and do extent conversion after IO is finished. If the last page is not fully
* mapped, we update @map to the next extent in the last page that needs
* mapping. Otherwise we submit the page for IO.
@@ -2122,12 +2133,12 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
struct inode *inode = mpd->inode;
struct ext4_map_blocks *map = &mpd->map;
int get_blocks_flags;
- int err;
+ int err, dioread_nolock;
trace_ext4_da_write_pages_extent(inode, map);
/*
* Call ext4_map_blocks() to allocate any delayed allocation blocks, or
- * to convert an uninitialized extent to be initialized (in the case
+ * to convert an unwritten extent to be initialized (in the case
* where we have written into one or more preallocated blocks). It is
* possible that we're going to need more metadata blocks than
* previously reserved. However we must not fail because we're in
@@ -2144,7 +2155,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
*/
get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
EXT4_GET_BLOCKS_METADATA_NOFAIL;
- if (ext4_should_dioread_nolock(inode))
+ dioread_nolock = ext4_should_dioread_nolock(inode);
+ if (dioread_nolock)
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
if (map->m_flags & (1 << BH_Delay))
get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
@@ -2152,7 +2164,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
if (err < 0)
return err;
- if (map->m_flags & EXT4_MAP_UNINIT) {
+ if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
if (!mpd->io_submit.io_end->handle &&
ext4_handle_valid(handle)) {
mpd->io_submit.io_end->handle = handle->h_rsv_handle;
@@ -2178,6 +2190,9 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
*
* @handle - handle for journal operations
* @mpd - extent to map
+ * @give_up_on_write - we set this to true iff there is a fatal error and there
+ * is no hope of writing the data. The caller should discard
+ * dirty pages to avoid infinite loops.
*
* The function maps extent starting at mpd->lblk of length mpd->len. If it is
* delayed, blocks are allocated, if it is unwritten, we may need to convert
@@ -2240,13 +2255,23 @@ static int mpage_map_and_submit_extent(handle_t *handle,
return err;
} while (map->m_len);
- /* Update on-disk size after IO is submitted */
+ /*
+ * Update on-disk size after IO is submitted. Races with
+ * truncate are avoided by checking i_size under i_data_sem.
+ */
disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
if (disksize > EXT4_I(inode)->i_disksize) {
int err2;
-
- ext4_wb_update_i_disksize(inode, disksize);
+ loff_t i_size;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ i_size = i_size_read(inode);
+ if (disksize > i_size)
+ disksize = i_size;
+ if (disksize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = disksize;
err2 = ext4_mark_inode_dirty(handle, inode);
+ up_write(&EXT4_I(inode)->i_data_sem);
if (err2)
ext4_error(inode->i_sb,
"Failed to mark inode %lu dirty",
@@ -2295,6 +2320,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
struct address_space *mapping = mpd->inode->i_mapping;
struct pagevec pvec;
unsigned int nr_pages;
+ long left = mpd->wbc->nr_to_write;
pgoff_t index = mpd->first_page;
pgoff_t end = mpd->last_page;
int tag;
@@ -2330,6 +2356,17 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
if (page->index > end)
goto out;
+ /*
+ * Accumulated enough dirty pages? This doesn't apply
+ * to WB_SYNC_ALL mode. For integrity sync we have to
+ * keep going because someone may be concurrently
+ * dirtying pages, and we might have synced a lot of
+ * newly appeared dirty pages, but have not synced all
+ * of the old dirty pages.
+ */
+ if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0)
+ goto out;
+
/* If we can't merge this page, we are done. */
if (mpd->map.m_len > 0 && mpd->next_page != page->index)
goto out;
@@ -2364,19 +2401,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
if (err <= 0)
goto out;
err = 0;
-
- /*
- * Accumulated enough dirty pages? This doesn't apply
- * to WB_SYNC_ALL mode. For integrity sync we have to
- * keep going because someone may be concurrently
- * dirtying pages, and we might have synced a lot of
- * newly appeared dirty pages, but have not synced all
- * of the old dirty pages.
- */
- if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
- mpd->next_page - mpd->first_page >=
- mpd->wbc->nr_to_write)
- goto out;
+ left--;
}
pagevec_release(&pvec);
cond_resched();
@@ -2420,16 +2445,15 @@ static int ext4_writepages(struct address_space *mapping,
* because that could violate lock ordering on umount
*/
if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
- return 0;
+ goto out_writepages;
if (ext4_should_journal_data(inode)) {
struct blk_plug plug;
- int ret;
blk_start_plug(&plug);
ret = write_cache_pages(mapping, wbc, __writepage, mapping);
blk_finish_plug(&plug);
- return ret;
+ goto out_writepages;
}
/*
@@ -2442,8 +2466,10 @@ static int ext4_writepages(struct address_space *mapping,
* *never* be called, so if that ever happens, we would want
* the stack trace.
*/
- if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
- return -EROFS;
+ if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
+ ret = -EROFS;
+ goto out_writepages;
+ }
if (ext4_should_dioread_nolock(inode)) {
/*
@@ -2563,7 +2589,7 @@ retry:
break;
}
blk_finish_plug(&plug);
- if (!ret && !cycled) {
+ if (!ret && !cycled && wbc->nr_to_write > 0) {
cycled = 1;
mpd.last_page = writeback_index - 1;
mpd.first_page = 0;
@@ -3052,9 +3078,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
* preallocated extents, and those write extend the file, no need to
* fall back to buffered IO.
*
- * For holes, we fallocate those blocks, mark them as uninitialized
+ * For holes, we fallocate those blocks, mark them as unwritten
* If those blocks were preallocated, we mark sure they are split, but
- * still keep the range to write as uninitialized.
+ * still keep the range to write as unwritten.
*
* The unwritten extents will be converted to written when DIO is completed.
* For async direct IO, since the IO may still pending when return, we
@@ -3067,13 +3093,12 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
*
*/
static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
ssize_t ret;
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(iter);
int overwrite = 0;
get_block_t *get_block_func = NULL;
int dio_flags = 0;
@@ -3082,7 +3107,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
/* Use the old path for reads and writes beyond i_size. */
if (rw != WRITE || final_size > inode->i_size)
- return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+ return ext4_ind_direct_IO(rw, iocb, iter, offset);
BUG_ON(iocb->private == NULL);
@@ -3106,12 +3131,12 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
* We could direct write to holes and fallocate.
*
* Allocated blocks to fill the hole are marked as
- * uninitialized to prevent parallel buffered read to expose
+ * unwritten to prevent parallel buffered read to expose
* the stale data before DIO complete the data IO.
*
* As to previously fallocated extents, ext4 get_block will
* just simply mark the buffer mapped but still keep the
- * extents uninitialized.
+ * extents unwritten.
*
* For non AIO case, we will convert those unwritten extents
* to written after return back from blockdev_direct_IO.
@@ -3149,8 +3174,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
dio_flags = DIO_LOCKING;
}
ret = __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iov,
- offset, nr_segs,
+ inode->i_sb->s_bdev, iter,
+ offset,
get_block_func,
ext4_end_io_dio,
NULL,
@@ -3204,11 +3229,11 @@ retake_lock:
}
static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
+ size_t count = iov_iter_count(iter);
ssize_t ret;
/*
@@ -3221,13 +3246,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
if (ext4_has_inline_data(inode))
return 0;
- trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
+ trace_ext4_direct_IO_enter(inode, offset, count, rw);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+ ret = ext4_ext_direct_IO(rw, iocb, iter, offset);
else
- ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
- trace_ext4_direct_IO_exit(inode, offset,
- iov_length(iov, nr_segs), rw, ret);
+ ret = ext4_ind_direct_IO(rw, iocb, iter, offset);
+ trace_ext4_direct_IO_exit(inode, offset, count, rw, ret);
return ret;
}
@@ -3320,33 +3344,13 @@ void ext4_set_aops(struct inode *inode)
}
/*
- * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
- * up to the end of the block which corresponds to `from'.
- * This required during truncate. We need to physically zero the tail end
- * of that block so it doesn't yield old data if the file is later grown.
- */
-int ext4_block_truncate_page(handle_t *handle,
- struct address_space *mapping, loff_t from)
-{
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
- unsigned length;
- unsigned blocksize;
- struct inode *inode = mapping->host;
-
- blocksize = inode->i_sb->s_blocksize;
- length = blocksize - (offset & (blocksize - 1));
-
- return ext4_block_zero_page_range(handle, mapping, from, length);
-}
-
-/*
* ext4_block_zero_page_range() zeros out a mapping of length 'length'
* starting from file offset 'from'. The range to be zero'd must
* be contained with in one block. If the specified range exceeds
* the end of the block it will be shortened to end of the block
* that cooresponds to 'from'
*/
-int ext4_block_zero_page_range(handle_t *handle,
+static int ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -3436,6 +3440,26 @@ unlock:
return err;
}
+/*
+ * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ */
+static int ext4_block_truncate_page(handle_t *handle,
+ struct address_space *mapping, loff_t from)
+{
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned length;
+ unsigned blocksize;
+ struct inode *inode = mapping->host;
+
+ blocksize = inode->i_sb->s_blocksize;
+ length = blocksize - (offset & (blocksize - 1));
+
+ return ext4_block_zero_page_range(handle, mapping, from, length);
+}
+
int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t length)
{
@@ -3509,12 +3533,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
if (!S_ISREG(inode->i_mode))
return -EOPNOTSUPP;
- if (EXT4_SB(sb)->s_cluster_ratio > 1) {
- /* TODO: Add support for bigalloc file systems */
- return -EOPNOTSUPP;
- }
-
- trace_ext4_punch_hole(inode, offset, length);
+ trace_ext4_punch_hole(inode, offset, length, 0);
/*
* Write out all dirty pages to avoid race conditions
@@ -3528,15 +3547,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
}
mutex_lock(&inode->i_mutex);
- /* It's not possible punch hole on append only file */
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
- ret = -EPERM;
- goto out_mutex;
- }
- if (IS_SWAPFILE(inode)) {
- ret = -ETXTBSY;
- goto out_mutex;
- }
/* No need to punch hole beyond i_size */
if (offset >= inode->i_size)
@@ -3617,10 +3627,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
ret = ext4_free_hole_blocks(handle, inode, first_block,
stop_block);
- ext4_discard_preallocations(inode);
up_write(&EXT4_I(inode)->i_data_sem);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
+
+ /* Now release the pages again to reduce race window */
+ if (last_block_offset > first_block_offset)
+ truncate_pagecache_range(inode, first_block_offset,
+ last_block_offset);
+
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
out_stop:
@@ -3694,7 +3709,7 @@ void ext4_truncate(struct inode *inode)
/*
* There is a possibility that we're either freeing the inode
- * or it completely new indode. In those cases we might not
+ * or it's a completely new inode. In those cases we might not
* have i_mutex locked because it's not necessary.
*/
if (!(inode->i_state & (I_NEW|I_FREEING)))
@@ -3934,18 +3949,20 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
void ext4_set_inode_flags(struct inode *inode)
{
unsigned int flags = EXT4_I(inode)->i_flags;
+ unsigned int new_fl = 0;
- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
if (flags & EXT4_SYNC_FL)
- inode->i_flags |= S_SYNC;
+ new_fl |= S_SYNC;
if (flags & EXT4_APPEND_FL)
- inode->i_flags |= S_APPEND;
+ new_fl |= S_APPEND;
if (flags & EXT4_IMMUTABLE_FL)
- inode->i_flags |= S_IMMUTABLE;
+ new_fl |= S_IMMUTABLE;
if (flags & EXT4_NOATIME_FL)
- inode->i_flags |= S_NOATIME;
+ new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
- inode->i_flags |= S_DIRSYNC;
+ new_fl |= S_DIRSYNC;
+ inode_set_flags(inode, new_fl,
+ S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
}
/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
@@ -4164,11 +4181,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode);
EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);
- inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
- if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
- inode->i_version |=
- (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+ inode->i_version = le32_to_cpu(raw_inode->i_disk_version);
+ if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+ if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+ inode->i_version |=
+ (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
+ }
}
ret = 0;
@@ -4291,12 +4310,15 @@ static int ext4_do_update_inode(handle_t *handle,
struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
struct ext4_inode_info *ei = EXT4_I(inode);
struct buffer_head *bh = iloc->bh;
+ struct super_block *sb = inode->i_sb;
int err = 0, rc, block;
- int need_datasync = 0;
+ int need_datasync = 0, set_large_file = 0;
uid_t i_uid;
gid_t i_gid;
- /* For fields not not tracking in the in-memory inode,
+ spin_lock(&ei->i_raw_lock);
+
+ /* For fields not tracked in the in-memory inode,
* initialise them to zero for new inodes. */
if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
@@ -4334,12 +4356,13 @@ static int ext4_do_update_inode(handle_t *handle,
EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode);
EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);
- if (ext4_inode_blocks_set(handle, raw_inode, ei))
+ if (ext4_inode_blocks_set(handle, raw_inode, ei)) {
+ spin_unlock(&ei->i_raw_lock);
goto out_brelse;
+ }
raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
- if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
- cpu_to_le32(EXT4_OS_HURD))
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
raw_inode->i_file_acl_high =
cpu_to_le16(ei->i_file_acl >> 32);
raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
@@ -4348,24 +4371,11 @@ static int ext4_do_update_inode(handle_t *handle,
need_datasync = 1;
}
if (ei->i_disksize > 0x7fffffffULL) {
- struct super_block *sb = inode->i_sb;
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
EXT4_SB(sb)->s_es->s_rev_level ==
- cpu_to_le32(EXT4_GOOD_OLD_REV)) {
- /* If this is the first large file
- * created, add a flag to the superblock.
- */
- err = ext4_journal_get_write_access(handle,
- EXT4_SB(sb)->s_sbh);
- if (err)
- goto out_brelse;
- ext4_update_dynamic_rev(sb);
- EXT4_SET_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
- ext4_handle_sync(handle);
- err = ext4_handle_dirty_super(handle, sb);
- }
+ cpu_to_le32(EXT4_GOOD_OLD_REV))
+ set_large_file = 1;
}
raw_inode->i_generation = cpu_to_le32(inode->i_generation);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
@@ -4384,22 +4394,37 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_block[block] = ei->i_data[block];
}
- raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
- if (ei->i_extra_isize) {
- if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
- raw_inode->i_version_hi =
- cpu_to_le32(inode->i_version >> 32);
- raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+ if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
+ raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
+ if (ei->i_extra_isize) {
+ if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
+ raw_inode->i_version_hi =
+ cpu_to_le32(inode->i_version >> 32);
+ raw_inode->i_extra_isize =
+ cpu_to_le16(ei->i_extra_isize);
+ }
}
ext4_inode_csum_set(inode, raw_inode, ei);
+ spin_unlock(&ei->i_raw_lock);
+
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
rc = ext4_handle_dirty_metadata(handle, NULL, bh);
if (!err)
err = rc;
ext4_clear_inode_state(inode, EXT4_STATE_NEW);
-
+ if (set_large_file) {
+ BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
+ err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ if (err)
+ goto out_brelse;
+ ext4_update_dynamic_rev(sb);
+ EXT4_SET_RO_COMPAT_FEATURE(sb,
+ EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
+ ext4_handle_sync(handle);
+ err = ext4_handle_dirty_super(handle, sb);
+ }
ext4_update_inode_fsync_trans(handle, inode, need_datasync);
out_brelse:
brelse(bh);
@@ -4412,21 +4437,20 @@ out_brelse:
*
* We are called from a few places:
*
- * - Within generic_file_write() for O_SYNC files.
+ * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
* Here, there will be no transaction running. We wait for any running
* transaction to commit.
*
- * - Within sys_sync(), kupdate and such.
- * We wait on commit, if tol to.
+ * - Within flush work (sys_sync(), kupdate and such).
+ * We wait on commit, if told to.
*
- * - Within prune_icache() (PF_MEMALLOC == true)
- * Here we simply return. We can't afford to block kswapd on the
- * journal commit.
+ * - Within iput_final() -> write_inode_now()
+ * We wait on commit, if told to.
*
* In all cases it is actually safe for us to return without doing anything,
* because the inode has been copied into a raw inode buffer in
- * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
- * knfsd.
+ * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL
+ * writeback.
*
* Note that we are absolutely dependent upon all inode dirtiers doing the
* right thing: they *must* call mark_inode_dirty() after dirtying info in
@@ -4438,15 +4462,15 @@ out_brelse:
* stuff();
* inode->i_size = expr;
*
- * is in error because a kswapd-driven write_inode() could occur while
- * `stuff()' is running, and the new i_size will be lost. Plus the inode
- * will no longer be on the superblock's dirty inode list.
+ * is in error because write_inode() could occur while `stuff()' is running,
+ * and the new i_size will be lost. Plus the inode will no longer be on the
+ * superblock's dirty inode list.
*/
int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
{
int err;
- if (current->flags & PF_MEMALLOC)
+ if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
return 0;
if (EXT4_SB(inode->i_sb)->s_journal) {
@@ -4456,7 +4480,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
return -EIO;
}
- if (wbc->sync_mode != WB_SYNC_ALL)
+ /*
+ * No need to force transaction in WB_SYNC_NONE mode. Also
+ * ext4_sync_fs() will force the commit after everything is
+ * written.
+ */
+ if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
return 0;
err = ext4_force_commit(inode->i_sb);
@@ -4466,7 +4495,11 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
err = __ext4_get_inode_loc(inode, &iloc, 0);
if (err)
return err;
- if (wbc->sync_mode == WB_SYNC_ALL)
+ /*
+ * sync(2) will flush the whole buffer cache. No need to do
+ * it here separately for each inode.
+ */
+ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
sync_dirty_buffer(iloc.bh);
if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr,
@@ -4594,6 +4627,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (attr->ia_size > sbi->s_bitmap_maxbytes)
return -EFBIG;
}
+
+ if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size)
+ inode_inc_iversion(inode);
+
if (S_ISREG(inode->i_mode) &&
(attr->ia_size < inode->i_size)) {
if (ext4_should_order_data(inode)) {
@@ -4671,7 +4708,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
ext4_orphan_del(NULL, inode);
if (!rc && (ia_valid & ATTR_MODE))
- rc = ext4_acl_chmod(inode);
+ rc = posix_acl_chmod(inode, inode->i_mode);
err_out:
ext4_std_error(inode->i_sb, error);
@@ -4690,6 +4727,15 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
generic_fillattr(inode, stat);
/*
+ * If there is inline data in the inode, the inode will normally not
+ * have data blocks allocated (it may have an external xattr block).
+ * Report at least one sector for such files, so tools like tar, rsync,
+ * others doen't incorrectly think the file is completely sparse.
+ */
+ if (unlikely(ext4_has_inline_data(inode)))
+ stat->blocks += (stat->size + 511) >> 9;
+
+ /*
* We can't update i_blocks if the block allocation is delayed
* otherwise in the case of system crash before the real block
* allocation is done, we will have i_blocks inconsistent with
@@ -4700,9 +4746,8 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
* blocks for this file.
*/
delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
- EXT4_I(inode)->i_reserved_data_blocks);
-
- stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9);
+ EXT4_I(inode)->i_reserved_data_blocks);
+ stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits - 9);
return 0;
}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a569d335f80..0f2252ec274 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -101,28 +101,18 @@ static long swap_inode_boot_loader(struct super_block *sb,
handle_t *handle;
int err;
struct inode *inode_bl;
- struct ext4_inode_info *ei;
struct ext4_inode_info *ei_bl;
- struct ext4_sb_info *sbi;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) {
- err = -EINVAL;
- goto swap_boot_out;
- }
-
- if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
- err = -EPERM;
- goto swap_boot_out;
- }
+ if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode))
+ return -EINVAL;
- sbi = EXT4_SB(sb);
- ei = EXT4_I(inode);
+ if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN))
+ return -EPERM;
inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
- if (IS_ERR(inode_bl)) {
- err = PTR_ERR(inode_bl);
- goto swap_boot_out;
- }
+ if (IS_ERR(inode_bl))
+ return PTR_ERR(inode_bl);
ei_bl = EXT4_I(inode_bl);
filemap_flush(inode->i_mapping);
@@ -130,7 +120,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
/* Protect orig inodes against a truncate and make sure,
* that only 1 swap_inode_boot_loader is running. */
- ext4_inode_double_lock(inode, inode_bl);
+ lock_two_nondirectories(inode, inode_bl);
truncate_inode_pages(&inode->i_data, 0);
truncate_inode_pages(&inode_bl->i_data, 0);
@@ -144,7 +134,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
if (IS_ERR(handle)) {
err = -EINVAL;
- goto swap_boot_out;
+ goto journal_err_out;
}
/* Protect extent tree against block allocations via delalloc */
@@ -197,19 +187,14 @@ static long swap_inode_boot_loader(struct super_block *sb,
ext4_mark_inode_dirty(handle, inode);
}
}
-
ext4_journal_stop(handle);
-
ext4_double_up_write_data_sem(inode, inode_bl);
+journal_err_out:
ext4_inode_resume_unlocked_dio(inode);
ext4_inode_resume_unlocked_dio(inode_bl);
-
- ext4_inode_double_unlock(inode, inode_bl);
-
+ unlock_two_nondirectories(inode, inode_bl);
iput(inode_bl);
-
-swap_boot_out:
return err;
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a41e3ba8cfa..2dcb936be90 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -722,6 +722,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
void *buddy, void *bitmap, ext4_group_t group)
{
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
ext4_grpblk_t i = 0;
ext4_grpblk_t first;
@@ -751,14 +752,17 @@ void ext4_mb_generate_buddy(struct super_block *sb,
if (free != grp->bb_free) {
ext4_grp_locked_error(sb, group, 0, 0,
- "%u clusters in bitmap, %u in gd; "
- "block bitmap corrupt.",
+ "block bitmap and bg descriptor "
+ "inconsistent: %u vs %u free clusters",
free, grp->bb_free);
/*
* If we intend to continue, we consider group descriptor
* corrupt and update bb_free using bitmap value
*/
grp->bb_free = free;
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ grp->bb_free);
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
}
mb_set_largest_free_order(sb, grp);
@@ -989,7 +993,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
poff = block % blocks_per_page;
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (!page)
- return -EIO;
+ return -ENOMEM;
BUG_ON(page->mapping != inode->i_mapping);
e4b->bd_bitmap_page = page;
e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
@@ -1003,7 +1007,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
pnum = block / blocks_per_page;
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (!page)
- return -EIO;
+ return -ENOMEM;
BUG_ON(page->mapping != inode->i_mapping);
e4b->bd_buddy_page = page;
return 0;
@@ -1044,6 +1048,8 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
* allocating. If we are looking at the buddy cache we would
* have taken a reference using ext4_mb_load_buddy and that
* would have pinned buddy page to page cache.
+ * The call to ext4_mb_get_buddy_page_lock will mark the
+ * page accessed.
*/
ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
@@ -1062,7 +1068,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
ret = -EIO;
goto err;
}
- mark_page_accessed(page);
if (e4b.bd_buddy_page == NULL) {
/*
@@ -1082,7 +1087,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
ret = -EIO;
goto err;
}
- mark_page_accessed(page);
err:
ext4_mb_put_buddy_page_lock(&e4b);
return ret;
@@ -1141,7 +1145,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
/* we could use find_or_create_page(), but it locks page
* what we'd like to avoid in fast path ... */
- page = find_get_page(inode->i_mapping, pnum);
+ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
if (page == NULL || !PageUptodate(page)) {
if (page)
/*
@@ -1168,19 +1172,24 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page)) {
+ if (page == NULL) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ if (!PageUptodate(page)) {
ret = -EIO;
goto err;
}
+
+ /* Pages marked accessed already */
e4b->bd_bitmap_page = page;
e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
- mark_page_accessed(page);
block++;
pnum = block / blocks_per_page;
poff = block % blocks_per_page;
- page = find_get_page(inode->i_mapping, pnum);
+ page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
if (page == NULL || !PageUptodate(page)) {
if (page)
page_cache_release(page);
@@ -1197,13 +1206,18 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page)) {
+ if (page == NULL) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ if (!PageUptodate(page)) {
ret = -EIO;
goto err;
}
+
+ /* Pages marked accessed already */
e4b->bd_buddy_page = page;
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
- mark_page_accessed(page);
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);
@@ -1421,6 +1435,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
if (unlikely(block != -1)) {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t blocknr;
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
@@ -1431,6 +1446,9 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
"freeing already freed block "
"(bit %u); block bitmap corrupt.",
block);
+ if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))
+ percpu_counter_sub(&sbi->s_freeclusters_counter,
+ e4b->bd_info->bb_free);
/* Mark the block group as corrupt. */
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
&e4b->bd_info->bb_state);
@@ -1808,6 +1826,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
ext4_lock_group(ac->ac_sb, group);
max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
ac->ac_g_ex.fe_len, &ex);
+ ex.fe_logical = 0xDEADFA11; /* debug value */
if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
ext4_fsblk_t start;
@@ -1936,7 +1955,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
*/
break;
}
-
+ ex.fe_logical = 0xDEADC0DE; /* debug value */
ext4_mb_measure_extent(ac, &ex, e4b);
i += ex.fe_len;
@@ -1977,6 +1996,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
if (max >= sbi->s_stripe) {
ac->ac_found++;
+ ex.fe_logical = 0xDEADF00D; /* debug value */
ac->ac_b_ex = ex;
ext4_mb_use_best_found(ac, e4b);
break;
@@ -2607,7 +2627,7 @@ int ext4_mb_init(struct super_block *sb)
sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
if (sbi->s_locality_groups == NULL) {
ret = -ENOMEM;
- goto out_free_groupinfo_slab;
+ goto out;
}
for_each_possible_cpu(i) {
struct ext4_locality_group *lg;
@@ -2632,8 +2652,6 @@ int ext4_mb_init(struct super_block *sb)
out_free_locality_groups:
free_percpu(sbi->s_locality_groups);
sbi->s_locality_groups = NULL;
-out_free_groupinfo_slab:
- ext4_groupinfo_destroy_slabs();
out:
kfree(sbi->s_mb_offsets);
sbi->s_mb_offsets = NULL;
@@ -2866,6 +2884,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
if (!bitmap_bh)
goto out_err;
+ BUFFER_TRACE(bitmap_bh, "getting write access");
err = ext4_journal_get_write_access(handle, bitmap_bh);
if (err)
goto out_err;
@@ -2878,6 +2897,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
ext4_free_group_clusters(sb, gdp));
+ BUFFER_TRACE(gdp_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gdp_bh);
if (err)
goto out_err;
@@ -3135,7 +3155,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
}
BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
start > ac->ac_o_ex.fe_logical);
- BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
+ BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
/* now prepare goal request */
@@ -3442,6 +3462,9 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
{
struct ext4_prealloc_space *pa;
pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+
+ BUG_ON(atomic_read(&pa->pa_count));
+ BUG_ON(pa->pa_deleted == 0);
kmem_cache_free(ext4_pspace_cachep, pa);
}
@@ -3455,11 +3478,13 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
ext4_group_t grp;
ext4_fsblk_t grp_blk;
- if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
- return;
-
/* in this short window concurrent discard can set pa_deleted */
spin_lock(&pa->pa_lock);
+ if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
+ spin_unlock(&pa->pa_lock);
+ return;
+ }
+
if (pa->pa_deleted == 1) {
spin_unlock(&pa->pa_lock);
return;
@@ -4001,8 +4026,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
(unsigned long)ac->ac_b_ex.fe_len,
(unsigned long)ac->ac_b_ex.fe_logical,
(int)ac->ac_criteria);
- ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found",
- ac->ac_ex_scanned, ac->ac_found);
+ ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
ngroups = ext4_get_groups_count(sb);
for (i = 0; i < ngroups; i++) {
@@ -4121,7 +4145,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
ext4_get_group_no_and_offset(sb, goal, &group, &block);
/* set up allocation goals */
- ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
+ ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_sb = sb;
ac->ac_inode = ar->inode;
@@ -4663,7 +4687,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
* blocks at the beginning or the end unless we are explicitly
* requested to avoid doing so.
*/
- overflow = block & (sbi->s_cluster_ratio - 1);
+ overflow = EXT4_PBLK_COFF(sbi, block);
if (overflow) {
if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
overflow = sbi->s_cluster_ratio - overflow;
@@ -4677,7 +4701,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
count += overflow;
}
}
- overflow = count & (sbi->s_cluster_ratio - 1);
+ overflow = EXT4_LBLK_COFF(sbi, count);
if (overflow) {
if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
if (count > overflow)
@@ -4794,8 +4818,8 @@ do_more:
" group:%d block:%d count:%lu failed"
" with %d", block_group, bit, count,
err);
- }
-
+ } else
+ EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
ext4_lock_group(sb, block_group);
mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
@@ -5002,6 +5026,8 @@ error_return:
*/
static int ext4_trim_extent(struct super_block *sb, int start, int count,
ext4_group_t group, struct ext4_buddy *e4b)
+__releases(bitlock)
+__acquires(bitlock)
{
struct ext4_free_extent ex;
int ret = 0;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 08481ee84cd..d634e183b4d 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -48,7 +48,7 @@ extern ushort ext4_mballoc_debug;
} \
} while (0)
#else
-#define mb_debug(n, fmt, a...)
+#define mb_debug(n, fmt, a...) no_printk(fmt, ## a)
#endif
#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */
@@ -175,8 +175,6 @@ struct ext4_allocation_context {
/* copy of the best found extent taken before preallocation efforts */
struct ext4_free_extent ac_f_ex;
- /* number of iterations done. we have to track to limit searching */
- unsigned long ac_ex_scanned;
__u16 ac_groups_scanned;
__u16 ac_found;
__u16 ac_tail;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 2ae73a80c19..ec092437d3e 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -505,7 +505,7 @@ int ext4_ext_migrate(struct inode *inode)
* with i_data_sem held to prevent racing with block
* allocation.
*/
- down_read((&EXT4_I(inode)->i_data_sem));
+ down_read(&EXT4_I(inode)->i_data_sem);
ext4_set_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
up_read((&EXT4_I(inode)->i_data_sem));
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 214461e42a0..32bce844c2e 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -18,7 +18,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
return cpu_to_le32(csum);
}
-int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
+static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
{
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
@@ -27,7 +27,7 @@ int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
}
-void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
+static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
{
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
@@ -259,7 +259,7 @@ static unsigned int mmp_new_seq(void)
u32 new_seq;
do {
- get_random_bytes(&new_seq, sizeof(u32));
+ new_seq = prandom_u32();
} while (new_seq > EXT4_MMP_SEQ_MAX);
return new_seq;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 7fa4d855dbd..2484c7ec6a7 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -57,8 +57,8 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock,
static void
copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
{
- if (ext4_ext_is_uninitialized(src))
- ext4_ext_mark_uninitialized(dest);
+ if (ext4_ext_is_unwritten(src))
+ ext4_ext_mark_unwritten(dest);
else
dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest));
}
@@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
* ext4_ext_path structure refers to the last extent, or a negative error
* value on failure.
*/
-static int
+int
mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
struct ext4_extent **extent)
{
@@ -391,6 +391,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode,
if (depth) {
/* Register to journal */
+ BUFFER_TRACE(orig_path->p_bh, "get_write_access");
ret = ext4_journal_get_write_access(handle, orig_path->p_bh);
if (ret)
return ret;
@@ -593,14 +594,14 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
* @inode: inode in question
* @from: block offset of inode
* @count: block count to be checked
- * @uninit: extents expected to be uninitialized
+ * @unwritten: extents expected to be unwritten
* @err: pointer to save error value
*
* Return 1 if all extents in range has expected type, and zero otherwise.
*/
static int
mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
- int uninit, int *err)
+ int unwritten, int *err)
{
struct ext4_ext_path *path = NULL;
struct ext4_extent *ext;
@@ -611,7 +612,7 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
if (*err)
goto out;
ext = path[ext_depth(inode)].p_ext;
- if (uninit != ext4_ext_is_uninitialized(ext))
+ if (unwritten != ext4_ext_is_unwritten(ext))
goto out;
from += ext4_ext_get_actual_len(ext);
ext4_ext_drop_refs(path);
@@ -861,8 +862,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
}
if (!buffer_mapped(bh)) {
zero_user(page, block_start, blocksize);
- if (!err)
- set_buffer_uptodate(bh);
+ set_buffer_uptodate(bh);
continue;
}
}
@@ -895,7 +895,7 @@ out:
* @orig_page_offset: page index on original file
* @data_offset_in_page: block index where data swapping starts
* @block_len_in_page: the number of blocks to be swapped
- * @uninit: orig extent is uninitialized or not
+ * @unwritten: orig extent is unwritten or not
* @err: pointer to save return value
*
* Save the data in original inode blocks and replace original inode extents
@@ -906,7 +906,7 @@ out:
static int
move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
pgoff_t orig_page_offset, int data_offset_in_page,
- int block_len_in_page, int uninit, int *err)
+ int block_len_in_page, int unwritten, int *err)
{
struct inode *orig_inode = file_inode(o_filp);
struct page *pagep[2] = {NULL, NULL};
@@ -963,27 +963,27 @@ again:
if (unlikely(*err < 0))
goto stop_journal;
/*
- * If orig extent was uninitialized it can become initialized
+ * If orig extent was unwritten it can become initialized
* at any time after i_data_sem was dropped, in order to
* serialize with delalloc we have recheck extent while we
* hold page's lock, if it is still the case data copy is not
* necessary, just swap data blocks between orig and donor.
*/
- if (uninit) {
+ if (unwritten) {
ext4_double_down_write_data_sem(orig_inode, donor_inode);
/* If any of extents in range became initialized we have to
* fallback to data copying */
- uninit = mext_check_coverage(orig_inode, orig_blk_offset,
- block_len_in_page, 1, err);
+ unwritten = mext_check_coverage(orig_inode, orig_blk_offset,
+ block_len_in_page, 1, err);
if (*err)
goto drop_data_sem;
- uninit &= mext_check_coverage(donor_inode, orig_blk_offset,
- block_len_in_page, 1, err);
+ unwritten &= mext_check_coverage(donor_inode, orig_blk_offset,
+ block_len_in_page, 1, err);
if (*err)
goto drop_data_sem;
- if (!uninit) {
+ if (!unwritten) {
ext4_double_up_write_data_sem(orig_inode, donor_inode);
goto data_copy;
}
@@ -1203,42 +1203,6 @@ mext_check_arguments(struct inode *orig_inode,
}
/**
- * ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
- *
- * @inode1: the inode structure
- * @inode2: the inode structure
- *
- * Lock two inodes' i_mutex
- */
-void
-ext4_inode_double_lock(struct inode *inode1, struct inode *inode2)
-{
- BUG_ON(inode1 == inode2);
- if (inode1 < inode2) {
- mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
- } else {
- mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
- }
-}
-
-/**
- * ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
- *
- * @inode1: the inode that is released first
- * @inode2: the inode that is released second
- *
- */
-
-void
-ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2)
-{
- mutex_unlock(&inode1->i_mutex);
- mutex_unlock(&inode2->i_mutex);
-}
-
-/**
* ext4_move_extents - Exchange the specified range of a file
*
* @o_filp: file structure of the original file
@@ -1296,7 +1260,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
int data_offset_in_page;
int block_len_in_page;
- int uninit;
+ int unwritten;
if (orig_inode->i_sb != donor_inode->i_sb) {
ext4_debug("ext4 move extent: The argument files "
@@ -1327,7 +1291,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
return -EINVAL;
}
/* Protect orig and donor inodes against a truncate */
- ext4_inode_double_lock(orig_inode, donor_inode);
+ lock_two_nondirectories(orig_inode, donor_inode);
/* Wait for all existing dio workers */
ext4_inode_block_unlocked_dio(orig_inode);
@@ -1428,8 +1392,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
!last_extent)
continue;
- /* Is original extent is uninitialized */
- uninit = ext4_ext_is_uninitialized(ext_prev);
+ /* Is original extent is unwritten */
+ unwritten = ext4_ext_is_unwritten(ext_prev);
data_offset_in_page = seq_start % blocks_per_page;
@@ -1469,8 +1433,8 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
o_filp, donor_inode,
orig_page_offset,
data_offset_in_page,
- block_len_in_page, uninit,
- &ret);
+ block_len_in_page,
+ unwritten, &ret);
/* Count how many blocks we have exchanged */
*moved_len += block_len_in_page;
@@ -1535,7 +1499,7 @@ out:
ext4_double_up_write_data_sem(orig_inode, donor_inode);
ext4_inode_resume_unlocked_dio(orig_inode);
ext4_inode_resume_unlocked_dio(donor_inode);
- ext4_inode_double_unlock(orig_inode, donor_inode);
+ unlock_two_nondirectories(orig_inode, donor_inode);
return ret;
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 1bec5a5c1e4..3520ab8a663 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -67,6 +67,7 @@ static struct buffer_head *ext4_append(handle_t *handle,
return ERR_PTR(err);
inode->i_size += inode->i_sb->s_blocksize;
EXT4_I(inode)->i_disksize = inode->i_size;
+ BUFFER_TRACE(bh, "get_write_access");
err = ext4_journal_get_write_access(handle, bh);
if (err) {
brelse(bh);
@@ -1425,9 +1426,8 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
return ERR_PTR(-EIO);
}
if (unlikely(ino == dir->i_ino)) {
- EXT4_ERROR_INODE(dir, "'%.*s' linked to parent dir",
- dentry->d_name.len,
- dentry->d_name.name);
+ EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir",
+ dentry);
return ERR_PTR(-EIO);
}
inode = ext4_iget(dir->i_sb, ino);
@@ -1779,6 +1779,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
blocksize = dir->i_sb->s_blocksize;
dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
+ BUFFER_TRACE(bh, "get_write_access");
retval = ext4_journal_get_write_access(handle, bh);
if (retval) {
ext4_std_error(dir->i_sb, retval);
@@ -2319,7 +2320,7 @@ retry:
d_tmpfile(dentry, inode);
err = ext4_orphan_add(handle, inode);
if (err)
- goto err_drop_inode;
+ goto err_unlock_inode;
mark_inode_dirty(inode);
unlock_new_inode(inode);
}
@@ -2328,10 +2329,9 @@ retry:
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
-err_drop_inode:
+err_unlock_inode:
ext4_journal_stop(handle);
unlock_new_inode(inode);
- iput(inode);
return err;
}
@@ -2512,8 +2512,7 @@ static int empty_dir(struct inode *inode)
ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
de = ext4_next_entry(de1, sb->s_blocksize);
while (offset < inode->i_size) {
- if (!bh ||
- (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+ if ((void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
unsigned int lblock;
err = 0;
brelse(bh);
@@ -2541,26 +2540,37 @@ static int empty_dir(struct inode *inode)
return 1;
}
-/* ext4_orphan_add() links an unlinked or truncated inode into a list of
+/*
+ * ext4_orphan_add() links an unlinked or truncated inode into a list of
* such inodes, starting at the superblock, in case we crash before the
* file is closed/deleted, or in case the inode truncate spans multiple
* transactions and the last transaction is not recovered after a crash.
*
* At filesystem recovery time, we walk this list deleting unlinked
* inodes and truncating linked inodes in ext4_orphan_cleanup().
+ *
+ * Orphan list manipulation functions must be called under i_mutex unless
+ * we are just creating the inode or deleting it.
*/
int ext4_orphan_add(handle_t *handle, struct inode *inode)
{
struct super_block *sb = inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_iloc iloc;
int err = 0, rc;
+ bool dirty = false;
- if (!EXT4_SB(sb)->s_journal)
+ if (!sbi->s_journal)
return 0;
- mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
+ WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+ !mutex_is_locked(&inode->i_mutex));
+ /*
+ * Exit early if inode already is on orphan list. This is a big speedup
+ * since we don't have to contend on the global s_orphan_lock.
+ */
if (!list_empty(&EXT4_I(inode)->i_orphan))
- goto out_unlock;
+ return 0;
/*
* Orphan handling is only valid for files with data blocks
@@ -2571,48 +2581,51 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
- BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
- err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sbi->s_sbh);
if (err)
- goto out_unlock;
+ goto out;
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err)
- goto out_unlock;
+ goto out;
+
+ mutex_lock(&sbi->s_orphan_lock);
/*
* Due to previous errors inode may be already a part of on-disk
* orphan list. If so skip on-disk list modification.
*/
- if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
- goto mem_insert;
-
- /* Insert this inode at the head of the on-disk orphan list... */
- NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
- EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
- err = ext4_handle_dirty_super(handle, sb);
- rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
- if (!err)
- err = rc;
-
- /* Only add to the head of the in-memory list if all the
- * previous operations succeeded. If the orphan_add is going to
- * fail (possibly taking the journal offline), we can't risk
- * leaving the inode on the orphan list: stray orphan-list
- * entries can cause panics at unmount time.
- *
- * This is safe: on error we're going to ignore the orphan list
- * anyway on the next recovery. */
-mem_insert:
- if (!err)
- list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
-
+ if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) >
+ (le32_to_cpu(sbi->s_es->s_inodes_count))) {
+ /* Insert this inode at the head of the on-disk orphan list */
+ NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan);
+ sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+ dirty = true;
+ }
+ list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan);
+ mutex_unlock(&sbi->s_orphan_lock);
+
+ if (dirty) {
+ err = ext4_handle_dirty_super(handle, sb);
+ rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
+ if (!err)
+ err = rc;
+ if (err) {
+ /*
+ * We have to remove inode from in-memory list if
+ * addition to on disk orphan list failed. Stray orphan
+ * list entries can cause panics at unmount time.
+ */
+ mutex_lock(&sbi->s_orphan_lock);
+ list_del(&EXT4_I(inode)->i_orphan);
+ mutex_unlock(&sbi->s_orphan_lock);
+ }
+ }
jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
jbd_debug(4, "orphan inode %lu will point to %d\n",
inode->i_ino, NEXT_ORPHAN(inode));
-out_unlock:
- mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
- ext4_std_error(inode->i_sb, err);
+out:
+ ext4_std_error(sb, err);
return err;
}
@@ -2624,45 +2637,51 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
{
struct list_head *prev;
struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_sb_info *sbi;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 ino_next;
struct ext4_iloc iloc;
int err = 0;
- if ((!EXT4_SB(inode->i_sb)->s_journal) &&
- !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS))
+ if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS))
return 0;
- mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
+ WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
+ !mutex_is_locked(&inode->i_mutex));
+ /* Do this quick check before taking global s_orphan_lock. */
if (list_empty(&ei->i_orphan))
- goto out;
+ return 0;
- ino_next = NEXT_ORPHAN(inode);
- prev = ei->i_orphan.prev;
- sbi = EXT4_SB(inode->i_sb);
+ if (handle) {
+ /* Grab inode buffer early before taking global s_orphan_lock */
+ err = ext4_reserve_inode_write(handle, inode, &iloc);
+ }
+ mutex_lock(&sbi->s_orphan_lock);
jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
+ prev = ei->i_orphan.prev;
list_del_init(&ei->i_orphan);
/* If we're on an error path, we may not have a valid
* transaction handle with which to update the orphan list on
* disk, but we still need to remove the inode from the linked
* list in memory. */
- if (!handle)
- goto out;
-
- err = ext4_reserve_inode_write(handle, inode, &iloc);
- if (err)
+ if (!handle || err) {
+ mutex_unlock(&sbi->s_orphan_lock);
goto out_err;
+ }
+ ino_next = NEXT_ORPHAN(inode);
if (prev == &sbi->s_orphan) {
jbd_debug(4, "superblock will point to %u\n", ino_next);
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
- if (err)
+ if (err) {
+ mutex_unlock(&sbi->s_orphan_lock);
goto out_brelse;
+ }
sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+ mutex_unlock(&sbi->s_orphan_lock);
err = ext4_handle_dirty_super(handle, inode->i_sb);
} else {
struct ext4_iloc iloc2;
@@ -2672,20 +2691,20 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
jbd_debug(4, "orphan inode %lu will point to %u\n",
i_prev->i_ino, ino_next);
err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
- if (err)
+ if (err) {
+ mutex_unlock(&sbi->s_orphan_lock);
goto out_brelse;
+ }
NEXT_ORPHAN(i_prev) = ino_next;
err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
+ mutex_unlock(&sbi->s_orphan_lock);
}
if (err)
goto out_brelse;
NEXT_ORPHAN(inode) = 0;
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
-
out_err:
ext4_std_error(inode->i_sb, err);
-out:
- mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
return err;
out_brelse:
@@ -3002,6 +3021,154 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
return ext4_get_first_inline_block(inode, parent_de, retval);
}
+struct ext4_renament {
+ struct inode *dir;
+ struct dentry *dentry;
+ struct inode *inode;
+ bool is_dir;
+ int dir_nlink_delta;
+
+ /* entry for "dentry" */
+ struct buffer_head *bh;
+ struct ext4_dir_entry_2 *de;
+ int inlined;
+
+ /* entry for ".." in inode if it's a directory */
+ struct buffer_head *dir_bh;
+ struct ext4_dir_entry_2 *parent_de;
+ int dir_inlined;
+};
+
+static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent)
+{
+ int retval;
+
+ ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode,
+ &retval, &ent->parent_de,
+ &ent->dir_inlined);
+ if (!ent->dir_bh)
+ return retval;
+ if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino)
+ return -EIO;
+ BUFFER_TRACE(ent->dir_bh, "get_write_access");
+ return ext4_journal_get_write_access(handle, ent->dir_bh);
+}
+
+static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent,
+ unsigned dir_ino)
+{
+ int retval;
+
+ ent->parent_de->inode = cpu_to_le32(dir_ino);
+ BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata");
+ if (!ent->dir_inlined) {
+ if (is_dx(ent->inode)) {
+ retval = ext4_handle_dirty_dx_node(handle,
+ ent->inode,
+ ent->dir_bh);
+ } else {
+ retval = ext4_handle_dirty_dirent_node(handle,
+ ent->inode,
+ ent->dir_bh);
+ }
+ } else {
+ retval = ext4_mark_inode_dirty(handle, ent->inode);
+ }
+ if (retval) {
+ ext4_std_error(ent->dir->i_sb, retval);
+ return retval;
+ }
+ return 0;
+}
+
+static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
+ unsigned ino, unsigned file_type)
+{
+ int retval;
+
+ BUFFER_TRACE(ent->bh, "get write access");
+ retval = ext4_journal_get_write_access(handle, ent->bh);
+ if (retval)
+ return retval;
+ ent->de->inode = cpu_to_le32(ino);
+ if (EXT4_HAS_INCOMPAT_FEATURE(ent->dir->i_sb,
+ EXT4_FEATURE_INCOMPAT_FILETYPE))
+ ent->de->file_type = file_type;
+ ent->dir->i_version++;
+ ent->dir->i_ctime = ent->dir->i_mtime =
+ ext4_current_time(ent->dir);
+ ext4_mark_inode_dirty(handle, ent->dir);
+ BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata");
+ if (!ent->inlined) {
+ retval = ext4_handle_dirty_dirent_node(handle,
+ ent->dir, ent->bh);
+ if (unlikely(retval)) {
+ ext4_std_error(ent->dir->i_sb, retval);
+ return retval;
+ }
+ }
+ brelse(ent->bh);
+ ent->bh = NULL;
+
+ return 0;
+}
+
+static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
+ const struct qstr *d_name)
+{
+ int retval = -ENOENT;
+ struct buffer_head *bh;
+ struct ext4_dir_entry_2 *de;
+
+ bh = ext4_find_entry(dir, d_name, &de, NULL);
+ if (bh) {
+ retval = ext4_delete_entry(handle, dir, de, bh);
+ brelse(bh);
+ }
+ return retval;
+}
+
+static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent)
+{
+ int retval;
+ /*
+ * ent->de could have moved from under us during htree split, so make
+ * sure that we are deleting the right entry. We might also be pointing
+ * to a stale entry in the unused part of ent->bh so just checking inum
+ * and the name isn't enough.
+ */
+ if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino ||
+ ent->de->name_len != ent->dentry->d_name.len ||
+ strncmp(ent->de->name, ent->dentry->d_name.name,
+ ent->de->name_len)) {
+ retval = ext4_find_delete_entry(handle, ent->dir,
+ &ent->dentry->d_name);
+ } else {
+ retval = ext4_delete_entry(handle, ent->dir, ent->de, ent->bh);
+ if (retval == -ENOENT) {
+ retval = ext4_find_delete_entry(handle, ent->dir,
+ &ent->dentry->d_name);
+ }
+ }
+
+ if (retval) {
+ ext4_warning(ent->dir->i_sb,
+ "Deleting old file (%lu), %d, error=%d",
+ ent->dir->i_ino, ent->dir->i_nlink, retval);
+ }
+}
+
+static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent)
+{
+ if (ent->dir_nlink_delta) {
+ if (ent->dir_nlink_delta == -1)
+ ext4_dec_count(handle, ent->dir);
+ else
+ ext4_inc_count(handle, ent->dir);
+ ext4_mark_inode_dirty(handle, ent->dir);
+ }
+}
+
/*
* Anybody can rename anything with this: the permission checks are left to the
* higher-level routines.
@@ -3014,198 +3181,267 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
handle_t *handle = NULL;
- struct inode *old_inode, *new_inode;
- struct buffer_head *old_bh, *new_bh, *dir_bh;
- struct ext4_dir_entry_2 *old_de, *new_de;
+ struct ext4_renament old = {
+ .dir = old_dir,
+ .dentry = old_dentry,
+ .inode = old_dentry->d_inode,
+ };
+ struct ext4_renament new = {
+ .dir = new_dir,
+ .dentry = new_dentry,
+ .inode = new_dentry->d_inode,
+ };
int retval;
- int inlined = 0, new_inlined = 0;
- struct ext4_dir_entry_2 *parent_de;
- dquot_initialize(old_dir);
- dquot_initialize(new_dir);
-
- old_bh = new_bh = dir_bh = NULL;
+ dquot_initialize(old.dir);
+ dquot_initialize(new.dir);
/* Initialize quotas before so that eventual writes go
* in separate transaction */
- if (new_dentry->d_inode)
- dquot_initialize(new_dentry->d_inode);
+ if (new.inode)
+ dquot_initialize(new.inode);
- old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL);
+ old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
/*
* Check for inode number is _not_ due to possible IO errors.
* We might rmdir the source, keep it as pwd of some process
* and merrily kill the link to whatever was created under the
* same name. Goodbye sticky bit ;-<
*/
- old_inode = old_dentry->d_inode;
retval = -ENOENT;
- if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
+ if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
goto end_rename;
- new_inode = new_dentry->d_inode;
- new_bh = ext4_find_entry(new_dir, &new_dentry->d_name,
- &new_de, &new_inlined);
- if (new_bh) {
- if (!new_inode) {
- brelse(new_bh);
- new_bh = NULL;
+ new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
+ &new.de, &new.inlined);
+ if (new.bh) {
+ if (!new.inode) {
+ brelse(new.bh);
+ new.bh = NULL;
}
}
- if (new_inode && !test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
- ext4_alloc_da_blocks(old_inode);
+ if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC))
+ ext4_alloc_da_blocks(old.inode);
- handle = ext4_journal_start(old_dir, EXT4_HT_DIR,
- (2 * EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
+ handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
+ (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
if (IS_ERR(handle))
return PTR_ERR(handle);
- if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
+ if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
ext4_handle_sync(handle);
- if (S_ISDIR(old_inode->i_mode)) {
- if (new_inode) {
+ if (S_ISDIR(old.inode->i_mode)) {
+ if (new.inode) {
retval = -ENOTEMPTY;
- if (!empty_dir(new_inode))
+ if (!empty_dir(new.inode))
+ goto end_rename;
+ } else {
+ retval = -EMLINK;
+ if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir))
goto end_rename;
}
- retval = -EIO;
- dir_bh = ext4_get_first_dir_block(handle, old_inode,
- &retval, &parent_de,
- &inlined);
- if (!dir_bh)
- goto end_rename;
- if (le32_to_cpu(parent_de->inode) != old_dir->i_ino)
- goto end_rename;
- retval = -EMLINK;
- if (!new_inode && new_dir != old_dir &&
- EXT4_DIR_LINK_MAX(new_dir))
- goto end_rename;
- BUFFER_TRACE(dir_bh, "get_write_access");
- retval = ext4_journal_get_write_access(handle, dir_bh);
+ retval = ext4_rename_dir_prepare(handle, &old);
if (retval)
goto end_rename;
}
- if (!new_bh) {
- retval = ext4_add_entry(handle, new_dentry, old_inode);
+ if (!new.bh) {
+ retval = ext4_add_entry(handle, new.dentry, old.inode);
if (retval)
goto end_rename;
} else {
- BUFFER_TRACE(new_bh, "get write access");
- retval = ext4_journal_get_write_access(handle, new_bh);
+ retval = ext4_setent(handle, &new,
+ old.inode->i_ino, old.de->file_type);
if (retval)
goto end_rename;
- new_de->inode = cpu_to_le32(old_inode->i_ino);
- if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
- EXT4_FEATURE_INCOMPAT_FILETYPE))
- new_de->file_type = old_de->file_type;
- new_dir->i_version++;
- new_dir->i_ctime = new_dir->i_mtime =
- ext4_current_time(new_dir);
- ext4_mark_inode_dirty(handle, new_dir);
- BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
- if (!new_inlined) {
- retval = ext4_handle_dirty_dirent_node(handle,
- new_dir, new_bh);
- if (unlikely(retval)) {
- ext4_std_error(new_dir->i_sb, retval);
- goto end_rename;
- }
- }
- brelse(new_bh);
- new_bh = NULL;
}
/*
* Like most other Unix systems, set the ctime for inodes on a
* rename.
*/
- old_inode->i_ctime = ext4_current_time(old_inode);
- ext4_mark_inode_dirty(handle, old_inode);
+ old.inode->i_ctime = ext4_current_time(old.inode);
+ ext4_mark_inode_dirty(handle, old.inode);
/*
* ok, that's it
*/
- if (le32_to_cpu(old_de->inode) != old_inode->i_ino ||
- old_de->name_len != old_dentry->d_name.len ||
- strncmp(old_de->name, old_dentry->d_name.name, old_de->name_len) ||
- (retval = ext4_delete_entry(handle, old_dir,
- old_de, old_bh)) == -ENOENT) {
- /* old_de could have moved from under us during htree split, so
- * make sure that we are deleting the right entry. We might
- * also be pointing to a stale entry in the unused part of
- * old_bh so just checking inum and the name isn't enough. */
- struct buffer_head *old_bh2;
- struct ext4_dir_entry_2 *old_de2;
-
- old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name,
- &old_de2, NULL);
- if (old_bh2) {
- retval = ext4_delete_entry(handle, old_dir,
- old_de2, old_bh2);
- brelse(old_bh2);
- }
+ ext4_rename_delete(handle, &old);
+
+ if (new.inode) {
+ ext4_dec_count(handle, new.inode);
+ new.inode->i_ctime = ext4_current_time(new.inode);
}
- if (retval) {
- ext4_warning(old_dir->i_sb,
- "Deleting old file (%lu), %d, error=%d",
- old_dir->i_ino, old_dir->i_nlink, retval);
- }
-
- if (new_inode) {
- ext4_dec_count(handle, new_inode);
- new_inode->i_ctime = ext4_current_time(new_inode);
- }
- old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
- ext4_update_dx_flag(old_dir);
- if (dir_bh) {
- parent_de->inode = cpu_to_le32(new_dir->i_ino);
- BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
- if (!inlined) {
- if (is_dx(old_inode)) {
- retval = ext4_handle_dirty_dx_node(handle,
- old_inode,
- dir_bh);
- } else {
- retval = ext4_handle_dirty_dirent_node(handle,
- old_inode, dir_bh);
- }
- } else {
- retval = ext4_mark_inode_dirty(handle, old_inode);
- }
- if (retval) {
- ext4_std_error(old_dir->i_sb, retval);
+ old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir);
+ ext4_update_dx_flag(old.dir);
+ if (old.dir_bh) {
+ retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
+ if (retval)
goto end_rename;
- }
- ext4_dec_count(handle, old_dir);
- if (new_inode) {
+
+ ext4_dec_count(handle, old.dir);
+ if (new.inode) {
/* checked empty_dir above, can't have another parent,
* ext4_dec_count() won't work for many-linked dirs */
- clear_nlink(new_inode);
+ clear_nlink(new.inode);
} else {
- ext4_inc_count(handle, new_dir);
- ext4_update_dx_flag(new_dir);
- ext4_mark_inode_dirty(handle, new_dir);
+ ext4_inc_count(handle, new.dir);
+ ext4_update_dx_flag(new.dir);
+ ext4_mark_inode_dirty(handle, new.dir);
}
}
- ext4_mark_inode_dirty(handle, old_dir);
- if (new_inode) {
- ext4_mark_inode_dirty(handle, new_inode);
- if (!new_inode->i_nlink)
- ext4_orphan_add(handle, new_inode);
+ ext4_mark_inode_dirty(handle, old.dir);
+ if (new.inode) {
+ ext4_mark_inode_dirty(handle, new.inode);
+ if (!new.inode->i_nlink)
+ ext4_orphan_add(handle, new.inode);
+ }
+ retval = 0;
+
+end_rename:
+ brelse(old.dir_bh);
+ brelse(old.bh);
+ brelse(new.bh);
+ if (handle)
+ ext4_journal_stop(handle);
+ return retval;
+}
+
+static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ handle_t *handle = NULL;
+ struct ext4_renament old = {
+ .dir = old_dir,
+ .dentry = old_dentry,
+ .inode = old_dentry->d_inode,
+ };
+ struct ext4_renament new = {
+ .dir = new_dir,
+ .dentry = new_dentry,
+ .inode = new_dentry->d_inode,
+ };
+ u8 new_file_type;
+ int retval;
+
+ dquot_initialize(old.dir);
+ dquot_initialize(new.dir);
+
+ old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
+ &old.de, &old.inlined);
+ /*
+ * Check for inode number is _not_ due to possible IO errors.
+ * We might rmdir the source, keep it as pwd of some process
+ * and merrily kill the link to whatever was created under the
+ * same name. Goodbye sticky bit ;-<
+ */
+ retval = -ENOENT;
+ if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
+ goto end_rename;
+
+ new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
+ &new.de, &new.inlined);
+
+ /* RENAME_EXCHANGE case: old *and* new must both exist */
+ if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
+ goto end_rename;
+
+ handle = ext4_journal_start(old.dir, EXT4_HT_DIR,
+ (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) +
+ 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2));
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir))
+ ext4_handle_sync(handle);
+
+ if (S_ISDIR(old.inode->i_mode)) {
+ old.is_dir = true;
+ retval = ext4_rename_dir_prepare(handle, &old);
+ if (retval)
+ goto end_rename;
+ }
+ if (S_ISDIR(new.inode->i_mode)) {
+ new.is_dir = true;
+ retval = ext4_rename_dir_prepare(handle, &new);
+ if (retval)
+ goto end_rename;
}
+
+ /*
+ * Other than the special case of overwriting a directory, parents'
+ * nlink only needs to be modified if this is a cross directory rename.
+ */
+ if (old.dir != new.dir && old.is_dir != new.is_dir) {
+ old.dir_nlink_delta = old.is_dir ? -1 : 1;
+ new.dir_nlink_delta = -old.dir_nlink_delta;
+ retval = -EMLINK;
+ if ((old.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(old.dir)) ||
+ (new.dir_nlink_delta > 0 && EXT4_DIR_LINK_MAX(new.dir)))
+ goto end_rename;
+ }
+
+ new_file_type = new.de->file_type;
+ retval = ext4_setent(handle, &new, old.inode->i_ino, old.de->file_type);
+ if (retval)
+ goto end_rename;
+
+ retval = ext4_setent(handle, &old, new.inode->i_ino, new_file_type);
+ if (retval)
+ goto end_rename;
+
+ /*
+ * Like most other Unix systems, set the ctime for inodes on a
+ * rename.
+ */
+ old.inode->i_ctime = ext4_current_time(old.inode);
+ new.inode->i_ctime = ext4_current_time(new.inode);
+ ext4_mark_inode_dirty(handle, old.inode);
+ ext4_mark_inode_dirty(handle, new.inode);
+
+ if (old.dir_bh) {
+ retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
+ if (retval)
+ goto end_rename;
+ }
+ if (new.dir_bh) {
+ retval = ext4_rename_dir_finish(handle, &new, old.dir->i_ino);
+ if (retval)
+ goto end_rename;
+ }
+ ext4_update_dir_count(handle, &old);
+ ext4_update_dir_count(handle, &new);
retval = 0;
end_rename:
- brelse(dir_bh);
- brelse(old_bh);
- brelse(new_bh);
+ brelse(old.dir_bh);
+ brelse(new.dir_bh);
+ brelse(old.bh);
+ brelse(new.bh);
if (handle)
ext4_journal_stop(handle);
return retval;
}
+static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
+{
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ return -EINVAL;
+
+ if (flags & RENAME_EXCHANGE) {
+ return ext4_cross_rename(old_dir, old_dentry,
+ new_dir, new_dentry);
+ }
+ /*
+ * Existence checking was done by the VFS, otherwise "RENAME_NOREPLACE"
+ * is equivalent to regular rename.
+ */
+ return ext4_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
/*
* directories can handle most operations...
*/
@@ -3220,12 +3456,14 @@ const struct inode_operations ext4_dir_inode_operations = {
.mknod = ext4_mknod,
.tmpfile = ext4_tmpfile,
.rename = ext4_rename,
+ .rename2 = ext4_rename2,
.setattr = ext4_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
.get_acl = ext4_get_acl,
+ .set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
};
@@ -3236,4 +3474,5 @@ const struct inode_operations ext4_special_inode_operations = {
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
.get_acl = ext4_get_acl,
+ .set_acl = ext4_set_acl,
};
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index d7d0c7b46ed..b24a2541a9b 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -65,9 +65,9 @@ static void ext4_finish_bio(struct bio *bio)
{
int i;
int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec;
- for (i = 0; i < bio->bi_vcnt; i++) {
- struct bio_vec *bvec = &bio->bi_io_vec[i];
+ bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
struct buffer_head *bh, *head;
unsigned bio_start = bvec->bv_offset;
@@ -197,14 +197,15 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head)
static void ext4_add_complete_io(ext4_io_end_t *io_end)
{
struct ext4_inode_info *ei = EXT4_I(io_end->inode);
+ struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb);
struct workqueue_struct *wq;
unsigned long flags;
/* Only reserved conversions from writeback should enter here */
WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
- WARN_ON(!io_end->handle);
+ WARN_ON(!io_end->handle && sbi->s_journal);
spin_lock_irqsave(&ei->i_completed_io_lock, flags);
- wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
+ wq = sbi->rsv_conversion_wq;
if (list_empty(&ei->i_rsv_conversion_list))
queue_work(wq, &ei->i_rsv_conversion_work);
list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
@@ -297,7 +298,7 @@ ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
static void ext4_end_bio(struct bio *bio, int error)
{
ext4_io_end_t *io_end = bio->bi_private;
- sector_t bi_sector = bio->bi_sector;
+ sector_t bi_sector = bio->bi_iter.bi_sector;
BUG_ON(!io_end);
bio->bi_end_io = NULL;
@@ -307,13 +308,14 @@ static void ext4_end_bio(struct bio *bio, int error)
if (error) {
struct inode *inode = io_end->inode;
- ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
+ ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu "
"(offset %llu size %ld starting block %llu)",
- inode->i_ino,
+ error, inode->i_ino,
(unsigned long long) io_end->offset,
(long) io_end->size,
(unsigned long long)
bi_sector >> (inode->i_blkbits - 9));
+ mapping_set_error(inode->i_mapping, error);
}
if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
@@ -365,7 +367,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
if (!bio)
return -ENOMEM;
- bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
bio->bi_end_io = ext4_end_bio;
bio->bi_private = ext4_get_io_end(io->io_end);
@@ -399,7 +401,8 @@ submit_and_retry:
int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
int len,
- struct writeback_control *wbc)
+ struct writeback_control *wbc,
+ bool keep_towrite)
{
struct inode *inode = page->mapping->host;
unsigned block_start, blocksize;
@@ -412,10 +415,24 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
- set_page_writeback(page);
+ if (keep_towrite)
+ set_page_writeback_keepwrite(page);
+ else
+ set_page_writeback(page);
ClearPageError(page);
/*
+ * Comments copied from block_write_full_page:
+ *
+ * The page straddles i_size. It must be zeroed out on each and every
+ * writepage invocation because it may be mmapped. "A file is mapped
+ * in multiples of the page size. For a file that is not a multiple of
+ * the page size, the remaining memory is zeroed when mapped, and
+ * writes to that region are not written out to the file."
+ */
+ if (len < PAGE_CACHE_SIZE)
+ zero_user_segment(page, len, PAGE_CACHE_SIZE);
+ /*
* In the first loop we prepare and mark buffers to submit. We have to
* mark all buffers in the page before submitting so that
* end_page_writeback() cannot be called from ext4_bio_end_io() when IO
@@ -426,19 +443,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
do {
block_start = bh_offset(bh);
if (block_start >= len) {
- /*
- * Comments copied from block_write_full_page_endio:
- *
- * The page straddles i_size. It must be zeroed out on
- * each and every writepage invocation because it may
- * be mmapped. "A file is mapped in multiples of the
- * page size. For a file that is not a multiple of
- * the page size, the remaining memory is zeroed when
- * mapped, and writes to that region are not written
- * out to the file."
- */
- zero_user_segment(page, block_start,
- block_start + blocksize);
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
continue;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c5adbb318a9..bb0e80f03e2 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -42,7 +42,7 @@ int ext4_resize_begin(struct super_block *sb)
void ext4_resize_end(struct super_block *sb)
{
clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
}
static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb,
@@ -243,6 +243,7 @@ static int ext4_alloc_group_tables(struct super_block *sb,
ext4_group_t group;
ext4_group_t last_group;
unsigned overhead;
+ __u16 uninit_mask = (flexbg_size > 1) ? ~EXT4_BG_BLOCK_UNINIT : ~0;
BUG_ON(flex_gd->count == 0 || group_data == NULL);
@@ -266,7 +267,7 @@ next_group:
src_group++;
for (; src_group <= last_group; src_group++) {
overhead = ext4_group_overhead_blocks(sb, src_group);
- if (overhead != 0)
+ if (overhead == 0)
last_blk += group_data[src_group - group].blocks_count;
else
break;
@@ -280,8 +281,7 @@ next_group:
group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group;
group_data[group].free_blocks_count--;
- if (flexbg_size > 1)
- flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+ flex_gd->bg_flags[group] &= uninit_mask;
}
/* Allocate inode bitmaps */
@@ -292,22 +292,30 @@ next_group:
group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group;
group_data[group].free_blocks_count--;
- if (flexbg_size > 1)
- flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+ flex_gd->bg_flags[group] &= uninit_mask;
}
/* Allocate inode tables */
for (; it_index < flex_gd->count; it_index++) {
- if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
+ unsigned int itb = EXT4_SB(sb)->s_itb_per_group;
+ ext4_fsblk_t next_group_start;
+
+ if (start_blk + itb > last_blk)
goto next_group;
group_data[it_index].inode_table = start_blk;
- group = ext4_get_group_number(sb, start_blk - 1);
+ group = ext4_get_group_number(sb, start_blk);
+ next_group_start = ext4_group_first_block_no(sb, group + 1);
group -= group_data[0].group;
- group_data[group].free_blocks_count -=
- EXT4_SB(sb)->s_itb_per_group;
- if (flexbg_size > 1)
- flex_gd->bg_flags[group] &= ~EXT4_BG_BLOCK_UNINIT;
+ if (start_blk + itb > next_group_start) {
+ flex_gd->bg_flags[group + 1] &= uninit_mask;
+ overhead = start_blk + itb - next_group_start;
+ group_data[group + 1].free_blocks_count -= overhead;
+ itb -= overhead;
+ }
+
+ group_data[group].free_blocks_count -= itb;
+ flex_gd->bg_flags[group] &= uninit_mask;
start_blk += EXT4_SB(sb)->s_itb_per_group;
}
@@ -340,6 +348,7 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb,
bh = sb_getblk(sb, blk);
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
+ BUFFER_TRACE(bh, "get_write_access");
if ((err = ext4_journal_get_write_access(handle, bh))) {
brelse(bh);
bh = ERR_PTR(err);
@@ -401,7 +410,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
start = ext4_group_first_block_no(sb, group);
group -= flex_gd->groups[0].group;
- count2 = sb->s_blocksize * 8 - (block - start);
+ count2 = EXT4_BLOCKS_PER_GROUP(sb) - (block - start);
if (count2 > count)
count2 = count;
@@ -418,6 +427,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
if (unlikely(!bh))
return -ENOMEM;
+ BUFFER_TRACE(bh, "get_write_access");
err = ext4_journal_get_write_access(handle, bh);
if (err)
return err;
@@ -510,6 +520,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
goto out;
}
+ BUFFER_TRACE(gdb, "get_write_access");
err = ext4_journal_get_write_access(handle, gdb);
if (err) {
brelse(gdb);
@@ -620,7 +631,7 @@ handle_ib:
if (err)
goto out;
count = group_table_count[j];
- start = group_data[i].block_bitmap;
+ start = (&group_data[i].block_bitmap)[j];
block = start;
}
@@ -782,14 +793,17 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
goto exit_dind;
}
+ BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
if (unlikely(err))
goto exit_dind;
+ BUFFER_TRACE(gdb_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gdb_bh);
if (unlikely(err))
goto exit_dind;
+ BUFFER_TRACE(dind, "get_write_access");
err = ext4_journal_get_write_access(handle, dind);
if (unlikely(err))
ext4_std_error(sb, err);
@@ -894,6 +908,7 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
EXT4_SB(sb)->s_group_desc = n_group_desc;
EXT4_SB(sb)->s_gdb_count++;
ext4_kvfree(o_group_desc);
+ BUFFER_TRACE(gdb_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gdb_bh);
if (unlikely(err))
brelse(gdb_bh);
@@ -969,6 +984,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
}
for (i = 0; i < reserved_gdb; i++) {
+ BUFFER_TRACE(primary[i], "get_write_access");
if ((err = ext4_journal_get_write_access(handle, primary[i])))
goto exit_bh;
}
@@ -1076,6 +1092,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,
ext4_debug("update metadata backup %llu(+%llu)\n",
backup_block, backup_block -
ext4_group_first_block_no(sb, group));
+ BUFFER_TRACE(bh, "get_write_access");
if ((err = ext4_journal_get_write_access(handle, bh)))
break;
lock_buffer(bh);
@@ -1155,6 +1172,7 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
*/
if (gdb_off) {
gdb_bh = sbi->s_group_desc[gdb_num];
+ BUFFER_TRACE(gdb_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gdb_bh);
if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group))
@@ -1425,6 +1443,7 @@ static int ext4_flex_group_add(struct super_block *sb,
goto exit;
}
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
if (err)
goto exit_journal;
@@ -1637,6 +1656,7 @@ static int ext4_group_extend_no_check(struct super_block *sb,
return err;
}
+ BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
if (err) {
ext4_warning(sb, "error %d on journal write access", err);
@@ -1796,6 +1816,7 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
if (IS_ERR(handle))
return PTR_ERR(handle);
+ BUFFER_TRACE(sbi->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
if (err)
goto errout;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2c2e6cbc6be..6df7bc611db 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -59,6 +59,7 @@ static struct kset *ext4_kset;
static struct ext4_lazy_init *ext4_li_info;
static struct mutex ext4_li_mtx;
static struct ext4_features *ext4_feat;
+static int ext4_mballoc_ready;
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
unsigned long journal_devnum);
@@ -137,8 +138,8 @@ static __le32 ext4_superblock_csum(struct super_block *sb,
return cpu_to_le32(csum);
}
-int ext4_superblock_csum_verify(struct super_block *sb,
- struct ext4_super_block *es)
+static int ext4_superblock_csum_verify(struct super_block *sb,
+ struct ext4_super_block *es)
{
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
@@ -411,20 +412,26 @@ static void ext4_handle_error(struct super_block *sb)
sb->s_id);
}
+#define ext4_error_ratelimit(sb) \
+ ___ratelimit(&(EXT4_SB(sb)->s_err_ratelimit_state), \
+ "EXT4-fs error")
+
void __ext4_error(struct super_block *sb, const char *function,
unsigned int line, const char *fmt, ...)
{
struct va_format vaf;
va_list args;
- va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
- printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
- sb->s_id, function, line, current->comm, &vaf);
- va_end(args);
+ if (ext4_error_ratelimit(sb)) {
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_CRIT
+ "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
+ sb->s_id, function, line, current->comm, &vaf);
+ va_end(args);
+ }
save_error_info(sb, function, line);
-
ext4_handle_error(sb);
}
@@ -438,22 +445,23 @@ void __ext4_error_inode(struct inode *inode, const char *function,
es->s_last_error_ino = cpu_to_le32(inode->i_ino);
es->s_last_error_block = cpu_to_le64(block);
+ if (ext4_error_ratelimit(inode->i_sb)) {
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ if (block)
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
+ "inode #%lu: block %llu: comm %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ block, current->comm, &vaf);
+ else
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
+ "inode #%lu: comm %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ current->comm, &vaf);
+ va_end(args);
+ }
save_error_info(inode->i_sb, function, line);
- va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
- if (block)
- printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
- "inode #%lu: block %llu: comm %s: %pV\n",
- inode->i_sb->s_id, function, line, inode->i_ino,
- block, current->comm, &vaf);
- else
- printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
- "inode #%lu: comm %s: %pV\n",
- inode->i_sb->s_id, function, line, inode->i_ino,
- current->comm, &vaf);
- va_end(args);
-
ext4_handle_error(inode->i_sb);
}
@@ -469,27 +477,28 @@ void __ext4_error_file(struct file *file, const char *function,
es = EXT4_SB(inode->i_sb)->s_es;
es->s_last_error_ino = cpu_to_le32(inode->i_ino);
+ if (ext4_error_ratelimit(inode->i_sb)) {
+ path = d_path(&(file->f_path), pathname, sizeof(pathname));
+ if (IS_ERR(path))
+ path = "(unknown)";
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ if (block)
+ printk(KERN_CRIT
+ "EXT4-fs error (device %s): %s:%d: inode #%lu: "
+ "block %llu: comm %s: path %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ block, current->comm, path, &vaf);
+ else
+ printk(KERN_CRIT
+ "EXT4-fs error (device %s): %s:%d: inode #%lu: "
+ "comm %s: path %s: %pV\n",
+ inode->i_sb->s_id, function, line, inode->i_ino,
+ current->comm, path, &vaf);
+ va_end(args);
+ }
save_error_info(inode->i_sb, function, line);
- path = d_path(&(file->f_path), pathname, sizeof(pathname));
- if (IS_ERR(path))
- path = "(unknown)";
- va_start(args, fmt);
- vaf.fmt = fmt;
- vaf.va = &args;
- if (block)
- printk(KERN_CRIT
- "EXT4-fs error (device %s): %s:%d: inode #%lu: "
- "block %llu: comm %s: path %s: %pV\n",
- inode->i_sb->s_id, function, line, inode->i_ino,
- block, current->comm, path, &vaf);
- else
- printk(KERN_CRIT
- "EXT4-fs error (device %s): %s:%d: inode #%lu: "
- "comm %s: path %s: %pV\n",
- inode->i_sb->s_id, function, line, inode->i_ino,
- current->comm, path, &vaf);
- va_end(args);
-
ext4_handle_error(inode->i_sb);
}
@@ -543,11 +552,13 @@ void __ext4_std_error(struct super_block *sb, const char *function,
(sb->s_flags & MS_RDONLY))
return;
- errstr = ext4_decode_error(sb, errno, nbuf);
- printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
- sb->s_id, function, line, errstr);
- save_error_info(sb, function, line);
+ if (ext4_error_ratelimit(sb)) {
+ errstr = ext4_decode_error(sb, errno, nbuf);
+ printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n",
+ sb->s_id, function, line, errstr);
+ }
+ save_error_info(sb, function, line);
ext4_handle_error(sb);
}
@@ -597,6 +608,9 @@ void __ext4_msg(struct super_block *sb,
struct va_format vaf;
va_list args;
+ if (!___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state), "EXT4-fs"))
+ return;
+
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
@@ -610,6 +624,10 @@ void __ext4_warning(struct super_block *sb, const char *function,
struct va_format vaf;
va_list args;
+ if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
+ "EXT4-fs warning"))
+ return;
+
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
@@ -633,18 +651,20 @@ __acquires(bitlock)
es->s_last_error_block = cpu_to_le64(block);
__save_error_info(sb, function, line);
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
- printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
- sb->s_id, function, line, grp);
- if (ino)
- printk(KERN_CONT "inode %lu: ", ino);
- if (block)
- printk(KERN_CONT "block %llu:", (unsigned long long) block);
- printk(KERN_CONT "%pV\n", &vaf);
- va_end(args);
+ if (ext4_error_ratelimit(sb)) {
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
+ sb->s_id, function, line, grp);
+ if (ino)
+ printk(KERN_CONT "inode %lu: ", ino);
+ if (block)
+ printk(KERN_CONT "block %llu:",
+ (unsigned long long) block);
+ printk(KERN_CONT "%pV\n", &vaf);
+ va_end(args);
+ }
if (test_opt(sb, ERRORS_CONT)) {
ext4_commit_super(sb, 0);
@@ -773,7 +793,7 @@ static void ext4_put_super(struct super_block *sb)
}
ext4_es_unregister_shrinker(sbi);
- del_timer(&sbi->s_err_report);
+ del_timer_sync(&sbi->s_err_report);
ext4_release_system_zone(sb);
ext4_mb_release(sb);
ext4_ext_release(sb);
@@ -826,6 +846,10 @@ static void ext4_put_super(struct super_block *sb)
invalidate_bdev(sbi->journal_bdev);
ext4_blkdev_remove(sbi);
}
+ if (sbi->s_mb_cache) {
+ ext4_xattr_destroy_cache(sbi->s_mb_cache);
+ sbi->s_mb_cache = NULL;
+ }
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
sb->s_fs_info = NULL;
@@ -855,6 +879,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
return NULL;
ei->vfs_inode.i_version = 1;
+ spin_lock_init(&ei->i_raw_lock);
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
ext4_es_init_tree(&ei->i_es_tree);
@@ -921,7 +946,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
sizeof(struct ext4_inode_info),
@@ -1500,8 +1525,6 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
sbi->s_commit_interval = HZ * arg;
} else if (token == Opt_max_batch_time) {
- if (arg == 0)
- arg = EXT4_DEF_MAX_BATCH_TIME;
sbi->s_max_batch_time = arg;
} else if (token == Opt_min_batch_time) {
sbi->s_min_batch_time = arg;
@@ -1879,7 +1902,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
if (!(sbi->s_mount_state & EXT4_VALID_FS))
ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
"running e2fsck is recommended");
- else if ((sbi->s_mount_state & EXT4_ERROR_FS))
+ else if (sbi->s_mount_state & EXT4_ERROR_FS)
ext4_msg(sb, KERN_WARNING,
"warning: mounting fs with errors, "
"running e2fsck is recommended");
@@ -2380,6 +2403,16 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
if (ext4_bg_has_super(sb, bg))
has_super = 1;
+ /*
+ * If we have a meta_bg fs with 1k blocks, group 0's GDT is at
+ * block 2, not 1. If s_first_data_block == 0 (bigalloc is enabled
+ * on modern mke2fs or blksize > 1k on older mke2fs) then we must
+ * compensate.
+ */
+ if (sb->s_blocksize == 1024 && nr == 0 &&
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) == 0)
+ has_super++;
+
return (has_super + ext4_group_first_block_no(sb, bg));
}
@@ -2606,6 +2639,12 @@ EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
+EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
static struct attribute *ext4_attrs[] = {
ATTR_LIST(delayed_allocation_blocks),
@@ -2623,6 +2662,12 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(max_writeback_mb_bump),
ATTR_LIST(extent_max_zeroout_kb),
ATTR_LIST(trigger_fs_error),
+ ATTR_LIST(err_ratelimit_interval_ms),
+ ATTR_LIST(err_ratelimit_burst),
+ ATTR_LIST(warning_ratelimit_interval_ms),
+ ATTR_LIST(warning_ratelimit_burst),
+ ATTR_LIST(msg_ratelimit_interval_ms),
+ ATTR_LIST(msg_ratelimit_burst),
NULL,
};
@@ -2762,10 +2807,11 @@ static void print_daily_error_info(unsigned long arg)
es = sbi->s_es;
if (es->s_error_count)
- ext4_msg(sb, KERN_NOTICE, "error count: %u",
+ /* fsck newer than v1.41.13 is needed to clean this condition. */
+ ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
le32_to_cpu(es->s_error_count));
if (es->s_first_error_time) {
- printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d",
+ printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d",
sb->s_id, le32_to_cpu(es->s_first_error_time),
(int) sizeof(es->s_first_error_func),
es->s_first_error_func,
@@ -2779,7 +2825,7 @@ static void print_daily_error_info(unsigned long arg)
printk("\n");
}
if (es->s_last_error_time) {
- printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d",
+ printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d",
sb->s_id, le32_to_cpu(es->s_last_error_time),
(int) sizeof(es->s_last_error_func),
es->s_last_error_func,
@@ -3037,7 +3083,6 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_li_request *elr;
- unsigned long rnd;
elr = kzalloc(sizeof(*elr), GFP_KERNEL);
if (!elr)
@@ -3052,10 +3097,8 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb,
* spread the inode table initialization requests
* better.
*/
- get_random_bytes(&rnd, sizeof(rnd));
- elr->lr_next_sched = jiffies + (unsigned long)rnd %
- (EXT4_DEF_LI_MAX_START_DELAY * HZ);
-
+ elr->lr_next_sched = jiffies + (prandom_u32() %
+ (EXT4_DEF_LI_MAX_START_DELAY * HZ));
return elr;
}
@@ -3288,19 +3331,28 @@ int ext4_calculate_overhead(struct super_block *sb)
}
-static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
+static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
{
ext4_fsblk_t resv_clusters;
/*
+ * There's no need to reserve anything when we aren't using extents.
+ * The space estimates are exact, there are no unwritten extents,
+ * hole punching doesn't need new metadata... This is needed especially
+ * to keep ext2/3 backward compatibility.
+ */
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+ return 0;
+ /*
* By default we reserve 2% or 4096 clusters, whichever is smaller.
* This should cover the situations where we can not afford to run
* out of space like for example punch hole, or converting
- * uninitialized extents in delalloc path. In most cases such
+ * unwritten extents in delalloc path. In most cases such
* allocation would require 1, or 2 blocks, higher numbers are
* very rare.
*/
- resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
+ resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >>
+ EXT4_SB(sb)->s_cluster_bits;
do_div(resv_clusters, 50);
resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
@@ -3538,6 +3590,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"feature flags set on rev 0 fs, "
"running e2fsck is recommended");
+ if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
+ set_opt2(sb, HURD_COMPAT);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_64BIT)) {
+ ext4_msg(sb, KERN_ERR,
+ "The Hurd can't support 64-bit file systems");
+ goto failed_mount;
+ }
+ }
+
if (IS_EXT2_SB(sb)) {
if (ext2_feature_set_ok(sb))
ext4_msg(sb, KERN_INFO, "mounting ext2 file system "
@@ -3658,16 +3720,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
- i = le32_to_cpu(es->s_flags);
- if (i & EXT2_FLAGS_UNSIGNED_HASH)
- sbi->s_hash_unsigned = 3;
- else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
+ if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+ i = le32_to_cpu(es->s_flags);
+ if (i & EXT2_FLAGS_UNSIGNED_HASH)
+ sbi->s_hash_unsigned = 3;
+ else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) {
#ifdef __CHAR_UNSIGNED__
- es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
- sbi->s_hash_unsigned = 3;
+ if (!(sb->s_flags & MS_RDONLY))
+ es->s_flags |=
+ cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH);
+ sbi->s_hash_unsigned = 3;
#else
- es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
+ if (!(sb->s_flags & MS_RDONLY))
+ es->s_flags |=
+ cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
#endif
+ }
}
/* Handle clustersize */
@@ -3967,6 +4035,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
percpu_counter_set(&sbi->s_dirtyclusters_counter, 0);
no_journal:
+ if (ext4_mballoc_ready) {
+ sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
+ if (!sbi->s_mb_cache) {
+ ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
+ goto failed_mount_wq;
+ }
+ }
+
/*
* Get the # of file system overhead blocks from the
* superblock if present.
@@ -4043,10 +4119,10 @@ no_journal:
"available");
}
- err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi));
+ err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb));
if (err) {
ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
- "reserved pool", ext4_calculate_resv_clusters(sbi));
+ "reserved pool", ext4_calculate_resv_clusters(sb));
goto failed_mount4a;
}
@@ -4118,6 +4194,11 @@ no_journal:
if (es->s_error_count)
mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
+ /* Enable message ratelimiting. Default is 10 messages per 5 secs. */
+ ratelimit_state_init(&sbi->s_err_ratelimit_state, 5 * HZ, 10);
+ ratelimit_state_init(&sbi->s_warning_ratelimit_state, 5 * HZ, 10);
+ ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
+
kfree(orig_data);
return 0;
@@ -4151,7 +4232,7 @@ failed_mount_wq:
}
failed_mount3:
ext4_es_unregister_shrinker(sbi);
- del_timer(&sbi->s_err_report);
+ del_timer_sync(&sbi->s_err_report);
if (sbi->s_flex_groups)
ext4_kvfree(sbi->s_flex_groups);
percpu_counter_destroy(&sbi->s_freeclusters_counter);
@@ -4787,6 +4868,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
if (*flags & MS_RDONLY) {
+ err = sync_filesystem(sb);
+ if (err < 0)
+ goto restore_opts;
err = dquot_suspend(sb, -1);
if (err < 0)
goto restore_opts;
@@ -5293,6 +5377,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
bh = ext4_bread(handle, inode, blk, 1, &err);
if (!bh)
goto out;
+ BUFFER_TRACE(bh, "get write access");
err = ext4_journal_get_write_access(handle, bh);
if (err) {
brelse(bh);
@@ -5468,11 +5553,9 @@ static int __init ext4_init_fs(void)
err = ext4_init_mballoc();
if (err)
- goto out3;
-
- err = ext4_init_xattr();
- if (err)
goto out2;
+ else
+ ext4_mballoc_ready = 1;
err = init_inodecache();
if (err)
goto out1;
@@ -5488,10 +5571,9 @@ out:
unregister_as_ext3();
destroy_inodecache();
out1:
- ext4_exit_xattr();
-out2:
+ ext4_mballoc_ready = 0;
ext4_exit_mballoc();
-out3:
+out2:
ext4_exit_feat_adverts();
out4:
if (ext4_proc_root)
@@ -5514,7 +5596,6 @@ static void __exit ext4_exit_fs(void)
unregister_as_ext3();
unregister_filesystem(&ext4_fs_type);
destroy_inodecache();
- ext4_exit_xattr();
ext4_exit_mballoc();
ext4_exit_feat_adverts();
remove_proc_entry("fs/ext4", NULL);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index c081e34f717..e7387337060 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -81,7 +81,7 @@
# define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
-static void ext4_xattr_cache_insert(struct buffer_head *);
+static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
static struct buffer_head *ext4_xattr_cache_find(struct inode *,
struct ext4_xattr_header *,
struct mb_cache_entry **);
@@ -90,13 +90,11 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *,
static int ext4_xattr_list(struct dentry *dentry, char *buffer,
size_t buffer_size);
-static struct mb_cache *ext4_xattr_cache;
-
static const struct xattr_handler *ext4_xattr_handler_map[] = {
[EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
#ifdef CONFIG_EXT4_FS_POSIX_ACL
- [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
- [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
+ [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
+ [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
#endif
[EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler,
#ifdef CONFIG_EXT4_FS_SECURITY
@@ -108,8 +106,8 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
&ext4_xattr_user_handler,
&ext4_xattr_trusted_handler,
#ifdef CONFIG_EXT4_FS_POSIX_ACL
- &ext4_xattr_acl_access_handler,
- &ext4_xattr_acl_default_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
#ifdef CONFIG_EXT4_FS_SECURITY
&ext4_xattr_security_handler,
@@ -117,6 +115,9 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
NULL
};
+#define EXT4_GET_MB_CACHE(inode) (((struct ext4_sb_info *) \
+ inode->i_sb->s_fs_info)->s_mb_cache)
+
static __le32 ext4_xattr_block_csum(struct inode *inode,
sector_t block_nr,
struct ext4_xattr_header *hdr)
@@ -265,6 +266,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
struct ext4_xattr_entry *entry;
size_t size;
int error;
+ struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
name_index, name, buffer, (long)buffer_size);
@@ -286,7 +288,7 @@ bad_block:
error = -EIO;
goto cleanup;
}
- ext4_xattr_cache_insert(bh);
+ ext4_xattr_cache_insert(ext4_mb_cache, bh);
entry = BFIRST(bh);
error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
if (error == -EIO)
@@ -367,6 +369,9 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name,
{
int error;
+ if (strlen(name) > 255)
+ return -ERANGE;
+
down_read(&EXT4_I(inode)->xattr_sem);
error = ext4_xattr_ibody_get(inode, name_index, name, buffer,
buffer_size);
@@ -409,6 +414,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
struct inode *inode = dentry->d_inode;
struct buffer_head *bh = NULL;
int error;
+ struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
ea_idebug(inode, "buffer=%p, buffer_size=%ld",
buffer, (long)buffer_size);
@@ -430,7 +436,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
error = -EIO;
goto cleanup;
}
- ext4_xattr_cache_insert(bh);
+ ext4_xattr_cache_insert(ext4_mb_cache, bh);
error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
cleanup:
@@ -510,6 +516,7 @@ static void ext4_xattr_update_super_block(handle_t *handle,
if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR))
return;
+ BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
ext4_handle_dirty_super(handle, sb);
@@ -517,8 +524,8 @@ static void ext4_xattr_update_super_block(handle_t *handle,
}
/*
- * Release the xattr block BH: If the reference count is > 1, decrement
- * it; otherwise free the block.
+ * Release the xattr block BH: If the reference count is > 1, decrement it;
+ * otherwise free the block.
*/
static void
ext4_xattr_release_block(handle_t *handle, struct inode *inode,
@@ -526,8 +533,10 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
{
struct mb_cache_entry *ce = NULL;
int error = 0;
+ struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
- ce = mb_cache_entry_get(ext4_xattr_cache, bh->b_bdev, bh->b_blocknr);
+ ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr);
+ BUFFER_TRACE(bh, "get_write_access");
error = ext4_journal_get_write_access(handle, bh);
if (error)
goto out;
@@ -538,16 +547,31 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
if (ce)
mb_cache_entry_free(ce);
get_bh(bh);
+ unlock_buffer(bh);
ext4_free_blocks(handle, inode, bh, 0, 1,
EXT4_FREE_BLOCKS_METADATA |
EXT4_FREE_BLOCKS_FORGET);
- unlock_buffer(bh);
} else {
le32_add_cpu(&BHDR(bh)->h_refcount, -1);
if (ce)
mb_cache_entry_release(ce);
+ /*
+ * Beware of this ugliness: Releasing of xattr block references
+ * from different inodes can race and so we have to protect
+ * from a race where someone else frees the block (and releases
+ * its journal_head) before we are done dirtying the buffer. In
+ * nojournal mode this race is harmless and we actually cannot
+ * call ext4_handle_dirty_xattr_block() with locked buffer as
+ * that function can call sync_dirty_buffer() so for that case
+ * we handle the dirtying after unlocking the buffer.
+ */
+ if (ext4_handle_valid(handle))
+ error = ext4_handle_dirty_xattr_block(handle, inode,
+ bh);
unlock_buffer(bh);
- error = ext4_handle_dirty_xattr_block(handle, inode, bh);
+ if (!ext4_handle_valid(handle))
+ error = ext4_handle_dirty_xattr_block(handle, inode,
+ bh);
if (IS_SYNC(inode))
ext4_handle_sync(handle);
dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1));
@@ -567,12 +591,13 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
size_t *min_offs, void *base, int *total)
{
for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
- *total += EXT4_XATTR_LEN(last->e_name_len);
if (!last->e_value_block && last->e_value_size) {
size_t offs = le16_to_cpu(last->e_value_offs);
if (offs < *min_offs)
*min_offs = offs;
}
+ if (total)
+ *total += EXT4_XATTR_LEN(last->e_name_len);
}
return (*min_offs - ((void *)last - base) - sizeof(__u32));
}
@@ -745,14 +770,16 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
struct ext4_xattr_search *s = &bs->s;
struct mb_cache_entry *ce = NULL;
int error = 0;
+ struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
#define header(x) ((struct ext4_xattr_header *)(x))
if (i->value && i->value_len > sb->s_blocksize)
return -ENOSPC;
if (s->base) {
- ce = mb_cache_entry_get(ext4_xattr_cache, bs->bh->b_bdev,
+ ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev,
bs->bh->b_blocknr);
+ BUFFER_TRACE(bs->bh, "get_write_access");
error = ext4_journal_get_write_access(handle, bs->bh);
if (error)
goto cleanup;
@@ -769,7 +796,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
if (!IS_LAST_ENTRY(s->first))
ext4_xattr_rehash(header(s->base),
s->here);
- ext4_xattr_cache_insert(bs->bh);
+ ext4_xattr_cache_insert(ext4_mb_cache,
+ bs->bh);
}
unlock_buffer(bs->bh);
if (error == -EIO)
@@ -837,6 +865,7 @@ inserted:
EXT4_C2B(EXT4_SB(sb), 1));
if (error)
goto cleanup;
+ BUFFER_TRACE(new_bh, "get_write_access");
error = ext4_journal_get_write_access(handle,
new_bh);
if (error)
@@ -874,7 +903,7 @@ inserted:
* take i_data_sem because we will test
* i_delalloc_reserved_flag in ext4_mb_new_blocks
*/
- down_read((&EXT4_I(inode)->i_data_sem));
+ down_read(&EXT4_I(inode)->i_data_sem);
block = ext4_new_meta_blocks(handle, inode, goal, 0,
NULL, &error);
up_read((&EXT4_I(inode)->i_data_sem));
@@ -905,7 +934,7 @@ getblk_failed:
memcpy(new_bh->b_data, s->base, new_bh->b_size);
set_buffer_uptodate(new_bh);
unlock_buffer(new_bh);
- ext4_xattr_cache_insert(new_bh);
+ ext4_xattr_cache_insert(ext4_mb_cache, new_bh);
error = ext4_handle_dirty_xattr_block(handle,
inode, new_bh);
if (error)
@@ -1228,7 +1257,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_xattr_block_find *bs = NULL;
char *buffer = NULL, *b_entry_name = NULL;
size_t min_offs, free;
- int total_ino, total_blk;
+ int total_ino;
void *base, *start, *end;
int extra_isize = 0, error = 0, tried_min_extra_isize = 0;
int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize);
@@ -1286,8 +1315,7 @@ retry:
first = BFIRST(bh);
end = bh->b_data + bh->b_size;
min_offs = end - base;
- free = ext4_xattr_free_space(first, &min_offs, base,
- &total_blk);
+ free = ext4_xattr_free_space(first, &min_offs, base, NULL);
if (free < new_extra_isize) {
if (!tried_min_extra_isize && s_min_extra_isize) {
tried_min_extra_isize++;
@@ -1350,6 +1378,9 @@ retry:
s_min_extra_isize) {
tried_min_extra_isize++;
new_extra_isize = s_min_extra_isize;
+ kfree(is); is = NULL;
+ kfree(bs); bs = NULL;
+ brelse(bh);
goto retry;
}
error = -1;
@@ -1492,13 +1523,13 @@ ext4_xattr_put_super(struct super_block *sb)
* Returns 0, or a negative error number on failure.
*/
static void
-ext4_xattr_cache_insert(struct buffer_head *bh)
+ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
{
__u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
struct mb_cache_entry *ce;
int error;
- ce = mb_cache_entry_alloc(ext4_xattr_cache, GFP_NOFS);
+ ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS);
if (!ce) {
ea_bdebug(bh, "out of memory");
return;
@@ -1570,12 +1601,13 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
{
__u32 hash = le32_to_cpu(header->h_hash);
struct mb_cache_entry *ce;
+ struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
if (!header->h_hash)
return NULL; /* never share */
ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
again:
- ce = mb_cache_entry_find_first(ext4_xattr_cache, inode->i_sb->s_bdev,
+ ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev,
hash);
while (ce) {
struct buffer_head *bh;
@@ -1673,19 +1705,17 @@ static void ext4_xattr_rehash(struct ext4_xattr_header *header,
#undef BLOCK_HASH_SHIFT
-int __init
-ext4_init_xattr(void)
+#define HASH_BUCKET_BITS 10
+
+struct mb_cache *
+ext4_xattr_create_cache(char *name)
{
- ext4_xattr_cache = mb_cache_create("ext4_xattr", 6);
- if (!ext4_xattr_cache)
- return -ENOMEM;
- return 0;
+ return mb_cache_create(name, HASH_BUCKET_BITS);
}
-void
-ext4_exit_xattr(void)
+void ext4_xattr_destroy_cache(struct mb_cache *cache)
{
- if (ext4_xattr_cache)
- mb_cache_destroy(ext4_xattr_cache);
- ext4_xattr_cache = NULL;
+ if (cache)
+ mb_cache_destroy(cache);
}
+
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index c767dbdd7fc..29bedf5589f 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -96,8 +96,6 @@ struct ext4_xattr_ibody_find {
extern const struct xattr_handler ext4_xattr_user_handler;
extern const struct xattr_handler ext4_xattr_trusted_handler;
-extern const struct xattr_handler ext4_xattr_acl_access_handler;
-extern const struct xattr_handler ext4_xattr_acl_default_handler;
extern const struct xattr_handler ext4_xattr_security_handler;
extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
@@ -112,9 +110,6 @@ extern void ext4_xattr_put_super(struct super_block *);
extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_inode *raw_inode, handle_t *handle);
-extern int __init ext4_init_xattr(void);
-extern void ext4_exit_xattr(void);
-
extern const struct xattr_handler *ext4_xattr_handlers[];
extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
@@ -126,6 +121,9 @@ extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
struct ext4_xattr_info *i,
struct ext4_xattr_ibody_find *is);
+extern struct mb_cache *ext4_xattr_create_cache(char *name);
+extern void ext4_xattr_destroy_cache(struct mb_cache *);
+
#ifdef CONFIG_EXT4_FS_SECURITY
extern int ext4_init_security(handle_t *handle, struct inode *inode,
struct inode *dir, const struct qstr *qstr);
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index e06e0995e00..214fe1054fc 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -63,3 +63,11 @@ config F2FS_FS_SECURITY
the extended attribute support in advance.
If you are not using a security module, say N.
+
+config F2FS_CHECK_FS
+ bool "F2FS consistency checking feature"
+ depends on F2FS_FS
+ help
+ Enables BUG_ONs which check the file system consistency in runtime.
+
+ If you want to improve the performance, say N.
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 27a0820340b..2e35da12d29 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -1,6 +1,6 @@
obj-$(CONFIG_F2FS_FS) += f2fs.o
-f2fs-y := dir.o file.o inode.o namei.o hash.o super.o
+f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o
f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o
f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index b7826ec1b47..dbe2141d10a 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -17,9 +17,6 @@
#include "xattr.h"
#include "acl.h"
-#define get_inode_mode(i) ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
- (F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
-
static inline size_t f2fs_acl_size(int count)
{
if (count <= 4) {
@@ -167,25 +164,17 @@ fail:
struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT;
void *value = NULL;
struct posix_acl *acl;
int retval;
- if (!test_opt(sbi, POSIX_ACL))
- return NULL;
-
- acl = get_cached_acl(inode, type);
- if (acl != ACL_NOT_CACHED)
- return acl;
-
if (type == ACL_TYPE_ACCESS)
name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
retval = f2fs_getxattr(inode, name_index, "", NULL, 0);
if (retval > 0) {
- value = kmalloc(retval, GFP_KERNEL);
+ value = kmalloc(retval, GFP_F2FS_ZERO);
if (!value)
return ERR_PTR(-ENOMEM);
retval = f2fs_getxattr(inode, name_index, "", value, retval);
@@ -205,19 +194,20 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type)
return acl;
}
-static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+static int __f2fs_set_acl(struct inode *inode, int type,
+ struct posix_acl *acl, struct page *ipage)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct f2fs_inode_info *fi = F2FS_I(inode);
int name_index;
void *value = NULL;
size_t size = 0;
int error;
- if (!test_opt(sbi, POSIX_ACL))
- return 0;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
+ if (acl) {
+ error = posix_acl_valid(acl);
+ if (error < 0)
+ return error;
+ }
switch (type) {
case ACL_TYPE_ACCESS:
@@ -250,7 +240,7 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
}
}
- error = f2fs_setxattr(inode, name_index, "", value, size, NULL);
+ error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0);
kfree(value);
if (!error)
@@ -260,153 +250,31 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
return error;
}
-int f2fs_init_acl(struct inode *inode, struct inode *dir)
+int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- struct posix_acl *acl = NULL;
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
- int error = 0;
-
- if (!S_ISLNK(inode->i_mode)) {
- if (test_opt(sbi, POSIX_ACL)) {
- acl = f2fs_get_acl(dir, ACL_TYPE_DEFAULT);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- }
- if (!acl)
- inode->i_mode &= ~current_umask();
- }
-
- if (test_opt(sbi, POSIX_ACL) && acl) {
-
- if (S_ISDIR(inode->i_mode)) {
- error = f2fs_set_acl(inode, ACL_TYPE_DEFAULT, acl);
- if (error)
- goto cleanup;
- }
- error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
- if (error < 0)
- return error;
- if (error > 0)
- error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl);
- }
-cleanup:
- posix_acl_release(acl);
- return error;
+ return __f2fs_set_acl(inode, type, acl, NULL);
}
-int f2fs_acl_chmod(struct inode *inode)
+int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- struct posix_acl *acl;
- int error;
- umode_t mode = get_inode_mode(inode);
-
- if (!test_opt(sbi, POSIX_ACL))
- return 0;
- if (S_ISLNK(mode))
- return -EOPNOTSUPP;
-
- acl = f2fs_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
+ struct posix_acl *default_acl, *acl;
+ int error = 0;
- error = posix_acl_chmod(&acl, GFP_KERNEL, mode);
+ error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
if (error)
return error;
- error = f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl);
- posix_acl_release(acl);
- return error;
-}
-
-static size_t f2fs_xattr_list_acl(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
- const char *xname = POSIX_ACL_XATTR_DEFAULT;
- size_t size;
- if (!test_opt(sbi, POSIX_ACL))
- return 0;
-
- if (type == ACL_TYPE_ACCESS)
- xname = POSIX_ACL_XATTR_ACCESS;
-
- size = strlen(xname) + 1;
- if (list && size <= list_size)
- memcpy(list, xname, size);
- return size;
-}
-
-static int f2fs_xattr_get_acl(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
- struct posix_acl *acl;
- int error;
-
- if (strcmp(name, "") != 0)
- return -EINVAL;
- if (!test_opt(sbi, POSIX_ACL))
- return -EOPNOTSUPP;
-
- acl = f2fs_get_acl(dentry->d_inode, type);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (!acl)
- return -ENODATA;
- error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- posix_acl_release(acl);
-
- return error;
-}
-
-static int f2fs_xattr_set_acl(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
-{
- struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
- struct inode *inode = dentry->d_inode;
- struct posix_acl *acl = NULL;
- int error;
-
- if (strcmp(name, "") != 0)
- return -EINVAL;
- if (!test_opt(sbi, POSIX_ACL))
- return -EOPNOTSUPP;
- if (!inode_owner_or_capable(inode))
- return -EPERM;
-
- if (value) {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl) {
- error = posix_acl_valid(acl);
- if (error)
- goto release_and_out;
- }
- } else {
- acl = NULL;
+ if (default_acl) {
+ error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl,
+ ipage);
+ posix_acl_release(default_acl);
+ }
+ if (acl) {
+ if (error)
+ error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl,
+ ipage);
+ posix_acl_release(acl);
}
- error = f2fs_set_acl(inode, type, acl);
-
-release_and_out:
- posix_acl_release(acl);
return error;
}
-
-const struct xattr_handler f2fs_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .list = f2fs_xattr_list_acl,
- .get = f2fs_xattr_get_acl,
- .set = f2fs_xattr_set_acl,
-};
-
-const struct xattr_handler f2fs_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .list = f2fs_xattr_list_acl,
- .get = f2fs_xattr_get_acl,
- .set = f2fs_xattr_set_acl,
-};
diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h
index 80f43067441..e0864651cdc 100644
--- a/fs/f2fs/acl.h
+++ b/fs/f2fs/acl.h
@@ -36,20 +36,16 @@ struct f2fs_acl_header {
#ifdef CONFIG_F2FS_FS_POSIX_ACL
-extern struct posix_acl *f2fs_get_acl(struct inode *inode, int type);
-extern int f2fs_acl_chmod(struct inode *inode);
-extern int f2fs_init_acl(struct inode *inode, struct inode *dir);
+extern struct posix_acl *f2fs_get_acl(struct inode *, int);
+extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+extern int f2fs_init_acl(struct inode *, struct inode *, struct page *);
#else
#define f2fs_check_acl NULL
#define f2fs_get_acl NULL
#define f2fs_set_acl NULL
-static inline int f2fs_acl_chmod(struct inode *inode)
-{
- return 0;
-}
-
-static inline int f2fs_init_acl(struct inode *inode, struct inode *dir)
+static inline int f2fs_init_acl(struct inode *inode, struct inode *dir,
+ struct page *page)
{
return 0;
}
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index bb312201ca9..0b4710c1d37 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -30,7 +30,7 @@ static struct kmem_cache *inode_entry_slab;
*/
struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
{
- struct address_space *mapping = sbi->meta_inode->i_mapping;
+ struct address_space *mapping = META_MAPPING(sbi);
struct page *page = NULL;
repeat:
page = grab_cache_page(mapping, index);
@@ -38,9 +38,7 @@ repeat:
cond_resched();
goto repeat;
}
-
- /* We wait writeback only inside grab_meta_page() */
- wait_on_page_writeback(page);
+ f2fs_wait_on_page_writeback(page, META);
SetPageUptodate(page);
return page;
}
@@ -50,7 +48,7 @@ repeat:
*/
struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
{
- struct address_space *mapping = sbi->meta_inode->i_mapping;
+ struct address_space *mapping = META_MAPPING(sbi);
struct page *page;
repeat:
page = grab_cache_page(mapping, index);
@@ -61,67 +59,154 @@ repeat:
if (PageUptodate(page))
goto out;
- if (f2fs_readpage(sbi, page, index, READ_SYNC))
+ if (f2fs_submit_page_bio(sbi, page, index,
+ READ_SYNC | REQ_META | REQ_PRIO))
goto repeat;
lock_page(page);
- if (page->mapping != mapping) {
+ if (unlikely(page->mapping != mapping)) {
f2fs_put_page(page, 1);
goto repeat;
}
out:
- mark_page_accessed(page);
return page;
}
+static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
+{
+ switch (type) {
+ case META_NAT:
+ return NM_I(sbi)->max_nid / NAT_ENTRY_PER_BLOCK;
+ case META_SIT:
+ return SIT_BLK_CNT(sbi);
+ case META_SSA:
+ case META_CP:
+ return 0;
+ default:
+ BUG();
+ }
+}
+
+/*
+ * Readahead CP/NAT/SIT/SSA pages
+ */
+int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
+{
+ block_t prev_blk_addr = 0;
+ struct page *page;
+ int blkno = start;
+ int max_blks = get_max_meta_blks(sbi, type);
+
+ struct f2fs_io_info fio = {
+ .type = META,
+ .rw = READ_SYNC | REQ_META | REQ_PRIO
+ };
+
+ for (; nrpages-- > 0; blkno++) {
+ block_t blk_addr;
+
+ switch (type) {
+ case META_NAT:
+ /* get nat block addr */
+ if (unlikely(blkno >= max_blks))
+ blkno = 0;
+ blk_addr = current_nat_addr(sbi,
+ blkno * NAT_ENTRY_PER_BLOCK);
+ break;
+ case META_SIT:
+ /* get sit block addr */
+ if (unlikely(blkno >= max_blks))
+ goto out;
+ blk_addr = current_sit_addr(sbi,
+ blkno * SIT_ENTRY_PER_BLOCK);
+ if (blkno != start && prev_blk_addr + 1 != blk_addr)
+ goto out;
+ prev_blk_addr = blk_addr;
+ break;
+ case META_SSA:
+ case META_CP:
+ /* get ssa/cp block addr */
+ blk_addr = blkno;
+ break;
+ default:
+ BUG();
+ }
+
+ page = grab_cache_page(META_MAPPING(sbi), blk_addr);
+ if (!page)
+ continue;
+ if (PageUptodate(page)) {
+ f2fs_put_page(page, 1);
+ continue;
+ }
+
+ f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
+ f2fs_put_page(page, 0);
+ }
+out:
+ f2fs_submit_merged_bio(sbi, META, READ);
+ return blkno - start;
+}
+
static int f2fs_write_meta_page(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- /* Should not write any meta pages, if any IO error was occurred */
- if (wbc->for_reclaim ||
- is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)) {
- dec_page_count(sbi, F2FS_DIRTY_META);
- wbc->pages_skipped++;
- set_page_dirty(page);
- return AOP_WRITEPAGE_ACTIVATE;
- }
+ trace_f2fs_writepage(page, META);
- wait_on_page_writeback(page);
+ if (unlikely(sbi->por_doing))
+ goto redirty_out;
+ if (wbc->for_reclaim)
+ goto redirty_out;
+ /* Should not write any meta pages, if any IO error was occurred */
+ if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
+ goto no_write;
+
+ f2fs_wait_on_page_writeback(page, META);
write_meta_page(sbi, page);
+no_write:
dec_page_count(sbi, F2FS_DIRTY_META);
unlock_page(page);
return 0;
+
+redirty_out:
+ redirty_page_for_writepage(wbc, page);
+ return AOP_WRITEPAGE_ACTIVATE;
}
static int f2fs_write_meta_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
- struct block_device *bdev = sbi->sb->s_bdev;
- long written;
+ long diff, written;
- if (wbc->for_kupdate)
- return 0;
+ trace_f2fs_writepages(mapping->host, wbc, META);
- if (get_pages(sbi, F2FS_DIRTY_META) == 0)
- return 0;
+ /* collect a number of dirty meta pages and write together */
+ if (wbc->for_kupdate ||
+ get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META))
+ goto skip_write;
/* if mounting is failed, skip writing node pages */
mutex_lock(&sbi->cp_mutex);
- written = sync_meta_pages(sbi, META, bio_get_nr_vecs(bdev));
+ diff = nr_pages_to_write(sbi, META, wbc);
+ written = sync_meta_pages(sbi, META, wbc->nr_to_write);
mutex_unlock(&sbi->cp_mutex);
- wbc->nr_to_write -= written;
+ wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff);
+ return 0;
+
+skip_write:
+ wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META);
return 0;
}
long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
long nr_to_write)
{
- struct address_space *mapping = sbi->meta_inode->i_mapping;
+ struct address_space *mapping = META_MAPPING(sbi);
pgoff_t index = 0, end = LONG_MAX;
struct pagevec pvec;
long nwritten = 0;
@@ -136,20 +221,33 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
PAGECACHE_TAG_DIRTY,
min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
- if (nr_pages == 0)
+ if (unlikely(nr_pages == 0))
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
+
lock_page(page);
- BUG_ON(page->mapping != mapping);
- BUG_ON(!PageDirty(page));
- clear_page_dirty_for_io(page);
+
+ if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+ unlock_page(page);
+ continue;
+ }
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
+ }
+
+ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
+
if (f2fs_write_meta_page(page, &wbc)) {
unlock_page(page);
break;
}
- if (nwritten++ >= nr_to_write)
+ nwritten++;
+ if (unlikely(nwritten >= nr_to_write))
break;
}
pagevec_release(&pvec);
@@ -157,7 +255,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
}
if (nwritten)
- f2fs_submit_bio(sbi, type, nr_to_write == LONG_MAX);
+ f2fs_submit_merged_bio(sbi, type, WRITE);
return nwritten;
}
@@ -167,6 +265,8 @@ static int f2fs_set_meta_page_dirty(struct page *page)
struct address_space *mapping = page->mapping;
struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+ trace_f2fs_set_page_dirty(page, META);
+
SetPageUptodate(page);
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
@@ -184,62 +284,50 @@ const struct address_space_operations f2fs_meta_aops = {
int acquire_orphan_inode(struct f2fs_sb_info *sbi)
{
- unsigned int max_orphans;
int err = 0;
- /*
- * considering 512 blocks in a segment 5 blocks are needed for cp
- * and log segment summaries. Remaining blocks are used to keep
- * orphan entries with the limitation one reserved segment
- * for cp pack we can have max 1020*507 orphan entries
- */
- max_orphans = (sbi->blocks_per_seg - 5) * F2FS_ORPHANS_PER_BLOCK;
- mutex_lock(&sbi->orphan_inode_mutex);
- if (sbi->n_orphans >= max_orphans)
+ spin_lock(&sbi->orphan_inode_lock);
+ if (unlikely(sbi->n_orphans >= sbi->max_orphans))
err = -ENOSPC;
else
sbi->n_orphans++;
- mutex_unlock(&sbi->orphan_inode_mutex);
+ spin_unlock(&sbi->orphan_inode_lock);
+
return err;
}
void release_orphan_inode(struct f2fs_sb_info *sbi)
{
- mutex_lock(&sbi->orphan_inode_mutex);
+ spin_lock(&sbi->orphan_inode_lock);
+ f2fs_bug_on(sbi->n_orphans == 0);
sbi->n_orphans--;
- mutex_unlock(&sbi->orphan_inode_mutex);
+ spin_unlock(&sbi->orphan_inode_lock);
}
void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
- struct list_head *head, *this;
- struct orphan_inode_entry *new = NULL, *orphan = NULL;
+ struct list_head *head;
+ struct orphan_inode_entry *new, *orphan;
- mutex_lock(&sbi->orphan_inode_mutex);
+ new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
+ new->ino = ino;
+
+ spin_lock(&sbi->orphan_inode_lock);
head = &sbi->orphan_inode_list;
- list_for_each(this, head) {
- orphan = list_entry(this, struct orphan_inode_entry, list);
- if (orphan->ino == ino)
- goto out;
+ list_for_each_entry(orphan, head, list) {
+ if (orphan->ino == ino) {
+ spin_unlock(&sbi->orphan_inode_lock);
+ kmem_cache_free(orphan_entry_slab, new);
+ return;
+ }
+
if (orphan->ino > ino)
break;
- orphan = NULL;
- }
-retry:
- new = kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
- if (!new) {
- cond_resched();
- goto retry;
}
- new->ino = ino;
- /* add new_oentry into list which is sorted by inode number */
- if (orphan)
- list_add(&new->list, this->prev);
- else
- list_add_tail(&new->list, head);
-out:
- mutex_unlock(&sbi->orphan_inode_mutex);
+ /* add new orphan entry into list which is sorted by inode number */
+ list_add_tail(&new->list, &orphan->list);
+ spin_unlock(&sbi->orphan_inode_lock);
}
void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
@@ -247,40 +335,46 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
struct list_head *head;
struct orphan_inode_entry *orphan;
- mutex_lock(&sbi->orphan_inode_mutex);
+ spin_lock(&sbi->orphan_inode_lock);
head = &sbi->orphan_inode_list;
list_for_each_entry(orphan, head, list) {
if (orphan->ino == ino) {
list_del(&orphan->list);
- kmem_cache_free(orphan_entry_slab, orphan);
+ f2fs_bug_on(sbi->n_orphans == 0);
sbi->n_orphans--;
- break;
+ spin_unlock(&sbi->orphan_inode_lock);
+ kmem_cache_free(orphan_entry_slab, orphan);
+ return;
}
}
- mutex_unlock(&sbi->orphan_inode_mutex);
+ spin_unlock(&sbi->orphan_inode_lock);
}
static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
{
struct inode *inode = f2fs_iget(sbi->sb, ino);
- BUG_ON(IS_ERR(inode));
+ f2fs_bug_on(IS_ERR(inode));
clear_nlink(inode);
/* truncate all the data during iput */
iput(inode);
}
-int recover_orphan_inodes(struct f2fs_sb_info *sbi)
+void recover_orphan_inodes(struct f2fs_sb_info *sbi)
{
block_t start_blk, orphan_blkaddr, i, j;
if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
- return 0;
+ return;
- sbi->por_doing = 1;
- start_blk = __start_cp_addr(sbi) + 1;
+ sbi->por_doing = true;
+
+ start_blk = __start_cp_addr(sbi) + 1 +
+ le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
orphan_blkaddr = __start_sum_addr(sbi) - 1;
+ ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP);
+
for (i = 0; i < orphan_blkaddr; i++) {
struct page *page = get_meta_page(sbi, start_blk + i);
struct f2fs_orphan_block *orphan_blk;
@@ -294,30 +388,40 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
}
/* clear Orphan Flag */
clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
- sbi->por_doing = 0;
- return 0;
+ sbi->por_doing = false;
+ return;
}
static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
{
- struct list_head *head, *this, *next;
+ struct list_head *head;
struct f2fs_orphan_block *orphan_blk = NULL;
- struct page *page = NULL;
unsigned int nentries = 0;
- unsigned short index = 1;
- unsigned short orphan_blocks;
-
- orphan_blocks = (unsigned short)((sbi->n_orphans +
+ unsigned short index;
+ unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans +
(F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK);
+ struct page *page = NULL;
+ struct orphan_inode_entry *orphan = NULL;
- mutex_lock(&sbi->orphan_inode_mutex);
+ for (index = 0; index < orphan_blocks; index++)
+ grab_meta_page(sbi, start_blk + index);
+
+ index = 1;
+ spin_lock(&sbi->orphan_inode_lock);
head = &sbi->orphan_inode_list;
/* loop for each orphan inode entry and write them in Jornal block */
- list_for_each_safe(this, next, head) {
- struct orphan_inode_entry *orphan;
+ list_for_each_entry(orphan, head, list) {
+ if (!page) {
+ page = find_get_page(META_MAPPING(sbi), start_blk++);
+ f2fs_bug_on(!page);
+ orphan_blk =
+ (struct f2fs_orphan_block *)page_address(page);
+ memset(orphan_blk, 0, sizeof(*orphan_blk));
+ f2fs_put_page(page, 0);
+ }
- orphan = list_entry(this, struct orphan_inode_entry, list);
+ orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
if (nentries == F2FS_ORPHANS_PER_BLOCK) {
/*
@@ -331,29 +435,20 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
set_page_dirty(page);
f2fs_put_page(page, 1);
index++;
- start_blk++;
nentries = 0;
page = NULL;
}
- if (page)
- goto page_exist;
+ }
- page = grab_meta_page(sbi, start_blk);
- orphan_blk = (struct f2fs_orphan_block *)page_address(page);
- memset(orphan_blk, 0, sizeof(*orphan_blk));
-page_exist:
- orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino);
+ if (page) {
+ orphan_blk->blk_addr = cpu_to_le16(index);
+ orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
+ orphan_blk->entry_count = cpu_to_le32(nentries);
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
}
- if (!page)
- goto end;
-
- orphan_blk->blk_addr = cpu_to_le16(index);
- orphan_blk->blk_count = cpu_to_le16(orphan_blocks);
- orphan_blk->entry_count = cpu_to_le32(nentries);
- set_page_dirty(page);
- f2fs_put_page(page, 1);
-end:
- mutex_unlock(&sbi->orphan_inode_mutex);
+
+ spin_unlock(&sbi->orphan_inode_lock);
}
static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
@@ -416,8 +511,11 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
unsigned long blk_size = sbi->blocksize;
unsigned long long cp1_version = 0, cp2_version = 0;
unsigned long long cp_start_blk_no;
+ unsigned int cp_blks = 1 + le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
+ block_t cp_blk_no;
+ int i;
- sbi->ckpt = kzalloc(blk_size, GFP_KERNEL);
+ sbi->ckpt = kzalloc(cp_blks * blk_size, GFP_KERNEL);
if (!sbi->ckpt)
return -ENOMEM;
/*
@@ -428,7 +526,8 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);
/* The second checkpoint pack should start at the next segment */
- cp_start_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg);
+ cp_start_blk_no += ((unsigned long long)1) <<
+ le32_to_cpu(fsb->log_blocks_per_seg);
cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version);
if (cp1 && cp2) {
@@ -447,6 +546,23 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi)
cp_block = (struct f2fs_checkpoint *)page_address(cur_page);
memcpy(sbi->ckpt, cp_block, blk_size);
+ if (cp_blks <= 1)
+ goto done;
+
+ cp_blk_no = le32_to_cpu(fsb->cp_blkaddr);
+ if (cur_page == cp2)
+ cp_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg);
+
+ for (i = 1; i < cp_blks; i++) {
+ void *sit_bitmap_ptr;
+ unsigned char *ckpt = (unsigned char *)sbi->ckpt;
+
+ cur_page = get_meta_page(sbi, cp_blk_no + i);
+ sit_bitmap_ptr = page_address(cur_page);
+ memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size);
+ f2fs_put_page(cur_page, 1);
+ }
+done:
f2fs_put_page(cp1, 1);
f2fs_put_page(cp2, 1);
return 0;
@@ -459,19 +575,14 @@ fail_no_cp:
static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- struct list_head *head = &sbi->dir_inode_list;
- struct list_head *this;
-
- list_for_each(this, head) {
- struct dir_inode_entry *entry;
- entry = list_entry(this, struct dir_inode_entry, list);
- if (entry->inode == inode)
- return -EEXIST;
- }
- list_add_tail(&new->list, head);
-#ifdef CONFIG_F2FS_STAT_FS
- sbi->n_dirty_dirs++;
-#endif
+
+ if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR))
+ return -EEXIST;
+
+ set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR);
+ F2FS_I(inode)->dirty_dir = new;
+ list_add_tail(&new->list, &sbi->dir_inode_list);
+ stat_inc_dirty_dir(sbi);
return 0;
}
@@ -479,75 +590,65 @@ void set_dirty_dir_page(struct inode *inode, struct page *page)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct dir_inode_entry *new;
+ int ret = 0;
if (!S_ISDIR(inode->i_mode))
return;
-retry:
- new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
- if (!new) {
- cond_resched();
- goto retry;
- }
+
+ new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
new->inode = inode;
INIT_LIST_HEAD(&new->list);
spin_lock(&sbi->dir_inode_lock);
- if (__add_dirty_inode(inode, new))
- kmem_cache_free(inode_entry_slab, new);
-
- inc_page_count(sbi, F2FS_DIRTY_DENTS);
+ ret = __add_dirty_inode(inode, new);
inode_inc_dirty_dents(inode);
SetPagePrivate(page);
spin_unlock(&sbi->dir_inode_lock);
+
+ if (ret)
+ kmem_cache_free(inode_entry_slab, new);
}
void add_dirty_dir_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- struct dir_inode_entry *new;
-retry:
- new = kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
- if (!new) {
- cond_resched();
- goto retry;
- }
+ struct dir_inode_entry *new =
+ f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
+ int ret = 0;
+
new->inode = inode;
INIT_LIST_HEAD(&new->list);
spin_lock(&sbi->dir_inode_lock);
- if (__add_dirty_inode(inode, new))
- kmem_cache_free(inode_entry_slab, new);
+ ret = __add_dirty_inode(inode, new);
spin_unlock(&sbi->dir_inode_lock);
+
+ if (ret)
+ kmem_cache_free(inode_entry_slab, new);
}
void remove_dirty_dir_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- struct list_head *head = &sbi->dir_inode_list;
- struct list_head *this;
+ struct dir_inode_entry *entry;
if (!S_ISDIR(inode->i_mode))
return;
spin_lock(&sbi->dir_inode_lock);
- if (atomic_read(&F2FS_I(inode)->dirty_dents)) {
+ if (get_dirty_dents(inode) ||
+ !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) {
spin_unlock(&sbi->dir_inode_lock);
return;
}
- list_for_each(this, head) {
- struct dir_inode_entry *entry;
- entry = list_entry(this, struct dir_inode_entry, list);
- if (entry->inode == inode) {
- list_del(&entry->list);
- kmem_cache_free(inode_entry_slab, entry);
-#ifdef CONFIG_F2FS_STAT_FS
- sbi->n_dirty_dirs--;
-#endif
- break;
- }
- }
+ entry = F2FS_I(inode)->dirty_dir;
+ list_del(&entry->list);
+ F2FS_I(inode)->dirty_dir = NULL;
+ clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR);
+ stat_dec_dirty_dir(sbi);
spin_unlock(&sbi->dir_inode_lock);
+ kmem_cache_free(inode_entry_slab, entry);
/* Only from the recovery routine */
if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
@@ -556,32 +657,15 @@ void remove_dirty_dir_inode(struct inode *inode)
}
}
-struct inode *check_dirty_dir_inode(struct f2fs_sb_info *sbi, nid_t ino)
-{
- struct list_head *head = &sbi->dir_inode_list;
- struct list_head *this;
- struct inode *inode = NULL;
-
- spin_lock(&sbi->dir_inode_lock);
- list_for_each(this, head) {
- struct dir_inode_entry *entry;
- entry = list_entry(this, struct dir_inode_entry, list);
- if (entry->inode->i_ino == ino) {
- inode = entry->inode;
- break;
- }
- }
- spin_unlock(&sbi->dir_inode_lock);
- return inode;
-}
-
void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
{
- struct list_head *head = &sbi->dir_inode_list;
+ struct list_head *head;
struct dir_inode_entry *entry;
struct inode *inode;
retry:
spin_lock(&sbi->dir_inode_lock);
+
+ head = &sbi->dir_inode_list;
if (list_empty(head)) {
spin_unlock(&sbi->dir_inode_lock);
return;
@@ -590,14 +674,14 @@ retry:
inode = igrab(entry->inode);
spin_unlock(&sbi->dir_inode_lock);
if (inode) {
- filemap_flush(inode->i_mapping);
+ filemap_fdatawrite(inode->i_mapping);
iput(inode);
} else {
/*
* We should submit bio, since it exists several
* wribacking dentry pages in the freeing inode.
*/
- f2fs_submit_bio(sbi, DATA, true);
+ f2fs_submit_merged_bio(sbi, DATA, WRITE);
}
goto retry;
}
@@ -617,11 +701,10 @@ static void block_operations(struct f2fs_sb_info *sbi)
blk_start_plug(&plug);
retry_flush_dents:
- mutex_lock_all(sbi);
-
+ f2fs_lock_all(sbi);
/* write all the dirty dentry pages */
if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
- mutex_unlock_all(sbi);
+ f2fs_unlock_all(sbi);
sync_dirty_dir_inodes(sbi);
goto retry_flush_dents;
}
@@ -644,7 +727,22 @@ retry_flush_nodes:
static void unblock_operations(struct f2fs_sb_info *sbi)
{
mutex_unlock(&sbi->node_write);
- mutex_unlock_all(sbi);
+ f2fs_unlock_all(sbi);
+}
+
+static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
+{
+ DEFINE_WAIT(wait);
+
+ for (;;) {
+ prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
+
+ if (!get_pages(sbi, F2FS_WRITEBACK))
+ break;
+
+ io_schedule();
+ }
+ finish_wait(&sbi->cp_wait, &wait);
}
static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
@@ -657,6 +755,13 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
__u32 crc32 = 0;
void *kaddr;
int i;
+ int cp_payload_blks = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
+
+ /*
+ * This avoids to conduct wrong roll-forward operations and uses
+ * metapages, so should be called prior to sync_meta_pages below.
+ */
+ discard_next_dnode(sbi);
/* Flush all the NAT/SIT pages */
while (get_pages(sbi, F2FS_DIRTY_META))
@@ -701,16 +806,19 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1)
/ F2FS_ORPHANS_PER_BLOCK;
- ckpt->cp_pack_start_sum = cpu_to_le32(1 + orphan_blocks);
+ ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
+ orphan_blocks);
if (is_umount) {
set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
- data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE);
+ cp_payload_blks + data_sum_blocks +
+ orphan_blocks + NR_CURSEG_NODE_TYPE);
} else {
clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
ckpt->cp_pack_total_block_count = cpu_to_le32(2 +
- data_sum_blocks + orphan_blocks);
+ cp_payload_blks + data_sum_blocks +
+ orphan_blocks);
}
if (sbi->n_orphans)
@@ -736,6 +844,15 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
set_page_dirty(cp_page);
f2fs_put_page(cp_page, 1);
+ for (i = 1; i < 1 + cp_payload_blks; i++) {
+ cp_page = grab_meta_page(sbi, start_blk++);
+ kaddr = page_address(cp_page);
+ memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE,
+ (1 << sbi->log_blocksize));
+ set_page_dirty(cp_page);
+ f2fs_put_page(cp_page, 1);
+ }
+
if (sbi->n_orphans) {
write_orphan_inodes(sbi, start_blk);
start_blk += orphan_blocks;
@@ -756,11 +873,10 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
f2fs_put_page(cp_page, 1);
/* wait for previous submitted node/meta pages writeback */
- while (get_pages(sbi, F2FS_WRITEBACK))
- congestion_wait(BLK_RW_ASYNC, HZ / 50);
+ wait_on_all_pages_writeback(sbi);
- filemap_fdatawait_range(sbi->node_inode->i_mapping, 0, LONG_MAX);
- filemap_fdatawait_range(sbi->meta_inode->i_mapping, 0, LONG_MAX);
+ filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
+ filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);
/* update user_block_counts */
sbi->last_valid_block_count = sbi->total_valid_block_count;
@@ -769,7 +885,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
/* Here, we only have one bio having CP pack */
sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
- if (!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) {
+ if (unlikely(!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) {
clear_prefree_segments(sbi);
F2FS_RESET_SB_DIRT(sbi);
}
@@ -790,9 +906,9 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops");
- f2fs_submit_bio(sbi, DATA, true);
- f2fs_submit_bio(sbi, NODE, true);
- f2fs_submit_bio(sbi, META, true);
+ f2fs_submit_merged_bio(sbi, DATA, WRITE);
+ f2fs_submit_merged_bio(sbi, NODE, WRITE);
+ f2fs_submit_merged_bio(sbi, META, WRITE);
/*
* update checkpoint pack index
@@ -812,25 +928,34 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
unblock_operations(sbi);
mutex_unlock(&sbi->cp_mutex);
+ stat_inc_cp_count(sbi->stat_info);
trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint");
}
void init_orphan_info(struct f2fs_sb_info *sbi)
{
- mutex_init(&sbi->orphan_inode_mutex);
+ spin_lock_init(&sbi->orphan_inode_lock);
INIT_LIST_HEAD(&sbi->orphan_inode_list);
sbi->n_orphans = 0;
+ /*
+ * considering 512 blocks in a segment 8 blocks are needed for cp
+ * and log segment summaries. Remaining blocks are used to keep
+ * orphan entries with the limitation one reserved segment
+ * for cp pack we can have max 1020*504 orphan entries
+ */
+ sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE)
+ * F2FS_ORPHANS_PER_BLOCK;
}
int __init create_checkpoint_caches(void)
{
orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry",
- sizeof(struct orphan_inode_entry), NULL);
- if (unlikely(!orphan_entry_slab))
+ sizeof(struct orphan_inode_entry));
+ if (!orphan_entry_slab)
return -ENOMEM;
inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
- sizeof(struct dir_inode_entry), NULL);
- if (unlikely(!inode_entry_slab)) {
+ sizeof(struct dir_inode_entry));
+ if (!inode_entry_slab) {
kmem_cache_destroy(orphan_entry_slab);
return -ENOMEM;
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 941f9b9ca3a..f8cf619edb5 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -24,6 +24,190 @@
#include "segment.h"
#include <trace/events/f2fs.h>
+static void f2fs_read_end_io(struct bio *bio, int err)
+{
+ struct bio_vec *bvec;
+ int i;
+
+ bio_for_each_segment_all(bvec, bio, i) {
+ struct page *page = bvec->bv_page;
+
+ if (!err) {
+ SetPageUptodate(page);
+ } else {
+ ClearPageUptodate(page);
+ SetPageError(page);
+ }
+ unlock_page(page);
+ }
+ bio_put(bio);
+}
+
+static void f2fs_write_end_io(struct bio *bio, int err)
+{
+ struct f2fs_sb_info *sbi = bio->bi_private;
+ struct bio_vec *bvec;
+ int i;
+
+ bio_for_each_segment_all(bvec, bio, i) {
+ struct page *page = bvec->bv_page;
+
+ if (unlikely(err)) {
+ SetPageError(page);
+ set_bit(AS_EIO, &page->mapping->flags);
+ f2fs_stop_checkpoint(sbi);
+ }
+ end_page_writeback(page);
+ dec_page_count(sbi, F2FS_WRITEBACK);
+ }
+
+ if (sbi->wait_io) {
+ complete(sbi->wait_io);
+ sbi->wait_io = NULL;
+ }
+
+ if (!get_pages(sbi, F2FS_WRITEBACK) &&
+ !list_empty(&sbi->cp_wait.task_list))
+ wake_up(&sbi->cp_wait);
+
+ bio_put(bio);
+}
+
+/*
+ * Low-level block read/write IO operations.
+ */
+static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
+ int npages, bool is_read)
+{
+ struct bio *bio;
+
+ /* No failure on bio allocation */
+ bio = bio_alloc(GFP_NOIO, npages);
+
+ bio->bi_bdev = sbi->sb->s_bdev;
+ bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
+ bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
+ bio->bi_private = sbi;
+
+ return bio;
+}
+
+static void __submit_merged_bio(struct f2fs_bio_info *io)
+{
+ struct f2fs_io_info *fio = &io->fio;
+ int rw;
+
+ if (!io->bio)
+ return;
+
+ rw = fio->rw;
+
+ if (is_read_io(rw)) {
+ trace_f2fs_submit_read_bio(io->sbi->sb, rw,
+ fio->type, io->bio);
+ submit_bio(rw, io->bio);
+ } else {
+ trace_f2fs_submit_write_bio(io->sbi->sb, rw,
+ fio->type, io->bio);
+ /*
+ * META_FLUSH is only from the checkpoint procedure, and we
+ * should wait this metadata bio for FS consistency.
+ */
+ if (fio->type == META_FLUSH) {
+ DECLARE_COMPLETION_ONSTACK(wait);
+ io->sbi->wait_io = &wait;
+ submit_bio(rw, io->bio);
+ wait_for_completion(&wait);
+ } else {
+ submit_bio(rw, io->bio);
+ }
+ }
+
+ io->bio = NULL;
+}
+
+void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
+ enum page_type type, int rw)
+{
+ enum page_type btype = PAGE_TYPE_OF_BIO(type);
+ struct f2fs_bio_info *io;
+
+ io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype];
+
+ down_write(&io->io_rwsem);
+
+ /* change META to META_FLUSH in the checkpoint procedure */
+ if (type >= META_FLUSH) {
+ io->fio.type = META_FLUSH;
+ io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO;
+ }
+ __submit_merged_bio(io);
+ up_write(&io->io_rwsem);
+}
+
+/*
+ * Fill the locked page with data located in the block address.
+ * Return unlocked page.
+ */
+int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
+ block_t blk_addr, int rw)
+{
+ struct bio *bio;
+
+ trace_f2fs_submit_page_bio(page, blk_addr, rw);
+
+ /* Allocate a new bio */
+ bio = __bio_alloc(sbi, blk_addr, 1, is_read_io(rw));
+
+ if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
+ bio_put(bio);
+ f2fs_put_page(page, 1);
+ return -EFAULT;
+ }
+
+ submit_bio(rw, bio);
+ return 0;
+}
+
+void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
+ block_t blk_addr, struct f2fs_io_info *fio)
+{
+ enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
+ struct f2fs_bio_info *io;
+ bool is_read = is_read_io(fio->rw);
+
+ io = is_read ? &sbi->read_io : &sbi->write_io[btype];
+
+ verify_block_addr(sbi, blk_addr);
+
+ down_write(&io->io_rwsem);
+
+ if (!is_read)
+ inc_page_count(sbi, F2FS_WRITEBACK);
+
+ if (io->bio && (io->last_block_in_bio != blk_addr - 1 ||
+ io->fio.rw != fio->rw))
+ __submit_merged_bio(io);
+alloc_new:
+ if (io->bio == NULL) {
+ int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+
+ io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read);
+ io->fio = *fio;
+ }
+
+ if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) <
+ PAGE_CACHE_SIZE) {
+ __submit_merged_bio(io);
+ goto alloc_new;
+ }
+
+ io->last_block_in_bio = blk_addr;
+
+ up_write(&io->io_rwsem);
+ trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr);
+}
+
/*
* Lock ordering for the change of data block address:
* ->data_page
@@ -37,7 +221,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
struct page *node_page = dn->node_page;
unsigned int ofs_in_node = dn->ofs_in_node;
- f2fs_wait_on_page_writeback(node_page, NODE, false);
+ f2fs_wait_on_page_writeback(node_page, NODE);
rn = F2FS_NODE(node_page);
@@ -51,38 +235,57 @@ int reserve_new_block(struct dnode_of_data *dn)
{
struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
- if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))
+ if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
return -EPERM;
- if (!inc_valid_block_count(sbi, dn->inode, 1))
+ if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
return -ENOSPC;
trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
__set_data_blkaddr(dn, NEW_ADDR);
dn->data_blkaddr = NEW_ADDR;
+ mark_inode_dirty(dn->inode);
sync_inode_page(dn);
return 0;
}
+int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
+{
+ bool need_put = dn->inode_page ? false : true;
+ int err;
+
+ /* if inode_page exists, index should be zero */
+ f2fs_bug_on(!need_put && index);
+
+ err = get_dnode_of_data(dn, index, ALLOC_NODE);
+ if (err)
+ return err;
+
+ if (dn->data_blkaddr == NULL_ADDR)
+ err = reserve_new_block(dn);
+ if (err || need_put)
+ f2fs_put_dnode(dn);
+ return err;
+}
+
static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
struct buffer_head *bh_result)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
-#ifdef CONFIG_F2FS_STAT_FS
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-#endif
pgoff_t start_fofs, end_fofs;
block_t start_blkaddr;
+ if (is_inode_flag_set(fi, FI_NO_EXTENT))
+ return 0;
+
read_lock(&fi->ext.ext_lock);
if (fi->ext.len == 0) {
read_unlock(&fi->ext.ext_lock);
return 0;
}
-#ifdef CONFIG_F2FS_STAT_FS
- sbi->total_hit_ext++;
-#endif
+ stat_inc_total_hit(inode->i_sb);
+
start_fofs = fi->ext.fofs;
end_fofs = fi->ext.fofs + fi->ext.len - 1;
start_blkaddr = fi->ext.blk_addr;
@@ -100,9 +303,7 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
else
bh_result->b_size = UINT_MAX;
-#ifdef CONFIG_F2FS_STAT_FS
- sbi->read_hit_ext++;
-#endif
+ stat_inc_read_hit(inode->i_sb);
read_unlock(&fi->ext.ext_lock);
return 1;
}
@@ -115,14 +316,18 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
struct f2fs_inode_info *fi = F2FS_I(dn->inode);
pgoff_t fofs, start_fofs, end_fofs;
block_t start_blkaddr, end_blkaddr;
+ int need_update = true;
- BUG_ON(blk_addr == NEW_ADDR);
+ f2fs_bug_on(blk_addr == NEW_ADDR);
fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
dn->ofs_in_node;
/* Update the page address in the parent node */
__set_data_blkaddr(dn, blk_addr);
+ if (is_inode_flag_set(fi, FI_NO_EXTENT))
+ return;
+
write_lock(&fi->ext.ext_lock);
start_fofs = fi->ext.fofs;
@@ -169,14 +374,21 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
fofs - start_fofs + 1;
fi->ext.len -= fofs - start_fofs + 1;
}
- goto end_update;
+ } else {
+ need_update = false;
}
- write_unlock(&fi->ext.ext_lock);
- return;
+ /* Finally, if the extent is very fragmented, let's drop the cache. */
+ if (fi->ext.len < F2FS_MIN_EXTENT_LEN) {
+ fi->ext.len = 0;
+ set_inode_flag(fi, FI_NO_EXTENT);
+ need_update = true;
+ }
end_update:
write_unlock(&fi->ext.ext_lock);
- sync_inode_page(dn);
+ if (need_update)
+ sync_inode_page(dn);
+ return;
}
struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
@@ -202,10 +414,10 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
return ERR_PTR(-ENOENT);
/* By fallocate(), there is no cached page, but with NEW_ADDR */
- if (dn.data_blkaddr == NEW_ADDR)
+ if (unlikely(dn.data_blkaddr == NEW_ADDR))
return ERR_PTR(-EINVAL);
- page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
+ page = grab_cache_page(mapping, index);
if (!page)
return ERR_PTR(-ENOMEM);
@@ -214,11 +426,14 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
return page;
}
- err = f2fs_readpage(sbi, page, dn.data_blkaddr,
+ err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
sync ? READ_SYNC : READA);
+ if (err)
+ return ERR_PTR(err);
+
if (sync) {
wait_on_page_locked(page);
- if (!PageUptodate(page)) {
+ if (unlikely(!PageUptodate(page))) {
f2fs_put_page(page, 0);
return ERR_PTR(-EIO);
}
@@ -240,7 +455,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
int err;
repeat:
- page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
+ page = grab_cache_page(mapping, index);
if (!page)
return ERR_PTR(-ENOMEM);
@@ -252,7 +467,7 @@ repeat:
}
f2fs_put_dnode(&dn);
- if (dn.data_blkaddr == NULL_ADDR) {
+ if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
f2fs_put_page(page, 1);
return ERR_PTR(-ENOENT);
}
@@ -272,16 +487,16 @@ repeat:
return page;
}
- err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
+ err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC);
if (err)
return ERR_PTR(err);
lock_page(page);
- if (!PageUptodate(page)) {
+ if (unlikely(!PageUptodate(page))) {
f2fs_put_page(page, 1);
return ERR_PTR(-EIO);
}
- if (page->mapping != mapping) {
+ if (unlikely(page->mapping != mapping)) {
f2fs_put_page(page, 1);
goto repeat;
}
@@ -292,12 +507,12 @@ repeat:
* Caller ensures that this data page is never allocated.
* A new zero-filled data page is allocated in the page cache.
*
- * Also, caller should grab and release a mutex by calling mutex_lock_op() and
- * mutex_unlock_op().
- * Note that, npage is set only by make_empty_dir.
+ * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and
+ * f2fs_unlock_op().
+ * Note that, ipage is set only by make_empty_dir.
*/
struct page *get_new_data_page(struct inode *inode,
- struct page *npage, pgoff_t index, bool new_i_size)
+ struct page *ipage, pgoff_t index, bool new_i_size)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct address_space *mapping = inode->i_mapping;
@@ -305,24 +520,16 @@ struct page *get_new_data_page(struct inode *inode,
struct dnode_of_data dn;
int err;
- set_new_dnode(&dn, inode, npage, npage, 0);
- err = get_dnode_of_data(&dn, index, ALLOC_NODE);
+ set_new_dnode(&dn, inode, ipage, NULL, 0);
+ err = f2fs_reserve_block(&dn, index);
if (err)
return ERR_PTR(err);
-
- if (dn.data_blkaddr == NULL_ADDR) {
- if (reserve_new_block(&dn)) {
- if (!npage)
- f2fs_put_dnode(&dn);
- return ERR_PTR(-ENOSPC);
- }
- }
- if (!npage)
- f2fs_put_dnode(&dn);
repeat:
page = grab_cache_page(mapping, index);
- if (!page)
- return ERR_PTR(-ENOMEM);
+ if (!page) {
+ err = -ENOMEM;
+ goto put_err;
+ }
if (PageUptodate(page))
return page;
@@ -331,15 +538,18 @@ repeat:
zero_user_segment(page, 0, PAGE_CACHE_SIZE);
SetPageUptodate(page);
} else {
- err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
+ err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
+ READ_SYNC);
if (err)
- return ERR_PTR(err);
+ goto put_err;
+
lock_page(page);
- if (!PageUptodate(page)) {
+ if (unlikely(!PageUptodate(page))) {
f2fs_put_page(page, 1);
- return ERR_PTR(-EIO);
+ err = -EIO;
+ goto put_err;
}
- if (page->mapping != mapping) {
+ if (unlikely(page->mapping != mapping)) {
f2fs_put_page(page, 1);
goto repeat;
}
@@ -350,140 +560,206 @@ repeat:
i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
/* Only the directory inode sets new_i_size */
set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
- mark_inode_dirty_sync(inode);
}
return page;
-}
-
-static void read_end_io(struct bio *bio, int err)
-{
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
- do {
- struct page *page = bvec->bv_page;
-
- if (--bvec >= bio->bi_io_vec)
- prefetchw(&bvec->bv_page->flags);
-
- if (uptodate) {
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
- unlock_page(page);
- } while (bvec >= bio->bi_io_vec);
- bio_put(bio);
+put_err:
+ f2fs_put_dnode(&dn);
+ return ERR_PTR(err);
}
-/*
- * Fill the locked page with data located in the block address.
- * Return unlocked page.
- */
-int f2fs_readpage(struct f2fs_sb_info *sbi, struct page *page,
- block_t blk_addr, int type)
+static int __allocate_data_block(struct dnode_of_data *dn)
{
- struct block_device *bdev = sbi->sb->s_bdev;
- struct bio *bio;
+ struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
+ struct f2fs_summary sum;
+ block_t new_blkaddr;
+ struct node_info ni;
+ int type;
- trace_f2fs_readpage(page, blk_addr, type);
+ if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
+ return -EPERM;
+ if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
+ return -ENOSPC;
- down_read(&sbi->bio_sem);
+ __set_data_blkaddr(dn, NEW_ADDR);
+ dn->data_blkaddr = NEW_ADDR;
- /* Allocate a new bio */
- bio = f2fs_bio_alloc(bdev, 1);
+ get_node_info(sbi, dn->nid, &ni);
+ set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
- /* Initialize the bio */
- bio->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
- bio->bi_end_io = read_end_io;
+ type = CURSEG_WARM_DATA;
- if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
- bio_put(bio);
- up_read(&sbi->bio_sem);
- f2fs_put_page(page, 1);
- return -EFAULT;
- }
+ allocate_data_block(sbi, NULL, NULL_ADDR, &new_blkaddr, &sum, type);
+
+ /* direct IO doesn't use extent cache to maximize the performance */
+ set_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
+ update_extent_cache(new_blkaddr, dn);
+ clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
- submit_bio(type, bio);
- up_read(&sbi->bio_sem);
+ dn->data_blkaddr = new_blkaddr;
return 0;
}
/*
- * This function should be used by the data read flow only where it
- * does not check the "create" flag that indicates block allocation.
- * The reason for this special functionality is to exploit VFS readahead
- * mechanism.
+ * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh.
+ * If original data blocks are allocated, then give them to blockdev.
+ * Otherwise,
+ * a. preallocate requested block addresses
+ * b. do not use extent cache for better performance
+ * c. give the block addresses to blockdev
*/
-static int get_data_block_ro(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
+static int __get_data_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create, bool fiemap)
{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
unsigned int blkbits = inode->i_sb->s_blocksize_bits;
unsigned maxblocks = bh_result->b_size >> blkbits;
struct dnode_of_data dn;
- pgoff_t pgofs;
- int err;
+ int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA;
+ pgoff_t pgofs, end_offset;
+ int err = 0, ofs = 1;
+ bool allocated = false;
/* Get the page offset from the block offset(iblock) */
pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits));
- if (check_extent_cache(inode, pgofs, bh_result)) {
- trace_f2fs_get_data_block(inode, iblock, bh_result, 0);
- return 0;
- }
+ if (check_extent_cache(inode, pgofs, bh_result))
+ goto out;
+
+ if (create)
+ f2fs_lock_op(sbi);
/* When reading holes, we need its node page */
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);
+ err = get_dnode_of_data(&dn, pgofs, mode);
if (err) {
- trace_f2fs_get_data_block(inode, iblock, bh_result, err);
- return (err == -ENOENT) ? 0 : err;
+ if (err == -ENOENT)
+ err = 0;
+ goto unlock_out;
}
+ if (dn.data_blkaddr == NEW_ADDR && !fiemap)
+ goto put_out;
- /* It does not support data allocation */
- BUG_ON(create);
+ if (dn.data_blkaddr != NULL_ADDR) {
+ map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
+ } else if (create) {
+ err = __allocate_data_block(&dn);
+ if (err)
+ goto put_out;
+ allocated = true;
+ map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
+ } else {
+ goto put_out;
+ }
- if (dn.data_blkaddr != NEW_ADDR && dn.data_blkaddr != NULL_ADDR) {
- int i;
- unsigned int end_offset;
+ end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+ bh_result->b_size = (((size_t)1) << blkbits);
+ dn.ofs_in_node++;
+ pgofs++;
- end_offset = IS_INODE(dn.node_page) ?
- ADDRS_PER_INODE(F2FS_I(inode)) :
- ADDRS_PER_BLOCK;
+get_next:
+ if (dn.ofs_in_node >= end_offset) {
+ if (allocated)
+ sync_inode_page(&dn);
+ allocated = false;
+ f2fs_put_dnode(&dn);
- clear_buffer_new(bh_result);
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, pgofs, mode);
+ if (err) {
+ if (err == -ENOENT)
+ err = 0;
+ goto unlock_out;
+ }
+ if (dn.data_blkaddr == NEW_ADDR && !fiemap)
+ goto put_out;
+
+ end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+ }
+ if (maxblocks > (bh_result->b_size >> blkbits)) {
+ block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+ if (blkaddr == NULL_ADDR && create) {
+ err = __allocate_data_block(&dn);
+ if (err)
+ goto sync_out;
+ allocated = true;
+ blkaddr = dn.data_blkaddr;
+ }
/* Give more consecutive addresses for the read ahead */
- for (i = 0; i < end_offset - dn.ofs_in_node; i++)
- if (((datablock_addr(dn.node_page,
- dn.ofs_in_node + i))
- != (dn.data_blkaddr + i)) || maxblocks == i)
- break;
- map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
- bh_result->b_size = (i << blkbits);
+ if (blkaddr == (bh_result->b_blocknr + ofs)) {
+ ofs++;
+ dn.ofs_in_node++;
+ pgofs++;
+ bh_result->b_size += (((size_t)1) << blkbits);
+ goto get_next;
+ }
}
+sync_out:
+ if (allocated)
+ sync_inode_page(&dn);
+put_out:
f2fs_put_dnode(&dn);
- trace_f2fs_get_data_block(inode, iblock, bh_result, 0);
- return 0;
+unlock_out:
+ if (create)
+ f2fs_unlock_op(sbi);
+out:
+ trace_f2fs_get_data_block(inode, iblock, bh_result, err);
+ return err;
+}
+
+static int get_data_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ return __get_data_block(inode, iblock, bh_result, create, false);
+}
+
+static int get_data_block_fiemap(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ return __get_data_block(inode, iblock, bh_result, create, true);
+}
+
+int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ return generic_block_fiemap(inode, fieinfo,
+ start, len, get_data_block_fiemap);
}
static int f2fs_read_data_page(struct file *file, struct page *page)
{
- return mpage_readpage(page, get_data_block_ro);
+ struct inode *inode = page->mapping->host;
+ int ret;
+
+ trace_f2fs_readpage(page, DATA);
+
+ /* If the file has inline data, try to read it directlly */
+ if (f2fs_has_inline_data(inode))
+ ret = f2fs_read_inline_data(inode, page);
+ else
+ ret = mpage_readpage(page, get_data_block);
+
+ return ret;
}
static int f2fs_read_data_pages(struct file *file,
struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
- return mpage_readpages(mapping, pages, nr_pages, get_data_block_ro);
+ struct inode *inode = file->f_mapping->host;
+
+ /* If the file has inline data, skip readpages */
+ if (f2fs_has_inline_data(inode))
+ return 0;
+
+ return mpage_readpages(mapping, pages, nr_pages, get_data_block);
}
-int do_write_data_page(struct page *page)
+int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
{
struct inode *inode = page->mapping->host;
- block_t old_blk_addr, new_blk_addr;
+ block_t old_blkaddr, new_blkaddr;
struct dnode_of_data dn;
int err = 0;
@@ -492,10 +768,10 @@ int do_write_data_page(struct page *page)
if (err)
return err;
- old_blk_addr = dn.data_blkaddr;
+ old_blkaddr = dn.data_blkaddr;
/* This page is already truncated */
- if (old_blk_addr == NULL_ADDR)
+ if (old_blkaddr == NULL_ADDR)
goto out_writepage;
set_page_writeback(page);
@@ -504,15 +780,13 @@ int do_write_data_page(struct page *page)
* If current allocation needs SSR,
* it had better in-place writes for updated data.
*/
- if (unlikely(old_blk_addr != NEW_ADDR &&
+ if (unlikely(old_blkaddr != NEW_ADDR &&
!is_cold_data(page) &&
need_inplace_update(inode))) {
- rewrite_data_page(F2FS_SB(inode->i_sb), page,
- old_blk_addr);
+ rewrite_data_page(page, old_blkaddr, fio);
} else {
- write_data_page(inode, page, &dn,
- old_blk_addr, &new_blk_addr);
- update_extent_cache(new_blk_addr, &dn);
+ write_data_page(page, &dn, &new_blkaddr, fio);
+ update_extent_cache(new_blkaddr, &dn);
}
out_writepage:
f2fs_put_dnode(&dn);
@@ -527,9 +801,15 @@ static int f2fs_write_data_page(struct page *page,
loff_t i_size = i_size_read(inode);
const pgoff_t end_index = ((unsigned long long) i_size)
>> PAGE_CACHE_SHIFT;
- unsigned offset;
+ unsigned offset = 0;
bool need_balance_fs = false;
int err = 0;
+ struct f2fs_io_info fio = {
+ .type = DATA,
+ .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
+ };
+
+ trace_f2fs_writepage(page, DATA);
if (page->index < end_index)
goto write;
@@ -539,55 +819,50 @@ static int f2fs_write_data_page(struct page *page,
* this page does not have to be written to disk.
*/
offset = i_size & (PAGE_CACHE_SIZE - 1);
- if ((page->index >= end_index + 1) || !offset) {
- if (S_ISDIR(inode->i_mode)) {
- dec_page_count(sbi, F2FS_DIRTY_DENTS);
- inode_dec_dirty_dents(inode);
- }
+ if ((page->index >= end_index + 1) || !offset)
goto out;
- }
zero_user_segment(page, offset, PAGE_CACHE_SIZE);
write:
- if (sbi->por_doing) {
- err = AOP_WRITEPAGE_ACTIVATE;
+ if (unlikely(sbi->por_doing))
goto redirty_out;
- }
/* Dentry blocks are controlled by checkpoint */
if (S_ISDIR(inode->i_mode)) {
- dec_page_count(sbi, F2FS_DIRTY_DENTS);
- inode_dec_dirty_dents(inode);
- err = do_write_data_page(page);
- } else {
- int ilock = mutex_lock_op(sbi);
- err = do_write_data_page(page);
- mutex_unlock_op(sbi, ilock);
- need_balance_fs = true;
+ err = do_write_data_page(page, &fio);
+ goto done;
}
- if (err == -ENOENT)
- goto out;
- else if (err)
+
+ if (!wbc->for_reclaim)
+ need_balance_fs = true;
+ else if (has_not_enough_free_secs(sbi, 0))
goto redirty_out;
- if (wbc->for_reclaim)
- f2fs_submit_bio(sbi, DATA, true);
+ f2fs_lock_op(sbi);
+ if (f2fs_has_inline_data(inode) || f2fs_may_inline(inode))
+ err = f2fs_write_inline_data(inode, page, offset);
+ else
+ err = do_write_data_page(page, &fio);
+ f2fs_unlock_op(sbi);
+done:
+ if (err && err != -ENOENT)
+ goto redirty_out;
clear_cold_data(page);
out:
+ inode_dec_dirty_dents(inode);
unlock_page(page);
if (need_balance_fs)
f2fs_balance_fs(sbi);
+ if (wbc->for_reclaim)
+ f2fs_submit_merged_bio(sbi, DATA, WRITE);
return 0;
redirty_out:
- wbc->pages_skipped++;
- set_page_dirty(page);
- return err;
+ redirty_page_for_writepage(wbc, page);
+ return AOP_WRITEPAGE_ACTIVATE;
}
-#define MAX_DESIRED_PAGES_WP 4096
-
static int __f2fs_writepage(struct page *page, struct writeback_control *wbc,
void *data)
{
@@ -604,17 +879,20 @@ static int f2fs_write_data_pages(struct address_space *mapping,
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
bool locked = false;
int ret;
- long excess_nrtw = 0, desired_nrtw;
+ long diff;
+
+ trace_f2fs_writepages(mapping->host, wbc, DATA);
/* deal with chardevs and other special file */
if (!mapping->a_ops->writepage)
return 0;
- if (wbc->nr_to_write < MAX_DESIRED_PAGES_WP) {
- desired_nrtw = MAX_DESIRED_PAGES_WP;
- excess_nrtw = desired_nrtw - wbc->nr_to_write;
- wbc->nr_to_write = desired_nrtw;
- }
+ if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
+ get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA) &&
+ available_free_memory(sbi, DIRTY_DENTS))
+ goto skip_write;
+
+ diff = nr_pages_to_write(sbi, DATA, wbc);
if (!S_ISDIR(inode->i_mode)) {
mutex_lock(&sbi->writepages);
@@ -623,12 +901,17 @@ static int f2fs_write_data_pages(struct address_space *mapping,
ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping);
if (locked)
mutex_unlock(&sbi->writepages);
- f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL));
+
+ f2fs_submit_merged_bio(sbi, DATA, WRITE);
remove_dirty_dir_inode(inode);
- wbc->nr_to_write -= excess_nrtw;
+ wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
return ret;
+
+skip_write:
+ wbc->pages_skipped += get_dirty_dents(inode);
+ return 0;
}
static int f2fs_write_begin(struct file *file, struct address_space *mapping,
@@ -641,30 +924,44 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
struct dnode_of_data dn;
int err = 0;
- int ilock;
+
+ trace_f2fs_write_begin(inode, pos, len, flags);
f2fs_balance_fs(sbi);
repeat:
+ err = f2fs_convert_inline_data(inode, pos + len);
+ if (err)
+ return err;
+
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
+
+ /* to avoid latency during memory pressure */
+ unlock_page(page);
+
*pagep = page;
- ilock = mutex_lock_op(sbi);
+ if (f2fs_has_inline_data(inode) && (pos + len) <= MAX_INLINE_DATA)
+ goto inline_data;
+ f2fs_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, index, ALLOC_NODE);
- if (err)
- goto err;
+ err = f2fs_reserve_block(&dn, index);
+ f2fs_unlock_op(sbi);
- if (dn.data_blkaddr == NULL_ADDR)
- err = reserve_new_block(&dn);
-
- f2fs_put_dnode(&dn);
- if (err)
- goto err;
+ if (err) {
+ f2fs_put_page(page, 0);
+ return err;
+ }
+inline_data:
+ lock_page(page);
+ if (unlikely(page->mapping != mapping)) {
+ f2fs_put_page(page, 1);
+ goto repeat;
+ }
- mutex_unlock_op(sbi, ilock);
+ f2fs_wait_on_page_writeback(page, DATA);
if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
return 0;
@@ -681,15 +978,25 @@ repeat:
if (dn.data_blkaddr == NEW_ADDR) {
zero_user_segment(page, 0, PAGE_CACHE_SIZE);
} else {
- err = f2fs_readpage(sbi, page, dn.data_blkaddr, READ_SYNC);
- if (err)
- return err;
+ if (f2fs_has_inline_data(inode)) {
+ err = f2fs_read_inline_data(inode, page);
+ if (err) {
+ page_cache_release(page);
+ return err;
+ }
+ } else {
+ err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
+ READ_SYNC);
+ if (err)
+ return err;
+ }
+
lock_page(page);
- if (!PageUptodate(page)) {
+ if (unlikely(!PageUptodate(page))) {
f2fs_put_page(page, 1);
return -EIO;
}
- if (page->mapping != mapping) {
+ if (unlikely(page->mapping != mapping)) {
f2fs_put_page(page, 1);
goto repeat;
}
@@ -698,11 +1005,6 @@ out:
SetPageUptodate(page);
clear_cold_data(page);
return 0;
-
-err:
- mutex_unlock_op(sbi, ilock);
- f2fs_put_page(page, 1);
- return err;
}
static int f2fs_write_end(struct file *file,
@@ -712,6 +1014,8 @@ static int f2fs_write_end(struct file *file,
{
struct inode *inode = page->mapping->host;
+ trace_f2fs_write_end(inode, pos, len, copied);
+
SetPageUptodate(page);
set_page_dirty(page);
@@ -721,34 +1025,53 @@ static int f2fs_write_end(struct file *file,
update_inode_page(inode);
}
- unlock_page(page);
- page_cache_release(page);
+ f2fs_put_page(page, 1);
return copied;
}
+static int check_direct_IO(struct inode *inode, int rw,
+ struct iov_iter *iter, loff_t offset)
+{
+ unsigned blocksize_mask = inode->i_sb->s_blocksize - 1;
+
+ if (rw == READ)
+ return 0;
+
+ if (offset & blocksize_mask)
+ return -EINVAL;
+
+ if (iov_iter_alignment(iter) & blocksize_mask)
+ return -EINVAL;
+
+ return 0;
+}
+
static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- if (rw == WRITE)
+ /* Let buffer I/O handle the inline data case. */
+ if (f2fs_has_inline_data(inode))
+ return 0;
+
+ if (check_direct_IO(inode, rw, iter, offset))
return 0;
- /* Needs synchronization with the cleaner */
- return blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
- get_data_block_ro);
+ /* clear fsync mark to recover these blocks */
+ fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino);
+
+ return blockdev_direct_IO(rw, iocb, inode, iter, offset,
+ get_data_block);
}
static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
unsigned int length)
{
struct inode *inode = page->mapping->host;
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- if (S_ISDIR(inode->i_mode) && PageDirty(page)) {
- dec_page_count(sbi, F2FS_DIRTY_DENTS);
+ if (PageDirty(page))
inode_dec_dirty_dents(inode);
- }
ClearPagePrivate(page);
}
@@ -763,7 +1086,11 @@ static int f2fs_set_data_page_dirty(struct page *page)
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
+ trace_f2fs_set_page_dirty(page, DATA);
+
SetPageUptodate(page);
+ mark_inode_dirty(inode);
+
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
set_dirty_dir_page(inode, page);
@@ -774,7 +1101,12 @@ static int f2fs_set_data_page_dirty(struct page *page)
static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
{
- return generic_block_bmap(mapping, block, get_data_block_ro);
+ struct inode *inode = mapping->host;
+
+ if (f2fs_has_inline_data(inode))
+ return 0;
+
+ return generic_block_bmap(mapping, block, get_data_block);
}
const struct address_space_operations f2fs_dblock_aops = {
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index a84b0a8e685..b52c12cf587 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -24,7 +24,7 @@
#include "gc.h"
static LIST_HEAD(f2fs_stat_list);
-static struct dentry *debugfs_root;
+static struct dentry *f2fs_debugfs_root;
static DEFINE_MUTEX(f2fs_stat_mutex);
static void update_general_status(struct f2fs_sb_info *sbi)
@@ -45,14 +45,15 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->valid_count = valid_user_blocks(sbi);
si->valid_node_count = valid_node_count(sbi);
si->valid_inode_count = valid_inode_count(sbi);
+ si->inline_inode = sbi->inline_inode;
si->utilization = utilization(sbi);
si->free_segs = free_segments(sbi);
si->free_secs = free_sections(sbi);
si->prefree_count = prefree_segments(sbi);
si->dirty_count = dirty_segments(sbi);
- si->node_pages = sbi->node_inode->i_mapping->nrpages;
- si->meta_pages = sbi->meta_inode->i_mapping->nrpages;
+ si->node_pages = NODE_MAPPING(sbi)->nrpages;
+ si->meta_pages = META_MAPPING(sbi)->nrpages;
si->nats = NM_I(sbi)->nat_cnt;
si->sits = SIT_I(sbi)->dirty_sentries;
si->fnids = NM_I(sbi)->fcnt;
@@ -85,7 +86,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
{
struct f2fs_stat_info *si = F2FS_STAT(sbi);
unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist;
- struct sit_info *sit_i = SIT_I(sbi);
unsigned int segno, vblocks;
int ndirty = 0;
@@ -93,7 +93,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
total_vblocks = 0;
blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
hblks_per_sec = blks_per_sec / 2;
- mutex_lock(&sit_i->sentry_lock);
for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) {
vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
dist = abs(vblocks - hblks_per_sec);
@@ -104,7 +103,6 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
ndirty++;
}
}
- mutex_unlock(&sit_i->sentry_lock);
dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
si->bimodal = bimodal / dist;
if (si->dirty_count)
@@ -165,9 +163,9 @@ get_cache:
/* free nids */
si->cache_mem = NM_I(sbi)->fcnt;
si->cache_mem += NM_I(sbi)->nat_cnt;
- npages = sbi->node_inode->i_mapping->nrpages;
+ npages = NODE_MAPPING(sbi)->nrpages;
si->cache_mem += npages << PAGE_CACHE_SHIFT;
- npages = sbi->meta_inode->i_mapping->nrpages;
+ npages = META_MAPPING(sbi)->nrpages;
si->cache_mem += npages << PAGE_CACHE_SHIFT;
si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry);
si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry);
@@ -200,6 +198,8 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, "Other: %u)\n - Data: %u\n",
si->valid_node_count - si->valid_inode_count,
si->valid_count - si->valid_node_count);
+ seq_printf(s, " - Inline_data Inode: %u\n",
+ si->inline_inode);
seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n",
si->main_area_segs, si->main_area_sections,
si->main_area_zones);
@@ -233,6 +233,7 @@ static int stat_show(struct seq_file *s, void *v)
si->dirty_count);
seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n",
si->prefree_count, si->free_segs, si->free_secs);
+ seq_printf(s, "CP calls: %d\n", si->cp_count);
seq_printf(s, "GC calls: %d (BG: %d)\n",
si->call_count, si->bg_gc);
seq_printf(s, " - data segments : %d\n", si->data_segs);
@@ -242,17 +243,17 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - node blocks : %d\n", si->node_blks);
seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
si->hit_ext, si->total_ext);
- seq_printf(s, "\nBalancing F2FS Async:\n");
- seq_printf(s, " - nodes %4d in %4d\n",
+ seq_puts(s, "\nBalancing F2FS Async:\n");
+ seq_printf(s, " - nodes: %4d in %4d\n",
si->ndirty_node, si->node_pages);
- seq_printf(s, " - dents %4d in dirs:%4d\n",
+ seq_printf(s, " - dents: %4d in dirs:%4d\n",
si->ndirty_dent, si->ndirty_dirs);
- seq_printf(s, " - meta %4d in %4d\n",
+ seq_printf(s, " - meta: %4d in %4d\n",
si->ndirty_meta, si->meta_pages);
- seq_printf(s, " - NATs %5d > %lu\n",
- si->nats, NM_WOUT_THRESHOLD);
- seq_printf(s, " - SITs: %5d\n - free_nids: %5d\n",
- si->sits, si->fnids);
+ seq_printf(s, " - NATs: %9d\n - SITs: %9d\n",
+ si->nats, si->sits);
+ seq_printf(s, " - free_nids: %9d\n",
+ si->fnids);
seq_puts(s, "\nDistribution of User Blocks:");
seq_puts(s, " [ valid | invalid | free ]\n");
seq_puts(s, " [");
@@ -340,14 +341,32 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
void __init f2fs_create_root_stats(void)
{
- debugfs_root = debugfs_create_dir("f2fs", NULL);
- if (debugfs_root)
- debugfs_create_file("status", S_IRUGO, debugfs_root,
- NULL, &stat_fops);
+ struct dentry *file;
+
+ f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
+ if (!f2fs_debugfs_root)
+ goto bail;
+
+ file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root,
+ NULL, &stat_fops);
+ if (!file)
+ goto free_debugfs_dir;
+
+ return;
+
+free_debugfs_dir:
+ debugfs_remove(f2fs_debugfs_root);
+
+bail:
+ f2fs_debugfs_root = NULL;
+ return;
}
void f2fs_destroy_root_stats(void)
{
- debugfs_remove_recursive(debugfs_root);
- debugfs_root = NULL;
+ if (!f2fs_debugfs_root)
+ return;
+
+ debugfs_remove_recursive(f2fs_debugfs_root);
+ f2fs_debugfs_root = NULL;
}
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 384c6daf9a8..a4addd72ebb 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -21,12 +21,12 @@ static unsigned long dir_blocks(struct inode *inode)
>> PAGE_CACHE_SHIFT;
}
-static unsigned int dir_buckets(unsigned int level)
+static unsigned int dir_buckets(unsigned int level, int dir_level)
{
- if (level < MAX_DIR_HASH_DEPTH / 2)
- return 1 << level;
+ if (level + dir_level < MAX_DIR_HASH_DEPTH / 2)
+ return 1 << (level + dir_level);
else
- return 1 << ((MAX_DIR_HASH_DEPTH / 2) - 1);
+ return MAX_DIR_BUCKETS;
}
static unsigned int bucket_blocks(unsigned int level)
@@ -65,13 +65,14 @@ static void set_de_type(struct f2fs_dir_entry *de, struct inode *inode)
de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
}
-static unsigned long dir_block_index(unsigned int level, unsigned int idx)
+static unsigned long dir_block_index(unsigned int level,
+ int dir_level, unsigned int idx)
{
unsigned long i;
unsigned long bidx = 0;
for (i = 0; i < level; i++)
- bidx += dir_buckets(i) * bucket_blocks(i);
+ bidx += dir_buckets(i, dir_level) * bucket_blocks(i);
bidx += idx * bucket_blocks(level);
return bidx;
}
@@ -93,16 +94,21 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
f2fs_hash_t namehash, struct page **res_page)
{
struct f2fs_dir_entry *de;
- unsigned long bit_pos, end_pos, next_pos;
+ unsigned long bit_pos = 0;
struct f2fs_dentry_block *dentry_blk = kmap(dentry_page);
- int slots;
+ const void *dentry_bits = &dentry_blk->dentry_bitmap;
+ int max_len = 0;
- bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
- NR_DENTRY_IN_BLOCK, 0);
while (bit_pos < NR_DENTRY_IN_BLOCK) {
+ if (!test_bit_le(bit_pos, dentry_bits)) {
+ if (bit_pos == 0)
+ max_len = 1;
+ else if (!test_bit_le(bit_pos - 1, dentry_bits))
+ max_len++;
+ bit_pos++;
+ continue;
+ }
de = &dentry_blk->dentry[bit_pos];
- slots = GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
-
if (early_match_name(name, namelen, namehash, de)) {
if (!memcmp(dentry_blk->filename[bit_pos],
name, namelen)) {
@@ -110,20 +116,18 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
goto found;
}
}
- next_pos = bit_pos + slots;
- bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
- NR_DENTRY_IN_BLOCK, next_pos);
- if (bit_pos >= NR_DENTRY_IN_BLOCK)
- end_pos = NR_DENTRY_IN_BLOCK;
- else
- end_pos = bit_pos;
- if (*max_slots < end_pos - next_pos)
- *max_slots = end_pos - next_pos;
+ if (max_len > *max_slots) {
+ *max_slots = max_len;
+ max_len = 0;
+ }
+ bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
}
de = NULL;
kunmap(dentry_page);
found:
+ if (max_len > *max_slots)
+ *max_slots = max_len;
return de;
}
@@ -139,12 +143,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
bool room = false;
int max_slots = 0;
- BUG_ON(level > MAX_DIR_HASH_DEPTH);
+ f2fs_bug_on(level > MAX_DIR_HASH_DEPTH);
- nbucket = dir_buckets(level);
+ nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
nblock = bucket_blocks(level);
- bidx = dir_block_index(level, le32_to_cpu(namehash) % nbucket);
+ bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level,
+ le32_to_cpu(namehash) % nbucket);
end_block = bidx + nblock;
for (; bidx < end_block; bidx++) {
@@ -190,9 +195,6 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
unsigned int max_depth;
unsigned int level;
- if (namelen > F2FS_NAME_LEN)
- return NULL;
-
if (npages == 0)
return NULL;
@@ -251,7 +253,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
struct page *page, struct inode *inode)
{
lock_page(page);
- wait_on_page_writeback(page);
+ f2fs_wait_on_page_writeback(page, DATA);
de->ino = cpu_to_le32(inode->i_ino);
set_de_type(de, inode);
kunmap(page);
@@ -259,20 +261,19 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
mark_inode_dirty(dir);
- /* update parent inode number before releasing dentry page */
- F2FS_I(inode)->i_pino = dir->i_ino;
-
f2fs_put_page(page, 1);
}
static void init_dent_inode(const struct qstr *name, struct page *ipage)
{
- struct f2fs_node *rn;
+ struct f2fs_inode *ri;
+
+ f2fs_wait_on_page_writeback(ipage, NODE);
/* copy name info. to this inode page */
- rn = F2FS_NODE(ipage);
- rn->i.i_namelen = cpu_to_le32(name->len);
- memcpy(rn->i.i_name, name->name, name->len);
+ ri = F2FS_INODE(ipage);
+ ri->i_namelen = cpu_to_le32(name->len);
+ memcpy(ri->i_name, name->name, name->len);
set_page_dirty(ipage);
}
@@ -346,21 +347,18 @@ static struct page *init_inode_metadata(struct inode *inode,
goto error;
}
- err = f2fs_init_acl(inode, dir);
+ err = f2fs_init_acl(inode, dir, page);
if (err)
- goto error;
+ goto put_error;
err = f2fs_init_security(inode, dir, name, page);
if (err)
- goto error;
-
- wait_on_page_writeback(page);
+ goto put_error;
} else {
page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino);
if (IS_ERR(page))
return page;
- wait_on_page_writeback(page);
set_cold_node(inode, page);
}
@@ -376,8 +374,13 @@ static struct page *init_inode_metadata(struct inode *inode,
}
return page;
-error:
+put_error:
f2fs_put_page(page, 1);
+error:
+ /* once the failed inode becomes a bad inode, i_mode is S_IFREG */
+ truncate_inode_pages(&inode->i_data, 0);
+ truncate_blocks(inode, 0);
+ remove_dirty_dir_inode(inode);
remove_inode_page(inode);
return ERR_PTR(err);
}
@@ -393,16 +396,13 @@ static void update_parent_metadata(struct inode *dir, struct inode *inode,
clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
}
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ mark_inode_dirty(dir);
+
if (F2FS_I(dir)->i_current_depth != current_depth) {
F2FS_I(dir)->i_current_depth = current_depth;
set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
}
- if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR))
- update_inode_page(dir);
- else
- mark_inode_dirty(dir);
-
if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK))
clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
}
@@ -432,10 +432,11 @@ next:
}
/*
- * Caller should grab and release a mutex by calling mutex_lock_op() and
- * mutex_unlock_op().
+ * Caller should grab and release a rwsem by calling f2fs_lock_op() and
+ * f2fs_unlock_op().
*/
-int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode)
+int __f2fs_add_link(struct inode *dir, const struct qstr *name,
+ struct inode *inode)
{
unsigned int bit_pos;
unsigned int level;
@@ -461,17 +462,18 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *in
}
start:
- if (current_depth == MAX_DIR_HASH_DEPTH)
+ if (unlikely(current_depth == MAX_DIR_HASH_DEPTH))
return -ENOSPC;
/* Increase the depth, if required */
if (level == current_depth)
++current_depth;
- nbucket = dir_buckets(level);
+ nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
nblock = bucket_blocks(level);
- bidx = dir_block_index(level, (le32_to_cpu(dentry_hash) % nbucket));
+ bidx = dir_block_index(level, F2FS_I(dir)->i_dir_level,
+ (le32_to_cpu(dentry_hash) % nbucket));
for (block = bidx; block <= (bidx + nblock - 1); block++) {
dentry_page = get_new_data_page(dir, NULL, block, true);
@@ -491,8 +493,9 @@ start:
++level;
goto start;
add_dentry:
- wait_on_page_writeback(dentry_page);
+ f2fs_wait_on_page_writeback(dentry_page, DATA);
+ down_write(&F2FS_I(inode)->i_sem);
page = init_inode_metadata(inode, dir, name);
if (IS_ERR(page)) {
err = PTR_ERR(page);
@@ -515,7 +518,12 @@ add_dentry:
update_parent_metadata(dir, inode, current_depth);
fail:
- clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
+ up_write(&F2FS_I(inode)->i_sem);
+
+ if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) {
+ update_inode_page(dir);
+ clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR);
+ }
kunmap(dentry_page);
f2fs_put_page(dentry_page, 1);
return err;
@@ -532,13 +540,12 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
unsigned int bit_pos;
struct address_space *mapping = page->mapping;
struct inode *dir = mapping->host;
- struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
void *kaddr = page_address(page);
int i;
lock_page(page);
- wait_on_page_writeback(page);
+ f2fs_wait_on_page_writeback(page, DATA);
dentry_blk = (struct f2fs_dentry_block *)kaddr;
bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry;
@@ -554,20 +561,22 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
- if (inode && S_ISDIR(inode->i_mode)) {
- drop_nlink(dir);
- update_inode_page(dir);
- } else {
- mark_inode_dirty(dir);
- }
-
if (inode) {
+ struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
+
+ down_write(&F2FS_I(inode)->i_sem);
+
+ if (S_ISDIR(inode->i_mode)) {
+ drop_nlink(dir);
+ update_inode_page(dir);
+ }
inode->i_ctime = CURRENT_TIME;
drop_nlink(inode);
if (S_ISDIR(inode->i_mode)) {
drop_nlink(inode);
i_size_write(inode, 0);
}
+ up_write(&F2FS_I(inode)->i_sem);
update_inode_page(inode);
if (inode->i_nlink == 0)
@@ -580,7 +589,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
truncate_hole(dir, page->index, page->index + 1);
clear_page_dirty_for_io(page);
ClearPageUptodate(page);
- dec_page_count(sbi, F2FS_DIRTY_DENTS);
inode_dec_dirty_dents(dir);
}
f2fs_put_page(page, 1);
@@ -631,12 +639,18 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
struct f2fs_dentry_block *dentry_blk = NULL;
struct f2fs_dir_entry *de = NULL;
struct page *dentry_page = NULL;
+ struct file_ra_state *ra = &file->f_ra;
unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
unsigned char d_type = DT_UNKNOWN;
bit_pos = ((unsigned long)ctx->pos % NR_DENTRY_IN_BLOCK);
- for ( ; n < npages; n++) {
+ /* readahead for multi pages of dir */
+ if (npages - n > 1 && !ra_has_index(ra, n))
+ page_cache_sync_readahead(inode->i_mapping, ra, file, n,
+ min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES));
+
+ for (; n < npages; n++) {
dentry_page = get_lock_data_page(inode, n);
if (IS_ERR(dentry_page))
continue;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 608f0df5b91..58df97e174d 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -18,6 +18,15 @@
#include <linux/crc32.h>
#include <linux/magic.h>
#include <linux/kobject.h>
+#include <linux/sched.h>
+
+#ifdef CONFIG_F2FS_CHECK_FS
+#define f2fs_bug_on(condition) BUG_ON(condition)
+#define f2fs_down_write(x, y) down_write_nest_lock(x, y)
+#else
+#define f2fs_bug_on(condition)
+#define f2fs_down_write(x, y) down_write(x)
+#endif
/*
* For mount options
@@ -30,6 +39,8 @@
#define F2FS_MOUNT_POSIX_ACL 0x00000020
#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000040
#define F2FS_MOUNT_INLINE_XATTR 0x00000080
+#define F2FS_MOUNT_INLINE_DATA 0x00000100
+#define F2FS_MOUNT_FLUSH_MERGE 0x00000200
#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -78,6 +89,16 @@ enum {
SIT_BITMAP
};
+/*
+ * For CP/NAT/SIT/SSA readahead
+ */
+enum {
+ META_CP,
+ META_NAT,
+ META_SIT,
+ META_SSA
+};
+
/* for the list of orphan inodes */
struct orphan_inode_entry {
struct list_head list; /* list head */
@@ -90,6 +111,13 @@ struct dir_inode_entry {
struct inode *inode; /* vfs inode pointer */
};
+/* for the list of blockaddresses to be discarded */
+struct discard_entry {
+ struct list_head list; /* list head */
+ block_t blkaddr; /* block address to be discarded */
+ int len; /* # of consecutive blocks of the discard */
+};
+
/* for the list of fsync inodes, used only during recovery */
struct fsync_inode_entry {
struct list_head list; /* list head */
@@ -148,13 +176,17 @@ enum {
LOOKUP_NODE, /* look up a node without readahead */
LOOKUP_NODE_RA, /*
* look up a node with readahead called
- * by get_datablock_ro.
+ * by get_data_block.
*/
};
#define F2FS_LINK_MAX 32000 /* maximum link count per file */
+#define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */
+
/* for in-memory extent cache entry */
+#define F2FS_MIN_EXTENT_LEN 16 /* minimum extent length */
+
struct extent_info {
rwlock_t ext_lock; /* rwlock for consistency */
unsigned int fofs; /* start offset in a file */
@@ -168,22 +200,27 @@ struct extent_info {
#define FADVISE_COLD_BIT 0x01
#define FADVISE_LOST_PINO_BIT 0x02
+#define DEF_DIR_LEVEL 0
+
struct f2fs_inode_info {
struct inode vfs_inode; /* serve a vfs inode */
unsigned long i_flags; /* keep an inode flags for ioctl */
unsigned char i_advise; /* use to give file attribute hints */
+ unsigned char i_dir_level; /* use for dentry level for large dir */
unsigned int i_current_depth; /* use only in directory structure */
unsigned int i_pino; /* parent inode number */
umode_t i_acl_mode; /* keep file acl mode temporarily */
/* Use below internally in f2fs*/
unsigned long flags; /* use to pass per-file flags */
+ struct rw_semaphore i_sem; /* protect fi info */
atomic_t dirty_dents; /* # of dirty dentry pages */
f2fs_hash_t chash; /* hash value of given file name */
unsigned int clevel; /* maximum level of given file name */
nid_t i_xattr_nid; /* node id that contains xattrs */
unsigned long long xattr_ver; /* cp version of xattr modification */
struct extent_info ext; /* in-memory extent cache entry */
+ struct dir_inode_entry *dirty_dir; /* the pointer of dirty dir */
};
static inline void get_extent_info(struct extent_info *ext,
@@ -209,7 +246,9 @@ static inline void set_raw_extent(struct extent_info *ext,
struct f2fs_nm_info {
block_t nat_blkaddr; /* base disk address of NAT */
nid_t max_nid; /* maximum possible node ids */
+ nid_t available_nids; /* maximum available node ids */
nid_t next_scan_nid; /* the next nid to be scanned */
+ unsigned int ram_thresh; /* control the memory footprint */
/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
@@ -219,6 +258,7 @@ struct f2fs_nm_info {
struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */
/* free node ids management */
+ struct radix_tree_root free_nid_root;/* root of the free_nid cache */
struct list_head free_nid_list; /* a list for free nids */
spinlock_t free_nid_list_lock; /* protect free nid list */
unsigned int fcnt; /* the number of free node id */
@@ -281,15 +321,27 @@ enum {
NO_CHECK_TYPE
};
+struct flush_cmd {
+ struct flush_cmd *next;
+ struct completion wait;
+ int ret;
+};
+
+struct flush_cmd_control {
+ struct task_struct *f2fs_issue_flush; /* flush thread */
+ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */
+ struct flush_cmd *issue_list; /* list for command issue */
+ struct flush_cmd *dispatch_list; /* list for command dispatch */
+ spinlock_t issue_lock; /* for issue list lock */
+ struct flush_cmd *issue_tail; /* list tail of issue list */
+};
+
struct f2fs_sm_info {
struct sit_info *sit_info; /* whole segment information */
struct free_segmap_info *free_info; /* free segment information */
struct dirty_seglist_info *dirty_info; /* dirty segment information */
struct curseg_info *curseg_array; /* active segment information */
- struct list_head wblist_head; /* list of under-writeback pages */
- spinlock_t wblist_lock; /* lock for checkpoint */
-
block_t seg0_blkaddr; /* block address of 0'th segment */
block_t main_blkaddr; /* start block address of main area */
block_t ssa_blkaddr; /* start block address of SSA area */
@@ -298,6 +350,21 @@ struct f2fs_sm_info {
unsigned int main_segments; /* # of segments in main area */
unsigned int reserved_segments; /* # of reserved segments */
unsigned int ovp_segments; /* # of overprovision segments */
+
+ /* a threshold to reclaim prefree segments */
+ unsigned int rec_prefree_segments;
+
+ /* for small discard management */
+ struct list_head discard_list; /* 4KB discard list */
+ int nr_discards; /* # of discards in the list */
+ int max_discards; /* max. discards to be issued */
+
+ unsigned int ipu_policy; /* in-place-update policy */
+ unsigned int min_ipu_util; /* in-place-update threshold */
+
+ /* for flush command control */
+ struct flush_cmd_control *cmd_control_info;
+
};
/*
@@ -318,14 +385,6 @@ enum count_type {
};
/*
- * Uses as sbi->fs_lock[NR_GLOBAL_LOCKS].
- * The checkpoint procedure blocks all the locks in this fs_lock array.
- * Some FS operations grab free locks, and if there is no free lock,
- * then wait to grab a lock in a round-robin manner.
- */
-#define NR_GLOBAL_LOCKS 8
-
-/*
* The below are the page types of bios used in submti_bio().
* The available types are:
* DATA User data pages. It operates as async mode.
@@ -336,6 +395,7 @@ enum count_type {
* with waiting the bio's completion
* ... Only can be used with META.
*/
+#define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type))
enum page_type {
DATA,
NODE,
@@ -344,6 +404,20 @@ enum page_type {
META_FLUSH,
};
+struct f2fs_io_info {
+ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */
+ int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */
+};
+
+#define is_read_io(rw) (((rw) & 1) == READ)
+struct f2fs_bio_info {
+ struct f2fs_sb_info *sbi; /* f2fs superblock */
+ struct bio *bio; /* bios to merge */
+ sector_t last_block_in_bio; /* last block number */
+ struct f2fs_io_info fio; /* store buffered io info. */
+ struct rw_semaphore io_rwsem; /* blocking op for bio */
+};
+
struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
struct proc_dir_entry *s_proc; /* proc entry */
@@ -357,25 +431,27 @@ struct f2fs_sb_info {
/* for segment-related operations */
struct f2fs_sm_info *sm_info; /* segment manager */
- struct bio *bio[NR_PAGE_TYPE]; /* bios to merge */
- sector_t last_block_in_bio[NR_PAGE_TYPE]; /* last block number */
- struct rw_semaphore bio_sem; /* IO semaphore */
+
+ /* for bio operations */
+ struct f2fs_bio_info read_io; /* for read bios */
+ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */
+ struct completion *wait_io; /* for completion bios */
/* for checkpoint */
struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
struct inode *meta_inode; /* cache meta blocks */
struct mutex cp_mutex; /* checkpoint procedure lock */
- struct mutex fs_lock[NR_GLOBAL_LOCKS]; /* blocking FS operations */
+ struct rw_semaphore cp_rwsem; /* blocking FS operations */
struct mutex node_write; /* locking node writes */
struct mutex writepages; /* mutex for writepages() */
- unsigned char next_lock_num; /* round-robin global locks */
- int por_doing; /* recovery is doing or not */
- int on_build_free_nids; /* build_free_nids is doing */
+ bool por_doing; /* recovery is doing or not */
+ wait_queue_head_t cp_wait;
/* for orphan inode management */
struct list_head orphan_inode_list; /* orphan inode list */
- struct mutex orphan_inode_mutex; /* for orphan inode list */
+ spinlock_t orphan_inode_lock; /* for orphan inode list */
unsigned int n_orphans; /* # of orphan inodes */
+ unsigned int max_orphans; /* max orphan inodes */
/* for directory inode management */
struct list_head dir_inode_list; /* dir inode list */
@@ -397,6 +473,7 @@ struct f2fs_sb_info {
unsigned int total_valid_node_count; /* valid node block count */
unsigned int total_valid_inode_count; /* valid inode count */
int active_logs; /* # of active logs */
+ int dir_level; /* directory level */
block_t user_block_count; /* # of user blocks */
block_t total_valid_block_count; /* # of valid blocks */
@@ -412,6 +489,9 @@ struct f2fs_sb_info {
struct f2fs_gc_kthread *gc_thread; /* GC thread */
unsigned int cur_victim_sec; /* current victim section num */
+ /* maximum # of trials to find a victim segment for SSR and GC */
+ unsigned int max_victim_search;
+
/*
* for stat information.
* one is for the LFS mode, and the other is for the SSR mode.
@@ -421,6 +501,7 @@ struct f2fs_sb_info {
unsigned int segment_count[2]; /* # of allocated segments */
unsigned int block_count[2]; /* # of allocated blocks */
int total_hit_ext, read_hit_ext; /* extent cache hit ratio */
+ int inline_inode; /* # of inline_data inodes */
int bg_gc; /* background gc calls */
unsigned int n_dirty_dirs; /* # of dir inodes */
#endif
@@ -460,6 +541,11 @@ static inline struct f2fs_node *F2FS_NODE(struct page *page)
return (struct f2fs_node *)page_address(page);
}
+static inline struct f2fs_inode *F2FS_INODE(struct page *page)
+{
+ return &((struct f2fs_node *)page_address(page))->i;
+}
+
static inline struct f2fs_nm_info *NM_I(struct f2fs_sb_info *sbi)
{
return (struct f2fs_nm_info *)(sbi->nm_info);
@@ -485,6 +571,16 @@ static inline struct dirty_seglist_info *DIRTY_I(struct f2fs_sb_info *sbi)
return (struct dirty_seglist_info *)(SM_I(sbi)->dirty_info);
}
+static inline struct address_space *META_MAPPING(struct f2fs_sb_info *sbi)
+{
+ return sbi->meta_inode->i_mapping;
+}
+
+static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi)
+{
+ return sbi->node_inode->i_mapping;
+}
+
static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi)
{
sbi->s_dirty = 1;
@@ -520,48 +616,24 @@ static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f)
cp->ckpt_flags = cpu_to_le32(ckpt_flags);
}
-static inline void mutex_lock_all(struct f2fs_sb_info *sbi)
+static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
{
- int i;
-
- for (i = 0; i < NR_GLOBAL_LOCKS; i++) {
- /*
- * This is the only time we take multiple fs_lock[]
- * instances; the order is immaterial since we
- * always hold cp_mutex, which serializes multiple
- * such operations.
- */
- mutex_lock_nest_lock(&sbi->fs_lock[i], &sbi->cp_mutex);
- }
+ down_read(&sbi->cp_rwsem);
}
-static inline void mutex_unlock_all(struct f2fs_sb_info *sbi)
+static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
{
- int i = 0;
- for (; i < NR_GLOBAL_LOCKS; i++)
- mutex_unlock(&sbi->fs_lock[i]);
+ up_read(&sbi->cp_rwsem);
}
-static inline int mutex_lock_op(struct f2fs_sb_info *sbi)
+static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
{
- unsigned char next_lock = sbi->next_lock_num % NR_GLOBAL_LOCKS;
- int i = 0;
-
- for (; i < NR_GLOBAL_LOCKS; i++)
- if (mutex_trylock(&sbi->fs_lock[i]))
- return i;
-
- mutex_lock(&sbi->fs_lock[next_lock]);
- sbi->next_lock_num++;
- return next_lock;
+ f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex);
}
-static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, int ilock)
+static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
{
- if (ilock < 0)
- return;
- BUG_ON(ilock >= NR_GLOBAL_LOCKS);
- mutex_unlock(&sbi->fs_lock[ilock]);
+ up_write(&sbi->cp_rwsem);
}
/*
@@ -569,8 +641,9 @@ static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, int ilock)
*/
static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
{
- WARN_ON((nid >= NM_I(sbi)->max_nid));
- if (nid >= NM_I(sbi)->max_nid)
+ if (unlikely(nid < F2FS_ROOT_INO(sbi)))
+ return -EINVAL;
+ if (unlikely(nid >= NM_I(sbi)->max_nid))
return -EINVAL;
return 0;
}
@@ -583,9 +656,14 @@ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid)
static inline int F2FS_HAS_BLOCKS(struct inode *inode)
{
if (F2FS_I(inode)->i_xattr_nid)
- return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1);
+ return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1;
else
- return (inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS);
+ return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS;
+}
+
+static inline bool f2fs_has_xattr_block(unsigned int ofs)
+{
+ return ofs == XATTR_NODE_OFFSET;
}
static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
@@ -596,7 +674,7 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
spin_lock(&sbi->stat_lock);
valid_block_count =
sbi->total_valid_block_count + (block_t)count;
- if (valid_block_count > sbi->user_block_count) {
+ if (unlikely(valid_block_count > sbi->user_block_count)) {
spin_unlock(&sbi->stat_lock);
return false;
}
@@ -607,17 +685,16 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi,
return true;
}
-static inline int dec_valid_block_count(struct f2fs_sb_info *sbi,
+static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
struct inode *inode,
blkcnt_t count)
{
spin_lock(&sbi->stat_lock);
- BUG_ON(sbi->total_valid_block_count < (block_t) count);
- BUG_ON(inode->i_blocks < count);
+ f2fs_bug_on(sbi->total_valid_block_count < (block_t) count);
+ f2fs_bug_on(inode->i_blocks < count);
inode->i_blocks -= count;
sbi->total_valid_block_count -= (block_t)count;
spin_unlock(&sbi->stat_lock);
- return 0;
}
static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -628,6 +705,7 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
static inline void inode_inc_dirty_dents(struct inode *inode)
{
+ inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS);
atomic_inc(&F2FS_I(inode)->dirty_dents);
}
@@ -638,6 +716,10 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
static inline void inode_dec_dirty_dents(struct inode *inode)
{
+ if (!S_ISDIR(inode->i_mode))
+ return;
+
+ dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS);
atomic_dec(&F2FS_I(inode)->dirty_dents);
}
@@ -646,6 +728,11 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
return atomic_read(&sbi->nr_pages[count_type]);
}
+static inline int get_dirty_dents(struct inode *inode)
+{
+ return atomic_read(&F2FS_I(inode)->dirty_dents);
+}
+
static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
{
unsigned int pages_per_sec = sbi->segs_per_sec *
@@ -656,11 +743,7 @@ static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
{
- block_t ret;
- spin_lock(&sbi->stat_lock);
- ret = sbi->total_valid_block_count;
- spin_unlock(&sbi->stat_lock);
- return ret;
+ return sbi->total_valid_block_count;
}
static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag)
@@ -679,9 +762,18 @@ static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag)
static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
- int offset = (flag == NAT_BITMAP) ?
+ int offset;
+
+ if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload) > 0) {
+ if (flag == NAT_BITMAP)
+ return &ckpt->sit_nat_version_bitmap;
+ else
+ return ((unsigned char *)ckpt + F2FS_BLKSIZE);
+ } else {
+ offset = (flag == NAT_BITMAP) ?
le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0;
- return &ckpt->sit_nat_version_bitmap + offset;
+ return &ckpt->sit_nat_version_bitmap + offset;
+ }
}
static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
@@ -708,96 +800,85 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi)
}
static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi,
- struct inode *inode,
- unsigned int count)
+ struct inode *inode)
{
block_t valid_block_count;
unsigned int valid_node_count;
spin_lock(&sbi->stat_lock);
- valid_block_count = sbi->total_valid_block_count + (block_t)count;
- sbi->alloc_valid_block_count += (block_t)count;
- valid_node_count = sbi->total_valid_node_count + count;
-
- if (valid_block_count > sbi->user_block_count) {
+ valid_block_count = sbi->total_valid_block_count + 1;
+ if (unlikely(valid_block_count > sbi->user_block_count)) {
spin_unlock(&sbi->stat_lock);
return false;
}
- if (valid_node_count > sbi->total_node_count) {
+ valid_node_count = sbi->total_valid_node_count + 1;
+ if (unlikely(valid_node_count > sbi->total_node_count)) {
spin_unlock(&sbi->stat_lock);
return false;
}
if (inode)
- inode->i_blocks += count;
- sbi->total_valid_node_count = valid_node_count;
- sbi->total_valid_block_count = valid_block_count;
+ inode->i_blocks++;
+
+ sbi->alloc_valid_block_count++;
+ sbi->total_valid_node_count++;
+ sbi->total_valid_block_count++;
spin_unlock(&sbi->stat_lock);
return true;
}
static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
- struct inode *inode,
- unsigned int count)
+ struct inode *inode)
{
spin_lock(&sbi->stat_lock);
- BUG_ON(sbi->total_valid_block_count < count);
- BUG_ON(sbi->total_valid_node_count < count);
- BUG_ON(inode->i_blocks < count);
+ f2fs_bug_on(!sbi->total_valid_block_count);
+ f2fs_bug_on(!sbi->total_valid_node_count);
+ f2fs_bug_on(!inode->i_blocks);
- inode->i_blocks -= count;
- sbi->total_valid_node_count -= count;
- sbi->total_valid_block_count -= (block_t)count;
+ inode->i_blocks--;
+ sbi->total_valid_node_count--;
+ sbi->total_valid_block_count--;
spin_unlock(&sbi->stat_lock);
}
static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
{
- unsigned int ret;
- spin_lock(&sbi->stat_lock);
- ret = sbi->total_valid_node_count;
- spin_unlock(&sbi->stat_lock);
- return ret;
+ return sbi->total_valid_node_count;
}
static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
{
spin_lock(&sbi->stat_lock);
- BUG_ON(sbi->total_valid_inode_count == sbi->total_node_count);
+ f2fs_bug_on(sbi->total_valid_inode_count == sbi->total_node_count);
sbi->total_valid_inode_count++;
spin_unlock(&sbi->stat_lock);
}
-static inline int dec_valid_inode_count(struct f2fs_sb_info *sbi)
+static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi)
{
spin_lock(&sbi->stat_lock);
- BUG_ON(!sbi->total_valid_inode_count);
+ f2fs_bug_on(!sbi->total_valid_inode_count);
sbi->total_valid_inode_count--;
spin_unlock(&sbi->stat_lock);
- return 0;
}
static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
{
- unsigned int ret;
- spin_lock(&sbi->stat_lock);
- ret = sbi->total_valid_inode_count;
- spin_unlock(&sbi->stat_lock);
- return ret;
+ return sbi->total_valid_inode_count;
}
static inline void f2fs_put_page(struct page *page, int unlock)
{
- if (!page || IS_ERR(page))
+ if (!page)
return;
if (unlock) {
- BUG_ON(!PageLocked(page));
+ f2fs_bug_on(!PageLocked(page));
unlock_page(page);
}
page_cache_release(page);
@@ -814,9 +895,23 @@ static inline void f2fs_put_dnode(struct dnode_of_data *dn)
}
static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name,
- size_t size, void (*ctor)(void *))
+ size_t size)
{
- return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, ctor);
+ return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL);
+}
+
+static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
+ gfp_t flags)
+{
+ void *entry;
+retry:
+ entry = kmem_cache_alloc(cachep, flags);
+ if (!entry) {
+ cond_resched();
+ goto retry;
+ }
+
+ return entry;
}
#define RAW_IS_INODE(p) ((p)->footer.nid == (p)->footer.ino)
@@ -879,12 +974,15 @@ static inline int f2fs_clear_bit(unsigned int nr, char *addr)
enum {
FI_NEW_INODE, /* indicate newly allocated inode */
FI_DIRTY_INODE, /* indicate inode is dirty or not */
+ FI_DIRTY_DIR, /* indicate directory has dirty pages */
FI_INC_LINK, /* need to increment i_nlink */
FI_ACL_MODE, /* indicate acl mode */
FI_NO_ALLOC, /* should not allocate any blocks */
FI_UPDATE_DIR, /* should update inode block for consistency */
FI_DELAY_IPUT, /* used for the recovery */
+ FI_NO_EXTENT, /* not to use the extent cache */
FI_INLINE_XATTR, /* used for inline xattr */
+ FI_INLINE_DATA, /* used for inline data*/
};
static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -922,6 +1020,8 @@ static inline void get_inline_info(struct f2fs_inode_info *fi,
{
if (ri->i_inline & F2FS_INLINE_XATTR)
set_inode_flag(fi, FI_INLINE_XATTR);
+ if (ri->i_inline & F2FS_INLINE_DATA)
+ set_inode_flag(fi, FI_INLINE_DATA);
}
static inline void set_raw_inline(struct f2fs_inode_info *fi,
@@ -931,41 +1031,75 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi,
if (is_inode_flag_set(fi, FI_INLINE_XATTR))
ri->i_inline |= F2FS_INLINE_XATTR;
+ if (is_inode_flag_set(fi, FI_INLINE_DATA))
+ ri->i_inline |= F2FS_INLINE_DATA;
+}
+
+static inline int f2fs_has_inline_xattr(struct inode *inode)
+{
+ return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR);
}
static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi)
{
- if (is_inode_flag_set(fi, FI_INLINE_XATTR))
+ if (f2fs_has_inline_xattr(&fi->vfs_inode))
return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS;
return DEF_ADDRS_PER_INODE;
}
static inline void *inline_xattr_addr(struct page *page)
{
- struct f2fs_inode *ri;
- ri = (struct f2fs_inode *)page_address(page);
+ struct f2fs_inode *ri = F2FS_INODE(page);
return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE -
F2FS_INLINE_XATTR_ADDRS]);
}
static inline int inline_xattr_size(struct inode *inode)
{
- if (is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR))
+ if (f2fs_has_inline_xattr(inode))
return F2FS_INLINE_XATTR_ADDRS << 2;
else
return 0;
}
+static inline int f2fs_has_inline_data(struct inode *inode)
+{
+ return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA);
+}
+
+static inline void *inline_data_addr(struct page *page)
+{
+ struct f2fs_inode *ri = F2FS_INODE(page);
+ return (void *)&(ri->i_addr[1]);
+}
+
static inline int f2fs_readonly(struct super_block *sb)
{
return sb->s_flags & MS_RDONLY;
}
+static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
+{
+ set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
+ sbi->sb->s_flags |= MS_RDONLY;
+}
+
+#define get_inode_mode(i) \
+ ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
+ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
+
+/* get offset of first page in next direct node */
+#define PGOFS_OF_NEXT_DNODE(pgofs, fi) \
+ ((pgofs < ADDRS_PER_INODE(fi)) ? ADDRS_PER_INODE(fi) : \
+ (pgofs - ADDRS_PER_INODE(fi) + ADDRS_PER_BLOCK) / \
+ ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi))
+
/*
* file.c
*/
int f2fs_sync_file(struct file *, loff_t, loff_t, int);
void truncate_data_blocks(struct dnode_of_data *);
+int truncate_blocks(struct inode *, u64);
void f2fs_truncate(struct inode *);
int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
int f2fs_setattr(struct dentry *, struct iattr *);
@@ -979,8 +1113,9 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
*/
void f2fs_set_inode_flags(struct inode *);
struct inode *f2fs_iget(struct super_block *, unsigned long);
+int try_to_free_nats(struct f2fs_sb_info *, int);
void update_inode(struct inode *, struct page *);
-int update_inode_page(struct inode *);
+void update_inode_page(struct inode *);
int f2fs_write_inode(struct inode *, struct writeback_control *);
void f2fs_evict_inode(struct inode *);
@@ -1028,12 +1163,16 @@ f2fs_hash_t f2fs_dentry_hash(const char *, size_t);
struct dnode_of_data;
struct node_info;
+bool available_free_memory(struct f2fs_sb_info *, int);
int is_checkpointed_node(struct f2fs_sb_info *, nid_t);
+bool fsync_mark_done(struct f2fs_sb_info *, nid_t);
+void fsync_mark_clear(struct f2fs_sb_info *, nid_t);
void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
int truncate_inode_blocks(struct inode *, pgoff_t);
int truncate_xattr_node(struct inode *, struct page *);
-int remove_inode_page(struct inode *);
+int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t);
+void remove_inode_page(struct inode *);
struct page *new_inode_page(struct inode *, const struct qstr *);
struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
void ra_node_page(struct f2fs_sb_info *, nid_t);
@@ -1046,6 +1185,7 @@ void alloc_nid_done(struct f2fs_sb_info *, nid_t);
void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
void recover_node_page(struct f2fs_sb_info *, struct page *,
struct f2fs_summary *, struct node_info *, block_t);
+bool recover_xattr_data(struct inode *, struct page *, block_t);
int recover_inode_page(struct f2fs_sb_info *, struct page *);
int restore_node_summary(struct f2fs_sb_info *, unsigned int,
struct f2fs_summary_block *);
@@ -1059,24 +1199,30 @@ void destroy_node_manager_caches(void);
* segment.c
*/
void f2fs_balance_fs(struct f2fs_sb_info *);
+void f2fs_balance_fs_bg(struct f2fs_sb_info *);
+int f2fs_issue_flush(struct f2fs_sb_info *);
+int create_flush_cmd_control(struct f2fs_sb_info *);
+void destroy_flush_cmd_control(struct f2fs_sb_info *);
void invalidate_blocks(struct f2fs_sb_info *, block_t);
+void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
void clear_prefree_segments(struct f2fs_sb_info *);
+void discard_next_dnode(struct f2fs_sb_info *);
int npages_for_summary_flush(struct f2fs_sb_info *);
void allocate_new_segments(struct f2fs_sb_info *);
struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
-struct bio *f2fs_bio_alloc(struct block_device *, int);
-void f2fs_submit_bio(struct f2fs_sb_info *, enum page_type, bool);
-void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool);
void write_meta_page(struct f2fs_sb_info *, struct page *);
-void write_node_page(struct f2fs_sb_info *, struct page *, unsigned int,
- block_t, block_t *);
-void write_data_page(struct inode *, struct page *, struct dnode_of_data*,
- block_t, block_t *);
-void rewrite_data_page(struct f2fs_sb_info *, struct page *, block_t);
+void write_node_page(struct f2fs_sb_info *, struct page *,
+ struct f2fs_io_info *, unsigned int, block_t, block_t *);
+void write_data_page(struct page *, struct dnode_of_data *, block_t *,
+ struct f2fs_io_info *);
+void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *);
void recover_data_page(struct f2fs_sb_info *, struct page *,
struct f2fs_summary *, block_t, block_t);
void rewrite_node_page(struct f2fs_sb_info *, struct page *,
struct f2fs_summary *, block_t, block_t);
+void allocate_data_block(struct f2fs_sb_info *, struct page *,
+ block_t, block_t *, struct f2fs_summary *, int);
+void f2fs_wait_on_page_writeback(struct page *, enum page_type);
void write_data_summaries(struct f2fs_sb_info *, block_t);
void write_node_summaries(struct f2fs_sb_info *, block_t);
int lookup_journal_in_cursum(struct f2fs_summary_block *,
@@ -1084,23 +1230,25 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *,
void flush_sit_entries(struct f2fs_sb_info *);
int build_segment_manager(struct f2fs_sb_info *);
void destroy_segment_manager(struct f2fs_sb_info *);
+int __init create_segment_manager_caches(void);
+void destroy_segment_manager_caches(void);
/*
* checkpoint.c
*/
struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
+int ra_meta_pages(struct f2fs_sb_info *, int, int, int);
long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
int acquire_orphan_inode(struct f2fs_sb_info *);
void release_orphan_inode(struct f2fs_sb_info *);
void add_orphan_inode(struct f2fs_sb_info *, nid_t);
void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
-int recover_orphan_inodes(struct f2fs_sb_info *);
+void recover_orphan_inodes(struct f2fs_sb_info *);
int get_valid_checkpoint(struct f2fs_sb_info *);
void set_dirty_dir_page(struct inode *, struct page *);
void add_dirty_dir_inode(struct inode *);
void remove_dirty_dir_inode(struct inode *);
-struct inode *check_dirty_dir_inode(struct f2fs_sb_info *, nid_t);
void sync_dirty_dir_inodes(struct f2fs_sb_info *);
void write_checkpoint(struct f2fs_sb_info *, bool);
void init_orphan_info(struct f2fs_sb_info *);
@@ -1110,13 +1258,18 @@ void destroy_checkpoint_caches(void);
/*
* data.c
*/
+void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int);
+int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, block_t, int);
+void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, block_t,
+ struct f2fs_io_info *);
int reserve_new_block(struct dnode_of_data *);
+int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
void update_extent_cache(block_t, struct dnode_of_data *);
struct page *find_data_page(struct inode *, pgoff_t, bool);
struct page *get_lock_data_page(struct inode *, pgoff_t);
struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
-int f2fs_readpage(struct f2fs_sb_info *, struct page *, block_t, int);
-int do_write_data_page(struct page *);
+int do_write_data_page(struct page *, struct f2fs_io_info *);
+int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
/*
* gc.c
@@ -1149,13 +1302,13 @@ struct f2fs_stat_info {
int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
int nats, sits, fnids;
int total_count, utilization;
- int bg_gc;
+ int bg_gc, inline_inode;
unsigned int valid_count, valid_node_count, valid_inode_count;
unsigned int bimodal, avg_vblocks;
int util_free, util_valid, util_invalid;
int rsvd_segs, overp_segs;
int dirty_count, node_pages, meta_pages;
- int prefree_count, call_count;
+ int prefree_count, call_count, cp_count;
int tot_segs, node_segs, data_segs, free_segs, free_secs;
int tot_blks, data_blks, node_blks;
int curseg[NR_CURSEG_TYPE];
@@ -1169,10 +1322,31 @@ struct f2fs_stat_info {
static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
{
- return (struct f2fs_stat_info*)sbi->stat_info;
+ return (struct f2fs_stat_info *)sbi->stat_info;
}
-#define stat_inc_call_count(si) ((si)->call_count++)
+#define stat_inc_cp_count(si) ((si)->cp_count++)
+#define stat_inc_call_count(si) ((si)->call_count++)
+#define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++)
+#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++)
+#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--)
+#define stat_inc_total_hit(sb) ((F2FS_SB(sb))->total_hit_ext++)
+#define stat_inc_read_hit(sb) ((F2FS_SB(sb))->read_hit_ext++)
+#define stat_inc_inline_inode(inode) \
+ do { \
+ if (f2fs_has_inline_data(inode)) \
+ ((F2FS_SB(inode->i_sb))->inline_inode++); \
+ } while (0)
+#define stat_dec_inline_inode(inode) \
+ do { \
+ if (f2fs_has_inline_data(inode)) \
+ ((F2FS_SB(inode->i_sb))->inline_inode--); \
+ } while (0)
+
+#define stat_inc_seg_type(sbi, curseg) \
+ ((sbi)->segment_count[(curseg)->alloc_type]++)
+#define stat_inc_block_count(sbi, curseg) \
+ ((sbi)->block_count[(curseg)->alloc_type]++)
#define stat_inc_seg_count(sbi, type) \
do { \
@@ -1206,7 +1380,17 @@ void f2fs_destroy_stats(struct f2fs_sb_info *);
void __init f2fs_create_root_stats(void);
void f2fs_destroy_root_stats(void);
#else
+#define stat_inc_cp_count(si)
#define stat_inc_call_count(si)
+#define stat_inc_bggc_count(si)
+#define stat_inc_dirty_dir(sbi)
+#define stat_dec_dirty_dir(sbi)
+#define stat_inc_total_hit(sb)
+#define stat_inc_read_hit(sb)
+#define stat_inc_inline_inode(inode)
+#define stat_dec_inline_inode(inode)
+#define stat_inc_seg_type(sbi, curseg)
+#define stat_inc_block_count(sbi, curseg)
#define stat_inc_seg_count(si, type)
#define stat_inc_tot_blk_count(si, blks)
#define stat_inc_data_blk_count(si, blks)
@@ -1227,4 +1411,14 @@ extern const struct address_space_operations f2fs_meta_aops;
extern const struct inode_operations f2fs_dir_inode_operations;
extern const struct inode_operations f2fs_symlink_inode_operations;
extern const struct inode_operations f2fs_special_inode_operations;
+
+/*
+ * inline.c
+ */
+bool f2fs_may_inline(struct inode *);
+int f2fs_read_inline_data(struct inode *, struct page *);
+int f2fs_convert_inline_data(struct inode *, pgoff_t);
+int f2fs_write_inline_data(struct inode *, struct page *, unsigned int);
+void truncate_inline_data(struct inode *, u64);
+int recover_inline_data(struct inode *, struct page *);
#endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 02c906971cc..7d8b9627509 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -19,6 +19,7 @@
#include <linux/compat.h>
#include <linux/uaccess.h>
#include <linux/mount.h>
+#include <linux/pagevec.h>
#include "f2fs.h"
#include "node.h"
@@ -33,41 +34,26 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
struct page *page = vmf->page;
struct inode *inode = file_inode(vma->vm_file);
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- block_t old_blk_addr;
struct dnode_of_data dn;
- int err, ilock;
+ int err;
f2fs_balance_fs(sbi);
sb_start_pagefault(inode->i_sb);
/* block allocation */
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, page->index, ALLOC_NODE);
- if (err) {
- mutex_unlock_op(sbi, ilock);
+ err = f2fs_reserve_block(&dn, page->index);
+ f2fs_unlock_op(sbi);
+ if (err)
goto out;
- }
-
- old_blk_addr = dn.data_blkaddr;
-
- if (old_blk_addr == NULL_ADDR) {
- err = reserve_new_block(&dn);
- if (err) {
- f2fs_put_dnode(&dn);
- mutex_unlock_op(sbi, ilock);
- goto out;
- }
- }
- f2fs_put_dnode(&dn);
- mutex_unlock_op(sbi, ilock);
file_update_time(vma->vm_file);
lock_page(page);
- if (page->mapping != inode->i_mapping ||
+ if (unlikely(page->mapping != inode->i_mapping ||
page_offset(page) > i_size_read(inode) ||
- !PageUptodate(page)) {
+ !PageUptodate(page))) {
unlock_page(page);
err = -EFAULT;
goto out;
@@ -88,9 +74,10 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
set_page_dirty(page);
SetPageUptodate(page);
+ trace_f2fs_vm_page_mkwrite(page, DATA);
mapped:
/* fill the page */
- wait_on_page_writeback(page);
+ f2fs_wait_on_page_writeback(page, DATA);
out:
sb_end_pagefault(inode->i_sb);
return block_page_mkwrite_return(err);
@@ -98,6 +85,7 @@ out:
static const struct vm_operations_struct f2fs_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = f2fs_vm_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
@@ -125,6 +113,7 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
struct inode *inode = file->f_mapping->host;
+ struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
int ret = 0;
bool need_cp = false;
@@ -134,7 +123,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
.for_reclaim = 0,
};
- if (f2fs_readonly(inode->i_sb))
+ if (unlikely(f2fs_readonly(inode->i_sb)))
return 0;
trace_f2fs_sync_file_enter(inode);
@@ -147,7 +136,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
/* guarantee free sections for fsync */
f2fs_balance_fs(sbi);
- mutex_lock(&inode->i_mutex);
+ down_read(&fi->i_sem);
/*
* Both of fdatasync() and fsync() are able to be recovered from
@@ -164,40 +153,174 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
need_cp = true;
+ up_read(&fi->i_sem);
+
if (need_cp) {
nid_t pino;
- F2FS_I(inode)->xattr_ver = 0;
-
/* all the dirty node pages should be flushed for POR */
ret = f2fs_sync_fs(inode->i_sb, 1);
+
+ down_write(&fi->i_sem);
+ F2FS_I(inode)->xattr_ver = 0;
if (file_wrong_pino(inode) && inode->i_nlink == 1 &&
get_parent_ino(inode, &pino)) {
F2FS_I(inode)->i_pino = pino;
file_got_pino(inode);
+ up_write(&fi->i_sem);
mark_inode_dirty_sync(inode);
ret = f2fs_write_inode(inode, NULL);
if (ret)
goto out;
+ } else {
+ up_write(&fi->i_sem);
}
} else {
/* if there is no written node page, write its inode page */
while (!sync_node_pages(sbi, inode->i_ino, &wbc)) {
+ if (fsync_mark_done(sbi, inode->i_ino))
+ goto out;
mark_inode_dirty_sync(inode);
ret = f2fs_write_inode(inode, NULL);
if (ret)
goto out;
}
- filemap_fdatawait_range(sbi->node_inode->i_mapping,
- 0, LONG_MAX);
- ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ ret = wait_on_node_pages_writeback(sbi, inode->i_ino);
+ if (ret)
+ goto out;
+ ret = f2fs_issue_flush(F2FS_SB(inode->i_sb));
}
out:
- mutex_unlock(&inode->i_mutex);
trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
return ret;
}
+static pgoff_t __get_first_dirty_index(struct address_space *mapping,
+ pgoff_t pgofs, int whence)
+{
+ struct pagevec pvec;
+ int nr_pages;
+
+ if (whence != SEEK_DATA)
+ return 0;
+
+ /* find first dirty page index */
+ pagevec_init(&pvec, 0);
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, PAGECACHE_TAG_DIRTY, 1);
+ pgofs = nr_pages ? pvec.pages[0]->index: LONG_MAX;
+ pagevec_release(&pvec);
+ return pgofs;
+}
+
+static bool __found_offset(block_t blkaddr, pgoff_t dirty, pgoff_t pgofs,
+ int whence)
+{
+ switch (whence) {
+ case SEEK_DATA:
+ if ((blkaddr == NEW_ADDR && dirty == pgofs) ||
+ (blkaddr != NEW_ADDR && blkaddr != NULL_ADDR))
+ return true;
+ break;
+ case SEEK_HOLE:
+ if (blkaddr == NULL_ADDR)
+ return true;
+ break;
+ }
+ return false;
+}
+
+static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ loff_t maxbytes = inode->i_sb->s_maxbytes;
+ struct dnode_of_data dn;
+ pgoff_t pgofs, end_offset, dirty;
+ loff_t data_ofs = offset;
+ loff_t isize;
+ int err = 0;
+
+ mutex_lock(&inode->i_mutex);
+
+ isize = i_size_read(inode);
+ if (offset >= isize)
+ goto fail;
+
+ /* handle inline data case */
+ if (f2fs_has_inline_data(inode)) {
+ if (whence == SEEK_HOLE)
+ data_ofs = isize;
+ goto found;
+ }
+
+ pgofs = (pgoff_t)(offset >> PAGE_CACHE_SHIFT);
+
+ dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence);
+
+ for (; data_ofs < isize; data_ofs = pgofs << PAGE_CACHE_SHIFT) {
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);
+ if (err && err != -ENOENT) {
+ goto fail;
+ } else if (err == -ENOENT) {
+ /* direct node is not exist */
+ if (whence == SEEK_DATA) {
+ pgofs = PGOFS_OF_NEXT_DNODE(pgofs,
+ F2FS_I(inode));
+ continue;
+ } else {
+ goto found;
+ }
+ }
+
+ end_offset = IS_INODE(dn.node_page) ?
+ ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
+
+ /* find data/hole in dnode block */
+ for (; dn.ofs_in_node < end_offset;
+ dn.ofs_in_node++, pgofs++,
+ data_ofs = pgofs << PAGE_CACHE_SHIFT) {
+ block_t blkaddr;
+ blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+
+ if (__found_offset(blkaddr, dirty, pgofs, whence)) {
+ f2fs_put_dnode(&dn);
+ goto found;
+ }
+ }
+ f2fs_put_dnode(&dn);
+ }
+
+ if (whence == SEEK_DATA)
+ goto fail;
+found:
+ if (whence == SEEK_HOLE && data_ofs > isize)
+ data_ofs = isize;
+ mutex_unlock(&inode->i_mutex);
+ return vfs_setpos(file, data_ofs, maxbytes);
+fail:
+ mutex_unlock(&inode->i_mutex);
+ return -ENXIO;
+}
+
+static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *inode = file->f_mapping->host;
+ loff_t maxbytes = inode->i_sb->s_maxbytes;
+
+ switch (whence) {
+ case SEEK_SET:
+ case SEEK_CUR:
+ case SEEK_END:
+ return generic_file_llseek_size(file, offset, whence,
+ maxbytes, i_size_read(inode));
+ case SEEK_DATA:
+ case SEEK_HOLE:
+ return f2fs_seek_block(file, offset, whence);
+ }
+
+ return -EINVAL;
+}
+
static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
file_accessed(file);
@@ -215,7 +338,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
raw_node = F2FS_NODE(dn->node_page);
addr = blkaddr_in_node(raw_node) + ofs;
- for ( ; count > 0; count--, addr++, dn->ofs_in_node++) {
+ for (; count > 0; count--, addr++, dn->ofs_in_node++) {
block_t blkaddr = le32_to_cpu(*addr);
if (blkaddr == NULL_ADDR)
continue;
@@ -246,6 +369,9 @@ static void truncate_partial_data_page(struct inode *inode, u64 from)
unsigned offset = from & (PAGE_CACHE_SIZE - 1);
struct page *page;
+ if (f2fs_has_inline_data(inode))
+ return truncate_inline_data(inode, from);
+
if (!offset)
return;
@@ -254,48 +380,48 @@ static void truncate_partial_data_page(struct inode *inode, u64 from)
return;
lock_page(page);
- if (page->mapping != inode->i_mapping) {
+ if (unlikely(page->mapping != inode->i_mapping)) {
f2fs_put_page(page, 1);
return;
}
- wait_on_page_writeback(page);
+ f2fs_wait_on_page_writeback(page, DATA);
zero_user(page, offset, PAGE_CACHE_SIZE - offset);
set_page_dirty(page);
f2fs_put_page(page, 1);
}
-static int truncate_blocks(struct inode *inode, u64 from)
+int truncate_blocks(struct inode *inode, u64 from)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
unsigned int blocksize = inode->i_sb->s_blocksize;
struct dnode_of_data dn;
pgoff_t free_from;
- int count = 0, ilock = -1;
- int err;
+ int count = 0, err = 0;
trace_f2fs_truncate_blocks_enter(inode, from);
+ if (f2fs_has_inline_data(inode))
+ goto done;
+
free_from = (pgoff_t)
((from + blocksize - 1) >> (sbi->log_blocksize));
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
+
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
if (err) {
if (err == -ENOENT)
goto free_next;
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
trace_f2fs_truncate_blocks_exit(inode, err);
return err;
}
- if (IS_INODE(dn.node_page))
- count = ADDRS_PER_INODE(F2FS_I(inode));
- else
- count = ADDRS_PER_BLOCK;
+ count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
count -= dn.ofs_in_node;
- BUG_ON(count < 0);
+ f2fs_bug_on(count < 0);
if (dn.ofs_in_node || IS_INODE(dn.node_page)) {
truncate_data_blocks_range(&dn, count);
@@ -305,8 +431,8 @@ static int truncate_blocks(struct inode *inode, u64 from)
f2fs_put_dnode(&dn);
free_next:
err = truncate_inode_blocks(inode, free_from);
- mutex_unlock_op(sbi, ilock);
-
+ f2fs_unlock_op(sbi);
+done:
/* lastly zero out the first data page */
truncate_partial_data_page(inode, from);
@@ -380,6 +506,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) &&
attr->ia_size != i_size_read(inode)) {
+ err = f2fs_convert_inline_data(inode, attr->ia_size);
+ if (err)
+ return err;
+
truncate_setsize(inode, attr->ia_size);
f2fs_truncate(inode);
f2fs_balance_fs(F2FS_SB(inode->i_sb));
@@ -388,7 +518,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
__setattr_copy(inode, attr);
if (attr->ia_valid & ATTR_MODE) {
- err = f2fs_acl_chmod(inode);
+ err = posix_acl_chmod(inode, get_inode_mode(inode));
if (err || is_inode_flag_set(fi, FI_ACL_MODE)) {
inode->i_mode = fi->i_acl_mode;
clear_inode_flag(fi, FI_ACL_MODE);
@@ -403,12 +533,14 @@ const struct inode_operations f2fs_file_inode_operations = {
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
.get_acl = f2fs_get_acl,
+ .set_acl = f2fs_set_acl,
#ifdef CONFIG_F2FS_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = f2fs_listxattr,
.removexattr = generic_removexattr,
#endif
+ .fiemap = f2fs_fiemap,
};
static void fill_zero(struct inode *inode, pgoff_t index,
@@ -416,19 +548,18 @@ static void fill_zero(struct inode *inode, pgoff_t index,
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct page *page;
- int ilock;
if (!len)
return;
f2fs_balance_fs(sbi);
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
page = get_new_data_page(inode, NULL, index, false);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
if (!IS_ERR(page)) {
- wait_on_page_writeback(page);
+ f2fs_wait_on_page_writeback(page, DATA);
zero_user(page, start, len);
set_page_dirty(page);
f2fs_put_page(page, 1);
@@ -458,12 +589,16 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
return 0;
}
-static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode)
+static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
pgoff_t pg_start, pg_end;
loff_t off_start, off_end;
int ret = 0;
+ ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1);
+ if (ret)
+ return ret;
+
pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
@@ -484,7 +619,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode)
struct address_space *mapping = inode->i_mapping;
loff_t blk_start, blk_end;
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- int ilock;
f2fs_balance_fs(sbi);
@@ -493,18 +627,12 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len, int mode)
truncate_inode_pages_range(mapping, blk_start,
blk_end - 1);
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
ret = truncate_hole(inode, pg_start, pg_end);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
}
}
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- i_size_read(inode) <= (offset + len)) {
- i_size_write(inode, offset);
- mark_inode_dirty(inode);
- }
-
return ret;
}
@@ -521,35 +649,29 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
if (ret)
return ret;
+ ret = f2fs_convert_inline_data(inode, offset + len);
+ if (ret)
+ return ret;
+
pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
off_start = offset & (PAGE_CACHE_SIZE - 1);
off_end = (offset + len) & (PAGE_CACHE_SIZE - 1);
+ f2fs_lock_op(sbi);
+
for (index = pg_start; index <= pg_end; index++) {
struct dnode_of_data dn;
- int ilock;
- ilock = mutex_lock_op(sbi);
+ if (index == pg_end && !off_end)
+ goto noalloc;
+
set_new_dnode(&dn, inode, NULL, NULL, 0);
- ret = get_dnode_of_data(&dn, index, ALLOC_NODE);
- if (ret) {
- mutex_unlock_op(sbi, ilock);
+ ret = f2fs_reserve_block(&dn, index);
+ if (ret)
break;
- }
-
- if (dn.data_blkaddr == NULL_ADDR) {
- ret = reserve_new_block(&dn);
- if (ret) {
- f2fs_put_dnode(&dn);
- mutex_unlock_op(sbi, ilock);
- break;
- }
- }
- f2fs_put_dnode(&dn);
- mutex_unlock_op(sbi, ilock);
-
+noalloc:
if (pg_start == pg_end)
new_size = offset + len;
else if (index == pg_start && off_start)
@@ -564,7 +686,9 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
i_size_read(inode) < new_size) {
i_size_write(inode, new_size);
mark_inode_dirty(inode);
+ update_inode_page(inode);
}
+ f2fs_unlock_op(sbi);
return ret;
}
@@ -578,8 +702,10 @@ static long f2fs_fallocate(struct file *file, int mode,
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
+ mutex_lock(&inode->i_mutex);
+
if (mode & FALLOC_FL_PUNCH_HOLE)
- ret = punch_hole(inode, offset, len, mode);
+ ret = punch_hole(inode, offset, len);
else
ret = expand_inode_data(inode, offset, len, mode);
@@ -587,6 +713,9 @@ static long f2fs_fallocate(struct file *file, int mode,
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(inode);
}
+
+ mutex_unlock(&inode->i_mutex);
+
trace_f2fs_fallocate(inode, mode, offset, len, ret);
return ret;
}
@@ -682,11 +811,11 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
#endif
const struct file_operations f2fs_file_operations = {
- .llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .llseek = f2fs_llseek,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.open = generic_file_open,
.mmap = f2fs_file_mmap,
.fsync = f2fs_sync_file,
@@ -696,5 +825,5 @@ const struct file_operations f2fs_file_operations = {
.compat_ioctl = f2fs_compat_ioctl,
#endif
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
};
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 2f157e88368..b90dbe55403 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -77,13 +77,15 @@ static int gc_thread_func(void *data)
else
wait_ms = increase_sleep_time(gc_th, wait_ms);
-#ifdef CONFIG_F2FS_STAT_FS
- sbi->bg_gc++;
-#endif
+ stat_inc_bggc_count(sbi);
/* if return value is not zero, no victim was selected */
if (f2fs_gc(sbi))
wait_ms = gc_th->no_gc_sleep_time;
+
+ /* balancing f2fs's metadata periodically */
+ f2fs_balance_fs_bg(sbi);
+
} while (!kthread_should_stop());
return 0;
}
@@ -117,7 +119,6 @@ int start_gc_thread(struct f2fs_sb_info *sbi)
kfree(gc_th);
sbi->gc_thread = NULL;
}
-
out:
return err;
}
@@ -162,8 +163,8 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
p->ofs_unit = sbi->segs_per_sec;
}
- if (p->max_search > MAX_VICTIM_SEARCH)
- p->max_search = MAX_VICTIM_SEARCH;
+ if (p->max_search > sbi->max_victim_search)
+ p->max_search = sbi->max_victim_search;
p->offset = sbi->last_victim[p->gc_mode];
}
@@ -236,8 +237,8 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
}
-static unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno,
- struct victim_sel_policy *p)
+static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
+ unsigned int segno, struct victim_sel_policy *p)
{
if (p->alloc_mode == SSR)
return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
@@ -293,7 +294,11 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
}
break;
}
- p.offset = ((segno / p.ofs_unit) * p.ofs_unit) + p.ofs_unit;
+
+ p.offset = segno + p.ofs_unit;
+ if (p.ofs_unit > 1)
+ p.offset -= segno % p.ofs_unit;
+
secno = GET_SECNO(sbi, segno);
if (sec_usage_check(sbi, secno))
@@ -306,10 +311,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
if (p.min_cost > cost) {
p.min_segno = segno;
p.min_cost = cost;
- }
-
- if (cost == max_cost)
+ } else if (unlikely(cost == max_cost)) {
continue;
+ }
if (nsearched++ >= p.max_search) {
sbi->last_victim[p.gc_mode] = segno;
@@ -358,12 +362,8 @@ static void add_gc_inode(struct inode *inode, struct list_head *ilist)
iput(inode);
return;
}
-repeat:
- new_ie = kmem_cache_alloc(winode_slab, GFP_NOFS);
- if (!new_ie) {
- cond_resched();
- goto repeat;
- }
+
+ new_ie = f2fs_kmem_cache_alloc(winode_slab, GFP_NOFS);
new_ie->inode = inode;
list_add_tail(&new_ie->list, ilist);
}
@@ -428,7 +428,7 @@ next_step:
/* set page dirty and write it */
if (gc_type == FG_GC) {
- f2fs_wait_on_page_writeback(node_page, NODE, true);
+ f2fs_wait_on_page_writeback(node_page, NODE);
set_page_dirty(node_page);
} else {
if (!PageWriteback(node_page))
@@ -520,23 +520,23 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
static void move_data_page(struct inode *inode, struct page *page, int gc_type)
{
+ struct f2fs_io_info fio = {
+ .type = DATA,
+ .rw = WRITE_SYNC,
+ };
+
if (gc_type == BG_GC) {
if (PageWriteback(page))
goto out;
set_page_dirty(page);
set_cold_data(page);
} else {
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
-
- f2fs_wait_on_page_writeback(page, DATA, true);
+ f2fs_wait_on_page_writeback(page, DATA);
- if (clear_page_dirty_for_io(page) &&
- S_ISDIR(inode->i_mode)) {
- dec_page_count(sbi, F2FS_DIRTY_DENTS);
+ if (clear_page_dirty_for_io(page))
inode_dec_dirty_dents(inode);
- }
set_cold_data(page);
- do_write_data_page(page);
+ do_write_data_page(page, &fio);
clear_cold_data(page);
}
out:
@@ -630,7 +630,7 @@ next_iput:
goto next_step;
if (gc_type == FG_GC) {
- f2fs_submit_bio(sbi, DATA, true);
+ f2fs_submit_merged_bio(sbi, DATA, WRITE);
/*
* In the case of FG_GC, it'd be better to reclaim this victim
@@ -663,8 +663,6 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
/* read segment summary of victim */
sum_page = get_sum_page(sbi, segno);
- if (IS_ERR(sum_page))
- return;
blk_start_plug(&plug);
@@ -696,7 +694,9 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
INIT_LIST_HEAD(&ilist);
gc_more:
- if (!(sbi->sb->s_flags & MS_ACTIVE))
+ if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
+ goto stop;
+ if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
goto stop;
if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
@@ -708,6 +708,11 @@ gc_more:
goto stop;
ret = 0;
+ /* readahead multi ssa blocks those have contiguous address */
+ if (sbi->segs_per_sec > 1)
+ ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec,
+ META_SSA);
+
for (i = 0; i < sbi->segs_per_sec; i++)
do_garbage_collect(sbi, segno + i, &ilist, gc_type);
@@ -737,7 +742,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi)
int __init create_gc_caches(void)
{
winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes",
- sizeof(struct inode_entry), NULL);
+ sizeof(struct inode_entry));
if (!winode_slab)
return -ENOMEM;
return 0;
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 507056d2220..5d5eb6047bf 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -20,7 +20,7 @@
#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */
/* Search max. number of dirty segments to select a victim segment */
-#define MAX_VICTIM_SEARCH 4096 /* covers 8GB */
+#define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */
struct f2fs_gc_kthread {
struct task_struct *f2fs_gc_task;
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
new file mode 100644
index 00000000000..1bba5228c19
--- /dev/null
+++ b/fs/f2fs/inline.c
@@ -0,0 +1,250 @@
+/*
+ * fs/f2fs/inline.c
+ * Copyright (c) 2013, Intel Corporation
+ * Authors: Huajun Li <huajun.li@intel.com>
+ * Haicheng Li <haicheng.li@intel.com>
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+
+#include "f2fs.h"
+
+bool f2fs_may_inline(struct inode *inode)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ block_t nr_blocks;
+ loff_t i_size;
+
+ if (!test_opt(sbi, INLINE_DATA))
+ return false;
+
+ nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2;
+ if (inode->i_blocks > nr_blocks)
+ return false;
+
+ i_size = i_size_read(inode);
+ if (i_size > MAX_INLINE_DATA)
+ return false;
+
+ return true;
+}
+
+int f2fs_read_inline_data(struct inode *inode, struct page *page)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct page *ipage;
+ void *src_addr, *dst_addr;
+
+ if (page->index) {
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ goto out;
+ }
+
+ ipage = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(ipage)) {
+ unlock_page(page);
+ return PTR_ERR(ipage);
+ }
+
+ zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
+
+ /* Copy the whole inline data block */
+ src_addr = inline_data_addr(ipage);
+ dst_addr = kmap(page);
+ memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
+ kunmap(page);
+ f2fs_put_page(ipage, 1);
+
+out:
+ SetPageUptodate(page);
+ unlock_page(page);
+
+ return 0;
+}
+
+static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
+{
+ int err;
+ struct page *ipage;
+ struct dnode_of_data dn;
+ void *src_addr, *dst_addr;
+ block_t new_blk_addr;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_io_info fio = {
+ .type = DATA,
+ .rw = WRITE_SYNC | REQ_PRIO,
+ };
+
+ f2fs_lock_op(sbi);
+ ipage = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(ipage)) {
+ err = PTR_ERR(ipage);
+ goto out;
+ }
+
+ /*
+ * i_addr[0] is not used for inline data,
+ * so reserving new block will not destroy inline data
+ */
+ set_new_dnode(&dn, inode, ipage, NULL, 0);
+ err = f2fs_reserve_block(&dn, 0);
+ if (err)
+ goto out;
+
+ f2fs_wait_on_page_writeback(page, DATA);
+ zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE);
+
+ /* Copy the whole inline data block */
+ src_addr = inline_data_addr(ipage);
+ dst_addr = kmap(page);
+ memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
+ kunmap(page);
+ SetPageUptodate(page);
+
+ /* write data page to try to make data consistent */
+ set_page_writeback(page);
+ write_data_page(page, &dn, &new_blk_addr, &fio);
+ update_extent_cache(new_blk_addr, &dn);
+ f2fs_wait_on_page_writeback(page, DATA);
+
+ /* clear inline data and flag after data writeback */
+ zero_user_segment(ipage, INLINE_DATA_OFFSET,
+ INLINE_DATA_OFFSET + MAX_INLINE_DATA);
+ clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
+ stat_dec_inline_inode(inode);
+
+ sync_inode_page(&dn);
+ f2fs_put_dnode(&dn);
+out:
+ f2fs_unlock_op(sbi);
+ return err;
+}
+
+int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size)
+{
+ struct page *page;
+ int err;
+
+ if (!f2fs_has_inline_data(inode))
+ return 0;
+ else if (to_size <= MAX_INLINE_DATA)
+ return 0;
+
+ page = grab_cache_page(inode->i_mapping, 0);
+ if (!page)
+ return -ENOMEM;
+
+ err = __f2fs_convert_inline_data(inode, page);
+ f2fs_put_page(page, 1);
+ return err;
+}
+
+int f2fs_write_inline_data(struct inode *inode,
+ struct page *page, unsigned size)
+{
+ void *src_addr, *dst_addr;
+ struct page *ipage;
+ struct dnode_of_data dn;
+ int err;
+
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = get_dnode_of_data(&dn, 0, LOOKUP_NODE);
+ if (err)
+ return err;
+ ipage = dn.inode_page;
+
+ f2fs_wait_on_page_writeback(ipage, NODE);
+ zero_user_segment(ipage, INLINE_DATA_OFFSET,
+ INLINE_DATA_OFFSET + MAX_INLINE_DATA);
+ src_addr = kmap(page);
+ dst_addr = inline_data_addr(ipage);
+ memcpy(dst_addr, src_addr, size);
+ kunmap(page);
+
+ /* Release the first data block if it is allocated */
+ if (!f2fs_has_inline_data(inode)) {
+ truncate_data_blocks_range(&dn, 1);
+ set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
+ stat_inc_inline_inode(inode);
+ }
+
+ sync_inode_page(&dn);
+ f2fs_put_dnode(&dn);
+
+ return 0;
+}
+
+void truncate_inline_data(struct inode *inode, u64 from)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct page *ipage;
+
+ if (from >= MAX_INLINE_DATA)
+ return;
+
+ ipage = get_node_page(sbi, inode->i_ino);
+ if (IS_ERR(ipage))
+ return;
+
+ f2fs_wait_on_page_writeback(ipage, NODE);
+
+ zero_user_segment(ipage, INLINE_DATA_OFFSET + from,
+ INLINE_DATA_OFFSET + MAX_INLINE_DATA);
+ set_page_dirty(ipage);
+ f2fs_put_page(ipage, 1);
+}
+
+int recover_inline_data(struct inode *inode, struct page *npage)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_inode *ri = NULL;
+ void *src_addr, *dst_addr;
+ struct page *ipage;
+
+ /*
+ * The inline_data recovery policy is as follows.
+ * [prev.] [next] of inline_data flag
+ * o o -> recover inline_data
+ * o x -> remove inline_data, and then recover data blocks
+ * x o -> remove inline_data, and then recover inline_data
+ * x x -> recover data blocks
+ */
+ if (IS_INODE(npage))
+ ri = F2FS_INODE(npage);
+
+ if (f2fs_has_inline_data(inode) &&
+ ri && ri->i_inline & F2FS_INLINE_DATA) {
+process_inline:
+ ipage = get_node_page(sbi, inode->i_ino);
+ f2fs_bug_on(IS_ERR(ipage));
+
+ f2fs_wait_on_page_writeback(ipage, NODE);
+
+ src_addr = inline_data_addr(npage);
+ dst_addr = inline_data_addr(ipage);
+ memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
+ update_inode(inode, ipage);
+ f2fs_put_page(ipage, 1);
+ return -1;
+ }
+
+ if (f2fs_has_inline_data(inode)) {
+ ipage = get_node_page(sbi, inode->i_ino);
+ f2fs_bug_on(IS_ERR(ipage));
+ f2fs_wait_on_page_writeback(ipage, NODE);
+ zero_user_segment(ipage, INLINE_DATA_OFFSET,
+ INLINE_DATA_OFFSET + MAX_INLINE_DATA);
+ clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
+ update_inode(inode, ipage);
+ f2fs_put_page(ipage, 1);
+ } else if (ri && ri->i_inline & F2FS_INLINE_DATA) {
+ truncate_blocks(inode, 0);
+ set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
+ goto process_inline;
+ }
+ return 0;
+}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 9339cd29204..2cf6962f6cc 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -12,6 +12,7 @@
#include <linux/f2fs_fs.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
+#include <linux/bitops.h>
#include "f2fs.h"
#include "node.h"
@@ -21,20 +22,49 @@
void f2fs_set_inode_flags(struct inode *inode)
{
unsigned int flags = F2FS_I(inode)->i_flags;
-
- inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE |
- S_NOATIME | S_DIRSYNC);
+ unsigned int new_fl = 0;
if (flags & FS_SYNC_FL)
- inode->i_flags |= S_SYNC;
+ new_fl |= S_SYNC;
if (flags & FS_APPEND_FL)
- inode->i_flags |= S_APPEND;
+ new_fl |= S_APPEND;
if (flags & FS_IMMUTABLE_FL)
- inode->i_flags |= S_IMMUTABLE;
+ new_fl |= S_IMMUTABLE;
if (flags & FS_NOATIME_FL)
- inode->i_flags |= S_NOATIME;
+ new_fl |= S_NOATIME;
if (flags & FS_DIRSYNC_FL)
- inode->i_flags |= S_DIRSYNC;
+ new_fl |= S_DIRSYNC;
+ set_mask_bits(&inode->i_flags,
+ S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC, new_fl);
+}
+
+static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
+{
+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
+ if (ri->i_addr[0])
+ inode->i_rdev =
+ old_decode_dev(le32_to_cpu(ri->i_addr[0]));
+ else
+ inode->i_rdev =
+ new_decode_dev(le32_to_cpu(ri->i_addr[1]));
+ }
+}
+
+static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
+{
+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+ if (old_valid_dev(inode->i_rdev)) {
+ ri->i_addr[0] =
+ cpu_to_le32(old_encode_dev(inode->i_rdev));
+ ri->i_addr[1] = 0;
+ } else {
+ ri->i_addr[0] = 0;
+ ri->i_addr[1] =
+ cpu_to_le32(new_encode_dev(inode->i_rdev));
+ ri->i_addr[2] = 0;
+ }
+ }
}
static int do_read_inode(struct inode *inode)
@@ -42,13 +72,13 @@ static int do_read_inode(struct inode *inode)
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct page *node_page;
- struct f2fs_node *rn;
struct f2fs_inode *ri;
/* Check if ino is within scope */
if (check_nid_range(sbi, inode->i_ino)) {
f2fs_msg(inode->i_sb, KERN_ERR, "bad inode number: %lu",
(unsigned long) inode->i_ino);
+ WARN_ON(1);
return -EINVAL;
}
@@ -56,8 +86,7 @@ static int do_read_inode(struct inode *inode)
if (IS_ERR(node_page))
return PTR_ERR(node_page);
- rn = F2FS_NODE(node_page);
- ri = &(rn->i);
+ ri = F2FS_INODE(node_page);
inode->i_mode = le16_to_cpu(ri->i_mode);
i_uid_write(inode, le32_to_cpu(ri->i_uid));
@@ -73,10 +102,6 @@ static int do_read_inode(struct inode *inode)
inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
inode->i_generation = le32_to_cpu(ri->i_generation);
- if (ri->i_addr[0])
- inode->i_rdev = old_decode_dev(le32_to_cpu(ri->i_addr[0]));
- else
- inode->i_rdev = new_decode_dev(le32_to_cpu(ri->i_addr[1]));
fi->i_current_depth = le32_to_cpu(ri->i_current_depth);
fi->i_xattr_nid = le32_to_cpu(ri->i_xattr_nid);
@@ -84,8 +109,14 @@ static int do_read_inode(struct inode *inode)
fi->flags = 0;
fi->i_advise = ri->i_advise;
fi->i_pino = le32_to_cpu(ri->i_pino);
+ fi->i_dir_level = ri->i_dir_level;
+
get_extent_info(&fi->ext, ri->i_ext);
get_inline_info(fi, ri);
+
+ /* get rdev by using inline_info */
+ __get_inode_rdev(inode, ri);
+
f2fs_put_page(node_page, 1);
return 0;
}
@@ -149,13 +180,11 @@ bad_inode:
void update_inode(struct inode *inode, struct page *node_page)
{
- struct f2fs_node *rn;
struct f2fs_inode *ri;
- f2fs_wait_on_page_writeback(node_page, NODE, false);
+ f2fs_wait_on_page_writeback(node_page, NODE);
- rn = F2FS_NODE(node_page);
- ri = &(rn->i);
+ ri = F2FS_INODE(node_page);
ri->i_mode = cpu_to_le16(inode->i_mode);
ri->i_advise = F2FS_I(inode)->i_advise;
@@ -178,43 +207,38 @@ void update_inode(struct inode *inode, struct page *node_page)
ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags);
ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino);
ri->i_generation = cpu_to_le32(inode->i_generation);
+ ri->i_dir_level = F2FS_I(inode)->i_dir_level;
- if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
- if (old_valid_dev(inode->i_rdev)) {
- ri->i_addr[0] =
- cpu_to_le32(old_encode_dev(inode->i_rdev));
- ri->i_addr[1] = 0;
- } else {
- ri->i_addr[0] = 0;
- ri->i_addr[1] =
- cpu_to_le32(new_encode_dev(inode->i_rdev));
- ri->i_addr[2] = 0;
- }
- }
-
+ __set_inode_rdev(inode, ri);
set_cold_node(inode, node_page);
set_page_dirty(node_page);
+
clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
}
-int update_inode_page(struct inode *inode)
+void update_inode_page(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct page *node_page;
-
+retry:
node_page = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(node_page))
- return PTR_ERR(node_page);
-
+ if (IS_ERR(node_page)) {
+ int err = PTR_ERR(node_page);
+ if (err == -ENOMEM) {
+ cond_resched();
+ goto retry;
+ } else if (err != -ENOENT) {
+ f2fs_stop_checkpoint(sbi);
+ }
+ return;
+ }
update_inode(inode, node_page);
f2fs_put_page(node_page, 1);
- return 0;
}
int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- int ret, ilock;
if (inode->i_ino == F2FS_NODE_INO(sbi) ||
inode->i_ino == F2FS_META_INO(sbi))
@@ -227,14 +251,14 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
* We need to lock here to prevent from producing dirty node pages
* during the urgent cleaning time when runing out of free sections.
*/
- ilock = mutex_lock_op(sbi);
- ret = update_inode_page(inode);
- mutex_unlock_op(sbi, ilock);
+ f2fs_lock_op(sbi);
+ update_inode_page(inode);
+ f2fs_unlock_op(sbi);
if (wbc)
f2fs_balance_fs(sbi);
- return ret;
+ return 0;
}
/*
@@ -243,16 +267,15 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
void f2fs_evict_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- int ilock;
trace_f2fs_evict_inode(inode);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
if (inode->i_ino == F2FS_NODE_INO(sbi) ||
inode->i_ino == F2FS_META_INO(sbi))
goto no_delete;
- BUG_ON(atomic_read(&F2FS_I(inode)->dirty_dents));
+ f2fs_bug_on(get_dirty_dents(inode));
remove_dirty_dir_inode(inode);
if (inode->i_nlink || is_bad_inode(inode))
@@ -265,11 +288,13 @@ void f2fs_evict_inode(struct inode *inode)
if (F2FS_HAS_BLOCKS(inode))
f2fs_truncate(inode);
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
remove_inode_page(inode);
- mutex_unlock_op(sbi, ilock);
+ stat_dec_inline_inode(inode);
+ f2fs_unlock_op(sbi);
sb_end_intwrite(inode->i_sb);
no_delete:
clear_inode(inode);
+ invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 2a5359c990f..a6bdddc33ce 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -27,32 +27,23 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
nid_t ino;
struct inode *inode;
bool nid_free = false;
- int err, ilock;
+ int err;
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
if (!alloc_nid(sbi, &ino)) {
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
err = -ENOSPC;
goto fail;
}
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
- inode->i_uid = current_fsuid();
-
- if (dir->i_mode & S_ISGID) {
- inode->i_gid = dir->i_gid;
- if (S_ISDIR(mode))
- mode |= S_ISGID;
- } else {
- inode->i_gid = current_fsgid();
- }
+ inode_init_owner(inode, dir, mode);
inode->i_ino = ino;
- inode->i_mode = mode;
inode->i_blocks = 0;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
inode->i_generation = sbi->s_next_generation++;
@@ -115,7 +106,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct inode *inode;
nid_t ino = 0;
- int err, ilock;
+ int err;
f2fs_balance_fs(sbi);
@@ -131,9 +122,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
inode->i_mapping->a_ops = &f2fs_dblock_aops;
ino = inode->i_ino;
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
if (err)
goto out;
@@ -157,7 +148,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
struct inode *inode = old_dentry->d_inode;
struct super_block *sb = dir->i_sb;
struct f2fs_sb_info *sbi = F2FS_SB(sb);
- int err, ilock;
+ int err;
f2fs_balance_fs(sbi);
@@ -165,9 +156,9 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
ihold(inode);
set_inode_flag(F2FS_I(inode), FI_INC_LINK);
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
if (err)
goto out;
@@ -207,6 +198,8 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
inode = f2fs_iget(dir->i_sb, ino);
if (IS_ERR(inode))
return ERR_CAST(inode);
+
+ stat_inc_inline_inode(inode);
}
return d_splice_alias(inode, dentry);
@@ -220,7 +213,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
struct f2fs_dir_entry *de;
struct page *page;
int err = -ENOENT;
- int ilock;
trace_f2fs_unlink_enter(dir, dentry);
f2fs_balance_fs(sbi);
@@ -229,16 +221,16 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
if (!de)
goto fail;
+ f2fs_lock_op(sbi);
err = acquire_orphan_inode(sbi);
if (err) {
+ f2fs_unlock_op(sbi);
kunmap(page);
f2fs_put_page(page, 0);
goto fail;
}
-
- ilock = mutex_lock_op(sbi);
f2fs_delete_entry(de, page, inode);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
/* In order to evict this inode, we set it dirty */
mark_inode_dirty(inode);
@@ -254,7 +246,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct inode *inode;
size_t symlen = strlen(symname) + 1;
- int err, ilock;
+ int err;
f2fs_balance_fs(sbi);
@@ -265,9 +257,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_op = &f2fs_symlink_inode_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
if (err)
goto out;
@@ -290,7 +282,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
struct inode *inode;
- int err, ilock;
+ int err;
f2fs_balance_fs(sbi);
@@ -304,9 +296,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
set_inode_flag(F2FS_I(inode), FI_INC_LINK);
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
if (err)
goto out_fail;
@@ -342,7 +334,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct inode *inode;
int err = 0;
- int ilock;
if (!new_valid_dev(rdev))
return -EINVAL;
@@ -356,9 +347,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &f2fs_special_inode_operations;
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
if (err)
goto out;
@@ -387,7 +378,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct f2fs_dir_entry *old_dir_entry = NULL;
struct f2fs_dir_entry *old_entry;
struct f2fs_dir_entry *new_entry;
- int err = -ENOENT, ilock = -1;
+ int err = -ENOENT;
f2fs_balance_fs(sbi);
@@ -402,7 +393,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out_old;
}
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
if (new_inode) {
@@ -428,9 +419,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
new_inode->i_ctime = CURRENT_TIME;
+ down_write(&F2FS_I(new_inode)->i_sem);
if (old_dir_entry)
drop_nlink(new_inode);
drop_nlink(new_inode);
+ up_write(&F2FS_I(new_inode)->i_sem);
+
+ mark_inode_dirty(new_inode);
if (!new_inode->i_nlink)
add_orphan_inode(sbi, new_inode->i_ino);
@@ -450,6 +445,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
}
+ down_write(&F2FS_I(old_inode)->i_sem);
+ file_lost_pino(old_inode);
+ up_write(&F2FS_I(old_inode)->i_sem);
+
old_inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(old_inode);
@@ -459,25 +458,28 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (old_dir != new_dir) {
f2fs_set_link(old_inode, old_dir_entry,
old_dir_page, new_dir);
+ update_inode_page(old_inode);
} else {
kunmap(old_dir_page);
f2fs_put_page(old_dir_page, 0);
}
drop_nlink(old_dir);
+ mark_inode_dirty(old_dir);
update_inode_page(old_dir);
}
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
return 0;
put_out_dir:
- f2fs_put_page(new_page, 1);
+ kunmap(new_page);
+ f2fs_put_page(new_page, 0);
out_dir:
if (old_dir_entry) {
kunmap(old_dir_page);
f2fs_put_page(old_dir_page, 0);
}
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
out_old:
kunmap(old_page);
f2fs_put_page(old_page, 0);
@@ -498,6 +500,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
.get_acl = f2fs_get_acl,
+ .set_acl = f2fs_set_acl,
#ifdef CONFIG_F2FS_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
@@ -524,6 +527,7 @@ const struct inode_operations f2fs_special_inode_operations = {
.getattr = f2fs_getattr,
.setattr = f2fs_setattr,
.get_acl = f2fs_get_acl,
+ .set_acl = f2fs_set_acl,
#ifdef CONFIG_F2FS_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 51ef2789443..4b697ccc9b0 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -21,9 +21,35 @@
#include "segment.h"
#include <trace/events/f2fs.h>
+#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock)
+
static struct kmem_cache *nat_entry_slab;
static struct kmem_cache *free_nid_slab;
+bool available_free_memory(struct f2fs_sb_info *sbi, int type)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct sysinfo val;
+ unsigned long mem_size = 0;
+ bool res = false;
+
+ si_meminfo(&val);
+ /* give 25%, 25%, 50% memory for each components respectively */
+ if (type == FREE_NIDS) {
+ mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> 12;
+ res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 2);
+ } else if (type == NAT_ENTRIES) {
+ mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> 12;
+ res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 2);
+ } else if (type == DIRTY_DENTS) {
+ if (sbi->sb->s_bdi->dirty_exceeded)
+ return false;
+ mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
+ res = mem_size < ((val.totalram * nm_i->ram_thresh / 100) >> 1);
+ }
+ return res;
+}
+
static void clear_node_page_dirty(struct page *page)
{
struct address_space *mapping = page->mapping;
@@ -82,40 +108,6 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
return dst_page;
}
-/*
- * Readahead NAT pages
- */
-static void ra_nat_pages(struct f2fs_sb_info *sbi, int nid)
-{
- struct address_space *mapping = sbi->meta_inode->i_mapping;
- struct f2fs_nm_info *nm_i = NM_I(sbi);
- struct blk_plug plug;
- struct page *page;
- pgoff_t index;
- int i;
-
- blk_start_plug(&plug);
-
- for (i = 0; i < FREE_NID_PAGES; i++, nid += NAT_ENTRY_PER_BLOCK) {
- if (nid >= nm_i->max_nid)
- nid = 0;
- index = current_nat_addr(sbi, nid);
-
- page = grab_cache_page(mapping, index);
- if (!page)
- continue;
- if (PageUptodate(page)) {
- f2fs_put_page(page, 1);
- continue;
- }
- if (f2fs_readpage(sbi, page, index, READ))
- continue;
-
- f2fs_put_page(page, 0);
- }
- blk_finish_plug(&plug);
-}
-
static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
{
return radix_tree_lookup(&nm_i->nat_root, n);
@@ -149,6 +141,32 @@ int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
return is_cp;
}
+bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct nat_entry *e;
+ bool fsync_done = false;
+
+ read_lock(&nm_i->nat_tree_lock);
+ e = __lookup_nat_cache(nm_i, nid);
+ if (e)
+ fsync_done = e->fsync_done;
+ read_unlock(&nm_i->nat_tree_lock);
+ return fsync_done;
+}
+
+void fsync_mark_clear(struct f2fs_sb_info *sbi, nid_t nid)
+{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+ struct nat_entry *e;
+
+ write_lock(&nm_i->nat_tree_lock);
+ e = __lookup_nat_cache(nm_i, nid);
+ if (e)
+ e->fsync_done = false;
+ write_unlock(&nm_i->nat_tree_lock);
+}
+
static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
{
struct nat_entry *new;
@@ -162,6 +180,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
}
memset(new, 0, sizeof(struct nat_entry));
nat_set_nid(new, nid);
+ new->checkpointed = true;
list_add_tail(&new->list, &nm_i->nat_entries);
nm_i->nat_cnt++;
return new;
@@ -180,16 +199,13 @@ retry:
write_unlock(&nm_i->nat_tree_lock);
goto retry;
}
- nat_set_blkaddr(e, le32_to_cpu(ne->block_addr));
- nat_set_ino(e, le32_to_cpu(ne->ino));
- nat_set_version(e, ne->version);
- e->checkpointed = true;
+ node_info_from_raw_nat(&e->ni, ne);
}
write_unlock(&nm_i->nat_tree_lock);
}
static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
- block_t new_blkaddr)
+ block_t new_blkaddr, bool fsync_done)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct nat_entry *e;
@@ -203,8 +219,7 @@ retry:
goto retry;
}
e->ni = *ni;
- e->checkpointed = true;
- BUG_ON(ni->blk_addr == NEW_ADDR);
+ f2fs_bug_on(ni->blk_addr == NEW_ADDR);
} else if (new_blkaddr == NEW_ADDR) {
/*
* when nid is reallocated,
@@ -212,19 +227,16 @@ retry:
* So, reinitialize it with new information.
*/
e->ni = *ni;
- BUG_ON(ni->blk_addr != NULL_ADDR);
+ f2fs_bug_on(ni->blk_addr != NULL_ADDR);
}
- if (new_blkaddr == NEW_ADDR)
- e->checkpointed = false;
-
/* sanity check */
- BUG_ON(nat_get_blkaddr(e) != ni->blk_addr);
- BUG_ON(nat_get_blkaddr(e) == NULL_ADDR &&
+ f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr);
+ f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR &&
new_blkaddr == NULL_ADDR);
- BUG_ON(nat_get_blkaddr(e) == NEW_ADDR &&
+ f2fs_bug_on(nat_get_blkaddr(e) == NEW_ADDR &&
new_blkaddr == NEW_ADDR);
- BUG_ON(nat_get_blkaddr(e) != NEW_ADDR &&
+ f2fs_bug_on(nat_get_blkaddr(e) != NEW_ADDR &&
nat_get_blkaddr(e) != NULL_ADDR &&
new_blkaddr == NEW_ADDR);
@@ -237,14 +249,19 @@ retry:
/* change address */
nat_set_blkaddr(e, new_blkaddr);
__set_nat_cache_dirty(nm_i, e);
+
+ /* update fsync_mark if its inode nat entry is still alive */
+ e = __lookup_nat_cache(nm_i, ni->ino);
+ if (e)
+ e->fsync_done = fsync_done;
write_unlock(&nm_i->nat_tree_lock);
}
-static int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
+int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
- if (nm_i->nat_cnt <= NM_WOUT_THRESHOLD)
+ if (available_free_memory(sbi, NAT_ENTRIES))
return 0;
write_lock(&nm_i->nat_tree_lock);
@@ -391,8 +408,8 @@ got:
/*
* Caller should call f2fs_put_dnode(dn).
- * Also, it should grab and release a mutex by calling mutex_lock_op() and
- * mutex_unlock_op() only if ro is not set RDONLY_NODE.
+ * Also, it should grab and release a rwsem by calling f2fs_lock_op() and
+ * f2fs_unlock_op() only if ro is not set RDONLY_NODE.
* In the case of RDONLY_NODE, we don't need to care about mutex.
*/
int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
@@ -495,15 +512,15 @@ static void truncate_node(struct dnode_of_data *dn)
get_node_info(sbi, dn->nid, &ni);
if (dn->inode->i_blocks == 0) {
- BUG_ON(ni.blk_addr != NULL_ADDR);
+ f2fs_bug_on(ni.blk_addr != NULL_ADDR);
goto invalidate;
}
- BUG_ON(ni.blk_addr == NULL_ADDR);
+ f2fs_bug_on(ni.blk_addr == NULL_ADDR);
/* Deallocate node address */
invalidate_blocks(sbi, ni.blk_addr);
- dec_valid_node_count(sbi, dn->inode, 1);
- set_node_addr(sbi, &ni, NULL_ADDR);
+ dec_valid_node_count(sbi, dn->inode);
+ set_node_addr(sbi, &ni, NULL_ADDR, false);
if (dn->nid == dn->inode->i_ino) {
remove_orphan_inode(sbi, dn->nid);
@@ -516,6 +533,10 @@ invalidate:
F2FS_SET_SB_DIRT(sbi);
f2fs_put_page(dn->node_page, 1);
+
+ invalidate_mapping_pages(NODE_MAPPING(sbi),
+ dn->node_page->index, dn->node_page->index);
+
dn->node_page = NULL;
trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr);
}
@@ -631,19 +652,19 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
return 0;
/* get indirect nodes in the path */
- for (i = 0; i < depth - 1; i++) {
+ for (i = 0; i < idx + 1; i++) {
/* refernece count'll be increased */
pages[i] = get_node_page(sbi, nid[i]);
if (IS_ERR(pages[i])) {
- depth = i + 1;
err = PTR_ERR(pages[i]);
+ idx = i - 1;
goto fail;
}
nid[i + 1] = get_nid(pages[i], offset[i + 1], false);
}
/* free direct nodes linked to a partial indirect node */
- for (i = offset[depth - 1]; i < NIDS_PER_BLOCK; i++) {
+ for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) {
child_nid = get_nid(pages[idx], i, false);
if (!child_nid)
continue;
@@ -654,7 +675,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
set_nid(pages[idx], i, 0, false);
}
- if (offset[depth - 1] == 0) {
+ if (offset[idx + 1] == 0) {
dn->node_page = pages[idx];
dn->nid = nid[idx];
truncate_node(dn);
@@ -662,9 +683,10 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
f2fs_put_page(pages[idx], 1);
}
offset[idx]++;
- offset[depth - 1] = 0;
+ offset[idx + 1] = 0;
+ idx--;
fail:
- for (i = depth - 3; i >= 0; i--)
+ for (i = idx; i >= 0; i--)
f2fs_put_page(pages[i], 1);
trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err);
@@ -678,11 +700,10 @@ fail:
int truncate_inode_blocks(struct inode *inode, pgoff_t from)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- struct address_space *node_mapping = sbi->node_inode->i_mapping;
int err = 0, cont = 1;
int level, offset[4], noffset[4];
unsigned int nofs = 0;
- struct f2fs_node *rn;
+ struct f2fs_inode *ri;
struct dnode_of_data dn;
struct page *page;
@@ -699,7 +720,7 @@ restart:
set_new_dnode(&dn, inode, page, NULL, 0);
unlock_page(page);
- rn = F2FS_NODE(page);
+ ri = F2FS_INODE(page);
switch (level) {
case 0:
case 1:
@@ -709,7 +730,7 @@ restart:
nofs = noffset[1];
if (!offset[level - 1])
goto skip_partial;
- err = truncate_partial_nodes(&dn, &rn->i, offset, level);
+ err = truncate_partial_nodes(&dn, ri, offset, level);
if (err < 0 && err != -ENOENT)
goto fail;
nofs += 1 + NIDS_PER_BLOCK;
@@ -718,7 +739,7 @@ restart:
nofs = 5 + 2 * NIDS_PER_BLOCK;
if (!offset[level - 1])
goto skip_partial;
- err = truncate_partial_nodes(&dn, &rn->i, offset, level);
+ err = truncate_partial_nodes(&dn, ri, offset, level);
if (err < 0 && err != -ENOENT)
goto fail;
break;
@@ -728,7 +749,7 @@ restart:
skip_partial:
while (cont) {
- dn.nid = le32_to_cpu(rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]);
+ dn.nid = le32_to_cpu(ri->i_nid[offset[0] - NODE_DIR1_BLOCK]);
switch (offset[0]) {
case NODE_DIR1_BLOCK:
case NODE_DIR2_BLOCK:
@@ -751,14 +772,14 @@ skip_partial:
if (err < 0 && err != -ENOENT)
goto fail;
if (offset[1] == 0 &&
- rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK]) {
+ ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) {
lock_page(page);
- if (page->mapping != node_mapping) {
+ if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
f2fs_put_page(page, 1);
goto restart;
}
- wait_on_page_writeback(page);
- rn->i.i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
+ f2fs_wait_on_page_writeback(page, NODE);
+ ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0;
set_page_dirty(page);
unlock_page(page);
}
@@ -794,38 +815,34 @@ int truncate_xattr_node(struct inode *inode, struct page *page)
set_new_dnode(&dn, inode, page, npage, nid);
if (page)
- dn.inode_page_locked = 1;
+ dn.inode_page_locked = true;
truncate_node(&dn);
return 0;
}
/*
- * Caller should grab and release a mutex by calling mutex_lock_op() and
- * mutex_unlock_op().
+ * Caller should grab and release a rwsem by calling f2fs_lock_op() and
+ * f2fs_unlock_op().
*/
-int remove_inode_page(struct inode *inode)
+void remove_inode_page(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct page *page;
nid_t ino = inode->i_ino;
struct dnode_of_data dn;
- int err;
page = get_node_page(sbi, ino);
if (IS_ERR(page))
- return PTR_ERR(page);
+ return;
- err = truncate_xattr_node(inode, page);
- if (err) {
+ if (truncate_xattr_node(inode, page)) {
f2fs_put_page(page, 1);
- return err;
+ return;
}
-
/* 0 is possible, after f2fs_new_inode() is failed */
- BUG_ON(inode->i_blocks != 0 && inode->i_blocks != 1);
+ f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1);
set_new_dnode(&dn, inode, page, page, ino);
truncate_node(&dn);
- return 0;
}
struct page *new_inode_page(struct inode *inode, const struct qstr *name)
@@ -843,19 +860,18 @@ struct page *new_node_page(struct dnode_of_data *dn,
unsigned int ofs, struct page *ipage)
{
struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
- struct address_space *mapping = sbi->node_inode->i_mapping;
struct node_info old_ni, new_ni;
struct page *page;
int err;
- if (is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))
+ if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
return ERR_PTR(-EPERM);
- page = grab_cache_page(mapping, dn->nid);
+ page = grab_cache_page(NODE_MAPPING(sbi), dn->nid);
if (!page)
return ERR_PTR(-ENOMEM);
- if (!inc_valid_node_count(sbi, dn->inode, 1)) {
+ if (unlikely(!inc_valid_node_count(sbi, dn->inode))) {
err = -ENOSPC;
goto fail;
}
@@ -863,17 +879,18 @@ struct page *new_node_page(struct dnode_of_data *dn,
get_node_info(sbi, dn->nid, &old_ni);
/* Reinitialize old_ni with new node page */
- BUG_ON(old_ni.blk_addr != NULL_ADDR);
+ f2fs_bug_on(old_ni.blk_addr != NULL_ADDR);
new_ni = old_ni;
new_ni.ino = dn->inode->i_ino;
- set_node_addr(sbi, &new_ni, NEW_ADDR);
+ set_node_addr(sbi, &new_ni, NEW_ADDR, false);
+ f2fs_wait_on_page_writeback(page, NODE);
fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
set_cold_node(dn->inode, page);
SetPageUptodate(page);
set_page_dirty(page);
- if (ofs == XATTR_NODE_OFFSET)
+ if (f2fs_has_xattr_block(ofs))
F2FS_I(dn->inode)->i_xattr_nid = dn->nid;
dn->node_page = page;
@@ -898,14 +915,14 @@ fail:
* LOCKED_PAGE: f2fs_put_page(page, 1)
* error: nothing
*/
-static int read_node_page(struct page *page, int type)
+static int read_node_page(struct page *page, int rw)
{
struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
struct node_info ni;
get_node_info(sbi, page->index, &ni);
- if (ni.blk_addr == NULL_ADDR) {
+ if (unlikely(ni.blk_addr == NULL_ADDR)) {
f2fs_put_page(page, 1);
return -ENOENT;
}
@@ -913,7 +930,7 @@ static int read_node_page(struct page *page, int type)
if (PageUptodate(page))
return LOCKED_PAGE;
- return f2fs_readpage(sbi, page, ni.blk_addr, type);
+ return f2fs_submit_page_bio(sbi, page, ni.blk_addr, rw);
}
/*
@@ -921,18 +938,17 @@ static int read_node_page(struct page *page, int type)
*/
void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
{
- struct address_space *mapping = sbi->node_inode->i_mapping;
struct page *apage;
int err;
- apage = find_get_page(mapping, nid);
+ apage = find_get_page(NODE_MAPPING(sbi), nid);
if (apage && PageUptodate(apage)) {
f2fs_put_page(apage, 0);
return;
}
f2fs_put_page(apage, 0);
- apage = grab_cache_page(mapping, nid);
+ apage = grab_cache_page(NODE_MAPPING(sbi), nid);
if (!apage)
return;
@@ -945,11 +961,10 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
{
- struct address_space *mapping = sbi->node_inode->i_mapping;
struct page *page;
int err;
repeat:
- page = grab_cache_page(mapping, nid);
+ page = grab_cache_page(NODE_MAPPING(sbi), nid);
if (!page)
return ERR_PTR(-ENOMEM);
@@ -960,17 +975,15 @@ repeat:
goto got_it;
lock_page(page);
- if (!PageUptodate(page)) {
+ if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) {
f2fs_put_page(page, 1);
return ERR_PTR(-EIO);
}
- if (page->mapping != mapping) {
+ if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
f2fs_put_page(page, 1);
goto repeat;
}
got_it:
- BUG_ON(nid != nid_of_node(page));
- mark_page_accessed(page);
return page;
}
@@ -981,7 +994,6 @@ got_it:
struct page *get_node_page_ra(struct page *parent, int start)
{
struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb);
- struct address_space *mapping = sbi->node_inode->i_mapping;
struct blk_plug plug;
struct page *page;
int err, i, end;
@@ -992,7 +1004,7 @@ struct page *get_node_page_ra(struct page *parent, int start)
if (!nid)
return ERR_PTR(-ENOENT);
repeat:
- page = grab_cache_page(mapping, nid);
+ page = grab_cache_page(NODE_MAPPING(sbi), nid);
if (!page)
return ERR_PTR(-ENOMEM);
@@ -1017,16 +1029,15 @@ repeat:
blk_finish_plug(&plug);
lock_page(page);
- if (page->mapping != mapping) {
+ if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
f2fs_put_page(page, 1);
goto repeat;
}
page_hit:
- if (!PageUptodate(page)) {
+ if (unlikely(!PageUptodate(page))) {
f2fs_put_page(page, 1);
return ERR_PTR(-EIO);
}
- mark_page_accessed(page);
return page;
}
@@ -1048,7 +1059,6 @@ void sync_inode_page(struct dnode_of_data *dn)
int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
struct writeback_control *wbc)
{
- struct address_space *mapping = sbi->node_inode->i_mapping;
pgoff_t index, end;
struct pagevec pvec;
int step = ino ? 2 : 0;
@@ -1062,7 +1072,7 @@ next_step:
while (index <= end) {
int i, nr_pages;
- nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
PAGECACHE_TAG_DIRTY,
min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
if (nr_pages == 0)
@@ -1095,7 +1105,7 @@ next_step:
else if (!trylock_page(page))
continue;
- if (unlikely(page->mapping != mapping)) {
+ if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
continue_unlock:
unlock_page(page);
continue;
@@ -1122,7 +1132,7 @@ continue_unlock:
set_fsync_mark(page, 0);
set_dentry_mark(page, 0);
}
- mapping->a_ops->writepage(page, wbc);
+ NODE_MAPPING(sbi)->a_ops->writepage(page, wbc);
wrote++;
if (--wbc->nr_to_write == 0)
@@ -1143,11 +1153,52 @@ continue_unlock:
}
if (wrote)
- f2fs_submit_bio(sbi, NODE, wbc->sync_mode == WB_SYNC_ALL);
-
+ f2fs_submit_merged_bio(sbi, NODE, WRITE);
return nwritten;
}
+int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
+{
+ pgoff_t index = 0, end = LONG_MAX;
+ struct pagevec pvec;
+ int ret2 = 0, ret = 0;
+
+ pagevec_init(&pvec, 0);
+
+ while (index <= end) {
+ int i, nr_pages;
+ nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index,
+ PAGECACHE_TAG_WRITEBACK,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ break;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ /* until radix tree lookup accepts end_index */
+ if (unlikely(page->index > end))
+ continue;
+
+ if (ino && ino_of_node(page) == ino) {
+ f2fs_wait_on_page_writeback(page, NODE);
+ if (TestClearPageError(page))
+ ret = -EIO;
+ }
+ }
+ pagevec_release(&pvec);
+ cond_resched();
+ }
+
+ if (unlikely(test_and_clear_bit(AS_ENOSPC, &NODE_MAPPING(sbi)->flags)))
+ ret2 = -ENOSPC;
+ if (unlikely(test_and_clear_bit(AS_EIO, &NODE_MAPPING(sbi)->flags)))
+ ret2 = -EIO;
+ if (!ret)
+ ret = ret2;
+ return ret;
+}
+
static int f2fs_write_node_page(struct page *page,
struct writeback_control *wbc)
{
@@ -1155,66 +1206,71 @@ static int f2fs_write_node_page(struct page *page,
nid_t nid;
block_t new_addr;
struct node_info ni;
+ struct f2fs_io_info fio = {
+ .type = NODE,
+ .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
+ };
+
+ trace_f2fs_writepage(page, NODE);
+
+ if (unlikely(sbi->por_doing))
+ goto redirty_out;
- wait_on_page_writeback(page);
+ f2fs_wait_on_page_writeback(page, NODE);
/* get old block addr of this node page */
nid = nid_of_node(page);
- BUG_ON(page->index != nid);
+ f2fs_bug_on(page->index != nid);
get_node_info(sbi, nid, &ni);
/* This page is already truncated */
- if (ni.blk_addr == NULL_ADDR) {
+ if (unlikely(ni.blk_addr == NULL_ADDR)) {
dec_page_count(sbi, F2FS_DIRTY_NODES);
unlock_page(page);
return 0;
}
- if (wbc->for_reclaim) {
- dec_page_count(sbi, F2FS_DIRTY_NODES);
- wbc->pages_skipped++;
- set_page_dirty(page);
- return AOP_WRITEPAGE_ACTIVATE;
- }
+ if (wbc->for_reclaim)
+ goto redirty_out;
mutex_lock(&sbi->node_write);
set_page_writeback(page);
- write_node_page(sbi, page, nid, ni.blk_addr, &new_addr);
- set_node_addr(sbi, &ni, new_addr);
+ write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr);
+ set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page));
dec_page_count(sbi, F2FS_DIRTY_NODES);
mutex_unlock(&sbi->node_write);
unlock_page(page);
return 0;
+
+redirty_out:
+ redirty_page_for_writepage(wbc, page);
+ return AOP_WRITEPAGE_ACTIVATE;
}
-/*
- * It is very important to gather dirty pages and write at once, so that we can
- * submit a big bio without interfering other data writes.
- * Be default, 512 pages (2MB) * 3 node types, is more reasonable.
- */
-#define COLLECT_DIRTY_NODES 1536
static int f2fs_write_node_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
- long nr_to_write = wbc->nr_to_write;
+ long diff;
- /* First check balancing cached NAT entries */
- if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) {
- f2fs_sync_fs(sbi->sb, true);
- return 0;
- }
+ trace_f2fs_writepages(mapping->host, wbc, NODE);
+
+ /* balancing f2fs's metadata in background */
+ f2fs_balance_fs_bg(sbi);
/* collect a number of dirty node pages and write together */
- if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES)
- return 0;
+ if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE))
+ goto skip_write;
- /* if mounting is failed, skip writing node pages */
- wbc->nr_to_write = 3 * max_hw_blocks(sbi);
+ diff = nr_pages_to_write(sbi, NODE, wbc);
+ wbc->sync_mode = WB_SYNC_NONE;
sync_node_pages(sbi, 0, wbc);
- wbc->nr_to_write = nr_to_write - (3 * max_hw_blocks(sbi) -
- wbc->nr_to_write);
+ wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
+ return 0;
+
+skip_write:
+ wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES);
return 0;
}
@@ -1223,6 +1279,8 @@ static int f2fs_set_node_page_dirty(struct page *page)
struct address_space *mapping = page->mapping;
struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
+ trace_f2fs_set_page_dirty(page, NODE);
+
SetPageUptodate(page);
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
@@ -1260,59 +1318,51 @@ const struct address_space_operations f2fs_node_aops = {
.releasepage = f2fs_release_node_page,
};
-static struct free_nid *__lookup_free_nid_list(nid_t n, struct list_head *head)
+static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
+ nid_t n)
{
- struct list_head *this;
- struct free_nid *i;
- list_for_each(this, head) {
- i = list_entry(this, struct free_nid, list);
- if (i->nid == n)
- return i;
- }
- return NULL;
+ return radix_tree_lookup(&nm_i->free_nid_root, n);
}
-static void __del_from_free_nid_list(struct free_nid *i)
+static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i,
+ struct free_nid *i)
{
list_del(&i->list);
- kmem_cache_free(free_nid_slab, i);
+ radix_tree_delete(&nm_i->free_nid_root, i->nid);
}
-static int add_free_nid(struct f2fs_nm_info *nm_i, nid_t nid, bool build)
+static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i;
struct nat_entry *ne;
bool allocated = false;
- if (nm_i->fcnt > 2 * MAX_FREE_NIDS)
+ if (!available_free_memory(sbi, FREE_NIDS))
return -1;
/* 0 nid should not be used */
- if (nid == 0)
+ if (unlikely(nid == 0))
return 0;
- if (!build)
- goto retry;
-
- /* do not add allocated nids */
- read_lock(&nm_i->nat_tree_lock);
- ne = __lookup_nat_cache(nm_i, nid);
- if (ne && nat_get_blkaddr(ne) != NULL_ADDR)
- allocated = true;
- read_unlock(&nm_i->nat_tree_lock);
- if (allocated)
- return 0;
-retry:
- i = kmem_cache_alloc(free_nid_slab, GFP_NOFS);
- if (!i) {
- cond_resched();
- goto retry;
+ if (build) {
+ /* do not add allocated nids */
+ read_lock(&nm_i->nat_tree_lock);
+ ne = __lookup_nat_cache(nm_i, nid);
+ if (ne &&
+ (!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR))
+ allocated = true;
+ read_unlock(&nm_i->nat_tree_lock);
+ if (allocated)
+ return 0;
}
+
+ i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
i->nid = nid;
i->state = NID_NEW;
spin_lock(&nm_i->free_nid_list_lock);
- if (__lookup_free_nid_list(nid, &nm_i->free_nid_list)) {
+ if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) {
spin_unlock(&nm_i->free_nid_list_lock);
kmem_cache_free(free_nid_slab, i);
return 0;
@@ -1326,18 +1376,25 @@ retry:
static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid)
{
struct free_nid *i;
+ bool need_free = false;
+
spin_lock(&nm_i->free_nid_list_lock);
- i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
+ i = __lookup_free_nid_list(nm_i, nid);
if (i && i->state == NID_NEW) {
- __del_from_free_nid_list(i);
+ __del_from_free_nid_list(nm_i, i);
nm_i->fcnt--;
+ need_free = true;
}
spin_unlock(&nm_i->free_nid_list_lock);
+
+ if (need_free)
+ kmem_cache_free(free_nid_slab, i);
}
-static void scan_nat_page(struct f2fs_nm_info *nm_i,
+static void scan_nat_page(struct f2fs_sb_info *sbi,
struct page *nat_page, nid_t start_nid)
{
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
struct f2fs_nat_block *nat_blk = page_address(nat_page);
block_t blk_addr;
int i;
@@ -1346,13 +1403,13 @@ static void scan_nat_page(struct f2fs_nm_info *nm_i,
for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) {
- if (start_nid >= nm_i->max_nid)
+ if (unlikely(start_nid >= nm_i->max_nid))
break;
blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
- BUG_ON(blk_addr == NEW_ADDR);
+ f2fs_bug_on(blk_addr == NEW_ADDR);
if (blk_addr == NULL_ADDR) {
- if (add_free_nid(nm_i, start_nid, true) < 0)
+ if (add_free_nid(sbi, start_nid, true) < 0)
break;
}
}
@@ -1371,16 +1428,16 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
return;
/* readahead nat pages to be scanned */
- ra_nat_pages(sbi, nid);
+ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT);
while (1) {
struct page *page = get_current_nat_page(sbi, nid);
- scan_nat_page(nm_i, page, nid);
+ scan_nat_page(sbi, page, nid);
f2fs_put_page(page, 1);
nid += (NAT_ENTRY_PER_BLOCK - (nid % NAT_ENTRY_PER_BLOCK));
- if (nid >= nm_i->max_nid)
+ if (unlikely(nid >= nm_i->max_nid))
nid = 0;
if (i++ == FREE_NID_PAGES)
@@ -1396,7 +1453,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr);
nid = le32_to_cpu(nid_in_journal(sum, i));
if (addr == NULL_ADDR)
- add_free_nid(nm_i, nid, true);
+ add_free_nid(sbi, nid, true);
else
remove_free_nid(nm_i, nid);
}
@@ -1412,23 +1469,20 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i = NULL;
- struct list_head *this;
retry:
- if (sbi->total_valid_node_count + 1 >= nm_i->max_nid)
+ if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids))
return false;
spin_lock(&nm_i->free_nid_list_lock);
/* We should not use stale free nids created by build_free_nids */
- if (nm_i->fcnt && !sbi->on_build_free_nids) {
- BUG_ON(list_empty(&nm_i->free_nid_list));
- list_for_each(this, &nm_i->free_nid_list) {
- i = list_entry(this, struct free_nid, list);
+ if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
+ f2fs_bug_on(list_empty(&nm_i->free_nid_list));
+ list_for_each_entry(i, &nm_i->free_nid_list, list)
if (i->state == NID_NEW)
break;
- }
- BUG_ON(i->state != NID_NEW);
+ f2fs_bug_on(i->state != NID_NEW);
*nid = i->nid;
i->state = NID_ALLOC;
nm_i->fcnt--;
@@ -1439,9 +1493,7 @@ retry:
/* Let's scan nat pages and its caches to get free nids */
mutex_lock(&nm_i->build_lock);
- sbi->on_build_free_nids = 1;
build_free_nids(sbi);
- sbi->on_build_free_nids = 0;
mutex_unlock(&nm_i->build_lock);
goto retry;
}
@@ -1455,10 +1507,12 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
struct free_nid *i;
spin_lock(&nm_i->free_nid_list_lock);
- i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
- BUG_ON(!i || i->state != NID_ALLOC);
- __del_from_free_nid_list(i);
+ i = __lookup_free_nid_list(nm_i, nid);
+ f2fs_bug_on(!i || i->state != NID_ALLOC);
+ __del_from_free_nid_list(nm_i, i);
spin_unlock(&nm_i->free_nid_list_lock);
+
+ kmem_cache_free(free_nid_slab, i);
}
/*
@@ -1468,20 +1522,25 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
{
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i;
+ bool need_free = false;
if (!nid)
return;
spin_lock(&nm_i->free_nid_list_lock);
- i = __lookup_free_nid_list(nid, &nm_i->free_nid_list);
- BUG_ON(!i || i->state != NID_ALLOC);
- if (nm_i->fcnt > 2 * MAX_FREE_NIDS) {
- __del_from_free_nid_list(i);
+ i = __lookup_free_nid_list(nm_i, nid);
+ f2fs_bug_on(!i || i->state != NID_ALLOC);
+ if (!available_free_memory(sbi, FREE_NIDS)) {
+ __del_from_free_nid_list(nm_i, i);
+ need_free = true;
} else {
i->state = NID_NEW;
nm_i->fcnt++;
}
spin_unlock(&nm_i->free_nid_list_lock);
+
+ if (need_free)
+ kmem_cache_free(free_nid_slab, i);
}
void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
@@ -1489,90 +1548,200 @@ void recover_node_page(struct f2fs_sb_info *sbi, struct page *page,
block_t new_blkaddr)
{
rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr);
- set_node_addr(sbi, ni, new_blkaddr);
+ set_node_addr(sbi, ni, new_blkaddr, false);
clear_node_page_dirty(page);
}
+static void recover_inline_xattr(struct inode *inode, struct page *page)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ void *src_addr, *dst_addr;
+ size_t inline_size;
+ struct page *ipage;
+ struct f2fs_inode *ri;
+
+ if (!f2fs_has_inline_xattr(inode))
+ return;
+
+ if (!IS_INODE(page))
+ return;
+
+ ri = F2FS_INODE(page);
+ if (!(ri->i_inline & F2FS_INLINE_XATTR))
+ return;
+
+ ipage = get_node_page(sbi, inode->i_ino);
+ f2fs_bug_on(IS_ERR(ipage));
+
+ dst_addr = inline_xattr_addr(ipage);
+ src_addr = inline_xattr_addr(page);
+ inline_size = inline_xattr_size(inode);
+
+ f2fs_wait_on_page_writeback(ipage, NODE);
+ memcpy(dst_addr, src_addr, inline_size);
+
+ update_inode(inode, ipage);
+ f2fs_put_page(ipage, 1);
+}
+
+bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
+ nid_t new_xnid = nid_of_node(page);
+ struct node_info ni;
+
+ recover_inline_xattr(inode, page);
+
+ if (!f2fs_has_xattr_block(ofs_of_node(page)))
+ return false;
+
+ /* 1: invalidate the previous xattr nid */
+ if (!prev_xnid)
+ goto recover_xnid;
+
+ /* Deallocate node address */
+ get_node_info(sbi, prev_xnid, &ni);
+ f2fs_bug_on(ni.blk_addr == NULL_ADDR);
+ invalidate_blocks(sbi, ni.blk_addr);
+ dec_valid_node_count(sbi, inode);
+ set_node_addr(sbi, &ni, NULL_ADDR, false);
+
+recover_xnid:
+ /* 2: allocate new xattr nid */
+ if (unlikely(!inc_valid_node_count(sbi, inode)))
+ f2fs_bug_on(1);
+
+ remove_free_nid(NM_I(sbi), new_xnid);
+ get_node_info(sbi, new_xnid, &ni);
+ ni.ino = inode->i_ino;
+ set_node_addr(sbi, &ni, NEW_ADDR, false);
+ F2FS_I(inode)->i_xattr_nid = new_xnid;
+
+ /* 3: update xattr blkaddr */
+ refresh_sit_entry(sbi, NEW_ADDR, blkaddr);
+ set_node_addr(sbi, &ni, blkaddr, false);
+
+ update_inode_page(inode);
+ return true;
+}
+
int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
{
- struct address_space *mapping = sbi->node_inode->i_mapping;
- struct f2fs_node *src, *dst;
+ struct f2fs_inode *src, *dst;
nid_t ino = ino_of_node(page);
struct node_info old_ni, new_ni;
struct page *ipage;
- ipage = grab_cache_page(mapping, ino);
+ get_node_info(sbi, ino, &old_ni);
+
+ if (unlikely(old_ni.blk_addr != NULL_ADDR))
+ return -EINVAL;
+
+ ipage = grab_cache_page(NODE_MAPPING(sbi), ino);
if (!ipage)
return -ENOMEM;
/* Should not use this inode from free nid list */
remove_free_nid(NM_I(sbi), ino);
- get_node_info(sbi, ino, &old_ni);
SetPageUptodate(ipage);
fill_node_footer(ipage, ino, ino, 0, true);
- src = F2FS_NODE(page);
- dst = F2FS_NODE(ipage);
+ src = F2FS_INODE(page);
+ dst = F2FS_INODE(ipage);
- memcpy(dst, src, (unsigned long)&src->i.i_ext - (unsigned long)&src->i);
- dst->i.i_size = 0;
- dst->i.i_blocks = cpu_to_le64(1);
- dst->i.i_links = cpu_to_le32(1);
- dst->i.i_xattr_nid = 0;
+ memcpy(dst, src, (unsigned long)&src->i_ext - (unsigned long)src);
+ dst->i_size = 0;
+ dst->i_blocks = cpu_to_le64(1);
+ dst->i_links = cpu_to_le32(1);
+ dst->i_xattr_nid = 0;
new_ni = old_ni;
new_ni.ino = ino;
- if (!inc_valid_node_count(sbi, NULL, 1))
+ if (unlikely(!inc_valid_node_count(sbi, NULL)))
WARN_ON(1);
- set_node_addr(sbi, &new_ni, NEW_ADDR);
+ set_node_addr(sbi, &new_ni, NEW_ADDR, false);
inc_valid_inode_count(sbi);
f2fs_put_page(ipage, 1);
return 0;
}
+/*
+ * ra_sum_pages() merge contiguous pages into one bio and submit.
+ * these pre-readed pages are alloced in bd_inode's mapping tree.
+ */
+static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages,
+ int start, int nrpages)
+{
+ struct inode *inode = sbi->sb->s_bdev->bd_inode;
+ struct address_space *mapping = inode->i_mapping;
+ int i, page_idx = start;
+ struct f2fs_io_info fio = {
+ .type = META,
+ .rw = READ_SYNC | REQ_META | REQ_PRIO
+ };
+
+ for (i = 0; page_idx < start + nrpages; page_idx++, i++) {
+ /* alloc page in bd_inode for reading node summary info */
+ pages[i] = grab_cache_page(mapping, page_idx);
+ if (!pages[i])
+ break;
+ f2fs_submit_page_mbio(sbi, pages[i], page_idx, &fio);
+ }
+
+ f2fs_submit_merged_bio(sbi, META, READ);
+ return i;
+}
+
int restore_node_summary(struct f2fs_sb_info *sbi,
unsigned int segno, struct f2fs_summary_block *sum)
{
struct f2fs_node *rn;
struct f2fs_summary *sum_entry;
- struct page *page;
+ struct inode *inode = sbi->sb->s_bdev->bd_inode;
block_t addr;
- int i, last_offset;
-
- /* alloc temporal page for read node */
- page = alloc_page(GFP_NOFS | __GFP_ZERO);
- if (!page)
- return -ENOMEM;
- lock_page(page);
+ int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+ struct page *pages[bio_blocks];
+ int i, idx, last_offset, nrpages, err = 0;
/* scan the node segment */
last_offset = sbi->blocks_per_seg;
addr = START_BLOCK(sbi, segno);
sum_entry = &sum->entries[0];
- for (i = 0; i < last_offset; i++, sum_entry++) {
- /*
- * In order to read next node page,
- * we must clear PageUptodate flag.
- */
- ClearPageUptodate(page);
+ for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) {
+ nrpages = min(last_offset - i, bio_blocks);
- if (f2fs_readpage(sbi, page, addr, READ_SYNC))
- goto out;
+ /* read ahead node pages */
+ nrpages = ra_sum_pages(sbi, pages, addr, nrpages);
+ if (!nrpages)
+ return -ENOMEM;
+
+ for (idx = 0; idx < nrpages; idx++) {
+ if (err)
+ goto skip;
+
+ lock_page(pages[idx]);
+ if (unlikely(!PageUptodate(pages[idx]))) {
+ err = -EIO;
+ } else {
+ rn = F2FS_NODE(pages[idx]);
+ sum_entry->nid = rn->footer.nid;
+ sum_entry->version = 0;
+ sum_entry->ofs_in_node = 0;
+ sum_entry++;
+ }
+ unlock_page(pages[idx]);
+skip:
+ page_cache_release(pages[idx]);
+ }
- lock_page(page);
- rn = F2FS_NODE(page);
- sum_entry->nid = rn->footer.nid;
- sum_entry->version = 0;
- sum_entry->ofs_in_node = 0;
- addr++;
+ invalidate_mapping_pages(inode->i_mapping, addr,
+ addr + nrpages);
}
- unlock_page(page);
-out:
- __free_pages(page, 0);
- return 0;
+ return err;
}
static bool flush_nats_in_journal(struct f2fs_sb_info *sbi)
@@ -1608,9 +1777,7 @@ retry:
write_unlock(&nm_i->nat_tree_lock);
goto retry;
}
- nat_set_blkaddr(ne, le32_to_cpu(raw_ne.block_addr));
- nat_set_ino(ne, le32_to_cpu(raw_ne.ino));
- nat_set_version(ne, raw_ne.version);
+ node_info_from_raw_nat(&ne->ni, &raw_ne);
__set_nat_cache_dirty(nm_i, ne);
write_unlock(&nm_i->nat_tree_lock);
}
@@ -1627,7 +1794,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
struct f2fs_summary_block *sum = curseg->sum_blk;
- struct list_head *cur, *n;
+ struct nat_entry *ne, *cur;
struct page *page = NULL;
struct f2fs_nat_block *nat_blk = NULL;
nid_t start_nid = 0, end_nid = 0;
@@ -1639,18 +1806,16 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
mutex_lock(&curseg->curseg_mutex);
/* 1) flush dirty nat caches */
- list_for_each_safe(cur, n, &nm_i->dirty_nat_entries) {
- struct nat_entry *ne;
+ list_for_each_entry_safe(ne, cur, &nm_i->dirty_nat_entries, list) {
nid_t nid;
struct f2fs_nat_entry raw_ne;
int offset = -1;
- block_t new_blkaddr;
-
- ne = list_entry(cur, struct nat_entry, list);
- nid = nat_get_nid(ne);
if (nat_get_blkaddr(ne) == NEW_ADDR)
continue;
+
+ nid = nat_get_nid(ne);
+
if (flushed)
goto to_nat_page;
@@ -1677,14 +1842,10 @@ to_nat_page:
nat_blk = page_address(page);
}
- BUG_ON(!nat_blk);
+ f2fs_bug_on(!nat_blk);
raw_ne = nat_blk->entries[nid - start_nid];
flush_now:
- new_blkaddr = nat_get_blkaddr(ne);
-
- raw_ne.ino = cpu_to_le32(nat_get_ino(ne));
- raw_ne.block_addr = cpu_to_le32(new_blkaddr);
- raw_ne.version = nat_get_version(ne);
+ raw_nat_from_node_info(&raw_ne, &ne->ni);
if (offset < 0) {
nat_blk->entries[nid - start_nid] = raw_ne;
@@ -1694,23 +1855,19 @@ flush_now:
}
if (nat_get_blkaddr(ne) == NULL_ADDR &&
- add_free_nid(NM_I(sbi), nid, false) <= 0) {
+ add_free_nid(sbi, nid, false) <= 0) {
write_lock(&nm_i->nat_tree_lock);
__del_from_nat_cache(nm_i, ne);
write_unlock(&nm_i->nat_tree_lock);
} else {
write_lock(&nm_i->nat_tree_lock);
__clear_nat_cache_dirty(nm_i, ne);
- ne->checkpointed = true;
write_unlock(&nm_i->nat_tree_lock);
}
}
if (!flushed)
mutex_unlock(&curseg->curseg_mutex);
f2fs_put_page(page, 1);
-
- /* 2) shrink nat caches if necessary */
- try_to_free_nats(sbi, nm_i->nat_cnt - NM_WOUT_THRESHOLD);
}
static int init_node_manager(struct f2fs_sb_info *sbi)
@@ -1725,10 +1882,16 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
/* segment_count_nat includes pair segment so divide to 2. */
nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1;
nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg);
+
nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
+
+ /* not used nids: 0, node, meta, (and root counted as valid node) */
+ nm_i->available_nids = nm_i->max_nid - 3;
nm_i->fcnt = 0;
nm_i->nat_cnt = 0;
+ nm_i->ram_thresh = DEF_RAM_THRESHOLD;
+ INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->free_nid_list);
INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->nat_entries);
@@ -1781,11 +1944,14 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
/* destroy free nid list */
spin_lock(&nm_i->free_nid_list_lock);
list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
- BUG_ON(i->state == NID_ALLOC);
- __del_from_free_nid_list(i);
+ f2fs_bug_on(i->state == NID_ALLOC);
+ __del_from_free_nid_list(nm_i, i);
nm_i->fcnt--;
+ spin_unlock(&nm_i->free_nid_list_lock);
+ kmem_cache_free(free_nid_slab, i);
+ spin_lock(&nm_i->free_nid_list_lock);
}
- BUG_ON(nm_i->fcnt);
+ f2fs_bug_on(nm_i->fcnt);
spin_unlock(&nm_i->free_nid_list_lock);
/* destroy nat cache */
@@ -1793,13 +1959,11 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
while ((found = __gang_lookup_nat_cache(nm_i,
nid, NATVEC_SIZE, natvec))) {
unsigned idx;
- for (idx = 0; idx < found; idx++) {
- struct nat_entry *e = natvec[idx];
- nid = nat_get_nid(e) + 1;
- __del_from_nat_cache(nm_i, e);
- }
+ nid = nat_get_nid(natvec[found - 1]) + 1;
+ for (idx = 0; idx < found; idx++)
+ __del_from_nat_cache(nm_i, natvec[idx]);
}
- BUG_ON(nm_i->nat_cnt);
+ f2fs_bug_on(nm_i->nat_cnt);
write_unlock(&nm_i->nat_tree_lock);
kfree(nm_i->nat_bitmap);
@@ -1810,12 +1974,12 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
int __init create_node_manager_caches(void)
{
nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
- sizeof(struct nat_entry), NULL);
+ sizeof(struct nat_entry));
if (!nat_entry_slab)
return -ENOMEM;
free_nid_slab = f2fs_kmem_cache_create("free_nid",
- sizeof(struct free_nid), NULL);
+ sizeof(struct free_nid));
if (!free_nid_slab) {
kmem_cache_destroy(nat_entry_slab);
return -ENOMEM;
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 3496bb3e15d..7281112cd1c 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -17,14 +17,11 @@
/* # of pages to perform readahead before building free nids */
#define FREE_NID_PAGES 4
-/* maximum # of free node ids to produce during build_free_nids */
-#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES)
-
/* maximum readahead size for node during getting data blocks */
#define MAX_RA_NODE 128
-/* maximum cached nat entries to manage memory footprint */
-#define NM_WOUT_THRESHOLD (64 * NAT_ENTRY_PER_BLOCK)
+/* control the memory footprint threshold (10MB per 1GB ram) */
+#define DEF_RAM_THRESHOLD 10
/* vector size for gang look-up from nat cache that consists of radix tree */
#define NATVEC_SIZE 64
@@ -45,6 +42,7 @@ struct node_info {
struct nat_entry {
struct list_head list; /* for clean or dirty nat list */
bool checkpointed; /* whether it is checkpointed or not */
+ bool fsync_done; /* whether the latest node has fsync mark */
struct node_info ni; /* in-memory node information */
};
@@ -58,9 +56,15 @@ struct nat_entry {
#define nat_set_version(nat, v) (nat->ni.version = v)
#define __set_nat_cache_dirty(nm_i, ne) \
- list_move_tail(&ne->list, &nm_i->dirty_nat_entries);
+ do { \
+ ne->checkpointed = false; \
+ list_move_tail(&ne->list, &nm_i->dirty_nat_entries); \
+ } while (0)
#define __clear_nat_cache_dirty(nm_i, ne) \
- list_move_tail(&ne->list, &nm_i->nat_entries);
+ do { \
+ ne->checkpointed = true; \
+ list_move_tail(&ne->list, &nm_i->nat_entries); \
+ } while (0)
#define inc_node_version(version) (++version)
static inline void node_info_from_raw_nat(struct node_info *ni,
@@ -71,6 +75,20 @@ static inline void node_info_from_raw_nat(struct node_info *ni,
ni->version = raw_ne->version;
}
+static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne,
+ struct node_info *ni)
+{
+ raw_ne->ino = cpu_to_le32(ni->ino);
+ raw_ne->block_addr = cpu_to_le32(ni->blk_addr);
+ raw_ne->version = ni->version;
+}
+
+enum mem_type {
+ FREE_NIDS, /* indicates the free nid list */
+ NAT_ENTRIES, /* indicates the cached nat entry */
+ DIRTY_DENTS /* indicates dirty dentry pages */
+};
+
/*
* For free nid mangement
*/
@@ -224,13 +242,19 @@ static inline block_t next_blkaddr_of_node(struct page *node_page)
* | `- direct node (5 + N => 5 + 2N - 1)
* `- double indirect node (5 + 2N)
* `- indirect node (6 + 2N)
- * `- direct node (x(N + 1))
+ * `- direct node
+ * ......
+ * `- indirect node ((6 + 2N) + x(N + 1))
+ * `- direct node
+ * ......
+ * `- indirect node ((6 + 2N) + (N - 1)(N + 1))
+ * `- direct node
*/
static inline bool IS_DNODE(struct page *node_page)
{
unsigned int ofs = ofs_of_node(node_page);
- if (ofs == XATTR_NODE_OFFSET)
+ if (f2fs_has_xattr_block(ofs))
return false;
if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK ||
@@ -248,7 +272,7 @@ static inline void set_nid(struct page *p, int off, nid_t nid, bool i)
{
struct f2fs_node *rn = F2FS_NODE(p);
- wait_on_page_writeback(p);
+ f2fs_wait_on_page_writeback(p, NODE);
if (i)
rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 51ef5eec33d..a112368a4a8 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -27,21 +27,18 @@ bool space_for_roll_forward(struct f2fs_sb_info *sbi)
static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
nid_t ino)
{
- struct list_head *this;
struct fsync_inode_entry *entry;
- list_for_each(this, head) {
- entry = list_entry(this, struct fsync_inode_entry, list);
+ list_for_each_entry(entry, head, list)
if (entry->inode->i_ino == ino)
return entry;
- }
+
return NULL;
}
static int recover_dentry(struct page *ipage, struct inode *inode)
{
- struct f2fs_node *raw_node = F2FS_NODE(ipage);
- struct f2fs_inode *raw_inode = &(raw_node->i);
+ struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
nid_t pino = le32_to_cpu(raw_inode->i_pino);
struct f2fs_dir_entry *de;
struct qstr name;
@@ -49,51 +46,71 @@ static int recover_dentry(struct page *ipage, struct inode *inode)
struct inode *dir, *einode;
int err = 0;
- dir = check_dirty_dir_inode(F2FS_SB(inode->i_sb), pino);
- if (!dir) {
- dir = f2fs_iget(inode->i_sb, pino);
- if (IS_ERR(dir)) {
- err = PTR_ERR(dir);
- goto out;
- }
- set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT);
- add_dirty_dir_inode(dir);
+ dir = f2fs_iget(inode->i_sb, pino);
+ if (IS_ERR(dir)) {
+ err = PTR_ERR(dir);
+ goto out;
}
name.len = le32_to_cpu(raw_inode->i_namelen);
name.name = raw_inode->i_name;
+
+ if (unlikely(name.len > F2FS_NAME_LEN)) {
+ WARN_ON(1);
+ err = -ENAMETOOLONG;
+ goto out_err;
+ }
retry:
de = f2fs_find_entry(dir, &name, &page);
- if (de && inode->i_ino == le32_to_cpu(de->ino)) {
- kunmap(page);
- f2fs_put_page(page, 0);
- goto out;
- }
+ if (de && inode->i_ino == le32_to_cpu(de->ino))
+ goto out_unmap_put;
if (de) {
einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
if (IS_ERR(einode)) {
WARN_ON(1);
- if (PTR_ERR(einode) == -ENOENT)
+ err = PTR_ERR(einode);
+ if (err == -ENOENT)
err = -EEXIST;
- goto out;
+ goto out_unmap_put;
+ }
+ err = acquire_orphan_inode(F2FS_SB(inode->i_sb));
+ if (err) {
+ iput(einode);
+ goto out_unmap_put;
}
f2fs_delete_entry(de, page, einode);
iput(einode);
goto retry;
}
err = __f2fs_add_link(dir, &name, inode);
+ if (err)
+ goto out_err;
+
+ if (is_inode_flag_set(F2FS_I(dir), FI_DELAY_IPUT)) {
+ iput(dir);
+ } else {
+ add_dirty_dir_inode(dir);
+ set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT);
+ }
+
+ goto out;
+
+out_unmap_put:
+ kunmap(page);
+ f2fs_put_page(page, 0);
+out_err:
+ iput(dir);
out:
- f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode and its dentry: "
- "ino = %x, name = %s, dir = %lx, err = %d",
- ino_of_node(ipage), raw_inode->i_name,
+ f2fs_msg(inode->i_sb, KERN_NOTICE,
+ "%s: ino = %x, name = %s, dir = %lx, err = %d",
+ __func__, ino_of_node(ipage), raw_inode->i_name,
IS_ERR(dir) ? 0 : dir->i_ino, err);
return err;
}
static int recover_inode(struct inode *inode, struct page *node_page)
{
- struct f2fs_node *raw_node = F2FS_NODE(node_page);
- struct f2fs_inode *raw_inode = &(raw_node->i);
+ struct f2fs_inode *raw_inode = F2FS_INODE(node_page);
if (!IS_INODE(node_page))
return 0;
@@ -125,7 +142,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
/* get node pages in the current segment */
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
- blkaddr = START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff;
+ blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
/* read node page */
page = alloc_page(GFP_F2FS_ZERO);
@@ -136,9 +153,9 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
while (1) {
struct fsync_inode_entry *entry;
- err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC);
+ err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC);
if (err)
- goto out;
+ return err;
lock_page(page);
@@ -184,9 +201,10 @@ next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
}
+
unlock_page(page);
-out:
__free_pages(page, 0);
+
return err;
}
@@ -206,13 +224,12 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
{
struct seg_entry *sentry;
unsigned int segno = GET_SEGNO(sbi, blkaddr);
- unsigned short blkoff = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) &
- (sbi->blocks_per_seg - 1);
+ unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
+ struct f2fs_summary_block *sum_node;
struct f2fs_summary sum;
+ struct page *sum_page, *node_page;
nid_t ino, nid;
- void *kaddr;
struct inode *inode;
- struct page *node_page;
unsigned int offset;
block_t bidx;
int i;
@@ -226,18 +243,15 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
struct curseg_info *curseg = CURSEG_I(sbi, i);
if (curseg->segno == segno) {
sum = curseg->sum_blk->entries[blkoff];
- break;
+ goto got_it;
}
}
- if (i > CURSEG_COLD_DATA) {
- struct page *sum_page = get_sum_page(sbi, segno);
- struct f2fs_summary_block *sum_node;
- kaddr = page_address(sum_page);
- sum_node = (struct f2fs_summary_block *)kaddr;
- sum = sum_node->entries[blkoff];
- f2fs_put_page(sum_page, 1);
- }
+ sum_page = get_sum_page(sbi, segno);
+ sum_node = (struct f2fs_summary_block *)page_address(sum_page);
+ sum = sum_node->entries[blkoff];
+ f2fs_put_page(sum_page, 1);
+got_it:
/* Use the locked dnode page and inode */
nid = le32_to_cpu(sum.nid);
if (dn->inode->i_ino == nid) {
@@ -285,28 +299,31 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
struct f2fs_summary sum;
struct node_info ni;
int err = 0, recovered = 0;
- int ilock;
+
+ if (recover_inline_data(inode, page))
+ goto out;
+
+ if (recover_xattr_data(inode, page, blkaddr))
+ goto out;
start = start_bidx_of_node(ofs_of_node(page), fi);
- if (IS_INODE(page))
- end = start + ADDRS_PER_INODE(fi);
- else
- end = start + ADDRS_PER_BLOCK;
+ end = start + ADDRS_PER_PAGE(page, fi);
+
+ f2fs_lock_op(sbi);
- ilock = mutex_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, start, ALLOC_NODE);
if (err) {
- mutex_unlock_op(sbi, ilock);
- return err;
+ f2fs_unlock_op(sbi);
+ goto out;
}
- wait_on_page_writeback(dn.node_page);
+ f2fs_wait_on_page_writeback(dn.node_page, NODE);
get_node_info(sbi, dn.nid, &ni);
- BUG_ON(ni.ino != ino_of_node(page));
- BUG_ON(ofs_of_node(dn.node_page) != ofs_of_node(page));
+ f2fs_bug_on(ni.ino != ino_of_node(page));
+ f2fs_bug_on(ofs_of_node(dn.node_page) != ofs_of_node(page));
for (; start < end; start++) {
block_t src, dest;
@@ -316,9 +333,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR) {
if (src == NULL_ADDR) {
- int err = reserve_new_block(&dn);
+ err = reserve_new_block(&dn);
/* We should not get -ENOSPC */
- BUG_ON(err);
+ f2fs_bug_on(err);
}
/* Check the previous node page having this index */
@@ -349,11 +366,11 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
err:
f2fs_put_dnode(&dn);
- mutex_unlock_op(sbi, ilock);
-
- f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, "
- "recovered_data = %d blocks, err = %d",
- inode->i_ino, recovered, err);
+ f2fs_unlock_op(sbi);
+out:
+ f2fs_msg(sbi->sb, KERN_NOTICE,
+ "recover_data: ino = %lx, recovered = %d blocks, err = %d",
+ inode->i_ino, recovered, err);
return err;
}
@@ -371,7 +388,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
/* read node page */
- page = alloc_page(GFP_NOFS | __GFP_ZERO);
+ page = alloc_page(GFP_F2FS_ZERO);
if (!page)
return -ENOMEM;
@@ -380,9 +397,9 @@ static int recover_data(struct f2fs_sb_info *sbi,
while (1) {
struct fsync_inode_entry *entry;
- err = f2fs_readpage(sbi, page, blkaddr, READ_SYNC);
+ err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC);
if (err)
- goto out;
+ return err;
lock_page(page);
@@ -406,8 +423,8 @@ next:
/* check next segment */
blkaddr = next_blkaddr_of_node(page);
}
+
unlock_page(page);
-out:
__free_pages(page, 0);
if (!err)
@@ -419,16 +436,17 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
{
struct list_head inode_list;
int err;
+ bool need_writecp = false;
fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
- sizeof(struct fsync_inode_entry), NULL);
- if (unlikely(!fsync_entry_slab))
+ sizeof(struct fsync_inode_entry));
+ if (!fsync_entry_slab)
return -ENOMEM;
INIT_LIST_HEAD(&inode_list);
/* step #1: find fsynced inode numbers */
- sbi->por_doing = 1;
+ sbi->por_doing = true;
err = find_fsync_dnodes(sbi, &inode_list);
if (err)
goto out;
@@ -436,14 +454,16 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
if (list_empty(&inode_list))
goto out;
+ need_writecp = true;
+
/* step #2: recover data */
err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
- BUG_ON(!list_empty(&inode_list));
+ f2fs_bug_on(!list_empty(&inode_list));
out:
destroy_fsync_dnodes(&inode_list);
kmem_cache_destroy(fsync_entry_slab);
- sbi->por_doing = 0;
- if (!err)
+ sbi->por_doing = false;
+ if (!err && need_writecp)
write_checkpoint(sbi, false);
return err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 09af9c7b0f5..d04613df710 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -13,13 +13,165 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/prefetch.h>
+#include <linux/kthread.h>
#include <linux/vmalloc.h>
+#include <linux/swap.h>
#include "f2fs.h"
#include "segment.h"
#include "node.h"
#include <trace/events/f2fs.h>
+#define __reverse_ffz(x) __reverse_ffs(~(x))
+
+static struct kmem_cache *discard_entry_slab;
+
+/*
+ * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
+ * MSB and LSB are reversed in a byte by f2fs_set_bit.
+ */
+static inline unsigned long __reverse_ffs(unsigned long word)
+{
+ int num = 0;
+
+#if BITS_PER_LONG == 64
+ if ((word & 0xffffffff) == 0) {
+ num += 32;
+ word >>= 32;
+ }
+#endif
+ if ((word & 0xffff) == 0) {
+ num += 16;
+ word >>= 16;
+ }
+ if ((word & 0xff) == 0) {
+ num += 8;
+ word >>= 8;
+ }
+ if ((word & 0xf0) == 0)
+ num += 4;
+ else
+ word >>= 4;
+ if ((word & 0xc) == 0)
+ num += 2;
+ else
+ word >>= 2;
+ if ((word & 0x2) == 0)
+ num += 1;
+ return num;
+}
+
+/*
+ * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c becasue
+ * f2fs_set_bit makes MSB and LSB reversed in a byte.
+ * Example:
+ * LSB <--> MSB
+ * f2fs_set_bit(0, bitmap) => 0000 0001
+ * f2fs_set_bit(7, bitmap) => 1000 0000
+ */
+static unsigned long __find_rev_next_bit(const unsigned long *addr,
+ unsigned long size, unsigned long offset)
+{
+ const unsigned long *p = addr + BIT_WORD(offset);
+ unsigned long result = offset & ~(BITS_PER_LONG - 1);
+ unsigned long tmp;
+ unsigned long mask, submask;
+ unsigned long quot, rest;
+
+ if (offset >= size)
+ return size;
+
+ size -= result;
+ offset %= BITS_PER_LONG;
+ if (!offset)
+ goto aligned;
+
+ tmp = *(p++);
+ quot = (offset >> 3) << 3;
+ rest = offset & 0x7;
+ mask = ~0UL << quot;
+ submask = (unsigned char)(0xff << rest) >> rest;
+ submask <<= quot;
+ mask &= submask;
+ tmp &= mask;
+ if (size < BITS_PER_LONG)
+ goto found_first;
+ if (tmp)
+ goto found_middle;
+
+ size -= BITS_PER_LONG;
+ result += BITS_PER_LONG;
+aligned:
+ while (size & ~(BITS_PER_LONG-1)) {
+ tmp = *(p++);
+ if (tmp)
+ goto found_middle;
+ result += BITS_PER_LONG;
+ size -= BITS_PER_LONG;
+ }
+ if (!size)
+ return result;
+ tmp = *p;
+found_first:
+ tmp &= (~0UL >> (BITS_PER_LONG - size));
+ if (tmp == 0UL) /* Are any bits set? */
+ return result + size; /* Nope. */
+found_middle:
+ return result + __reverse_ffs(tmp);
+}
+
+static unsigned long __find_rev_next_zero_bit(const unsigned long *addr,
+ unsigned long size, unsigned long offset)
+{
+ const unsigned long *p = addr + BIT_WORD(offset);
+ unsigned long result = offset & ~(BITS_PER_LONG - 1);
+ unsigned long tmp;
+ unsigned long mask, submask;
+ unsigned long quot, rest;
+
+ if (offset >= size)
+ return size;
+
+ size -= result;
+ offset %= BITS_PER_LONG;
+ if (!offset)
+ goto aligned;
+
+ tmp = *(p++);
+ quot = (offset >> 3) << 3;
+ rest = offset & 0x7;
+ mask = ~(~0UL << quot);
+ submask = (unsigned char)~((unsigned char)(0xff << rest) >> rest);
+ submask <<= quot;
+ mask += submask;
+ tmp |= mask;
+ if (size < BITS_PER_LONG)
+ goto found_first;
+ if (~tmp)
+ goto found_middle;
+
+ size -= BITS_PER_LONG;
+ result += BITS_PER_LONG;
+aligned:
+ while (size & ~(BITS_PER_LONG - 1)) {
+ tmp = *(p++);
+ if (~tmp)
+ goto found_middle;
+ result += BITS_PER_LONG;
+ size -= BITS_PER_LONG;
+ }
+ if (!size)
+ return result;
+ tmp = *p;
+
+found_first:
+ tmp |= ~0UL << size;
+ if (tmp == ~0UL) /* Are any bits zero? */
+ return result + size; /* Nope. */
+found_middle:
+ return result + __reverse_ffz(tmp);
+}
+
/*
* This function balances dirty node and dentry pages.
* In addition, it controls garbage collection.
@@ -36,6 +188,114 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi)
}
}
+void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
+{
+ /* check the # of cached NAT entries and prefree segments */
+ if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) ||
+ excess_prefree_segs(sbi))
+ f2fs_sync_fs(sbi->sb, true);
+}
+
+static int issue_flush_thread(void *data)
+{
+ struct f2fs_sb_info *sbi = data;
+ struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info;
+ wait_queue_head_t *q = &fcc->flush_wait_queue;
+repeat:
+ if (kthread_should_stop())
+ return 0;
+
+ spin_lock(&fcc->issue_lock);
+ if (fcc->issue_list) {
+ fcc->dispatch_list = fcc->issue_list;
+ fcc->issue_list = fcc->issue_tail = NULL;
+ }
+ spin_unlock(&fcc->issue_lock);
+
+ if (fcc->dispatch_list) {
+ struct bio *bio = bio_alloc(GFP_NOIO, 0);
+ struct flush_cmd *cmd, *next;
+ int ret;
+
+ bio->bi_bdev = sbi->sb->s_bdev;
+ ret = submit_bio_wait(WRITE_FLUSH, bio);
+
+ for (cmd = fcc->dispatch_list; cmd; cmd = next) {
+ cmd->ret = ret;
+ next = cmd->next;
+ complete(&cmd->wait);
+ }
+ bio_put(bio);
+ fcc->dispatch_list = NULL;
+ }
+
+ wait_event_interruptible(*q,
+ kthread_should_stop() || fcc->issue_list);
+ goto repeat;
+}
+
+int f2fs_issue_flush(struct f2fs_sb_info *sbi)
+{
+ struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info;
+ struct flush_cmd cmd;
+
+ if (!test_opt(sbi, FLUSH_MERGE))
+ return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL);
+
+ init_completion(&cmd.wait);
+ cmd.next = NULL;
+
+ spin_lock(&fcc->issue_lock);
+ if (fcc->issue_list)
+ fcc->issue_tail->next = &cmd;
+ else
+ fcc->issue_list = &cmd;
+ fcc->issue_tail = &cmd;
+ spin_unlock(&fcc->issue_lock);
+
+ if (!fcc->dispatch_list)
+ wake_up(&fcc->flush_wait_queue);
+
+ wait_for_completion(&cmd.wait);
+
+ return cmd.ret;
+}
+
+int create_flush_cmd_control(struct f2fs_sb_info *sbi)
+{
+ dev_t dev = sbi->sb->s_bdev->bd_dev;
+ struct flush_cmd_control *fcc;
+ int err = 0;
+
+ fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL);
+ if (!fcc)
+ return -ENOMEM;
+ spin_lock_init(&fcc->issue_lock);
+ init_waitqueue_head(&fcc->flush_wait_queue);
+ sbi->sm_info->cmd_control_info = fcc;
+ fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
+ "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
+ if (IS_ERR(fcc->f2fs_issue_flush)) {
+ err = PTR_ERR(fcc->f2fs_issue_flush);
+ kfree(fcc);
+ sbi->sm_info->cmd_control_info = NULL;
+ return err;
+ }
+
+ return err;
+}
+
+void destroy_flush_cmd_control(struct f2fs_sb_info *sbi)
+{
+ struct flush_cmd_control *fcc =
+ sbi->sm_info->cmd_control_info;
+
+ if (fcc && fcc->f2fs_issue_flush)
+ kthread_stop(fcc->f2fs_issue_flush);
+ kfree(fcc);
+ sbi->sm_info->cmd_control_info = NULL;
+}
+
static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
enum dirty_type dirty_type)
{
@@ -50,20 +310,10 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
if (dirty_type == DIRTY) {
struct seg_entry *sentry = get_seg_entry(sbi, segno);
- enum dirty_type t = DIRTY_HOT_DATA;
+ enum dirty_type t = sentry->type;
- dirty_type = sentry->type;
-
- if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type]))
- dirty_i->nr_dirty[dirty_type]++;
-
- /* Only one bitmap should be set */
- for (; t <= DIRTY_COLD_NODE; t++) {
- if (t == dirty_type)
- continue;
- if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
- dirty_i->nr_dirty[t]--;
- }
+ if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t]))
+ dirty_i->nr_dirty[t]++;
}
}
@@ -76,12 +326,11 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
dirty_i->nr_dirty[dirty_type]--;
if (dirty_type == DIRTY) {
- enum dirty_type t = DIRTY_HOT_DATA;
+ struct seg_entry *sentry = get_seg_entry(sbi, segno);
+ enum dirty_type t = sentry->type;
- /* clear all the bitmaps */
- for (; t <= DIRTY_COLD_NODE; t++)
- if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
- dirty_i->nr_dirty[t]--;
+ if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t]))
+ dirty_i->nr_dirty[t]--;
if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0)
clear_bit(GET_SECNO(sbi, segno),
@@ -119,6 +368,69 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
mutex_unlock(&dirty_i->seglist_lock);
}
+static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
+ block_t blkstart, block_t blklen)
+{
+ sector_t start = SECTOR_FROM_BLOCK(sbi, blkstart);
+ sector_t len = SECTOR_FROM_BLOCK(sbi, blklen);
+ trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
+ return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0);
+}
+
+void discard_next_dnode(struct f2fs_sb_info *sbi)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
+ block_t blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
+
+ if (f2fs_issue_discard(sbi, blkaddr, 1)) {
+ struct page *page = grab_meta_page(sbi, blkaddr);
+ /* zero-filled page */
+ set_page_dirty(page);
+ f2fs_put_page(page, 1);
+ }
+}
+
+static void add_discard_addrs(struct f2fs_sb_info *sbi,
+ unsigned int segno, struct seg_entry *se)
+{
+ struct list_head *head = &SM_I(sbi)->discard_list;
+ struct discard_entry *new;
+ int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
+ int max_blocks = sbi->blocks_per_seg;
+ unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
+ unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
+ unsigned long dmap[entries];
+ unsigned int start = 0, end = -1;
+ int i;
+
+ if (!test_opt(sbi, DISCARD))
+ return;
+
+ /* zero block will be discarded through the prefree list */
+ if (!se->valid_blocks || se->valid_blocks == max_blocks)
+ return;
+
+ /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */
+ for (i = 0; i < entries; i++)
+ dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
+
+ while (SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
+ start = __find_rev_next_bit(dmap, max_blocks, end + 1);
+ if (start >= max_blocks)
+ break;
+
+ end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
+
+ new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
+ INIT_LIST_HEAD(&new->list);
+ new->blkaddr = START_BLOCK(sbi, segno) + start;
+ new->len = end - start;
+
+ list_add_tail(&new->list, head);
+ SM_I(sbi)->nr_discards += end - start;
+ }
+}
+
/*
* Should call clear_prefree_segments after checkpoint is done.
*/
@@ -141,30 +453,42 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
void clear_prefree_segments(struct f2fs_sb_info *sbi)
{
+ struct list_head *head = &(SM_I(sbi)->discard_list);
+ struct discard_entry *entry, *this;
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- unsigned int segno = -1;
+ unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
unsigned int total_segs = TOTAL_SEGS(sbi);
+ unsigned int start = 0, end = -1;
mutex_lock(&dirty_i->seglist_lock);
+
while (1) {
- segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
- segno + 1);
- if (segno >= total_segs)
+ int i;
+ start = find_next_bit(prefree_map, total_segs, end + 1);
+ if (start >= total_segs)
break;
+ end = find_next_zero_bit(prefree_map, total_segs, start + 1);
+
+ for (i = start; i < end; i++)
+ clear_bit(i, prefree_map);
+
+ dirty_i->nr_dirty[PRE] -= end - start;
- if (test_and_clear_bit(segno, dirty_i->dirty_segmap[PRE]))
- dirty_i->nr_dirty[PRE]--;
-
- /* Let's use trim */
- if (test_opt(sbi, DISCARD))
- blkdev_issue_discard(sbi->sb->s_bdev,
- START_BLOCK(sbi, segno) <<
- sbi->log_sectors_per_block,
- 1 << (sbi->log_sectors_per_block +
- sbi->log_blocks_per_seg),
- GFP_NOFS, 0);
+ if (!test_opt(sbi, DISCARD))
+ continue;
+
+ f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
+ (end - start) << sbi->log_blocks_per_seg);
}
mutex_unlock(&dirty_i->seglist_lock);
+
+ /* send small discards */
+ list_for_each_entry_safe(entry, this, head, list) {
+ f2fs_issue_discard(sbi, entry->blkaddr, entry->len);
+ list_del(&entry->list);
+ SM_I(sbi)->nr_discards -= entry->len;
+ kmem_cache_free(discard_entry_slab, entry);
+ }
}
static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
@@ -193,9 +517,9 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
se = get_seg_entry(sbi, segno);
new_vblocks = se->valid_blocks + del;
- offset = GET_SEGOFF_FROM_SEG0(sbi, blkaddr) & (sbi->blocks_per_seg - 1);
+ offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
- BUG_ON((new_vblocks >> (sizeof(unsigned short) << 3) ||
+ f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) ||
(new_vblocks > sbi->blocks_per_seg)));
se->valid_blocks = new_vblocks;
@@ -222,12 +546,14 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
get_sec_entry(sbi, segno)->valid_blocks += del;
}
-static void refresh_sit_entry(struct f2fs_sb_info *sbi,
- block_t old_blkaddr, block_t new_blkaddr)
+void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new)
{
- update_sit_entry(sbi, new_blkaddr, 1);
- if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
- update_sit_entry(sbi, old_blkaddr, -1);
+ update_sit_entry(sbi, new, 1);
+ if (GET_SEGNO(sbi, old) != NULL_SEGNO)
+ update_sit_entry(sbi, old, -1);
+
+ locate_dirty_segment(sbi, GET_SEGNO(sbi, old));
+ locate_dirty_segment(sbi, GET_SEGNO(sbi, new));
}
void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
@@ -235,7 +561,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
unsigned int segno = GET_SEGNO(sbi, addr);
struct sit_info *sit_i = SIT_I(sbi);
- BUG_ON(addr == NULL_ADDR);
+ f2fs_bug_on(addr == NULL_ADDR);
if (addr == NEW_ADDR)
return;
@@ -267,9 +593,8 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
*/
int npages_for_summary_flush(struct f2fs_sb_info *sbi)
{
- int total_size_bytes = 0;
int valid_sum_count = 0;
- int i, sum_space;
+ int i, sum_in_page;
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
if (sbi->ckpt->alloc_type[i] == SSR)
@@ -278,13 +603,12 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi)
valid_sum_count += curseg_blkoff(sbi, i);
}
- total_size_bytes = valid_sum_count * (SUMMARY_SIZE + 1)
- + sizeof(struct nat_journal) + 2
- + sizeof(struct sit_journal) + 2;
- sum_space = PAGE_CACHE_SIZE - SUM_FOOTER_SIZE;
- if (total_size_bytes < sum_space)
+ sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE -
+ SUM_FOOTER_SIZE) / SUMMARY_SIZE;
+ if (valid_sum_count <= sum_in_page)
return 1;
- else if (total_size_bytes < 2 * sum_space)
+ else if ((valid_sum_count - sum_in_page) <=
+ (PAGE_CACHE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE)
return 2;
return 3;
}
@@ -350,7 +674,7 @@ find_other_zone:
if (dir == ALLOC_RIGHT) {
secno = find_next_zero_bit(free_i->free_secmap,
TOTAL_SECS(sbi), 0);
- BUG_ON(secno >= TOTAL_SECS(sbi));
+ f2fs_bug_on(secno >= TOTAL_SECS(sbi));
} else {
go_left = 1;
left_start = hint - 1;
@@ -366,7 +690,7 @@ find_other_zone:
}
left_start = find_next_zero_bit(free_i->free_secmap,
TOTAL_SECS(sbi), 0);
- BUG_ON(left_start >= TOTAL_SECS(sbi));
+ f2fs_bug_on(left_start >= TOTAL_SECS(sbi));
break;
}
secno = left_start;
@@ -405,7 +729,7 @@ skip_left:
}
got_it:
/* set it as dirty segment in free segmap */
- BUG_ON(test_bit(segno, free_i->free_segmap));
+ f2fs_bug_on(test_bit(segno, free_i->free_segmap));
__set_inuse(sbi, segno);
*newseg = segno;
write_unlock(&free_i->segmap_lock);
@@ -458,13 +782,18 @@ static void __next_free_blkoff(struct f2fs_sb_info *sbi,
struct curseg_info *seg, block_t start)
{
struct seg_entry *se = get_seg_entry(sbi, seg->segno);
- block_t ofs;
- for (ofs = start; ofs < sbi->blocks_per_seg; ofs++) {
- if (!f2fs_test_bit(ofs, se->ckpt_valid_map)
- && !f2fs_test_bit(ofs, se->cur_valid_map))
- break;
- }
- seg->next_blkoff = ofs;
+ int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
+ unsigned long target_map[entries];
+ unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
+ unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
+ int i, pos;
+
+ for (i = 0; i < entries; i++)
+ target_map[i] = ckpt_map[i] | cur_map[i];
+
+ pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start);
+
+ seg->next_blkoff = pos;
}
/*
@@ -550,9 +879,8 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
change_curseg(sbi, type, true);
else
new_curseg(sbi, type, false);
-#ifdef CONFIG_F2FS_STAT_FS
- sbi->segment_count[curseg->alloc_type]++;
-#endif
+
+ stat_inc_seg_type(sbi, curseg);
}
void allocate_new_segments(struct f2fs_sb_info *sbi)
@@ -573,141 +901,6 @@ static const struct segment_allocation default_salloc_ops = {
.allocate_segment = allocate_segment_by_default,
};
-static void f2fs_end_io_write(struct bio *bio, int err)
-{
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
- struct bio_private *p = bio->bi_private;
-
- do {
- struct page *page = bvec->bv_page;
-
- if (--bvec >= bio->bi_io_vec)
- prefetchw(&bvec->bv_page->flags);
- if (!uptodate) {
- SetPageError(page);
- if (page->mapping)
- set_bit(AS_EIO, &page->mapping->flags);
- set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG);
- p->sbi->sb->s_flags |= MS_RDONLY;
- }
- end_page_writeback(page);
- dec_page_count(p->sbi, F2FS_WRITEBACK);
- } while (bvec >= bio->bi_io_vec);
-
- if (p->is_sync)
- complete(p->wait);
- kfree(p);
- bio_put(bio);
-}
-
-struct bio *f2fs_bio_alloc(struct block_device *bdev, int npages)
-{
- struct bio *bio;
-
- /* No failure on bio allocation */
- bio = bio_alloc(GFP_NOIO, npages);
- bio->bi_bdev = bdev;
- bio->bi_private = NULL;
-
- return bio;
-}
-
-static void do_submit_bio(struct f2fs_sb_info *sbi,
- enum page_type type, bool sync)
-{
- int rw = sync ? WRITE_SYNC : WRITE;
- enum page_type btype = type > META ? META : type;
-
- if (type >= META_FLUSH)
- rw = WRITE_FLUSH_FUA;
-
- if (btype == META)
- rw |= REQ_META;
-
- if (sbi->bio[btype]) {
- struct bio_private *p = sbi->bio[btype]->bi_private;
- p->sbi = sbi;
- sbi->bio[btype]->bi_end_io = f2fs_end_io_write;
-
- trace_f2fs_do_submit_bio(sbi->sb, btype, sync, sbi->bio[btype]);
-
- if (type == META_FLUSH) {
- DECLARE_COMPLETION_ONSTACK(wait);
- p->is_sync = true;
- p->wait = &wait;
- submit_bio(rw, sbi->bio[btype]);
- wait_for_completion(&wait);
- } else {
- p->is_sync = false;
- submit_bio(rw, sbi->bio[btype]);
- }
- sbi->bio[btype] = NULL;
- }
-}
-
-void f2fs_submit_bio(struct f2fs_sb_info *sbi, enum page_type type, bool sync)
-{
- down_write(&sbi->bio_sem);
- do_submit_bio(sbi, type, sync);
- up_write(&sbi->bio_sem);
-}
-
-static void submit_write_page(struct f2fs_sb_info *sbi, struct page *page,
- block_t blk_addr, enum page_type type)
-{
- struct block_device *bdev = sbi->sb->s_bdev;
-
- verify_block_addr(sbi, blk_addr);
-
- down_write(&sbi->bio_sem);
-
- inc_page_count(sbi, F2FS_WRITEBACK);
-
- if (sbi->bio[type] && sbi->last_block_in_bio[type] != blk_addr - 1)
- do_submit_bio(sbi, type, false);
-alloc_new:
- if (sbi->bio[type] == NULL) {
- struct bio_private *priv;
-retry:
- priv = kmalloc(sizeof(struct bio_private), GFP_NOFS);
- if (!priv) {
- cond_resched();
- goto retry;
- }
-
- sbi->bio[type] = f2fs_bio_alloc(bdev, max_hw_blocks(sbi));
- sbi->bio[type]->bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr);
- sbi->bio[type]->bi_private = priv;
- /*
- * The end_io will be assigned at the sumbission phase.
- * Until then, let bio_add_page() merge consecutive IOs as much
- * as possible.
- */
- }
-
- if (bio_add_page(sbi->bio[type], page, PAGE_CACHE_SIZE, 0) <
- PAGE_CACHE_SIZE) {
- do_submit_bio(sbi, type, false);
- goto alloc_new;
- }
-
- sbi->last_block_in_bio[type] = blk_addr;
-
- up_write(&sbi->bio_sem);
- trace_f2fs_submit_write_page(page, blk_addr, type);
-}
-
-void f2fs_wait_on_page_writeback(struct page *page,
- enum page_type type, bool sync)
-{
- struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
- if (PageWriteback(page)) {
- f2fs_submit_bio(sbi, type, sync);
- wait_on_page_writeback(page);
- }
-}
-
static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
{
struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -771,20 +964,18 @@ static int __get_segment_type(struct page *page, enum page_type p_type)
return __get_segment_type_4(page, p_type);
}
/* NR_CURSEG_TYPE(6) logs by default */
- BUG_ON(sbi->active_logs != NR_CURSEG_TYPE);
+ f2fs_bug_on(sbi->active_logs != NR_CURSEG_TYPE);
return __get_segment_type_6(page, p_type);
}
-static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
- block_t old_blkaddr, block_t *new_blkaddr,
- struct f2fs_summary *sum, enum page_type p_type)
+void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
+ block_t old_blkaddr, block_t *new_blkaddr,
+ struct f2fs_summary *sum, int type)
{
struct sit_info *sit_i = SIT_I(sbi);
struct curseg_info *curseg;
unsigned int old_cursegno;
- int type;
- type = __get_segment_type(page, p_type);
curseg = CURSEG_I(sbi, type);
mutex_lock(&curseg->curseg_mutex);
@@ -801,66 +992,78 @@ static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
mutex_lock(&sit_i->sentry_lock);
__refresh_next_blkoff(sbi, curseg);
-#ifdef CONFIG_F2FS_STAT_FS
- sbi->block_count[curseg->alloc_type]++;
-#endif
+ stat_inc_block_count(sbi, curseg);
+
+ if (!__has_curseg_space(sbi, type))
+ sit_i->s_ops->allocate_segment(sbi, type, false);
/*
* SIT information should be updated before segment allocation,
* since SSR needs latest valid block information.
*/
refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr);
-
- if (!__has_curseg_space(sbi, type))
- sit_i->s_ops->allocate_segment(sbi, type, false);
-
locate_dirty_segment(sbi, old_cursegno);
- locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
+
mutex_unlock(&sit_i->sentry_lock);
- if (p_type == NODE)
+ if (page && IS_NODESEG(type))
fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
- /* writeout dirty page into bdev */
- submit_write_page(sbi, page, *new_blkaddr, p_type);
-
mutex_unlock(&curseg->curseg_mutex);
}
+static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
+ block_t old_blkaddr, block_t *new_blkaddr,
+ struct f2fs_summary *sum, struct f2fs_io_info *fio)
+{
+ int type = __get_segment_type(page, fio->type);
+
+ allocate_data_block(sbi, page, old_blkaddr, new_blkaddr, sum, type);
+
+ /* writeout dirty page into bdev */
+ f2fs_submit_page_mbio(sbi, page, *new_blkaddr, fio);
+}
+
void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
{
+ struct f2fs_io_info fio = {
+ .type = META,
+ .rw = WRITE_SYNC | REQ_META | REQ_PRIO
+ };
+
set_page_writeback(page);
- submit_write_page(sbi, page, page->index, META);
+ f2fs_submit_page_mbio(sbi, page, page->index, &fio);
}
void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
+ struct f2fs_io_info *fio,
unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr)
{
struct f2fs_summary sum;
set_summary(&sum, nid, 0, 0);
- do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, NODE);
+ do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, fio);
}
-void write_data_page(struct inode *inode, struct page *page,
- struct dnode_of_data *dn, block_t old_blkaddr,
- block_t *new_blkaddr)
+void write_data_page(struct page *page, struct dnode_of_data *dn,
+ block_t *new_blkaddr, struct f2fs_io_info *fio)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
struct f2fs_summary sum;
struct node_info ni;
- BUG_ON(old_blkaddr == NULL_ADDR);
+ f2fs_bug_on(dn->data_blkaddr == NULL_ADDR);
get_node_info(sbi, dn->nid, &ni);
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
- do_write_page(sbi, page, old_blkaddr,
- new_blkaddr, &sum, DATA);
+ do_write_page(sbi, page, dn->data_blkaddr, new_blkaddr, &sum, fio);
}
-void rewrite_data_page(struct f2fs_sb_info *sbi, struct page *page,
- block_t old_blk_addr)
+void rewrite_data_page(struct page *page, block_t old_blkaddr,
+ struct f2fs_io_info *fio)
{
- submit_write_page(sbi, page, old_blk_addr, DATA);
+ struct inode *inode = page->mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ f2fs_submit_page_mbio(sbi, page, old_blkaddr, fio);
}
void recover_data_page(struct f2fs_sb_info *sbi,
@@ -896,14 +1099,11 @@ void recover_data_page(struct f2fs_sb_info *sbi,
change_curseg(sbi, type, true);
}
- curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
- (sbi->blocks_per_seg - 1);
+ curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
__add_sum_entry(sbi, type, sum);
refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
-
locate_dirty_segment(sbi, old_cursegno);
- locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
mutex_unlock(&sit_i->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
@@ -919,6 +1119,10 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
unsigned int segno, old_cursegno;
block_t next_blkaddr = next_blkaddr_of_node(page);
unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr);
+ struct f2fs_io_info fio = {
+ .type = NODE,
+ .rw = WRITE_SYNC,
+ };
curseg = CURSEG_I(sbi, type);
@@ -933,8 +1137,7 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
curseg->next_segno = segno;
change_curseg(sbi, type, true);
}
- curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, new_blkaddr) &
- (sbi->blocks_per_seg - 1);
+ curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
__add_sum_entry(sbi, type, sum);
/* change the current log to the next block addr in advance */
@@ -942,22 +1145,54 @@ void rewrite_node_page(struct f2fs_sb_info *sbi,
curseg->next_segno = next_segno;
change_curseg(sbi, type, true);
}
- curseg->next_blkoff = GET_SEGOFF_FROM_SEG0(sbi, next_blkaddr) &
- (sbi->blocks_per_seg - 1);
+ curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr);
/* rewrite node page */
set_page_writeback(page);
- submit_write_page(sbi, page, new_blkaddr, NODE);
- f2fs_submit_bio(sbi, NODE, true);
+ f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio);
+ f2fs_submit_merged_bio(sbi, NODE, WRITE);
refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
-
locate_dirty_segment(sbi, old_cursegno);
- locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
mutex_unlock(&sit_i->sentry_lock);
mutex_unlock(&curseg->curseg_mutex);
}
+static inline bool is_merged_page(struct f2fs_sb_info *sbi,
+ struct page *page, enum page_type type)
+{
+ enum page_type btype = PAGE_TYPE_OF_BIO(type);
+ struct f2fs_bio_info *io = &sbi->write_io[btype];
+ struct bio_vec *bvec;
+ int i;
+
+ down_read(&io->io_rwsem);
+ if (!io->bio)
+ goto out;
+
+ bio_for_each_segment_all(bvec, io->bio, i) {
+ if (page == bvec->bv_page) {
+ up_read(&io->io_rwsem);
+ return true;
+ }
+ }
+
+out:
+ up_read(&io->io_rwsem);
+ return false;
+}
+
+void f2fs_wait_on_page_writeback(struct page *page,
+ enum page_type type)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
+ if (PageWriteback(page)) {
+ if (is_merged_page(sbi, page, type))
+ f2fs_submit_merged_bio(sbi, type, WRITE);
+ wait_on_page_writeback(page);
+ }
+}
+
static int read_compacted_summaries(struct f2fs_sb_info *sbi)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -1062,9 +1297,12 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
ns->ofs_in_node = 0;
}
} else {
- if (restore_node_summary(sbi, segno, sum)) {
+ int err;
+
+ err = restore_node_summary(sbi, segno, sum);
+ if (err) {
f2fs_put_page(new, 1);
- return -EINVAL;
+ return err;
}
}
}
@@ -1085,6 +1323,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
{
int type = CURSEG_HOT_DATA;
+ int err;
if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) {
/* restore for compacted data summary */
@@ -1093,9 +1332,12 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
type = CURSEG_HOT_NODE;
}
- for (; type <= CURSEG_COLD_NODE; type++)
- if (read_normal_summaries(sbi, type))
- return -EINVAL;
+ for (; type <= CURSEG_COLD_NODE; type++) {
+ err = read_normal_summaries(sbi, type);
+ if (err)
+ return err;
+ }
+
return 0;
}
@@ -1122,8 +1364,6 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
SUM_JOURNAL_SIZE);
written_size += SUM_JOURNAL_SIZE;
- set_page_dirty(page);
-
/* Step 3: write summary entries */
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
unsigned short blkoff;
@@ -1142,18 +1382,20 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr)
summary = (struct f2fs_summary *)(kaddr + written_size);
*summary = seg_i->sum_blk->entries[j];
written_size += SUMMARY_SIZE;
- set_page_dirty(page);
if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE -
SUM_FOOTER_SIZE)
continue;
+ set_page_dirty(page);
f2fs_put_page(page, 1);
page = NULL;
}
}
- if (page)
+ if (page) {
+ set_page_dirty(page);
f2fs_put_page(page, 1);
+ }
}
static void write_normal_summaries(struct f2fs_sb_info *sbi,
@@ -1239,7 +1481,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
/* get current sit block page without lock */
src_page = get_meta_page(sbi, src_off);
dst_page = grab_meta_page(sbi, dst_off);
- BUG_ON(PageDirty(src_page));
+ f2fs_bug_on(PageDirty(src_page));
src_addr = page_address(src_page);
dst_addr = page_address(dst_page);
@@ -1271,9 +1513,9 @@ static bool flush_sits_in_journal(struct f2fs_sb_info *sbi)
__mark_sit_entry_dirty(sbi, segno);
}
update_sits_in_cursum(sum, -sits_in_cursum(sum));
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/*
@@ -1308,6 +1550,10 @@ void flush_sit_entries(struct f2fs_sb_info *sbi)
sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
+ /* add discard candidates */
+ if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards)
+ add_discard_addrs(sbi, segno, se);
+
if (flushed)
goto to_sit_page;
@@ -1479,36 +1725,48 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
struct sit_info *sit_i = SIT_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
struct f2fs_summary_block *sum = curseg->sum_blk;
- unsigned int start;
+ int sit_blk_cnt = SIT_BLK_CNT(sbi);
+ unsigned int i, start, end;
+ unsigned int readed, start_blk = 0;
+ int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
- for (start = 0; start < TOTAL_SEGS(sbi); start++) {
- struct seg_entry *se = &sit_i->sentries[start];
- struct f2fs_sit_block *sit_blk;
- struct f2fs_sit_entry sit;
- struct page *page;
- int i;
-
- mutex_lock(&curseg->curseg_mutex);
- for (i = 0; i < sits_in_cursum(sum); i++) {
- if (le32_to_cpu(segno_in_journal(sum, i)) == start) {
- sit = sit_in_journal(sum, i);
- mutex_unlock(&curseg->curseg_mutex);
- goto got_it;
+ do {
+ readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT);
+
+ start = start_blk * sit_i->sents_per_block;
+ end = (start_blk + readed) * sit_i->sents_per_block;
+
+ for (; start < end && start < TOTAL_SEGS(sbi); start++) {
+ struct seg_entry *se = &sit_i->sentries[start];
+ struct f2fs_sit_block *sit_blk;
+ struct f2fs_sit_entry sit;
+ struct page *page;
+
+ mutex_lock(&curseg->curseg_mutex);
+ for (i = 0; i < sits_in_cursum(sum); i++) {
+ if (le32_to_cpu(segno_in_journal(sum, i))
+ == start) {
+ sit = sit_in_journal(sum, i);
+ mutex_unlock(&curseg->curseg_mutex);
+ goto got_it;
+ }
}
- }
- mutex_unlock(&curseg->curseg_mutex);
- page = get_current_sit_page(sbi, start);
- sit_blk = (struct f2fs_sit_block *)page_address(page);
- sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
- f2fs_put_page(page, 1);
+ mutex_unlock(&curseg->curseg_mutex);
+
+ page = get_current_sit_page(sbi, start);
+ sit_blk = (struct f2fs_sit_block *)page_address(page);
+ sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)];
+ f2fs_put_page(page, 1);
got_it:
- check_block_count(sbi, start, &sit);
- seg_info_from_raw_sit(se, &sit);
- if (sbi->segs_per_sec > 1) {
- struct sec_entry *e = get_sec_entry(sbi, start);
- e->valid_blocks += se->valid_blocks;
+ check_block_count(sbi, start, &sit);
+ seg_info_from_raw_sit(se, &sit);
+ if (sbi->segs_per_sec > 1) {
+ struct sec_entry *e = get_sec_entry(sbi, start);
+ e->valid_blocks += se->valid_blocks;
+ }
}
- }
+ start_blk += readed;
+ } while (start_blk < sit_blk_cnt);
}
static void init_free_segmap(struct f2fs_sb_info *sbi)
@@ -1628,8 +1886,6 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
/* init sm info */
sbi->sm_info = sm_info;
- INIT_LIST_HEAD(&sm_info->wblist_head);
- spin_lock_init(&sm_info->wblist_lock);
sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr);
sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr);
sm_info->segment_count = le32_to_cpu(raw_super->segment_count);
@@ -1637,6 +1893,20 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count);
sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main);
sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
+ sm_info->rec_prefree_segments = sm_info->main_segments *
+ DEF_RECLAIM_PREFREE_SEGMENTS / 100;
+ sm_info->ipu_policy = F2FS_IPU_DISABLE;
+ sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
+
+ INIT_LIST_HEAD(&sm_info->discard_list);
+ sm_info->nr_discards = 0;
+ sm_info->max_discards = 0;
+
+ if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) {
+ err = create_flush_cmd_control(sbi);
+ if (err)
+ return err;
+ }
err = build_sit_info(sbi);
if (err)
@@ -1744,6 +2014,10 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
void destroy_segment_manager(struct f2fs_sb_info *sbi)
{
struct f2fs_sm_info *sm_info = SM_I(sbi);
+
+ if (!sm_info)
+ return;
+ destroy_flush_cmd_control(sbi);
destroy_dirty_segmap(sbi);
destroy_curseg(sbi);
destroy_free_segmap(sbi);
@@ -1751,3 +2025,17 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi)
sbi->sm_info = NULL;
kfree(sm_info);
}
+
+int __init create_segment_manager_caches(void)
+{
+ discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
+ sizeof(struct discard_entry));
+ if (!discard_entry_slab)
+ return -ENOMEM;
+ return 0;
+}
+
+void destroy_segment_manager_caches(void)
+{
+ kmem_cache_destroy(discard_entry_slab);
+}
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index bdd10eab8c4..7091204680f 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -14,17 +14,14 @@
#define NULL_SEGNO ((unsigned int)(~0))
#define NULL_SECNO ((unsigned int)(~0))
+#define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */
+
/* L: Logical segment # in volume, R: Relative segment # in main area */
#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno)
#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno)
-#define IS_DATASEG(t) \
- ((t == CURSEG_HOT_DATA) || (t == CURSEG_COLD_DATA) || \
- (t == CURSEG_WARM_DATA))
-
-#define IS_NODESEG(t) \
- ((t == CURSEG_HOT_NODE) || (t == CURSEG_COLD_NODE) || \
- (t == CURSEG_WARM_NODE))
+#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA)
+#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE)
#define IS_CURSEG(sbi, seg) \
((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \
@@ -60,6 +57,9 @@
((blk_addr) - SM_I(sbi)->seg0_blkaddr)
#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
(GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
+#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \
+ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1))
+
#define GET_SEGNO(sbi, blk_addr) \
(((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \
NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \
@@ -81,22 +81,19 @@
(segno / SIT_ENTRY_PER_BLOCK)
#define START_SEGNO(sit_i, segno) \
(SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK)
+#define SIT_BLK_CNT(sbi) \
+ ((TOTAL_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK)
#define f2fs_bitmap_size(nr) \
(BITS_TO_LONGS(nr) * sizeof(unsigned long))
#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments)
#define TOTAL_SECS(sbi) (sbi->total_sections)
#define SECTOR_FROM_BLOCK(sbi, blk_addr) \
- (blk_addr << ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE))
+ (((sector_t)blk_addr) << (sbi)->log_sectors_per_block)
#define SECTOR_TO_BLOCK(sbi, sectors) \
- (sectors >> ((sbi)->log_blocksize - F2FS_LOG_SECTOR_SIZE))
-
-/* during checkpoint, bio_private is used to synchronize the last bio */
-struct bio_private {
- struct f2fs_sb_info *sbi;
- bool is_sync;
- void *wait;
-};
+ (sectors >> (sbi)->log_sectors_per_block)
+#define MAX_BIO_BLOCKS(max_hw_blocks) \
+ (min((int)max_hw_blocks, BIO_MAX_PAGES))
/*
* indicate a block allocation direction: RIGHT and LEFT.
@@ -383,26 +380,12 @@ static inline void get_sit_bitmap(struct f2fs_sb_info *sbi,
static inline block_t written_block_count(struct f2fs_sb_info *sbi)
{
- struct sit_info *sit_i = SIT_I(sbi);
- block_t vblocks;
-
- mutex_lock(&sit_i->sentry_lock);
- vblocks = sit_i->written_valid_blocks;
- mutex_unlock(&sit_i->sentry_lock);
-
- return vblocks;
+ return SIT_I(sbi)->written_valid_blocks;
}
static inline unsigned int free_segments(struct f2fs_sb_info *sbi)
{
- struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int free_segs;
-
- read_lock(&free_i->segmap_lock);
- free_segs = free_i->free_segments;
- read_unlock(&free_i->segmap_lock);
-
- return free_segs;
+ return FREE_I(sbi)->free_segments;
}
static inline int reserved_segments(struct f2fs_sb_info *sbi)
@@ -412,14 +395,7 @@ static inline int reserved_segments(struct f2fs_sb_info *sbi)
static inline unsigned int free_sections(struct f2fs_sb_info *sbi)
{
- struct free_segmap_info *free_i = FREE_I(sbi);
- unsigned int free_secs;
-
- read_lock(&free_i->segmap_lock);
- free_secs = free_i->free_sections;
- read_unlock(&free_i->segmap_lock);
-
- return free_secs;
+ return FREE_I(sbi)->free_sections;
}
static inline unsigned int prefree_segments(struct f2fs_sb_info *sbi)
@@ -454,8 +430,8 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
static inline bool need_SSR(struct f2fs_sb_info *sbi)
{
- return ((prefree_segments(sbi) / sbi->segs_per_sec)
- + free_sections(sbi) < overprovision_sections(sbi));
+ return (prefree_segments(sbi) / sbi->segs_per_sec)
+ + free_sections(sbi) < overprovision_sections(sbi);
}
static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
@@ -463,33 +439,71 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
- if (sbi->por_doing)
+ if (unlikely(sbi->por_doing))
return false;
- return ((free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs +
- reserved_sections(sbi)));
+ return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs +
+ reserved_sections(sbi));
+}
+
+static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi)
+{
+ return prefree_segments(sbi) > SM_I(sbi)->rec_prefree_segments;
}
static inline int utilization(struct f2fs_sb_info *sbi)
{
- return div_u64((u64)valid_user_blocks(sbi) * 100, sbi->user_block_count);
+ return div_u64((u64)valid_user_blocks(sbi) * 100,
+ sbi->user_block_count);
}
/*
* Sometimes f2fs may be better to drop out-of-place update policy.
- * So, if fs utilization is over MIN_IPU_UTIL, then f2fs tries to write
- * data in the original place likewise other traditional file systems.
- * But, currently set 100 in percentage, which means it is disabled.
- * See below need_inplace_update().
+ * And, users can control the policy through sysfs entries.
+ * There are five policies with triggering conditions as follows.
+ * F2FS_IPU_FORCE - all the time,
+ * F2FS_IPU_SSR - if SSR mode is activated,
+ * F2FS_IPU_UTIL - if FS utilization is over threashold,
+ * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over
+ * threashold,
+ * F2FS_IPUT_DISABLE - disable IPU. (=default option)
*/
-#define MIN_IPU_UTIL 100
+#define DEF_MIN_IPU_UTIL 70
+
+enum {
+ F2FS_IPU_FORCE,
+ F2FS_IPU_SSR,
+ F2FS_IPU_UTIL,
+ F2FS_IPU_SSR_UTIL,
+ F2FS_IPU_DISABLE,
+};
+
static inline bool need_inplace_update(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+
+ /* IPU can be done only for the user data */
if (S_ISDIR(inode->i_mode))
return false;
- if (need_SSR(sbi) && utilization(sbi) > MIN_IPU_UTIL)
+
+ switch (SM_I(sbi)->ipu_policy) {
+ case F2FS_IPU_FORCE:
return true;
+ case F2FS_IPU_SSR:
+ if (need_SSR(sbi))
+ return true;
+ break;
+ case F2FS_IPU_UTIL:
+ if (utilization(sbi) > SM_I(sbi)->min_ipu_util)
+ return true;
+ break;
+ case F2FS_IPU_SSR_UTIL:
+ if (need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util)
+ return true;
+ break;
+ case F2FS_IPU_DISABLE:
+ break;
+ }
return false;
}
@@ -513,16 +527,13 @@ static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type)
return curseg->next_blkoff;
}
+#ifdef CONFIG_F2FS_CHECK_FS
static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
{
unsigned int end_segno = SM_I(sbi)->segment_count - 1;
BUG_ON(segno > end_segno);
}
-/*
- * This function is used for only debugging.
- * NOTE: In future, we have to remove this function.
- */
static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
{
struct f2fs_sm_info *sm_info = SM_I(sbi);
@@ -541,8 +552,9 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
{
struct f2fs_sm_info *sm_info = SM_I(sbi);
unsigned int end_segno = sm_info->segment_count - 1;
+ bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false;
int valid_blocks = 0;
- int i;
+ int cur_pos = 0, next_pos;
/* check segment usage */
BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg);
@@ -551,11 +563,26 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
BUG_ON(segno > end_segno);
/* check bitmap with valid block count */
- for (i = 0; i < sbi->blocks_per_seg; i++)
- if (f2fs_test_bit(i, raw_sit->valid_map))
- valid_blocks++;
+ do {
+ if (is_valid) {
+ next_pos = find_next_zero_bit_le(&raw_sit->valid_map,
+ sbi->blocks_per_seg,
+ cur_pos);
+ valid_blocks += next_pos - cur_pos;
+ } else
+ next_pos = find_next_bit_le(&raw_sit->valid_map,
+ sbi->blocks_per_seg,
+ cur_pos);
+ cur_pos = next_pos;
+ is_valid = !is_valid;
+ } while (cur_pos < sbi->blocks_per_seg);
BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks);
}
+#else
+#define check_seg_range(sbi, segno)
+#define verify_block_addr(sbi, blk_addr)
+#define check_block_count(sbi, segno, raw_sit)
+#endif
static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
unsigned int start)
@@ -637,3 +664,46 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
struct request_queue *q = bdev_get_queue(bdev);
return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q));
}
+
+/*
+ * It is very important to gather dirty pages and write at once, so that we can
+ * submit a big bio without interfering other data writes.
+ * By default, 512 pages for directory data,
+ * 512 pages (2MB) * 3 for three types of nodes, and
+ * max_bio_blocks for meta are set.
+ */
+static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
+{
+ if (type == DATA)
+ return sbi->blocks_per_seg;
+ else if (type == NODE)
+ return 3 * sbi->blocks_per_seg;
+ else if (type == META)
+ return MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+ else
+ return 0;
+}
+
+/*
+ * When writing pages, it'd better align nr_to_write for segment size.
+ */
+static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
+ struct writeback_control *wbc)
+{
+ long nr_to_write, desired;
+
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ return 0;
+
+ nr_to_write = wbc->nr_to_write;
+
+ if (type == DATA)
+ desired = 4096;
+ else if (type == NODE)
+ desired = 3 * max_hw_blocks(sbi);
+ else
+ desired = MAX_BIO_BLOCKS(max_hw_blocks(sbi));
+
+ wbc->nr_to_write = desired;
+ return desired - nr_to_write;
+}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 13d0a0fe49d..8f96d9372ad 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -43,11 +43,15 @@ enum {
Opt_disable_roll_forward,
Opt_discard,
Opt_noheap,
+ Opt_user_xattr,
Opt_nouser_xattr,
+ Opt_acl,
Opt_noacl,
Opt_active_logs,
Opt_disable_ext_identify,
Opt_inline_xattr,
+ Opt_inline_data,
+ Opt_flush_merge,
Opt_err,
};
@@ -56,33 +60,59 @@ static match_table_t f2fs_tokens = {
{Opt_disable_roll_forward, "disable_roll_forward"},
{Opt_discard, "discard"},
{Opt_noheap, "no_heap"},
+ {Opt_user_xattr, "user_xattr"},
{Opt_nouser_xattr, "nouser_xattr"},
+ {Opt_acl, "acl"},
{Opt_noacl, "noacl"},
{Opt_active_logs, "active_logs=%u"},
{Opt_disable_ext_identify, "disable_ext_identify"},
{Opt_inline_xattr, "inline_xattr"},
+ {Opt_inline_data, "inline_data"},
+ {Opt_flush_merge, "flush_merge"},
{Opt_err, NULL},
};
/* Sysfs support for f2fs */
+enum {
+ GC_THREAD, /* struct f2fs_gc_thread */
+ SM_INFO, /* struct f2fs_sm_info */
+ NM_INFO, /* struct f2fs_nm_info */
+ F2FS_SBI, /* struct f2fs_sb_info */
+};
+
struct f2fs_attr {
struct attribute attr;
ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *);
ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *,
const char *, size_t);
+ int struct_type;
int offset;
};
+static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
+{
+ if (struct_type == GC_THREAD)
+ return (unsigned char *)sbi->gc_thread;
+ else if (struct_type == SM_INFO)
+ return (unsigned char *)SM_I(sbi);
+ else if (struct_type == NM_INFO)
+ return (unsigned char *)NM_I(sbi);
+ else if (struct_type == F2FS_SBI)
+ return (unsigned char *)sbi;
+ return NULL;
+}
+
static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
- struct f2fs_gc_kthread *gc_kth = sbi->gc_thread;
+ unsigned char *ptr = NULL;
unsigned int *ui;
- if (!gc_kth)
+ ptr = __struct_ptr(sbi, a->struct_type);
+ if (!ptr)
return -EINVAL;
- ui = (unsigned int *)(((char *)gc_kth) + a->offset);
+ ui = (unsigned int *)(ptr + a->offset);
return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
}
@@ -91,15 +121,16 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a,
struct f2fs_sb_info *sbi,
const char *buf, size_t count)
{
- struct f2fs_gc_kthread *gc_kth = sbi->gc_thread;
+ unsigned char *ptr;
unsigned long t;
unsigned int *ui;
ssize_t ret;
- if (!gc_kth)
+ ptr = __struct_ptr(sbi, a->struct_type);
+ if (!ptr)
return -EINVAL;
- ui = (unsigned int *)(((char *)gc_kth) + a->offset);
+ ui = (unsigned int *)(ptr + a->offset);
ret = kstrtoul(skip_spaces(buf), 0, &t);
if (ret < 0)
@@ -135,21 +166,31 @@ static void f2fs_sb_release(struct kobject *kobj)
complete(&sbi->s_kobj_unregister);
}
-#define F2FS_ATTR_OFFSET(_name, _mode, _show, _store, _elname) \
+#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \
static struct f2fs_attr f2fs_attr_##_name = { \
.attr = {.name = __stringify(_name), .mode = _mode }, \
.show = _show, \
.store = _store, \
- .offset = offsetof(struct f2fs_gc_kthread, _elname), \
+ .struct_type = _struct_type, \
+ .offset = _offset \
}
-#define F2FS_RW_ATTR(name, elname) \
- F2FS_ATTR_OFFSET(name, 0644, f2fs_sbi_show, f2fs_sbi_store, elname)
-
-F2FS_RW_ATTR(gc_min_sleep_time, min_sleep_time);
-F2FS_RW_ATTR(gc_max_sleep_time, max_sleep_time);
-F2FS_RW_ATTR(gc_no_gc_sleep_time, no_gc_sleep_time);
-F2FS_RW_ATTR(gc_idle, gc_idle);
+#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \
+ F2FS_ATTR_OFFSET(struct_type, name, 0644, \
+ f2fs_sbi_show, f2fs_sbi_store, \
+ offsetof(struct struct_name, elname))
+
+F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time);
+F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time);
+F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
+F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
+F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
@@ -157,6 +198,13 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(gc_max_sleep_time),
ATTR_LIST(gc_no_gc_sleep_time),
ATTR_LIST(gc_idle),
+ ATTR_LIST(reclaim_segments),
+ ATTR_LIST(max_small_discards),
+ ATTR_LIST(ipu_policy),
+ ATTR_LIST(min_ipu_util),
+ ATTR_LIST(max_victim_search),
+ ATTR_LIST(dir_level),
+ ATTR_LIST(ram_thresh),
NULL,
};
@@ -217,9 +265,9 @@ static int parse_options(struct super_block *sb, char *options)
if (!name)
return -ENOMEM;
- if (!strncmp(name, "on", 2))
+ if (strlen(name) == 2 && !strncmp(name, "on", 2))
set_opt(sbi, BG_GC);
- else if (!strncmp(name, "off", 3))
+ else if (strlen(name) == 3 && !strncmp(name, "off", 3))
clear_opt(sbi, BG_GC);
else {
kfree(name);
@@ -237,6 +285,9 @@ static int parse_options(struct super_block *sb, char *options)
set_opt(sbi, NOHEAP);
break;
#ifdef CONFIG_F2FS_FS_XATTR
+ case Opt_user_xattr:
+ set_opt(sbi, XATTR_USER);
+ break;
case Opt_nouser_xattr:
clear_opt(sbi, XATTR_USER);
break;
@@ -244,6 +295,10 @@ static int parse_options(struct super_block *sb, char *options)
set_opt(sbi, INLINE_XATTR);
break;
#else
+ case Opt_user_xattr:
+ f2fs_msg(sb, KERN_INFO,
+ "user_xattr options not supported");
+ break;
case Opt_nouser_xattr:
f2fs_msg(sb, KERN_INFO,
"nouser_xattr options not supported");
@@ -254,10 +309,16 @@ static int parse_options(struct super_block *sb, char *options)
break;
#endif
#ifdef CONFIG_F2FS_FS_POSIX_ACL
+ case Opt_acl:
+ set_opt(sbi, POSIX_ACL);
+ break;
case Opt_noacl:
clear_opt(sbi, POSIX_ACL);
break;
#else
+ case Opt_acl:
+ f2fs_msg(sb, KERN_INFO, "acl options not supported");
+ break;
case Opt_noacl:
f2fs_msg(sb, KERN_INFO, "noacl options not supported");
break;
@@ -272,6 +333,12 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_disable_ext_identify:
set_opt(sbi, DISABLE_EXT_IDENTIFY);
break;
+ case Opt_inline_data:
+ set_opt(sbi, INLINE_DATA);
+ break;
+ case Opt_flush_merge:
+ set_opt(sbi, FLUSH_MERGE);
+ break;
default:
f2fs_msg(sb, KERN_ERR,
"Unrecognized mount option \"%s\" or missing value",
@@ -286,7 +353,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
{
struct f2fs_inode_info *fi;
- fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_NOFS | __GFP_ZERO);
+ fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_F2FS_ZERO);
if (!fi)
return NULL;
@@ -298,12 +365,16 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
fi->i_current_depth = 1;
fi->i_advise = 0;
rwlock_init(&fi->ext.ext_lock);
+ init_rwsem(&fi->i_sem);
set_inode_flag(fi, FI_NEW_INODE);
if (test_opt(F2FS_SB(sb), INLINE_XATTR))
set_inode_flag(fi, FI_INLINE_XATTR);
+ /* Will be used by directory only */
+ fi->i_dir_level = F2FS_SB(sb)->dir_level;
+
return &fi->vfs_inode;
}
@@ -355,7 +426,9 @@ static void f2fs_put_super(struct super_block *sb)
f2fs_destroy_stats(sbi);
stop_gc_thread(sbi);
- write_checkpoint(sbi, true);
+ /* We don't need to do checkpoint when it's clean */
+ if (sbi->s_dirty && get_pages(sbi, F2FS_DIRTY_NODES))
+ write_checkpoint(sbi, true);
iput(sbi->node_inode);
iput(sbi->meta_inode);
@@ -441,7 +514,7 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
{
struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
- if (!(root->d_sb->s_flags & MS_RDONLY) && test_opt(sbi, BG_GC))
+ if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC))
seq_printf(seq, ",background_gc=%s", "on");
else
seq_printf(seq, ",background_gc=%s", "off");
@@ -467,7 +540,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
#endif
if (test_opt(sbi, DISABLE_EXT_IDENTIFY))
seq_puts(seq, ",disable_ext_identify");
-
+ if (test_opt(sbi, INLINE_DATA))
+ seq_puts(seq, ",inline_data");
+ if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE))
+ seq_puts(seq, ",flush_merge");
seq_printf(seq, ",active_logs=%u", sbi->active_logs);
return 0;
@@ -477,16 +553,26 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset)
{
struct super_block *sb = seq->private;
struct f2fs_sb_info *sbi = F2FS_SB(sb);
- unsigned int total_segs = le32_to_cpu(sbi->raw_super->segment_count_main);
+ unsigned int total_segs =
+ le32_to_cpu(sbi->raw_super->segment_count_main);
int i;
+ seq_puts(seq, "format: segment_type|valid_blocks\n"
+ "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n");
+
for (i = 0; i < total_segs; i++) {
- seq_printf(seq, "%u", get_valid_blocks(sbi, i, 1));
- if (i != 0 && (i % 10) == 0)
- seq_puts(seq, "\n");
+ struct seg_entry *se = get_seg_entry(sbi, i);
+
+ if ((i % 10) == 0)
+ seq_printf(seq, "%-5d", i);
+ seq_printf(seq, "%d|%-3u", se->type,
+ get_valid_blocks(sbi, i, 1));
+ if ((i % 10) == 9 || i == (total_segs - 1))
+ seq_putc(seq, '\n');
else
- seq_puts(seq, " ");
+ seq_putc(seq, ' ');
}
+
return 0;
}
@@ -508,6 +594,10 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct f2fs_mount_info org_mount_opt;
int err, active_logs;
+ bool need_restart_gc = false;
+ bool need_stop_gc = false;
+
+ sync_filesystem(sb);
/*
* Save the old mount options in case we
@@ -523,7 +613,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
/*
* Previous and new state of filesystem is RO,
- * so no point in checking GC conditions.
+ * so skip checking GC and FLUSH_MERGE conditions.
*/
if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY))
goto skip;
@@ -537,18 +627,40 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
if (sbi->gc_thread) {
stop_gc_thread(sbi);
f2fs_sync_fs(sb, 1);
+ need_restart_gc = true;
}
} else if (test_opt(sbi, BG_GC) && !sbi->gc_thread) {
err = start_gc_thread(sbi);
if (err)
goto restore_opts;
+ need_stop_gc = true;
+ }
+
+ /*
+ * We stop issue flush thread if FS is mounted as RO
+ * or if flush_merge is not passed in mount option.
+ */
+ if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) {
+ destroy_flush_cmd_control(sbi);
+ } else if (test_opt(sbi, FLUSH_MERGE) &&
+ !sbi->sm_info->cmd_control_info) {
+ err = create_flush_cmd_control(sbi);
+ if (err)
+ goto restore_gc;
}
skip:
/* Update the POSIXACL Flag */
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
(test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0);
return 0;
-
+restore_gc:
+ if (need_restart_gc) {
+ if (start_gc_thread(sbi))
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "background gc thread is stop");
+ } else if (need_stop_gc) {
+ stop_gc_thread(sbi);
+ }
restore_opts:
sbi->mount_opt = org_mount_opt;
sbi->active_logs = active_logs;
@@ -577,7 +689,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct inode *inode;
- if (ino < F2FS_ROOT_INO(sbi))
+ if (check_nid_range(sbi, ino))
return ERR_PTR(-ESTALE);
/*
@@ -588,7 +700,7 @@ static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
inode = f2fs_iget(sb, ino);
if (IS_ERR(inode))
return ERR_CAST(inode);
- if (generation && inode->i_generation != generation) {
+ if (unlikely(generation && inode->i_generation != generation)) {
/* we didn't find the right inode.. */
iput(inode);
return ERR_PTR(-ESTALE);
@@ -691,10 +803,10 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi)
fsmeta += le32_to_cpu(ckpt->rsvd_segment_count);
fsmeta += le32_to_cpu(raw_super->segment_count_ssa);
- if (fsmeta >= total)
+ if (unlikely(fsmeta >= total))
return 1;
- if (is_set_ckpt_flags(ckpt, CP_ERROR_FLAG)) {
+ if (unlikely(is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) {
f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck");
return 1;
}
@@ -722,35 +834,56 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
sbi->node_ino_num = le32_to_cpu(raw_super->node_ino);
sbi->meta_ino_num = le32_to_cpu(raw_super->meta_ino);
sbi->cur_victim_sec = NULL_SECNO;
+ sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
for (i = 0; i < NR_COUNT_TYPE; i++)
atomic_set(&sbi->nr_pages[i], 0);
+
+ sbi->dir_level = DEF_DIR_LEVEL;
}
-static int validate_superblock(struct super_block *sb,
- struct f2fs_super_block **raw_super,
- struct buffer_head **raw_super_buf, sector_t block)
+/*
+ * Read f2fs raw super block.
+ * Because we have two copies of super block, so read the first one at first,
+ * if the first one is invalid, move to read the second one.
+ */
+static int read_raw_super_block(struct super_block *sb,
+ struct f2fs_super_block **raw_super,
+ struct buffer_head **raw_super_buf)
{
- const char *super = (block == 0 ? "first" : "second");
+ int block = 0;
- /* read f2fs raw super block */
+retry:
*raw_super_buf = sb_bread(sb, block);
if (!*raw_super_buf) {
- f2fs_msg(sb, KERN_ERR, "unable to read %s superblock",
- super);
- return -EIO;
+ f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock",
+ block + 1);
+ if (block == 0) {
+ block++;
+ goto retry;
+ } else {
+ return -EIO;
+ }
}
*raw_super = (struct f2fs_super_block *)
((char *)(*raw_super_buf)->b_data + F2FS_SUPER_OFFSET);
/* sanity checking of raw super */
- if (!sanity_check_raw_super(sb, *raw_super))
- return 0;
+ if (sanity_check_raw_super(sb, *raw_super)) {
+ brelse(*raw_super_buf);
+ f2fs_msg(sb, KERN_ERR,
+ "Can't find valid F2FS filesystem in %dth superblock",
+ block + 1);
+ if (block == 0) {
+ block++;
+ goto retry;
+ } else {
+ return -EINVAL;
+ }
+ }
- f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem "
- "in %s superblock", super);
- return -EINVAL;
+ return 0;
}
static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
@@ -768,19 +901,15 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
return -ENOMEM;
/* set a block size */
- if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) {
+ if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) {
f2fs_msg(sb, KERN_ERR, "unable to set blocksize");
goto free_sbi;
}
- err = validate_superblock(sb, &raw_super, &raw_super_buf, 0);
- if (err) {
- brelse(raw_super_buf);
- /* check secondary superblock when primary failed */
- err = validate_superblock(sb, &raw_super, &raw_super_buf, 1);
- if (err)
- goto free_sb_buf;
- }
+ err = read_raw_super_block(sb, &raw_super, &raw_super_buf);
+ if (err)
+ goto free_sbi;
+
sb->s_fs_info = sbi;
/* init some FS parameters */
sbi->active_logs = NR_CURSEG_TYPE;
@@ -818,12 +947,21 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
mutex_init(&sbi->gc_mutex);
mutex_init(&sbi->writepages);
mutex_init(&sbi->cp_mutex);
- for (i = 0; i < NR_GLOBAL_LOCKS; i++)
- mutex_init(&sbi->fs_lock[i]);
mutex_init(&sbi->node_write);
- sbi->por_doing = 0;
+ sbi->por_doing = false;
spin_lock_init(&sbi->stat_lock);
- init_rwsem(&sbi->bio_sem);
+
+ init_rwsem(&sbi->read_io.io_rwsem);
+ sbi->read_io.sbi = sbi;
+ sbi->read_io.bio = NULL;
+ for (i = 0; i < NR_PAGE_TYPE; i++) {
+ init_rwsem(&sbi->write_io[i].io_rwsem);
+ sbi->write_io[i].sbi = sbi;
+ sbi->write_io[i].bio = NULL;
+ }
+
+ init_rwsem(&sbi->cp_rwsem);
+ init_waitqueue_head(&sbi->cp_wait);
init_sb_info(sbi);
/* get an inode for meta space */
@@ -886,9 +1024,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
}
/* if there are nt orphan nodes free them */
- err = -EINVAL;
- if (recover_orphan_inodes(sbi))
- goto free_node_inode;
+ recover_orphan_inodes(sbi);
/* read root inode and dentry */
root = f2fs_iget(sb, F2FS_ROOT_INO(sbi));
@@ -897,8 +1033,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
err = PTR_ERR(root);
goto free_node_inode;
}
- if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size)
+ if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
+ err = -EINVAL;
goto free_root_inode;
+ }
sb->s_root = d_make_root(root); /* allocate root dentry */
if (!sb->s_root) {
@@ -906,28 +1044,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
goto free_root_inode;
}
- /* recover fsynced data */
- if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
- err = recover_fsync_data(sbi);
- if (err)
- f2fs_msg(sb, KERN_ERR,
- "Cannot recover all fsync data errno=%ld", err);
- }
-
- /*
- * If filesystem is not mounted as read-only then
- * do start the gc_thread.
- */
- if (!(sb->s_flags & MS_RDONLY)) {
- /* After POR, we can run background GC thread.*/
- err = start_gc_thread(sbi);
- if (err)
- goto fail;
- }
-
err = f2fs_build_stats(sbi);
if (err)
- goto fail;
+ goto free_root_inode;
if (f2fs_proc_root)
sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
@@ -949,11 +1068,36 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL,
"%s", sb->s_id);
if (err)
- goto fail;
+ goto free_proc;
+ /* recover fsynced data */
+ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
+ err = recover_fsync_data(sbi);
+ if (err)
+ f2fs_msg(sb, KERN_ERR,
+ "Cannot recover all fsync data errno=%ld", err);
+ }
+
+ /*
+ * If filesystem is not mounted as read-only then
+ * do start the gc_thread.
+ */
+ if (!(sb->s_flags & MS_RDONLY)) {
+ /* After POR, we can run background GC thread.*/
+ err = start_gc_thread(sbi);
+ if (err)
+ goto free_kobj;
+ }
return 0;
-fail:
- stop_gc_thread(sbi);
+
+free_kobj:
+ kobject_del(&sbi->s_kobj);
+free_proc:
+ if (sbi->s_proc) {
+ remove_proc_entry("segment_info", sbi->s_proc);
+ remove_proc_entry(sb->s_id, f2fs_proc_root);
+ }
+ f2fs_destroy_stats(sbi);
free_root_inode:
dput(sb->s_root);
sb->s_root = NULL;
@@ -993,8 +1137,8 @@ MODULE_ALIAS_FS("f2fs");
static int __init init_inodecache(void)
{
f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
- sizeof(struct f2fs_inode_info), NULL);
- if (f2fs_inode_cachep == NULL)
+ sizeof(struct f2fs_inode_info));
+ if (!f2fs_inode_cachep)
return -ENOMEM;
return 0;
}
@@ -1019,9 +1163,12 @@ static int __init init_f2fs_fs(void)
err = create_node_manager_caches();
if (err)
goto free_inodecache;
- err = create_gc_caches();
+ err = create_segment_manager_caches();
if (err)
goto free_node_manager_caches;
+ err = create_gc_caches();
+ if (err)
+ goto free_segment_manager_caches;
err = create_checkpoint_caches();
if (err)
goto free_gc_caches;
@@ -1043,6 +1190,8 @@ free_checkpoint_caches:
destroy_checkpoint_caches();
free_gc_caches:
destroy_gc_caches();
+free_segment_manager_caches:
+ destroy_segment_manager_caches();
free_node_manager_caches:
destroy_node_manager_caches();
free_inodecache:
@@ -1058,6 +1207,7 @@ static void __exit exit_f2fs_fs(void)
unregister_filesystem(&f2fs_fs_type);
destroy_checkpoint_caches();
destroy_gc_caches();
+ destroy_segment_manager_caches();
destroy_node_manager_caches();
destroy_inodecache();
kset_unregister(f2fs_kset);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 1ac8a5f6e38..8bea941ee30 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -21,11 +21,12 @@
#include <linux/rwsem.h>
#include <linux/f2fs_fs.h>
#include <linux/security.h>
+#include <linux/posix_acl_xattr.h>
#include "f2fs.h"
#include "xattr.h"
static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
+ size_t list_size, const char *name, size_t len, int type)
{
struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
int total_len, prefix_len = 0;
@@ -52,11 +53,11 @@ static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
return -EINVAL;
}
- total_len = prefix_len + name_len + 1;
+ total_len = prefix_len + len + 1;
if (list && total_len <= list_size) {
memcpy(list, prefix, prefix_len);
- memcpy(list + prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
+ memcpy(list + prefix_len, name, len);
+ list[prefix_len + len] = '\0';
}
return total_len;
}
@@ -107,11 +108,12 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
if (strcmp(name, "") == 0)
return -EINVAL;
- return f2fs_setxattr(dentry->d_inode, type, name, value, size, NULL);
+ return f2fs_setxattr(dentry->d_inode, type, name,
+ value, size, NULL, flags);
}
static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
+ size_t list_size, const char *name, size_t len, int type)
{
const char *xname = F2FS_SYSTEM_ADVISE_PREFIX;
size_t size;
@@ -163,7 +165,7 @@ static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY,
xattr->name, xattr->value,
- xattr->value_len, (struct page *)page);
+ xattr->value_len, (struct page *)page, 0);
if (err < 0)
break;
}
@@ -213,8 +215,8 @@ const struct xattr_handler f2fs_xattr_security_handler = {
static const struct xattr_handler *f2fs_xattr_handler_map[] = {
[F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler,
#ifdef CONFIG_F2FS_FS_POSIX_ACL
- [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &f2fs_xattr_acl_access_handler,
- [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &f2fs_xattr_acl_default_handler,
+ [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
+ [F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler,
#endif
[F2FS_XATTR_INDEX_TRUSTED] = &f2fs_xattr_trusted_handler,
#ifdef CONFIG_F2FS_FS_SECURITY
@@ -226,8 +228,8 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = {
const struct xattr_handler *f2fs_xattr_handlers[] = {
&f2fs_xattr_user_handler,
#ifdef CONFIG_F2FS_FS_POSIX_ACL
- &f2fs_xattr_acl_access_handler,
- &f2fs_xattr_acl_default_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
&f2fs_xattr_trusted_handler,
#ifdef CONFIG_F2FS_FS_SECURITY
@@ -237,26 +239,26 @@ const struct xattr_handler *f2fs_xattr_handlers[] = {
NULL,
};
-static inline const struct xattr_handler *f2fs_xattr_handler(int name_index)
+static inline const struct xattr_handler *f2fs_xattr_handler(int index)
{
const struct xattr_handler *handler = NULL;
- if (name_index > 0 && name_index < ARRAY_SIZE(f2fs_xattr_handler_map))
- handler = f2fs_xattr_handler_map[name_index];
+ if (index > 0 && index < ARRAY_SIZE(f2fs_xattr_handler_map))
+ handler = f2fs_xattr_handler_map[index];
return handler;
}
-static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int name_index,
- size_t name_len, const char *name)
+static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index,
+ size_t len, const char *name)
{
struct f2fs_xattr_entry *entry;
list_for_each_xattr(entry, base_addr) {
- if (entry->e_name_index != name_index)
+ if (entry->e_name_index != index)
continue;
- if (entry->e_name_len != name_len)
+ if (entry->e_name_len != len)
continue;
- if (!memcmp(entry->e_name, name, name_len))
+ if (!memcmp(entry->e_name, name, len))
break;
}
return entry;
@@ -271,7 +273,7 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage)
inline_size = inline_xattr_size(inode);
- txattr_addr = kzalloc(inline_size + size, GFP_KERNEL);
+ txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO);
if (!txattr_addr)
return NULL;
@@ -343,6 +345,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
if (ipage) {
inline_addr = inline_xattr_addr(ipage);
+ f2fs_wait_on_page_writeback(ipage, NODE);
} else {
page = get_node_page(sbi, inode->i_ino);
if (IS_ERR(page)) {
@@ -350,6 +353,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
return PTR_ERR(page);
}
inline_addr = inline_xattr_addr(page);
+ f2fs_wait_on_page_writeback(page, NODE);
}
memcpy(inline_addr, txattr_addr, inline_size);
f2fs_put_page(page, 1);
@@ -369,7 +373,8 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
alloc_nid_failed(sbi, new_nid);
return PTR_ERR(xpage);
}
- BUG_ON(new_nid);
+ f2fs_bug_on(new_nid);
+ f2fs_wait_on_page_writeback(xpage, NODE);
} else {
struct dnode_of_data dn;
set_new_dnode(&dn, inode, NULL, NULL, new_nid);
@@ -392,40 +397,43 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
return 0;
}
-int f2fs_getxattr(struct inode *inode, int name_index, const char *name,
+int f2fs_getxattr(struct inode *inode, int index, const char *name,
void *buffer, size_t buffer_size)
{
struct f2fs_xattr_entry *entry;
void *base_addr;
int error = 0;
- size_t value_len, name_len;
+ size_t size, len;
if (name == NULL)
return -EINVAL;
- name_len = strlen(name);
+
+ len = strlen(name);
+ if (len > F2FS_NAME_LEN)
+ return -ERANGE;
base_addr = read_all_xattrs(inode, NULL);
if (!base_addr)
return -ENOMEM;
- entry = __find_xattr(base_addr, name_index, name_len, name);
+ entry = __find_xattr(base_addr, index, len, name);
if (IS_XATTR_LAST_ENTRY(entry)) {
error = -ENODATA;
goto cleanup;
}
- value_len = le16_to_cpu(entry->e_value_size);
+ size = le16_to_cpu(entry->e_value_size);
- if (buffer && value_len > buffer_size) {
+ if (buffer && size > buffer_size) {
error = -ERANGE;
goto cleanup;
}
if (buffer) {
char *pval = entry->e_name + entry->e_name_len;
- memcpy(buffer, pval, value_len);
+ memcpy(buffer, pval, size);
}
- error = value_len;
+ error = size;
cleanup:
kzfree(base_addr);
@@ -469,16 +477,15 @@ cleanup:
return error;
}
-int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
- const void *value, size_t value_len, struct page *ipage)
+static int __f2fs_setxattr(struct inode *inode, int index,
+ const char *name, const void *value, size_t size,
+ struct page *ipage, int flags)
{
- struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct f2fs_inode_info *fi = F2FS_I(inode);
struct f2fs_xattr_entry *here, *last;
void *base_addr;
int found, newsize;
- size_t name_len;
- int ilock;
+ size_t len;
__u32 new_hsize;
int error = -ENOMEM;
@@ -486,32 +493,35 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
return -EINVAL;
if (value == NULL)
- value_len = 0;
+ size = 0;
- name_len = strlen(name);
+ len = strlen(name);
- if (name_len > F2FS_NAME_LEN || value_len > MAX_VALUE_LEN(inode))
+ if (len > F2FS_NAME_LEN || size > MAX_VALUE_LEN(inode))
return -ERANGE;
- f2fs_balance_fs(sbi);
-
- ilock = mutex_lock_op(sbi);
-
base_addr = read_all_xattrs(inode, ipage);
if (!base_addr)
goto exit;
/* find entry with wanted name. */
- here = __find_xattr(base_addr, name_index, name_len, name);
+ here = __find_xattr(base_addr, index, len, name);
found = IS_XATTR_LAST_ENTRY(here) ? 0 : 1;
- last = here;
+ if ((flags & XATTR_REPLACE) && !found) {
+ error = -ENODATA;
+ goto exit;
+ } else if ((flags & XATTR_CREATE) && found) {
+ error = -EEXIST;
+ goto exit;
+ }
+
+ last = here;
while (!IS_XATTR_LAST_ENTRY(last))
last = XATTR_NEXT_ENTRY(last);
- newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) +
- name_len + value_len);
+ newsize = XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + len + size);
/* 1. Check space */
if (value) {
@@ -522,9 +532,9 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
*/
free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr);
if (found)
- free = free - ENTRY_SIZE(here);
+ free = free + ENTRY_SIZE(here);
- if (free < newsize) {
+ if (unlikely(free < newsize)) {
error = -ENOSPC;
goto exit;
}
@@ -554,12 +564,12 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
* We just write new entry.
*/
memset(last, 0, newsize);
- last->e_name_index = name_index;
- last->e_name_len = name_len;
- memcpy(last->e_name, name, name_len);
- pval = last->e_name + name_len;
- memcpy(pval, value, value_len);
- last->e_value_size = cpu_to_le16(value_len);
+ last->e_name_index = index;
+ last->e_name_len = len;
+ memcpy(last->e_name, name, len);
+ pval = last->e_name + len;
+ memcpy(pval, value, size);
+ last->e_value_size = cpu_to_le16(size);
new_hsize += newsize;
}
@@ -578,7 +588,29 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name,
else
update_inode_page(inode);
exit:
- mutex_unlock_op(sbi, ilock);
kzfree(base_addr);
return error;
}
+
+int f2fs_setxattr(struct inode *inode, int index, const char *name,
+ const void *value, size_t size,
+ struct page *ipage, int flags)
+{
+ struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
+ int err;
+
+ /* this case is only from init_inode_metadata */
+ if (ipage)
+ return __f2fs_setxattr(inode, index, name, value,
+ size, ipage, flags);
+ f2fs_balance_fs(sbi);
+
+ f2fs_lock_op(sbi);
+ /* protect xattr_ver */
+ down_write(&F2FS_I(inode)->i_sem);
+ err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags);
+ up_write(&F2FS_I(inode)->i_sem);
+ f2fs_unlock_op(sbi);
+
+ return err;
+}
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 02a08fb88a1..34ab7dbcf5e 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -108,26 +108,24 @@ struct f2fs_xattr_entry {
#ifdef CONFIG_F2FS_FS_XATTR
extern const struct xattr_handler f2fs_xattr_user_handler;
extern const struct xattr_handler f2fs_xattr_trusted_handler;
-extern const struct xattr_handler f2fs_xattr_acl_access_handler;
-extern const struct xattr_handler f2fs_xattr_acl_default_handler;
extern const struct xattr_handler f2fs_xattr_advise_handler;
extern const struct xattr_handler f2fs_xattr_security_handler;
extern const struct xattr_handler *f2fs_xattr_handlers[];
extern int f2fs_setxattr(struct inode *, int, const char *,
- const void *, size_t, struct page *);
+ const void *, size_t, struct page *, int);
extern int f2fs_getxattr(struct inode *, int, const char *, void *, size_t);
extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
#else
#define f2fs_xattr_handlers NULL
-static inline int f2fs_setxattr(struct inode *inode, int name_index,
- const char *name, const void *value, size_t value_len)
+static inline int f2fs_setxattr(struct inode *inode, int index,
+ const char *name, const void *value, size_t size, int flags)
{
return -EOPNOTSUPP;
}
-static inline int f2fs_getxattr(struct inode *inode, int name_index,
+static inline int f2fs_getxattr(struct inode *inode, int index,
const char *name, void *buffer, size_t buffer_size)
{
return -EOPNOTSUPP;
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 4241e6f39e8..e0c4ba39a37 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -52,7 +52,8 @@ struct fat_mount_options {
usefree:1, /* Use free_clusters for FAT32 */
tz_set:1, /* Filesystem timestamps' offset set */
rodir:1, /* allow ATTR_RO for directory */
- discard:1; /* Issue discard requests on deletions */
+ discard:1, /* Issue discard requests on deletions */
+ dos1xfloppy:1; /* Assume default BPB for DOS 1.x floppies */
};
#define FAT_HASH_BITS 8
@@ -102,6 +103,7 @@ struct msdos_sb_info {
struct hlist_head dir_hashtable[FAT_HASH_SIZE];
unsigned int dirty; /* fs state before mount */
+ struct rcu_head rcu;
};
#define FAT_CACHE_VALID 0 /* special case for valid cache */
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 9b104f54305..85f79a89e74 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -170,10 +170,10 @@ int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
const struct file_operations fat_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.release = fat_file_release,
.unlocked_ioctl = fat_generic_ioctl,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 0062da21dd8..756aead10d9 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -35,9 +35,71 @@
#define CONFIG_FAT_DEFAULT_IOCHARSET ""
#endif
+#define KB_IN_SECTORS 2
+
+/*
+ * A deserialized copy of the on-disk structure laid out in struct
+ * fat_boot_sector.
+ */
+struct fat_bios_param_block {
+ u16 fat_sector_size;
+ u8 fat_sec_per_clus;
+ u16 fat_reserved;
+ u8 fat_fats;
+ u16 fat_dir_entries;
+ u16 fat_sectors;
+ u16 fat_fat_length;
+ u32 fat_total_sect;
+
+ u8 fat16_state;
+ u32 fat16_vol_id;
+
+ u32 fat32_length;
+ u32 fat32_root_cluster;
+ u16 fat32_info_sector;
+ u8 fat32_state;
+ u32 fat32_vol_id;
+};
+
static int fat_default_codepage = CONFIG_FAT_DEFAULT_CODEPAGE;
static char fat_default_iocharset[] = CONFIG_FAT_DEFAULT_IOCHARSET;
+static struct fat_floppy_defaults {
+ unsigned nr_sectors;
+ unsigned sec_per_clus;
+ unsigned dir_entries;
+ unsigned media;
+ unsigned fat_length;
+} floppy_defaults[] = {
+{
+ .nr_sectors = 160 * KB_IN_SECTORS,
+ .sec_per_clus = 1,
+ .dir_entries = 64,
+ .media = 0xFE,
+ .fat_length = 1,
+},
+{
+ .nr_sectors = 180 * KB_IN_SECTORS,
+ .sec_per_clus = 1,
+ .dir_entries = 64,
+ .media = 0xFC,
+ .fat_length = 2,
+},
+{
+ .nr_sectors = 320 * KB_IN_SECTORS,
+ .sec_per_clus = 2,
+ .dir_entries = 112,
+ .media = 0xFF,
+ .fat_length = 1,
+},
+{
+ .nr_sectors = 360 * KB_IN_SECTORS,
+ .sec_per_clus = 2,
+ .dir_entries = 112,
+ .media = 0xFD,
+ .fat_length = 2,
+},
+};
static int fat_add_cluster(struct inode *inode)
{
@@ -185,12 +247,13 @@ static int fat_write_end(struct file *file, struct address_space *mapping,
}
static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+ struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
+ size_t count = iov_iter_count(iter);
ssize_t ret;
if (rw == WRITE) {
@@ -203,7 +266,7 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
*
* Return 0, and fallback to normal buffered write.
*/
- loff_t size = offset + iov_length(iov, nr_segs);
+ loff_t size = offset + count;
if (MSDOS_I(inode)->mmu_private < size)
return 0;
}
@@ -212,10 +275,9 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
* FAT need to use the DIO_LOCKING for avoiding the race
* condition of fat_get_block() and ->truncate().
*/
- ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
- fat_get_block);
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, fat_get_block);
if (ret < 0 && (rw & WRITE))
- fat_write_failed(mapping, offset + iov_length(iov, nr_segs));
+ fat_write_failed(mapping, offset + count);
return ret;
}
@@ -359,7 +421,7 @@ struct inode *fat_iget(struct super_block *sb, loff_t i_pos)
static int is_exec(unsigned char *extension)
{
- unsigned char *exe_extensions = "EXECOMBAT", *walk;
+ unsigned char exe_extensions[] = "EXECOMBAT", *walk;
for (walk = exe_extensions; *walk; walk += 3)
if (!strncmp(extension, walk, 3))
@@ -490,7 +552,7 @@ EXPORT_SYMBOL_GPL(fat_build_inode);
static void fat_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
if (!inode->i_nlink) {
inode->i_size = 0;
fat_truncate_blocks(inode, 0);
@@ -548,6 +610,16 @@ static void fat_set_state(struct super_block *sb,
brelse(bh);
}
+static void delayed_free(struct rcu_head *p)
+{
+ struct msdos_sb_info *sbi = container_of(p, struct msdos_sb_info, rcu);
+ unload_nls(sbi->nls_disk);
+ unload_nls(sbi->nls_io);
+ if (sbi->options.iocharset != fat_default_iocharset)
+ kfree(sbi->options.iocharset);
+ kfree(sbi);
+}
+
static void fat_put_super(struct super_block *sb)
{
struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -557,14 +629,7 @@ static void fat_put_super(struct super_block *sb)
iput(sbi->fsinfo_inode);
iput(sbi->fat_inode);
- unload_nls(sbi->nls_disk);
- unload_nls(sbi->nls_io);
-
- if (sbi->options.iocharset != fat_default_iocharset)
- kfree(sbi->options.iocharset);
-
- sb->s_fs_info = NULL;
- kfree(sbi);
+ call_rcu(&sbi->rcu, delayed_free);
}
static struct kmem_cache *fat_inode_cachep;
@@ -632,6 +697,8 @@ static int fat_remount(struct super_block *sb, int *flags, char *data)
struct msdos_sb_info *sbi = MSDOS_SB(sb);
*flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME);
+ sync_filesystem(sb);
+
/* make sure we update state on remount. */
new_rdonly = *flags & MS_RDONLY;
if (new_rdonly != (sb->s_flags & MS_RDONLY)) {
@@ -848,6 +915,8 @@ static int fat_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",nfs=stale_rw");
if (opts->discard)
seq_puts(m, ",discard");
+ if (opts->dos1xfloppy)
+ seq_puts(m, ",dos1xfloppy");
return 0;
}
@@ -862,7 +931,7 @@ enum {
Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes,
Opt_obsolete, Opt_flush, Opt_tz_utc, Opt_rodir, Opt_err_cont,
Opt_err_panic, Opt_err_ro, Opt_discard, Opt_nfs, Opt_time_offset,
- Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err,
+ Opt_nfs_stale_rw, Opt_nfs_nostale_ro, Opt_err, Opt_dos1xfloppy,
};
static const match_table_t fat_tokens = {
@@ -895,6 +964,7 @@ static const match_table_t fat_tokens = {
{Opt_nfs_stale_rw, "nfs"},
{Opt_nfs_stale_rw, "nfs=stale_rw"},
{Opt_nfs_nostale_ro, "nfs=nostale_ro"},
+ {Opt_dos1xfloppy, "dos1xfloppy"},
{Opt_obsolete, "conv=binary"},
{Opt_obsolete, "conv=text"},
{Opt_obsolete, "conv=auto"},
@@ -1097,6 +1167,9 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
case Opt_nfs_nostale_ro:
opts->nfs = FAT_NFS_NOSTALE_RO;
break;
+ case Opt_dos1xfloppy:
+ opts->dos1xfloppy = 1;
+ break;
/* msdos specific */
case Opt_dots:
@@ -1242,6 +1315,169 @@ static unsigned long calc_fat_clusters(struct super_block *sb)
return sbi->fat_length * sb->s_blocksize * 8 / sbi->fat_bits;
}
+static bool fat_bpb_is_zero(struct fat_boot_sector *b)
+{
+ if (get_unaligned_le16(&b->sector_size))
+ return false;
+ if (b->sec_per_clus)
+ return false;
+ if (b->reserved)
+ return false;
+ if (b->fats)
+ return false;
+ if (get_unaligned_le16(&b->dir_entries))
+ return false;
+ if (get_unaligned_le16(&b->sectors))
+ return false;
+ if (b->media)
+ return false;
+ if (b->fat_length)
+ return false;
+ if (b->secs_track)
+ return false;
+ if (b->heads)
+ return false;
+ return true;
+}
+
+static int fat_read_bpb(struct super_block *sb, struct fat_boot_sector *b,
+ int silent, struct fat_bios_param_block *bpb)
+{
+ int error = -EINVAL;
+
+ /* Read in BPB ... */
+ memset(bpb, 0, sizeof(*bpb));
+ bpb->fat_sector_size = get_unaligned_le16(&b->sector_size);
+ bpb->fat_sec_per_clus = b->sec_per_clus;
+ bpb->fat_reserved = le16_to_cpu(b->reserved);
+ bpb->fat_fats = b->fats;
+ bpb->fat_dir_entries = get_unaligned_le16(&b->dir_entries);
+ bpb->fat_sectors = get_unaligned_le16(&b->sectors);
+ bpb->fat_fat_length = le16_to_cpu(b->fat_length);
+ bpb->fat_total_sect = le32_to_cpu(b->total_sect);
+
+ bpb->fat16_state = b->fat16.state;
+ bpb->fat16_vol_id = get_unaligned_le32(b->fat16.vol_id);
+
+ bpb->fat32_length = le32_to_cpu(b->fat32.length);
+ bpb->fat32_root_cluster = le32_to_cpu(b->fat32.root_cluster);
+ bpb->fat32_info_sector = le16_to_cpu(b->fat32.info_sector);
+ bpb->fat32_state = b->fat32.state;
+ bpb->fat32_vol_id = get_unaligned_le32(b->fat32.vol_id);
+
+ /* Validate this looks like a FAT filesystem BPB */
+ if (!bpb->fat_reserved) {
+ if (!silent)
+ fat_msg(sb, KERN_ERR,
+ "bogus number of reserved sectors");
+ goto out;
+ }
+ if (!bpb->fat_fats) {
+ if (!silent)
+ fat_msg(sb, KERN_ERR, "bogus number of FAT structure");
+ goto out;
+ }
+
+ /*
+ * Earlier we checked here that b->secs_track and b->head are nonzero,
+ * but it turns out valid FAT filesystems can have zero there.
+ */
+
+ if (!fat_valid_media(b->media)) {
+ if (!silent)
+ fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)",
+ (unsigned)b->media);
+ goto out;
+ }
+
+ if (!is_power_of_2(bpb->fat_sector_size)
+ || (bpb->fat_sector_size < 512)
+ || (bpb->fat_sector_size > 4096)) {
+ if (!silent)
+ fat_msg(sb, KERN_ERR, "bogus logical sector size %u",
+ (unsigned)bpb->fat_sector_size);
+ goto out;
+ }
+
+ if (!is_power_of_2(bpb->fat_sec_per_clus)) {
+ if (!silent)
+ fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u",
+ (unsigned)bpb->fat_sec_per_clus);
+ goto out;
+ }
+
+ error = 0;
+
+out:
+ return error;
+}
+
+static int fat_read_static_bpb(struct super_block *sb,
+ struct fat_boot_sector *b, int silent,
+ struct fat_bios_param_block *bpb)
+{
+ static const char *notdos1x = "This doesn't look like a DOS 1.x volume";
+
+ struct fat_floppy_defaults *fdefaults = NULL;
+ int error = -EINVAL;
+ sector_t bd_sects;
+ unsigned i;
+
+ bd_sects = i_size_read(sb->s_bdev->bd_inode) / SECTOR_SIZE;
+
+ /* 16-bit DOS 1.x reliably wrote bootstrap short-jmp code */
+ if (b->ignored[0] != 0xeb || b->ignored[2] != 0x90) {
+ if (!silent)
+ fat_msg(sb, KERN_ERR,
+ "%s; no bootstrapping code", notdos1x);
+ goto out;
+ }
+
+ /*
+ * If any value in this region is non-zero, it isn't archaic
+ * DOS.
+ */
+ if (!fat_bpb_is_zero(b)) {
+ if (!silent)
+ fat_msg(sb, KERN_ERR,
+ "%s; DOS 2.x BPB is non-zero", notdos1x);
+ goto out;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(floppy_defaults); i++) {
+ if (floppy_defaults[i].nr_sectors == bd_sects) {
+ fdefaults = &floppy_defaults[i];
+ break;
+ }
+ }
+
+ if (fdefaults == NULL) {
+ if (!silent)
+ fat_msg(sb, KERN_WARNING,
+ "This looks like a DOS 1.x volume, but isn't a recognized floppy size (%llu sectors)",
+ (u64)bd_sects);
+ goto out;
+ }
+
+ if (!silent)
+ fat_msg(sb, KERN_INFO,
+ "This looks like a DOS 1.x volume; assuming default BPB values");
+
+ memset(bpb, 0, sizeof(*bpb));
+ bpb->fat_sector_size = SECTOR_SIZE;
+ bpb->fat_sec_per_clus = fdefaults->sec_per_clus;
+ bpb->fat_reserved = 1;
+ bpb->fat_fats = 2;
+ bpb->fat_dir_entries = fdefaults->dir_entries;
+ bpb->fat_sectors = fdefaults->nr_sectors;
+ bpb->fat_fat_length = fdefaults->fat_length;
+
+ error = 0;
+
+out:
+ return error;
+}
+
/*
* Read the super block of an MS-DOS FS.
*/
@@ -1251,12 +1487,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
struct inode *root_inode = NULL, *fat_inode = NULL;
struct inode *fsinfo_inode = NULL;
struct buffer_head *bh;
- struct fat_boot_sector *b;
+ struct fat_bios_param_block bpb;
struct msdos_sb_info *sbi;
u16 logical_sector_size;
u32 total_sectors, total_clusters, fat_clusters, rootdir_sectors;
int debug;
- unsigned int media;
long error;
char buf[50];
@@ -1293,100 +1528,72 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
goto out_fail;
}
- b = (struct fat_boot_sector *) bh->b_data;
- if (!b->reserved) {
- if (!silent)
- fat_msg(sb, KERN_ERR, "bogus number of reserved sectors");
- brelse(bh);
- goto out_invalid;
- }
- if (!b->fats) {
- if (!silent)
- fat_msg(sb, KERN_ERR, "bogus number of FAT structure");
- brelse(bh);
- goto out_invalid;
- }
-
- /*
- * Earlier we checked here that b->secs_track and b->head are nonzero,
- * but it turns out valid FAT filesystems can have zero there.
- */
+ error = fat_read_bpb(sb, (struct fat_boot_sector *)bh->b_data, silent,
+ &bpb);
+ if (error == -EINVAL && sbi->options.dos1xfloppy)
+ error = fat_read_static_bpb(sb,
+ (struct fat_boot_sector *)bh->b_data, silent, &bpb);
+ brelse(bh);
- media = b->media;
- if (!fat_valid_media(media)) {
- if (!silent)
- fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)",
- media);
- brelse(bh);
- goto out_invalid;
- }
- logical_sector_size = get_unaligned_le16(&b->sector_size);
- if (!is_power_of_2(logical_sector_size)
- || (logical_sector_size < 512)
- || (logical_sector_size > 4096)) {
- if (!silent)
- fat_msg(sb, KERN_ERR, "bogus logical sector size %u",
- logical_sector_size);
- brelse(bh);
+ if (error == -EINVAL)
goto out_invalid;
- }
- sbi->sec_per_clus = b->sec_per_clus;
- if (!is_power_of_2(sbi->sec_per_clus)) {
- if (!silent)
- fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u",
- sbi->sec_per_clus);
- brelse(bh);
- goto out_invalid;
- }
+ else if (error)
+ goto out_fail;
+ logical_sector_size = bpb.fat_sector_size;
+ sbi->sec_per_clus = bpb.fat_sec_per_clus;
+
+ error = -EIO;
if (logical_sector_size < sb->s_blocksize) {
fat_msg(sb, KERN_ERR, "logical sector size too small for device"
" (logical sector size = %u)", logical_sector_size);
- brelse(bh);
goto out_fail;
}
+
if (logical_sector_size > sb->s_blocksize) {
- brelse(bh);
+ struct buffer_head *bh_resize;
if (!sb_set_blocksize(sb, logical_sector_size)) {
fat_msg(sb, KERN_ERR, "unable to set blocksize %u",
logical_sector_size);
goto out_fail;
}
- bh = sb_bread(sb, 0);
- if (bh == NULL) {
+
+ /* Verify that the larger boot sector is fully readable */
+ bh_resize = sb_bread(sb, 0);
+ if (bh_resize == NULL) {
fat_msg(sb, KERN_ERR, "unable to read boot sector"
" (logical sector size = %lu)",
sb->s_blocksize);
goto out_fail;
}
- b = (struct fat_boot_sector *) bh->b_data;
+ brelse(bh_resize);
}
mutex_init(&sbi->s_lock);
sbi->cluster_size = sb->s_blocksize * sbi->sec_per_clus;
sbi->cluster_bits = ffs(sbi->cluster_size) - 1;
- sbi->fats = b->fats;
+ sbi->fats = bpb.fat_fats;
sbi->fat_bits = 0; /* Don't know yet */
- sbi->fat_start = le16_to_cpu(b->reserved);
- sbi->fat_length = le16_to_cpu(b->fat_length);
+ sbi->fat_start = bpb.fat_reserved;
+ sbi->fat_length = bpb.fat_fat_length;
sbi->root_cluster = 0;
sbi->free_clusters = -1; /* Don't know yet */
sbi->free_clus_valid = 0;
sbi->prev_free = FAT_START_ENT;
sb->s_maxbytes = 0xffffffff;
- if (!sbi->fat_length && b->fat32.length) {
+ if (!sbi->fat_length && bpb.fat32_length) {
struct fat_boot_fsinfo *fsinfo;
struct buffer_head *fsinfo_bh;
/* Must be FAT32 */
sbi->fat_bits = 32;
- sbi->fat_length = le32_to_cpu(b->fat32.length);
- sbi->root_cluster = le32_to_cpu(b->fat32.root_cluster);
+ sbi->fat_length = bpb.fat32_length;
+ sbi->root_cluster = bpb.fat32_root_cluster;
/* MC - if info_sector is 0, don't multiply by 0 */
- sbi->fsinfo_sector = le16_to_cpu(b->fat32.info_sector);
+ sbi->fsinfo_sector = bpb.fat32_info_sector;
if (sbi->fsinfo_sector == 0)
sbi->fsinfo_sector = 1;
@@ -1394,7 +1601,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
if (fsinfo_bh == NULL) {
fat_msg(sb, KERN_ERR, "bread failed, FSINFO block"
" (sector = %lu)", sbi->fsinfo_sector);
- brelse(bh);
goto out_fail;
}
@@ -1417,35 +1623,28 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
/* interpret volume ID as a little endian 32 bit integer */
if (sbi->fat_bits == 32)
- sbi->vol_id = (((u32)b->fat32.vol_id[0]) |
- ((u32)b->fat32.vol_id[1] << 8) |
- ((u32)b->fat32.vol_id[2] << 16) |
- ((u32)b->fat32.vol_id[3] << 24));
+ sbi->vol_id = bpb.fat32_vol_id;
else /* fat 16 or 12 */
- sbi->vol_id = (((u32)b->fat16.vol_id[0]) |
- ((u32)b->fat16.vol_id[1] << 8) |
- ((u32)b->fat16.vol_id[2] << 16) |
- ((u32)b->fat16.vol_id[3] << 24));
+ sbi->vol_id = bpb.fat16_vol_id;
sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry);
sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1;
sbi->dir_start = sbi->fat_start + sbi->fats * sbi->fat_length;
- sbi->dir_entries = get_unaligned_le16(&b->dir_entries);
+ sbi->dir_entries = bpb.fat_dir_entries;
if (sbi->dir_entries & (sbi->dir_per_block - 1)) {
if (!silent)
fat_msg(sb, KERN_ERR, "bogus directory-entries per block"
" (%u)", sbi->dir_entries);
- brelse(bh);
goto out_invalid;
}
rootdir_sectors = sbi->dir_entries
* sizeof(struct msdos_dir_entry) / sb->s_blocksize;
sbi->data_start = sbi->dir_start + rootdir_sectors;
- total_sectors = get_unaligned_le16(&b->sectors);
+ total_sectors = bpb.fat_sectors;
if (total_sectors == 0)
- total_sectors = le32_to_cpu(b->total_sect);
+ total_sectors = bpb.fat_total_sect;
total_clusters = (total_sectors - sbi->data_start) / sbi->sec_per_clus;
@@ -1454,9 +1653,9 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
/* some OSes set FAT_STATE_DIRTY and clean it on unmount. */
if (sbi->fat_bits == 32)
- sbi->dirty = b->fat32.state & FAT_STATE_DIRTY;
+ sbi->dirty = bpb.fat32_state & FAT_STATE_DIRTY;
else /* fat 16 or 12 */
- sbi->dirty = b->fat16.state & FAT_STATE_DIRTY;
+ sbi->dirty = bpb.fat16_state & FAT_STATE_DIRTY;
/* check that FAT table does not overflow */
fat_clusters = calc_fat_clusters(sb);
@@ -1465,7 +1664,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
if (!silent)
fat_msg(sb, KERN_ERR, "count of clusters too big (%u)",
total_clusters);
- brelse(bh);
goto out_invalid;
}
@@ -1478,8 +1676,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
if (sbi->prev_free < FAT_START_ENT)
sbi->prev_free = FAT_START_ENT;
- brelse(bh);
-
/* set up enough so that it can read an inode */
fat_hash_init(sb);
dir_hash_init(sb);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 65343c3741f..72c82f69b01 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -56,7 +56,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
return -EINVAL;
}
- if (filp->f_op && filp->f_op->check_flags)
+ if (filp->f_op->check_flags)
error = filp->f_op->check_flags(arg);
if (error)
return error;
@@ -64,8 +64,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
/*
* ->fasync() is responsible for setting the FASYNC bit.
*/
- if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op &&
- filp->f_op->fasync) {
+ if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op->fasync) {
error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
if (error < 0)
goto out;
@@ -273,9 +272,19 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
case F_SETFL:
err = setfl(fd, filp, arg);
break;
+#if BITS_PER_LONG != 32
+ /* 32-bit arches must use fcntl64() */
+ case F_OFD_GETLK:
+#endif
case F_GETLK:
- err = fcntl_getlk(filp, (struct flock __user *) arg);
+ err = fcntl_getlk(filp, cmd, (struct flock __user *) arg);
break;
+#if BITS_PER_LONG != 32
+ /* 32-bit arches must use fcntl64() */
+ case F_OFD_SETLK:
+ case F_OFD_SETLKW:
+#endif
+ /* Fallthrough */
case F_SETLK:
case F_SETLKW:
err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg);
@@ -389,17 +398,20 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
goto out1;
switch (cmd) {
- case F_GETLK64:
- err = fcntl_getlk64(f.file, (struct flock64 __user *) arg);
- break;
- case F_SETLK64:
- case F_SETLKW64:
- err = fcntl_setlk64(fd, f.file, cmd,
- (struct flock64 __user *) arg);
- break;
- default:
- err = do_fcntl(fd, cmd, arg, f.file);
- break;
+ case F_GETLK64:
+ case F_OFD_GETLK:
+ err = fcntl_getlk64(f.file, cmd, (struct flock64 __user *) arg);
+ break;
+ case F_SETLK64:
+ case F_SETLKW64:
+ case F_OFD_SETLK:
+ case F_OFD_SETLKW:
+ err = fcntl_setlk64(fd, f.file, cmd,
+ (struct flock64 __user *) arg);
+ break;
+ default:
+ err = do_fcntl(fd, cmd, arg, f.file);
+ break;
}
out1:
fdput(f);
diff --git a/fs/file.c b/fs/file.c
index 4a78f981557..66923fe3176 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -25,7 +25,10 @@
int sysctl_nr_open __read_mostly = 1024*1024;
int sysctl_nr_open_min = BITS_PER_LONG;
-int sysctl_nr_open_max = 1024 * 1024; /* raised later */
+/* our max() is unusable in constant expressions ;-/ */
+#define __const_max(x, y) ((x) < (y) ? (x) : (y))
+int sysctl_nr_open_max = __const_max(INT_MAX, ~(size_t)0/sizeof(void *)) &
+ -BITS_PER_LONG;
static void *alloc_fdmem(size_t size)
{
@@ -34,22 +37,17 @@ static void *alloc_fdmem(size_t size)
* vmalloc() if the allocation size will be considered "large" by the VM.
*/
if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
- void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN);
+ void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY);
if (data != NULL)
return data;
}
return vmalloc(size);
}
-static void free_fdmem(void *ptr)
-{
- is_vmalloc_addr(ptr) ? vfree(ptr) : kfree(ptr);
-}
-
static void __free_fdtable(struct fdtable *fdt)
{
- free_fdmem(fdt->fd);
- free_fdmem(fdt->open_fds);
+ kvfree(fdt->fd);
+ kvfree(fdt->open_fds);
kfree(fdt);
}
@@ -127,7 +125,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
return fdt;
out_arr:
- free_fdmem(fdt->fd);
+ kvfree(fdt->fd);
out_fdt:
kfree(fdt);
out:
@@ -348,21 +346,16 @@ out:
return NULL;
}
-static void close_files(struct files_struct * files)
+static struct fdtable *close_files(struct files_struct * files)
{
- int i, j;
- struct fdtable *fdt;
-
- j = 0;
-
/*
* It is safe to dereference the fd table without RCU or
* ->file_lock because this is the last reference to the
- * files structure. But use RCU to shut RCU-lockdep up.
+ * files structure.
*/
- rcu_read_lock();
- fdt = files_fdtable(files);
- rcu_read_unlock();
+ struct fdtable *fdt = rcu_dereference_raw(files->fdt);
+ int i, j = 0;
+
for (;;) {
unsigned long set;
i = j * BITS_PER_LONG;
@@ -381,6 +374,8 @@ static void close_files(struct files_struct * files)
set >>= 1;
}
}
+
+ return fdt;
}
struct files_struct *get_files_struct(struct task_struct *task)
@@ -398,14 +393,9 @@ struct files_struct *get_files_struct(struct task_struct *task)
void put_files_struct(struct files_struct *files)
{
- struct fdtable *fdt;
-
if (atomic_dec_and_test(&files->count)) {
- close_files(files);
- /* not really needed, since nobody can see us */
- rcu_read_lock();
- fdt = files_fdtable(files);
- rcu_read_unlock();
+ struct fdtable *fdt = close_files(files);
+
/* free the arrays if they are not embedded */
if (fdt != &files->fdtab)
__free_fdtable(fdt);
@@ -437,12 +427,6 @@ void exit_files(struct task_struct *tsk)
}
}
-void __init files_defer_init(void)
-{
- sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) &
- -BITS_PER_LONG;
-}
-
struct files_struct init_files = {
.count = ATOMIC_INIT(1),
.fdt = &init_files.fdtab,
@@ -505,7 +489,7 @@ repeat:
error = fd;
#if 1
/* Sanity check */
- if (rcu_dereference_raw(fdt->fd[fd]) != NULL) {
+ if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
rcu_assign_pointer(fdt->fd[fd], NULL);
}
@@ -645,16 +629,16 @@ void do_close_on_exec(struct files_struct *files)
spin_unlock(&files->file_lock);
}
-struct file *fget(unsigned int fd)
+static struct file *__fget(unsigned int fd, fmode_t mask)
{
- struct file *file;
struct files_struct *files = current->files;
+ struct file *file;
rcu_read_lock();
file = fcheck_files(files, fd);
if (file) {
/* File object ref couldn't be taken */
- if (file->f_mode & FMODE_PATH ||
+ if ((file->f_mode & mask) ||
!atomic_long_inc_not_zero(&file->f_count))
file = NULL;
}
@@ -663,25 +647,16 @@ struct file *fget(unsigned int fd)
return file;
}
+struct file *fget(unsigned int fd)
+{
+ return __fget(fd, FMODE_PATH);
+}
EXPORT_SYMBOL(fget);
struct file *fget_raw(unsigned int fd)
{
- struct file *file;
- struct files_struct *files = current->files;
-
- rcu_read_lock();
- file = fcheck_files(files, fd);
- if (file) {
- /* File object ref couldn't be taken */
- if (!atomic_long_inc_not_zero(&file->f_count))
- file = NULL;
- }
- rcu_read_unlock();
-
- return file;
+ return __fget(fd, 0);
}
-
EXPORT_SYMBOL(fget_raw);
/*
@@ -700,58 +675,54 @@ EXPORT_SYMBOL(fget_raw);
* The fput_needed flag returned by fget_light should be passed to the
* corresponding fput_light.
*/
-struct file *fget_light(unsigned int fd, int *fput_needed)
+static unsigned long __fget_light(unsigned int fd, fmode_t mask)
{
- struct file *file;
struct files_struct *files = current->files;
+ struct file *file;
- *fput_needed = 0;
if (atomic_read(&files->count) == 1) {
- file = fcheck_files(files, fd);
- if (file && (file->f_mode & FMODE_PATH))
- file = NULL;
+ file = __fcheck_files(files, fd);
+ if (!file || unlikely(file->f_mode & mask))
+ return 0;
+ return (unsigned long)file;
} else {
- rcu_read_lock();
- file = fcheck_files(files, fd);
- if (file) {
- if (!(file->f_mode & FMODE_PATH) &&
- atomic_long_inc_not_zero(&file->f_count))
- *fput_needed = 1;
- else
- /* Didn't get the reference, someone's freed */
- file = NULL;
- }
- rcu_read_unlock();
+ file = __fget(fd, mask);
+ if (!file)
+ return 0;
+ return FDPUT_FPUT | (unsigned long)file;
}
+}
+unsigned long __fdget(unsigned int fd)
+{
+ return __fget_light(fd, FMODE_PATH);
+}
+EXPORT_SYMBOL(__fdget);
- return file;
+unsigned long __fdget_raw(unsigned int fd)
+{
+ return __fget_light(fd, 0);
}
-EXPORT_SYMBOL(fget_light);
-struct file *fget_raw_light(unsigned int fd, int *fput_needed)
+unsigned long __fdget_pos(unsigned int fd)
{
- struct file *file;
- struct files_struct *files = current->files;
+ unsigned long v = __fdget(fd);
+ struct file *file = (struct file *)(v & ~3);
- *fput_needed = 0;
- if (atomic_read(&files->count) == 1) {
- file = fcheck_files(files, fd);
- } else {
- rcu_read_lock();
- file = fcheck_files(files, fd);
- if (file) {
- if (atomic_long_inc_not_zero(&file->f_count))
- *fput_needed = 1;
- else
- /* Didn't get the reference, someone's freed */
- file = NULL;
+ if (file && (file->f_mode & FMODE_ATOMIC_POS)) {
+ if (file_count(file) > 1) {
+ v |= FDPUT_POS_UNLOCK;
+ mutex_lock(&file->f_pos_lock);
}
- rcu_read_unlock();
}
-
- return file;
+ return v;
}
+/*
+ * We only lock f_pos if we have threads or if the file might be
+ * shared with another process. In both cases we'll have an elevated
+ * file count (done either by fdget() or by fork()).
+ */
+
void set_close_on_exec(unsigned int fd, int flag)
{
struct files_struct *files = current->files;
diff --git a/fs/file_table.c b/fs/file_table.c
index abdd15ad13c..385bfd31512 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -36,8 +36,6 @@ struct files_stat_struct files_stat = {
.max_files = NR_FILE
};
-DEFINE_STATIC_LGLOCK(files_lglock);
-
/* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __read_mostly;
@@ -54,7 +52,6 @@ static void file_free_rcu(struct rcu_head *head)
static inline void file_free(struct file *f)
{
percpu_counter_dec(&nr_files);
- file_check_state(f);
call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
}
@@ -79,14 +76,14 @@ EXPORT_SYMBOL_GPL(get_max_files);
* Handle nr_files sysctl
*/
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
-int proc_nr_files(ctl_table *table, int write,
+int proc_nr_files(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
files_stat.nr_files = get_nr_files();
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
#else
-int proc_nr_files(ctl_table *table, int write,
+int proc_nr_files(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
return -ENOSYS;
@@ -134,10 +131,10 @@ struct file *get_empty_filp(void)
return ERR_PTR(error);
}
- INIT_LIST_HEAD(&f->f_u.fu_list);
atomic_long_set(&f->f_count, 1);
rwlock_init(&f->f_owner.lock);
spin_lock_init(&f->f_lock);
+ mutex_init(&f->f_pos_lock);
eventpoll_init_file(f);
/* f->f_version: 0 */
return f;
@@ -178,49 +175,20 @@ struct file *alloc_file(struct path *path, fmode_t mode,
file->f_path = *path;
file->f_inode = path->dentry->d_inode;
file->f_mapping = path->dentry->d_inode->i_mapping;
+ if ((mode & FMODE_READ) &&
+ likely(fop->read || fop->aio_read || fop->read_iter))
+ mode |= FMODE_CAN_READ;
+ if ((mode & FMODE_WRITE) &&
+ likely(fop->write || fop->aio_write || fop->write_iter))
+ mode |= FMODE_CAN_WRITE;
file->f_mode = mode;
file->f_op = fop;
-
- /*
- * These mounts don't really matter in practice
- * for r/o bind mounts. They aren't userspace-
- * visible. We do this for consistency, and so
- * that we can do debugging checks at __fput()
- */
- if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) {
- file_take_write(file);
- WARN_ON(mnt_clone_write(path->mnt));
- }
if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_inc(path->dentry->d_inode);
return file;
}
EXPORT_SYMBOL(alloc_file);
-/**
- * drop_file_write_access - give up ability to write to a file
- * @file: the file to which we will stop writing
- *
- * This is a central place which will give up the ability
- * to write to @file, along with access to write through
- * its vfsmount.
- */
-static void drop_file_write_access(struct file *file)
-{
- struct vfsmount *mnt = file->f_path.mnt;
- struct dentry *dentry = file->f_path.dentry;
- struct inode *inode = dentry->d_inode;
-
- put_write_access(inode);
-
- if (special_file(inode->i_mode))
- return;
- if (file_check_writeable(file) != 0)
- return;
- __mnt_drop_write(mnt);
- file_release_write(file);
-}
-
/* the real guts of fput() - releasing the last reference to file
*/
static void __fput(struct file *file)
@@ -237,14 +205,14 @@ static void __fput(struct file *file)
* in the file cleanup chain.
*/
eventpoll_release(file);
- locks_remove_flock(file);
+ locks_remove_file(file);
if (unlikely(file->f_flags & FASYNC)) {
- if (file->f_op && file->f_op->fasync)
+ if (file->f_op->fasync)
file->f_op->fasync(-1, file, 0);
}
ima_file_free(file);
- if (file->f_op && file->f_op->release)
+ if (file->f_op->release)
file->f_op->release(inode, file);
security_file_free(file);
if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
@@ -255,8 +223,10 @@ static void __fput(struct file *file)
put_pid(file->f_owner.pid);
if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_dec(inode);
- if (file->f_mode & FMODE_WRITE)
- drop_file_write_access(file);
+ if (file->f_mode & FMODE_WRITER) {
+ put_write_access(inode);
+ __mnt_drop_write(mnt);
+ }
file->f_path.dentry = NULL;
file->f_path.mnt = NULL;
file->f_inode = NULL;
@@ -297,14 +267,13 @@ void flush_delayed_fput(void)
delayed_fput(NULL);
}
-static DECLARE_WORK(delayed_fput_work, delayed_fput);
+static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
void fput(struct file *file)
{
if (atomic_long_dec_and_test(&file->f_count)) {
struct task_struct *task = current;
- file_sb_list_del(file);
if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
init_task_work(&file->f_u.fu_rcuhead, ____fput);
if (!task_work_add(task, &file->f_u.fu_rcuhead, true))
@@ -317,7 +286,7 @@ void fput(struct file *file)
}
if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
- schedule_work(&delayed_fput_work);
+ schedule_delayed_work(&delayed_fput_work, 1);
}
}
@@ -333,7 +302,6 @@ void __fput_sync(struct file *file)
{
if (atomic_long_dec_and_test(&file->f_count)) {
struct task_struct *task = current;
- file_sb_list_del(file);
BUG_ON(!(task->flags & PF_KTHREAD));
__fput(file);
}
@@ -345,129 +313,10 @@ void put_filp(struct file *file)
{
if (atomic_long_dec_and_test(&file->f_count)) {
security_file_free(file);
- file_sb_list_del(file);
file_free(file);
}
}
-static inline int file_list_cpu(struct file *file)
-{
-#ifdef CONFIG_SMP
- return file->f_sb_list_cpu;
-#else
- return smp_processor_id();
-#endif
-}
-
-/* helper for file_sb_list_add to reduce ifdefs */
-static inline void __file_sb_list_add(struct file *file, struct super_block *sb)
-{
- struct list_head *list;
-#ifdef CONFIG_SMP
- int cpu;
- cpu = smp_processor_id();
- file->f_sb_list_cpu = cpu;
- list = per_cpu_ptr(sb->s_files, cpu);
-#else
- list = &sb->s_files;
-#endif
- list_add(&file->f_u.fu_list, list);
-}
-
-/**
- * file_sb_list_add - add a file to the sb's file list
- * @file: file to add
- * @sb: sb to add it to
- *
- * Use this function to associate a file with the superblock of the inode it
- * refers to.
- */
-void file_sb_list_add(struct file *file, struct super_block *sb)
-{
- if (likely(!(file->f_mode & FMODE_WRITE)))
- return;
- if (!S_ISREG(file_inode(file)->i_mode))
- return;
- lg_local_lock(&files_lglock);
- __file_sb_list_add(file, sb);
- lg_local_unlock(&files_lglock);
-}
-
-/**
- * file_sb_list_del - remove a file from the sb's file list
- * @file: file to remove
- * @sb: sb to remove it from
- *
- * Use this function to remove a file from its superblock.
- */
-void file_sb_list_del(struct file *file)
-{
- if (!list_empty(&file->f_u.fu_list)) {
- lg_local_lock_cpu(&files_lglock, file_list_cpu(file));
- list_del_init(&file->f_u.fu_list);
- lg_local_unlock_cpu(&files_lglock, file_list_cpu(file));
- }
-}
-
-#ifdef CONFIG_SMP
-
-/*
- * These macros iterate all files on all CPUs for a given superblock.
- * files_lglock must be held globally.
- */
-#define do_file_list_for_each_entry(__sb, __file) \
-{ \
- int i; \
- for_each_possible_cpu(i) { \
- struct list_head *list; \
- list = per_cpu_ptr((__sb)->s_files, i); \
- list_for_each_entry((__file), list, f_u.fu_list)
-
-#define while_file_list_for_each_entry \
- } \
-}
-
-#else
-
-#define do_file_list_for_each_entry(__sb, __file) \
-{ \
- struct list_head *list; \
- list = &(sb)->s_files; \
- list_for_each_entry((__file), list, f_u.fu_list)
-
-#define while_file_list_for_each_entry \
-}
-
-#endif
-
-/**
- * mark_files_ro - mark all files read-only
- * @sb: superblock in question
- *
- * All files are marked read-only. We don't care about pending
- * delete files so this should be used in 'force' mode only.
- */
-void mark_files_ro(struct super_block *sb)
-{
- struct file *f;
-
- lg_global_lock(&files_lglock);
- do_file_list_for_each_entry(sb, f) {
- if (!file_count(f))
- continue;
- if (!(f->f_mode & FMODE_WRITE))
- continue;
- spin_lock(&f->f_lock);
- f->f_mode &= ~FMODE_WRITE;
- spin_unlock(&f->f_lock);
- if (file_check_writeable(f) != 0)
- continue;
- __mnt_drop_write(f->f_path.mnt);
- file_release_write(f);
- } while_file_list_for_each_entry;
- lg_global_unlock(&files_lglock);
-}
-
void __init files_init(unsigned long mempages)
{
unsigned long n;
@@ -482,7 +331,5 @@ void __init files_init(unsigned long mempages)
n = (mempages * (PAGE_SIZE / 1024)) / 10;
files_stat.max_files = max_t(unsigned long, n, NR_FILE);
- files_defer_init();
- lg_lock_init(&files_lglock, "files_lglock");
percpu_counter_init(&nr_files, 0);
}
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 92567d95ba6..5797d45a78c 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -121,6 +121,7 @@ int unregister_filesystem(struct file_system_type * fs)
EXPORT_SYMBOL(unregister_filesystem);
+#ifdef CONFIG_SYSFS_SYSCALL
static int fs_index(const char __user * __name)
{
struct file_system_type * tmp;
@@ -199,6 +200,7 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2)
}
return retval;
}
+#endif
int __init get_filesystem_list(char *buf)
{
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index f47df72cef1..363e3ae25f6 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -354,7 +354,7 @@ static void vxfs_i_callback(struct rcu_head *head)
void
vxfs_evict_inode(struct inode *ip)
{
- truncate_inode_pages(&ip->i_data, 0);
+ truncate_inode_pages_final(&ip->i_data);
clear_inode(ip);
call_rcu(&ip->i_rcu, vxfs_i_callback);
}
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 25d4099a4ae..99c7f0a37af 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -192,7 +192,7 @@ vxfs_inode_by_name(struct inode *dip, struct dentry *dp)
* vxfs_lookup - lookup pathname component
* @dip: dir in which we lookup
* @dp: dentry we lookup
- * @nd: lookup nameidata
+ * @flags: lookup flags
*
* Description:
* vxfs_lookup tries to lookup the pathname component described
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index e37eb274e49..7ca8c75d50d 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -124,6 +124,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
static int vxfs_remount(struct super_block *sb, int *flags, char *data)
{
+ sync_filesystem(sb);
*flags |= MS_RDONLY;
return 0;
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9f4935b8f20..be568b7311d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -26,6 +26,7 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
+#include <linux/device.h>
#include "internal.h"
/*
@@ -88,16 +89,31 @@ static inline struct inode *wb_inode(struct list_head *head)
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>
+EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
+
+static void bdi_wakeup_thread(struct backing_dev_info *bdi)
+{
+ spin_lock_bh(&bdi->wb_lock);
+ if (test_bit(BDI_registered, &bdi->state))
+ mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+ spin_unlock_bh(&bdi->wb_lock);
+}
+
static void bdi_queue_work(struct backing_dev_info *bdi,
struct wb_writeback_work *work)
{
trace_writeback_queue(bdi, work);
spin_lock_bh(&bdi->wb_lock);
+ if (!test_bit(BDI_registered, &bdi->state)) {
+ if (work->done)
+ complete(work->done);
+ goto out_unlock;
+ }
list_add_tail(&work->list, &bdi->work_list);
- spin_unlock_bh(&bdi->wb_lock);
-
mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+out_unlock:
+ spin_unlock_bh(&bdi->wb_lock);
}
static void
@@ -113,7 +129,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
work = kzalloc(sizeof(*work), GFP_ATOMIC);
if (!work) {
trace_writeback_nowork(bdi);
- mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+ bdi_wakeup_thread(bdi);
return;
}
@@ -160,7 +176,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
* writeback as soon as there is no other work to do.
*/
trace_writeback_wake_background(bdi);
- mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+ bdi_wakeup_thread(bdi);
}
/*
@@ -510,13 +526,16 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
}
WARN_ON(inode->i_state & I_SYNC);
/*
- * Skip inode if it is clean. We don't want to mess with writeback
- * lists in this function since flusher thread may be doing for example
- * sync in parallel and if we move the inode, it could get skipped. So
- * here we make sure inode is on some writeback list and leave it there
- * unless we have completely cleaned the inode.
+ * Skip inode if it is clean and we have no outstanding writeback in
+ * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this
+ * function since flusher thread may be doing for example sync in
+ * parallel and if we move the inode, it could get skipped. So here we
+ * make sure inode is on some writeback list and leave it there unless
+ * we have completely cleaned the inode.
*/
- if (!(inode->i_state & I_DIRTY))
+ if (!(inode->i_state & I_DIRTY) &&
+ (wbc->sync_mode != WB_SYNC_ALL ||
+ !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
goto out;
inode->i_state |= I_SYNC;
spin_unlock(&inode->i_lock);
@@ -1013,7 +1032,7 @@ void bdi_writeback_workfn(struct work_struct *work)
current->flags |= PF_SWAPWRITE;
if (likely(!current_is_workqueue_rescuer() ||
- list_empty(&bdi->bdi_list))) {
+ !test_bit(BDI_registered, &bdi->state))) {
/*
* The normal path. Keep writing back @bdi until its
* work_list is empty. Note that this path is also taken
@@ -1035,10 +1054,10 @@ void bdi_writeback_workfn(struct work_struct *work)
trace_writeback_pages_written(pages_written);
}
- if (!list_empty(&bdi->work_list) ||
- (wb_has_dirty_io(wb) && dirty_writeback_interval))
- queue_delayed_work(bdi_wq, &wb->dwork,
- msecs_to_jiffies(dirty_writeback_interval * 10));
+ if (!list_empty(&bdi->work_list))
+ mod_delayed_work(bdi_wq, &wb->dwork, 0);
+ else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
+ bdi_wakeup_thread_delayed(bdi);
current->flags &= ~PF_SWAPWRITE;
}
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index d8ac61d0c93..7dca743b2ce 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -161,6 +161,6 @@ EXPORT_SYMBOL(current_umask);
struct fs_struct init_fs = {
.users = 1,
.lock = __SPIN_LOCK_UNLOCKED(init_fs.lock),
- .seq = SEQCNT_ZERO,
+ .seq = SEQCNT_ZERO(init_fs.seq),
.umask = 0022,
};
diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
index f7cff367db7..56cce7fdd39 100644
--- a/fs/fscache/cache.c
+++ b/fs/fscache/cache.c
@@ -280,15 +280,15 @@ int fscache_add_cache(struct fscache_cache *cache,
spin_unlock(&fscache_fsdef_index.lock);
up_write(&fscache_addremove_sem);
- printk(KERN_NOTICE "FS-Cache: Cache \"%s\" added (type %s)\n",
- cache->tag->name, cache->ops->name);
+ pr_notice("Cache \"%s\" added (type %s)\n",
+ cache->tag->name, cache->ops->name);
kobject_uevent(cache->kobj, KOBJ_ADD);
_leave(" = 0 [%s]", cache->identifier);
return 0;
tag_in_use:
- printk(KERN_ERR "FS-Cache: Cache tag '%s' already in use\n", tagname);
+ pr_err("Cache tag '%s' already in use\n", tagname);
__fscache_release_cache_tag(tag);
_leave(" = -EXIST");
return -EEXIST;
@@ -317,8 +317,7 @@ EXPORT_SYMBOL(fscache_add_cache);
void fscache_io_error(struct fscache_cache *cache)
{
if (!test_and_set_bit(FSCACHE_IOERROR, &cache->flags))
- printk(KERN_ERR "FS-Cache:"
- " Cache '%s' stopped due to I/O error\n",
+ pr_err("Cache '%s' stopped due to I/O error\n",
cache->ops->name);
}
EXPORT_SYMBOL(fscache_io_error);
@@ -369,8 +368,8 @@ void fscache_withdraw_cache(struct fscache_cache *cache)
_enter("");
- printk(KERN_NOTICE "FS-Cache: Withdrawing cache \"%s\"\n",
- cache->tag->name);
+ pr_notice("Withdrawing cache \"%s\"\n",
+ cache->tag->name);
/* make the cache unavailable for cookie acquisition */
if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags))
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index b2a86e324aa..aec01be91b0 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -58,15 +58,16 @@ void fscache_cookie_init_once(void *_cookie)
struct fscache_cookie *__fscache_acquire_cookie(
struct fscache_cookie *parent,
const struct fscache_cookie_def *def,
- void *netfs_data)
+ void *netfs_data,
+ bool enable)
{
struct fscache_cookie *cookie;
BUG_ON(!def);
- _enter("{%s},{%s},%p",
+ _enter("{%s},{%s},%p,%u",
parent ? (char *) parent->def->name : "<no-parent>",
- def->name, netfs_data);
+ def->name, netfs_data, enable);
fscache_stat(&fscache_n_acquires);
@@ -106,7 +107,7 @@ struct fscache_cookie *__fscache_acquire_cookie(
cookie->def = def;
cookie->parent = parent;
cookie->netfs_data = netfs_data;
- cookie->flags = 0;
+ cookie->flags = (1 << FSCACHE_COOKIE_NO_DATA_YET);
/* radix tree insertion won't use the preallocation pool unless it's
* told it may not wait */
@@ -124,16 +125,22 @@ struct fscache_cookie *__fscache_acquire_cookie(
break;
}
- /* if the object is an index then we need do nothing more here - we
- * create indices on disk when we need them as an index may exist in
- * multiple caches */
- if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
- if (fscache_acquire_non_index_cookie(cookie) < 0) {
- atomic_dec(&parent->n_children);
- __fscache_cookie_put(cookie);
- fscache_stat(&fscache_n_acquires_nobufs);
- _leave(" = NULL");
- return NULL;
+ if (enable) {
+ /* if the object is an index then we need do nothing more here
+ * - we create indices on disk when we need them as an index
+ * may exist in multiple caches */
+ if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
+ if (fscache_acquire_non_index_cookie(cookie) == 0) {
+ set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
+ } else {
+ atomic_dec(&parent->n_children);
+ __fscache_cookie_put(cookie);
+ fscache_stat(&fscache_n_acquires_nobufs);
+ _leave(" = NULL");
+ return NULL;
+ }
+ } else {
+ set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
}
}
@@ -144,6 +151,39 @@ struct fscache_cookie *__fscache_acquire_cookie(
EXPORT_SYMBOL(__fscache_acquire_cookie);
/*
+ * Enable a cookie to permit it to accept new operations.
+ */
+void __fscache_enable_cookie(struct fscache_cookie *cookie,
+ bool (*can_enable)(void *data),
+ void *data)
+{
+ _enter("%p", cookie);
+
+ wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK,
+ fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+
+ if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags))
+ goto out_unlock;
+
+ if (can_enable && !can_enable(data)) {
+ /* The netfs decided it didn't want to enable after all */
+ } else if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) {
+ /* Wait for outstanding disablement to complete */
+ __fscache_wait_on_invalidate(cookie);
+
+ if (fscache_acquire_non_index_cookie(cookie) == 0)
+ set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
+ } else {
+ set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags);
+ }
+
+out_unlock:
+ clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags);
+ wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK);
+}
+EXPORT_SYMBOL(__fscache_enable_cookie);
+
+/*
* acquire a non-index cookie
* - this must make sure the index chain is instantiated and instantiate the
* object representation too
@@ -157,7 +197,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
_enter("");
- cookie->flags = 1 << FSCACHE_COOKIE_UNAVAILABLE;
+ set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
/* now we need to see whether the backing objects for this cookie yet
* exist, if not there'll be nothing to search */
@@ -180,9 +220,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
_debug("cache %s", cache->tag->name);
- cookie->flags =
- (1 << FSCACHE_COOKIE_LOOKING_UP) |
- (1 << FSCACHE_COOKIE_NO_DATA_YET);
+ set_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
/* ask the cache to allocate objects for this cookie and its parent
* chain */
@@ -398,7 +436,8 @@ void __fscache_invalidate(struct fscache_cookie *cookie)
if (!hlist_empty(&cookie->backing_objects)) {
spin_lock(&cookie->lock);
- if (!hlist_empty(&cookie->backing_objects) &&
+ if (fscache_cookie_enabled(cookie) &&
+ !hlist_empty(&cookie->backing_objects) &&
!test_and_set_bit(FSCACHE_COOKIE_INVALIDATING,
&cookie->flags)) {
object = hlist_entry(cookie->backing_objects.first,
@@ -452,10 +491,14 @@ void __fscache_update_cookie(struct fscache_cookie *cookie)
spin_lock(&cookie->lock);
- /* update the index entry on disk in each cache backing this cookie */
- hlist_for_each_entry(object,
- &cookie->backing_objects, cookie_link) {
- fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
+ if (fscache_cookie_enabled(cookie)) {
+ /* update the index entry on disk in each cache backing this
+ * cookie.
+ */
+ hlist_for_each_entry(object,
+ &cookie->backing_objects, cookie_link) {
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE);
+ }
}
spin_unlock(&cookie->lock);
@@ -464,15 +507,80 @@ void __fscache_update_cookie(struct fscache_cookie *cookie)
EXPORT_SYMBOL(__fscache_update_cookie);
/*
+ * Disable a cookie to stop it from accepting new requests from the netfs.
+ */
+void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
+{
+ struct fscache_object *object;
+ bool awaken = false;
+
+ _enter("%p,%u", cookie, invalidate);
+
+ ASSERTCMP(atomic_read(&cookie->n_active), >, 0);
+
+ if (atomic_read(&cookie->n_children) != 0) {
+ pr_err("Cookie '%s' still has children\n",
+ cookie->def->name);
+ BUG();
+ }
+
+ wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK,
+ fscache_wait_bit, TASK_UNINTERRUPTIBLE);
+ if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags))
+ goto out_unlock_enable;
+
+ /* If the cookie is being invalidated, wait for that to complete first
+ * so that we can reuse the flag.
+ */
+ __fscache_wait_on_invalidate(cookie);
+
+ /* Dispose of the backing objects */
+ set_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags);
+
+ spin_lock(&cookie->lock);
+ if (!hlist_empty(&cookie->backing_objects)) {
+ hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) {
+ if (invalidate)
+ set_bit(FSCACHE_OBJECT_RETIRED, &object->flags);
+ fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
+ }
+ } else {
+ if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+ awaken = true;
+ }
+ spin_unlock(&cookie->lock);
+ if (awaken)
+ wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING);
+
+ /* Wait for cessation of activity requiring access to the netfs (when
+ * n_active reaches 0). This makes sure outstanding reads and writes
+ * have completed.
+ */
+ if (!atomic_dec_and_test(&cookie->n_active))
+ wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t,
+ TASK_UNINTERRUPTIBLE);
+
+ /* Reset the cookie state if it wasn't relinquished */
+ if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) {
+ atomic_inc(&cookie->n_active);
+ set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+ }
+
+out_unlock_enable:
+ clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags);
+ wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK);
+ _leave("");
+}
+EXPORT_SYMBOL(__fscache_disable_cookie);
+
+/*
* release a cookie back to the cache
* - the object will be marked as recyclable on disk if retire is true
* - all dependents of this cookie must have already been unregistered
* (indices/files/pages)
*/
-void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
+void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire)
{
- struct fscache_object *object;
-
fscache_stat(&fscache_n_relinquishes);
if (retire)
fscache_stat(&fscache_n_relinquishes_retire);
@@ -487,31 +595,10 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, int retire)
cookie, cookie->def->name, cookie->netfs_data,
atomic_read(&cookie->n_active), retire);
- ASSERTCMP(atomic_read(&cookie->n_active), >, 0);
-
- if (atomic_read(&cookie->n_children) != 0) {
- printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n",
- cookie->def->name);
- BUG();
- }
-
/* No further netfs-accessing operations on this cookie permitted */
set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags);
- if (retire)
- set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
- spin_lock(&cookie->lock);
- hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) {
- fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
- }
- spin_unlock(&cookie->lock);
-
- /* Wait for cessation of activity requiring access to the netfs (when
- * n_active reaches 0).
- */
- if (!atomic_dec_and_test(&cookie->n_active))
- wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t,
- TASK_UNINTERRUPTIBLE);
+ __fscache_disable_cookie(cookie, retire);
/* Clear pointers back to the netfs */
cookie->netfs_data = NULL;
@@ -568,6 +655,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
{
struct fscache_operation *op;
struct fscache_object *object;
+ bool wake_cookie = false;
int ret;
_enter("%p,", cookie);
@@ -591,7 +679,8 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
spin_lock(&cookie->lock);
- if (hlist_empty(&cookie->backing_objects))
+ if (!fscache_cookie_enabled(cookie) ||
+ hlist_empty(&cookie->backing_objects))
goto inconsistent;
object = hlist_entry(cookie->backing_objects.first,
struct fscache_object, cookie_link);
@@ -600,7 +689,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
op->debug_id = atomic_inc_return(&fscache_op_debug_id);
- atomic_inc(&cookie->n_active);
+ __fscache_use_cookie(cookie);
if (fscache_submit_op(object, op) < 0)
goto submit_failed;
@@ -622,9 +711,11 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
return ret;
submit_failed:
- atomic_dec(&cookie->n_active);
+ wake_cookie = __fscache_unuse_cookie(cookie);
inconsistent:
spin_unlock(&cookie->lock);
+ if (wake_cookie)
+ __fscache_wake_unused_cookie(cookie);
kfree(op);
_leave(" = -ESTALE");
return -ESTALE;
diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c
index 10a2ade0bdf..5a117df2a9e 100644
--- a/fs/fscache/fsdef.c
+++ b/fs/fscache/fsdef.c
@@ -59,6 +59,7 @@ struct fscache_cookie fscache_fsdef_index = {
.lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock),
.backing_objects = HLIST_HEAD_INIT,
.def = &fscache_fsdef_index_def,
+ .flags = 1 << FSCACHE_COOKIE_ENABLED,
};
EXPORT_SYMBOL(fscache_fsdef_index);
diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c
index bad496748a5..7d637e2335f 100644
--- a/fs/fscache/histogram.c
+++ b/fs/fscache/histogram.c
@@ -31,12 +31,10 @@ static int fscache_histogram_show(struct seq_file *m, void *v)
switch ((unsigned long) v) {
case 1:
- seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS "
- " RETRV DLY RETRIEVLS\n");
+ seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS\n");
return 0;
case 2:
- seq_puts(m, "===== ===== ========= ========= ========="
- " ========= =========\n");
+ seq_puts(m, "===== ===== ========= ========= ========= ========= =========\n");
return 0;
default:
index = (unsigned long) v - 3;
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index 4226f6680b0..bc6c08fcfdd 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -22,6 +22,12 @@
*
*/
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) "FS-Cache: " fmt
+
#include <linux/fscache-cache.h>
#include <linux/sched.h>
@@ -413,8 +419,8 @@ do { \
#define ASSERT(X) \
do { \
if (unlikely(!(X))) { \
- printk(KERN_ERR "\n"); \
- printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
BUG(); \
} \
} while (0)
@@ -422,9 +428,9 @@ do { \
#define ASSERTCMP(X, OP, Y) \
do { \
if (unlikely(!((X) OP (Y)))) { \
- printk(KERN_ERR "\n"); \
- printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
- printk(KERN_ERR "%lx " #OP " %lx is false\n", \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ pr_err("%lx " #OP " %lx is false\n", \
(unsigned long)(X), (unsigned long)(Y)); \
BUG(); \
} \
@@ -433,8 +439,8 @@ do { \
#define ASSERTIF(C, X) \
do { \
if (unlikely((C) && !(X))) { \
- printk(KERN_ERR "\n"); \
- printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
BUG(); \
} \
} while (0)
@@ -442,9 +448,9 @@ do { \
#define ASSERTIFCMP(C, X, OP, Y) \
do { \
if (unlikely((C) && !((X) OP (Y)))) { \
- printk(KERN_ERR "\n"); \
- printk(KERN_ERR "FS-Cache: Assertion failed\n"); \
- printk(KERN_ERR "%lx " #OP " %lx is false\n", \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ pr_err("%lx " #OP " %lx is false\n", \
(unsigned long)(X), (unsigned long)(Y)); \
BUG(); \
} \
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index 7c27907e650..63f868e869b 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -67,7 +67,7 @@ static int fscache_max_active_sysctl(struct ctl_table *table, int write,
return ret;
}
-ctl_table fscache_sysctls[] = {
+struct ctl_table fscache_sysctls[] = {
{
.procname = "object_max_active",
.data = &fscache_object_max_active,
@@ -87,7 +87,7 @@ ctl_table fscache_sysctls[] = {
{}
};
-ctl_table fscache_sysctls_root[] = {
+struct ctl_table fscache_sysctls_root[] = {
{
.procname = "fscache",
.mode = 0555,
@@ -146,8 +146,7 @@ static int __init fscache_init(void)
0,
fscache_cookie_init_once);
if (!fscache_cookie_jar) {
- printk(KERN_NOTICE
- "FS-Cache: Failed to allocate a cookie jar\n");
+ pr_notice("Failed to allocate a cookie jar\n");
ret = -ENOMEM;
goto error_cookie_jar;
}
@@ -156,7 +155,7 @@ static int __init fscache_init(void)
if (!fscache_root)
goto error_kobj;
- printk(KERN_NOTICE "FS-Cache: Loaded\n");
+ pr_notice("Loaded\n");
return 0;
error_kobj:
@@ -192,7 +191,7 @@ static void __exit fscache_exit(void)
fscache_proc_cleanup();
destroy_workqueue(fscache_op_wq);
destroy_workqueue(fscache_object_wq);
- printk(KERN_NOTICE "FS-Cache: Unloaded\n");
+ pr_notice("Unloaded\n");
}
module_exit(fscache_exit);
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index b1bb6117473..6d941f56faf 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -45,6 +45,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
netfs->primary_index->def = &fscache_fsdef_netfs_def;
netfs->primary_index->parent = &fscache_fsdef_index;
netfs->primary_index->netfs_data = netfs;
+ netfs->primary_index->flags = 1 << FSCACHE_COOKIE_ENABLED;
atomic_inc(&netfs->primary_index->parent->usage);
atomic_inc(&netfs->primary_index->parent->n_children);
@@ -64,8 +65,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
list_add(&netfs->link, &fscache_netfs_list);
ret = 0;
- printk(KERN_NOTICE "FS-Cache: Netfs '%s' registered for caching\n",
- netfs->name);
+ pr_notice("Netfs '%s' registered for caching\n", netfs->name);
already_registered:
up_write(&fscache_addremove_sem);
@@ -96,8 +96,8 @@ void __fscache_unregister_netfs(struct fscache_netfs *netfs)
up_write(&fscache_addremove_sem);
- printk(KERN_NOTICE "FS-Cache: Netfs '%s' unregistered from caching\n",
- netfs->name);
+ pr_notice("Netfs '%s' unregistered from caching\n",
+ netfs->name);
_leave("");
}
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index e1959efad64..b8179ca6bf9 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -50,6 +50,8 @@ void fscache_objlist_add(struct fscache_object *obj)
struct fscache_object *xobj;
struct rb_node **p = &fscache_object_list.rb_node, *parent = NULL;
+ ASSERT(RB_EMPTY_NODE(&obj->objlist_link));
+
write_lock(&fscache_object_list_lock);
while (*p) {
@@ -75,6 +77,9 @@ void fscache_objlist_add(struct fscache_object *obj)
*/
void fscache_objlist_remove(struct fscache_object *obj)
{
+ if (RB_EMPTY_NODE(&obj->objlist_link))
+ return;
+
write_lock(&fscache_object_list_lock);
BUG_ON(RB_EMPTY_ROOT(&fscache_object_list));
@@ -280,20 +285,20 @@ static int fscache_objlist_show(struct seq_file *m, void *v)
fscache_unuse_cookie(obj);
if (keylen > 0 || auxlen > 0) {
- seq_printf(m, " ");
+ seq_puts(m, " ");
for (p = buf; keylen > 0; keylen--)
seq_printf(m, "%02x", *p++);
if (auxlen > 0) {
if (config & FSCACHE_OBJLIST_CONFIG_KEY)
- seq_printf(m, ", ");
+ seq_puts(m, ", ");
for (; auxlen > 0; auxlen--)
seq_printf(m, "%02x", *p++);
}
}
- seq_printf(m, "\n");
+ seq_puts(m, "\n");
} else {
- seq_printf(m, "<no_netfs>\n");
+ seq_puts(m, "<no_netfs>\n");
}
return 0;
}
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 86d75a60b20..d3b4539f165 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -314,6 +314,9 @@ void fscache_object_init(struct fscache_object *object,
object->cache = cache;
object->cookie = cookie;
object->parent = NULL;
+#ifdef CONFIG_FSCACHE_OBJECT_LIST
+ RB_CLEAR_NODE(&object->objlist_link);
+#endif
object->oob_event_mask = 0;
for (t = object->oob_table; t->events; t++)
@@ -495,6 +498,7 @@ void fscache_object_lookup_negative(struct fscache_object *object)
* returning ENODATA.
*/
set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+ clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
_debug("wake up lookup %p", &cookie->flags);
clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags);
@@ -527,6 +531,7 @@ void fscache_obtained_object(struct fscache_object *object)
/* We do (presumably) have data */
clear_bit_unlock(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags);
+ clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags);
/* Allow write requests to begin stacking up and read requests
* to begin shovelling data.
@@ -679,7 +684,8 @@ static const struct fscache_state *fscache_drop_object(struct fscache_object *ob
*/
spin_lock(&cookie->lock);
hlist_del_init(&object->cookie_link);
- if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
+ if (hlist_empty(&cookie->backing_objects) &&
+ test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags))
awaken = true;
spin_unlock(&cookie->lock);
@@ -796,7 +802,7 @@ void fscache_enqueue_object(struct fscache_object *object)
*/
bool fscache_object_sleep_till_congested(signed long *timeoutp)
{
- wait_queue_head_t *cong_wq = &__get_cpu_var(fscache_object_cong_wait);
+ wait_queue_head_t *cong_wq = this_cpu_ptr(&fscache_object_cong_wait);
DEFINE_WAIT(wait);
if (fscache_object_congested())
@@ -927,7 +933,7 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj
*/
if (!fscache_use_cookie(object)) {
ASSERT(object->cookie->stores.rnode == NULL);
- set_bit(FSCACHE_COOKIE_RETIRED, &cookie->flags);
+ set_bit(FSCACHE_OBJECT_RETIRED, &object->flags);
_leave(" [no cookie]");
return transit_to(KILL_OBJECT);
}
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index 318071aca21..e7b87a0e518 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -51,8 +51,7 @@ void fscache_enqueue_operation(struct fscache_operation *op)
_debug("queue for caller's attention");
break;
default:
- printk(KERN_ERR "FS-Cache: Unexpected op type %lx",
- op->flags);
+ pr_err("Unexpected op type %lx", op->flags);
BUG();
break;
}
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 73899c1c344..ed70714503f 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -163,12 +163,10 @@ static void fscache_attr_changed_op(struct fscache_operation *op)
fscache_stat(&fscache_n_attr_changed_calls);
- if (fscache_object_is_active(object) &&
- fscache_use_cookie(object)) {
+ if (fscache_object_is_active(object)) {
fscache_stat(&fscache_n_cop_attr_changed);
ret = object->cache->ops->attr_changed(object);
fscache_stat_d(&fscache_n_cop_attr_changed);
- fscache_unuse_cookie(object);
if (ret < 0)
fscache_abort_object(object);
}
@@ -184,6 +182,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
{
struct fscache_operation *op;
struct fscache_object *object;
+ bool wake_cookie;
_enter("%p", cookie);
@@ -199,15 +198,19 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
}
fscache_operation_init(op, fscache_attr_changed_op, NULL);
- op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE);
+ op->flags = FSCACHE_OP_ASYNC |
+ (1 << FSCACHE_OP_EXCLUSIVE) |
+ (1 << FSCACHE_OP_UNUSE_COOKIE);
spin_lock(&cookie->lock);
- if (hlist_empty(&cookie->backing_objects))
+ if (!fscache_cookie_enabled(cookie) ||
+ hlist_empty(&cookie->backing_objects))
goto nobufs;
object = hlist_entry(cookie->backing_objects.first,
struct fscache_object, cookie_link);
+ __fscache_use_cookie(cookie);
if (fscache_submit_exclusive_op(object, op) < 0)
goto nobufs;
spin_unlock(&cookie->lock);
@@ -217,8 +220,11 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
return 0;
nobufs:
+ wake_cookie = __fscache_unuse_cookie(cookie);
spin_unlock(&cookie->lock);
kfree(op);
+ if (wake_cookie)
+ __fscache_wake_unused_cookie(cookie);
fscache_stat(&fscache_n_attr_changed_nobufs);
_leave(" = %d", -ENOBUFS);
return -ENOBUFS;
@@ -263,7 +269,6 @@ static struct fscache_retrieval *fscache_alloc_retrieval(
}
fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op);
- atomic_inc(&cookie->n_active);
op->op.flags = FSCACHE_OP_MYTHREAD |
(1UL << FSCACHE_OP_WAITING) |
(1UL << FSCACHE_OP_UNUSE_COOKIE);
@@ -384,6 +389,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
{
struct fscache_retrieval *op;
struct fscache_object *object;
+ bool wake_cookie = false;
int ret;
_enter("%p,%p,,,", cookie, page);
@@ -405,7 +411,7 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
return -ERESTARTSYS;
op = fscache_alloc_retrieval(cookie, page->mapping,
- end_io_func,context);
+ end_io_func, context);
if (!op) {
_leave(" = -ENOMEM");
return -ENOMEM;
@@ -414,13 +420,15 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie,
spin_lock(&cookie->lock);
- if (hlist_empty(&cookie->backing_objects))
+ if (!fscache_cookie_enabled(cookie) ||
+ hlist_empty(&cookie->backing_objects))
goto nobufs_unlock;
object = hlist_entry(cookie->backing_objects.first,
struct fscache_object, cookie_link);
ASSERT(test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags));
+ __fscache_use_cookie(cookie);
atomic_inc(&object->n_reads);
__set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
@@ -475,9 +483,11 @@ error:
nobufs_unlock_dec:
atomic_dec(&object->n_reads);
+ wake_cookie = __fscache_unuse_cookie(cookie);
nobufs_unlock:
spin_unlock(&cookie->lock);
- atomic_dec(&cookie->n_active);
+ if (wake_cookie)
+ __fscache_wake_unused_cookie(cookie);
kfree(op);
nobufs:
fscache_stat(&fscache_n_retrievals_nobufs);
@@ -514,6 +524,7 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
{
struct fscache_retrieval *op;
struct fscache_object *object;
+ bool wake_cookie = false;
int ret;
_enter("%p,,%d,,,", cookie, *nr_pages);
@@ -542,11 +553,13 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie,
spin_lock(&cookie->lock);
- if (hlist_empty(&cookie->backing_objects))
+ if (!fscache_cookie_enabled(cookie) ||
+ hlist_empty(&cookie->backing_objects))
goto nobufs_unlock;
object = hlist_entry(cookie->backing_objects.first,
struct fscache_object, cookie_link);
+ __fscache_use_cookie(cookie);
atomic_inc(&object->n_reads);
__set_bit(FSCACHE_OP_DEC_READ_CNT, &op->op.flags);
@@ -601,10 +614,12 @@ error:
nobufs_unlock_dec:
atomic_dec(&object->n_reads);
+ wake_cookie = __fscache_unuse_cookie(cookie);
nobufs_unlock:
spin_unlock(&cookie->lock);
- atomic_dec(&cookie->n_active);
kfree(op);
+ if (wake_cookie)
+ __fscache_wake_unused_cookie(cookie);
nobufs:
fscache_stat(&fscache_n_retrievals_nobufs);
_leave(" = -ENOBUFS");
@@ -626,6 +641,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
{
struct fscache_retrieval *op;
struct fscache_object *object;
+ bool wake_cookie = false;
int ret;
_enter("%p,%p,,,", cookie, page);
@@ -653,13 +669,15 @@ int __fscache_alloc_page(struct fscache_cookie *cookie,
spin_lock(&cookie->lock);
- if (hlist_empty(&cookie->backing_objects))
+ if (!fscache_cookie_enabled(cookie) ||
+ hlist_empty(&cookie->backing_objects))
goto nobufs_unlock;
object = hlist_entry(cookie->backing_objects.first,
struct fscache_object, cookie_link);
+ __fscache_use_cookie(cookie);
if (fscache_submit_op(object, &op->op) < 0)
- goto nobufs_unlock;
+ goto nobufs_unlock_dec;
spin_unlock(&cookie->lock);
fscache_stat(&fscache_n_alloc_ops);
@@ -689,10 +707,13 @@ error:
_leave(" = %d", ret);
return ret;
+nobufs_unlock_dec:
+ wake_cookie = __fscache_unuse_cookie(cookie);
nobufs_unlock:
spin_unlock(&cookie->lock);
- atomic_dec(&cookie->n_active);
kfree(op);
+ if (wake_cookie)
+ __fscache_wake_unused_cookie(cookie);
nobufs:
fscache_stat(&fscache_n_allocs_nobufs);
_leave(" = -ENOBUFS");
@@ -889,6 +910,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
{
struct fscache_storage *op;
struct fscache_object *object;
+ bool wake_cookie = false;
int ret;
_enter("%p,%x,", cookie, (u32) page->flags);
@@ -920,7 +942,8 @@ int __fscache_write_page(struct fscache_cookie *cookie,
ret = -ENOBUFS;
spin_lock(&cookie->lock);
- if (hlist_empty(&cookie->backing_objects))
+ if (!fscache_cookie_enabled(cookie) ||
+ hlist_empty(&cookie->backing_objects))
goto nobufs;
object = hlist_entry(cookie->backing_objects.first,
struct fscache_object, cookie_link);
@@ -957,7 +980,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
op->op.debug_id = atomic_inc_return(&fscache_op_debug_id);
op->store_limit = object->store_limit;
- atomic_inc(&cookie->n_active);
+ __fscache_use_cookie(cookie);
if (fscache_submit_op(object, &op->op) < 0)
goto submit_failed;
@@ -984,10 +1007,10 @@ already_pending:
return 0;
submit_failed:
- atomic_dec(&cookie->n_active);
spin_lock(&cookie->stores_lock);
radix_tree_delete(&cookie->stores, page->index);
spin_unlock(&cookie->stores_lock);
+ wake_cookie = __fscache_unuse_cookie(cookie);
page_cache_release(page);
ret = -ENOBUFS;
goto nobufs;
@@ -999,6 +1022,8 @@ nobufs:
spin_unlock(&cookie->lock);
radix_tree_preload_end();
kfree(op);
+ if (wake_cookie)
+ __fscache_wake_unused_cookie(cookie);
fscache_stat(&fscache_n_stores_nobufs);
_leave(" = -ENOBUFS");
return -ENOBUFS;
@@ -1083,10 +1108,8 @@ void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page)
static bool once_only;
if (!once_only) {
once_only = true;
- printk(KERN_WARNING "FS-Cache:"
- " Cookie type %s marked page %lx"
- " multiple times\n",
- cookie->def->name, page->index);
+ pr_warn("Cookie type %s marked page %lx multiple times\n",
+ cookie->def->name, page->index);
}
}
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index a0b0855d00a..205e0d5d530 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -348,7 +348,7 @@ int __init fuse_ctl_init(void)
return register_filesystem(&fuse_ctl_fs_type);
}
-void fuse_ctl_cleanup(void)
+void __exit fuse_ctl_cleanup(void)
{
unregister_filesystem(&fuse_ctl_fs_type);
}
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index adbfd66b380..966ace8b243 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -94,8 +94,10 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
loff_t pos = 0;
struct iovec iov = { .iov_base = buf, .iov_len = count };
struct fuse_io_priv io = { .async = 0, .file = file };
+ struct iov_iter ii;
+ iov_iter_init(&ii, READ, &iov, 1, count);
- return fuse_direct_io(&io, &iov, 1, count, &pos, 0);
+ return fuse_direct_io(&io, &ii, &pos, FUSE_DIO_CUSE);
}
static ssize_t cuse_write(struct file *file, const char __user *buf,
@@ -104,12 +106,15 @@ static ssize_t cuse_write(struct file *file, const char __user *buf,
loff_t pos = 0;
struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
struct fuse_io_priv io = { .async = 0, .file = file };
+ struct iov_iter ii;
+ iov_iter_init(&ii, WRITE, &iov, 1, count);
/*
* No locking or generic_write_checks(), the server is
* responsible for locking and sanity checks.
*/
- return fuse_direct_io(&io, &iov, 1, count, &pos, 1);
+ return fuse_direct_io(&io, &ii, &pos,
+ FUSE_DIO_WRITE | FUSE_DIO_CUSE);
}
static int cuse_open(struct inode *inode, struct file *file)
@@ -473,7 +478,7 @@ err:
static void cuse_fc_release(struct fuse_conn *fc)
{
struct cuse_conn *cc = fc_to_cc(fc);
- kfree(cc);
+ kfree_rcu(cc, fc.rcu);
}
/**
@@ -568,7 +573,7 @@ static ssize_t cuse_class_waiting_show(struct device *dev,
return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));
}
-static DEVICE_ATTR(waiting, S_IFREG | 0400, cuse_class_waiting_show, NULL);
+static DEVICE_ATTR(waiting, 0400, cuse_class_waiting_show, NULL);
static ssize_t cuse_class_abort_store(struct device *dev,
struct device_attribute *attr,
@@ -579,7 +584,7 @@ static ssize_t cuse_class_abort_store(struct device *dev,
fuse_abort_conn(&cc->fc);
return count;
}
-static DEVICE_ATTR(abort, S_IFREG | 0200, NULL, cuse_class_abort_store);
+static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store);
static struct attribute *cuse_class_dev_attrs[] = {
&dev_attr_waiting.attr,
@@ -589,11 +594,14 @@ static struct attribute *cuse_class_dev_attrs[] = {
ATTRIBUTE_GROUPS(cuse_class_dev);
static struct miscdevice cuse_miscdev = {
- .minor = MISC_DYNAMIC_MINOR,
+ .minor = CUSE_MINOR,
.name = "cuse",
.fops = &cuse_channel_fops,
};
+MODULE_ALIAS_MISCDEV(CUSE_MINOR);
+MODULE_ALIAS("devname:cuse");
+
static int __init cuse_init(void)
{
int i, rc;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ef74ad5fd36..ca887314aba 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -643,9 +643,8 @@ struct fuse_copy_state {
unsigned long seglen;
unsigned long addr;
struct page *pg;
- void *mapaddr;
- void *buf;
unsigned len;
+ unsigned offset;
unsigned move_pages:1;
};
@@ -666,23 +665,17 @@ static void fuse_copy_finish(struct fuse_copy_state *cs)
if (cs->currbuf) {
struct pipe_buffer *buf = cs->currbuf;
- if (!cs->write) {
- buf->ops->unmap(cs->pipe, buf, cs->mapaddr);
- } else {
- kunmap(buf->page);
+ if (cs->write)
buf->len = PAGE_SIZE - cs->len;
- }
cs->currbuf = NULL;
- cs->mapaddr = NULL;
- } else if (cs->mapaddr) {
- kunmap(cs->pg);
+ } else if (cs->pg) {
if (cs->write) {
flush_dcache_page(cs->pg);
set_page_dirty_lock(cs->pg);
}
put_page(cs->pg);
- cs->mapaddr = NULL;
}
+ cs->pg = NULL;
}
/*
@@ -691,7 +684,7 @@ static void fuse_copy_finish(struct fuse_copy_state *cs)
*/
static int fuse_copy_fill(struct fuse_copy_state *cs)
{
- unsigned long offset;
+ struct page *page;
int err;
unlock_request(cs->fc, cs->req);
@@ -706,14 +699,12 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
BUG_ON(!cs->nr_segs);
cs->currbuf = buf;
- cs->mapaddr = buf->ops->map(cs->pipe, buf, 0);
+ cs->pg = buf->page;
+ cs->offset = buf->offset;
cs->len = buf->len;
- cs->buf = cs->mapaddr + buf->offset;
cs->pipebufs++;
cs->nr_segs--;
} else {
- struct page *page;
-
if (cs->nr_segs == cs->pipe->buffers)
return -EIO;
@@ -726,8 +717,8 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
buf->len = 0;
cs->currbuf = buf;
- cs->mapaddr = kmap(page);
- cs->buf = cs->mapaddr;
+ cs->pg = page;
+ cs->offset = 0;
cs->len = PAGE_SIZE;
cs->pipebufs++;
cs->nr_segs++;
@@ -740,14 +731,13 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
cs->iov++;
cs->nr_segs--;
}
- err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg);
+ err = get_user_pages_fast(cs->addr, 1, cs->write, &page);
if (err < 0)
return err;
BUG_ON(err != 1);
- offset = cs->addr % PAGE_SIZE;
- cs->mapaddr = kmap(cs->pg);
- cs->buf = cs->mapaddr + offset;
- cs->len = min(PAGE_SIZE - offset, cs->seglen);
+ cs->pg = page;
+ cs->offset = cs->addr % PAGE_SIZE;
+ cs->len = min(PAGE_SIZE - cs->offset, cs->seglen);
cs->seglen -= cs->len;
cs->addr += cs->len;
}
@@ -760,15 +750,20 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
{
unsigned ncpy = min(*size, cs->len);
if (val) {
+ void *pgaddr = kmap_atomic(cs->pg);
+ void *buf = pgaddr + cs->offset;
+
if (cs->write)
- memcpy(cs->buf, *val, ncpy);
+ memcpy(buf, *val, ncpy);
else
- memcpy(*val, cs->buf, ncpy);
+ memcpy(*val, buf, ncpy);
+
+ kunmap_atomic(pgaddr);
*val += ncpy;
}
*size -= ncpy;
cs->len -= ncpy;
- cs->buf += ncpy;
+ cs->offset += ncpy;
return ncpy;
}
@@ -874,8 +869,8 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
out_fallback_unlock:
unlock_page(newpage);
out_fallback:
- cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
- cs->buf = cs->mapaddr + buf->offset;
+ cs->pg = buf->page;
+ cs->offset = buf->offset;
err = lock_request(cs->fc, cs->req);
if (err)
@@ -1296,22 +1291,6 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));
}
-static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf)
-{
- return 1;
-}
-
-static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = {
- .can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
- .confirm = generic_pipe_buf_confirm,
- .release = generic_pipe_buf_release,
- .steal = fuse_dev_pipe_buf_steal,
- .get = generic_pipe_buf_get,
-};
-
static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe,
size_t len, unsigned int flags)
@@ -1358,7 +1337,11 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
buf->page = bufs[page_nr].page;
buf->offset = bufs[page_nr].offset;
buf->len = bufs[page_nr].len;
- buf->ops = &fuse_dev_pipe_buf_ops;
+ /*
+ * Need to be careful about this. Having buf->ops in module
+ * code can Oops if the buffer persists after module unload.
+ */
+ buf->ops = &nosteal_pipe_buf_ops;
pipe->nrbufs++;
page_nr++;
@@ -1599,7 +1582,8 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
err = fuse_copy_page(cs, &page, offset, this_num, 0);
- if (!err && offset == 0 && (num != 0 || file_size == end))
+ if (!err && offset == 0 &&
+ (this_num == PAGE_CACHE_SIZE || file_size == end))
SetPageUptodate(page);
unlock_page(page);
page_cache_release(page);
@@ -1625,7 +1609,7 @@ out_finish:
static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
{
- release_pages(req->pages, req->num_pages, 0);
+ release_pages(req->pages, req->num_pages, false);
}
static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index b7989f2ab4c..0c6048247a3 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -112,6 +112,16 @@ void fuse_invalidate_attr(struct inode *inode)
get_fuse_inode(inode)->i_time = 0;
}
+/**
+ * Mark the attributes as stale due to an atime change. Avoid the invalidate if
+ * atime is not used.
+ */
+void fuse_invalidate_atime(struct inode *inode)
+{
+ if (!IS_RDONLY(inode))
+ fuse_invalidate_attr(inode);
+}
+
/*
* Just mark the entry as stale, so that a next attempt to look it up
* will result in a new lookup call to userspace
@@ -188,7 +198,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
inode = ACCESS_ONCE(entry->d_inode);
if (inode && is_bad_inode(inode))
goto invalid;
- else if (fuse_dentry_time(entry) < get_jiffies_64()) {
+ else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) ||
+ (flags & LOOKUP_REVAL)) {
int err;
struct fuse_entry_out outarg;
struct fuse_req *req;
@@ -342,24 +353,6 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
return err;
}
-static struct dentry *fuse_materialise_dentry(struct dentry *dentry,
- struct inode *inode)
-{
- struct dentry *newent;
-
- if (inode && S_ISDIR(inode->i_mode)) {
- struct fuse_conn *fc = get_fuse_conn(inode);
-
- mutex_lock(&fc->inst_mutex);
- newent = d_materialise_unique(dentry, inode);
- mutex_unlock(&fc->inst_mutex);
- } else {
- newent = d_materialise_unique(dentry, inode);
- }
-
- return newent;
-}
-
static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
unsigned int flags)
{
@@ -382,7 +375,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
if (inode && get_node_id(inode) == FUSE_ROOT_ID)
goto out_iput;
- newent = fuse_materialise_dentry(entry, inode);
+ newent = d_materialise_unique(entry, inode);
err = PTR_ERR(newent);
if (IS_ERR(newent))
goto out_err;
@@ -601,21 +594,9 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
}
kfree(forget);
- if (S_ISDIR(inode->i_mode)) {
- struct dentry *alias;
- mutex_lock(&fc->inst_mutex);
- alias = d_find_alias(inode);
- if (alias) {
- /* New directory must have moved since mkdir */
- mutex_unlock(&fc->inst_mutex);
- dput(alias);
- iput(inode);
- return -EBUSY;
- }
- d_instantiate(entry, inode);
- mutex_unlock(&fc->inst_mutex);
- } else
- d_instantiate(entry, inode);
+ err = d_instantiate_no_diralias(entry, inode);
+ if (err)
+ return err;
fuse_change_entry_timeout(entry, &outarg);
fuse_invalidate_attr(dir);
@@ -699,6 +680,14 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
return create_new_entry(fc, req, dir, entry, S_IFLNK);
}
+static inline void fuse_update_ctime(struct inode *inode)
+{
+ if (!IS_NOCMTIME(inode)) {
+ inode->i_ctime = current_fs_time(inode->i_sb);
+ mark_inode_dirty_sync(inode);
+ }
+}
+
static int fuse_unlink(struct inode *dir, struct dentry *entry)
{
int err;
@@ -733,6 +722,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
fuse_invalidate_attr(inode);
fuse_invalidate_attr(dir);
fuse_invalidate_entry_cache(entry);
+ fuse_update_ctime(inode);
} else if (err == -EINTR)
fuse_invalidate_entry(entry);
return err;
@@ -763,23 +753,26 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
return err;
}
-static int fuse_rename(struct inode *olddir, struct dentry *oldent,
- struct inode *newdir, struct dentry *newent)
+static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
+ struct inode *newdir, struct dentry *newent,
+ unsigned int flags, int opcode, size_t argsize)
{
int err;
- struct fuse_rename_in inarg;
+ struct fuse_rename2_in inarg;
struct fuse_conn *fc = get_fuse_conn(olddir);
- struct fuse_req *req = fuse_get_req_nopages(fc);
+ struct fuse_req *req;
+ req = fuse_get_req_nopages(fc);
if (IS_ERR(req))
return PTR_ERR(req);
- memset(&inarg, 0, sizeof(inarg));
+ memset(&inarg, 0, argsize);
inarg.newdir = get_node_id(newdir);
- req->in.h.opcode = FUSE_RENAME;
+ inarg.flags = flags;
+ req->in.h.opcode = opcode;
req->in.h.nodeid = get_node_id(olddir);
req->in.numargs = 3;
- req->in.args[0].size = sizeof(inarg);
+ req->in.args[0].size = argsize;
req->in.args[0].value = &inarg;
req->in.args[1].size = oldent->d_name.len + 1;
req->in.args[1].value = oldent->d_name.name;
@@ -791,15 +784,22 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
if (!err) {
/* ctime changes */
fuse_invalidate_attr(oldent->d_inode);
+ fuse_update_ctime(oldent->d_inode);
+
+ if (flags & RENAME_EXCHANGE) {
+ fuse_invalidate_attr(newent->d_inode);
+ fuse_update_ctime(newent->d_inode);
+ }
fuse_invalidate_attr(olddir);
if (olddir != newdir)
fuse_invalidate_attr(newdir);
/* newent will end up negative */
- if (newent->d_inode) {
+ if (!(flags & RENAME_EXCHANGE) && newent->d_inode) {
fuse_invalidate_attr(newent->d_inode);
fuse_invalidate_entry_cache(newent);
+ fuse_update_ctime(newent->d_inode);
}
} else if (err == -EINTR) {
/* If request was interrupted, DEITY only knows if the
@@ -815,6 +815,42 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
return err;
}
+static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
+ struct inode *newdir, struct dentry *newent,
+ unsigned int flags)
+{
+ struct fuse_conn *fc = get_fuse_conn(olddir);
+ int err;
+
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ return -EINVAL;
+
+ if (flags) {
+ if (fc->no_rename2 || fc->minor < 23)
+ return -EINVAL;
+
+ err = fuse_rename_common(olddir, oldent, newdir, newent, flags,
+ FUSE_RENAME2,
+ sizeof(struct fuse_rename2_in));
+ if (err == -ENOSYS) {
+ fc->no_rename2 = 1;
+ err = -EINVAL;
+ }
+ } else {
+ err = fuse_rename_common(olddir, oldent, newdir, newent, 0,
+ FUSE_RENAME,
+ sizeof(struct fuse_rename_in));
+ }
+
+ return err;
+}
+
+static int fuse_rename(struct inode *olddir, struct dentry *oldent,
+ struct inode *newdir, struct dentry *newent)
+{
+ return fuse_rename2(olddir, oldent, newdir, newent, 0);
+}
+
static int fuse_link(struct dentry *entry, struct inode *newdir,
struct dentry *newent)
{
@@ -849,6 +885,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
inc_nlink(inode);
spin_unlock(&fc->lock);
fuse_invalidate_attr(inode);
+ fuse_update_ctime(inode);
} else if (err == -EINTR) {
fuse_invalidate_attr(inode);
}
@@ -859,6 +896,16 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
struct kstat *stat)
{
unsigned int blkbits;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ /* see the comment in fuse_change_attributes() */
+ if (fc->writeback_cache && S_ISREG(inode->i_mode)) {
+ attr->size = i_size_read(inode);
+ attr->mtime = inode->i_mtime.tv_sec;
+ attr->mtimensec = inode->i_mtime.tv_nsec;
+ attr->ctime = inode->i_ctime.tv_sec;
+ attr->ctimensec = inode->i_ctime.tv_nsec;
+ }
stat->dev = inode->i_sb->s_dev;
stat->ino = attr->ino;
@@ -945,7 +992,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,
int err;
bool r;
- if (fi->i_time < get_jiffies_64()) {
+ if (time_before64(fi->i_time, get_jiffies_64())) {
r = true;
err = fuse_do_getattr(inode, stat, file);
} else {
@@ -1131,7 +1178,7 @@ static int fuse_permission(struct inode *inode, int mask)
((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) {
struct fuse_inode *fi = get_fuse_inode(inode);
- if (fi->i_time < get_jiffies_64()) {
+ if (time_before64(fi->i_time, get_jiffies_64())) {
refreshed = true;
err = fuse_perm_getattr(inode, mask);
@@ -1284,7 +1331,7 @@ static int fuse_direntplus_link(struct file *file,
if (!inode)
goto out;
- alias = fuse_materialise_dentry(dentry, inode);
+ alias = d_materialise_unique(dentry, inode);
err = PTR_ERR(alias);
if (IS_ERR(alias))
goto out;
@@ -1401,7 +1448,7 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx)
}
__free_page(page);
- fuse_invalidate_attr(inode); /* atime changed */
+ fuse_invalidate_atime(inode);
return err;
}
@@ -1434,7 +1481,7 @@ static char *read_link(struct dentry *dentry)
link[req->out.args[0].size] = '\0';
out:
fuse_put_request(fc, req);
- fuse_invalidate_attr(inode); /* atime changed */
+ fuse_invalidate_atime(inode);
return link;
}
@@ -1497,12 +1544,16 @@ static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd,
FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR);
}
-static bool update_mtime(unsigned ivalid)
+static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
{
/* Always update if mtime is explicitly set */
if (ivalid & ATTR_MTIME_SET)
return true;
+ /* Or if kernel i_mtime is the official one */
+ if (trust_local_mtime)
+ return true;
+
/* If it's an open(O_TRUNC) or an ftruncate(), don't update */
if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE)))
return false;
@@ -1511,7 +1562,8 @@ static bool update_mtime(unsigned ivalid)
return true;
}
-static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
+static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
+ bool trust_local_cmtime)
{
unsigned ivalid = iattr->ia_valid;
@@ -1530,13 +1582,18 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg)
if (!(ivalid & ATTR_ATIME_SET))
arg->valid |= FATTR_ATIME_NOW;
}
- if ((ivalid & ATTR_MTIME) && update_mtime(ivalid)) {
+ if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) {
arg->valid |= FATTR_MTIME;
arg->mtime = iattr->ia_mtime.tv_sec;
arg->mtimensec = iattr->ia_mtime.tv_nsec;
- if (!(ivalid & ATTR_MTIME_SET))
+ if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime)
arg->valid |= FATTR_MTIME_NOW;
}
+ if ((ivalid & ATTR_CTIME) && trust_local_cmtime) {
+ arg->valid |= FATTR_CTIME;
+ arg->ctime = iattr->ia_ctime.tv_sec;
+ arg->ctimensec = iattr->ia_ctime.tv_nsec;
+ }
}
/*
@@ -1583,6 +1640,62 @@ void fuse_release_nowrite(struct inode *inode)
spin_unlock(&fc->lock);
}
+static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req,
+ struct inode *inode,
+ struct fuse_setattr_in *inarg_p,
+ struct fuse_attr_out *outarg_p)
+{
+ req->in.h.opcode = FUSE_SETATTR;
+ req->in.h.nodeid = get_node_id(inode);
+ req->in.numargs = 1;
+ req->in.args[0].size = sizeof(*inarg_p);
+ req->in.args[0].value = inarg_p;
+ req->out.numargs = 1;
+ if (fc->minor < 9)
+ req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
+ else
+ req->out.args[0].size = sizeof(*outarg_p);
+ req->out.args[0].value = outarg_p;
+}
+
+/*
+ * Flush inode->i_mtime to the server
+ */
+int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_req *req;
+ struct fuse_setattr_in inarg;
+ struct fuse_attr_out outarg;
+ int err;
+
+ req = fuse_get_req_nopages(fc);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ memset(&inarg, 0, sizeof(inarg));
+ memset(&outarg, 0, sizeof(outarg));
+
+ inarg.valid = FATTR_MTIME;
+ inarg.mtime = inode->i_mtime.tv_sec;
+ inarg.mtimensec = inode->i_mtime.tv_nsec;
+ if (fc->minor >= 23) {
+ inarg.valid |= FATTR_CTIME;
+ inarg.ctime = inode->i_ctime.tv_sec;
+ inarg.ctimensec = inode->i_ctime.tv_nsec;
+ }
+ if (ff) {
+ inarg.valid |= FATTR_FH;
+ inarg.fh = ff->fh;
+ }
+ fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
+ fuse_request_send(fc, req);
+ err = req->out.h.error;
+ fuse_put_request(fc, req);
+
+ return err;
+}
+
/*
* Set attributes, and at the same time refresh them.
*
@@ -1600,8 +1713,10 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
struct fuse_setattr_in inarg;
struct fuse_attr_out outarg;
bool is_truncate = false;
+ bool is_wb = fc->writeback_cache;
loff_t oldsize;
int err;
+ bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode);
if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
attr->ia_valid |= ATTR_FORCE;
@@ -1626,11 +1741,13 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
if (is_truncate) {
fuse_set_nowrite(inode);
set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+ if (trust_local_cmtime && attr->ia_size != inode->i_size)
+ attr->ia_valid |= ATTR_MTIME | ATTR_CTIME;
}
memset(&inarg, 0, sizeof(inarg));
memset(&outarg, 0, sizeof(outarg));
- iattr_to_fattr(attr, &inarg);
+ iattr_to_fattr(attr, &inarg, trust_local_cmtime);
if (file) {
struct fuse_file *ff = file->private_data;
inarg.valid |= FATTR_FH;
@@ -1641,17 +1758,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
inarg.valid |= FATTR_LOCKOWNER;
inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
}
- req->in.h.opcode = FUSE_SETATTR;
- req->in.h.nodeid = get_node_id(inode);
- req->in.numargs = 1;
- req->in.args[0].size = sizeof(inarg);
- req->in.args[0].value = &inarg;
- req->out.numargs = 1;
- if (fc->minor < 9)
- req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
- else
- req->out.args[0].size = sizeof(outarg);
- req->out.args[0].value = &outarg;
+ fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
fuse_request_send(fc, req);
err = req->out.h.error;
fuse_put_request(fc, req);
@@ -1668,10 +1775,21 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
}
spin_lock(&fc->lock);
+ /* the kernel maintains i_mtime locally */
+ if (trust_local_cmtime) {
+ if (attr->ia_valid & ATTR_MTIME)
+ inode->i_mtime = attr->ia_mtime;
+ if (attr->ia_valid & ATTR_CTIME)
+ inode->i_ctime = attr->ia_ctime;
+ /* FIXME: clear I_DIRTY_SYNC? */
+ }
+
fuse_change_attributes_common(inode, &outarg.attr,
attr_timeout(&outarg));
oldsize = inode->i_size;
- i_size_write(inode, outarg.attr.size);
+ /* see the comment in fuse_change_attributes() */
+ if (!is_wb || is_truncate || !S_ISREG(inode->i_mode))
+ i_size_write(inode, outarg.attr.size);
if (is_truncate) {
/* NOTE: this may release/reacquire fc->lock */
@@ -1683,7 +1801,8 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
* Only call invalidate_inode_pages2() after removing
* FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
*/
- if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
+ if ((is_truncate || !is_wb) &&
+ S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
truncate_pagecache(inode, outarg.attr.size);
invalidate_inode_pages2(inode->i_mapping);
}
@@ -1759,8 +1878,10 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
fc->no_setxattr = 1;
err = -EOPNOTSUPP;
}
- if (!err)
+ if (!err) {
fuse_invalidate_attr(inode);
+ fuse_update_ctime(inode);
+ }
return err;
}
@@ -1890,8 +2011,10 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
fc->no_removexattr = 1;
err = -EOPNOTSUPP;
}
- if (!err)
+ if (!err) {
fuse_invalidate_attr(inode);
+ fuse_update_ctime(inode);
+ }
return err;
}
@@ -1902,6 +2025,7 @@ static const struct inode_operations fuse_dir_inode_operations = {
.unlink = fuse_unlink,
.rmdir = fuse_rmdir,
.rename = fuse_rename,
+ .rename2 = fuse_rename2,
.link = fuse_link,
.setattr = fuse_setattr,
.create = fuse_create,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 4598345ab87..40ac2628ddc 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -127,7 +127,15 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
if (atomic_dec_and_test(&ff->count)) {
struct fuse_req *req = ff->reserved_req;
- if (sync) {
+ if (ff->fc->no_open) {
+ /*
+ * Drop the release request when client does not
+ * implement 'open'
+ */
+ req->background = 0;
+ path_put(&req->misc.release.path);
+ fuse_put_request(ff->fc, req);
+ } else if (sync) {
req->background = 0;
fuse_request_send(ff->fc, req);
path_put(&req->misc.release.path);
@@ -144,33 +152,58 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
bool isdir)
{
- struct fuse_open_out outarg;
struct fuse_file *ff;
- int err;
int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
ff = fuse_file_alloc(fc);
if (!ff)
return -ENOMEM;
- err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
- if (err) {
- fuse_file_free(ff);
- return err;
+ ff->fh = 0;
+ ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */
+ if (!fc->no_open || isdir) {
+ struct fuse_open_out outarg;
+ int err;
+
+ err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
+ if (!err) {
+ ff->fh = outarg.fh;
+ ff->open_flags = outarg.open_flags;
+
+ } else if (err != -ENOSYS || isdir) {
+ fuse_file_free(ff);
+ return err;
+ } else {
+ fc->no_open = 1;
+ }
}
if (isdir)
- outarg.open_flags &= ~FOPEN_DIRECT_IO;
+ ff->open_flags &= ~FOPEN_DIRECT_IO;
- ff->fh = outarg.fh;
ff->nodeid = nodeid;
- ff->open_flags = outarg.open_flags;
file->private_data = fuse_file_get(ff);
return 0;
}
EXPORT_SYMBOL_GPL(fuse_do_open);
+static void fuse_link_write_file(struct file *file)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_file *ff = file->private_data;
+ /*
+ * file may be written through mmap, so chain it onto the
+ * inodes's write_file list
+ */
+ spin_lock(&fc->lock);
+ if (list_empty(&ff->write_entry))
+ list_add(&ff->write_entry, &fi->write_files);
+ spin_unlock(&fc->lock);
+}
+
void fuse_finish_open(struct inode *inode, struct file *file)
{
struct fuse_file *ff = file->private_data;
@@ -190,25 +223,37 @@ void fuse_finish_open(struct inode *inode, struct file *file)
i_size_write(inode, 0);
spin_unlock(&fc->lock);
fuse_invalidate_attr(inode);
+ if (fc->writeback_cache)
+ file_update_time(file);
}
+ if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
+ fuse_link_write_file(file);
}
int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
{
struct fuse_conn *fc = get_fuse_conn(inode);
int err;
+ bool lock_inode = (file->f_flags & O_TRUNC) &&
+ fc->atomic_o_trunc &&
+ fc->writeback_cache;
err = generic_file_open(inode, file);
if (err)
return err;
+ if (lock_inode)
+ mutex_lock(&inode->i_mutex);
+
err = fuse_do_open(fc, get_node_id(inode), file, isdir);
- if (err)
- return err;
- fuse_finish_open(inode, file);
+ if (!err)
+ fuse_finish_open(inode, file);
- return 0;
+ if (lock_inode)
+ mutex_unlock(&inode->i_mutex);
+
+ return err;
}
static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
@@ -275,6 +320,12 @@ static int fuse_open(struct inode *inode, struct file *file)
static int fuse_release(struct inode *inode, struct file *file)
{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ /* see fuse_vma_close() for !writeback_cache case */
+ if (fc->writeback_cache)
+ write_inode_now(inode, 1);
+
fuse_release_common(file, FUSE_RELEASE);
/* return value is ignored by VFS */
@@ -316,12 +367,13 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
}
/*
- * Check if page is under writeback
+ * Check if any page in a range is under writeback
*
* This is currently done by walking the list of writepage requests
* for the inode, which can be pretty inefficient.
*/
-static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
+static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
+ pgoff_t idx_to)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
@@ -334,7 +386,8 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
BUG_ON(req->inode != inode);
curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
- if (curr_index == index) {
+ if (idx_from < curr_index + req->num_pages &&
+ curr_index <= idx_to) {
found = true;
break;
}
@@ -344,6 +397,11 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
return found;
}
+static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
+{
+ return fuse_range_is_writeback(inode, index, index);
+}
+
/*
* Wait for page writeback to be completed.
*
@@ -358,6 +416,21 @@ static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
return 0;
}
+/*
+ * Wait for all pending writepages on the inode to finish.
+ *
+ * This is currently done by blocking further writes with FUSE_NOWRITE
+ * and waiting for all sent writes to complete.
+ *
+ * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
+ * could conflict with truncation.
+ */
+static void fuse_sync_writes(struct inode *inode)
+{
+ fuse_set_nowrite(inode);
+ fuse_release_nowrite(inode);
+}
+
static int fuse_flush(struct file *file, fl_owner_t id)
{
struct inode *inode = file_inode(file);
@@ -373,6 +446,14 @@ static int fuse_flush(struct file *file, fl_owner_t id)
if (fc->no_flush)
return 0;
+ err = write_inode_now(inode, 1);
+ if (err)
+ return err;
+
+ mutex_lock(&inode->i_mutex);
+ fuse_sync_writes(inode);
+ mutex_unlock(&inode->i_mutex);
+
req = fuse_get_req_nofail_nopages(fc, file);
memset(&inarg, 0, sizeof(inarg));
inarg.fh = ff->fh;
@@ -393,21 +474,6 @@ static int fuse_flush(struct file *file, fl_owner_t id)
return err;
}
-/*
- * Wait for all pending writepages on the inode to finish.
- *
- * This is currently done by blocking further writes with FUSE_NOWRITE
- * and waiting for all sent writes to complete.
- *
- * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
- * could conflict with truncation.
- */
-static void fuse_sync_writes(struct inode *inode)
-{
- fuse_set_nowrite(inode);
- fuse_release_nowrite(inode);
-}
-
int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
int datasync, int isdir)
{
@@ -421,13 +487,6 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
if (is_bad_inode(inode))
return -EIO;
- err = filemap_write_and_wait_range(inode->i_mapping, start, end);
- if (err)
- return err;
-
- if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
- return 0;
-
mutex_lock(&inode->i_mutex);
/*
@@ -435,11 +494,17 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
* wait for all outstanding writes, before sending the FSYNC
* request.
*/
- err = write_inode_now(inode, 0);
+ err = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (err)
goto out;
fuse_sync_writes(inode);
+ err = sync_inode_metadata(inode, 1);
+ if (err)
+ goto out;
+
+ if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
+ goto out;
req = fuse_get_req_nopages(fc);
if (IS_ERR(req)) {
@@ -637,7 +702,33 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
spin_unlock(&fc->lock);
}
-static int fuse_readpage(struct file *file, struct page *page)
+static void fuse_short_read(struct fuse_req *req, struct inode *inode,
+ u64 attr_ver)
+{
+ size_t num_read = req->out.args[0].size;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (fc->writeback_cache) {
+ /*
+ * A hole in a file. Some data after the hole are in page cache,
+ * but have not reached the client fs yet. So, the hole is not
+ * present there.
+ */
+ int i;
+ int start_idx = num_read >> PAGE_CACHE_SHIFT;
+ size_t off = num_read & (PAGE_CACHE_SIZE - 1);
+
+ for (i = start_idx; i < req->num_pages; i++) {
+ zero_user_segment(req->pages[i], off, PAGE_CACHE_SIZE);
+ off = 0;
+ }
+ } else {
+ loff_t pos = page_offset(req->pages[0]) + num_read;
+ fuse_read_update_size(inode, pos, attr_ver);
+ }
+}
+
+static int fuse_do_readpage(struct file *file, struct page *page)
{
struct fuse_io_priv io = { .async = 0, .file = file };
struct inode *inode = page->mapping->host;
@@ -649,10 +740,6 @@ static int fuse_readpage(struct file *file, struct page *page)
u64 attr_ver;
int err;
- err = -EIO;
- if (is_bad_inode(inode))
- goto out;
-
/*
* Page writeback can extend beyond the lifetime of the
* page-cache page, so make sure we read a properly synced
@@ -661,9 +748,8 @@ static int fuse_readpage(struct file *file, struct page *page)
fuse_wait_on_page_writeback(inode, page->index);
req = fuse_get_req(fc, 1);
- err = PTR_ERR(req);
if (IS_ERR(req))
- goto out;
+ return PTR_ERR(req);
attr_ver = fuse_get_attr_version(fc);
@@ -674,19 +760,33 @@ static int fuse_readpage(struct file *file, struct page *page)
req->page_descs[0].length = count;
num_read = fuse_send_read(req, &io, pos, count, NULL);
err = req->out.h.error;
- fuse_put_request(fc, req);
if (!err) {
/*
* Short read means EOF. If file size is larger, truncate it
*/
if (num_read < count)
- fuse_read_update_size(inode, pos + num_read, attr_ver);
+ fuse_short_read(req, inode, attr_ver);
SetPageUptodate(page);
}
- fuse_invalidate_attr(inode); /* atime changed */
+ fuse_put_request(fc, req);
+
+ return err;
+}
+
+static int fuse_readpage(struct file *file, struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ int err;
+
+ err = -EIO;
+ if (is_bad_inode(inode))
+ goto out;
+
+ err = fuse_do_readpage(file, page);
+ fuse_invalidate_atime(inode);
out:
unlock_page(page);
return err;
@@ -708,14 +808,10 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
/*
* Short read means EOF. If file size is larger, truncate it
*/
- if (!req->out.h.error && num_read < count) {
- loff_t pos;
+ if (!req->out.h.error && num_read < count)
+ fuse_short_read(req, inode, req->misc.read.attr_ver);
- pos = page_offset(req->pages[0]) + num_read;
- fuse_read_update_size(inode, pos,
- req->misc.read.attr_ver);
- }
- fuse_invalidate_attr(inode); /* atime changed */
+ fuse_invalidate_atime(inode);
}
for (i = 0; i < req->num_pages; i++) {
@@ -837,8 +933,7 @@ out:
return err;
}
-static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
@@ -849,14 +944,14 @@ static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
* i_size is up to date).
*/
if (fc->auto_inval_data ||
- (pos + iov_length(iov, nr_segs) > i_size_read(inode))) {
+ (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
int err;
err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL);
if (err)
return err;
}
- return generic_file_aio_read(iocb, iov, nr_segs, pos);
+ return generic_file_read_iter(iocb, to);
}
static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
@@ -904,16 +999,21 @@ static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io,
return req->misc.write.out.size;
}
-void fuse_write_update_size(struct inode *inode, loff_t pos)
+bool fuse_write_update_size(struct inode *inode, loff_t pos)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
+ bool ret = false;
spin_lock(&fc->lock);
fi->attr_version = ++fc->attr_version;
- if (pos > inode->i_size)
+ if (pos > inode->i_size) {
i_size_write(inode, pos);
+ ret = true;
+ }
spin_unlock(&fc->lock);
+
+ return ret;
}
static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
@@ -985,13 +1085,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
- pagefault_disable();
tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
- pagefault_enable();
flush_dcache_page(page);
- mark_page_accessed(page);
-
if (!tmp) {
unlock_page(page);
page_cache_release(page);
@@ -1084,28 +1180,27 @@ static ssize_t fuse_perform_write(struct file *file,
return res > 0 ? res : err;
}
-static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
- size_t count = 0;
- size_t ocount = 0;
+ size_t count = iov_iter_count(from);
ssize_t written = 0;
ssize_t written_buffered = 0;
struct inode *inode = mapping->host;
ssize_t err;
- struct iov_iter i;
loff_t endbyte = 0;
+ loff_t pos = iocb->ki_pos;
- WARN_ON(iocb->ki_pos != pos);
+ if (get_fuse_conn(inode)->writeback_cache) {
+ /* Update size (EOF optimization) and mode (SUID clearing) */
+ err = fuse_update_attributes(mapping->host, NULL, file, NULL);
+ if (err)
+ return err;
- ocount = 0;
- err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
- if (err)
- return err;
+ return generic_file_write_iter(iocb, from);
+ }
- count = ocount;
mutex_lock(&inode->i_mutex);
/* We can write back this queue in page reclaim */
@@ -1118,6 +1213,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (count == 0)
goto out;
+ iov_iter_truncate(from, count);
err = file_remove_suid(file);
if (err)
goto out;
@@ -1127,17 +1223,13 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
goto out;
if (file->f_flags & O_DIRECT) {
- written = generic_file_direct_write(iocb, iov, &nr_segs,
- pos, &iocb->ki_pos,
- count, ocount);
- if (written < 0 || written == count)
+ written = generic_file_direct_write(iocb, from, pos);
+ if (written < 0 || !iov_iter_count(from))
goto out;
pos += written;
- count -= written;
- iov_iter_init(&i, iov, nr_segs, count, written);
- written_buffered = fuse_perform_write(file, mapping, &i, pos);
+ written_buffered = fuse_perform_write(file, mapping, from, pos);
if (written_buffered < 0) {
err = written_buffered;
goto out;
@@ -1156,8 +1248,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
written += written_buffered;
iocb->ki_pos = pos + written_buffered;
} else {
- iov_iter_init(&i, iov, nr_segs, count, 0);
- written = fuse_perform_write(file, mapping, &i, pos);
+ written = fuse_perform_write(file, mapping, from, pos);
if (written >= 0)
iocb->ki_pos = pos + written;
}
@@ -1195,7 +1286,7 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
size_t nbytes = 0; /* # bytes already packed in req */
/* Special case for kernel I/O: can copy directly into the buffer */
- if (segment_eq(get_fs(), KERNEL_DS)) {
+ if (ii->type & ITER_KVEC) {
unsigned long user_addr = fuse_get_user_addr(ii);
size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
@@ -1211,35 +1302,26 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
unsigned npages;
- unsigned long user_addr = fuse_get_user_addr(ii);
- unsigned offset = user_addr & ~PAGE_MASK;
- size_t frag_size = fuse_get_frag_size(ii, *nbytesp - nbytes);
- int ret;
-
+ size_t start;
unsigned n = req->max_pages - req->num_pages;
- frag_size = min_t(size_t, frag_size, n << PAGE_SHIFT);
-
- npages = (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
- npages = clamp(npages, 1U, n);
-
- ret = get_user_pages_fast(user_addr, npages, !write,
- &req->pages[req->num_pages]);
+ ssize_t ret = iov_iter_get_pages(ii,
+ &req->pages[req->num_pages],
+ n * PAGE_SIZE, &start);
if (ret < 0)
return ret;
- npages = ret;
- frag_size = min_t(size_t, frag_size,
- (npages << PAGE_SHIFT) - offset);
- iov_iter_advance(ii, frag_size);
+ iov_iter_advance(ii, ret);
+ nbytes += ret;
+
+ ret += start;
+ npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
- req->page_descs[req->num_pages].offset = offset;
+ req->page_descs[req->num_pages].offset = start;
fuse_page_descs_length_init(req, req->num_pages, npages);
req->num_pages += npages;
req->page_descs[req->num_pages - 1].length -=
- (npages << PAGE_SHIFT) - offset - frag_size;
-
- nbytes += frag_size;
+ (PAGE_SIZE - ret) & (PAGE_SIZE - 1);
}
if (write)
@@ -1254,48 +1336,46 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
static inline int fuse_iter_npages(const struct iov_iter *ii_p)
{
- struct iov_iter ii = *ii_p;
- int npages = 0;
-
- while (iov_iter_count(&ii) && npages < FUSE_MAX_PAGES_PER_REQ) {
- unsigned long user_addr = fuse_get_user_addr(&ii);
- unsigned offset = user_addr & ~PAGE_MASK;
- size_t frag_size = iov_iter_single_seg_count(&ii);
-
- npages += (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
- iov_iter_advance(&ii, frag_size);
- }
-
- return min(npages, FUSE_MAX_PAGES_PER_REQ);
+ return iov_iter_npages(ii_p, FUSE_MAX_PAGES_PER_REQ);
}
-ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
- unsigned long nr_segs, size_t count, loff_t *ppos,
- int write)
+ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
+ loff_t *ppos, int flags)
{
+ int write = flags & FUSE_DIO_WRITE;
+ int cuse = flags & FUSE_DIO_CUSE;
struct file *file = io->file;
+ struct inode *inode = file->f_mapping->host;
struct fuse_file *ff = file->private_data;
struct fuse_conn *fc = ff->fc;
size_t nmax = write ? fc->max_write : fc->max_read;
loff_t pos = *ppos;
+ size_t count = iov_iter_count(iter);
+ pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT;
ssize_t res = 0;
struct fuse_req *req;
- struct iov_iter ii;
-
- iov_iter_init(&ii, iov, nr_segs, count, 0);
if (io->async)
- req = fuse_get_req_for_background(fc, fuse_iter_npages(&ii));
+ req = fuse_get_req_for_background(fc, fuse_iter_npages(iter));
else
- req = fuse_get_req(fc, fuse_iter_npages(&ii));
+ req = fuse_get_req(fc, fuse_iter_npages(iter));
if (IS_ERR(req))
return PTR_ERR(req);
+ if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
+ if (!write)
+ mutex_lock(&inode->i_mutex);
+ fuse_sync_writes(inode);
+ if (!write)
+ mutex_unlock(&inode->i_mutex);
+ }
+
while (count) {
size_t nres;
fl_owner_t owner = current->files;
size_t nbytes = min(count, nmax);
- int err = fuse_get_user_pages(req, &ii, &nbytes, write);
+ int err = fuse_get_user_pages(req, iter, &nbytes, write);
if (err) {
res = err;
break;
@@ -1325,9 +1405,9 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
fuse_put_request(fc, req);
if (io->async)
req = fuse_get_req_for_background(fc,
- fuse_iter_npages(&ii));
+ fuse_iter_npages(iter));
else
- req = fuse_get_req(fc, fuse_iter_npages(&ii));
+ req = fuse_get_req(fc, fuse_iter_npages(iter));
if (IS_ERR(req))
break;
}
@@ -1342,9 +1422,8 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
EXPORT_SYMBOL_GPL(fuse_direct_io);
static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
- const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos,
- size_t count)
+ struct iov_iter *iter,
+ loff_t *ppos)
{
ssize_t res;
struct file *file = io->file;
@@ -1353,7 +1432,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
if (is_bad_inode(inode))
return -EIO;
- res = fuse_direct_io(io, iov, nr_segs, count, ppos, 0);
+ res = fuse_direct_io(io, iter, ppos, 0);
fuse_invalidate_attr(inode);
@@ -1365,21 +1444,26 @@ static ssize_t fuse_direct_read(struct file *file, char __user *buf,
{
struct fuse_io_priv io = { .async = 0, .file = file };
struct iovec iov = { .iov_base = buf, .iov_len = count };
- return __fuse_direct_read(&io, &iov, 1, ppos, count);
+ struct iov_iter ii;
+ iov_iter_init(&ii, READ, &iov, 1, count);
+ return __fuse_direct_read(&io, &ii, ppos);
}
static ssize_t __fuse_direct_write(struct fuse_io_priv *io,
- const struct iovec *iov,
- unsigned long nr_segs, loff_t *ppos)
+ struct iov_iter *iter,
+ loff_t *ppos)
{
struct file *file = io->file;
struct inode *inode = file_inode(file);
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(iter);
ssize_t res;
+
res = generic_write_checks(file, ppos, &count, 0);
- if (!res)
- res = fuse_direct_io(io, iov, nr_segs, count, ppos, 1);
+ if (!res) {
+ iov_iter_truncate(iter, count);
+ res = fuse_direct_io(io, iter, ppos, FUSE_DIO_WRITE);
+ }
fuse_invalidate_attr(inode);
@@ -1393,13 +1477,15 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
struct inode *inode = file_inode(file);
ssize_t res;
struct fuse_io_priv io = { .async = 0, .file = file };
+ struct iov_iter ii;
+ iov_iter_init(&ii, WRITE, &iov, 1, count);
if (is_bad_inode(inode))
return -EIO;
/* Don't allow parallel writes to the same file */
mutex_lock(&inode->i_mutex);
- res = __fuse_direct_write(&io, &iov, 1, ppos);
+ res = __fuse_direct_write(&io, &ii, ppos);
if (res > 0)
fuse_write_update_size(inode, *ppos);
mutex_unlock(&inode->i_mutex);
@@ -1409,8 +1495,13 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
{
- __free_page(req->pages[0]);
- fuse_file_put(req->ff, false);
+ int i;
+
+ for (i = 0; i < req->num_pages; i++)
+ __free_page(req->pages[i]);
+
+ if (req->ff)
+ fuse_file_put(req->ff, false);
}
static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
@@ -1418,30 +1509,34 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
struct inode *inode = req->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
+ int i;
list_del(&req->writepages_entry);
- dec_bdi_stat(bdi, BDI_WRITEBACK);
- dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP);
- bdi_writeout_inc(bdi);
+ for (i = 0; i < req->num_pages; i++) {
+ dec_bdi_stat(bdi, BDI_WRITEBACK);
+ dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
+ bdi_writeout_inc(bdi);
+ }
wake_up(&fi->page_waitq);
}
/* Called under fc->lock, may release and reacquire it */
-static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req)
+static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req,
+ loff_t size)
__releases(fc->lock)
__acquires(fc->lock)
{
struct fuse_inode *fi = get_fuse_inode(req->inode);
- loff_t size = i_size_read(req->inode);
struct fuse_write_in *inarg = &req->misc.write.in;
+ __u64 data_size = req->num_pages * PAGE_CACHE_SIZE;
if (!fc->connected)
goto out_free;
- if (inarg->offset + PAGE_CACHE_SIZE <= size) {
- inarg->size = PAGE_CACHE_SIZE;
+ if (inarg->offset + data_size <= size) {
+ inarg->size = data_size;
} else if (inarg->offset < size) {
- inarg->size = size & (PAGE_CACHE_SIZE - 1);
+ inarg->size = size - inarg->offset;
} else {
/* Got truncated off completely */
goto out_free;
@@ -1472,12 +1567,13 @@ __acquires(fc->lock)
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
+ size_t crop = i_size_read(inode);
struct fuse_req *req;
while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
req = list_entry(fi->queued_writes.next, struct fuse_req, list);
list_del_init(&req->list);
- fuse_send_writepage(fc, req);
+ fuse_send_writepage(fc, req, crop);
}
}
@@ -1488,12 +1584,85 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
mapping_set_error(inode->i_mapping, req->out.h.error);
spin_lock(&fc->lock);
+ while (req->misc.write.next) {
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_write_in *inarg = &req->misc.write.in;
+ struct fuse_req *next = req->misc.write.next;
+ req->misc.write.next = next->misc.write.next;
+ next->misc.write.next = NULL;
+ next->ff = fuse_file_get(req->ff);
+ list_add(&next->writepages_entry, &fi->writepages);
+
+ /*
+ * Skip fuse_flush_writepages() to make it easy to crop requests
+ * based on primary request size.
+ *
+ * 1st case (trivial): there are no concurrent activities using
+ * fuse_set/release_nowrite. Then we're on safe side because
+ * fuse_flush_writepages() would call fuse_send_writepage()
+ * anyway.
+ *
+ * 2nd case: someone called fuse_set_nowrite and it is waiting
+ * now for completion of all in-flight requests. This happens
+ * rarely and no more than once per page, so this should be
+ * okay.
+ *
+ * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle
+ * of fuse_set_nowrite..fuse_release_nowrite section. The fact
+ * that fuse_set_nowrite returned implies that all in-flight
+ * requests were completed along with all of their secondary
+ * requests. Further primary requests are blocked by negative
+ * writectr. Hence there cannot be any in-flight requests and
+ * no invocations of fuse_writepage_end() while we're in
+ * fuse_set_nowrite..fuse_release_nowrite section.
+ */
+ fuse_send_writepage(fc, next, inarg->offset + inarg->size);
+ }
fi->writectr--;
fuse_writepage_finish(fc, req);
spin_unlock(&fc->lock);
fuse_writepage_free(fc, req);
}
+static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
+ struct fuse_inode *fi)
+{
+ struct fuse_file *ff = NULL;
+
+ spin_lock(&fc->lock);
+ if (!list_empty(&fi->write_files)) {
+ ff = list_entry(fi->write_files.next, struct fuse_file,
+ write_entry);
+ fuse_file_get(ff);
+ }
+ spin_unlock(&fc->lock);
+
+ return ff;
+}
+
+static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
+ struct fuse_inode *fi)
+{
+ struct fuse_file *ff = __fuse_write_file_get(fc, fi);
+ WARN_ON(!ff);
+ return ff;
+}
+
+int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_file *ff;
+ int err;
+
+ ff = __fuse_write_file_get(fc, fi);
+ err = fuse_flush_times(inode, ff);
+ if (ff)
+ fuse_file_put(ff, 0);
+
+ return err;
+}
+
static int fuse_writepage_locked(struct page *page)
{
struct address_space *mapping = page->mapping;
@@ -1501,8 +1670,8 @@ static int fuse_writepage_locked(struct page *page)
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_req *req;
- struct fuse_file *ff;
struct page *tmp_page;
+ int error = -ENOMEM;
set_page_writeback(page);
@@ -1515,16 +1684,16 @@ static int fuse_writepage_locked(struct page *page)
if (!tmp_page)
goto err_free;
- spin_lock(&fc->lock);
- BUG_ON(list_empty(&fi->write_files));
- ff = list_entry(fi->write_files.next, struct fuse_file, write_entry);
- req->ff = fuse_file_get(ff);
- spin_unlock(&fc->lock);
+ error = -EIO;
+ req->ff = fuse_write_file_get(fc, fi);
+ if (!req->ff)
+ goto err_nofile;
- fuse_write_fill(req, ff, page_offset(page), 0);
+ fuse_write_fill(req, req->ff, page_offset(page), 0);
copy_highpage(tmp_page, page);
req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
+ req->misc.write.next = NULL;
req->in.argpages = 1;
req->num_pages = 1;
req->pages[0] = tmp_page;
@@ -1546,23 +1715,340 @@ static int fuse_writepage_locked(struct page *page)
return 0;
+err_nofile:
+ __free_page(tmp_page);
err_free:
fuse_request_free(req);
err:
end_page_writeback(page);
- return -ENOMEM;
+ return error;
}
static int fuse_writepage(struct page *page, struct writeback_control *wbc)
{
int err;
+ if (fuse_page_is_writeback(page->mapping->host, page->index)) {
+ /*
+ * ->writepages() should be called for sync() and friends. We
+ * should only get here on direct reclaim and then we are
+ * allowed to skip a page which is already in flight
+ */
+ WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
+
+ redirty_page_for_writepage(wbc, page);
+ return 0;
+ }
+
err = fuse_writepage_locked(page);
unlock_page(page);
return err;
}
+struct fuse_fill_wb_data {
+ struct fuse_req *req;
+ struct fuse_file *ff;
+ struct inode *inode;
+ struct page **orig_pages;
+};
+
+static void fuse_writepages_send(struct fuse_fill_wb_data *data)
+{
+ struct fuse_req *req = data->req;
+ struct inode *inode = data->inode;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ int num_pages = req->num_pages;
+ int i;
+
+ req->ff = fuse_file_get(data->ff);
+ spin_lock(&fc->lock);
+ list_add_tail(&req->list, &fi->queued_writes);
+ fuse_flush_writepages(inode);
+ spin_unlock(&fc->lock);
+
+ for (i = 0; i < num_pages; i++)
+ end_page_writeback(data->orig_pages[i]);
+}
+
+static bool fuse_writepage_in_flight(struct fuse_req *new_req,
+ struct page *page)
+{
+ struct fuse_conn *fc = get_fuse_conn(new_req->inode);
+ struct fuse_inode *fi = get_fuse_inode(new_req->inode);
+ struct fuse_req *tmp;
+ struct fuse_req *old_req;
+ bool found = false;
+ pgoff_t curr_index;
+
+ BUG_ON(new_req->num_pages != 0);
+
+ spin_lock(&fc->lock);
+ list_del(&new_req->writepages_entry);
+ list_for_each_entry(old_req, &fi->writepages, writepages_entry) {
+ BUG_ON(old_req->inode != new_req->inode);
+ curr_index = old_req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+ if (curr_index <= page->index &&
+ page->index < curr_index + old_req->num_pages) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ list_add(&new_req->writepages_entry, &fi->writepages);
+ goto out_unlock;
+ }
+
+ new_req->num_pages = 1;
+ for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) {
+ BUG_ON(tmp->inode != new_req->inode);
+ curr_index = tmp->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+ if (tmp->num_pages == 1 &&
+ curr_index == page->index) {
+ old_req = tmp;
+ }
+ }
+
+ if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
+ old_req->state == FUSE_REQ_PENDING)) {
+ struct backing_dev_info *bdi = page->mapping->backing_dev_info;
+
+ copy_highpage(old_req->pages[0], page);
+ spin_unlock(&fc->lock);
+
+ dec_bdi_stat(bdi, BDI_WRITEBACK);
+ dec_zone_page_state(page, NR_WRITEBACK_TEMP);
+ bdi_writeout_inc(bdi);
+ fuse_writepage_free(fc, new_req);
+ fuse_request_free(new_req);
+ goto out;
+ } else {
+ new_req->misc.write.next = old_req->misc.write.next;
+ old_req->misc.write.next = new_req;
+ }
+out_unlock:
+ spin_unlock(&fc->lock);
+out:
+ return found;
+}
+
+static int fuse_writepages_fill(struct page *page,
+ struct writeback_control *wbc, void *_data)
+{
+ struct fuse_fill_wb_data *data = _data;
+ struct fuse_req *req = data->req;
+ struct inode *inode = data->inode;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct page *tmp_page;
+ bool is_writeback;
+ int err;
+
+ if (!data->ff) {
+ err = -EIO;
+ data->ff = fuse_write_file_get(fc, get_fuse_inode(inode));
+ if (!data->ff)
+ goto out_unlock;
+ }
+
+ /*
+ * Being under writeback is unlikely but possible. For example direct
+ * read to an mmaped fuse file will set the page dirty twice; once when
+ * the pages are faulted with get_user_pages(), and then after the read
+ * completed.
+ */
+ is_writeback = fuse_page_is_writeback(inode, page->index);
+
+ if (req && req->num_pages &&
+ (is_writeback || req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
+ (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_write ||
+ data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) {
+ fuse_writepages_send(data);
+ data->req = NULL;
+ }
+ err = -ENOMEM;
+ tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (!tmp_page)
+ goto out_unlock;
+
+ /*
+ * The page must not be redirtied until the writeout is completed
+ * (i.e. userspace has sent a reply to the write request). Otherwise
+ * there could be more than one temporary page instance for each real
+ * page.
+ *
+ * This is ensured by holding the page lock in page_mkwrite() while
+ * checking fuse_page_is_writeback(). We already hold the page lock
+ * since clear_page_dirty_for_io() and keep it held until we add the
+ * request to the fi->writepages list and increment req->num_pages.
+ * After this fuse_page_is_writeback() will indicate that the page is
+ * under writeback, so we can release the page lock.
+ */
+ if (data->req == NULL) {
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ err = -ENOMEM;
+ req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ);
+ if (!req) {
+ __free_page(tmp_page);
+ goto out_unlock;
+ }
+
+ fuse_write_fill(req, data->ff, page_offset(page), 0);
+ req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
+ req->misc.write.next = NULL;
+ req->in.argpages = 1;
+ req->background = 1;
+ req->num_pages = 0;
+ req->end = fuse_writepage_end;
+ req->inode = inode;
+
+ spin_lock(&fc->lock);
+ list_add(&req->writepages_entry, &fi->writepages);
+ spin_unlock(&fc->lock);
+
+ data->req = req;
+ }
+ set_page_writeback(page);
+
+ copy_highpage(tmp_page, page);
+ req->pages[req->num_pages] = tmp_page;
+ req->page_descs[req->num_pages].offset = 0;
+ req->page_descs[req->num_pages].length = PAGE_SIZE;
+
+ inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK);
+ inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+
+ err = 0;
+ if (is_writeback && fuse_writepage_in_flight(req, page)) {
+ end_page_writeback(page);
+ data->req = NULL;
+ goto out_unlock;
+ }
+ data->orig_pages[req->num_pages] = page;
+
+ /*
+ * Protected by fc->lock against concurrent access by
+ * fuse_page_is_writeback().
+ */
+ spin_lock(&fc->lock);
+ req->num_pages++;
+ spin_unlock(&fc->lock);
+
+out_unlock:
+ unlock_page(page);
+
+ return err;
+}
+
+static int fuse_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ struct fuse_fill_wb_data data;
+ int err;
+
+ err = -EIO;
+ if (is_bad_inode(inode))
+ goto out;
+
+ data.inode = inode;
+ data.req = NULL;
+ data.ff = NULL;
+
+ err = -ENOMEM;
+ data.orig_pages = kcalloc(FUSE_MAX_PAGES_PER_REQ,
+ sizeof(struct page *),
+ GFP_NOFS);
+ if (!data.orig_pages)
+ goto out;
+
+ err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
+ if (data.req) {
+ /* Ignore errors if we can write at least one page */
+ BUG_ON(!data.req->num_pages);
+ fuse_writepages_send(&data);
+ err = 0;
+ }
+ if (data.ff)
+ fuse_file_put(data.ff, false);
+
+ kfree(data.orig_pages);
+out:
+ return err;
+}
+
+/*
+ * It's worthy to make sure that space is reserved on disk for the write,
+ * but how to implement it without killing performance need more thinking.
+ */
+static int fuse_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ struct fuse_conn *fc = get_fuse_conn(file->f_dentry->d_inode);
+ struct page *page;
+ loff_t fsize;
+ int err = -ENOMEM;
+
+ WARN_ON(!fc->writeback_cache);
+
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ goto error;
+
+ fuse_wait_on_page_writeback(mapping->host, page->index);
+
+ if (PageUptodate(page) || len == PAGE_CACHE_SIZE)
+ goto success;
+ /*
+ * Check if the start this page comes after the end of file, in which
+ * case the readpage can be optimized away.
+ */
+ fsize = i_size_read(mapping->host);
+ if (fsize <= (pos & PAGE_CACHE_MASK)) {
+ size_t off = pos & ~PAGE_CACHE_MASK;
+ if (off)
+ zero_user_segment(page, 0, off);
+ goto success;
+ }
+ err = fuse_do_readpage(file, page);
+ if (err)
+ goto cleanup;
+success:
+ *pagep = page;
+ return 0;
+
+cleanup:
+ unlock_page(page);
+ page_cache_release(page);
+error:
+ return err;
+}
+
+static int fuse_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = page->mapping->host;
+
+ if (!PageUptodate(page)) {
+ /* Zero any unwritten bytes at the end of the page */
+ size_t endoff = (pos + copied) & ~PAGE_CACHE_MASK;
+ if (endoff)
+ zero_user_segment(page, endoff, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
+ }
+
+ fuse_write_update_size(inode, pos + copied);
+ set_page_dirty(page);
+ unlock_page(page);
+ page_cache_release(page);
+
+ return copied;
+}
+
static int fuse_launder_page(struct page *page)
{
int err = 0;
@@ -1602,39 +2088,32 @@ static void fuse_vma_close(struct vm_area_struct *vma)
static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct page *page = vmf->page;
- /*
- * Don't use page->mapping as it may become NULL from a
- * concurrent truncate.
- */
- struct inode *inode = vma->vm_file->f_mapping->host;
+ struct inode *inode = file_inode(vma->vm_file);
+
+ file_update_time(vma->vm_file);
+ lock_page(page);
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ return VM_FAULT_NOPAGE;
+ }
fuse_wait_on_page_writeback(inode, page->index);
- return 0;
+ return VM_FAULT_LOCKED;
}
static const struct vm_operations_struct fuse_file_vm_ops = {
.close = fuse_vma_close,
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = fuse_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
{
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
- struct inode *inode = file_inode(file);
- struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_inode *fi = get_fuse_inode(inode);
- struct fuse_file *ff = file->private_data;
- /*
- * file may be written through mmap, so chain it onto the
- * inodes's write_file list
- */
- spin_lock(&fc->lock);
- if (list_empty(&ff->write_entry))
- list_add(&ff->write_entry, &fi->write_files);
- spin_unlock(&fc->lock);
- }
+ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+ fuse_link_write_file(file);
+
file_accessed(file);
vma->vm_ops = &fuse_file_vm_ops;
return 0;
@@ -1792,7 +2271,6 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
struct fuse_file *ff = file->private_data;
/* emulate flock with POSIX locks */
- fl->fl_owner = (fl_owner_t) file;
ff->flock = true;
err = fuse_setlk(file, fl, 1);
}
@@ -1863,7 +2341,7 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
if (!bytes)
return 0;
- iov_iter_init(&ii, iov, nr_segs, bytes, 0);
+ iov_iter_init(&ii, to_user ? READ : WRITE, iov, nr_segs, bytes);
while (iov_iter_count(&ii)) {
struct page *page = pages[page_idx++];
@@ -2281,7 +2759,7 @@ static void fuse_register_polled_file(struct fuse_conn *fc,
{
spin_lock(&fc->lock);
if (RB_EMPTY_NODE(&ff->polled_node)) {
- struct rb_node **link, *parent;
+ struct rb_node **link, *uninitialized_var(parent);
link = fuse_find_polled_node(fc, ff->kh, &parent);
BUG_ON(*link);
@@ -2385,8 +2863,8 @@ static inline loff_t fuse_round_up(loff_t off)
}
static ssize_t
-fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
ssize_t ret = 0;
struct file *file = iocb->ki_filp;
@@ -2395,18 +2873,22 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t pos = 0;
struct inode *inode;
loff_t i_size;
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(iter);
struct fuse_io_priv *io;
pos = offset;
inode = file->f_mapping->host;
i_size = i_size_read(inode);
+ if ((rw == READ) && (offset > i_size))
+ return 0;
+
/* optimization for short read */
if (async_dio && rw != WRITE && offset + count > i_size) {
if (offset >= i_size)
return 0;
count = min_t(loff_t, count, fuse_round_up(i_size - offset));
+ iov_iter_truncate(iter, count);
}
io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
@@ -2436,9 +2918,9 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
io->async = false;
if (rw == WRITE)
- ret = __fuse_direct_write(io, iov, nr_segs, &pos);
+ ret = __fuse_direct_write(io, iter, &pos);
else
- ret = __fuse_direct_read(io, iov, nr_segs, &pos, count);
+ ret = __fuse_direct_read(io, iter, &pos);
if (io->async) {
fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
@@ -2480,6 +2962,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
(mode & FALLOC_FL_PUNCH_HOLE);
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
if (fc->no_fallocate)
return -EOPNOTSUPP;
@@ -2522,8 +3007,12 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
goto out;
/* we could have extended the file */
- if (!(mode & FALLOC_FL_KEEP_SIZE))
- fuse_write_update_size(inode, offset + length);
+ if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+ bool changed = fuse_write_update_size(inode, offset + length);
+
+ if (changed && fc->writeback_cache)
+ file_update_time(file);
+ }
if (mode & FALLOC_FL_PUNCH_HOLE)
truncate_pagecache_range(inode, offset, offset + length - 1);
@@ -2542,10 +3031,10 @@ out:
static const struct file_operations fuse_file_operations = {
.llseek = fuse_file_llseek,
- .read = do_sync_read,
- .aio_read = fuse_file_aio_read,
- .write = do_sync_write,
- .aio_write = fuse_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = fuse_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = fuse_file_write_iter,
.mmap = fuse_file_mmap,
.open = fuse_open,
.flush = fuse_flush,
@@ -2581,11 +3070,14 @@ static const struct file_operations fuse_direct_io_file_operations = {
static const struct address_space_operations fuse_file_aops = {
.readpage = fuse_readpage,
.writepage = fuse_writepage,
+ .writepages = fuse_writepages,
.launder_page = fuse_launder_page,
.readpages = fuse_readpages,
.set_page_dirty = __set_page_dirty_nobuffers,
.bmap = fuse_bmap,
.direct_IO = fuse_direct_IO,
+ .write_begin = fuse_write_begin,
+ .write_end = fuse_write_end,
};
void fuse_init_file_inode(struct inode *inode)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 5b9e6f3b6ae..e8e47a6ab51 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -321,6 +321,7 @@ struct fuse_req {
struct {
struct fuse_write_in in;
struct fuse_write_out out;
+ struct fuse_req *next;
} write;
struct fuse_notify_retrieve_in retrieve_in;
struct fuse_lk_in lk_in;
@@ -374,12 +375,11 @@ struct fuse_conn {
/** Lock protecting accessess to members of this structure */
spinlock_t lock;
- /** Mutex protecting against directory alias creation */
- struct mutex inst_mutex;
-
/** Refcount */
atomic_t count;
+ struct rcu_head rcu;
+
/** The user id for this mount */
kuid_t user_id;
@@ -480,11 +480,17 @@ struct fuse_conn {
/** Set if bdi is valid */
unsigned bdi_initialized:1;
+ /** write-back cache policy (default is write-through) */
+ unsigned writeback_cache:1;
+
/*
* The following bitfields are only for optimization purposes
* and hence races in setting them will not cause malfunction
*/
+ /** Is open/release not implemented by fs? */
+ unsigned no_open:1;
+
/** Is fsync not implemented by fs? */
unsigned no_fsync:1;
@@ -536,6 +542,9 @@ struct fuse_conn {
/** Is fallocate not implemented by fs? */
unsigned no_fallocate:1;
+ /** Is rename with flags implemented by fs? */
+ unsigned no_rename2:1;
+
/** Use enhanced/automatic page cache invalidation. */
unsigned auto_inval_data:1;
@@ -717,7 +726,7 @@ int fuse_dev_init(void);
void fuse_dev_cleanup(void);
int fuse_ctl_init(void);
-void fuse_ctl_cleanup(void);
+void __exit fuse_ctl_cleanup(void);
/**
* Allocate a request
@@ -788,6 +797,8 @@ void fuse_invalidate_attr(struct inode *inode);
void fuse_invalidate_entry_cache(struct dentry *entry);
+void fuse_invalidate_atime(struct inode *inode);
+
/**
* Acquire reference to fuse_conn
*/
@@ -858,9 +869,19 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
bool isdir);
-ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
- unsigned long nr_segs, size_t count, loff_t *ppos,
- int write);
+
+/**
+ * fuse_direct_io() flags
+ */
+
+/** If set, it is WRITE; otherwise - READ */
+#define FUSE_DIO_WRITE (1 << 0)
+
+/** CUSE pass fuse_direct_io() a file which f_mapping->host is not from FUSE */
+#define FUSE_DIO_CUSE (1 << 1)
+
+ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
+ loff_t *ppos, int flags);
long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
unsigned int flags);
long fuse_ioctl_common(struct file *file, unsigned int cmd,
@@ -868,7 +889,10 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd,
unsigned fuse_file_poll(struct file *file, poll_table *wait);
int fuse_dev_release(struct inode *inode, struct file *file);
-void fuse_write_update_size(struct inode *inode, loff_t pos);
+bool fuse_write_update_size(struct inode *inode, loff_t pos);
+
+int fuse_flush_times(struct inode *inode, struct fuse_file *ff);
+int fuse_write_inode(struct inode *inode, struct writeback_control *wbc);
int fuse_do_setattr(struct inode *inode, struct iattr *attr,
struct file *file);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index a8ce6dab60a..03246cd9d47 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -123,7 +123,7 @@ static void fuse_destroy_inode(struct inode *inode)
static void fuse_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (inode->i_sb->s_flags & MS_ACTIVE) {
struct fuse_conn *fc = get_fuse_conn(inode);
@@ -135,6 +135,7 @@ static void fuse_evict_inode(struct inode *inode)
static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
{
+ sync_filesystem(sb);
if (*flags & MS_MANDLOCK)
return -EINVAL;
@@ -170,10 +171,13 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_blocks = attr->blocks;
inode->i_atime.tv_sec = attr->atime;
inode->i_atime.tv_nsec = attr->atimensec;
- inode->i_mtime.tv_sec = attr->mtime;
- inode->i_mtime.tv_nsec = attr->mtimensec;
- inode->i_ctime.tv_sec = attr->ctime;
- inode->i_ctime.tv_nsec = attr->ctimensec;
+ /* mtime from server may be stale due to local buffered write */
+ if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) {
+ inode->i_mtime.tv_sec = attr->mtime;
+ inode->i_mtime.tv_nsec = attr->mtimensec;
+ inode->i_ctime.tv_sec = attr->ctime;
+ inode->i_ctime.tv_nsec = attr->ctimensec;
+ }
if (attr->blksize != 0)
inode->i_blkbits = ilog2(attr->blksize);
@@ -197,6 +201,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
{
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
+ bool is_wb = fc->writeback_cache;
loff_t oldsize;
struct timespec old_mtime;
@@ -211,10 +216,16 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
fuse_change_attributes_common(inode, attr, attr_valid);
oldsize = inode->i_size;
- i_size_write(inode, attr->size);
+ /*
+ * In case of writeback_cache enabled, the cached writes beyond EOF
+ * extend local i_size without keeping userspace server in sync. So,
+ * attr->size coming from server can be stale. We cannot trust it.
+ */
+ if (!is_wb || !S_ISREG(inode->i_mode))
+ i_size_write(inode, attr->size);
spin_unlock(&fc->lock);
- if (S_ISREG(inode->i_mode)) {
+ if (!is_wb && S_ISREG(inode->i_mode)) {
bool inval = false;
if (oldsize != attr->size) {
@@ -243,6 +254,10 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
{
inode->i_mode = attr->mode & S_IFMT;
inode->i_size = attr->size;
+ inode->i_mtime.tv_sec = attr->mtime;
+ inode->i_mtime.tv_nsec = attr->mtimensec;
+ inode->i_ctime.tv_sec = attr->ctime;
+ inode->i_ctime.tv_nsec = attr->ctimensec;
if (S_ISREG(inode->i_mode)) {
fuse_init_common(inode);
fuse_init_file_inode(inode);
@@ -289,7 +304,9 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
return NULL;
if ((inode->i_state & I_NEW)) {
- inode->i_flags |= S_NOATIME|S_NOCMTIME;
+ inode->i_flags |= S_NOATIME;
+ if (!fc->writeback_cache || !S_ISREG(attr->mode))
+ inode->i_flags |= S_NOCMTIME;
inode->i_generation = generation;
inode->i_data.backing_dev_info = &fc->bdi;
fuse_init_inode(inode, attr);
@@ -461,6 +478,17 @@ static const match_table_t tokens = {
{OPT_ERR, NULL}
};
+static int fuse_match_uint(substring_t *s, unsigned int *res)
+{
+ int err = -ENOMEM;
+ char *buf = match_strdup(s);
+ if (buf) {
+ err = kstrtouint(buf, 10, res);
+ kfree(buf);
+ }
+ return err;
+}
+
static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
{
char *p;
@@ -471,6 +499,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
while ((p = strsep(&opt, ",")) != NULL) {
int token;
int value;
+ unsigned uv;
substring_t args[MAX_OPT_ARGS];
if (!*p)
continue;
@@ -494,18 +523,18 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
break;
case OPT_USER_ID:
- if (match_int(&args[0], &value))
+ if (fuse_match_uint(&args[0], &uv))
return 0;
- d->user_id = make_kuid(current_user_ns(), value);
+ d->user_id = make_kuid(current_user_ns(), uv);
if (!uid_valid(d->user_id))
return 0;
d->user_id_present = 1;
break;
case OPT_GROUP_ID:
- if (match_int(&args[0], &value))
+ if (fuse_match_uint(&args[0], &uv))
return 0;
- d->group_id = make_kgid(current_user_ns(), value);
+ d->group_id = make_kgid(current_user_ns(), uv);
if (!gid_valid(d->group_id))
return 0;
d->group_id_present = 1;
@@ -565,7 +594,6 @@ void fuse_conn_init(struct fuse_conn *fc)
{
memset(fc, 0, sizeof(*fc));
spin_lock_init(&fc->lock);
- mutex_init(&fc->inst_mutex);
init_rwsem(&fc->killsb);
atomic_set(&fc->count, 1);
init_waitqueue_head(&fc->waitq);
@@ -596,7 +624,6 @@ void fuse_conn_put(struct fuse_conn *fc)
if (atomic_dec_and_test(&fc->count)) {
if (fc->destroy_req)
fuse_request_free(fc->destroy_req);
- mutex_destroy(&fc->inst_mutex);
fc->release(fc);
}
}
@@ -775,6 +802,7 @@ static const struct super_operations fuse_super_operations = {
.alloc_inode = fuse_alloc_inode,
.destroy_inode = fuse_destroy_inode,
.evict_inode = fuse_evict_inode,
+ .write_inode = fuse_write_inode,
.drop_inode = generic_delete_inode,
.remount_fs = fuse_remount_fs,
.put_super = fuse_put_super,
@@ -875,6 +903,10 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
}
if (arg->flags & FUSE_ASYNC_DIO)
fc->async_dio = 1;
+ if (arg->flags & FUSE_WRITEBACK_CACHE)
+ fc->writeback_cache = 1;
+ if (arg->time_gran && arg->time_gran <= 1000000000)
+ fc->sb->s_time_gran = arg->time_gran;
} else {
ra_pages = fc->max_read / PAGE_CACHE_SIZE;
fc->no_lock = 1;
@@ -902,7 +934,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
- FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO;
+ FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
+ FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT;
req->in.h.opcode = FUSE_INIT;
req->in.numargs = 1;
req->in.args[0].size = sizeof(*arg);
@@ -920,7 +953,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
static void fuse_free_conn(struct fuse_conn *fc)
{
- kfree(fc);
+ kfree_rcu(fc, rcu);
}
static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
@@ -980,9 +1013,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
if (sb->s_flags & MS_MANDLOCK)
goto err;
- sb->s_flags &= ~MS_NOSEC;
+ sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION);
- if (!parse_fuse_opt((char *) data, &d, is_bdev))
+ if (!parse_fuse_opt(data, &d, is_bdev))
goto err;
if (is_bdev) {
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
deleted file mode 100644
index b3f3676796d..00000000000
--- a/fs/generic_acl.c
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * (C) 2005 Andreas Gruenbacher <agruen@suse.de>
- *
- * This file is released under the GPL.
- *
- * Generic ACL support for in-memory filesystems.
- */
-
-#include <linux/sched.h>
-#include <linux/gfp.h>
-#include <linux/fs.h>
-#include <linux/generic_acl.h>
-#include <linux/posix_acl.h>
-#include <linux/posix_acl_xattr.h>
-
-
-static size_t
-generic_acl_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
-{
- struct posix_acl *acl;
- const char *xname;
- size_t size;
-
- acl = get_cached_acl(dentry->d_inode, type);
- if (!acl)
- return 0;
- posix_acl_release(acl);
-
- switch (type) {
- case ACL_TYPE_ACCESS:
- xname = POSIX_ACL_XATTR_ACCESS;
- break;
- case ACL_TYPE_DEFAULT:
- xname = POSIX_ACL_XATTR_DEFAULT;
- break;
- default:
- return 0;
- }
- size = strlen(xname) + 1;
- if (list && size <= list_size)
- memcpy(list, xname, size);
- return size;
-}
-
-static int
-generic_acl_get(struct dentry *dentry, const char *name, void *buffer,
- size_t size, int type)
-{
- struct posix_acl *acl;
- int error;
-
- if (strcmp(name, "") != 0)
- return -EINVAL;
-
- acl = get_cached_acl(dentry->d_inode, type);
- if (!acl)
- return -ENODATA;
- error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- posix_acl_release(acl);
-
- return error;
-}
-
-static int
-generic_acl_set(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags, int type)
-{
- struct inode *inode = dentry->d_inode;
- struct posix_acl *acl = NULL;
- int error;
-
- if (strcmp(name, "") != 0)
- return -EINVAL;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
- if (!inode_owner_or_capable(inode))
- return -EPERM;
- if (value) {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- }
- if (acl) {
- error = posix_acl_valid(acl);
- if (error)
- goto failed;
- switch (type) {
- case ACL_TYPE_ACCESS:
- error = posix_acl_equiv_mode(acl, &inode->i_mode);
- if (error < 0)
- goto failed;
- inode->i_ctime = CURRENT_TIME;
- if (error == 0) {
- posix_acl_release(acl);
- acl = NULL;
- }
- break;
- case ACL_TYPE_DEFAULT:
- if (!S_ISDIR(inode->i_mode)) {
- error = -EINVAL;
- goto failed;
- }
- break;
- }
- }
- set_cached_acl(inode, type, acl);
- error = 0;
-failed:
- posix_acl_release(acl);
- return error;
-}
-
-/**
- * generic_acl_init - Take care of acl inheritance at @inode create time
- *
- * Files created inside a directory with a default ACL inherit the
- * directory's default ACL.
- */
-int
-generic_acl_init(struct inode *inode, struct inode *dir)
-{
- struct posix_acl *acl = NULL;
- int error;
-
- if (!S_ISLNK(inode->i_mode))
- acl = get_cached_acl(dir, ACL_TYPE_DEFAULT);
- if (acl) {
- if (S_ISDIR(inode->i_mode))
- set_cached_acl(inode, ACL_TYPE_DEFAULT, acl);
- error = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
- if (error < 0)
- return error;
- if (error > 0)
- set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
- } else {
- inode->i_mode &= ~current_umask();
- }
- error = 0;
-
- posix_acl_release(acl);
- return error;
-}
-
-/**
- * generic_acl_chmod - change the access acl of @inode upon chmod()
- *
- * A chmod also changes the permissions of the owner, group/mask, and
- * other ACL entries.
- */
-int
-generic_acl_chmod(struct inode *inode)
-{
- struct posix_acl *acl;
- int error = 0;
-
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
- acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
- if (acl) {
- error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
- if (error)
- return error;
- set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
- posix_acl_release(acl);
- }
- return error;
-}
-
-const struct xattr_handler generic_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .list = generic_acl_list,
- .get = generic_acl_get,
- .set = generic_acl_set,
-};
-
-const struct xattr_handler generic_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .list = generic_acl_list,
- .get = generic_acl_get,
- .set = generic_acl_set,
-};
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index f69ac0af549..3088e2a38e3 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -49,10 +49,6 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type)
if (!ip->i_eattr)
return NULL;
- acl = get_cached_acl(&ip->i_inode, type);
- if (acl != ACL_NOT_CACHED)
- return acl;
-
name = gfs2_acl_name(type);
if (name == NULL)
return ERR_PTR(-EINVAL);
@@ -68,19 +64,7 @@ struct posix_acl *gfs2_get_acl(struct inode *inode, int type)
return acl;
}
-static int gfs2_set_mode(struct inode *inode, umode_t mode)
-{
- int error = 0;
-
- if (mode != inode->i_mode) {
- inode->i_mode = mode;
- mark_inode_dirty(inode);
- }
-
- return error;
-}
-
-static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl)
+int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
int error;
int len;
@@ -88,219 +72,50 @@ static int gfs2_acl_set(struct inode *inode, int type, struct posix_acl *acl)
const char *name = gfs2_acl_name(type);
BUG_ON(name == NULL);
- len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0);
- if (len == 0)
- return 0;
- data = kmalloc(len, GFP_NOFS);
- if (data == NULL)
- return -ENOMEM;
- error = posix_acl_to_xattr(&init_user_ns, acl, data, len);
- if (error < 0)
- goto out;
- error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS);
- if (!error)
- set_cached_acl(inode, type, acl);
-out:
- kfree(data);
- return error;
-}
-
-int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode)
-{
- struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
- struct posix_acl *acl;
- umode_t mode = inode->i_mode;
- int error = 0;
-
- if (!sdp->sd_args.ar_posix_acl)
- return 0;
- if (S_ISLNK(inode->i_mode))
- return 0;
-
- acl = gfs2_get_acl(&dip->i_inode, ACL_TYPE_DEFAULT);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (!acl) {
- mode &= ~current_umask();
- return gfs2_set_mode(inode, mode);
- }
-
- if (S_ISDIR(inode->i_mode)) {
- error = gfs2_acl_set(inode, ACL_TYPE_DEFAULT, acl);
- if (error)
- goto out;
- }
-
- error = posix_acl_create(&acl, GFP_NOFS, &mode);
- if (error < 0)
- return error;
-
- if (error == 0)
- goto munge;
-
- error = gfs2_acl_set(inode, ACL_TYPE_ACCESS, acl);
- if (error)
- goto out;
-munge:
- error = gfs2_set_mode(inode, mode);
-out:
- posix_acl_release(acl);
- return error;
-}
-
-int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
-{
- struct inode *inode = &ip->i_inode;
- struct posix_acl *acl;
- char *data;
- unsigned int len;
- int error;
-
- acl = gfs2_get_acl(&ip->i_inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (!acl)
- return gfs2_setattr_simple(inode, attr);
-
- error = posix_acl_chmod(&acl, GFP_NOFS, attr->ia_mode);
- if (error)
- return error;
-
- len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0);
- data = kmalloc(len, GFP_NOFS);
- error = -ENOMEM;
- if (data == NULL)
- goto out;
- posix_acl_to_xattr(&init_user_ns, acl, data, len);
- error = gfs2_xattr_acl_chmod(ip, attr, data);
- kfree(data);
- set_cached_acl(&ip->i_inode, ACL_TYPE_ACCESS, acl);
-
-out:
- posix_acl_release(acl);
- return error;
-}
-
-static int gfs2_acl_type(const char *name)
-{
- if (strcmp(name, GFS2_POSIX_ACL_ACCESS) == 0)
- return ACL_TYPE_ACCESS;
- if (strcmp(name, GFS2_POSIX_ACL_DEFAULT) == 0)
- return ACL_TYPE_DEFAULT;
- return -EINVAL;
-}
-
-static int gfs2_xattr_system_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int xtype)
-{
- struct inode *inode = dentry->d_inode;
- struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct posix_acl *acl;
- int type;
- int error;
-
- if (!sdp->sd_args.ar_posix_acl)
- return -EOPNOTSUPP;
-
- type = gfs2_acl_type(name);
- if (type < 0)
- return type;
-
- acl = gfs2_get_acl(inode, type);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
-
- error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- posix_acl_release(acl);
-
- return error;
-}
-
-static int gfs2_xattr_system_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags,
- int xtype)
-{
- struct inode *inode = dentry->d_inode;
- struct gfs2_sbd *sdp = GFS2_SB(inode);
- struct posix_acl *acl = NULL;
- int error = 0, type;
-
- if (!sdp->sd_args.ar_posix_acl)
- return -EOPNOTSUPP;
- type = gfs2_acl_type(name);
- if (type < 0)
- return type;
- if (flags & XATTR_CREATE)
- return -EINVAL;
- if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
- return value ? -EACCES : 0;
- if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_FOWNER))
- return -EPERM;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
- if (!value)
- goto set_acl;
-
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (!acl) {
- /*
- * acl_set_file(3) may request that we set default ACLs with
- * zero length -- defend (gracefully) against that here.
- */
- goto out;
- }
- if (IS_ERR(acl)) {
- error = PTR_ERR(acl);
- goto out;
- }
-
- error = posix_acl_valid(acl);
- if (error)
- goto out_release;
-
- error = -EINVAL;
- if (acl->a_count > GFS2_ACL_MAX_ENTRIES)
- goto out_release;
+ if (acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode)))
+ return -E2BIG;
if (type == ACL_TYPE_ACCESS) {
umode_t mode = inode->i_mode;
+
error = posix_acl_equiv_mode(acl, &mode);
+ if (error < 0)
+ return error;
- if (error <= 0) {
- posix_acl_release(acl);
+ if (error == 0)
acl = NULL;
- if (error < 0)
- return error;
+ if (mode != inode->i_mode) {
+ inode->i_mode = mode;
+ mark_inode_dirty(inode);
}
-
- error = gfs2_set_mode(inode, mode);
- if (error)
- goto out_release;
}
-set_acl:
- error = __gfs2_xattr_set(inode, name, value, size, 0, GFS2_EATYPE_SYS);
- if (!error) {
- if (acl)
- set_cached_acl(inode, type, acl);
- else
- forget_cached_acl(inode, type);
+ if (acl) {
+ len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0);
+ if (len == 0)
+ return 0;
+ data = kmalloc(len, GFP_NOFS);
+ if (data == NULL)
+ return -ENOMEM;
+ error = posix_acl_to_xattr(&init_user_ns, acl, data, len);
+ if (error < 0)
+ goto out;
+ } else {
+ data = NULL;
+ len = 0;
}
-out_release:
- posix_acl_release(acl);
+
+ error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS);
+ if (error)
+ goto out;
+
+ if (acl)
+ set_cached_acl(inode, type, acl);
+ else
+ forget_cached_acl(inode, type);
out:
+ kfree(data);
return error;
}
-
-const struct xattr_handler gfs2_xattr_system_handler = {
- .prefix = XATTR_SYSTEM_PREFIX,
- .flags = GFS2_EATYPE_SYS,
- .get = gfs2_xattr_system_get,
- .set = gfs2_xattr_system_set,
-};
-
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 0da38dc7efe..2d65ec4cd4b 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -14,11 +14,9 @@
#define GFS2_POSIX_ACL_ACCESS "posix_acl_access"
#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default"
-#define GFS2_ACL_MAX_ENTRIES 25
+#define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12)
extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type);
-extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
-extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
-extern const struct xattr_handler gfs2_xattr_system_handler;
+extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 1f7d8057ea6..805b37fed63 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -21,6 +21,7 @@
#include <linux/gfs2_ondisk.h>
#include <linux/backing-dev.h>
#include <linux/aio.h>
+#include <trace/events/writeback.h>
#include "gfs2.h"
#include "incore.h"
@@ -230,13 +231,11 @@ static int gfs2_writepages(struct address_space *mapping,
static int gfs2_write_jdata_pagevec(struct address_space *mapping,
struct writeback_control *wbc,
struct pagevec *pvec,
- int nr_pages, pgoff_t end)
+ int nr_pages, pgoff_t end,
+ pgoff_t *done_index)
{
struct inode *inode = mapping->host;
struct gfs2_sbd *sdp = GFS2_SB(inode);
- loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
- unsigned offset = i_size & (PAGE_CACHE_SIZE-1);
unsigned nrblocks = nr_pages * (PAGE_CACHE_SIZE/inode->i_sb->s_blocksize);
int i;
int ret;
@@ -248,40 +247,83 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
for(i = 0; i < nr_pages; i++) {
struct page *page = pvec->pages[i];
+ /*
+ * At this point, the page may be truncated or
+ * invalidated (changing page->mapping to NULL), or
+ * even swizzled back from swapper_space to tmpfs file
+ * mapping. However, page->index will not change
+ * because we have a reference on the page.
+ */
+ if (page->index > end) {
+ /*
+ * can't be range_cyclic (1st pass) because
+ * end == -1 in that case.
+ */
+ ret = 1;
+ break;
+ }
+
+ *done_index = page->index;
+
lock_page(page);
if (unlikely(page->mapping != mapping)) {
+continue_unlock:
unlock_page(page);
continue;
}
- if (!wbc->range_cyclic && page->index > end) {
- ret = 1;
- unlock_page(page);
- continue;
+ if (!PageDirty(page)) {
+ /* someone wrote it for us */
+ goto continue_unlock;
}
- if (wbc->sync_mode != WB_SYNC_NONE)
- wait_on_page_writeback(page);
-
- if (PageWriteback(page) ||
- !clear_page_dirty_for_io(page)) {
- unlock_page(page);
- continue;
+ if (PageWriteback(page)) {
+ if (wbc->sync_mode != WB_SYNC_NONE)
+ wait_on_page_writeback(page);
+ else
+ goto continue_unlock;
}
- /* Is the page fully outside i_size? (truncate in progress) */
- if (page->index > end_index || (page->index == end_index && !offset)) {
- page->mapping->a_ops->invalidatepage(page, 0,
- PAGE_CACHE_SIZE);
- unlock_page(page);
- continue;
- }
+ BUG_ON(PageWriteback(page));
+ if (!clear_page_dirty_for_io(page))
+ goto continue_unlock;
+
+ trace_wbc_writepage(wbc, mapping->backing_dev_info);
ret = __gfs2_jdata_writepage(page, wbc);
+ if (unlikely(ret)) {
+ if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ unlock_page(page);
+ ret = 0;
+ } else {
+
+ /*
+ * done_index is set past this page,
+ * so media errors will not choke
+ * background writeout for the entire
+ * file. This has consequences for
+ * range_cyclic semantics (ie. it may
+ * not be suitable for data integrity
+ * writeout).
+ */
+ *done_index = page->index + 1;
+ ret = 1;
+ break;
+ }
+ }
- if (ret || (--(wbc->nr_to_write) <= 0))
+ /*
+ * We stop writing back only if we are not doing
+ * integrity sync. In case of integrity sync we have to
+ * keep going until we have written all the pages
+ * we tagged for writeback prior to entering this loop.
+ */
+ if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) {
ret = 1;
+ break;
+ }
+
}
gfs2_trans_end(sdp);
return ret;
@@ -306,51 +348,69 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
int done = 0;
struct pagevec pvec;
int nr_pages;
+ pgoff_t uninitialized_var(writeback_index);
pgoff_t index;
pgoff_t end;
- int scanned = 0;
+ pgoff_t done_index;
+ int cycled;
int range_whole = 0;
+ int tag;
pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
- index = mapping->writeback_index; /* Start from prev offset */
+ writeback_index = mapping->writeback_index; /* prev offset */
+ index = writeback_index;
+ if (index == 0)
+ cycled = 1;
+ else
+ cycled = 0;
end = -1;
} else {
index = wbc->range_start >> PAGE_CACHE_SHIFT;
end = wbc->range_end >> PAGE_CACHE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
- scanned = 1;
+ cycled = 1; /* ignore range_cyclic tests */
}
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag = PAGECACHE_TAG_TOWRITE;
+ else
+ tag = PAGECACHE_TAG_DIRTY;
retry:
- while (!done && (index <= end) &&
- (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
- PAGECACHE_TAG_DIRTY,
- min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
- scanned = 1;
- ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end);
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
+ tag_pages_for_writeback(mapping, index, end);
+ done_index = index;
+ while (!done && (index <= end)) {
+ nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+ min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+ if (nr_pages == 0)
+ break;
+
+ ret = gfs2_write_jdata_pagevec(mapping, wbc, &pvec, nr_pages, end, &done_index);
if (ret)
done = 1;
if (ret > 0)
ret = 0;
-
pagevec_release(&pvec);
cond_resched();
}
- if (!scanned && !done) {
+ if (!cycled && !done) {
/*
+ * range_cyclic:
* We hit the last page and there is more work to be done: wrap
* back to the start of the file
*/
- scanned = 1;
+ cycled = 1;
index = 0;
+ end = writeback_index - 1;
goto retry;
}
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
- mapping->writeback_index = index;
+ mapping->writeback_index = done_index;
+
return ret;
}
@@ -371,7 +431,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping,
ret = gfs2_write_cache_jdata(mapping, wbc);
if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) {
- gfs2_log_flush(sdp, ip->i_gl);
+ gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
ret = gfs2_write_cache_jdata(mapping, wbc);
}
return ret;
@@ -517,7 +577,6 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos,
p = kmap_atomic(page);
memcpy(buf + copied, p + offset, amt);
kunmap_atomic(p);
- mark_page_accessed(page);
page_cache_release(page);
copied += amt;
index++;
@@ -611,12 +670,14 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks);
if (alloc_required) {
+ struct gfs2_alloc_parms ap = { .aflags = 0, };
error = gfs2_quota_lock_check(ip);
if (error)
goto out_unlock;
requested = data_blocks + ind_blocks;
- error = gfs2_inplace_reserve(ip, requested, 0);
+ ap.target = requested;
+ error = gfs2_inplace_reserve(ip, &ap);
if (error)
goto out_qunlock;
}
@@ -979,11 +1040,11 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
+ struct address_space *mapping = inode->i_mapping;
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
int rv;
@@ -1004,9 +1065,39 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
if (rv != 1)
goto out; /* dio not valid, fall back to buffered i/o */
- rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
- offset, nr_segs, gfs2_get_block_direct,
- NULL, NULL, 0);
+ /*
+ * Now since we are holding a deferred (CW) lock at this point, you
+ * might be wondering why this is ever needed. There is a case however
+ * where we've granted a deferred local lock against a cached exclusive
+ * glock. That is ok provided all granted local locks are deferred, but
+ * it also means that it is possible to encounter pages which are
+ * cached and possibly also mapped. So here we check for that and sort
+ * them out ahead of the dio. The glock state machine will take care of
+ * everything else.
+ *
+ * If in fact the cached glock state (gl->gl_state) is deferred (CW) in
+ * the first place, mapping->nr_pages will always be zero.
+ */
+ if (mapping->nrpages) {
+ loff_t lstart = offset & (PAGE_CACHE_SIZE - 1);
+ loff_t len = iov_iter_count(iter);
+ loff_t end = PAGE_ALIGN(offset + len) - 1;
+
+ rv = 0;
+ if (len == 0)
+ goto out;
+ if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
+ unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len);
+ rv = filemap_write_and_wait_range(mapping, lstart, end);
+ if (rv)
+ goto out;
+ if (rw == WRITE)
+ truncate_inode_pages_range(mapping, lstart, end);
+ }
+
+ rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+ iter, offset,
+ gfs2_get_block_direct, NULL, NULL, 0);
out:
gfs2_glock_dq(&gh);
gfs2_holder_uninit(&gh);
@@ -1048,30 +1139,22 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
bh = bh->b_this_page;
} while(bh != head);
spin_unlock(&sdp->sd_ail_lock);
- gfs2_log_unlock(sdp);
head = bh = page_buffers(page);
do {
- gfs2_log_lock(sdp);
bd = bh->b_private;
if (bd) {
gfs2_assert_warn(sdp, bd->bd_bh == bh);
- if (!list_empty(&bd->bd_list)) {
- if (!buffer_pinned(bh))
- list_del_init(&bd->bd_list);
- else
- bd = NULL;
- }
- if (bd)
- bd->bd_bh = NULL;
+ if (!list_empty(&bd->bd_list))
+ list_del_init(&bd->bd_list);
+ bd->bd_bh = NULL;
bh->b_private = NULL;
- }
- gfs2_log_unlock(sdp);
- if (bd)
kmem_cache_free(gfs2_bufdata_cachep, bd);
+ }
bh = bh->b_this_page;
} while (bh != head);
+ gfs2_log_unlock(sdp);
return try_to_free_buffers(page);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 62a65fc448d..e6ee5b6e8d9 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -707,7 +707,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
* @top: The first pointer in the buffer
* @bottom: One more than the last pointer
* @height: the height this buffer is at
- * @data: a pointer to a struct strip_mine
+ * @sm: a pointer to a struct strip_mine
*
* Returns: errno
*/
@@ -992,6 +992,8 @@ unlock:
return err;
}
+#define GFS2_JTRUNC_REVOKES 8192
+
/**
* gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
* @inode: The inode being truncated
@@ -1003,8 +1005,6 @@ unlock:
* if the number of pages being truncated gets too large.
*/
-#define GFS2_JTRUNC_REVOKES 8192
-
static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
{
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -1216,6 +1216,7 @@ static int do_grow(struct inode *inode, u64 size)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct gfs2_alloc_parms ap = { .target = 1, };
struct buffer_head *dibh;
int error;
int unstuff = 0;
@@ -1226,7 +1227,7 @@ static int do_grow(struct inode *inode, u64 size)
if (error)
return error;
- error = gfs2_inplace_reserve(ip, 1, 0);
+ error = gfs2_inplace_reserve(ip, &ap);
if (error)
goto do_grow_qunlock;
unstuff = 1;
@@ -1279,6 +1280,7 @@ do_grow_qunlock:
int gfs2_setattr_size(struct inode *inode, u64 newsize)
{
+ struct gfs2_inode *ip = GFS2_I(inode);
int ret;
u64 oldsize;
@@ -1294,7 +1296,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
inode_dio_wait(inode);
- ret = gfs2_rs_alloc(GFS2_I(inode));
+ ret = gfs2_rs_alloc(ip);
if (ret)
goto out;
@@ -1304,6 +1306,7 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
goto out;
}
+ gfs2_rs_deltree(ip->i_res);
ret = do_shrink(inode, oldsize, newsize);
out:
put_write_access(inode);
@@ -1325,6 +1328,121 @@ int gfs2_file_dealloc(struct gfs2_inode *ip)
}
/**
+ * gfs2_free_journal_extents - Free cached journal bmap info
+ * @jd: The journal
+ *
+ */
+
+void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
+{
+ struct gfs2_journal_extent *jext;
+
+ while(!list_empty(&jd->extent_list)) {
+ jext = list_entry(jd->extent_list.next, struct gfs2_journal_extent, list);
+ list_del(&jext->list);
+ kfree(jext);
+ }
+}
+
+/**
+ * gfs2_add_jextent - Add or merge a new extent to extent cache
+ * @jd: The journal descriptor
+ * @lblock: The logical block at start of new extent
+ * @dblock: The physical block at start of new extent
+ * @blocks: Size of extent in fs blocks
+ *
+ * Returns: 0 on success or -ENOMEM
+ */
+
+static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
+{
+ struct gfs2_journal_extent *jext;
+
+ if (!list_empty(&jd->extent_list)) {
+ jext = list_entry(jd->extent_list.prev, struct gfs2_journal_extent, list);
+ if ((jext->dblock + jext->blocks) == dblock) {
+ jext->blocks += blocks;
+ return 0;
+ }
+ }
+
+ jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
+ if (jext == NULL)
+ return -ENOMEM;
+ jext->dblock = dblock;
+ jext->lblock = lblock;
+ jext->blocks = blocks;
+ list_add_tail(&jext->list, &jd->extent_list);
+ jd->nr_extents++;
+ return 0;
+}
+
+/**
+ * gfs2_map_journal_extents - Cache journal bmap info
+ * @sdp: The super block
+ * @jd: The journal to map
+ *
+ * Create a reusable "extent" mapping from all logical
+ * blocks to all physical blocks for the given journal. This will save
+ * us time when writing journal blocks. Most journals will have only one
+ * extent that maps all their logical blocks. That's because gfs2.mkfs
+ * arranges the journal blocks sequentially to maximize performance.
+ * So the extent would map the first block for the entire file length.
+ * However, gfs2_jadd can happen while file activity is happening, so
+ * those journals may not be sequential. Less likely is the case where
+ * the users created their own journals by mounting the metafs and
+ * laying it out. But it's still possible. These journals might have
+ * several extents.
+ *
+ * Returns: 0 on success, or error on failure
+ */
+
+int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
+{
+ u64 lblock = 0;
+ u64 lblock_stop;
+ struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
+ struct buffer_head bh;
+ unsigned int shift = sdp->sd_sb.sb_bsize_shift;
+ u64 size;
+ int rc;
+
+ lblock_stop = i_size_read(jd->jd_inode) >> shift;
+ size = (lblock_stop - lblock) << shift;
+ jd->nr_extents = 0;
+ WARN_ON(!list_empty(&jd->extent_list));
+
+ do {
+ bh.b_state = 0;
+ bh.b_blocknr = 0;
+ bh.b_size = size;
+ rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
+ if (rc || !buffer_mapped(&bh))
+ goto fail;
+ rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
+ if (rc)
+ goto fail;
+ size -= bh.b_size;
+ lblock += (bh.b_size >> ip->i_inode.i_blkbits);
+ } while(size > 0);
+
+ fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid,
+ jd->nr_extents);
+ return 0;
+
+fail:
+ fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
+ rc, jd->jd_jid,
+ (unsigned long long)(i_size_read(jd->jd_inode) - size),
+ jd->nr_extents);
+ fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
+ rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
+ bh.b_state, (unsigned long long)bh.b_size);
+ gfs2_free_journal_extents(jd);
+ return rc;
+}
+
+/**
* gfs2_write_alloc_required - figure out if a write will require an allocation
* @ip: the file being written to
* @offset: the offset to write to
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index 42fea03e2bd..81ded5e2aaa 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -55,5 +55,7 @@ extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
extern int gfs2_file_dealloc(struct gfs2_inode *ip);
extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
unsigned int len);
+extern int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd);
+extern void gfs2_free_journal_extents(struct gfs2_jdesc *jd);
#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 2e5fc268d32..1a349f9a968 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -53,6 +53,8 @@
* but never before the maximum hash table size has been reached.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/buffer_head.h>
@@ -507,8 +509,8 @@ static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
goto error;
return 0;
error:
- printk(KERN_WARNING "gfs2_check_dirent: %s (%s)\n", msg,
- first ? "first in block" : "not first in block");
+ pr_warn("%s: %s (%s)\n",
+ __func__, msg, first ? "first in block" : "not first in block");
return -EIO;
}
@@ -531,8 +533,7 @@ static int gfs2_dirent_offset(const void *buf)
}
return offset;
wrong_type:
- printk(KERN_WARNING "gfs2_scan_dirent: wrong block type %u\n",
- be32_to_cpu(h->mh_type));
+ pr_warn("%s: wrong block type %u\n", __func__, be32_to_cpu(h->mh_type));
return -1;
}
@@ -728,7 +729,7 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp);
if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
- /* printk(KERN_INFO "block num=%llu\n", leaf_no); */
+ /* pr_info("block num=%llu\n", leaf_no); */
error = -EIO;
}
@@ -834,6 +835,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
struct gfs2_leaf *leaf;
struct gfs2_dirent *dent;
struct qstr name = { .name = "" };
+ struct timespec tv = CURRENT_TIME;
error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
if (error)
@@ -850,7 +852,11 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
leaf->lf_entries = 0;
leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
leaf->lf_next = 0;
- memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved));
+ leaf->lf_inode = cpu_to_be64(ip->i_no_addr);
+ leaf->lf_dist = cpu_to_be32(1);
+ leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
+ leaf->lf_sec = cpu_to_be64(tv.tv_sec);
+ memset(leaf->lf_reserved2, 0, sizeof(leaf->lf_reserved2));
dent = (struct gfs2_dirent *)(leaf+1);
gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent);
*pbh = bh;
@@ -1001,7 +1007,8 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth));
half_len = len >> 1;
if (!half_len) {
- printk(KERN_WARNING "i_depth %u lf_depth %u index %u\n", dip->i_depth, be16_to_cpu(oleaf->lf_depth), index);
+ pr_warn("i_depth %u lf_depth %u index %u\n",
+ dip->i_depth, be16_to_cpu(oleaf->lf_depth), index);
gfs2_consist_inode(dip);
error = -EIO;
goto fail_brelse;
@@ -1612,11 +1619,31 @@ out:
return ret;
}
+/**
+ * dir_new_leaf - Add a new leaf onto hash chain
+ * @inode: The directory
+ * @name: The name we are adding
+ *
+ * This adds a new dir leaf onto an existing leaf when there is not
+ * enough space to add a new dir entry. This is a last resort after
+ * we've expanded the hash table to max size and also split existing
+ * leaf blocks, so it will only occur for very large directories.
+ *
+ * The dist parameter is set to 1 for leaf blocks directly attached
+ * to the hash table, 2 for one layer of indirection, 3 for two layers
+ * etc. We are thus able to tell the difference between an old leaf
+ * with dist set to zero (i.e. "don't know") and a new one where we
+ * set this information for debug/fsck purposes.
+ *
+ * Returns: 0 on success, or -ve on error
+ */
+
static int dir_new_leaf(struct inode *inode, const struct qstr *name)
{
struct buffer_head *bh, *obh;
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_leaf *leaf, *oleaf;
+ u32 dist = 1;
int error;
u32 index;
u64 bn;
@@ -1626,6 +1653,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
if (error)
return error;
do {
+ dist++;
oleaf = (struct gfs2_leaf *)obh->b_data;
bn = be64_to_cpu(oleaf->lf_next);
if (!bn)
@@ -1643,6 +1671,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
brelse(obh);
return -ENOSPC;
}
+ leaf->lf_dist = cpu_to_be32(dist);
oleaf->lf_next = cpu_to_be64(bh->b_blocknr);
brelse(bh);
brelse(obh);
@@ -1657,41 +1686,64 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
return 0;
}
+static u16 gfs2_inode_ra_len(const struct gfs2_inode *ip)
+{
+ u64 where = ip->i_no_addr + 1;
+ if (ip->i_eattr == where)
+ return 1;
+ return 0;
+}
+
/**
* gfs2_dir_add - Add new filename into directory
- * @dip: The GFS2 inode
- * @filename: The new name
- * @inode: The inode number of the entry
- * @type: The type of the entry
+ * @inode: The directory inode
+ * @name: The new name
+ * @nip: The GFS2 inode to be linked in to the directory
+ * @da: The directory addition info
+ *
+ * If the call to gfs2_diradd_alloc_required resulted in there being
+ * no need to allocate any new directory blocks, then it will contain
+ * a pointer to the directory entry and the bh in which it resides. We
+ * can use that without having to repeat the search. If there was no
+ * free space, then we must now create more space.
*
* Returns: 0 on success, error code on failure
*/
int gfs2_dir_add(struct inode *inode, const struct qstr *name,
- const struct gfs2_inode *nip)
+ const struct gfs2_inode *nip, struct gfs2_diradd *da)
{
struct gfs2_inode *ip = GFS2_I(inode);
- struct buffer_head *bh;
- struct gfs2_dirent *dent;
+ struct buffer_head *bh = da->bh;
+ struct gfs2_dirent *dent = da->dent;
+ struct timespec tv;
struct gfs2_leaf *leaf;
int error;
while(1) {
- dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space,
- &bh);
+ if (da->bh == NULL) {
+ dent = gfs2_dirent_search(inode, name,
+ gfs2_dirent_find_space, &bh);
+ }
if (dent) {
if (IS_ERR(dent))
return PTR_ERR(dent);
dent = gfs2_init_dirent(inode, dent, name, bh);
gfs2_inum_out(nip, dent);
dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode));
+ dent->de_rahead = cpu_to_be16(gfs2_inode_ra_len(nip));
+ tv = CURRENT_TIME;
if (ip->i_diskflags & GFS2_DIF_EXHASH) {
leaf = (struct gfs2_leaf *)bh->b_data;
be16_add_cpu(&leaf->lf_entries, 1);
+ leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
+ leaf->lf_sec = cpu_to_be64(tv.tv_sec);
}
+ da->dent = NULL;
+ da->bh = NULL;
brelse(bh);
ip->i_entries++;
- ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+ ip->i_inode.i_mtime = ip->i_inode.i_ctime = tv;
if (S_ISDIR(nip->i_inode.i_mode))
inc_nlink(&ip->i_inode);
mark_inode_dirty(inode);
@@ -1742,6 +1794,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
const struct qstr *name = &dentry->d_name;
struct gfs2_dirent *dent, *prev = NULL;
struct buffer_head *bh;
+ struct timespec tv = CURRENT_TIME;
/* Returns _either_ the entry (if its first in block) or the
previous entry otherwise */
@@ -1767,13 +1820,15 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
if (!entries)
gfs2_consist_inode(dip);
leaf->lf_entries = cpu_to_be16(--entries);
+ leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
+ leaf->lf_sec = cpu_to_be64(tv.tv_sec);
}
brelse(bh);
if (!dip->i_entries)
gfs2_consist_inode(dip);
dip->i_entries--;
- dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
+ dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv;
if (S_ISDIR(dentry->d_inode->i_mode))
drop_nlink(&dip->i_inode);
mark_inode_dirty(&dip->i_inode);
@@ -2017,22 +2072,36 @@ out:
* gfs2_diradd_alloc_required - find if adding entry will require an allocation
* @ip: the file being written to
* @filname: the filename that's going to be added
+ * @da: The structure to return dir alloc info
*
- * Returns: 1 if alloc required, 0 if not, -ve on error
+ * Returns: 0 if ok, -ve on error
*/
-int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name)
+int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name,
+ struct gfs2_diradd *da)
{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ const unsigned int extra = sizeof(struct gfs2_dinode) - sizeof(struct gfs2_leaf);
struct gfs2_dirent *dent;
struct buffer_head *bh;
+ da->nr_blocks = 0;
+ da->bh = NULL;
+ da->dent = NULL;
+
dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
if (!dent) {
- return 1;
+ da->nr_blocks = sdp->sd_max_dirres;
+ if (!(ip->i_diskflags & GFS2_DIF_EXHASH) &&
+ (GFS2_DIRENT_SIZE(name->len) < extra))
+ da->nr_blocks = 1;
+ return 0;
}
if (IS_ERR(dent))
return PTR_ERR(dent);
- brelse(bh);
+ da->bh = bh;
+ da->dent = dent;
return 0;
}
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 4f03bbd1873..126c65dda02 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -16,6 +16,14 @@
struct inode;
struct gfs2_inode;
struct gfs2_inum;
+struct buffer_head;
+struct gfs2_dirent;
+
+struct gfs2_diradd {
+ unsigned nr_blocks;
+ struct gfs2_dirent *dent;
+ struct buffer_head *bh;
+};
extern struct inode *gfs2_dir_search(struct inode *dir,
const struct qstr *filename,
@@ -23,7 +31,13 @@ extern struct inode *gfs2_dir_search(struct inode *dir,
extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
const struct gfs2_inode *ip);
extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
- const struct gfs2_inode *ip);
+ const struct gfs2_inode *ip, struct gfs2_diradd *da);
+static inline void gfs2_dir_no_add(struct gfs2_diradd *da)
+{
+ if (da->bh)
+ brelse(da->bh);
+ da->bh = NULL;
+}
extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
struct file_ra_state *f_ra);
@@ -33,7 +47,8 @@ extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
extern int gfs2_diradd_alloc_required(struct inode *dir,
- const struct qstr *filename);
+ const struct qstr *filename,
+ struct gfs2_diradd *da);
extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
struct buffer_head **bhp);
extern void gfs2_dir_hash_inval(struct gfs2_inode *ip);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 0621b46d474..26b3f952e6b 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -203,9 +203,9 @@ void gfs2_set_inode_flags(struct inode *inode)
GFS2_DIF_INHERIT_JDATA)
/**
- * gfs2_set_flags - set flags on an inode
- * @inode: The inode
- * @flags: The flags to set
+ * do_gfs2_set_flags - set flags on an inode
+ * @filp: file pointer
+ * @reqflags: The flags to set
* @mask: Indicates which flags are valid
*
*/
@@ -256,7 +256,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
}
if ((flags ^ new_flags) & GFS2_DIF_JDATA) {
if (flags & GFS2_DIF_JDATA)
- gfs2_log_flush(sdp, ip->i_gl);
+ gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
error = filemap_fdatawrite(inode->i_mapping);
if (error)
goto out;
@@ -318,7 +318,7 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
/**
* gfs2_size_hint - Give a hint to the size of a write request
- * @file: The struct file
+ * @filep: The struct file
* @offset: The file offset of the write
* @size: The length of the write
*
@@ -371,7 +371,7 @@ static int gfs2_allocate_page_backing(struct page *page)
/**
* gfs2_page_mkwrite - Make a shared, mmap()ed, page writable
* @vma: The virtual memory area
- * @page: The page which is about to become writable
+ * @vmf: The virtual memory fault containing the page to become writable
*
* When the page becomes writable, we need to ensure that we have
* blocks allocated on disk to back that page.
@@ -383,6 +383,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
struct inode *inode = file_inode(vma->vm_file);
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct gfs2_alloc_parms ap = { .aflags = 0, };
unsigned long last_index;
u64 pos = page->index << PAGE_CACHE_SHIFT;
unsigned int data_blocks, ind_blocks, rblocks;
@@ -430,7 +431,8 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
if (ret)
goto out_unlock;
gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
- ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0);
+ ap.target = data_blocks + ind_blocks;
+ ret = gfs2_inplace_reserve(ip, &ap);
if (ret)
goto out_quota_unlock;
@@ -492,6 +494,7 @@ out:
static const struct vm_operations_struct gfs2_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = gfs2_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
@@ -620,7 +623,7 @@ static int gfs2_release(struct inode *inode, struct file *file)
if (!(file->f_mode & FMODE_WRITE))
return 0;
- gfs2_rs_delete(ip);
+ gfs2_rs_delete(ip, &inode->i_writecount);
return 0;
}
@@ -681,7 +684,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
}
/**
- * gfs2_file_aio_write - Perform a write to a file
+ * gfs2_file_write_iter - Perform a write to a file
* @iocb: The io context
* @iov: The data to write
* @nr_segs: Number of @iov segments
@@ -694,11 +697,9 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
*
*/
-static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
- size_t writesize = iov_length(iov, nr_segs);
struct gfs2_inode *ip = GFS2_I(file_inode(file));
int ret;
@@ -706,7 +707,7 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (ret)
return ret;
- gfs2_size_hint(file, pos, writesize);
+ gfs2_size_hint(file, iocb->ki_pos, iov_iter_count(from));
if (file->f_flags & O_APPEND) {
struct gfs2_holder gh;
@@ -717,7 +718,7 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
gfs2_glock_dq_uninit(&gh);
}
- return generic_file_aio_write(iocb, iov, nr_segs, pos);
+ return generic_file_write_iter(iocb, from);
}
static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
@@ -800,6 +801,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
struct inode *inode = file_inode(file);
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_alloc_parms ap = { .aflags = 0, };
unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
loff_t bytes, max_bytes;
int error;
@@ -808,6 +810,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
loff_t max_chunk_size = UINT_MAX & bsize_mask;
+ struct gfs2_holder gh;
+
next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
/* We only support the FALLOC_FL_KEEP_SIZE mode */
@@ -828,8 +832,10 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
if (error)
return error;
- gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
- error = gfs2_glock_nq(&ip->i_gh);
+ mutex_lock(&inode->i_mutex);
+
+ gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ error = gfs2_glock_nq(&gh);
if (unlikely(error))
goto out_uninit;
@@ -850,7 +856,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
retry:
gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
- error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0);
+ ap.target = data_blocks + ind_blocks;
+ error = gfs2_inplace_reserve(ip, &ap);
if (error) {
if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
bytes >>= 1;
@@ -896,9 +903,10 @@ out_trans_fail:
out_qunlock:
gfs2_quota_unlock(ip);
out_unlock:
- gfs2_glock_dq(&ip->i_gh);
+ gfs2_glock_dq(&gh);
out_uninit:
- gfs2_holder_uninit(&ip->i_gh);
+ gfs2_holder_uninit(&gh);
+ mutex_unlock(&inode->i_mutex);
return error;
}
@@ -973,7 +981,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
int error = 0;
state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
- flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
+ flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT;
mutex_lock(&fp->f_fl_mutex);
@@ -983,7 +991,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
goto out;
flock_lock_file_wait(file,
&(struct file_lock){.fl_type = F_UNLCK});
- gfs2_glock_dq_wait(fl_gh);
+ gfs2_glock_dq(fl_gh);
gfs2_holder_reinit(state, flags, fl_gh);
} else {
error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr,
@@ -1048,10 +1056,10 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
const struct file_operations gfs2_file_fops = {
.llseek = gfs2_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = gfs2_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = gfs2_file_write_iter,
.unlocked_ioctl = gfs2_ioctl,
.mmap = gfs2_mmap,
.open = gfs2_open,
@@ -1060,7 +1068,7 @@ const struct file_operations gfs2_file_fops = {
.lock = gfs2_lock,
.flock = gfs2_flock,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
.setlease = gfs2_setlease,
.fallocate = gfs2_fallocate,
};
@@ -1080,17 +1088,17 @@ const struct file_operations gfs2_dir_fops = {
const struct file_operations gfs2_file_fops_nolock = {
.llseek = gfs2_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = gfs2_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = gfs2_file_write_iter,
.unlocked_ioctl = gfs2_ioctl,
.mmap = gfs2_mmap,
.open = gfs2_open,
.release = gfs2_release,
.fsync = gfs2_fsync,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
.setlease = generic_setlease,
.fallocate = gfs2_fallocate,
};
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c2f41b4d00b..ee4e04fe60f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -7,6 +7,8 @@
* of the GNU General Public License version 2.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -31,6 +33,7 @@
#include <linux/bit_spinlock.h>
#include <linux/percpu.h>
#include <linux/list_sort.h>
+#include <linux/lockref.h>
#include "gfs2.h"
#include "incore.h"
@@ -129,10 +132,10 @@ void gfs2_glock_free(struct gfs2_glock *gl)
*
*/
-void gfs2_glock_hold(struct gfs2_glock *gl)
+static void gfs2_glock_hold(struct gfs2_glock *gl)
{
- GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0);
- atomic_inc(&gl->gl_ref);
+ GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref));
+ lockref_get(&gl->gl_lockref);
}
/**
@@ -187,20 +190,6 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
}
/**
- * gfs2_glock_put_nolock() - Decrement reference count on glock
- * @gl: The glock to put
- *
- * This function should only be used if the caller has its own reference
- * to the glock, in addition to the one it is dropping.
- */
-
-void gfs2_glock_put_nolock(struct gfs2_glock *gl)
-{
- if (atomic_dec_and_test(&gl->gl_ref))
- GLOCK_BUG_ON(gl, 1);
-}
-
-/**
* gfs2_glock_put() - Decrement reference count on glock
* @gl: The glock to put
*
@@ -211,17 +200,22 @@ void gfs2_glock_put(struct gfs2_glock *gl)
struct gfs2_sbd *sdp = gl->gl_sbd;
struct address_space *mapping = gfs2_glock2aspace(gl);
- if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) {
- __gfs2_glock_remove_from_lru(gl);
- spin_unlock(&lru_lock);
- spin_lock_bucket(gl->gl_hash);
- hlist_bl_del_rcu(&gl->gl_list);
- spin_unlock_bucket(gl->gl_hash);
- GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
- GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
- trace_gfs2_glock_put(gl);
- sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
- }
+ if (lockref_put_or_lock(&gl->gl_lockref))
+ return;
+
+ lockref_mark_dead(&gl->gl_lockref);
+
+ spin_lock(&lru_lock);
+ __gfs2_glock_remove_from_lru(gl);
+ spin_unlock(&lru_lock);
+ spin_unlock(&gl->gl_lockref.lock);
+ spin_lock_bucket(gl->gl_hash);
+ hlist_bl_del_rcu(&gl->gl_list);
+ spin_unlock_bucket(gl->gl_hash);
+ GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
+ GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
+ trace_gfs2_glock_put(gl);
+ sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
}
/**
@@ -244,7 +238,7 @@ static struct gfs2_glock *search_bucket(unsigned int hash,
continue;
if (gl->gl_sbd != sdp)
continue;
- if (atomic_inc_not_zero(&gl->gl_ref))
+ if (lockref_get_not_dead(&gl->gl_lockref))
return gl;
}
@@ -283,7 +277,7 @@ static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holde
static void gfs2_holder_wake(struct gfs2_holder *gh)
{
clear_bit(HIF_WAIT, &gh->gh_iflags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&gh->gh_iflags, HIF_WAIT);
}
@@ -396,10 +390,11 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
held2 = (new_state != LM_ST_UNLOCKED);
if (held1 != held2) {
+ GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref));
if (held2)
- gfs2_glock_hold(gl);
+ gl->gl_lockref.count++;
else
- gfs2_glock_put_nolock(gl);
+ gl->gl_lockref.count--;
}
if (held1 && held2 && list_empty(&gl->gl_holders))
clear_bit(GLF_QUEUED, &gl->gl_flags);
@@ -416,7 +411,7 @@ static void gfs2_demote_wake(struct gfs2_glock *gl)
{
gl->gl_demote_state = LM_ST_EXCLUSIVE;
clear_bit(GLF_DEMOTE, &gl->gl_flags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
}
@@ -475,7 +470,7 @@ retry:
do_xmote(gl, gh, LM_ST_UNLOCKED);
break;
default: /* Everything else */
- printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state);
+ pr_err("wanted %u got %u\n", gl->gl_target, state);
GLOCK_BUG_ON(gl, 1);
}
spin_unlock(&gl->gl_spin);
@@ -549,7 +544,7 @@ __acquires(&gl->gl_spin)
/* lock_dlm */
ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags);
if (ret) {
- printk(KERN_ERR "GFS2: lm_lock ret %d\n", ret);
+ pr_err("lm_lock ret %d\n", ret);
GLOCK_BUG_ON(gl, 1);
}
} else { /* lock_nolock */
@@ -625,15 +620,15 @@ out:
out_sched:
clear_bit(GLF_LOCK, &gl->gl_flags);
- smp_mb__after_clear_bit();
- gfs2_glock_hold(gl);
+ smp_mb__after_atomic();
+ gl->gl_lockref.count++;
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
- gfs2_glock_put_nolock(gl);
+ gl->gl_lockref.count--;
return;
out_unlock:
clear_bit(GLF_LOCK, &gl->gl_flags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
return;
}
@@ -736,14 +731,14 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
cachep = gfs2_glock_aspace_cachep;
else
cachep = gfs2_glock_cachep;
- gl = kmem_cache_alloc(cachep, GFP_KERNEL);
+ gl = kmem_cache_alloc(cachep, GFP_NOFS);
if (!gl)
return -ENOMEM;
memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb));
if (glops->go_flags & GLOF_LVB) {
- gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_KERNEL);
+ gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_NOFS);
if (!gl->gl_lksb.sb_lvbptr) {
kmem_cache_free(cachep, gl);
return -ENOMEM;
@@ -754,7 +749,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
gl->gl_sbd = sdp;
gl->gl_flags = 0;
gl->gl_name = name;
- atomic_set(&gl->gl_ref, 1);
+ gl->gl_lockref.count = 1;
gl->gl_state = LM_ST_UNLOCKED;
gl->gl_target = LM_ST_UNLOCKED;
gl->gl_demote_state = LM_ST_EXCLUSIVE;
@@ -942,7 +937,7 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
vaf.fmt = fmt;
vaf.va = &args;
- printk(KERN_ERR " %pV", &vaf);
+ pr_err("%pV", &vaf);
}
va_end(args);
@@ -1017,13 +1012,13 @@ do_cancel:
return;
trap_recursive:
- printk(KERN_ERR "original: %pSR\n", (void *)gh2->gh_ip);
- printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid));
- printk(KERN_ERR "lock type: %d req lock state : %d\n",
+ pr_err("original: %pSR\n", (void *)gh2->gh_ip);
+ pr_err("pid: %d\n", pid_nr(gh2->gh_owner_pid));
+ pr_err("lock type: %d req lock state : %d\n",
gh2->gh_gl->gl_name.ln_type, gh2->gh_state);
- printk(KERN_ERR "new: %pSR\n", (void *)gh->gh_ip);
- printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid));
- printk(KERN_ERR "lock type: %d req lock state : %d\n",
+ pr_err("new: %pSR\n", (void *)gh->gh_ip);
+ pr_err("pid: %d\n", pid_nr(gh->gh_owner_pid));
+ pr_err("lock type: %d req lock state : %d\n",
gh->gh_gl->gl_name.ln_type, gh->gh_state);
gfs2_dump_glock(NULL, gl);
BUG();
@@ -1052,9 +1047,13 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
spin_lock(&gl->gl_spin);
add_to_queue(gh);
- if ((LM_FLAG_NOEXP & gh->gh_flags) &&
- test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
+ if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) &&
+ test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) {
set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+ gl->gl_lockref.count++;
+ if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gl->gl_lockref.count--;
+ }
run_queue(gl, 1);
spin_unlock(&gl->gl_spin);
@@ -1356,10 +1355,10 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
}
}
- spin_unlock(&gl->gl_spin);
+ gl->gl_lockref.count++;
set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
- smp_wmb();
- gfs2_glock_hold(gl);
+ spin_unlock(&gl->gl_spin);
+
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
}
@@ -1404,17 +1403,25 @@ __acquires(&lru_lock)
while(!list_empty(list)) {
gl = list_entry(list->next, struct gfs2_glock, gl_lru);
list_del_init(&gl->gl_lru);
+ if (!spin_trylock(&gl->gl_spin)) {
+add_back_to_lru:
+ list_add(&gl->gl_lru, &lru_list);
+ atomic_inc(&lru_count);
+ continue;
+ }
+ if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
+ spin_unlock(&gl->gl_spin);
+ goto add_back_to_lru;
+ }
clear_bit(GLF_LRU, &gl->gl_flags);
- gfs2_glock_hold(gl);
- spin_unlock(&lru_lock);
- spin_lock(&gl->gl_spin);
+ gl->gl_lockref.count++;
if (demote_ok(gl))
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags));
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
- gfs2_glock_put_nolock(gl);
+ gl->gl_lockref.count--;
spin_unlock(&gl->gl_spin);
- spin_lock(&lru_lock);
+ cond_resched_lock(&lru_lock);
}
}
@@ -1439,7 +1446,7 @@ static long gfs2_scan_glock_lru(int nr)
gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
/* Test for being demotable */
- if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
+ if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
list_move(&gl->gl_lru, &dispose);
atomic_dec(&lru_count);
freed++;
@@ -1493,7 +1500,7 @@ static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp,
rcu_read_lock();
hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) {
- if ((gl->gl_sbd == sdp) && atomic_inc_not_zero(&gl->gl_ref))
+ if ((gl->gl_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref))
examiner(gl);
}
rcu_read_unlock();
@@ -1555,13 +1562,11 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
glock_hash_walk(thaw_glock, sdp);
}
-static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
+static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
{
- int ret;
spin_lock(&gl->gl_spin);
- ret = gfs2_dump_glock(seq, gl);
+ gfs2_dump_glock(seq, gl);
spin_unlock(&gl->gl_spin);
- return ret;
}
static void dump_glock_func(struct gfs2_glock *gl)
@@ -1650,14 +1655,14 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
* @seq: the seq_file struct
* @gh: the glock holder
*
- * Returns: 0 on success, -ENOBUFS when we run out of space
*/
-static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
+static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
{
struct task_struct *gh_owner = NULL;
char flags_buf[32];
+ rcu_read_lock();
if (gh->gh_owner_pid)
gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
@@ -1667,7 +1672,7 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
gh_owner ? gh_owner->comm : "(ended)",
(void *)gh->gh_ip);
- return 0;
+ rcu_read_unlock();
}
static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
@@ -1722,16 +1727,14 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
* example. The field's are n = number (id of the object), f = flags,
* t = type, s = state, r = refcount, e = error, p = pid.
*
- * Returns: 0 on success, -ENOBUFS when we run out of space
*/
-int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
+void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
unsigned long long dtime;
const struct gfs2_holder *gh;
char gflags_buf[32];
- int error = 0;
dtime = jiffies - gl->gl_demote_time;
dtime *= 1000000/HZ; /* demote time in uSec */
@@ -1746,17 +1749,13 @@ int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
state2str(gl->gl_demote_state), dtime,
atomic_read(&gl->gl_ail_count),
atomic_read(&gl->gl_revokes),
- atomic_read(&gl->gl_ref), gl->gl_hold_time);
+ (int)gl->gl_lockref.count, gl->gl_hold_time);
+
+ list_for_each_entry(gh, &gl->gl_holders, gh_list)
+ dump_holder(seq, gh);
- list_for_each_entry(gh, &gl->gl_holders, gh_list) {
- error = dump_holder(seq, gh);
- if (error)
- goto out;
- }
if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump)
- error = glops->go_dump(seq, gl);
-out:
- return error;
+ glops->go_dump(seq, gl);
}
static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr)
@@ -1902,7 +1901,8 @@ static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
gi->nhash = 0;
}
/* Skip entries for other sb and dead entries */
- } while (gi->sdp != gi->gl->gl_sbd || atomic_read(&gi->gl->gl_ref) == 0);
+ } while (gi->sdp != gi->gl->gl_sbd ||
+ __lockref_is_dead(&gi->gl->gl_lockref));
return 0;
}
@@ -1953,7 +1953,8 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
{
- return dump_glock(seq, iter_ptr);
+ dump_glock(seq, iter_ptr);
+ return 0;
}
static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 69f66e3d22b..32572f71f02 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -181,8 +181,6 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl)
extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
const struct gfs2_glock_operations *glops,
int create, struct gfs2_glock **glp);
-extern void gfs2_glock_hold(struct gfs2_glock *gl);
-extern void gfs2_glock_put_nolock(struct gfs2_glock *gl);
extern void gfs2_glock_put(struct gfs2_glock *gl);
extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
unsigned flags, struct gfs2_holder *gh);
@@ -201,7 +199,7 @@ extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
struct gfs2_holder *gh);
extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
-extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
+extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0)
extern __printf(2, 3)
void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index e2e0a90396e..2ffc67dce87 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -82,23 +82,30 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
struct gfs2_trans tr;
memset(&tr, 0, sizeof(tr));
+ INIT_LIST_HEAD(&tr.tr_buf);
+ INIT_LIST_HEAD(&tr.tr_databuf);
tr.tr_revokes = atomic_read(&gl->gl_ail_count);
if (!tr.tr_revokes)
return;
- /* A shortened, inline version of gfs2_trans_begin() */
+ /* A shortened, inline version of gfs2_trans_begin()
+ * tr->alloced is not set since the transaction structure is
+ * on the stack */
tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
tr.tr_ip = (unsigned long)__builtin_return_address(0);
sb_start_intwrite(sdp->sd_vfs);
- gfs2_log_reserve(sdp, tr.tr_reserved);
+ if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) {
+ sb_end_intwrite(sdp->sd_vfs);
+ return;
+ }
WARN_ON_ONCE(current->journal_info);
current->journal_info = &tr;
__gfs2_ail_flush(gl, 0, tr.tr_revokes);
gfs2_trans_end(sdp);
- gfs2_log_flush(sdp, NULL);
+ gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
}
void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
@@ -119,7 +126,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
return;
__gfs2_ail_flush(gl, fsync, max_revokes);
gfs2_trans_end(sdp);
- gfs2_log_flush(sdp, NULL);
+ gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
}
/**
@@ -133,7 +140,8 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
static void rgrp_go_sync(struct gfs2_glock *gl)
{
- struct address_space *metamapping = gfs2_glock2aspace(gl);
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct address_space *mapping = &sdp->sd_aspace;
struct gfs2_rgrpd *rgd;
int error;
@@ -141,10 +149,10 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
return;
GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
- gfs2_log_flush(gl->gl_sbd, gl);
- filemap_fdatawrite(metamapping);
- error = filemap_fdatawait(metamapping);
- mapping_set_error(metamapping, error);
+ gfs2_log_flush(sdp, gl, NORMAL_FLUSH);
+ filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
+ error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
+ mapping_set_error(mapping, error);
gfs2_ail_empty_gl(gl);
spin_lock(&gl->gl_spin);
@@ -166,11 +174,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
{
- struct address_space *mapping = gfs2_glock2aspace(gl);
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct address_space *mapping = &sdp->sd_aspace;
WARN_ON_ONCE(!(flags & DIO_METADATA));
- gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
- truncate_inode_pages(mapping, 0);
+ gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
+ truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
if (gl->gl_object) {
struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
@@ -192,14 +201,17 @@ static void inode_go_sync(struct gfs2_glock *gl)
if (ip && !S_ISREG(ip->i_inode.i_mode))
ip = NULL;
- if (ip && test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
- unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0);
+ if (ip) {
+ if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
+ unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0);
+ inode_dio_wait(&ip->i_inode);
+ }
if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
return;
GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
- gfs2_log_flush(gl->gl_sbd, gl);
+ gfs2_log_flush(gl->gl_sbd, gl, NORMAL_FLUSH);
filemap_fdatawrite(metamapping);
if (ip) {
struct address_space *mapping = ip->i_inode.i_mapping;
@@ -214,7 +226,7 @@ static void inode_go_sync(struct gfs2_glock *gl)
* Writeback of the data mapping may cause the dirty flag to be set
* so we have to clear it again here.
*/
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(GLF_DIRTY, &gl->gl_flags);
}
@@ -222,8 +234,8 @@ static void inode_go_sync(struct gfs2_glock *gl)
* inode_go_inval - prepare a inode glock to be released
* @gl: the glock
* @flags:
- *
- * Normally we invlidate everything, but if we are moving into
+ *
+ * Normally we invalidate everything, but if we are moving into
* LM_ST_DEFERRED from LM_ST_SHARED or LM_ST_EXCLUSIVE then we
* can keep hold of the metadata, since it won't have changed.
*
@@ -246,7 +258,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
}
if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) {
- gfs2_log_flush(gl->gl_sbd, NULL);
+ gfs2_log_flush(gl->gl_sbd, NULL, NORMAL_FLUSH);
gl->gl_sbd->sd_rindex_uptodate = 0;
}
if (ip && S_ISREG(ip->i_inode.i_mode))
@@ -410,6 +422,9 @@ static int inode_go_lock(struct gfs2_holder *gh)
return error;
}
+ if (gh->gh_state != LM_ST_DEFERRED)
+ inode_dio_wait(&ip->i_inode);
+
if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) &&
(gl->gl_state == LM_ST_EXCLUSIVE) &&
(gh->gh_state == LM_ST_EXCLUSIVE)) {
@@ -429,49 +444,55 @@ static int inode_go_lock(struct gfs2_holder *gh)
* @seq: The iterator
* @ip: the inode
*
- * Returns: 0 on success, -ENOBUFS when we run out of space
*/
-static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
{
const struct gfs2_inode *ip = gl->gl_object;
if (ip == NULL)
- return 0;
+ return;
gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
(unsigned long long)ip->i_no_formal_ino,
(unsigned long long)ip->i_no_addr,
IF2DT(ip->i_inode.i_mode), ip->i_flags,
(unsigned int)ip->i_diskflags,
(unsigned long long)i_size_read(&ip->i_inode));
- return 0;
}
/**
- * trans_go_sync - promote/demote the transaction glock
+ * freeze_go_sync - promote/demote the freeze glock
* @gl: the glock
* @state: the requested state
* @flags:
*
*/
-static void trans_go_sync(struct gfs2_glock *gl)
+static void freeze_go_sync(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_sbd;
+ DEFINE_WAIT(wait);
- if (gl->gl_state != LM_ST_UNLOCKED &&
+ if (gl->gl_state == LM_ST_SHARED &&
test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
- gfs2_meta_syncfs(sdp);
- gfs2_log_shutdown(sdp);
+ atomic_set(&sdp->sd_log_freeze, 1);
+ wake_up(&sdp->sd_logd_waitq);
+ do {
+ prepare_to_wait(&sdp->sd_log_frozen_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ if (atomic_read(&sdp->sd_log_freeze))
+ io_schedule();
+ } while(atomic_read(&sdp->sd_log_freeze));
+ finish_wait(&sdp->sd_log_frozen_wait, &wait);
}
}
/**
- * trans_go_xmote_bh - After promoting/demoting the transaction glock
+ * freeze_go_xmote_bh - After promoting/demoting the freeze glock
* @gl: the glock
*
*/
-static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
+static int freeze_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
{
struct gfs2_sbd *sdp = gl->gl_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
@@ -504,7 +525,7 @@ static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
* Always returns 0
*/
-static int trans_go_demote_ok(const struct gfs2_glock *gl)
+static int freeze_go_demote_ok(const struct gfs2_glock *gl)
{
return 0;
}
@@ -525,9 +546,9 @@ static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
if (gl->gl_demote_state == LM_ST_UNLOCKED &&
gl->gl_state == LM_ST_SHARED && ip) {
- gfs2_glock_hold(gl);
+ gl->gl_lockref.count++;
if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0)
- gfs2_glock_put_nolock(gl);
+ gl->gl_lockref.count--;
}
}
@@ -552,13 +573,13 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
.go_unlock = gfs2_rgrp_go_unlock,
.go_dump = gfs2_rgrp_dump,
.go_type = LM_TYPE_RGRP,
- .go_flags = GLOF_ASPACE | GLOF_LVB,
+ .go_flags = GLOF_LVB,
};
-const struct gfs2_glock_operations gfs2_trans_glops = {
- .go_sync = trans_go_sync,
- .go_xmote_bh = trans_go_xmote_bh,
- .go_demote_ok = trans_go_demote_ok,
+const struct gfs2_glock_operations gfs2_freeze_glops = {
+ .go_sync = freeze_go_sync,
+ .go_xmote_bh = freeze_go_xmote_bh,
+ .go_demote_ok = freeze_go_demote_ok,
.go_type = LM_TYPE_NONDISK,
};
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index bf95a2dc166..7455d2629bc 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -15,7 +15,7 @@
extern const struct gfs2_glock_operations gfs2_meta_glops;
extern const struct gfs2_glock_operations gfs2_inode_glops;
extern const struct gfs2_glock_operations gfs2_rgrp_glops;
-extern const struct gfs2_glock_operations gfs2_trans_glops;
+extern const struct gfs2_glock_operations gfs2_freeze_glops;
extern const struct gfs2_glock_operations gfs2_iopen_glops;
extern const struct gfs2_glock_operations gfs2_flock_glops;
extern const struct gfs2_glock_operations gfs2_nondisk_glops;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 26aabd7caba..67d310c9ada 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -21,6 +21,7 @@
#include <linux/rbtree.h>
#include <linux/ktime.h>
#include <linux/percpu.h>
+#include <linux/lockref.h>
#define DIO_WAIT 0x00000010
#define DIO_METADATA 0x00000020
@@ -51,7 +52,7 @@ struct gfs2_log_header_host {
*/
struct gfs2_log_operations {
- void (*lo_before_commit) (struct gfs2_sbd *sdp);
+ void (*lo_before_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
void (*lo_before_scan) (struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head, int pass);
@@ -71,6 +72,7 @@ struct gfs2_bitmap {
u32 bi_offset;
u32 bi_start;
u32 bi_len;
+ u32 bi_blocks;
};
struct gfs2_rgrpd {
@@ -91,6 +93,7 @@ struct gfs2_rgrpd {
struct gfs2_rgrp_lvb *rd_rgl;
u32 rd_last_alloc;
u32 rd_flags;
+ u32 rd_extfail_pt; /* extent failure point */
#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */
#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */
#define GFS2_RDF_ERROR 0x40000000 /* error in rg */
@@ -101,19 +104,25 @@ struct gfs2_rgrpd {
struct gfs2_rbm {
struct gfs2_rgrpd *rgd;
- struct gfs2_bitmap *bi; /* Bitmap must belong to the rgd */
u32 offset; /* The offset is bitmap relative */
+ int bii; /* Bitmap index */
};
+static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm)
+{
+ return rbm->rgd->rd_bits + rbm->bii;
+}
+
static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm)
{
- return rbm->rgd->rd_data0 + (rbm->bi->bi_start * GFS2_NBBY) + rbm->offset;
+ return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) +
+ rbm->offset;
}
static inline bool gfs2_rbm_eq(const struct gfs2_rbm *rbm1,
const struct gfs2_rbm *rbm2)
{
- return (rbm1->rgd == rbm2->rgd) && (rbm1->bi == rbm2->bi) &&
+ return (rbm1->rgd == rbm2->rgd) && (rbm1->bii == rbm2->bii) &&
(rbm1->offset == rbm2->offset);
}
@@ -209,7 +218,7 @@ struct gfs2_glock_operations {
int (*go_demote_ok) (const struct gfs2_glock *gl);
int (*go_lock) (struct gfs2_holder *gh);
void (*go_unlock) (struct gfs2_holder *gh);
- int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
+ void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
void (*go_callback)(struct gfs2_glock *gl, bool remote);
const int go_type;
const unsigned long go_flags;
@@ -278,6 +287,20 @@ struct gfs2_blkreserv {
unsigned int rs_qa_qd_num;
};
+/*
+ * Allocation parameters
+ * @target: The number of blocks we'd ideally like to allocate
+ * @aflags: The flags (e.g. Orlov flag)
+ *
+ * The intent is to gradually expand this structure over time in
+ * order to give more information, e.g. alignment, min extent size
+ * to the allocation code.
+ */
+struct gfs2_alloc_parms {
+ u32 target;
+ u32 aflags;
+};
+
enum {
GLF_LOCK = 1,
GLF_DEMOTE = 3,
@@ -300,9 +323,9 @@ struct gfs2_glock {
struct gfs2_sbd *gl_sbd;
unsigned long gl_flags; /* GLF_... */
struct lm_lockname gl_name;
- atomic_t gl_ref;
- spinlock_t gl_spin;
+ struct lockref gl_lockref;
+#define gl_spin gl_lockref.lock
/* State fields protected by gl_spin */
unsigned int gl_state:2, /* Current state */
@@ -328,7 +351,15 @@ struct gfs2_glock {
atomic_t gl_ail_count;
atomic_t gl_revokes;
struct delayed_work gl_work;
- struct work_struct gl_delete;
+ union {
+ /* For inode and iopen glocks only */
+ struct work_struct gl_delete;
+ /* For rgrp glocks only */
+ struct {
+ loff_t start;
+ loff_t end;
+ } gl_vm;
+ };
struct rcu_head gl_rcu;
};
@@ -340,6 +371,7 @@ enum {
GIF_ALLOC_FAILED = 2,
GIF_SW_PAGED = 3,
GIF_ORDERED = 4,
+ GIF_FREE_VFS_INODE = 5,
};
struct gfs2_inode {
@@ -397,12 +429,14 @@ enum {
};
struct gfs2_quota_data {
+ struct hlist_bl_node qd_hlist;
struct list_head qd_list;
- struct list_head qd_reclaim;
-
- atomic_t qd_count;
-
struct kqid qd_id;
+ struct gfs2_sbd *qd_sbd;
+ struct lockref qd_lockref;
+ struct list_head qd_lru;
+ unsigned qd_hash;
+
unsigned long qd_flags; /* QDF_... */
s64 qd_change;
@@ -420,6 +454,7 @@ struct gfs2_quota_data {
u64 qd_sync_gen;
unsigned long qd_last_warn;
+ struct rcu_head qd_rcu;
};
struct gfs2_trans {
@@ -428,11 +463,9 @@ struct gfs2_trans {
unsigned int tr_blocks;
unsigned int tr_revokes;
unsigned int tr_reserved;
-
- struct gfs2_holder tr_t_gh;
-
- int tr_touched;
- int tr_attached;
+ unsigned int tr_touched:1;
+ unsigned int tr_attached:1;
+ unsigned int tr_alloced:1;
unsigned int tr_num_buf_new;
unsigned int tr_num_databuf_new;
@@ -442,6 +475,8 @@ struct gfs2_trans {
unsigned int tr_num_revoke_rm;
struct list_head tr_list;
+ struct list_head tr_databuf;
+ struct list_head tr_buf;
unsigned int tr_first;
struct list_head tr_ail1_list;
@@ -449,7 +484,7 @@ struct gfs2_trans {
};
struct gfs2_journal_extent {
- struct list_head extent_list;
+ struct list_head list;
unsigned int lblock; /* First logical block */
u64 dblock; /* First disk block */
@@ -459,6 +494,7 @@ struct gfs2_journal_extent {
struct gfs2_jdesc {
struct list_head jd_list;
struct list_head extent_list;
+ unsigned int nr_extents;
struct work_struct jd_work;
struct inode *jd_inode;
unsigned long jd_flags;
@@ -466,6 +502,15 @@ struct gfs2_jdesc {
unsigned int jd_jid;
unsigned int jd_blocks;
int jd_recover_error;
+ /* Replay stuff */
+
+ unsigned int jd_found_blocks;
+ unsigned int jd_found_revokes;
+ unsigned int jd_replayed_blocks;
+
+ struct list_head jd_revoke_list;
+ unsigned int jd_replay_tail;
+
};
struct gfs2_statfs_change_host {
@@ -516,7 +561,6 @@ struct gfs2_tune {
unsigned int gt_logd_secs;
- unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
unsigned int gt_quota_scale_num; /* Numerator */
unsigned int gt_quota_scale_den; /* Denominator */
@@ -636,7 +680,7 @@ struct gfs2_sbd {
struct lm_lockstruct sd_lockstruct;
struct gfs2_holder sd_live_gh;
struct gfs2_glock *sd_rename_gl;
- struct gfs2_glock *sd_trans_gl;
+ struct gfs2_glock *sd_freeze_gl;
wait_queue_head_t sd_glock_wait;
atomic_t sd_glock_disposal;
struct completion sd_locking_init;
@@ -684,6 +728,8 @@ struct gfs2_sbd {
struct gfs2_holder sd_sc_gh;
struct gfs2_holder sd_qc_gh;
+ struct completion sd_journal_ready;
+
/* Daemon stuff */
struct task_struct *sd_logd_process;
@@ -694,35 +740,31 @@ struct gfs2_sbd {
struct list_head sd_quota_list;
atomic_t sd_quota_count;
struct mutex sd_quota_mutex;
+ struct mutex sd_quota_sync_mutex;
wait_queue_head_t sd_quota_wait;
struct list_head sd_trunc_list;
spinlock_t sd_trunc_lock;
unsigned int sd_quota_slots;
- unsigned int sd_quota_chunks;
- unsigned char **sd_quota_bitmap;
+ unsigned long *sd_quota_bitmap;
+ spinlock_t sd_bitmap_lock;
u64 sd_quota_sync_gen;
/* Log stuff */
+ struct address_space sd_aspace;
+
spinlock_t sd_log_lock;
struct gfs2_trans *sd_log_tr;
unsigned int sd_log_blks_reserved;
- unsigned int sd_log_commited_buf;
- unsigned int sd_log_commited_databuf;
int sd_log_commited_revoke;
atomic_t sd_log_pinned;
- unsigned int sd_log_num_buf;
unsigned int sd_log_num_revoke;
- unsigned int sd_log_num_rg;
- unsigned int sd_log_num_databuf;
- struct list_head sd_log_le_buf;
struct list_head sd_log_le_revoke;
- struct list_head sd_log_le_databuf;
struct list_head sd_log_le_ordered;
spinlock_t sd_ordered_lock;
@@ -750,17 +792,14 @@ struct gfs2_sbd {
struct list_head sd_ail1_list;
struct list_head sd_ail2_list;
- /* Replay stuff */
-
- struct list_head sd_revoke_list;
- unsigned int sd_replay_tail;
-
- unsigned int sd_found_blocks;
- unsigned int sd_found_revokes;
- unsigned int sd_replayed_blocks;
-
/* For quiescing the filesystem */
struct gfs2_holder sd_freeze_gh;
+ struct gfs2_holder sd_freeze_root_gh;
+ struct gfs2_holder sd_thaw_gh;
+ atomic_t sd_log_freeze;
+ atomic_t sd_frozen_root;
+ wait_queue_head_t sd_frozen_root_wait;
+ wait_queue_head_t sd_log_frozen_wait;
char sd_fsname[GFS2_FSNAME_LEN];
char sd_table_name[GFS2_FSNAME_LEN];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index d2384f7c53e..e62e5947788 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -149,7 +149,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
ip = GFS2_I(inode);
if (!inode)
- return ERR_PTR(-ENOBUFS);
+ return ERR_PTR(-ENOMEM);
if (inode->i_state & I_NEW) {
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -376,25 +376,25 @@ static void munge_mode_uid_gid(const struct gfs2_inode *dip,
inode->i_gid = current_fsgid();
}
-static int alloc_dinode(struct gfs2_inode *ip, u32 flags)
+static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct gfs2_alloc_parms ap = { .target = *dblocks, .aflags = flags, };
int error;
- int dblocks = 1;
error = gfs2_quota_lock_check(ip);
if (error)
goto out;
- error = gfs2_inplace_reserve(ip, RES_DINODE, flags);
+ error = gfs2_inplace_reserve(ip, &ap);
if (error)
goto out_quota;
- error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 0);
+ error = gfs2_trans_begin(sdp, (*dblocks * RES_RG_BIT) + RES_STATFS + RES_QUOTA, 0);
if (error)
goto out_ipreserv;
- error = gfs2_alloc_blocks(ip, &ip->i_no_addr, &dblocks, 1, &ip->i_generation);
+ error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1, &ip->i_generation);
ip->i_no_formal_ino = ip->i_generation;
ip->i_inode.i_ino = ip->i_no_addr;
ip->i_goal = ip->i_no_addr;
@@ -427,6 +427,33 @@ static void gfs2_init_dir(struct buffer_head *dibh,
}
/**
+ * gfs2_init_xattr - Initialise an xattr block for a new inode
+ * @ip: The inode in question
+ *
+ * This sets up an empty xattr block for a new inode, ready to
+ * take any ACLs, LSM xattrs, etc.
+ */
+
+static void gfs2_init_xattr(struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct buffer_head *bh;
+ struct gfs2_ea_header *ea;
+
+ bh = gfs2_meta_new(ip->i_gl, ip->i_eattr);
+ gfs2_trans_add_meta(ip->i_gl, bh);
+ gfs2_metatype_set(bh, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
+ gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header));
+
+ ea = GFS2_EA_BH2FIRST(bh);
+ ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
+ ea->ea_type = GFS2_EATYPE_UNUSED;
+ ea->ea_flags = GFS2_EAFLAG_LAST;
+
+ brelse(bh);
+}
+
+/**
* init_dinode - Fill in a new dinode structure
* @dip: The directory this inode is being created in
* @ip: The inode
@@ -468,25 +495,45 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
brelse(dibh);
}
+/**
+ * gfs2_trans_da_blocks - Calculate number of blocks to link inode
+ * @dip: The directory we are linking into
+ * @da: The dir add information
+ * @nr_inodes: The number of inodes involved
+ *
+ * This calculate the number of blocks we need to reserve in a
+ * transaction to link @nr_inodes into a directory. In most cases
+ * @nr_inodes will be 2 (the directory plus the inode being linked in)
+ * but in case of rename, 4 may be required.
+ *
+ * Returns: Number of blocks
+ */
+
+static unsigned gfs2_trans_da_blks(const struct gfs2_inode *dip,
+ const struct gfs2_diradd *da,
+ unsigned nr_inodes)
+{
+ return da->nr_blocks + gfs2_rg_blocks(dip, da->nr_blocks) +
+ (nr_inodes * RES_DINODE) + RES_QUOTA + RES_STATFS;
+}
+
static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
- struct gfs2_inode *ip, int arq)
+ struct gfs2_inode *ip, struct gfs2_diradd *da)
{
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
+ struct gfs2_alloc_parms ap = { .target = da->nr_blocks, };
int error;
- if (arq) {
+ if (da->nr_blocks) {
error = gfs2_quota_lock_check(dip);
if (error)
goto fail_quota_locks;
- error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0);
+ error = gfs2_inplace_reserve(dip, &ap);
if (error)
goto fail_quota_locks;
- error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
- dip->i_rgd->rd_length +
- 2 * RES_DINODE +
- RES_STATFS + RES_QUOTA, 0);
+ error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, da, 2), 0);
if (error)
goto fail_ipreserv;
} else {
@@ -495,7 +542,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
goto fail_quota_locks;
}
- error = gfs2_dir_add(&dip->i_inode, name, ip);
+ error = gfs2_dir_add(&dip->i_inode, name, ip, da);
if (error)
goto fail_end_trans;
@@ -524,13 +571,6 @@ static int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
return err;
}
-static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
- const struct qstr *qstr)
-{
- return security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
- &gfs2_initxattrs, NULL);
-}
-
/**
* gfs2_create_inode - Create a new inode
* @dir: The parent directory
@@ -550,15 +590,17 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
unsigned int size, int excl, int *opened)
{
const struct qstr *name = &dentry->d_name;
+ struct posix_acl *default_acl, *acl;
struct gfs2_holder ghs[2];
struct inode *inode = NULL;
struct gfs2_inode *dip = GFS2_I(dir), *ip;
struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_glock *io_gl;
struct dentry *d;
- int error;
+ int error, free_vfs_inode = 0;
u32 aflags = 0;
- int arq;
+ unsigned blocks = 1;
+ struct gfs2_diradd da = { .bh = NULL, };
if (!name->len || name->len > GFS2_FNAMESIZE)
return -ENAMETOOLONG;
@@ -583,24 +625,27 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
error = PTR_ERR(inode);
if (!IS_ERR(inode)) {
d = d_splice_alias(inode, dentry);
+ error = PTR_ERR(d);
+ if (IS_ERR(d))
+ goto fail_gunlock;
error = 0;
- if (file && !IS_ERR(d)) {
- if (d == NULL)
- d = dentry;
- if (S_ISREG(inode->i_mode))
- error = finish_open(file, d, gfs2_open_common, opened);
- else
+ if (file) {
+ if (S_ISREG(inode->i_mode)) {
+ WARN_ON(d != NULL);
+ error = finish_open(file, dentry, gfs2_open_common, opened);
+ } else {
error = finish_no_open(file, d);
+ }
+ } else {
+ dput(d);
}
gfs2_glock_dq_uninit(ghs);
- if (IS_ERR(d))
- return PTR_ERR(d);
return error;
} else if (error != -ENOENT) {
goto fail_gunlock;
}
- arq = error = gfs2_diradd_alloc_required(dir, name);
+ error = gfs2_diradd_alloc_required(dir, name, &da);
if (error < 0)
goto fail_gunlock;
@@ -609,10 +654,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (!inode)
goto fail_gunlock;
+ error = posix_acl_create(dir, &mode, &default_acl, &acl);
+ if (error)
+ goto fail_free_vfs_inode;
+
ip = GFS2_I(inode);
error = gfs2_rs_alloc(ip);
if (error)
- goto fail_free_inode;
+ goto fail_free_acls;
inode->i_mode = mode;
set_nlink(inode, S_ISDIR(mode) ? 2 : 1);
@@ -647,10 +696,15 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
(dip->i_diskflags & GFS2_DIF_TOPDIR))
aflags |= GFS2_AF_ORLOV;
- error = alloc_dinode(ip, aflags);
+ if (default_acl || acl)
+ blocks++;
+
+ error = alloc_dinode(ip, aflags, &blocks);
if (error)
goto fail_free_inode;
+ gfs2_set_inode_blocks(inode, blocks);
+
error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
if (error)
goto fail_free_inode;
@@ -660,10 +714,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
if (error)
goto fail_free_inode;
- error = gfs2_trans_begin(sdp, RES_DINODE, 0);
+ error = gfs2_trans_begin(sdp, blocks, 0);
if (error)
goto fail_gunlock2;
+ if (blocks > 1) {
+ ip->i_eattr = ip->i_no_addr + 1;
+ gfs2_init_xattr(ip);
+ }
init_dinode(dip, ip, symname);
gfs2_trans_end(sdp);
@@ -680,15 +738,25 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
gfs2_set_iop(inode);
insert_inode_hash(inode);
- error = gfs2_acl_create(dip, inode);
+ if (default_acl) {
+ error = gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ posix_acl_release(default_acl);
+ }
+ if (acl) {
+ if (!error)
+ error = gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ posix_acl_release(acl);
+ }
+
if (error)
goto fail_gunlock3;
- error = gfs2_security_init(dip, ip, name);
+ error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name,
+ &gfs2_initxattrs, NULL);
if (error)
goto fail_gunlock3;
- error = link_dinode(dip, name, ip, arq);
+ error = link_dinode(dip, name, ip, &da);
if (error)
goto fail_gunlock3;
@@ -713,15 +781,23 @@ fail_gunlock2:
fail_free_inode:
if (ip->i_gl)
gfs2_glock_put(ip->i_gl);
- gfs2_rs_delete(ip);
- free_inode_nonrcu(inode);
- inode = NULL;
+ gfs2_rs_delete(ip, NULL);
+fail_free_acls:
+ if (default_acl)
+ posix_acl_release(default_acl);
+ if (acl)
+ posix_acl_release(acl);
+fail_free_vfs_inode:
+ free_vfs_inode = 1;
fail_gunlock:
+ gfs2_dir_no_add(&da);
gfs2_glock_dq_uninit(ghs);
if (inode && !IS_ERR(inode)) {
clear_nlink(inode);
- mark_inode_dirty(inode);
- set_bit(GIF_ALLOC_FAILED, &GFS2_I(inode)->i_flags);
+ if (!free_vfs_inode)
+ mark_inode_dirty(inode);
+ set_bit(free_vfs_inode ? GIF_FREE_VFS_INODE : GIF_ALLOC_FAILED,
+ &GFS2_I(inode)->i_flags);
iput(inode);
}
fail:
@@ -777,12 +853,19 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
}
d = d_splice_alias(inode, dentry);
+ if (IS_ERR(d)) {
+ iput(inode);
+ gfs2_glock_dq_uninit(&gh);
+ return d;
+ }
if (file && S_ISREG(inode->i_mode))
error = finish_open(file, dentry, gfs2_open_common, opened);
gfs2_glock_dq_uninit(&gh);
- if (error)
+ if (error) {
+ dput(d);
return ERR_PTR(error);
+ }
return d;
}
@@ -813,7 +896,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder ghs[2];
struct buffer_head *dibh;
- int alloc_required;
+ struct gfs2_diradd da = { .bh = NULL, };
int error;
if (S_ISDIR(inode->i_mode))
@@ -868,24 +951,21 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
if (ip->i_inode.i_nlink == (u32)-1)
goto out_gunlock;
- alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name);
+ error = gfs2_diradd_alloc_required(dir, &dentry->d_name, &da);
if (error < 0)
goto out_gunlock;
- error = 0;
- if (alloc_required) {
+ if (da.nr_blocks) {
+ struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
error = gfs2_quota_lock_check(dip);
if (error)
goto out_gunlock;
- error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0);
+ error = gfs2_inplace_reserve(dip, &ap);
if (error)
goto out_gunlock_q;
- error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
- gfs2_rg_blocks(dip, sdp->sd_max_dirres) +
- 2 * RES_DINODE + RES_STATFS +
- RES_QUOTA, 0);
+ error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, &da, 2), 0);
if (error)
goto out_ipres;
} else {
@@ -898,7 +978,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
if (error)
goto out_end_trans;
- error = gfs2_dir_add(dir, &dentry->d_name, ip);
+ error = gfs2_dir_add(dir, &dentry->d_name, ip, &da);
if (error)
goto out_brelse;
@@ -914,12 +994,13 @@ out_brelse:
out_end_trans:
gfs2_trans_end(sdp);
out_ipres:
- if (alloc_required)
+ if (da.nr_blocks)
gfs2_inplace_release(dip);
out_gunlock_q:
- if (alloc_required)
+ if (da.nr_blocks)
gfs2_quota_unlock(dip);
out_gunlock:
+ gfs2_dir_no_add(&da);
gfs2_glock_dq(ghs + 1);
out_child:
gfs2_glock_dq(ghs);
@@ -1163,14 +1244,19 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
d = __gfs2_lookup(dir, dentry, file, opened);
if (IS_ERR(d))
return PTR_ERR(d);
- if (d == NULL)
- d = dentry;
- if (d->d_inode) {
- if (!(*opened & FILE_OPENED))
- return finish_no_open(file, d);
+ if (d != NULL)
+ dentry = d;
+ if (dentry->d_inode) {
+ if (!(*opened & FILE_OPENED)) {
+ if (d == NULL)
+ dget(dentry);
+ return finish_no_open(file, dentry);
+ }
+ dput(d);
return 0;
}
+ BUG_ON(d != NULL);
if (!(flags & O_CREAT))
return -ENOENT;
@@ -1208,6 +1294,10 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
}
tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
+ if (!tmp) {
+ error = -ENOENT;
+ break;
+ }
if (IS_ERR(tmp)) {
error = PTR_ERR(tmp);
break;
@@ -1244,7 +1334,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
struct gfs2_rgrpd *nrgd;
unsigned int num_gh;
int dir_rename = 0;
- int alloc_required = 0;
+ struct gfs2_diradd da = { .nr_blocks = 0, };
unsigned int x;
int error;
@@ -1378,25 +1468,24 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
goto out_gunlock;
}
- if (nip == NULL)
- alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
- error = alloc_required;
- if (error < 0)
- goto out_gunlock;
+ if (nip == NULL) {
+ error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name, &da);
+ if (error)
+ goto out_gunlock;
+ }
- if (alloc_required) {
+ if (da.nr_blocks) {
+ struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
error = gfs2_quota_lock_check(ndip);
if (error)
goto out_gunlock;
- error = gfs2_inplace_reserve(ndip, sdp->sd_max_dirres, 0);
+ error = gfs2_inplace_reserve(ndip, &ap);
if (error)
goto out_gunlock_q;
- error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
- gfs2_rg_blocks(ndip, sdp->sd_max_dirres) +
- 4 * RES_DINODE + 4 * RES_LEAF +
- RES_STATFS + RES_QUOTA + 4, 0);
+ error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(ndip, &da, 4) +
+ 4 * RES_LEAF + 4, 0);
if (error)
goto out_ipreserv;
} else {
@@ -1430,19 +1519,20 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
if (error)
goto out_end_trans;
- error = gfs2_dir_add(ndir, &ndentry->d_name, ip);
+ error = gfs2_dir_add(ndir, &ndentry->d_name, ip, &da);
if (error)
goto out_end_trans;
out_end_trans:
gfs2_trans_end(sdp);
out_ipreserv:
- if (alloc_required)
+ if (da.nr_blocks)
gfs2_inplace_release(ndip);
out_gunlock_q:
- if (alloc_required)
+ if (da.nr_blocks)
gfs2_quota_unlock(ndip);
out_gunlock:
+ gfs2_dir_no_add(&da);
while (x--) {
gfs2_glock_dq(ghs + x);
gfs2_holder_uninit(ghs + x);
@@ -1523,18 +1613,26 @@ int gfs2_permission(struct inode *inode, int mask)
{
struct gfs2_inode *ip;
struct gfs2_holder i_gh;
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
int error;
int unlock = 0;
+ int frozen_root = 0;
ip = GFS2_I(inode);
if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
- if (mask & MAY_NOT_BLOCK)
- return -ECHILD;
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
- if (error)
- return error;
- unlock = 1;
+ if (unlikely(gfs2_glock_is_held_excl(sdp->sd_freeze_gl) &&
+ inode == sdp->sd_root_dir->d_inode &&
+ atomic_inc_not_zero(&sdp->sd_frozen_root)))
+ frozen_root = 1;
+ else {
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+ if (error)
+ return error;
+ unlock = 1;
+ }
}
if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
@@ -1543,6 +1641,8 @@ int gfs2_permission(struct inode *inode, int mask)
error = generic_permission(inode, mask);
if (unlock)
gfs2_glock_dq_uninit(&i_gh);
+ else if (frozen_root && atomic_dec_and_test(&sdp->sd_frozen_root))
+ wake_up(&sdp->sd_frozen_root_wait);
return error;
}
@@ -1596,10 +1696,22 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid))
ogid = ngid = NO_GID_QUOTA_CHANGE;
- error = gfs2_quota_lock(ip, nuid, ngid);
+ error = get_write_access(inode);
if (error)
return error;
+ error = gfs2_rs_alloc(ip);
+ if (error)
+ goto out;
+
+ error = gfs2_rindex_update(sdp);
+ if (error)
+ goto out;
+
+ error = gfs2_quota_lock(ip, nuid, ngid);
+ if (error)
+ goto out;
+
if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
!gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
error = gfs2_quota_check(ip, nuid, ngid);
@@ -1626,6 +1738,8 @@ out_end_trans:
gfs2_trans_end(sdp);
out_gunlock_q:
gfs2_quota_unlock(ip);
+out:
+ put_write_access(inode);
return error;
}
@@ -1667,10 +1781,11 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
error = gfs2_setattr_size(inode, attr->ia_size);
else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
error = setattr_chown(inode, attr);
- else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
- error = gfs2_acl_chmod(ip, attr);
- else
+ else {
error = gfs2_setattr_simple(inode, attr);
+ if (!error && attr->ia_valid & ATTR_MODE)
+ error = posix_acl_chmod(inode, inode->i_mode);
+ }
out:
if (!error)
@@ -1700,19 +1815,29 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct inode *inode = dentry->d_inode;
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder gh;
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
int error;
int unlock = 0;
+ int frozen_root = 0;
if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
- if (error)
- return error;
- unlock = 1;
+ if (unlikely(gfs2_glock_is_held_excl(sdp->sd_freeze_gl) &&
+ inode == sdp->sd_root_dir->d_inode &&
+ atomic_inc_not_zero(&sdp->sd_frozen_root)))
+ frozen_root = 1;
+ else {
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+ if (error)
+ return error;
+ unlock = 1;
+ }
}
generic_fillattr(inode, stat);
if (unlock)
gfs2_glock_dq_uninit(&gh);
+ else if (frozen_root && atomic_dec_and_test(&sdp->sd_frozen_root))
+ wake_up(&sdp->sd_frozen_root_wait);
return 0;
}
@@ -1830,6 +1955,7 @@ const struct inode_operations gfs2_file_iops = {
.removexattr = gfs2_removexattr,
.fiemap = gfs2_fiemap,
.get_acl = gfs2_get_acl,
+ .set_acl = gfs2_set_acl,
};
const struct inode_operations gfs2_dir_iops = {
@@ -1851,6 +1977,7 @@ const struct inode_operations gfs2_dir_iops = {
.removexattr = gfs2_removexattr,
.fiemap = gfs2_fiemap,
.get_acl = gfs2_get_acl,
+ .set_acl = gfs2_set_acl,
.atomic_open = gfs2_atomic_open,
};
@@ -1866,6 +1993,5 @@ const struct inode_operations gfs2_symlink_iops = {
.listxattr = gfs2_listxattr,
.removexattr = gfs2_removexattr,
.fiemap = gfs2_fiemap,
- .get_acl = gfs2_get_acl,
};
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index c8423d6de6c..4fafea1c9ec 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -7,6 +7,8 @@
* of the GNU General Public License version 2.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/fs.h>
#include <linux/dlm.h>
#include <linux/slab.h>
@@ -176,7 +178,7 @@ static void gdlm_bast(void *arg, int mode)
gfs2_glock_cb(gl, LM_ST_SHARED);
break;
default:
- printk(KERN_ERR "unknown bast mode %d", mode);
+ pr_err("unknown bast mode %d\n", mode);
BUG();
}
}
@@ -195,7 +197,7 @@ static int make_mode(const unsigned int lmstate)
case LM_ST_SHARED:
return DLM_LOCK_PR;
}
- printk(KERN_ERR "unknown LM state %d", lmstate);
+ pr_err("unknown LM state %d\n", lmstate);
BUG();
return -1;
}
@@ -308,7 +310,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK,
NULL, gl);
if (error) {
- printk(KERN_ERR "gdlm_unlock %x,%llx err=%d\n",
+ pr_err("gdlm_unlock %x,%llx err=%d\n",
gl->gl_name.ln_type,
(unsigned long long)gl->gl_name.ln_number, error);
return;
@@ -466,19 +468,19 @@ static void gdlm_cancel(struct gfs2_glock *gl)
static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen,
char *lvb_bits)
{
- uint32_t gen;
+ __le32 gen;
memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE);
- memcpy(&gen, lvb_bits, sizeof(uint32_t));
+ memcpy(&gen, lvb_bits, sizeof(__le32));
*lvb_gen = le32_to_cpu(gen);
}
static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen,
char *lvb_bits)
{
- uint32_t gen;
+ __le32 gen;
memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE);
gen = cpu_to_le32(lvb_gen);
- memcpy(ls->ls_control_lvb, &gen, sizeof(uint32_t));
+ memcpy(ls->ls_control_lvb, &gen, sizeof(__le32));
}
static int all_jid_bits_clear(char *lvb)
@@ -1034,8 +1036,8 @@ static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots,
new_size = old_size + RECOVER_SIZE_INC;
- submit = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS);
- result = kzalloc(new_size * sizeof(uint32_t), GFP_NOFS);
+ submit = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS);
+ result = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS);
if (!submit || !result) {
kfree(submit);
kfree(result);
@@ -1102,7 +1104,7 @@ static void gdlm_recover_slot(void *arg, struct dlm_slot *slot)
}
if (ls->ls_recover_submit[jid]) {
- fs_info(sdp, "recover_slot jid %d gen %u prev %u",
+ fs_info(sdp, "recover_slot jid %d gen %u prev %u\n",
jid, ls->ls_recover_block, ls->ls_recover_submit[jid]);
}
ls->ls_recover_submit[jid] = ls->ls_recover_block;
@@ -1132,7 +1134,7 @@ static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots,
queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0);
clear_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY);
spin_unlock(&ls->ls_recover_spin);
}
@@ -1269,7 +1271,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
ls->ls_first = !!test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
return 0;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 610613fb65b..3966fadbceb 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -18,6 +18,7 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/bio.h>
+#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/list_sort.h>
@@ -145,8 +146,10 @@ void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc)
{
struct list_head *head = &sdp->sd_ail1_list;
struct gfs2_trans *tr;
+ struct blk_plug plug;
trace_gfs2_ail_flush(sdp, wbc, 1);
+ blk_start_plug(&plug);
spin_lock(&sdp->sd_ail_lock);
restart:
list_for_each_entry_reverse(tr, head, tr_list) {
@@ -156,6 +159,7 @@ restart:
goto restart;
}
spin_unlock(&sdp->sd_ail_lock);
+ blk_finish_plug(&plug);
trace_gfs2_ail_flush(sdp, wbc, 0);
}
@@ -297,6 +301,23 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
}
/**
+ * gfs2_log_release - Release a given number of log blocks
+ * @sdp: The GFS2 superblock
+ * @blks: The number of blocks
+ *
+ */
+
+void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
+{
+
+ atomic_add(blks, &sdp->sd_log_blks_free);
+ trace_gfs2_log_blocks(sdp, blks);
+ gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
+ sdp->sd_jdesc->jd_blocks);
+ up_read(&sdp->sd_log_flush_lock);
+}
+
+/**
* gfs2_log_reserve - Make a log reservation
* @sdp: The GFS2 superblock
* @blks: The number of blocks to reserve
@@ -354,7 +375,10 @@ retry:
wake_up(&sdp->sd_log_waitq);
down_read(&sdp->sd_log_flush_lock);
-
+ if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) {
+ gfs2_log_release(sdp, blks);
+ return -EROFS;
+ }
return 0;
}
@@ -410,24 +434,22 @@ static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer
static unsigned int calc_reserved(struct gfs2_sbd *sdp)
{
unsigned int reserved = 0;
- unsigned int mbuf_limit, metabufhdrs_needed;
- unsigned int dbuf_limit, databufhdrs_needed;
- unsigned int revokes = 0;
+ unsigned int mbuf;
+ unsigned int dbuf;
+ struct gfs2_trans *tr = sdp->sd_log_tr;
- mbuf_limit = buf_limit(sdp);
- metabufhdrs_needed = (sdp->sd_log_commited_buf +
- (mbuf_limit - 1)) / mbuf_limit;
- dbuf_limit = databuf_limit(sdp);
- databufhdrs_needed = (sdp->sd_log_commited_databuf +
- (dbuf_limit - 1)) / dbuf_limit;
+ if (tr) {
+ mbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm;
+ dbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm;
+ reserved = mbuf + dbuf;
+ /* Account for header blocks */
+ reserved += DIV_ROUND_UP(mbuf, buf_limit(sdp));
+ reserved += DIV_ROUND_UP(dbuf, databuf_limit(sdp));
+ }
if (sdp->sd_log_commited_revoke > 0)
- revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
+ reserved += gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
sizeof(u64));
-
- reserved = sdp->sd_log_commited_buf + metabufhdrs_needed +
- sdp->sd_log_commited_databuf + databufhdrs_needed +
- revokes;
/* One for the overall header */
if (reserved)
reserved++;
@@ -551,10 +573,10 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
struct buffer_head *bh = bd->bd_bh;
struct gfs2_glock *gl = bd->bd_gl;
- gfs2_remove_from_ail(bd);
- bd->bd_bh = NULL;
bh->b_private = NULL;
bd->bd_blkno = bh->b_blocknr;
+ gfs2_remove_from_ail(bd); /* drops ref on bh */
+ bd->bd_bh = NULL;
bd->bd_ops = &gfs2_revoke_lops;
sdp->sd_log_num_revoke++;
atomic_inc(&gl->gl_revokes);
@@ -669,7 +691,8 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
*
*/
-void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
+void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
+ enum gfs2_flush_type type)
{
struct gfs2_trans *tr;
@@ -682,36 +705,25 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
}
trace_gfs2_log_flush(sdp, 1);
+ sdp->sd_log_flush_head = sdp->sd_log_head;
+ sdp->sd_log_flush_wrapped = 0;
tr = sdp->sd_log_tr;
if (tr) {
sdp->sd_log_tr = NULL;
INIT_LIST_HEAD(&tr->tr_ail1_list);
INIT_LIST_HEAD(&tr->tr_ail2_list);
+ tr->tr_first = sdp->sd_log_flush_head;
}
- if (sdp->sd_log_num_buf != sdp->sd_log_commited_buf) {
- printk(KERN_INFO "GFS2: log buf %u %u\n", sdp->sd_log_num_buf,
- sdp->sd_log_commited_buf);
- gfs2_assert_withdraw(sdp, 0);
- }
- if (sdp->sd_log_num_databuf != sdp->sd_log_commited_databuf) {
- printk(KERN_INFO "GFS2: log databuf %u %u\n",
- sdp->sd_log_num_databuf, sdp->sd_log_commited_databuf);
- gfs2_assert_withdraw(sdp, 0);
- }
gfs2_assert_withdraw(sdp,
sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);
- sdp->sd_log_flush_head = sdp->sd_log_head;
- sdp->sd_log_flush_wrapped = 0;
- if (tr)
- tr->tr_first = sdp->sd_log_flush_head;
-
gfs2_ordered_write(sdp);
- lops_before_commit(sdp);
+ lops_before_commit(sdp, tr);
gfs2_log_flush_bio(sdp, WRITE);
if (sdp->sd_log_head != sdp->sd_log_flush_head) {
+ log_flush_wait(sdp);
log_write_header(sdp, 0);
} else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
@@ -723,8 +735,6 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
gfs2_log_lock(sdp);
sdp->sd_log_head = sdp->sd_log_flush_head;
sdp->sd_log_blks_reserved = 0;
- sdp->sd_log_commited_buf = 0;
- sdp->sd_log_commited_databuf = 0;
sdp->sd_log_commited_revoke = 0;
spin_lock(&sdp->sd_ail_lock);
@@ -734,40 +744,96 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
}
spin_unlock(&sdp->sd_ail_lock);
gfs2_log_unlock(sdp);
+
+ if (atomic_read(&sdp->sd_log_freeze))
+ type = FREEZE_FLUSH;
+ if (type != NORMAL_FLUSH) {
+ if (!sdp->sd_log_idle) {
+ for (;;) {
+ gfs2_ail1_start(sdp);
+ gfs2_ail1_wait(sdp);
+ if (gfs2_ail1_empty(sdp))
+ break;
+ }
+ atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
+ trace_gfs2_log_blocks(sdp, -1);
+ sdp->sd_log_flush_wrapped = 0;
+ log_write_header(sdp, 0);
+ sdp->sd_log_head = sdp->sd_log_flush_head;
+ }
+ if (type == SHUTDOWN_FLUSH || type == FREEZE_FLUSH)
+ gfs2_log_shutdown(sdp);
+ if (type == FREEZE_FLUSH) {
+ int error;
+
+ atomic_set(&sdp->sd_log_freeze, 0);
+ wake_up(&sdp->sd_log_frozen_wait);
+ error = gfs2_glock_nq_init(sdp->sd_freeze_gl,
+ LM_ST_SHARED, 0,
+ &sdp->sd_thaw_gh);
+ if (error) {
+ printk(KERN_INFO "GFS2: couln't get freeze lock : %d\n", error);
+ gfs2_assert_withdraw(sdp, 0);
+ }
+ else
+ gfs2_glock_dq_uninit(&sdp->sd_thaw_gh);
+ }
+ }
+
trace_gfs2_log_flush(sdp, 0);
up_write(&sdp->sd_log_flush_lock);
kfree(tr);
}
+/**
+ * gfs2_merge_trans - Merge a new transaction into a cached transaction
+ * @old: Original transaction to be expanded
+ * @new: New transaction to be merged
+ */
+
+static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new)
+{
+ WARN_ON_ONCE(old->tr_attached != 1);
+
+ old->tr_num_buf_new += new->tr_num_buf_new;
+ old->tr_num_databuf_new += new->tr_num_databuf_new;
+ old->tr_num_buf_rm += new->tr_num_buf_rm;
+ old->tr_num_databuf_rm += new->tr_num_databuf_rm;
+ old->tr_num_revoke += new->tr_num_revoke;
+ old->tr_num_revoke_rm += new->tr_num_revoke_rm;
+
+ list_splice_tail_init(&new->tr_databuf, &old->tr_databuf);
+ list_splice_tail_init(&new->tr_buf, &old->tr_buf);
+}
+
static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
unsigned int reserved;
unsigned int unused;
+ unsigned int maxres;
gfs2_log_lock(sdp);
- sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm;
- sdp->sd_log_commited_databuf += tr->tr_num_databuf_new -
- tr->tr_num_databuf_rm;
- gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) ||
- (((int)sdp->sd_log_commited_databuf) >= 0));
+ if (sdp->sd_log_tr) {
+ gfs2_merge_trans(sdp->sd_log_tr, tr);
+ } else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) {
+ gfs2_assert_withdraw(sdp, tr->tr_alloced);
+ sdp->sd_log_tr = tr;
+ tr->tr_attached = 1;
+ }
+
sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
reserved = calc_reserved(sdp);
- gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved);
- unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
+ maxres = sdp->sd_log_blks_reserved + tr->tr_reserved;
+ gfs2_assert_withdraw(sdp, maxres >= reserved);
+ unused = maxres - reserved;
atomic_add(unused, &sdp->sd_log_blks_free);
trace_gfs2_log_blocks(sdp, unused);
gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
sdp->sd_jdesc->jd_blocks);
sdp->sd_log_blks_reserved = reserved;
- if (sdp->sd_log_tr == NULL &&
- (tr->tr_num_buf_new || tr->tr_num_databuf_new)) {
- gfs2_assert_withdraw(sdp, tr->tr_t_gh.gh_gl);
- sdp->sd_log_tr = tr;
- tr->tr_attached = 1;
- }
gfs2_log_unlock(sdp);
}
@@ -804,13 +870,8 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
void gfs2_log_shutdown(struct gfs2_sbd *sdp)
{
- down_write(&sdp->sd_log_flush_lock);
-
gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
- gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
- gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
- gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));
sdp->sd_log_flush_head = sdp->sd_log_head;
@@ -818,38 +879,16 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT);
- gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
sdp->sd_log_head = sdp->sd_log_flush_head;
sdp->sd_log_tail = sdp->sd_log_head;
-
- up_write(&sdp->sd_log_flush_lock);
-}
-
-
-/**
- * gfs2_meta_syncfs - sync all the buffers in a filesystem
- * @sdp: the filesystem
- *
- */
-
-void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
-{
- gfs2_log_flush(sdp, NULL);
- for (;;) {
- gfs2_ail1_start(sdp);
- gfs2_ail1_wait(sdp);
- if (gfs2_ail1_empty(sdp))
- break;
- }
- gfs2_log_flush(sdp, NULL);
}
static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
{
- return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1));
+ return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1) || atomic_read(&sdp->sd_log_freeze));
}
static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
@@ -876,14 +915,14 @@ int gfs2_logd(void *data)
if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
gfs2_ail1_empty(sdp);
- gfs2_log_flush(sdp, NULL);
+ gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
}
if (gfs2_ail_flush_reqd(sdp)) {
gfs2_ail1_start(sdp);
gfs2_ail1_wait(sdp);
gfs2_ail1_empty(sdp);
- gfs2_log_flush(sdp, NULL);
+ gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
}
if (!gfs2_ail_flush_reqd(sdp))
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 37216634f0a..9499a604921 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -63,14 +63,21 @@ extern void gfs2_ordered_del_inode(struct gfs2_inode *ip);
extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
unsigned int ssize);
+extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
-extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
+enum gfs2_flush_type {
+ NORMAL_FLUSH = 0,
+ SYNC_FLUSH,
+ SHUTDOWN_FLUSH,
+ FREEZE_FLUSH
+};
+extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
+ enum gfs2_flush_type type);
extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc);
extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
-extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
extern int gfs2_logd(void *data);
extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
extern void gfs2_write_revokes(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 010b9fb9fec..2c1ae861dc9 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -75,7 +75,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd)
unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number;
struct gfs2_bitmap *bi = rgd->rd_bits + index;
- if (bi->bi_clone == 0)
+ if (bi->bi_clone == NULL)
return;
if (sdp->sd_args.ar_discard)
gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL);
@@ -83,6 +83,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd)
bd->bd_bh->b_data + bi->bi_offset, bi->bi_len);
clear_bit(GBF_FULL, &bi->bi_flags);
rgd->rd_free_clone = rgd->rd_free;
+ rgd->rd_extfail_pt = rgd->rd_free;
}
/**
@@ -145,8 +146,8 @@ static u64 gfs2_log_bmap(struct gfs2_sbd *sdp)
struct gfs2_journal_extent *je;
u64 block;
- list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) {
- if (lbn >= je->lblock && lbn < je->lblock + je->blocks) {
+ list_for_each_entry(je, &sdp->sd_jdesc->extent_list, list) {
+ if ((lbn >= je->lblock) && (lbn < (je->lblock + je->blocks))) {
block = je->dblock + lbn - je->lblock;
gfs2_log_incr_head(sdp);
return block;
@@ -272,7 +273,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno)
nrvecs = max(nrvecs/2, 1U);
}
- bio->bi_sector = blkno * (sb->s_blocksize >> 9);
+ bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9);
bio->bi_bdev = sb->s_bdev;
bio->bi_end_io = gfs2_end_log_write;
bio->bi_private = sdp;
@@ -490,44 +491,40 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit,
gfs2_log_unlock(sdp);
}
-static void buf_lo_before_commit(struct gfs2_sbd *sdp)
+static void buf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
unsigned int limit = buf_limit(sdp); /* 503 for 4k blocks */
-
- gfs2_before_commit(sdp, limit, sdp->sd_log_num_buf,
- &sdp->sd_log_le_buf, 0);
+ unsigned int nbuf;
+ if (tr == NULL)
+ return;
+ nbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm;
+ gfs2_before_commit(sdp, limit, nbuf, &tr->tr_buf, 0);
}
static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
- struct list_head *head = &sdp->sd_log_le_buf;
+ struct list_head *head;
struct gfs2_bufdata *bd;
- if (tr == NULL) {
- gfs2_assert(sdp, list_empty(head));
+ if (tr == NULL)
return;
- }
+ head = &tr->tr_buf;
while (!list_empty(head)) {
bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
list_del_init(&bd->bd_list);
- sdp->sd_log_num_buf--;
-
gfs2_unpin(sdp, bd->bd_bh, tr);
}
- gfs2_assert_warn(sdp, !sdp->sd_log_num_buf);
}
static void buf_lo_before_scan(struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head, int pass)
{
- struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
-
if (pass != 0)
return;
- sdp->sd_found_blocks = 0;
- sdp->sd_replayed_blocks = 0;
+ jd->jd_found_blocks = 0;
+ jd->jd_replayed_blocks = 0;
}
static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
@@ -550,9 +547,9 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
blkno = be64_to_cpu(*ptr++);
- sdp->sd_found_blocks++;
+ jd->jd_found_blocks++;
- if (gfs2_revoke_check(sdp, blkno, start))
+ if (gfs2_revoke_check(jd, blkno, start))
continue;
error = gfs2_replay_read_block(jd, start, &bh_log);
@@ -573,7 +570,7 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
if (error)
break;
- sdp->sd_replayed_blocks++;
+ jd->jd_replayed_blocks++;
}
return error;
@@ -588,8 +585,12 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
static void gfs2_meta_sync(struct gfs2_glock *gl)
{
struct address_space *mapping = gfs2_glock2aspace(gl);
+ struct gfs2_sbd *sdp = gl->gl_sbd;
int error;
+ if (mapping == NULL)
+ mapping = &sdp->sd_aspace;
+
filemap_fdatawrite(mapping);
error = filemap_fdatawait(mapping);
@@ -612,10 +613,10 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
gfs2_meta_sync(ip->i_gl);
fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n",
- jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
+ jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks);
}
-static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
+static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
struct gfs2_meta_header *mh;
unsigned int offset;
@@ -674,13 +675,11 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head, int pass)
{
- struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
-
if (pass != 0)
return;
- sdp->sd_found_revokes = 0;
- sdp->sd_replay_tail = head->lh_tail;
+ jd->jd_found_revokes = 0;
+ jd->jd_replay_tail = head->lh_tail;
}
static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
@@ -712,13 +711,13 @@ static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) {
blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
- error = gfs2_revoke_add(sdp, blkno, start);
+ error = gfs2_revoke_add(jd, blkno, start);
if (error < 0) {
brelse(bh);
return error;
}
else if (error)
- sdp->sd_found_revokes++;
+ jd->jd_found_revokes++;
if (!--revokes)
break;
@@ -738,16 +737,16 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
if (error) {
- gfs2_revoke_clean(sdp);
+ gfs2_revoke_clean(jd);
return;
}
if (pass != 1)
return;
fs_info(sdp, "jid=%u: Found %u revoke tags\n",
- jd->jd_jid, sdp->sd_found_revokes);
+ jd->jd_jid, jd->jd_found_revokes);
- gfs2_revoke_clean(sdp);
+ gfs2_revoke_clean(jd);
}
/**
@@ -755,12 +754,14 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
*
*/
-static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
+static void databuf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
- unsigned int limit = buf_limit(sdp) / 2;
-
- gfs2_before_commit(sdp, limit, sdp->sd_log_num_databuf,
- &sdp->sd_log_le_databuf, 1);
+ unsigned int limit = databuf_limit(sdp);
+ unsigned int nbuf;
+ if (tr == NULL)
+ return;
+ nbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm;
+ gfs2_before_commit(sdp, limit, nbuf, &tr->tr_databuf, 1);
}
static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
@@ -784,9 +785,9 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
blkno = be64_to_cpu(*ptr++);
esc = be64_to_cpu(*ptr++);
- sdp->sd_found_blocks++;
+ jd->jd_found_blocks++;
- if (gfs2_revoke_check(sdp, blkno, start))
+ if (gfs2_revoke_check(jd, blkno, start))
continue;
error = gfs2_replay_read_block(jd, start, &bh_log);
@@ -806,7 +807,7 @@ static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
brelse(bh_log);
brelse(bh_ip);
- sdp->sd_replayed_blocks++;
+ jd->jd_replayed_blocks++;
}
return error;
@@ -830,26 +831,23 @@ static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
gfs2_meta_sync(ip->i_gl);
fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
- jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
+ jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks);
}
static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
- struct list_head *head = &sdp->sd_log_le_databuf;
+ struct list_head *head;
struct gfs2_bufdata *bd;
- if (tr == NULL) {
- gfs2_assert(sdp, list_empty(head));
+ if (tr == NULL)
return;
- }
+ head = &tr->tr_databuf;
while (!list_empty(head)) {
bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
list_del_init(&bd->bd_list);
- sdp->sd_log_num_databuf--;
gfs2_unpin(sdp, bd->bd_bh, tr);
}
- gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
}
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 9ca2e643841..a65a7ba32ff 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -46,12 +46,13 @@ static inline unsigned int databuf_limit(struct gfs2_sbd *sdp)
return limit;
}
-static inline void lops_before_commit(struct gfs2_sbd *sdp)
+static inline void lops_before_commit(struct gfs2_sbd *sdp,
+ struct gfs2_trans *tr)
{
int x;
for (x = 0; gfs2_log_ops[x]; x++)
if (gfs2_log_ops[x]->lo_before_commit)
- gfs2_log_ops[x]->lo_before_commit(sdp);
+ gfs2_log_ops[x]->lo_before_commit(sdp, tr);
}
static inline void lops_after_commit(struct gfs2_sbd *sdp,
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 351586e24e3..82b6ac82965 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -7,6 +7,8 @@
* of the GNU General Public License version 2.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
@@ -31,12 +33,6 @@
struct workqueue_struct *gfs2_control_wq;
-static struct shrinker qd_shrinker = {
- .count_objects = gfs2_qd_shrink_count,
- .scan_objects = gfs2_qd_shrink_scan,
- .seeks = DEFAULT_SEEKS,
-};
-
static void gfs2_init_inode_once(void *foo)
{
struct gfs2_inode *ip = foo;
@@ -82,11 +78,16 @@ static int __init init_gfs2_fs(void)
gfs2_str2qstr(&gfs2_qdot, ".");
gfs2_str2qstr(&gfs2_qdotdot, "..");
+ gfs2_quota_hash_init();
error = gfs2_sys_init();
if (error)
return error;
+ error = list_lru_init(&gfs2_qd_lru);
+ if (error)
+ goto fail_lru;
+
error = gfs2_glock_init();
if (error)
goto fail;
@@ -139,7 +140,7 @@ static int __init init_gfs2_fs(void)
if (!gfs2_rsrv_cachep)
goto fail;
- register_shrinker(&qd_shrinker);
+ register_shrinker(&gfs2_qd_shrinker);
error = register_filesystem(&gfs2_fs_type);
if (error)
@@ -166,7 +167,7 @@ static int __init init_gfs2_fs(void)
gfs2_register_debugfs();
- printk("GFS2 installed\n");
+ pr_info("GFS2 installed\n");
return 0;
@@ -179,7 +180,9 @@ fail_wq:
fail_unregister:
unregister_filesystem(&gfs2_fs_type);
fail:
- unregister_shrinker(&qd_shrinker);
+ list_lru_destroy(&gfs2_qd_lru);
+fail_lru:
+ unregister_shrinker(&gfs2_qd_shrinker);
gfs2_glock_exit();
if (gfs2_rsrv_cachep)
@@ -214,13 +217,14 @@ fail:
static void __exit exit_gfs2_fs(void)
{
- unregister_shrinker(&qd_shrinker);
+ unregister_shrinker(&gfs2_qd_shrinker);
gfs2_glock_exit();
gfs2_unregister_debugfs();
unregister_filesystem(&gfs2_fs_type);
unregister_filesystem(&gfs2meta_fs_type);
destroy_workqueue(gfs_recovery_wq);
destroy_workqueue(gfs2_control_wq);
+ list_lru_destroy(&gfs2_qd_lru);
rcu_barrier();
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 93241505054..b984a6e190b 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -97,6 +97,11 @@ const struct address_space_operations gfs2_meta_aops = {
.releasepage = gfs2_releasepage,
};
+const struct address_space_operations gfs2_rgrp_aops = {
+ .writepage = gfs2_aspace_writepage,
+ .releasepage = gfs2_releasepage,
+};
+
/**
* gfs2_getbuf - Get a buffer with a given address space
* @gl: the glock
@@ -116,6 +121,9 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
unsigned long index;
unsigned int bufnum;
+ if (mapping == NULL)
+ mapping = &sdp->sd_aspace;
+
shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
index = blkno >> shift; /* convert block to page */
bufnum = blkno - (index << shift); /* block buf index within page */
@@ -128,7 +136,8 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
yield();
}
} else {
- page = find_lock_page(mapping, index);
+ page = find_get_page_flags(mapping, index,
+ FGP_LOCK|FGP_ACCESSED);
if (!page)
return NULL;
}
@@ -145,7 +154,6 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
map_bh(bh, sdp->sd_vfs, blkno);
unlock_page(page);
- mark_page_accessed(page);
page_cache_release(page);
return bh;
@@ -258,27 +266,27 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
struct address_space *mapping = bh->b_page->mapping;
struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
struct gfs2_bufdata *bd = bh->b_private;
+ int was_pinned = 0;
if (test_clear_buffer_pinned(bh)) {
trace_gfs2_pin(bd, 0);
atomic_dec(&sdp->sd_log_pinned);
list_del_init(&bd->bd_list);
- if (meta) {
- gfs2_assert_warn(sdp, sdp->sd_log_num_buf);
- sdp->sd_log_num_buf--;
+ if (meta)
tr->tr_num_buf_rm++;
- } else {
- gfs2_assert_warn(sdp, sdp->sd_log_num_databuf);
- sdp->sd_log_num_databuf--;
+ else
tr->tr_num_databuf_rm++;
- }
tr->tr_touched = 1;
+ was_pinned = 1;
brelse(bh);
}
if (bd) {
spin_lock(&sdp->sd_ail_lock);
if (bd->bd_tr) {
gfs2_trans_add_revoke(sdp, bd);
+ } else if (was_pinned) {
+ bh->b_private = NULL;
+ kmem_cache_free(gfs2_bufdata_cachep, bd);
}
spin_unlock(&sdp->sd_ail_lock);
}
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 4823b934208..ac5d8027d33 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -38,12 +38,15 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
}
extern const struct address_space_operations gfs2_meta_aops;
+extern const struct address_space_operations gfs2_rgrp_aops;
static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
{
struct inode *inode = mapping->host;
if (mapping->a_ops == &gfs2_meta_aops)
return (((struct gfs2_glock *)mapping) - 1)->gl_sbd;
+ else if (mapping->a_ops == &gfs2_rgrp_aops)
+ return container_of(mapping, struct gfs2_sbd, sd_aspace);
else
return inode->i_sb->s_fs_info;
}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 19ff5e8c285..bc564c0d6d1 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -7,6 +7,8 @@
* of the GNU General Public License version 2.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -36,6 +38,7 @@
#include "log.h"
#include "quota.h"
#include "dir.h"
+#include "meta_io.h"
#include "trace_gfs2.h"
#define DO 0
@@ -51,7 +54,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
{
spin_lock_init(&gt->gt_spin);
- gt->gt_quota_simul_sync = 64;
gt->gt_quota_warn_period = 10;
gt->gt_quota_scale_num = 1;
gt->gt_quota_scale_den = 1;
@@ -63,6 +65,7 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
static struct gfs2_sbd *init_sbd(struct super_block *sb)
{
struct gfs2_sbd *sdp;
+ struct address_space *mapping;
sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL);
if (!sdp)
@@ -91,18 +94,30 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
INIT_LIST_HEAD(&sdp->sd_jindex_list);
spin_lock_init(&sdp->sd_jindex_spin);
mutex_init(&sdp->sd_jindex_mutex);
+ init_completion(&sdp->sd_journal_ready);
INIT_LIST_HEAD(&sdp->sd_quota_list);
mutex_init(&sdp->sd_quota_mutex);
+ mutex_init(&sdp->sd_quota_sync_mutex);
init_waitqueue_head(&sdp->sd_quota_wait);
INIT_LIST_HEAD(&sdp->sd_trunc_list);
spin_lock_init(&sdp->sd_trunc_lock);
+ spin_lock_init(&sdp->sd_bitmap_lock);
+
+ mapping = &sdp->sd_aspace;
+
+ address_space_init_once(mapping);
+ mapping->a_ops = &gfs2_rgrp_aops;
+ mapping->host = sb->s_bdev->bd_inode;
+ mapping->flags = 0;
+ mapping_set_gfp_mask(mapping, GFP_NOFS);
+ mapping->private_data = NULL;
+ mapping->backing_dev_info = sb->s_bdi;
+ mapping->writeback_index = 0;
spin_lock_init(&sdp->sd_log_lock);
atomic_set(&sdp->sd_log_pinned, 0);
- INIT_LIST_HEAD(&sdp->sd_log_le_buf);
INIT_LIST_HEAD(&sdp->sd_log_le_revoke);
- INIT_LIST_HEAD(&sdp->sd_log_le_databuf);
INIT_LIST_HEAD(&sdp->sd_log_le_ordered);
spin_lock_init(&sdp->sd_ordered_lock);
@@ -115,8 +130,10 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
init_rwsem(&sdp->sd_log_flush_lock);
atomic_set(&sdp->sd_log_in_flight, 0);
init_waitqueue_head(&sdp->sd_log_flush_wait);
-
- INIT_LIST_HEAD(&sdp->sd_revoke_list);
+ init_waitqueue_head(&sdp->sd_log_frozen_wait);
+ atomic_set(&sdp->sd_log_freeze, 0);
+ atomic_set(&sdp->sd_frozen_root, 0);
+ init_waitqueue_head(&sdp->sd_frozen_root_wait);
return sdp;
}
@@ -140,7 +157,7 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent)
if (sb->sb_magic != GFS2_MAGIC ||
sb->sb_type != GFS2_METATYPE_SB) {
if (!silent)
- printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n");
+ pr_warn("not a GFS2 filesystem\n");
return -EINVAL;
}
@@ -162,7 +179,7 @@ static void end_bio_io_page(struct bio *bio, int error)
if (!error)
SetPageUptodate(page);
else
- printk(KERN_WARNING "gfs2: error %d reading superblock\n", error);
+ pr_warn("error %d reading superblock\n", error);
unlock_page(page);
}
@@ -217,14 +234,14 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
page = alloc_page(GFP_NOFS);
if (unlikely(!page))
- return -ENOBUFS;
+ return -ENOMEM;
ClearPageUptodate(page);
ClearPageDirty(page);
lock_page(page);
bio = bio_alloc(GFP_NOFS, 1);
- bio->bi_sector = sector * (sb->s_blocksize >> 9);
+ bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9);
bio->bi_bdev = sb->s_bdev;
bio_add_page(bio, page, PAGE_SIZE, 0);
@@ -407,8 +424,8 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
goto fail_live;
}
- error = gfs2_glock_get(sdp, GFS2_TRANS_LOCK, &gfs2_trans_glops,
- CREATE, &sdp->sd_trans_gl);
+ error = gfs2_glock_get(sdp, GFS2_FREEZE_LOCK, &gfs2_freeze_glops,
+ CREATE, &sdp->sd_freeze_gl);
if (error) {
fs_err(sdp, "can't create transaction glock: %d\n", error);
goto fail_rename;
@@ -417,7 +434,7 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
return 0;
fail_trans:
- gfs2_glock_put(sdp->sd_trans_gl);
+ gfs2_glock_put(sdp->sd_freeze_gl);
fail_rename:
gfs2_glock_put(sdp->sd_rename_gl);
fail_live:
@@ -505,67 +522,6 @@ out:
return ret;
}
-/**
- * map_journal_extents - create a reusable "extent" mapping from all logical
- * blocks to all physical blocks for the given journal. This will save
- * us time when writing journal blocks. Most journals will have only one
- * extent that maps all their logical blocks. That's because gfs2.mkfs
- * arranges the journal blocks sequentially to maximize performance.
- * So the extent would map the first block for the entire file length.
- * However, gfs2_jadd can happen while file activity is happening, so
- * those journals may not be sequential. Less likely is the case where
- * the users created their own journals by mounting the metafs and
- * laying it out. But it's still possible. These journals might have
- * several extents.
- *
- * TODO: This should be done in bigger chunks rather than one block at a time,
- * but since it's only done at mount time, I'm not worried about the
- * time it takes.
- */
-static int map_journal_extents(struct gfs2_sbd *sdp)
-{
- struct gfs2_jdesc *jd = sdp->sd_jdesc;
- unsigned int lb;
- u64 db, prev_db; /* logical block, disk block, prev disk block */
- struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
- struct gfs2_journal_extent *jext = NULL;
- struct buffer_head bh;
- int rc = 0;
-
- prev_db = 0;
-
- for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) {
- bh.b_state = 0;
- bh.b_blocknr = 0;
- bh.b_size = 1 << ip->i_inode.i_blkbits;
- rc = gfs2_block_map(jd->jd_inode, lb, &bh, 0);
- db = bh.b_blocknr;
- if (rc || !db) {
- printk(KERN_INFO "GFS2 journal mapping error %d: lb="
- "%u db=%llu\n", rc, lb, (unsigned long long)db);
- break;
- }
- if (!prev_db || db != prev_db + 1) {
- jext = kzalloc(sizeof(struct gfs2_journal_extent),
- GFP_KERNEL);
- if (!jext) {
- printk(KERN_INFO "GFS2 error: out of memory "
- "mapping journal extents.\n");
- rc = -ENOMEM;
- break;
- }
- jext->dblock = db;
- jext->lblock = lb;
- jext->blocks = 1;
- list_add_tail(&jext->extent_list, &jd->extent_list);
- } else {
- jext->blocks++;
- }
- prev_db = db;
- }
- return rc;
-}
-
static void gfs2_others_may_mount(struct gfs2_sbd *sdp)
{
char *message = "FIRSTMOUNT=Done";
@@ -624,6 +580,8 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
break;
INIT_LIST_HEAD(&jd->extent_list);
+ INIT_LIST_HEAD(&jd->jd_revoke_list);
+
INIT_WORK(&jd->jd_work, gfs2_recover_func);
jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
@@ -767,7 +725,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5);
/* Map the extents for this journal's blocks */
- map_journal_extents(sdp);
+ gfs2_map_journal_extents(sdp, sdp->sd_jdesc);
}
trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free));
@@ -802,7 +760,15 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags);
gfs2_glock_dq_uninit(&ji_gh);
jindex = 0;
-
+ if (!sdp->sd_args.ar_spectator) {
+ error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0,
+ &sdp->sd_thaw_gh);
+ if (error) {
+ fs_err(sdp, "can't acquire freeze glock: %d\n", error);
+ goto fail_jinode_gh;
+ }
+ }
+ gfs2_glock_dq_uninit(&sdp->sd_thaw_gh);
return 0;
fail_jinode_gh:
@@ -831,6 +797,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
goto fail_qinode;
error = init_journal(sdp, undo);
+ complete_all(&sdp->sd_journal_ready);
if (error)
goto fail;
@@ -956,40 +923,6 @@ fail:
return error;
}
-static int init_threads(struct gfs2_sbd *sdp, int undo)
-{
- struct task_struct *p;
- int error = 0;
-
- if (undo)
- goto fail_quotad;
-
- p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
- if (IS_ERR(p)) {
- error = PTR_ERR(p);
- fs_err(sdp, "can't start logd thread: %d\n", error);
- return error;
- }
- sdp->sd_logd_process = p;
-
- p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
- if (IS_ERR(p)) {
- error = PTR_ERR(p);
- fs_err(sdp, "can't start quotad thread: %d\n", error);
- goto fail;
- }
- sdp->sd_quotad_process = p;
-
- return 0;
-
-
-fail_quotad:
- kthread_stop(sdp->sd_quotad_process);
-fail:
- kthread_stop(sdp->sd_logd_process);
- return error;
-}
-
static const match_table_t nolock_tokens = {
{ Opt_jid, "jid=%d\n", },
{ Opt_err, NULL },
@@ -1028,7 +961,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
lm = &gfs2_dlm_ops;
#endif
} else {
- printk(KERN_INFO "GFS2: can't find protocol %s\n", proto);
+ pr_info("can't find protocol %s\n", proto);
return -ENOENT;
}
@@ -1135,7 +1068,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
sdp = init_sbd(sb);
if (!sdp) {
- printk(KERN_WARNING "GFS2: can't alloc struct gfs2_sbd\n");
+ pr_warn("can't alloc struct gfs2_sbd\n");
return -ENOMEM;
}
sdp->sd_args = *args;
@@ -1254,15 +1187,11 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
goto fail_per_node;
}
- error = init_threads(sdp, DO);
- if (error)
- goto fail_per_node;
-
if (!(sb->s_flags & MS_RDONLY)) {
error = gfs2_make_fs_rw(sdp);
if (error) {
fs_err(sdp, "can't make FS RW: %d\n", error);
- goto fail_threads;
+ goto fail_per_node;
}
}
@@ -1270,8 +1199,6 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
gfs2_online_uevent(sdp);
return 0;
-fail_threads:
- init_threads(sdp, UNDO);
fail_per_node:
init_per_node(sdp, UNDO);
fail_inodes:
@@ -1287,6 +1214,7 @@ fail_sb:
fail_locking:
init_locking(sdp, &mount_gh, UNDO);
fail_lm:
+ complete_all(&sdp->sd_journal_ready);
gfs2_gl_hash_clear(sdp);
gfs2_lm_unmount(sdp);
fail_debug:
@@ -1366,8 +1294,18 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
if (IS_ERR(s))
goto error_bdev;
- if (s->s_root)
+ if (s->s_root) {
+ /*
+ * s_umount nests inside bd_mutex during
+ * __invalidate_device(). blkdev_put() acquires
+ * bd_mutex and can't be called under s_umount. Drop
+ * s_umount temporarily. This is safe as we're
+ * holding an active reference.
+ */
+ up_write(&s->s_umount);
blkdev_put(bdev, mode);
+ down_write(&s->s_umount);
+ }
memset(&args, 0, sizeof(args));
args.ar_quota = GFS2_QUOTA_DEFAULT;
@@ -1379,7 +1317,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
error = gfs2_mount_args(&args, data);
if (error) {
- printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
+ pr_warn("can't parse mount arguments\n");
goto error_super;
}
@@ -1429,15 +1367,15 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
error = kern_path(dev_name, LOOKUP_FOLLOW, &path);
if (error) {
- printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
- dev_name, error);
+ pr_warn("path_lookup on %s returned error %d\n",
+ dev_name, error);
return ERR_PTR(error);
}
s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags,
path.dentry->d_inode->i_sb->s_bdev);
path_put(&path);
if (IS_ERR(s)) {
- printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
+ pr_warn("gfs2 mount does not exist\n");
return ERR_CAST(s);
}
if ((flags ^ s->s_flags) & MS_RDONLY) {
@@ -1457,7 +1395,7 @@ static void gfs2_kill_sb(struct super_block *sb)
return;
}
- gfs2_meta_syncfs(sdp);
+ gfs2_log_flush(sdp, NULL, SYNC_FLUSH);
dput(sdp->sd_root_dir);
dput(sdp->sd_master_dir);
sdp->sd_root_dir = NULL;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index db441359ee8..64b29f7f6b4 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -36,6 +36,8 @@
* the quota file, so it is not being constantly read.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/mm.h>
@@ -50,6 +52,13 @@
#include <linux/freezer.h>
#include <linux/quota.h>
#include <linux/dqblk_xfs.h>
+#include <linux/lockref.h>
+#include <linux/list_lru.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist_bl.h>
+#include <linux/bit_spinlock.h>
+#include <linux/jhash.h>
+#include <linux/vmalloc.h>
#include "gfs2.h"
#include "incore.h"
@@ -65,35 +74,63 @@
#include "inode.h"
#include "util.h"
-struct gfs2_quota_change_host {
- u64 qc_change;
- u32 qc_flags; /* GFS2_QCF_... */
- struct kqid qc_id;
-};
+#define GFS2_QD_HASH_SHIFT 12
+#define GFS2_QD_HASH_SIZE (1 << GFS2_QD_HASH_SHIFT)
+#define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1)
-static LIST_HEAD(qd_lru_list);
-static atomic_t qd_lru_count = ATOMIC_INIT(0);
-static DEFINE_SPINLOCK(qd_lru_lock);
+/* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */
+/* -> sd_bitmap_lock */
+static DEFINE_SPINLOCK(qd_lock);
+struct list_lru gfs2_qd_lru;
-unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
- struct shrink_control *sc)
+static struct hlist_bl_head qd_hash_table[GFS2_QD_HASH_SIZE];
+
+static unsigned int gfs2_qd_hash(const struct gfs2_sbd *sdp,
+ const struct kqid qid)
+{
+ unsigned int h;
+
+ h = jhash(&sdp, sizeof(struct gfs2_sbd *), 0);
+ h = jhash(&qid, sizeof(struct kqid), h);
+
+ return h & GFS2_QD_HASH_MASK;
+}
+
+static inline void spin_lock_bucket(unsigned int hash)
+{
+ hlist_bl_lock(&qd_hash_table[hash]);
+}
+
+static inline void spin_unlock_bucket(unsigned int hash)
+{
+ hlist_bl_unlock(&qd_hash_table[hash]);
+}
+
+static void gfs2_qd_dealloc(struct rcu_head *rcu)
+{
+ struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu);
+ kmem_cache_free(gfs2_quotad_cachep, qd);
+}
+
+static void gfs2_qd_dispose(struct list_head *list)
{
struct gfs2_quota_data *qd;
struct gfs2_sbd *sdp;
- int nr_to_scan = sc->nr_to_scan;
- long freed = 0;
-
- if (!(sc->gfp_mask & __GFP_FS))
- return SHRINK_STOP;
- spin_lock(&qd_lru_lock);
- while (nr_to_scan && !list_empty(&qd_lru_list)) {
- qd = list_entry(qd_lru_list.next,
- struct gfs2_quota_data, qd_reclaim);
+ while (!list_empty(list)) {
+ qd = list_entry(list->next, struct gfs2_quota_data, qd_lru);
sdp = qd->qd_gl->gl_sbd;
+ list_del(&qd->qd_lru);
+
/* Free from the filesystem-specific list */
+ spin_lock(&qd_lock);
list_del(&qd->qd_list);
+ spin_unlock(&qd_lock);
+
+ spin_lock_bucket(qd->qd_hash);
+ hlist_bl_del_rcu(&qd->qd_hlist);
+ spin_unlock_bucket(qd->qd_hash);
gfs2_assert_warn(sdp, !qd->qd_change);
gfs2_assert_warn(sdp, !qd->qd_slot_count);
@@ -103,24 +140,59 @@ unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
atomic_dec(&sdp->sd_quota_count);
/* Delete it from the common reclaim list */
- list_del_init(&qd->qd_reclaim);
- atomic_dec(&qd_lru_count);
- spin_unlock(&qd_lru_lock);
- kmem_cache_free(gfs2_quotad_cachep, qd);
- spin_lock(&qd_lru_lock);
- nr_to_scan--;
- freed++;
+ call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
}
- spin_unlock(&qd_lru_lock);
+}
+
+
+static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, void *arg)
+{
+ struct list_head *dispose = arg;
+ struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru);
+
+ if (!spin_trylock(&qd->qd_lockref.lock))
+ return LRU_SKIP;
+
+ if (qd->qd_lockref.count == 0) {
+ lockref_mark_dead(&qd->qd_lockref);
+ list_move(&qd->qd_lru, dispose);
+ }
+
+ spin_unlock(&qd->qd_lockref.lock);
+ return LRU_REMOVED;
+}
+
+static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
+ struct shrink_control *sc)
+{
+ LIST_HEAD(dispose);
+ unsigned long freed;
+
+ if (!(sc->gfp_mask & __GFP_FS))
+ return SHRINK_STOP;
+
+ freed = list_lru_walk_node(&gfs2_qd_lru, sc->nid, gfs2_qd_isolate,
+ &dispose, &sc->nr_to_scan);
+
+ gfs2_qd_dispose(&dispose);
+
return freed;
}
-unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
- struct shrink_control *sc)
+static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
+ struct shrink_control *sc)
{
- return vfs_pressure_ratio(atomic_read(&qd_lru_count));
+ return vfs_pressure_ratio(list_lru_count_node(&gfs2_qd_lru, sc->nid));
}
+struct shrinker gfs2_qd_shrinker = {
+ .count_objects = gfs2_qd_shrink_count,
+ .scan_objects = gfs2_qd_shrink_scan,
+ .seeks = DEFAULT_SEEKS,
+ .flags = SHRINKER_NUMA_AWARE,
+};
+
+
static u64 qd2index(struct gfs2_quota_data *qd)
{
struct kqid qid = qd->qd_id;
@@ -138,168 +210,158 @@ static u64 qd2offset(struct gfs2_quota_data *qd)
return offset;
}
-static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid,
- struct gfs2_quota_data **qdp)
+static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid)
{
struct gfs2_quota_data *qd;
int error;
qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS);
if (!qd)
- return -ENOMEM;
+ return NULL;
- atomic_set(&qd->qd_count, 1);
+ qd->qd_sbd = sdp;
+ qd->qd_lockref.count = 1;
+ spin_lock_init(&qd->qd_lockref.lock);
qd->qd_id = qid;
qd->qd_slot = -1;
- INIT_LIST_HEAD(&qd->qd_reclaim);
+ INIT_LIST_HEAD(&qd->qd_lru);
+ qd->qd_hash = hash;
error = gfs2_glock_get(sdp, qd2index(qd),
&gfs2_quota_glops, CREATE, &qd->qd_gl);
if (error)
goto fail;
- *qdp = qd;
-
- return 0;
+ return qd;
fail:
kmem_cache_free(gfs2_quotad_cachep, qd);
- return error;
+ return NULL;
}
-static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
- struct gfs2_quota_data **qdp)
+static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash,
+ const struct gfs2_sbd *sdp,
+ struct kqid qid)
{
- struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
- int error, found;
-
- *qdp = NULL;
+ struct gfs2_quota_data *qd;
+ struct hlist_bl_node *h;
- for (;;) {
- found = 0;
- spin_lock(&qd_lru_lock);
- list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
- if (qid_eq(qd->qd_id, qid)) {
- if (!atomic_read(&qd->qd_count) &&
- !list_empty(&qd->qd_reclaim)) {
- /* Remove it from reclaim list */
- list_del_init(&qd->qd_reclaim);
- atomic_dec(&qd_lru_count);
- }
- atomic_inc(&qd->qd_count);
- found = 1;
- break;
- }
+ hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) {
+ if (!qid_eq(qd->qd_id, qid))
+ continue;
+ if (qd->qd_sbd != sdp)
+ continue;
+ if (lockref_get_not_dead(&qd->qd_lockref)) {
+ list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
+ return qd;
}
+ }
- if (!found)
- qd = NULL;
+ return NULL;
+}
- if (!qd && new_qd) {
- qd = new_qd;
- list_add(&qd->qd_list, &sdp->sd_quota_list);
- atomic_inc(&sdp->sd_quota_count);
- new_qd = NULL;
- }
- spin_unlock(&qd_lru_lock);
+static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
+ struct gfs2_quota_data **qdp)
+{
+ struct gfs2_quota_data *qd, *new_qd;
+ unsigned int hash = gfs2_qd_hash(sdp, qid);
+
+ rcu_read_lock();
+ *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
+ rcu_read_unlock();
- if (qd) {
- if (new_qd) {
- gfs2_glock_put(new_qd->qd_gl);
- kmem_cache_free(gfs2_quotad_cachep, new_qd);
- }
- *qdp = qd;
- return 0;
- }
+ if (qd)
+ return 0;
- error = qd_alloc(sdp, qid, &new_qd);
- if (error)
- return error;
+ new_qd = qd_alloc(hash, sdp, qid);
+ if (!new_qd)
+ return -ENOMEM;
+
+ spin_lock(&qd_lock);
+ spin_lock_bucket(hash);
+ *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
+ if (qd == NULL) {
+ *qdp = new_qd;
+ list_add(&new_qd->qd_list, &sdp->sd_quota_list);
+ hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]);
+ atomic_inc(&sdp->sd_quota_count);
+ }
+ spin_unlock_bucket(hash);
+ spin_unlock(&qd_lock);
+
+ if (qd) {
+ gfs2_glock_put(new_qd->qd_gl);
+ kmem_cache_free(gfs2_quotad_cachep, new_qd);
}
+
+ return 0;
}
+
static void qd_hold(struct gfs2_quota_data *qd)
{
struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
- gfs2_assert(sdp, atomic_read(&qd->qd_count));
- atomic_inc(&qd->qd_count);
+ gfs2_assert(sdp, !__lockref_is_dead(&qd->qd_lockref));
+ lockref_get(&qd->qd_lockref);
}
static void qd_put(struct gfs2_quota_data *qd)
{
- if (atomic_dec_and_lock(&qd->qd_count, &qd_lru_lock)) {
- /* Add to the reclaim list */
- list_add_tail(&qd->qd_reclaim, &qd_lru_list);
- atomic_inc(&qd_lru_count);
- spin_unlock(&qd_lru_lock);
- }
+ if (lockref_put_or_lock(&qd->qd_lockref))
+ return;
+
+ qd->qd_lockref.count = 0;
+ list_lru_add(&gfs2_qd_lru, &qd->qd_lru);
+ spin_unlock(&qd->qd_lockref.lock);
+
}
static int slot_get(struct gfs2_quota_data *qd)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
- unsigned int c, o = 0, b;
- unsigned char byte = 0;
+ struct gfs2_sbd *sdp = qd->qd_sbd;
+ unsigned int bit;
+ int error = 0;
- spin_lock(&qd_lru_lock);
+ spin_lock(&sdp->sd_bitmap_lock);
+ if (qd->qd_slot_count != 0)
+ goto out;
- if (qd->qd_slot_count++) {
- spin_unlock(&qd_lru_lock);
- return 0;
+ error = -ENOSPC;
+ bit = find_first_zero_bit(sdp->sd_quota_bitmap, sdp->sd_quota_slots);
+ if (bit < sdp->sd_quota_slots) {
+ set_bit(bit, sdp->sd_quota_bitmap);
+ qd->qd_slot = bit;
+ error = 0;
+out:
+ qd->qd_slot_count++;
}
+ spin_unlock(&sdp->sd_bitmap_lock);
- for (c = 0; c < sdp->sd_quota_chunks; c++)
- for (o = 0; o < PAGE_SIZE; o++) {
- byte = sdp->sd_quota_bitmap[c][o];
- if (byte != 0xFF)
- goto found;
- }
-
- goto fail;
-
-found:
- for (b = 0; b < 8; b++)
- if (!(byte & (1 << b)))
- break;
- qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b;
-
- if (qd->qd_slot >= sdp->sd_quota_slots)
- goto fail;
-
- sdp->sd_quota_bitmap[c][o] |= 1 << b;
-
- spin_unlock(&qd_lru_lock);
-
- return 0;
-
-fail:
- qd->qd_slot_count--;
- spin_unlock(&qd_lru_lock);
- return -ENOSPC;
+ return error;
}
static void slot_hold(struct gfs2_quota_data *qd)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_sbd;
- spin_lock(&qd_lru_lock);
+ spin_lock(&sdp->sd_bitmap_lock);
gfs2_assert(sdp, qd->qd_slot_count);
qd->qd_slot_count++;
- spin_unlock(&qd_lru_lock);
+ spin_unlock(&sdp->sd_bitmap_lock);
}
static void slot_put(struct gfs2_quota_data *qd)
{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+ struct gfs2_sbd *sdp = qd->qd_sbd;
- spin_lock(&qd_lru_lock);
+ spin_lock(&sdp->sd_bitmap_lock);
gfs2_assert(sdp, qd->qd_slot_count);
if (!--qd->qd_slot_count) {
- gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0);
+ BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap));
qd->qd_slot = -1;
}
- spin_unlock(&qd_lru_lock);
+ spin_unlock(&sdp->sd_bitmap_lock);
}
static int bh_get(struct gfs2_quota_data *qd)
@@ -363,6 +425,24 @@ static void bh_put(struct gfs2_quota_data *qd)
mutex_unlock(&sdp->sd_quota_mutex);
}
+static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd,
+ u64 *sync_gen)
+{
+ if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
+ !test_bit(QDF_CHANGE, &qd->qd_flags) ||
+ (sync_gen && (qd->qd_sync_gen >= *sync_gen)))
+ return 0;
+
+ if (!lockref_get_not_dead(&qd->qd_lockref))
+ return 0;
+
+ list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
+ set_bit(QDF_LOCKED, &qd->qd_flags);
+ qd->qd_change_sync = qd->qd_change;
+ slot_hold(qd);
+ return 1;
+}
+
static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
{
struct gfs2_quota_data *qd = NULL;
@@ -374,31 +454,18 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
if (sdp->sd_vfs->s_flags & MS_RDONLY)
return 0;
- spin_lock(&qd_lru_lock);
+ spin_lock(&qd_lock);
list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
- if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
- !test_bit(QDF_CHANGE, &qd->qd_flags) ||
- qd->qd_sync_gen >= sdp->sd_quota_sync_gen)
- continue;
-
- list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
-
- set_bit(QDF_LOCKED, &qd->qd_flags);
- gfs2_assert_warn(sdp, atomic_read(&qd->qd_count));
- atomic_inc(&qd->qd_count);
- qd->qd_change_sync = qd->qd_change;
- gfs2_assert_warn(sdp, qd->qd_slot_count);
- qd->qd_slot_count++;
- found = 1;
-
- break;
+ found = qd_check_sync(sdp, qd, &sdp->sd_quota_sync_gen);
+ if (found)
+ break;
}
if (!found)
qd = NULL;
- spin_unlock(&qd_lru_lock);
+ spin_unlock(&qd_lock);
if (qd) {
gfs2_assert_warn(sdp, qd->qd_change_sync);
@@ -416,43 +483,6 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
return 0;
}
-static int qd_trylock(struct gfs2_quota_data *qd)
-{
- struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
-
- if (sdp->sd_vfs->s_flags & MS_RDONLY)
- return 0;
-
- spin_lock(&qd_lru_lock);
-
- if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
- !test_bit(QDF_CHANGE, &qd->qd_flags)) {
- spin_unlock(&qd_lru_lock);
- return 0;
- }
-
- list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
-
- set_bit(QDF_LOCKED, &qd->qd_flags);
- gfs2_assert_warn(sdp, atomic_read(&qd->qd_count));
- atomic_inc(&qd->qd_count);
- qd->qd_change_sync = qd->qd_change;
- gfs2_assert_warn(sdp, qd->qd_slot_count);
- qd->qd_slot_count++;
-
- spin_unlock(&qd_lru_lock);
-
- gfs2_assert_warn(sdp, qd->qd_change_sync);
- if (bh_get(qd)) {
- clear_bit(QDF_LOCKED, &qd->qd_flags);
- slot_put(qd);
- qd_put(qd);
- return 0;
- }
-
- return 1;
-}
-
static void qd_unlock(struct gfs2_quota_data *qd)
{
gfs2_assert_warn(qd->qd_gl->gl_sbd,
@@ -602,9 +632,9 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
x = be64_to_cpu(qc->qc_change) + change;
qc->qc_change = cpu_to_be64(x);
- spin_lock(&qd_lru_lock);
+ spin_lock(&qd_lock);
qd->qd_change = x;
- spin_unlock(&qd_lru_lock);
+ spin_unlock(&qd_lock);
if (!x) {
gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags));
@@ -648,7 +678,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
struct buffer_head *bh;
struct page *page;
void *kaddr, *ptr;
- struct gfs2_quota q, *qp;
+ struct gfs2_quota q;
int err, nbytes;
u64 size;
@@ -664,28 +694,25 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
return err;
err = -EIO;
- qp = &q;
- qp->qu_value = be64_to_cpu(qp->qu_value);
- qp->qu_value += change;
- qp->qu_value = cpu_to_be64(qp->qu_value);
- qd->qd_qb.qb_value = qp->qu_value;
+ be64_add_cpu(&q.qu_value, change);
+ qd->qd_qb.qb_value = q.qu_value;
if (fdq) {
if (fdq->d_fieldmask & FS_DQ_BSOFT) {
- qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift);
- qd->qd_qb.qb_warn = qp->qu_warn;
+ q.qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift);
+ qd->qd_qb.qb_warn = q.qu_warn;
}
if (fdq->d_fieldmask & FS_DQ_BHARD) {
- qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
- qd->qd_qb.qb_limit = qp->qu_limit;
+ q.qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
+ qd->qd_qb.qb_limit = q.qu_limit;
}
if (fdq->d_fieldmask & FS_DQ_BCOUNT) {
- qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
- qd->qd_qb.qb_value = qp->qu_value;
+ q.qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
+ qd->qd_qb.qb_value = q.qu_value;
}
}
/* Write the quota into the quota file on disk */
- ptr = qp;
+ ptr = &q;
nbytes = sizeof(struct gfs2_quota);
get_a_page:
page = find_or_create_page(mapping, index, GFP_NOFS);
@@ -751,6 +778,7 @@ get_a_page:
i_size_write(inode, size);
inode->i_mtime = inode->i_atime = CURRENT_TIME;
mark_inode_dirty(inode);
+ set_bit(QDF_REFRESH, &qd->qd_flags);
return 0;
unlock_out:
@@ -763,6 +791,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
{
struct gfs2_sbd *sdp = (*qda)->qd_gl->gl_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
+ struct gfs2_alloc_parms ap = { .aflags = 0, };
unsigned int data_blocks, ind_blocks;
struct gfs2_holder *ghs, i_gh;
unsigned int qx, x;
@@ -815,7 +844,8 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3;
reserved = 1 + (nalloc * (data_blocks + ind_blocks));
- error = gfs2_inplace_reserve(ip, reserved, 0);
+ ap.target = reserved;
+ error = gfs2_inplace_reserve(ip, &ap);
if (error)
goto out_alloc;
@@ -850,7 +880,7 @@ out:
gfs2_glock_dq_uninit(&ghs[qx]);
mutex_unlock(&ip->i_inode.i_mutex);
kfree(ghs);
- gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
+ gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl, NORMAL_FLUSH);
return error;
}
@@ -974,9 +1004,9 @@ static int need_sync(struct gfs2_quota_data *qd)
if (!qd->qd_qb.qb_limit)
return 0;
- spin_lock(&qd_lru_lock);
+ spin_lock(&qd_lock);
value = qd->qd_change;
- spin_unlock(&qd_lru_lock);
+ spin_unlock(&qd_lock);
spin_lock(&gt->gt_spin);
num = gt->gt_quota_scale_num;
@@ -1001,9 +1031,11 @@ static int need_sync(struct gfs2_quota_data *qd)
void gfs2_quota_unlock(struct gfs2_inode *ip)
{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_quota_data *qda[4];
unsigned int count = 0;
unsigned int x;
+ int found;
if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
goto out;
@@ -1016,9 +1048,25 @@ void gfs2_quota_unlock(struct gfs2_inode *ip)
sync = need_sync(qd);
gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]);
+ if (!sync)
+ continue;
+
+ spin_lock(&qd_lock);
+ found = qd_check_sync(sdp, qd, NULL);
+ spin_unlock(&qd_lock);
+
+ if (!found)
+ continue;
- if (sync && qd_trylock(qd))
- qda[count++] = qd;
+ gfs2_assert_warn(sdp, qd->qd_change_sync);
+ if (bh_get(qd)) {
+ clear_bit(QDF_LOCKED, &qd->qd_flags);
+ slot_put(qd);
+ qd_put(qd);
+ continue;
+ }
+
+ qda[count++] = qd;
}
if (count) {
@@ -1037,10 +1085,10 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
{
struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
- printk(KERN_INFO "GFS2: fsid=%s: quota %s for %s %u\n",
- sdp->sd_fsname, type,
- (qd->qd_id.type == USRQUOTA) ? "user" : "group",
- from_kqid(&init_user_ns, qd->qd_id));
+ fs_info(sdp, "quota %s for %s %u\n",
+ type,
+ (qd->qd_id.type == USRQUOTA) ? "user" : "group",
+ from_kqid(&init_user_ns, qd->qd_id));
return 0;
}
@@ -1067,9 +1115,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
continue;
value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
- spin_lock(&qd_lru_lock);
+ spin_lock(&qd_lock);
value += qd->qd_change;
- spin_unlock(&qd_lru_lock);
+ spin_unlock(&qd_lock);
if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
print_message(qd, "exceeded");
@@ -1118,17 +1166,18 @@ int gfs2_quota_sync(struct super_block *sb, int type)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
struct gfs2_quota_data **qda;
- unsigned int max_qd = gfs2_tune_get(sdp, gt_quota_simul_sync);
+ unsigned int max_qd = PAGE_SIZE/sizeof(struct gfs2_holder);
unsigned int num_qd;
unsigned int x;
int error = 0;
- sdp->sd_quota_sync_gen++;
-
qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL);
if (!qda)
return -ENOMEM;
+ mutex_lock(&sdp->sd_quota_sync_mutex);
+ sdp->sd_quota_sync_gen++;
+
do {
num_qd = 0;
@@ -1153,6 +1202,7 @@ int gfs2_quota_sync(struct super_block *sb, int type)
}
} while (!error && num_qd == max_qd);
+ mutex_unlock(&sdp->sd_quota_sync_mutex);
kfree(qda);
return error;
@@ -1176,17 +1226,6 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid)
return error;
}
-static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *buf)
-{
- const struct gfs2_quota_change *str = buf;
-
- qc->qc_change = be64_to_cpu(str->qc_change);
- qc->qc_flags = be32_to_cpu(str->qc_flags);
- qc->qc_id = make_kqid(&init_user_ns,
- (qc->qc_flags & GFS2_QCF_USER)?USRQUOTA:GRPQUOTA,
- be32_to_cpu(str->qc_id));
-}
-
int gfs2_quota_init(struct gfs2_sbd *sdp)
{
struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
@@ -1194,6 +1233,8 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
unsigned int x, slot = 0;
unsigned int found = 0;
+ unsigned int hash;
+ unsigned int bm_size;
u64 dblock;
u32 extlen = 0;
int error;
@@ -1202,23 +1243,19 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
return -EIO;
sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
- sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
-
+ bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long));
+ bm_size *= sizeof(unsigned long);
error = -ENOMEM;
-
- sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks,
- sizeof(unsigned char *), GFP_NOFS);
+ sdp->sd_quota_bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN);
+ if (sdp->sd_quota_bitmap == NULL)
+ sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS |
+ __GFP_ZERO, PAGE_KERNEL);
if (!sdp->sd_quota_bitmap)
return error;
- for (x = 0; x < sdp->sd_quota_chunks; x++) {
- sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_NOFS);
- if (!sdp->sd_quota_bitmap[x])
- goto fail;
- }
-
for (x = 0; x < blocks; x++) {
struct buffer_head *bh;
+ const struct gfs2_quota_change *qc;
unsigned int y;
if (!extlen) {
@@ -1236,33 +1273,41 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
goto fail;
}
+ qc = (const struct gfs2_quota_change *)(bh->b_data + sizeof(struct gfs2_meta_header));
for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
y++, slot++) {
- struct gfs2_quota_change_host qc;
struct gfs2_quota_data *qd;
-
- gfs2_quota_change_in(&qc, bh->b_data +
- sizeof(struct gfs2_meta_header) +
- y * sizeof(struct gfs2_quota_change));
- if (!qc.qc_change)
+ s64 qc_change = be64_to_cpu(qc->qc_change);
+ u32 qc_flags = be32_to_cpu(qc->qc_flags);
+ enum quota_type qtype = (qc_flags & GFS2_QCF_USER) ?
+ USRQUOTA : GRPQUOTA;
+ struct kqid qc_id = make_kqid(&init_user_ns, qtype,
+ be32_to_cpu(qc->qc_id));
+ qc++;
+ if (!qc_change)
continue;
- error = qd_alloc(sdp, qc.qc_id, &qd);
- if (error) {
+ hash = gfs2_qd_hash(sdp, qc_id);
+ qd = qd_alloc(hash, sdp, qc_id);
+ if (qd == NULL) {
brelse(bh);
goto fail;
}
set_bit(QDF_CHANGE, &qd->qd_flags);
- qd->qd_change = qc.qc_change;
+ qd->qd_change = qc_change;
qd->qd_slot = slot;
qd->qd_slot_count = 1;
- spin_lock(&qd_lru_lock);
- gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1);
+ spin_lock(&qd_lock);
+ BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap));
list_add(&qd->qd_list, &sdp->sd_quota_list);
atomic_inc(&sdp->sd_quota_count);
- spin_unlock(&qd_lru_lock);
+ spin_unlock(&qd_lock);
+
+ spin_lock_bucket(hash);
+ hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]);
+ spin_unlock_bucket(hash);
found++;
}
@@ -1286,51 +1331,41 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
{
struct list_head *head = &sdp->sd_quota_list;
struct gfs2_quota_data *qd;
- unsigned int x;
- spin_lock(&qd_lru_lock);
+ spin_lock(&qd_lock);
while (!list_empty(head)) {
qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
- if (atomic_read(&qd->qd_count) > 1 ||
- (atomic_read(&qd->qd_count) &&
- !test_bit(QDF_CHANGE, &qd->qd_flags))) {
- list_move(&qd->qd_list, head);
- spin_unlock(&qd_lru_lock);
- schedule();
- spin_lock(&qd_lru_lock);
- continue;
- }
-
list_del(&qd->qd_list);
+
/* Also remove if this qd exists in the reclaim list */
- if (!list_empty(&qd->qd_reclaim)) {
- list_del_init(&qd->qd_reclaim);
- atomic_dec(&qd_lru_count);
- }
+ list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
atomic_dec(&sdp->sd_quota_count);
- spin_unlock(&qd_lru_lock);
+ spin_unlock(&qd_lock);
+
+ spin_lock_bucket(qd->qd_hash);
+ hlist_bl_del_rcu(&qd->qd_hlist);
+ spin_unlock_bucket(qd->qd_hash);
- if (!atomic_read(&qd->qd_count)) {
- gfs2_assert_warn(sdp, !qd->qd_change);
- gfs2_assert_warn(sdp, !qd->qd_slot_count);
- } else
- gfs2_assert_warn(sdp, qd->qd_slot_count == 1);
+ gfs2_assert_warn(sdp, !qd->qd_change);
+ gfs2_assert_warn(sdp, !qd->qd_slot_count);
gfs2_assert_warn(sdp, !qd->qd_bh_count);
gfs2_glock_put(qd->qd_gl);
- kmem_cache_free(gfs2_quotad_cachep, qd);
+ call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
- spin_lock(&qd_lru_lock);
+ spin_lock(&qd_lock);
}
- spin_unlock(&qd_lru_lock);
+ spin_unlock(&qd_lock);
gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
if (sdp->sd_quota_bitmap) {
- for (x = 0; x < sdp->sd_quota_chunks; x++)
- kfree(sdp->sd_quota_bitmap[x]);
- kfree(sdp->sd_quota_bitmap);
+ if (is_vmalloc_addr(sdp->sd_quota_bitmap))
+ vfree(sdp->sd_quota_bitmap);
+ else
+ kfree(sdp->sd_quota_bitmap);
+ sdp->sd_quota_bitmap = NULL;
}
}
@@ -1462,7 +1497,7 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
}
fqs->qs_uquota.qfs_nextents = 1; /* unsupported */
fqs->qs_gquota = fqs->qs_uquota; /* its the same inode in both cases */
- fqs->qs_incoredqs = atomic_read(&qd_lru_count);
+ fqs->qs_incoredqs = list_lru_count(&gfs2_qd_lru);
return 0;
}
@@ -1573,10 +1608,12 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
if (gfs2_is_stuffed(ip))
alloc_required = 1;
if (alloc_required) {
+ struct gfs2_alloc_parms ap = { .aflags = 0, };
gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
&data_blocks, &ind_blocks);
blocks = 1 + data_blocks + ind_blocks;
- error = gfs2_inplace_reserve(ip, blocks, 0);
+ ap.target = blocks;
+ error = gfs2_inplace_reserve(ip, &ap);
if (error)
goto out_i;
blocks += gfs2_rg_blocks(ip, blocks);
@@ -1612,3 +1649,11 @@ const struct quotactl_ops gfs2_quotactl_ops = {
.get_dqblk = gfs2_get_dqblk,
.set_dqblk = gfs2_set_dqblk,
};
+
+void __init gfs2_quota_hash_init(void)
+{
+ unsigned i;
+
+ for(i = 0; i < GFS2_QD_HASH_SIZE; i++)
+ INIT_HLIST_BL_HEAD(&qd_hash_table[i]);
+}
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 0f64d9deb1b..55d506eb3c4 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -10,9 +10,10 @@
#ifndef __QUOTA_DOT_H__
#define __QUOTA_DOT_H__
+#include <linux/list_lru.h>
+
struct gfs2_inode;
struct gfs2_sbd;
-struct shrink_control;
#define NO_UID_QUOTA_CHANGE INVALID_UID
#define NO_GID_QUOTA_CHANGE INVALID_GID
@@ -53,10 +54,9 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
return ret;
}
-extern unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
- struct shrink_control *sc);
-extern unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
- struct shrink_control *sc);
extern const struct quotactl_ops gfs2_quotactl_ops;
+extern struct shrinker gfs2_qd_shrinker;
+extern struct list_lru gfs2_qd_lru;
+extern void __init gfs2_quota_hash_init(void);
#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 963b2d75200..94555d4c569 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -52,9 +52,9 @@ int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
return error;
}
-int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
+int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
{
- struct list_head *head = &sdp->sd_revoke_list;
+ struct list_head *head = &jd->jd_revoke_list;
struct gfs2_revoke_replay *rr;
int found = 0;
@@ -81,13 +81,13 @@ int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
return 1;
}
-int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
+int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where)
{
struct gfs2_revoke_replay *rr;
int wrap, a, b, revoke;
int found = 0;
- list_for_each_entry(rr, &sdp->sd_revoke_list, rr_list) {
+ list_for_each_entry(rr, &jd->jd_revoke_list, rr_list) {
if (rr->rr_blkno == blkno) {
found = 1;
break;
@@ -97,17 +97,17 @@ int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
if (!found)
return 0;
- wrap = (rr->rr_where < sdp->sd_replay_tail);
- a = (sdp->sd_replay_tail < where);
+ wrap = (rr->rr_where < jd->jd_replay_tail);
+ a = (jd->jd_replay_tail < where);
b = (where < rr->rr_where);
revoke = (wrap) ? (a || b) : (a && b);
return revoke;
}
-void gfs2_revoke_clean(struct gfs2_sbd *sdp)
+void gfs2_revoke_clean(struct gfs2_jdesc *jd)
{
- struct list_head *head = &sdp->sd_revoke_list;
+ struct list_head *head = &jd->jd_revoke_list;
struct gfs2_revoke_replay *rr;
while (!list_empty(head)) {
@@ -454,7 +454,7 @@ void gfs2_recover_func(struct work_struct *work)
struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
struct gfs2_log_header_host head;
- struct gfs2_holder j_gh, ji_gh, t_gh;
+ struct gfs2_holder j_gh, ji_gh, thaw_gh;
unsigned long t;
int ro = 0;
unsigned int pass;
@@ -508,11 +508,11 @@ void gfs2_recover_func(struct work_struct *work)
t = jiffies;
- /* Acquire a shared hold on the transaction lock */
+ /* Acquire a shared hold on the freeze lock */
- error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
- LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
- GL_NOCACHE, &t_gh);
+ error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED,
+ LM_FLAG_NOEXP | LM_FLAG_PRIORITY,
+ &thaw_gh);
if (error)
goto fail_gunlock_ji;
@@ -538,7 +538,7 @@ void gfs2_recover_func(struct work_struct *work)
fs_warn(sdp, "jid=%u: Can't replay: read-only block "
"device\n", jd->jd_jid);
error = -EROFS;
- goto fail_gunlock_tr;
+ goto fail_gunlock_thaw;
}
fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid);
@@ -549,14 +549,14 @@ void gfs2_recover_func(struct work_struct *work)
head.lh_blkno, pass);
lops_after_scan(jd, error, pass);
if (error)
- goto fail_gunlock_tr;
+ goto fail_gunlock_thaw;
}
error = clean_journal(jd, &head);
if (error)
- goto fail_gunlock_tr;
+ goto fail_gunlock_thaw;
- gfs2_glock_dq_uninit(&t_gh);
+ gfs2_glock_dq_uninit(&thaw_gh);
t = DIV_ROUND_UP(jiffies - t, HZ);
fs_info(sdp, "jid=%u: Journal replayed in %lus\n",
jd->jd_jid, t);
@@ -572,8 +572,8 @@ void gfs2_recover_func(struct work_struct *work)
fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
goto done;
-fail_gunlock_tr:
- gfs2_glock_dq_uninit(&t_gh);
+fail_gunlock_thaw:
+ gfs2_glock_dq_uninit(&thaw_gh);
fail_gunlock_ji:
if (jlocked) {
gfs2_glock_dq_uninit(&ji_gh);
@@ -587,7 +587,7 @@ fail:
gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
done:
clear_bit(JDF_RECOVERY, &jd->jd_flags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
}
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index 2226136c764..6142836cce9 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -23,9 +23,9 @@ static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk)
extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk,
struct buffer_head **bh);
-extern int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
-extern int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where);
-extern void gfs2_revoke_clean(struct gfs2_sbd *sdp);
+extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
+extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where);
+extern void gfs2_revoke_clean(struct gfs2_jdesc *jd);
extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
struct gfs2_log_header_host *head);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 69317435faa..f4cb9c0d6bb 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -7,6 +7,8 @@
* of the GNU General Public License version 2.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
@@ -57,6 +59,11 @@
* 3 = Used (metadata)
*/
+struct gfs2_extent {
+ struct gfs2_rbm rbm;
+ u32 len;
+};
+
static const char valid_change[16] = {
/* current */
/* n */ 0, 1, 1, 1,
@@ -65,8 +72,9 @@ static const char valid_change[16] = {
1, 0, 0, 0
};
-static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext,
- const struct gfs2_inode *ip, bool nowrap);
+static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
+ const struct gfs2_inode *ip, bool nowrap,
+ const struct gfs2_alloc_parms *ap);
/**
@@ -81,32 +89,32 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
unsigned char new_state)
{
unsigned char *byte1, *byte2, *end, cur_state;
- unsigned int buflen = rbm->bi->bi_len;
+ struct gfs2_bitmap *bi = rbm_bi(rbm);
+ unsigned int buflen = bi->bi_len;
const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
- byte1 = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset + (rbm->offset / GFS2_NBBY);
- end = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset + buflen;
+ byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
+ end = bi->bi_bh->b_data + bi->bi_offset + buflen;
BUG_ON(byte1 >= end);
cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
if (unlikely(!valid_change[new_state * 4 + cur_state])) {
- printk(KERN_WARNING "GFS2: buf_blk = 0x%x old_state=%d, "
- "new_state=%d\n", rbm->offset, cur_state, new_state);
- printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%x\n",
- (unsigned long long)rbm->rgd->rd_addr,
- rbm->bi->bi_start);
- printk(KERN_WARNING "GFS2: bi_offset=0x%x bi_len=0x%x\n",
- rbm->bi->bi_offset, rbm->bi->bi_len);
+ pr_warn("buf_blk = 0x%x old_state=%d, new_state=%d\n",
+ rbm->offset, cur_state, new_state);
+ pr_warn("rgrp=0x%llx bi_start=0x%x\n",
+ (unsigned long long)rbm->rgd->rd_addr, bi->bi_start);
+ pr_warn("bi_offset=0x%x bi_len=0x%x\n",
+ bi->bi_offset, bi->bi_len);
dump_stack();
gfs2_consist_rgrpd(rbm->rgd);
return;
}
*byte1 ^= (cur_state ^ new_state) << bit;
- if (do_clone && rbm->bi->bi_clone) {
- byte2 = rbm->bi->bi_clone + rbm->bi->bi_offset + (rbm->offset / GFS2_NBBY);
+ if (do_clone && bi->bi_clone) {
+ byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY);
cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
*byte2 ^= (cur_state ^ new_state) << bit;
}
@@ -121,7 +129,8 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm)
{
- const u8 *buffer = rbm->bi->bi_bh->b_data + rbm->bi->bi_offset;
+ struct gfs2_bitmap *bi = rbm_bi(rbm);
+ const u8 *buffer = bi->bi_bh->b_data + bi->bi_offset;
const u8 *byte;
unsigned int bit;
@@ -252,29 +261,53 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
{
u64 rblock = block - rbm->rgd->rd_data0;
- u32 x;
if (WARN_ON_ONCE(rblock > UINT_MAX))
return -EINVAL;
if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data)
return -E2BIG;
- rbm->bi = rbm->rgd->rd_bits;
+ rbm->bii = 0;
rbm->offset = (u32)(rblock);
/* Check if the block is within the first block */
- if (rbm->offset < (rbm->bi->bi_start + rbm->bi->bi_len) * GFS2_NBBY)
+ if (rbm->offset < rbm_bi(rbm)->bi_blocks)
return 0;
/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
rbm->offset += (sizeof(struct gfs2_rgrp) -
sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
- x = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
- rbm->offset -= x * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
- rbm->bi += x;
+ rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
+ rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
return 0;
}
/**
+ * gfs2_rbm_incr - increment an rbm structure
+ * @rbm: The rbm with rgd already set correctly
+ *
+ * This function takes an existing rbm structure and increments it to the next
+ * viable block offset.
+ *
+ * Returns: If incrementing the offset would cause the rbm to go past the
+ * end of the rgrp, true is returned, otherwise false.
+ *
+ */
+
+static bool gfs2_rbm_incr(struct gfs2_rbm *rbm)
+{
+ if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */
+ rbm->offset++;
+ return false;
+ }
+ if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */
+ return true;
+
+ rbm->offset = 0;
+ rbm->bii++;
+ return false;
+}
+
+/**
* gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
* @rbm: Position to search (value/result)
* @n_unaligned: Number of unaligned blocks to check
@@ -285,7 +318,6 @@ static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
{
- u64 block;
u32 n;
u8 res;
@@ -296,8 +328,7 @@ static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *le
(*len)--;
if (*len == 0)
return true;
- block = gfs2_rbm_to_block(rbm);
- if (gfs2_rbm_from_block(rbm, block + 1))
+ if (gfs2_rbm_incr(rbm))
return true;
}
@@ -306,7 +337,7 @@ static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *le
/**
* gfs2_free_extlen - Return extent length of free blocks
- * @rbm: Starting position
+ * @rrbm: Starting position
* @len: Max length to check
*
* Starting at the block specified by the rbm, see how many free blocks
@@ -328,6 +359,7 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
u32 chunk_size;
u8 *ptr, *start, *end;
u64 block;
+ struct gfs2_bitmap *bi;
if (n_unaligned &&
gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
@@ -336,11 +368,12 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
n_unaligned = len & 3;
/* Start is now byte aligned */
while (len > 3) {
- start = rbm.bi->bi_bh->b_data;
- if (rbm.bi->bi_clone)
- start = rbm.bi->bi_clone;
- end = start + rbm.bi->bi_bh->b_size;
- start += rbm.bi->bi_offset;
+ bi = rbm_bi(&rbm);
+ start = bi->bi_bh->b_data;
+ if (bi->bi_clone)
+ start = bi->bi_clone;
+ end = start + bi->bi_bh->b_size;
+ start += bi->bi_offset;
BUG_ON(rbm.offset & 3);
start += (rbm.offset / GFS2_NBBY);
bytes = min_t(u32, len / GFS2_NBBY, (end - start));
@@ -605,12 +638,18 @@ static void __rs_deltree(struct gfs2_blkreserv *rs)
RB_CLEAR_NODE(&rs->rs_node);
if (rs->rs_free) {
+ struct gfs2_bitmap *bi = rbm_bi(&rs->rs_rbm);
+
/* return reserved blocks to the rgrp */
BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
+ /* The rgrp extent failure point is likely not to increase;
+ it will only do so if the freed blocks are somehow
+ contiguous with a span of free blocks that follows. Still,
+ it will force the number to be recalculated later. */
+ rgd->rd_extfail_pt += rs->rs_free;
rs->rs_free = 0;
- clear_bit(GBF_FULL, &rs->rs_rbm.bi->bi_flags);
- smp_mb__after_clear_bit();
+ clear_bit(GBF_FULL, &bi->bi_flags);
}
}
@@ -634,14 +673,13 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
/**
* gfs2_rs_delete - delete a multi-block reservation
* @ip: The inode for this reservation
+ * @wcount: The inode's write count, or NULL
*
*/
-void gfs2_rs_delete(struct gfs2_inode *ip)
+void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount)
{
- struct inode *inode = &ip->i_inode;
-
down_write(&ip->i_rw_mutex);
- if (ip->i_res && atomic_read(&inode->i_writecount) <= 1) {
+ if (ip->i_res && ((wcount == NULL) || (atomic_read(wcount) <= 1))) {
gfs2_rs_deltree(ip->i_res);
BUG_ON(ip->i_res->rs_free);
kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
@@ -700,11 +738,11 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
{
- printk(KERN_INFO " ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
- printk(KERN_INFO " ri_length = %u\n", rgd->rd_length);
- printk(KERN_INFO " ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
- printk(KERN_INFO " ri_data = %u\n", rgd->rd_data);
- printk(KERN_INFO " ri_bitbytes = %u\n", rgd->rd_bitbytes);
+ pr_info("ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
+ pr_info("ri_length = %u\n", rgd->rd_length);
+ pr_info("ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
+ pr_info("ri_data = %u\n", rgd->rd_data);
+ pr_info("ri_bitbytes = %u\n", rgd->rd_bitbytes);
}
/**
@@ -743,18 +781,21 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
bi->bi_offset = sizeof(struct gfs2_rgrp);
bi->bi_start = 0;
bi->bi_len = bytes;
+ bi->bi_blocks = bytes * GFS2_NBBY;
/* header block */
} else if (x == 0) {
bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
bi->bi_offset = sizeof(struct gfs2_rgrp);
bi->bi_start = 0;
bi->bi_len = bytes;
+ bi->bi_blocks = bytes * GFS2_NBBY;
/* last block */
} else if (x + 1 == length) {
bytes = bytes_left;
bi->bi_offset = sizeof(struct gfs2_meta_header);
bi->bi_start = rgd->rd_bitbytes - bytes_left;
bi->bi_len = bytes;
+ bi->bi_blocks = bytes * GFS2_NBBY;
/* other blocks */
} else {
bytes = sdp->sd_sb.sb_bsize -
@@ -762,6 +803,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
bi->bi_offset = sizeof(struct gfs2_meta_header);
bi->bi_start = rgd->rd_bitbytes - bytes_left;
bi->bi_len = bytes;
+ bi->bi_blocks = bytes * GFS2_NBBY;
}
bytes_left -= bytes;
@@ -846,6 +888,7 @@ static int rgd_insert(struct gfs2_rgrpd *rgd)
static int read_rindex_entry(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ const unsigned bsize = sdp->sd_sb.sb_bsize;
loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
struct gfs2_rindex buf;
int error;
@@ -883,6 +926,8 @@ static int read_rindex_entry(struct gfs2_inode *ip)
goto fail;
rgd->rd_gl->gl_object = rgd;
+ rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize;
+ rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1;
rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
if (rgd->rd_data > sdp->sd_max_rg_data)
@@ -1059,7 +1104,7 @@ static u32 count_unlinked(struct gfs2_rgrpd *rgd)
* Returns: errno
*/
-int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
+static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
{
struct gfs2_sbd *sdp = rgd->rd_sbd;
struct gfs2_glock *gl = rgd->rd_gl;
@@ -1096,8 +1141,10 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
rgd->rd_free_clone = rgd->rd_free;
+ /* max out the rgrp allocation failure point */
+ rgd->rd_extfail_pt = rgd->rd_free;
}
- if (be32_to_cpu(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
+ if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
rgd->rd_bits[0].bi_bh->b_data);
@@ -1124,14 +1171,14 @@ fail:
return error;
}
-int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
+static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
{
u32 rl_flags;
if (rgd->rd_flags & GFS2_RDF_UPTODATE)
return 0;
- if (be32_to_cpu(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
+ if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
return gfs2_rgrp_bh_get(rgd);
rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
@@ -1154,7 +1201,7 @@ int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
return 0;
- return gfs2_rgrp_bh_get((struct gfs2_rgrpd *)gh->gh_gl->gl_object);
+ return gfs2_rgrp_bh_get(rgd);
}
/**
@@ -1392,12 +1439,12 @@ static void rs_insert(struct gfs2_inode *ip)
* rg_mblk_search - find a group of multiple free blocks to form a reservation
* @rgd: the resource group descriptor
* @ip: pointer to the inode for which we're reserving blocks
- * @requested: number of blocks required for this allocation
+ * @ap: the allocation parameters
*
*/
static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
- unsigned requested)
+ const struct gfs2_alloc_parms *ap)
{
struct gfs2_rbm rbm = { .rgd = rgd, };
u64 goal;
@@ -1410,7 +1457,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
if (S_ISDIR(inode->i_mode))
extlen = 1;
else {
- extlen = max_t(u32, atomic_read(&rs->rs_sizehint), requested);
+ extlen = max_t(u32, atomic_read(&rs->rs_sizehint), ap->target);
extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks);
}
if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
@@ -1425,7 +1472,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
return;
- ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, extlen, ip, true);
+ ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true, ap);
if (ret == 0) {
rs->rs_rbm = rbm;
rs->rs_free = extlen;
@@ -1490,6 +1537,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
* @rbm: The current position in the resource group
* @ip: The inode for which we are searching for blocks
* @minext: The minimum extent length
+ * @maxext: A pointer to the maximum extent structure
*
* This checks the current position in the rgrp to see whether there is
* a reservation covering this block. If not then this function is a
@@ -1502,7 +1550,8 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
const struct gfs2_inode *ip,
- u32 minext)
+ u32 minext,
+ struct gfs2_extent *maxext)
{
u64 block = gfs2_rbm_to_block(rbm);
u32 extlen = 1;
@@ -1515,8 +1564,7 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
*/
if (minext) {
extlen = gfs2_free_extlen(rbm, minext);
- nblock = block + extlen;
- if (extlen < minext)
+ if (extlen <= maxext->len)
goto fail;
}
@@ -1525,9 +1573,17 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
* and skip if parts of it are already reserved
*/
nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
- if (nblock == block)
- return 0;
+ if (nblock == block) {
+ if (!minext || extlen >= minext)
+ return 0;
+
+ if (extlen > maxext->len) {
+ maxext->len = extlen;
+ maxext->rbm = *rbm;
+ }
fail:
+ nblock = block + extlen;
+ }
ret = gfs2_rbm_from_block(rbm, nblock);
if (ret < 0)
return ret;
@@ -1538,30 +1594,38 @@ fail:
* gfs2_rbm_find - Look for blocks of a particular state
* @rbm: Value/result starting position and final position
* @state: The state which we want to find
- * @minext: The requested extent length (0 for a single block)
+ * @minext: Pointer to the requested extent length (NULL for a single block)
+ * This is updated to be the actual reservation size.
* @ip: If set, check for reservations
* @nowrap: Stop looking at the end of the rgrp, rather than wrapping
* around until we've reached the starting point.
+ * @ap: the allocation parameters
*
* Side effects:
* - If looking for free blocks, we set GBF_FULL on each bitmap which
* has no free blocks in it.
+ * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
+ * has come up short on a free block search.
*
* Returns: 0 on success, -ENOSPC if there is no block of the requested state
*/
-static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext,
- const struct gfs2_inode *ip, bool nowrap)
+static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
+ const struct gfs2_inode *ip, bool nowrap,
+ const struct gfs2_alloc_parms *ap)
{
struct buffer_head *bh;
- struct gfs2_bitmap *initial_bi;
+ int initial_bii;
u32 initial_offset;
+ int first_bii = rbm->bii;
+ u32 first_offset = rbm->offset;
u32 offset;
u8 *buffer;
- int index;
int n = 0;
int iters = rbm->rgd->rd_length;
int ret;
+ struct gfs2_bitmap *bi;
+ struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
/* If we are not starting at the beginning of a bitmap, then we
* need to add one to the bitmap count to ensure that we search
@@ -1571,52 +1635,55 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext,
iters++;
while(1) {
- if (test_bit(GBF_FULL, &rbm->bi->bi_flags) &&
+ bi = rbm_bi(rbm);
+ if (test_bit(GBF_FULL, &bi->bi_flags) &&
(state == GFS2_BLKST_FREE))
goto next_bitmap;
- bh = rbm->bi->bi_bh;
- buffer = bh->b_data + rbm->bi->bi_offset;
+ bh = bi->bi_bh;
+ buffer = bh->b_data + bi->bi_offset;
WARN_ON(!buffer_uptodate(bh));
- if (state != GFS2_BLKST_UNLINKED && rbm->bi->bi_clone)
- buffer = rbm->bi->bi_clone + rbm->bi->bi_offset;
+ if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
+ buffer = bi->bi_clone + bi->bi_offset;
initial_offset = rbm->offset;
- offset = gfs2_bitfit(buffer, rbm->bi->bi_len, rbm->offset, state);
+ offset = gfs2_bitfit(buffer, bi->bi_len, rbm->offset, state);
if (offset == BFITNOENT)
goto bitmap_full;
rbm->offset = offset;
if (ip == NULL)
return 0;
- initial_bi = rbm->bi;
- ret = gfs2_reservation_check_and_update(rbm, ip, minext);
+ initial_bii = rbm->bii;
+ ret = gfs2_reservation_check_and_update(rbm, ip,
+ minext ? *minext : 0,
+ &maxext);
if (ret == 0)
return 0;
if (ret > 0) {
- n += (rbm->bi - initial_bi);
+ n += (rbm->bii - initial_bii);
goto next_iter;
}
if (ret == -E2BIG) {
- index = 0;
+ rbm->bii = 0;
rbm->offset = 0;
- n += (rbm->bi - initial_bi);
+ n += (rbm->bii - initial_bii);
goto res_covered_end_of_rgrp;
}
return ret;
bitmap_full: /* Mark bitmap as full and fall through */
- if ((state == GFS2_BLKST_FREE) && initial_offset == 0)
- set_bit(GBF_FULL, &rbm->bi->bi_flags);
+ if ((state == GFS2_BLKST_FREE) && initial_offset == 0) {
+ struct gfs2_bitmap *bi = rbm_bi(rbm);
+ set_bit(GBF_FULL, &bi->bi_flags);
+ }
next_bitmap: /* Find next bitmap in the rgrp */
rbm->offset = 0;
- index = rbm->bi - rbm->rgd->rd_bits;
- index++;
- if (index == rbm->rgd->rd_length)
- index = 0;
+ rbm->bii++;
+ if (rbm->bii == rbm->rgd->rd_length)
+ rbm->bii = 0;
res_covered_end_of_rgrp:
- rbm->bi = &rbm->rgd->rd_bits[index];
- if ((index == 0) && nowrap)
+ if ((rbm->bii == 0) && nowrap)
break;
n++;
next_iter:
@@ -1624,6 +1691,24 @@ next_iter:
break;
}
+ if (minext == NULL || state != GFS2_BLKST_FREE)
+ return -ENOSPC;
+
+ /* If the extent was too small, and it's smaller than the smallest
+ to have failed before, remember for future reference that it's
+ useless to search this rgrp again for this amount or more. */
+ if ((first_offset == 0) && (first_bii == 0) &&
+ (*minext < rbm->rgd->rd_extfail_pt))
+ rbm->rgd->rd_extfail_pt = *minext;
+
+ /* If the maximum extent we found is big enough to fulfill the
+ minimum requirements, use it anyway. */
+ if (maxext.len) {
+ *rbm = maxext.rbm;
+ *minext = maxext.len;
+ return 0;
+ }
+
return -ENOSPC;
}
@@ -1645,11 +1730,12 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
struct gfs2_inode *ip;
int error;
int found = 0;
- struct gfs2_rbm rbm = { .rgd = rgd, .bi = rgd->rd_bits, .offset = 0 };
+ struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
while (1) {
down_write(&sdp->sd_log_flush_lock);
- error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, 0, NULL, true);
+ error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
+ true, NULL);
up_write(&sdp->sd_log_flush_lock);
if (error == -ENOSPC)
break;
@@ -1800,12 +1886,12 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *b
/**
* gfs2_inplace_reserve - Reserve space in the filesystem
* @ip: the inode to reserve space for
- * @requested: the number of blocks to be reserved
+ * @ap: the allocation parameters
*
* Returns: errno
*/
-int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags)
+int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_rgrpd *begin = NULL;
@@ -1817,17 +1903,16 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags)
if (sdp->sd_args.ar_rgrplvb)
flags |= GL_SKIP;
- if (gfs2_assert_warn(sdp, requested))
+ if (gfs2_assert_warn(sdp, ap->target))
return -EINVAL;
if (gfs2_rs_active(rs)) {
begin = rs->rs_rbm.rgd;
- flags = 0; /* Yoda: Do or do not. There is no try */
} else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
rs->rs_rbm.rgd = begin = ip->i_rgd;
} else {
rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
}
- if (S_ISDIR(ip->i_inode.i_mode) && (aflags & GFS2_AF_ORLOV))
+ if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
skip = gfs2_orlov_skip(ip);
if (rs->rs_rbm.rgd == NULL)
return -EBADSLT;
@@ -1861,7 +1946,9 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags)
}
/* Skip unuseable resource groups */
- if (rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
+ if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
+ GFS2_RDF_ERROR)) ||
+ (ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
goto skip_rgrp;
if (sdp->sd_args.ar_rgrplvb)
@@ -1869,27 +1956,28 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags)
/* Get a reservation if we don't already have one */
if (!gfs2_rs_active(rs))
- rg_mblk_search(rs->rs_rbm.rgd, ip, requested);
+ rg_mblk_search(rs->rs_rbm.rgd, ip, ap);
/* Skip rgrps when we can't get a reservation on first pass */
if (!gfs2_rs_active(rs) && (loops < 1))
goto check_rgrp;
/* If rgrp has enough free space, use it */
- if (rs->rs_rbm.rgd->rd_free_clone >= requested) {
+ if (rs->rs_rbm.rgd->rd_free_clone >= ap->target) {
ip->i_rgd = rs->rs_rbm.rgd;
return 0;
}
- /* Drop reservation, if we couldn't use reserved rgrp */
- if (gfs2_rs_active(rs))
- gfs2_rs_deltree(rs);
check_rgrp:
/* Check for unlinked inodes which can be reclaimed */
if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
ip->i_no_addr);
skip_rgrp:
+ /* Drop reservation, if we couldn't use reserved rgrp */
+ if (gfs2_rs_active(rs))
+ gfs2_rs_deltree(rs);
+
/* Unlock rgrp if required */
if (!rg_locked)
gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
@@ -1913,7 +2001,7 @@ next_rgrp:
}
/* Flushing the log may release space */
if (loops == 2)
- gfs2_log_flush(sdp, NULL);
+ gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
}
return -ENOSPC;
@@ -1973,14 +2061,14 @@ static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
*n = 1;
block = gfs2_rbm_to_block(rbm);
- gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm->bi->bi_bh);
+ gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
block++;
while (*n < elen) {
ret = gfs2_rbm_from_block(&pos, block);
if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE)
break;
- gfs2_trans_add_meta(pos.rgd->rd_gl, pos.bi->bi_bh);
+ gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
gfs2_setbit(&pos, true, GFS2_BLKST_USED);
(*n)++;
block++;
@@ -2001,6 +2089,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
u32 blen, unsigned char new_state)
{
struct gfs2_rbm rbm;
+ struct gfs2_bitmap *bi;
rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1);
if (!rbm.rgd) {
@@ -2011,15 +2100,15 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
while (blen--) {
gfs2_rbm_from_block(&rbm, bstart);
+ bi = rbm_bi(&rbm);
bstart++;
- if (!rbm.bi->bi_clone) {
- rbm.bi->bi_clone = kmalloc(rbm.bi->bi_bh->b_size,
- GFP_NOFS | __GFP_NOFAIL);
- memcpy(rbm.bi->bi_clone + rbm.bi->bi_offset,
- rbm.bi->bi_bh->b_data + rbm.bi->bi_offset,
- rbm.bi->bi_len);
+ if (!bi->bi_clone) {
+ bi->bi_clone = kmalloc(bi->bi_bh->b_size,
+ GFP_NOFS | __GFP_NOFAIL);
+ memcpy(bi->bi_clone + bi->bi_offset,
+ bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
}
- gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.bi->bi_bh);
+ gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
gfs2_setbit(&rbm, false, new_state);
}
@@ -2033,25 +2122,24 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
*
*/
-int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
{
struct gfs2_rgrpd *rgd = gl->gl_object;
struct gfs2_blkreserv *trs;
const struct rb_node *n;
if (rgd == NULL)
- return 0;
- gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n",
+ return;
+ gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
(unsigned long long)rgd->rd_addr, rgd->rd_flags,
rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
- rgd->rd_reserved);
+ rgd->rd_reserved, rgd->rd_extfail_pt);
spin_lock(&rgd->rd_rsspin);
for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
dump_rs(seq, trs);
}
spin_unlock(&rgd->rd_rsspin);
- return 0;
}
static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
@@ -2103,6 +2191,35 @@ out:
}
/**
+ * gfs2_set_alloc_start - Set starting point for block allocation
+ * @rbm: The rbm which will be set to the required location
+ * @ip: The gfs2 inode
+ * @dinode: Flag to say if allocation includes a new inode
+ *
+ * This sets the starting point from the reservation if one is active
+ * otherwise it falls back to guessing a start point based on the
+ * inode's goal block or the last allocation point in the rgrp.
+ */
+
+static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
+ const struct gfs2_inode *ip, bool dinode)
+{
+ u64 goal;
+
+ if (gfs2_rs_active(ip->i_res)) {
+ *rbm = ip->i_res->rs_rbm;
+ return;
+ }
+
+ if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal))
+ goal = ip->i_goal;
+ else
+ goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
+
+ gfs2_rbm_from_block(rbm, goal);
+}
+
+/**
* gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
* @ip: the inode to allocate the block for
* @bn: Used to return the starting block number
@@ -2120,30 +2237,24 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
struct buffer_head *dibh;
struct gfs2_rbm rbm = { .rgd = ip->i_rgd, };
unsigned int ndata;
- u64 goal;
u64 block; /* block, within the file system scope */
int error;
- if (gfs2_rs_active(ip->i_res))
- goal = gfs2_rbm_to_block(&ip->i_res->rs_rbm);
- else if (!dinode && rgrp_contains_block(rbm.rgd, ip->i_goal))
- goal = ip->i_goal;
- else
- goal = rbm.rgd->rd_last_alloc + rbm.rgd->rd_data0;
-
- gfs2_rbm_from_block(&rbm, goal);
- error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, ip, false);
+ gfs2_set_alloc_start(&rbm, ip, dinode);
+ error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false, NULL);
if (error == -ENOSPC) {
- gfs2_rbm_from_block(&rbm, goal);
- error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, NULL, false);
+ gfs2_set_alloc_start(&rbm, ip, dinode);
+ error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false,
+ NULL);
}
/* Since all blocks are reserved in advance, this shouldn't happen */
if (error) {
- fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d\n",
+ fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
(unsigned long long)ip->i_no_addr, error, *nblocks,
- test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags));
+ test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
+ rbm.rgd->rd_extfail_pt);
goto rgrp_error;
}
@@ -2169,7 +2280,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
}
}
if (rbm.rgd->rd_free < *nblocks) {
- printk(KERN_WARNING "nblocks=%u\n", *nblocks);
+ pr_warn("nblocks=%u\n", *nblocks);
goto rgrp_error;
}
@@ -2187,7 +2298,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
if (dinode)
- gfs2_trans_add_unrevoke(sdp, block, 1);
+ gfs2_trans_add_unrevoke(sdp, block, *nblocks);
gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
@@ -2411,7 +2522,7 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
/**
* gfs2_rlist_free - free a resource group list
- * @list: the list of resource groups
+ * @rlist: the list of resource groups
*
*/
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 5b3f4a896e6..463ab2e95d1 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -40,7 +40,7 @@ extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
#define GFS2_AF_ORLOV 1
-extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 flags);
+extern int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap);
extern void gfs2_inplace_release(struct gfs2_inode *ip);
extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
@@ -48,7 +48,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
extern int gfs2_rs_alloc(struct gfs2_inode *ip);
extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
-extern void gfs2_rs_delete(struct gfs2_inode *ip);
+extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount);
extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
@@ -68,7 +68,7 @@ extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
-extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
+extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
struct buffer_head *bh,
const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index e5639dec66c..1319b5c4ec6 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -7,6 +7,8 @@
* of the GNU General Public License version 2.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/bio.h>
#include <linux/sched.h>
#include <linux/slab.h>
@@ -175,8 +177,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
break;
case Opt_debug:
if (args->ar_errors == GFS2_ERRORS_PANIC) {
- printk(KERN_WARNING "GFS2: -o debug and -o errors=panic "
- "are mutually exclusive.\n");
+ pr_warn("-o debug and -o errors=panic are mutually exclusive\n");
return -EINVAL;
}
args->ar_debug = 1;
@@ -228,21 +229,21 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
case Opt_commit:
rv = match_int(&tmp[0], &args->ar_commit);
if (rv || args->ar_commit <= 0) {
- printk(KERN_WARNING "GFS2: commit mount option requires a positive numeric argument\n");
+ pr_warn("commit mount option requires a positive numeric argument\n");
return rv ? rv : -EINVAL;
}
break;
case Opt_statfs_quantum:
rv = match_int(&tmp[0], &args->ar_statfs_quantum);
if (rv || args->ar_statfs_quantum < 0) {
- printk(KERN_WARNING "GFS2: statfs_quantum mount option requires a non-negative numeric argument\n");
+ pr_warn("statfs_quantum mount option requires a non-negative numeric argument\n");
return rv ? rv : -EINVAL;
}
break;
case Opt_quota_quantum:
rv = match_int(&tmp[0], &args->ar_quota_quantum);
if (rv || args->ar_quota_quantum <= 0) {
- printk(KERN_WARNING "GFS2: quota_quantum mount option requires a positive numeric argument\n");
+ pr_warn("quota_quantum mount option requires a positive numeric argument\n");
return rv ? rv : -EINVAL;
}
break;
@@ -250,7 +251,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
rv = match_int(&tmp[0], &args->ar_statfs_percent);
if (rv || args->ar_statfs_percent < 0 ||
args->ar_statfs_percent > 100) {
- printk(KERN_WARNING "statfs_percent mount option requires a numeric argument between 0 and 100\n");
+ pr_warn("statfs_percent mount option requires a numeric argument between 0 and 100\n");
return rv ? rv : -EINVAL;
}
break;
@@ -259,8 +260,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
break;
case Opt_err_panic:
if (args->ar_debug) {
- printk(KERN_WARNING "GFS2: -o debug and -o errors=panic "
- "are mutually exclusive.\n");
+ pr_warn("-o debug and -o errors=panic are mutually exclusive\n");
return -EINVAL;
}
args->ar_errors = GFS2_ERRORS_PANIC;
@@ -279,7 +279,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
break;
case Opt_error:
default:
- printk(KERN_WARNING "GFS2: invalid mount option: %s\n", o);
+ pr_warn("invalid mount option: %s\n", o);
return -EINVAL;
}
}
@@ -295,9 +295,8 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
void gfs2_jindex_free(struct gfs2_sbd *sdp)
{
- struct list_head list, *head;
+ struct list_head list;
struct gfs2_jdesc *jd;
- struct gfs2_journal_extent *jext;
spin_lock(&sdp->sd_jindex_spin);
list_add(&list, &sdp->sd_jindex_list);
@@ -307,14 +306,7 @@ void gfs2_jindex_free(struct gfs2_sbd *sdp)
while (!list_empty(&list)) {
jd = list_entry(list.next, struct gfs2_jdesc, jd_list);
- head = &jd->extent_list;
- while (!list_empty(head)) {
- jext = list_entry(head->next,
- struct gfs2_journal_extent,
- extent_list);
- list_del(&jext->extent_list);
- kfree(jext);
- }
+ gfs2_free_journal_extents(jd);
list_del(&jd->jd_list);
iput(jd->jd_inode);
kfree(jd);
@@ -369,6 +361,33 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
return 0;
}
+static int init_threads(struct gfs2_sbd *sdp)
+{
+ struct task_struct *p;
+ int error = 0;
+
+ p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
+ if (IS_ERR(p)) {
+ error = PTR_ERR(p);
+ fs_err(sdp, "can't start logd thread: %d\n", error);
+ return error;
+ }
+ sdp->sd_logd_process = p;
+
+ p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
+ if (IS_ERR(p)) {
+ error = PTR_ERR(p);
+ fs_err(sdp, "can't start quotad thread: %d\n", error);
+ goto fail;
+ }
+ sdp->sd_quotad_process = p;
+ return 0;
+
+fail:
+ kthread_stop(sdp->sd_logd_process);
+ return error;
+}
+
/**
* gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
* @sdp: the filesystem
@@ -380,14 +399,19 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
{
struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
struct gfs2_glock *j_gl = ip->i_gl;
- struct gfs2_holder t_gh;
+ struct gfs2_holder thaw_gh;
struct gfs2_log_header_host head;
int error;
- error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh);
+ error = init_threads(sdp);
if (error)
return error;
+ error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0,
+ &thaw_gh);
+ if (error)
+ goto fail_threads;
+
j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -410,14 +434,16 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
- gfs2_glock_dq_uninit(&t_gh);
+ gfs2_glock_dq_uninit(&thaw_gh);
return 0;
fail:
- t_gh.gh_flags |= GL_NOCACHE;
- gfs2_glock_dq_uninit(&t_gh);
-
+ thaw_gh.gh_flags |= GL_NOCACHE;
+ gfs2_glock_dq_uninit(&thaw_gh);
+fail_threads:
+ kthread_stop(sdp->sd_quotad_process);
+ kthread_stop(sdp->sd_logd_process);
return error;
}
@@ -610,15 +636,21 @@ struct lfcc {
*/
static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
- struct gfs2_holder *t_gh)
+ struct gfs2_holder *freeze_gh)
{
struct gfs2_inode *ip;
struct gfs2_jdesc *jd;
struct lfcc *lfcc;
LIST_HEAD(list);
struct gfs2_log_header_host lh;
+ struct gfs2_inode *dip = GFS2_I(sdp->sd_root_dir->d_inode);
int error;
+ error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0,
+ &sdp->sd_freeze_root_gh);
+ if (error)
+ return error;
+ atomic_set(&sdp->sd_frozen_root, 1);
list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
if (!lfcc) {
@@ -634,8 +666,8 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
list_add(&lfcc->list, &list);
}
- error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
- GL_NOCACHE, t_gh);
+ error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE,
+ GL_NOCACHE, freeze_gh);
list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
error = gfs2_jdesc_check(jd);
@@ -651,7 +683,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
}
if (error)
- gfs2_glock_dq_uninit(t_gh);
+ gfs2_glock_dq_uninit(freeze_gh);
out:
while (!list_empty(&list)) {
@@ -660,6 +692,11 @@ out:
gfs2_glock_dq_uninit(&lfcc->gh);
kfree(lfcc);
}
+ if (error) {
+ atomic_dec(&sdp->sd_frozen_root);
+ wait_event(sdp->sd_frozen_root_wait, atomic_read(&sdp->sd_frozen_root) == 0);
+ gfs2_glock_dq_uninit(&sdp->sd_freeze_root_gh);
+ }
return error;
}
@@ -717,7 +754,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
int ret = 0;
if (wbc->sync_mode == WB_SYNC_ALL)
- gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
+ gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH);
if (bdi->dirty_exceeded)
gfs2_ail1_flush(sdp, wbc);
else
@@ -797,25 +834,30 @@ out:
static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
{
- struct gfs2_holder t_gh;
+ struct gfs2_holder thaw_gh;
int error;
- flush_workqueue(gfs2_delete_workqueue);
- gfs2_quota_sync(sdp->sd_vfs, 0);
- gfs2_statfs_sync(sdp->sd_vfs, 0);
-
- error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
- &t_gh);
+ error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, GL_NOCACHE,
+ &thaw_gh);
if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
return error;
- gfs2_meta_syncfs(sdp);
- gfs2_log_shutdown(sdp);
-
+ down_write(&sdp->sd_log_flush_lock);
clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+ up_write(&sdp->sd_log_flush_lock);
- if (t_gh.gh_gl)
- gfs2_glock_dq_uninit(&t_gh);
+ kthread_stop(sdp->sd_quotad_process);
+ kthread_stop(sdp->sd_logd_process);
+
+ flush_workqueue(gfs2_delete_workqueue);
+ gfs2_quota_sync(sdp->sd_vfs, 0);
+ gfs2_statfs_sync(sdp->sd_vfs, 0);
+
+ gfs2_log_flush(sdp, NULL, SHUTDOWN_FLUSH);
+ gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
+
+ if (thaw_gh.gh_gl)
+ gfs2_glock_dq_uninit(&thaw_gh);
gfs2_quota_cleanup(sdp);
@@ -857,9 +899,6 @@ restart:
}
spin_unlock(&sdp->sd_jindex_spin);
- kthread_stop(sdp->sd_quotad_process);
- kthread_stop(sdp->sd_logd_process);
-
if (!(sb->s_flags & MS_RDONLY)) {
error = gfs2_make_fs_ro(sdp);
if (error)
@@ -875,7 +914,7 @@ restart:
iput(sdp->sd_quota_inode);
gfs2_glock_put(sdp->sd_rename_gl);
- gfs2_glock_put(sdp->sd_trans_gl);
+ gfs2_glock_put(sdp->sd_freeze_gl);
if (!sdp->sd_args.ar_spectator) {
gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
@@ -910,8 +949,8 @@ static int gfs2_sync_fs(struct super_block *sb, int wait)
struct gfs2_sbd *sdp = sb->s_fs_info;
gfs2_quota_sync(sb, -1);
- if (wait && sdp)
- gfs2_log_flush(sdp, NULL);
+ if (wait && sdp && !atomic_read(&sdp->sd_log_freeze))
+ gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
return 0;
}
@@ -961,6 +1000,9 @@ static int gfs2_unfreeze(struct super_block *sb)
struct gfs2_sbd *sdp = sb->s_fs_info;
gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
+ atomic_dec(&sdp->sd_frozen_root);
+ wait_event(sdp->sd_frozen_root_wait, atomic_read(&sdp->sd_frozen_root) == 0);
+ gfs2_glock_dq_uninit(&sdp->sd_freeze_root_gh);
return 0;
}
@@ -1142,6 +1184,8 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
struct gfs2_tune *gt = &sdp->sd_tune;
int error;
+ sync_filesystem(sb);
+
spin_lock(&gt->gt_spin);
args.ar_commit = gt->gt_logd_secs;
args.ar_quota_quantum = gt->gt_quota_quantum;
@@ -1223,7 +1267,7 @@ static int gfs2_drop_inode(struct inode *inode)
{
struct gfs2_inode *ip = GFS2_I(inode);
- if (inode->i_nlink) {
+ if (!test_bit(GIF_FREE_VFS_INODE, &ip->i_flags) && inode->i_nlink) {
struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
clear_nlink(inode);
@@ -1438,6 +1482,11 @@ static void gfs2_evict_inode(struct inode *inode)
struct gfs2_holder gh;
int error;
+ if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) {
+ clear_inode(inode);
+ return;
+ }
+
if (inode->i_nlink || (sb->s_flags & MS_RDONLY))
goto out;
@@ -1493,7 +1542,7 @@ static void gfs2_evict_inode(struct inode *inode)
goto out_unlock;
out_truncate:
- gfs2_log_flush(sdp, ip->i_gl);
+ gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) {
struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
filemap_fdatawrite(metamapping);
@@ -1525,8 +1574,8 @@ out_unlock:
fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
out:
/* Case 3 starts here */
- truncate_inode_pages(&inode->i_data, 0);
- gfs2_rs_delete(ip);
+ truncate_inode_pages_final(&inode->i_data);
+ gfs2_rs_delete(ip, NULL);
gfs2_ordered_del_inode(ip);
clear_inode(inode);
gfs2_dir_hash_inval(ip);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index aa5c4804496..3ab566ba569 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -7,6 +7,8 @@
* of the GNU General Public License version 2.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
@@ -138,9 +140,8 @@ static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
if (simple_strtol(buf, NULL, 0) != 1)
return -EINVAL;
- gfs2_lm_withdraw(sdp,
- "GFS2: fsid=%s: withdrawing from cluster at user's request\n",
- sdp->sd_fsname);
+ gfs2_lm_withdraw(sdp, "withdrawing from cluster at user's request\n");
+
return len;
}
@@ -239,8 +240,8 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
if (gltype > LM_TYPE_JOURNAL)
return -EINVAL;
- if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK)
- glops = &gfs2_trans_glops;
+ if (gltype == LM_TYPE_NONDISK && glnum == GFS2_FREEZE_LOCK)
+ glops = &gfs2_freeze_glops;
else
glops = gfs2_glops_list[gltype];
if (glops == NULL)
@@ -332,7 +333,7 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
else if (val == 0) {
clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
gfs2_glock_thaw(sdp);
} else {
ret = -EINVAL;
@@ -406,6 +407,9 @@ int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid)
struct gfs2_jdesc *jd;
int rv;
+ /* Wait for our primary journal to be initialized */
+ wait_for_completion(&sdp->sd_journal_ready);
+
spin_lock(&sdp->sd_jindex_spin);
rv = -EBUSY;
if (sdp->sd_jdesc->jd_jid == jid)
@@ -481,7 +485,7 @@ static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
rv = jid = -EINVAL;
sdp->sd_lockstruct.ls_jid = jid;
clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
out:
spin_unlock(&sdp->sd_jindex_spin);
@@ -587,7 +591,6 @@ TUNE_ATTR(max_readahead, 0);
TUNE_ATTR(complain_secs, 0);
TUNE_ATTR(statfs_slow, 0);
TUNE_ATTR(new_files_jdata, 0);
-TUNE_ATTR(quota_simul_sync, 1);
TUNE_ATTR(statfs_quantum, 1);
TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
@@ -597,7 +600,6 @@ static struct attribute *tune_attrs[] = {
&tune_attr_max_readahead.attr,
&tune_attr_complain_secs.attr,
&tune_attr_statfs_slow.attr,
- &tune_attr_quota_simul_sync.attr,
&tune_attr_statfs_quantum.attr,
&tune_attr_quota_scale.attr,
&tune_attr_new_files_jdata.attr,
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 2b20d7046bf..0546ab4e28e 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -7,6 +7,8 @@
* of the GNU General Public License version 2.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -46,64 +48,41 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
tr->tr_blocks = blocks;
tr->tr_revokes = revokes;
tr->tr_reserved = 1;
+ tr->tr_alloced = 1;
if (blocks)
tr->tr_reserved += 6 + blocks;
if (revokes)
tr->tr_reserved += gfs2_struct2blk(sdp, revokes,
sizeof(u64));
- sb_start_intwrite(sdp->sd_vfs);
- gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh);
+ INIT_LIST_HEAD(&tr->tr_databuf);
+ INIT_LIST_HEAD(&tr->tr_buf);
- error = gfs2_glock_nq(&tr->tr_t_gh);
- if (error)
- goto fail_holder_uninit;
+ sb_start_intwrite(sdp->sd_vfs);
error = gfs2_log_reserve(sdp, tr->tr_reserved);
if (error)
- goto fail_gunlock;
+ goto fail;
current->journal_info = tr;
return 0;
-fail_gunlock:
- gfs2_glock_dq(&tr->tr_t_gh);
-
-fail_holder_uninit:
+fail:
sb_end_intwrite(sdp->sd_vfs);
- gfs2_holder_uninit(&tr->tr_t_gh);
kfree(tr);
return error;
}
-/**
- * gfs2_log_release - Release a given number of log blocks
- * @sdp: The GFS2 superblock
- * @blks: The number of blocks
- *
- */
-
-static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
-{
-
- atomic_add(blks, &sdp->sd_log_blks_free);
- trace_gfs2_log_blocks(sdp, blks);
- gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
- sdp->sd_jdesc->jd_blocks);
- up_read(&sdp->sd_log_flush_lock);
-}
-
static void gfs2_print_trans(const struct gfs2_trans *tr)
{
- printk(KERN_WARNING "GFS2: Transaction created at: %pSR\n",
- (void *)tr->tr_ip);
- printk(KERN_WARNING "GFS2: blocks=%u revokes=%u reserved=%u touched=%d\n",
- tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched);
- printk(KERN_WARNING "GFS2: Buf %u/%u Databuf %u/%u Revoke %u/%u\n",
- tr->tr_num_buf_new, tr->tr_num_buf_rm,
- tr->tr_num_databuf_new, tr->tr_num_databuf_rm,
- tr->tr_num_revoke, tr->tr_num_revoke_rm);
+ pr_warn("Transaction created at: %pSR\n", (void *)tr->tr_ip);
+ pr_warn("blocks=%u revokes=%u reserved=%u touched=%u\n",
+ tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched);
+ pr_warn("Buf %u/%u Databuf %u/%u Revoke %u/%u\n",
+ tr->tr_num_buf_new, tr->tr_num_buf_rm,
+ tr->tr_num_databuf_new, tr->tr_num_databuf_rm,
+ tr->tr_num_revoke, tr->tr_num_revoke_rm);
}
void gfs2_trans_end(struct gfs2_sbd *sdp)
@@ -115,11 +94,8 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
if (!tr->tr_touched) {
gfs2_log_release(sdp, tr->tr_reserved);
- if (tr->tr_t_gh.gh_gl) {
- gfs2_glock_dq(&tr->tr_t_gh);
- gfs2_holder_uninit(&tr->tr_t_gh);
+ if (tr->tr_alloced)
kfree(tr);
- }
sb_end_intwrite(sdp->sd_vfs);
return;
}
@@ -133,16 +109,12 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
gfs2_print_trans(tr);
gfs2_log_commit(sdp, tr);
- if (tr->tr_t_gh.gh_gl) {
- gfs2_glock_dq(&tr->tr_t_gh);
- gfs2_holder_uninit(&tr->tr_t_gh);
- if (!tr->tr_attached)
+ if (tr->tr_alloced && !tr->tr_attached)
kfree(tr);
- }
up_read(&sdp->sd_log_flush_lock);
if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
- gfs2_log_flush(sdp, NULL);
+ gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
sb_end_intwrite(sdp->sd_vfs);
}
@@ -210,8 +182,7 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
gfs2_pin(sdp, bd->bd_bh);
tr->tr_num_databuf_new++;
- sdp->sd_log_num_databuf++;
- list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf);
+ list_add_tail(&bd->bd_list, &tr->tr_databuf);
}
gfs2_log_unlock(sdp);
unlock_buffer(bh);
@@ -230,16 +201,14 @@ static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) {
- printk(KERN_ERR
- "Attempting to add uninitialised block to journal (inplace block=%lld)\n",
+ pr_err("Attempting to add uninitialised block to journal (inplace block=%lld)\n",
(unsigned long long)bd->bd_bh->b_blocknr);
BUG();
}
gfs2_pin(sdp, bd->bd_bh);
mh->__pad0 = cpu_to_be64(0);
mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
- sdp->sd_log_num_buf++;
- list_add(&bd->bd_list, &sdp->sd_log_le_buf);
+ list_add(&bd->bd_list, &tr->tr_buf);
tr->tr_num_buf_new++;
}
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 6402fb69d71..86d2035ac66 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -7,6 +7,8 @@
* of the GNU General Public License version 2.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
@@ -30,22 +32,27 @@ mempool_t *gfs2_page_pool __read_mostly;
void gfs2_assert_i(struct gfs2_sbd *sdp)
{
- printk(KERN_EMERG "GFS2: fsid=%s: fatal assertion failed\n",
- sdp->sd_fsname);
+ fs_emerg(sdp, "fatal assertion failed\n");
}
-int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
+int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...)
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
const struct lm_lockops *lm = ls->ls_ops;
va_list args;
+ struct va_format vaf;
if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
return 0;
va_start(args, fmt);
- vprintk(fmt, args);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ fs_err(sdp, "%pV", &vaf);
+
va_end(args);
if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) {
@@ -66,7 +73,7 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
}
if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
- panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname);
+ panic("GFS2: fsid=%s: panic requested\n", sdp->sd_fsname);
return -1;
}
@@ -82,10 +89,9 @@ int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
{
int me;
me = gfs2_lm_withdraw(sdp,
- "GFS2: fsid=%s: fatal: assertion \"%s\" failed\n"
- "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
- sdp->sd_fsname, assertion,
- sdp->sd_fsname, function, file, line);
+ "fatal: assertion \"%s\" failed\n"
+ " function = %s, file = %s, line = %u\n",
+ assertion, function, file, line);
dump_stack();
return (me) ? -1 : -2;
}
@@ -105,11 +111,8 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
return -2;
if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW)
- printk(KERN_WARNING
- "GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
- "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
- sdp->sd_fsname, assertion,
- sdp->sd_fsname, function, file, line);
+ fs_warn(sdp, "warning: assertion \"%s\" failed at function = %s, file = %s, line = %u\n",
+ assertion, function, file, line);
if (sdp->sd_args.ar_debug)
BUG();
@@ -138,10 +141,8 @@ int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function,
{
int rv;
rv = gfs2_lm_withdraw(sdp,
- "GFS2: fsid=%s: fatal: filesystem consistency error\n"
- "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
- sdp->sd_fsname,
- sdp->sd_fsname, function, file, line);
+ "fatal: filesystem consistency error - function = %s, file = %s, line = %u\n",
+ function, file, line);
return rv;
}
@@ -157,13 +158,12 @@ int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
int rv;
rv = gfs2_lm_withdraw(sdp,
- "GFS2: fsid=%s: fatal: filesystem consistency error\n"
- "GFS2: fsid=%s: inode = %llu %llu\n"
- "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
- sdp->sd_fsname,
- sdp->sd_fsname, (unsigned long long)ip->i_no_formal_ino,
- (unsigned long long)ip->i_no_addr,
- sdp->sd_fsname, function, file, line);
+ "fatal: filesystem consistency error\n"
+ " inode = %llu %llu\n"
+ " function = %s, file = %s, line = %u\n",
+ (unsigned long long)ip->i_no_formal_ino,
+ (unsigned long long)ip->i_no_addr,
+ function, file, line);
return rv;
}
@@ -179,12 +179,11 @@ int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
struct gfs2_sbd *sdp = rgd->rd_sbd;
int rv;
rv = gfs2_lm_withdraw(sdp,
- "GFS2: fsid=%s: fatal: filesystem consistency error\n"
- "GFS2: fsid=%s: RG = %llu\n"
- "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
- sdp->sd_fsname,
- sdp->sd_fsname, (unsigned long long)rgd->rd_addr,
- sdp->sd_fsname, function, file, line);
+ "fatal: filesystem consistency error\n"
+ " RG = %llu\n"
+ " function = %s, file = %s, line = %u\n",
+ (unsigned long long)rgd->rd_addr,
+ function, file, line);
return rv;
}
@@ -200,12 +199,11 @@ int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
{
int me;
me = gfs2_lm_withdraw(sdp,
- "GFS2: fsid=%s: fatal: invalid metadata block\n"
- "GFS2: fsid=%s: bh = %llu (%s)\n"
- "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
- sdp->sd_fsname,
- sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type,
- sdp->sd_fsname, function, file, line);
+ "fatal: invalid metadata block\n"
+ " bh = %llu (%s)\n"
+ " function = %s, file = %s, line = %u\n",
+ (unsigned long long)bh->b_blocknr, type,
+ function, file, line);
return (me) ? -1 : -2;
}
@@ -221,12 +219,11 @@ int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
{
int me;
me = gfs2_lm_withdraw(sdp,
- "GFS2: fsid=%s: fatal: invalid metadata block\n"
- "GFS2: fsid=%s: bh = %llu (type: exp=%u, found=%u)\n"
- "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
- sdp->sd_fsname,
- sdp->sd_fsname, (unsigned long long)bh->b_blocknr, type, t,
- sdp->sd_fsname, function, file, line);
+ "fatal: invalid metadata block\n"
+ " bh = %llu (type: exp=%u, found=%u)\n"
+ " function = %s, file = %s, line = %u\n",
+ (unsigned long long)bh->b_blocknr, type, t,
+ function, file, line);
return (me) ? -1 : -2;
}
@@ -241,10 +238,9 @@ int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
{
int rv;
rv = gfs2_lm_withdraw(sdp,
- "GFS2: fsid=%s: fatal: I/O error\n"
- "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
- sdp->sd_fsname,
- sdp->sd_fsname, function, file, line);
+ "fatal: I/O error\n"
+ " function = %s, file = %s, line = %u\n",
+ function, file, line);
return rv;
}
@@ -259,32 +255,11 @@ int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
{
int rv;
rv = gfs2_lm_withdraw(sdp,
- "GFS2: fsid=%s: fatal: I/O error\n"
- "GFS2: fsid=%s: block = %llu\n"
- "GFS2: fsid=%s: function = %s, file = %s, line = %u\n",
- sdp->sd_fsname,
- sdp->sd_fsname, (unsigned long long)bh->b_blocknr,
- sdp->sd_fsname, function, file, line);
+ "fatal: I/O error\n"
+ " block = %llu\n"
+ " function = %s, file = %s, line = %u\n",
+ (unsigned long long)bh->b_blocknr,
+ function, file, line);
return rv;
}
-void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
- unsigned int bit, int new_value)
-{
- unsigned int c, o, b = bit;
- int old_value;
-
- c = b / (8 * PAGE_SIZE);
- b %= 8 * PAGE_SIZE;
- o = b / 8;
- b %= 8;
-
- old_value = (bitmap[c][o] & (1 << b));
- gfs2_assert_withdraw(sdp, !old_value != !new_value);
-
- if (new_value)
- bitmap[c][o] |= 1 << b;
- else
- bitmap[c][o] &= ~(1 << b);
-}
-
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 80535739ac7..cbdcbdf3961 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -10,22 +10,23 @@
#ifndef __UTIL_DOT_H__
#define __UTIL_DOT_H__
+#ifdef pr_fmt
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#endif
+
#include <linux/mempool.h>
#include "incore.h"
-#define fs_printk(level, fs, fmt, arg...) \
- printk(level "GFS2: fsid=%s: " fmt , (fs)->sd_fsname , ## arg)
-
-#define fs_info(fs, fmt, arg...) \
- fs_printk(KERN_INFO , fs , fmt , ## arg)
-
-#define fs_warn(fs, fmt, arg...) \
- fs_printk(KERN_WARNING , fs , fmt , ## arg)
-
-#define fs_err(fs, fmt, arg...) \
- fs_printk(KERN_ERR, fs , fmt , ## arg)
-
+#define fs_emerg(fs, fmt, ...) \
+ pr_emerg("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
+#define fs_warn(fs, fmt, ...) \
+ pr_warn("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
+#define fs_err(fs, fmt, ...) \
+ pr_err("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
+#define fs_info(fs, fmt, ...) \
+ pr_info("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
void gfs2_assert_i(struct gfs2_sbd *sdp);
@@ -85,7 +86,7 @@ static inline int gfs2_meta_check(struct gfs2_sbd *sdp,
struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
u32 magic = be32_to_cpu(mh->mh_magic);
if (unlikely(magic != GFS2_MAGIC)) {
- printk(KERN_ERR "GFS2: Magic number missing at %llu\n",
+ pr_err("Magic number missing at %llu\n",
(unsigned long long)bh->b_blocknr);
return -EIO;
}
@@ -164,9 +165,7 @@ static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
#define gfs2_tune_get(sdp, field) \
gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
-void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
- unsigned int bit, int new_value);
-int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...);
+__printf(2, 3)
+int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...);
#endif /* __UTIL_DOT_H__ */
-
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index ecd37f30ab9..0b81f783f78 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -13,6 +13,7 @@
#include <linux/buffer_head.h>
#include <linux/xattr.h>
#include <linux/gfs2_ondisk.h>
+#include <linux/posix_acl_xattr.h>
#include <asm/uaccess.h>
#include "gfs2.h"
@@ -723,6 +724,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
unsigned int blks,
ea_skeleton_call_t skeleton_call, void *private)
{
+ struct gfs2_alloc_parms ap = { .target = blks };
struct buffer_head *dibh;
int error;
@@ -734,7 +736,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
if (error)
return error;
- error = gfs2_inplace_reserve(ip, blks, 0);
+ error = gfs2_inplace_reserve(ip, &ap);
if (error)
goto out_gunlock_q;
@@ -1499,7 +1501,8 @@ static const struct xattr_handler gfs2_xattr_security_handler = {
const struct xattr_handler *gfs2_xattr_handlers[] = {
&gfs2_xattr_user_handler,
&gfs2_xattr_security_handler,
- &gfs2_xattr_system_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
NULL,
};
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index 2a1d712f85d..f6bd266d70b 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -153,11 +153,6 @@ struct hfs_btree_header_rec {
u32 reserved3[16];
} __packed;
-#define HFS_NODE_INDEX 0x00 /* An internal (index) node */
-#define HFS_NODE_HEADER 0x01 /* The tree header node (node 0) */
-#define HFS_NODE_MAP 0x02 /* Holds part of the bitmap of used nodes */
-#define HFS_NODE_LEAF 0xFF /* A leaf (ndNHeight==1) node */
-
#define BTREE_ATTR_BADCLOSE 0x00000001 /* b-tree not closed properly. not
used by hfsplus. */
#define HFS_TREE_BIGKEYS 0x00000002 /* key length is u16 instead of u8.
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 380ab31b5e0..d0929bc8178 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -125,15 +125,15 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
}
static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = file_inode(file)->i_mapping->host;
+ size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
- hfs_get_block);
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, hfs_get_block);
/*
* In case of error extending write may have instantiated a few
@@ -141,7 +141,7 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb,
*/
if (unlikely((rw & WRITE) && ret < 0)) {
loff_t isize = i_size_read(inode);
- loff_t end = offset + iov_length(iov, nr_segs);
+ loff_t end = offset + count;
if (end > isize)
hfs_write_failed(mapping, end);
@@ -547,7 +547,7 @@ out:
void hfs_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) {
HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
@@ -674,10 +674,10 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end,
static const struct file_operations hfs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.splice_read = generic_file_splice_read,
.fsync = hfs_file_fsync,
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 2d2039e754c..eee7206c38d 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -112,6 +112,7 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
static int hfs_remount(struct super_block *sb, int *flags, char *data)
{
+ sync_filesystem(sb);
*flags |= MS_NODIRATIME;
if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
return 0;
diff --git a/fs/hfsplus/acl.h b/fs/hfsplus/acl.h
index 07c0d494752..95c8ed9ec17 100644
--- a/fs/hfsplus/acl.h
+++ b/fs/hfsplus/acl.h
@@ -12,16 +12,13 @@
/* posix_acl.c */
struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type);
-extern int hfsplus_posix_acl_chmod(struct inode *);
+int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl,
+ int type);
extern int hfsplus_init_posix_acl(struct inode *, struct inode *);
#else /* CONFIG_HFSPLUS_FS_POSIX_ACL */
#define hfsplus_get_posix_acl NULL
-
-static inline int hfsplus_posix_acl_chmod(struct inode *inode)
-{
- return 0;
-}
+#define hfsplus_set_posix_acl NULL
static inline int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir)
{
diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
index 0f47890299c..e5b221de7de 100644
--- a/fs/hfsplus/attributes.c
+++ b/fs/hfsplus/attributes.c
@@ -11,7 +11,7 @@
static struct kmem_cache *hfsplus_attr_tree_cachep;
-int hfsplus_create_attr_tree_cache(void)
+int __init hfsplus_create_attr_tree_cache(void)
{
if (hfsplus_attr_tree_cachep)
return -EEXIST;
@@ -54,14 +54,11 @@ int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key,
memset(key, 0, sizeof(struct hfsplus_attr_key));
key->attr.cnid = cpu_to_be32(cnid);
if (name) {
- len = strlen(name);
- if (len > HFSPLUS_ATTR_MAX_STRLEN) {
- pr_err("invalid xattr name's length\n");
- return -EINVAL;
- }
- hfsplus_asc2uni(sb,
+ int res = hfsplus_asc2uni(sb,
(struct hfsplus_unistr *)&key->attr.key_name,
- HFSPLUS_ATTR_MAX_STRLEN, name, len);
+ HFSPLUS_ATTR_MAX_STRLEN, name, strlen(name));
+ if (res)
+ return res;
len = be16_to_cpu(key->attr.key_name.length);
} else {
key->attr.key_name.length = 0;
@@ -82,31 +79,6 @@ int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key,
return 0;
}
-void hfsplus_attr_build_key_uni(hfsplus_btree_key *key,
- u32 cnid,
- struct hfsplus_attr_unistr *name)
-{
- int ustrlen;
-
- memset(key, 0, sizeof(struct hfsplus_attr_key));
- ustrlen = be16_to_cpu(name->length);
- key->attr.cnid = cpu_to_be32(cnid);
- key->attr.key_name.length = cpu_to_be16(ustrlen);
- ustrlen *= 2;
- memcpy(key->attr.key_name.unicode, name->unicode, ustrlen);
-
- /* The length of the key, as stored in key_len field, does not include
- * the size of the key_len field itself.
- * So, offsetof(hfsplus_attr_key, key_name) is a trick because
- * it takes into consideration key_len field (__be16) of
- * hfsplus_attr_key structure instead of length field (__be16) of
- * hfsplus_attr_unistr structure.
- */
- key->key_len =
- cpu_to_be16(offsetof(struct hfsplus_attr_key, key_name) +
- ustrlen);
-}
-
hfsplus_attr_entry *hfsplus_alloc_attr_entry(void)
{
return kmem_cache_alloc(hfsplus_attr_tree_cachep, GFP_KERNEL);
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 11c86020452..759708fd933 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -27,13 +27,13 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
pagep = node->page + (off >> PAGE_CACHE_SHIFT);
off &= ~PAGE_CACHE_MASK;
- l = min(len, (int)PAGE_CACHE_SIZE - off);
+ l = min_t(int, len, PAGE_CACHE_SIZE - off);
memcpy(buf, kmap(*pagep) + off, l);
kunmap(*pagep);
while ((len -= l) != 0) {
buf += l;
- l = min(len, (int)PAGE_CACHE_SIZE);
+ l = min_t(int, len, PAGE_CACHE_SIZE);
memcpy(buf, kmap(*++pagep), l);
kunmap(*pagep);
}
@@ -80,14 +80,14 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len)
pagep = node->page + (off >> PAGE_CACHE_SHIFT);
off &= ~PAGE_CACHE_MASK;
- l = min(len, (int)PAGE_CACHE_SIZE - off);
+ l = min_t(int, len, PAGE_CACHE_SIZE - off);
memcpy(kmap(*pagep) + off, buf, l);
set_page_dirty(*pagep);
kunmap(*pagep);
while ((len -= l) != 0) {
buf += l;
- l = min(len, (int)PAGE_CACHE_SIZE);
+ l = min_t(int, len, PAGE_CACHE_SIZE);
memcpy(kmap(*++pagep), buf, l);
set_page_dirty(*pagep);
kunmap(*pagep);
@@ -110,13 +110,13 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len)
pagep = node->page + (off >> PAGE_CACHE_SHIFT);
off &= ~PAGE_CACHE_MASK;
- l = min(len, (int)PAGE_CACHE_SIZE - off);
+ l = min_t(int, len, PAGE_CACHE_SIZE - off);
memset(kmap(*pagep) + off, 0, l);
set_page_dirty(*pagep);
kunmap(*pagep);
while ((len -= l) != 0) {
- l = min(len, (int)PAGE_CACHE_SIZE);
+ l = min_t(int, len, PAGE_CACHE_SIZE);
memset(kmap(*++pagep), 0, l);
set_page_dirty(*pagep);
kunmap(*pagep);
@@ -142,14 +142,14 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
dst &= ~PAGE_CACHE_MASK;
if (src == dst) {
- l = min(len, (int)PAGE_CACHE_SIZE - src);
+ l = min_t(int, len, PAGE_CACHE_SIZE - src);
memcpy(kmap(*dst_page) + src, kmap(*src_page) + src, l);
kunmap(*src_page);
set_page_dirty(*dst_page);
kunmap(*dst_page);
while ((len -= l) != 0) {
- l = min(len, (int)PAGE_CACHE_SIZE);
+ l = min_t(int, len, PAGE_CACHE_SIZE);
memcpy(kmap(*++dst_page), kmap(*++src_page), l);
kunmap(*src_page);
set_page_dirty(*dst_page);
@@ -251,7 +251,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
dst &= ~PAGE_CACHE_MASK;
if (src == dst) {
- l = min(len, (int)PAGE_CACHE_SIZE - src);
+ l = min_t(int, len, PAGE_CACHE_SIZE - src);
memmove(kmap(*dst_page) + src,
kmap(*src_page) + src, l);
kunmap(*src_page);
@@ -259,7 +259,7 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len)
kunmap(*dst_page);
while ((len -= l) != 0) {
- l = min(len, (int)PAGE_CACHE_SIZE);
+ l = min_t(int, len, PAGE_CACHE_SIZE);
memmove(kmap(*++dst_page),
kmap(*++src_page), l);
kunmap(*src_page);
@@ -386,9 +386,8 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid)
struct hfs_bnode *node;
if (cnid >= tree->node_count) {
- pr_err("request for non-existent node "
- "%d in B*Tree\n",
- cnid);
+ pr_err("request for non-existent node %d in B*Tree\n",
+ cnid);
return NULL;
}
@@ -409,9 +408,8 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
loff_t off;
if (cnid >= tree->node_count) {
- pr_err("request for non-existent node "
- "%d in B*Tree\n",
- cnid);
+ pr_err("request for non-existent node %d in B*Tree\n",
+ cnid);
return NULL;
}
@@ -602,7 +600,7 @@ struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num)
pagep = node->page;
memset(kmap(*pagep) + node->page_offset, 0,
- min((int)PAGE_CACHE_SIZE, (int)tree->node_size));
+ min_t(int, PAGE_CACHE_SIZE, tree->node_size));
set_page_dirty(*pagep);
kunmap(*pagep);
for (i = 1; i < tree->pages_per_bnode; i++) {
@@ -648,8 +646,8 @@ void hfs_bnode_put(struct hfs_bnode *node)
if (test_bit(HFS_BNODE_DELETED, &node->flags)) {
hfs_bnode_unhash(node);
spin_unlock(&tree->hash_lock);
- hfs_bnode_clear(node, 0,
- PAGE_CACHE_SIZE * tree->pages_per_bnode);
+ if (hfs_bnode_need_zeroout(tree))
+ hfs_bnode_clear(node, 0, tree->node_size);
hfs_bmap_free(node);
hfs_bnode_free(node);
return;
@@ -658,3 +656,16 @@ void hfs_bnode_put(struct hfs_bnode *node)
}
}
+/*
+ * Unused nodes have to be zeroed if this is the catalog tree and
+ * a corresponding flag in the volume header is set.
+ */
+bool hfs_bnode_need_zeroout(struct hfs_btree *tree)
+{
+ struct super_block *sb = tree->inode->i_sb;
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+ const u32 volume_attr = be32_to_cpu(sbi->s_vhdr->attributes);
+
+ return tree->cnid == HFSPLUS_CAT_CNID &&
+ volume_attr & HFSPLUS_VOL_UNUSED_NODE_FIX;
+}
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index 0c6540c9116..3345c7553ed 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -15,6 +15,118 @@
#include "hfsplus_fs.h"
#include "hfsplus_raw.h"
+/*
+ * Initial source code of clump size calculation is gotten
+ * from http://opensource.apple.com/tarballs/diskdev_cmds/
+ */
+#define CLUMP_ENTRIES 15
+
+static short clumptbl[CLUMP_ENTRIES * 3] = {
+/*
+ * Volume Attributes Catalog Extents
+ * Size Clump (MB) Clump (MB) Clump (MB)
+ */
+ /* 1GB */ 4, 4, 4,
+ /* 2GB */ 6, 6, 4,
+ /* 4GB */ 8, 8, 4,
+ /* 8GB */ 11, 11, 5,
+ /*
+ * For volumes 16GB and larger, we want to make sure that a full OS
+ * install won't require fragmentation of the Catalog or Attributes
+ * B-trees. We do this by making the clump sizes sufficiently large,
+ * and by leaving a gap after the B-trees for them to grow into.
+ *
+ * For SnowLeopard 10A298, a FullNetInstall with all packages selected
+ * results in:
+ * Catalog B-tree Header
+ * nodeSize: 8192
+ * totalNodes: 31616
+ * freeNodes: 1978
+ * (used = 231.55 MB)
+ * Attributes B-tree Header
+ * nodeSize: 8192
+ * totalNodes: 63232
+ * freeNodes: 958
+ * (used = 486.52 MB)
+ *
+ * We also want Time Machine backup volumes to have a sufficiently
+ * large clump size to reduce fragmentation.
+ *
+ * The series of numbers for Catalog and Attribute form a geometric
+ * series. For Catalog (16GB to 512GB), each term is 8**(1/5) times
+ * the previous term. For Attributes (16GB to 512GB), each term is
+ * 4**(1/5) times the previous term. For 1TB to 16TB, each term is
+ * 2**(1/5) times the previous term.
+ */
+ /* 16GB */ 64, 32, 5,
+ /* 32GB */ 84, 49, 6,
+ /* 64GB */ 111, 74, 7,
+ /* 128GB */ 147, 111, 8,
+ /* 256GB */ 194, 169, 9,
+ /* 512GB */ 256, 256, 11,
+ /* 1TB */ 294, 294, 14,
+ /* 2TB */ 338, 338, 16,
+ /* 4TB */ 388, 388, 20,
+ /* 8TB */ 446, 446, 25,
+ /* 16TB */ 512, 512, 32
+};
+
+u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size,
+ u64 sectors, int file_id)
+{
+ u32 mod = max(node_size, block_size);
+ u32 clump_size;
+ int column;
+ int i;
+
+ /* Figure out which column of the above table to use for this file. */
+ switch (file_id) {
+ case HFSPLUS_ATTR_CNID:
+ column = 0;
+ break;
+ case HFSPLUS_CAT_CNID:
+ column = 1;
+ break;
+ default:
+ column = 2;
+ break;
+ }
+
+ /*
+ * The default clump size is 0.8% of the volume size. And
+ * it must also be a multiple of the node and block size.
+ */
+ if (sectors < 0x200000) {
+ clump_size = sectors << 2; /* 0.8 % */
+ if (clump_size < (8 * node_size))
+ clump_size = 8 * node_size;
+ } else {
+ /* turn exponent into table index... */
+ for (i = 0, sectors = sectors >> 22;
+ sectors && (i < CLUMP_ENTRIES - 1);
+ ++i, sectors = sectors >> 1) {
+ /* empty body */
+ }
+
+ clump_size = clumptbl[column + (i) * 3] * 1024 * 1024;
+ }
+
+ /*
+ * Round the clump size to a multiple of node and block size.
+ * NOTE: This rounds down.
+ */
+ clump_size /= mod;
+ clump_size *= mod;
+
+ /*
+ * Rounding down could have rounded down to 0 if the block size was
+ * greater than the clump size. If so, just use one block or node.
+ */
+ if (clump_size == 0)
+ clump_size = mod;
+
+ return clump_size;
+}
/* Get a reference to a B*Tree and do some initial checks */
struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
@@ -246,7 +358,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
u32 count;
int res;
- res = hfsplus_file_extend(inode);
+ res = hfsplus_file_extend(inode, hfs_bnode_need_zeroout(tree));
if (res)
return ERR_PTR(res);
hip->phys_size = inode->i_size =
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index 968ce411db5..32602c667b4 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -103,6 +103,8 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry,
folder = &entry->folder;
memset(folder, 0, sizeof(*folder));
folder->type = cpu_to_be16(HFSPLUS_FOLDER);
+ if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags))
+ folder->flags |= cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT);
folder->id = cpu_to_be32(inode->i_ino);
HFSPLUS_I(inode)->create_date =
folder->create_date =
@@ -203,6 +205,36 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
return hfs_brec_find(fd, hfs_find_rec_by_key);
}
+static void hfsplus_subfolders_inc(struct inode *dir)
+{
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
+
+ if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
+ /*
+ * Increment subfolder count. Note, the value is only meaningful
+ * for folders with HFSPLUS_HAS_FOLDER_COUNT flag set.
+ */
+ HFSPLUS_I(dir)->subfolders++;
+ }
+}
+
+static void hfsplus_subfolders_dec(struct inode *dir)
+{
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
+
+ if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
+ /*
+ * Decrement subfolder count. Note, the value is only meaningful
+ * for folders with HFSPLUS_HAS_FOLDER_COUNT flag set.
+ *
+ * Check for zero. Some subfolders may have been created
+ * by an implementation ignorant of this counter.
+ */
+ if (HFSPLUS_I(dir)->subfolders)
+ HFSPLUS_I(dir)->subfolders--;
+ }
+}
+
int hfsplus_create_cat(u32 cnid, struct inode *dir,
struct qstr *str, struct inode *inode)
{
@@ -247,6 +279,8 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir,
goto err1;
dir->i_size++;
+ if (S_ISDIR(inode->i_mode))
+ hfsplus_subfolders_inc(dir);
dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
@@ -336,6 +370,8 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
goto out;
dir->i_size--;
+ if (type == HFSPLUS_FOLDER)
+ hfsplus_subfolders_dec(dir);
dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY);
@@ -380,6 +416,7 @@ int hfsplus_rename_cat(u32 cnid,
hfs_bnode_read(src_fd.bnode, &entry, src_fd.entryoffset,
src_fd.entrylength);
+ type = be16_to_cpu(entry.type);
/* create new dir entry with the data from the old entry */
hfsplus_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name);
@@ -394,6 +431,8 @@ int hfsplus_rename_cat(u32 cnid,
if (err)
goto out;
dst_dir->i_size++;
+ if (type == HFSPLUS_FOLDER)
+ hfsplus_subfolders_inc(dst_dir);
dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC;
/* finally remove the old entry */
@@ -405,6 +444,8 @@ int hfsplus_rename_cat(u32 cnid,
if (err)
goto out;
src_dir->i_size--;
+ if (type == HFSPLUS_FOLDER)
+ hfsplus_subfolders_dec(src_dir);
src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC;
/* remove old thread entry */
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 4a4fea00267..610a3260bef 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -12,6 +12,7 @@
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/random.h>
+#include <linux/nls.h>
#include "hfsplus_fs.h"
#include "hfsplus_raw.h"
@@ -127,7 +128,7 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx)
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
int len, err;
- char strbuf[HFSPLUS_MAX_STRLEN + 1];
+ char *strbuf;
hfsplus_cat_entry entry;
struct hfs_find_data fd;
struct hfsplus_readdir_data *rd;
@@ -139,6 +140,11 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx)
err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
if (err)
return err;
+ strbuf = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN + 1, GFP_KERNEL);
+ if (!strbuf) {
+ err = -ENOMEM;
+ goto out;
+ }
hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
err = hfs_brec_find(&fd, hfs_find_rec_by_key);
if (err)
@@ -193,7 +199,7 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx)
hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
fd.entrylength);
type = be16_to_cpu(entry.type);
- len = HFSPLUS_MAX_STRLEN;
+ len = NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN;
err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len);
if (err)
goto out;
@@ -212,13 +218,31 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx)
be32_to_cpu(entry.folder.id), DT_DIR))
break;
} else if (type == HFSPLUS_FILE) {
+ u16 mode;
+ unsigned type = DT_UNKNOWN;
+
if (fd.entrylength < sizeof(struct hfsplus_cat_file)) {
pr_err("small file entry\n");
err = -EIO;
goto out;
}
+
+ mode = be16_to_cpu(entry.file.permissions.mode);
+ if (S_ISREG(mode))
+ type = DT_REG;
+ else if (S_ISLNK(mode))
+ type = DT_LNK;
+ else if (S_ISFIFO(mode))
+ type = DT_FIFO;
+ else if (S_ISCHR(mode))
+ type = DT_CHR;
+ else if (S_ISBLK(mode))
+ type = DT_BLK;
+ else if (S_ISSOCK(mode))
+ type = DT_SOCK;
+
if (!dir_emit(ctx, strbuf, len,
- be32_to_cpu(entry.file.id), DT_REG))
+ be32_to_cpu(entry.file.id), type))
break;
} else {
pr_err("bad catalog entry type\n");
@@ -246,6 +270,7 @@ next:
}
memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
out:
+ kfree(strbuf);
hfs_find_exit(&fd);
return err;
}
@@ -529,9 +554,10 @@ const struct inode_operations hfsplus_dir_inode_operations = {
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = hfsplus_listxattr,
- .removexattr = hfsplus_removexattr,
+ .removexattr = generic_removexattr,
#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
.get_acl = hfsplus_get_posix_acl,
+ .set_acl = hfsplus_set_posix_acl,
#endif
};
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index fbb212fbb1e..feca524ce2a 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -227,17 +227,15 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
u32 ablock, dblock, mask;
sector_t sector;
int was_dirty = 0;
- int shift;
/* Convert inode block to disk allocation block */
- shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
ablock = iblock >> sbi->fs_shift;
if (iblock >= hip->fs_blocks) {
if (iblock > hip->fs_blocks || !create)
return -EIO;
if (ablock >= hip->alloc_blocks) {
- res = hfsplus_file_extend(inode);
+ res = hfsplus_file_extend(inode, false);
if (res)
return res;
}
@@ -427,7 +425,7 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid,
return res;
}
-int hfsplus_file_extend(struct inode *inode)
+int hfsplus_file_extend(struct inode *inode, bool zeroout)
{
struct super_block *sb = inode->i_sb;
struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
@@ -438,10 +436,9 @@ int hfsplus_file_extend(struct inode *inode)
if (sbi->alloc_file->i_size * 8 <
sbi->total_blocks - sbi->free_blocks + 8) {
/* extend alloc file */
- pr_err("extend alloc file! "
- "(%llu,%u,%u)\n",
- sbi->alloc_file->i_size * 8,
- sbi->total_blocks, sbi->free_blocks);
+ pr_err("extend alloc file! (%llu,%u,%u)\n",
+ sbi->alloc_file->i_size * 8,
+ sbi->total_blocks, sbi->free_blocks);
return -ENOSPC;
}
@@ -465,6 +462,12 @@ int hfsplus_file_extend(struct inode *inode)
}
}
+ if (zeroout) {
+ res = sb_issue_zeroout(sb, start, len, GFP_NOFS);
+ if (res)
+ goto out;
+ }
+
hfs_dbg(EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
if (hip->alloc_blocks <= hip->first_blocks) {
@@ -498,11 +501,13 @@ int hfsplus_file_extend(struct inode *inode)
goto insert_extent;
}
out:
- mutex_unlock(&hip->extents_lock);
if (!res) {
hip->alloc_blocks += len;
+ mutex_unlock(&hip->extents_lock);
hfsplus_mark_inode_dirty(inode, HFSPLUS_I_ALLOC_DIRTY);
+ return 0;
}
+ mutex_unlock(&hip->extents_lock);
return res;
insert_extent:
@@ -556,11 +561,13 @@ void hfsplus_file_truncate(struct inode *inode)
blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >>
HFSPLUS_SB(sb)->alloc_blksz_shift;
+
+ mutex_lock(&hip->extents_lock);
+
alloc_cnt = hip->alloc_blocks;
if (blk_cnt == alloc_cnt)
- goto out;
+ goto out_unlock;
- mutex_lock(&hip->extents_lock);
res = hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
if (res) {
mutex_unlock(&hip->extents_lock);
@@ -592,10 +599,10 @@ void hfsplus_file_truncate(struct inode *inode)
hfs_brec_remove(&fd);
}
hfs_find_exit(&fd);
- mutex_unlock(&hip->extents_lock);
hip->alloc_blocks = blk_cnt;
-out:
+out_unlock:
+ mutex_unlock(&hip->extents_lock);
hip->phys_size = inode->i_size;
hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >>
sb->s_blocksize_bits;
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 2b9cd01696e..eb5e059f481 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -127,6 +127,14 @@ struct hfs_bnode {
#define HFS_BNODE_DELETED 4
/*
+ * Attributes file states
+ */
+#define HFSPLUS_EMPTY_ATTR_TREE 0
+#define HFSPLUS_CREATING_ATTR_TREE 1
+#define HFSPLUS_VALID_ATTR_TREE 2
+#define HFSPLUS_FAILED_ATTR_TREE 3
+
+/*
* HFS+ superblock info (built from Volume Header on disk)
*/
@@ -141,6 +149,7 @@ struct hfsplus_sb_info {
struct hfs_btree *ext_tree;
struct hfs_btree *cat_tree;
struct hfs_btree *attr_tree;
+ atomic_t attr_tree_state;
struct inode *alloc_file;
struct inode *hidden_dir;
struct nls_table *nls;
@@ -233,6 +242,7 @@ struct hfsplus_inode_info {
*/
sector_t fs_blocks;
u8 userflags; /* BSD user file flags */
+ u32 subfolders; /* Subfolder count (HFSX only) */
struct list_head open_dir_list;
loff_t phys_size;
@@ -357,115 +367,121 @@ typedef int (*search_strategy_t)(struct hfs_bnode *,
*/
/* attributes.c */
-int hfsplus_create_attr_tree_cache(void);
+int __init hfsplus_create_attr_tree_cache(void);
void hfsplus_destroy_attr_tree_cache(void);
+int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *k1,
+ const hfsplus_btree_key *k2);
+int hfsplus_attr_build_key(struct super_block *sb, hfsplus_btree_key *key,
+ u32 cnid, const char *name);
hfsplus_attr_entry *hfsplus_alloc_attr_entry(void);
-void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry_p);
-int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *,
- const hfsplus_btree_key *);
-int hfsplus_attr_build_key(struct super_block *, hfsplus_btree_key *,
- u32, const char *);
-void hfsplus_attr_build_key_uni(hfsplus_btree_key *key,
- u32 cnid,
- struct hfsplus_attr_unistr *name);
-int hfsplus_find_attr(struct super_block *, u32,
- const char *, struct hfs_find_data *);
+void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry);
+int hfsplus_find_attr(struct super_block *sb, u32 cnid, const char *name,
+ struct hfs_find_data *fd);
int hfsplus_attr_exists(struct inode *inode, const char *name);
-int hfsplus_create_attr(struct inode *, const char *, const void *, size_t);
-int hfsplus_delete_attr(struct inode *, const char *);
+int hfsplus_create_attr(struct inode *inode, const char *name,
+ const void *value, size_t size);
+int hfsplus_delete_attr(struct inode *inode, const char *name);
int hfsplus_delete_all_attrs(struct inode *dir, u32 cnid);
/* bitmap.c */
-int hfsplus_block_allocate(struct super_block *, u32, u32, u32 *);
-int hfsplus_block_free(struct super_block *, u32, u32);
+int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset,
+ u32 *max);
+int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count);
/* btree.c */
-struct hfs_btree *hfs_btree_open(struct super_block *, u32);
-void hfs_btree_close(struct hfs_btree *);
-int hfs_btree_write(struct hfs_btree *);
-struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *);
-void hfs_bmap_free(struct hfs_bnode *);
+u32 hfsplus_calc_btree_clump_size(u32 block_size, u32 node_size, u64 sectors,
+ int file_id);
+struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id);
+void hfs_btree_close(struct hfs_btree *tree);
+int hfs_btree_write(struct hfs_btree *tree);
+struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree);
+void hfs_bmap_free(struct hfs_bnode *node);
/* bnode.c */
-void hfs_bnode_read(struct hfs_bnode *, void *, int, int);
-u16 hfs_bnode_read_u16(struct hfs_bnode *, int);
-u8 hfs_bnode_read_u8(struct hfs_bnode *, int);
-void hfs_bnode_read_key(struct hfs_bnode *, void *, int);
-void hfs_bnode_write(struct hfs_bnode *, void *, int, int);
-void hfs_bnode_write_u16(struct hfs_bnode *, int, u16);
-void hfs_bnode_clear(struct hfs_bnode *, int, int);
-void hfs_bnode_copy(struct hfs_bnode *, int,
- struct hfs_bnode *, int, int);
-void hfs_bnode_move(struct hfs_bnode *, int, int, int);
-void hfs_bnode_dump(struct hfs_bnode *);
-void hfs_bnode_unlink(struct hfs_bnode *);
-struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *, u32);
-struct hfs_bnode *hfs_bnode_find(struct hfs_btree *, u32);
-void hfs_bnode_unhash(struct hfs_bnode *);
-void hfs_bnode_free(struct hfs_bnode *);
-struct hfs_bnode *hfs_bnode_create(struct hfs_btree *, u32);
-void hfs_bnode_get(struct hfs_bnode *);
-void hfs_bnode_put(struct hfs_bnode *);
+void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len);
+u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off);
+u8 hfs_bnode_read_u8(struct hfs_bnode *node, int off);
+void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off);
+void hfs_bnode_write(struct hfs_bnode *node, void *buf, int off, int len);
+void hfs_bnode_write_u16(struct hfs_bnode *node, int off, u16 data);
+void hfs_bnode_clear(struct hfs_bnode *node, int off, int len);
+void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
+ struct hfs_bnode *src_node, int src, int len);
+void hfs_bnode_move(struct hfs_bnode *node, int dst, int src, int len);
+void hfs_bnode_dump(struct hfs_bnode *node);
+void hfs_bnode_unlink(struct hfs_bnode *node);
+struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid);
+void hfs_bnode_unhash(struct hfs_bnode *node);
+struct hfs_bnode *hfs_bnode_find(struct hfs_btree *tree, u32 num);
+void hfs_bnode_free(struct hfs_bnode *node);
+struct hfs_bnode *hfs_bnode_create(struct hfs_btree *tree, u32 num);
+void hfs_bnode_get(struct hfs_bnode *node);
+void hfs_bnode_put(struct hfs_bnode *node);
+bool hfs_bnode_need_zeroout(struct hfs_btree *tree);
/* brec.c */
-u16 hfs_brec_lenoff(struct hfs_bnode *, u16, u16 *);
-u16 hfs_brec_keylen(struct hfs_bnode *, u16);
-int hfs_brec_insert(struct hfs_find_data *, void *, int);
-int hfs_brec_remove(struct hfs_find_data *);
+u16 hfs_brec_lenoff(struct hfs_bnode *node, u16 rec, u16 *off);
+u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec);
+int hfs_brec_insert(struct hfs_find_data *fd, void *entry, int entry_len);
+int hfs_brec_remove(struct hfs_find_data *fd);
/* bfind.c */
-int hfs_find_init(struct hfs_btree *, struct hfs_find_data *);
-void hfs_find_exit(struct hfs_find_data *);
-int hfs_find_1st_rec_by_cnid(struct hfs_bnode *,
- struct hfs_find_data *,
- int *, int *, int *);
-int hfs_find_rec_by_key(struct hfs_bnode *,
- struct hfs_find_data *,
- int *, int *, int *);
-int __hfs_brec_find(struct hfs_bnode *, struct hfs_find_data *,
- search_strategy_t);
-int hfs_brec_find(struct hfs_find_data *, search_strategy_t);
-int hfs_brec_read(struct hfs_find_data *, void *, int);
-int hfs_brec_goto(struct hfs_find_data *, int);
+int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd);
+void hfs_find_exit(struct hfs_find_data *fd);
+int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, struct hfs_find_data *fd,
+ int *begin, int *end, int *cur_rec);
+int hfs_find_rec_by_key(struct hfs_bnode *bnode, struct hfs_find_data *fd,
+ int *begin, int *end, int *cur_rec);
+int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd,
+ search_strategy_t rec_found);
+int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare);
+int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len);
+int hfs_brec_goto(struct hfs_find_data *fd, int cnt);
/* catalog.c */
-int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *,
- const hfsplus_btree_key *);
-int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *,
- const hfsplus_btree_key *);
-void hfsplus_cat_build_key(struct super_block *sb,
- hfsplus_btree_key *, u32, struct qstr *);
-int hfsplus_find_cat(struct super_block *, u32, struct hfs_find_data *);
-int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *);
-int hfsplus_delete_cat(u32, struct inode *, struct qstr *);
-int hfsplus_rename_cat(u32, struct inode *, struct qstr *,
- struct inode *, struct qstr *);
+int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *k1,
+ const hfsplus_btree_key *k2);
+int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1,
+ const hfsplus_btree_key *k2);
+void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key,
+ u32 parent, struct qstr *str);
void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms);
+int hfsplus_find_cat(struct super_block *sb, u32 cnid,
+ struct hfs_find_data *fd);
+int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str,
+ struct inode *inode);
+int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str);
+int hfsplus_rename_cat(u32 cnid, struct inode *src_dir, struct qstr *src_name,
+ struct inode *dst_dir, struct qstr *dst_name);
/* dir.c */
extern const struct inode_operations hfsplus_dir_inode_operations;
extern const struct file_operations hfsplus_dir_operations;
/* extents.c */
-int hfsplus_ext_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *);
-int hfsplus_ext_write_extent(struct inode *);
-int hfsplus_get_block(struct inode *, sector_t, struct buffer_head *, int);
-int hfsplus_free_fork(struct super_block *, u32,
- struct hfsplus_fork_raw *, int);
-int hfsplus_file_extend(struct inode *);
-void hfsplus_file_truncate(struct inode *);
+int hfsplus_ext_cmp_key(const hfsplus_btree_key *k1,
+ const hfsplus_btree_key *k2);
+int hfsplus_ext_write_extent(struct inode *inode);
+int hfsplus_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
+int hfsplus_free_fork(struct super_block *sb, u32 cnid,
+ struct hfsplus_fork_raw *fork, int type);
+int hfsplus_file_extend(struct inode *inode, bool zeroout);
+void hfsplus_file_truncate(struct inode *inode);
/* inode.c */
extern const struct address_space_operations hfsplus_aops;
extern const struct address_space_operations hfsplus_btree_aops;
extern const struct dentry_operations hfsplus_dentry_operations;
-void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *);
-void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *);
-int hfsplus_cat_read_inode(struct inode *, struct hfs_find_data *);
-int hfsplus_cat_write_inode(struct inode *);
-struct inode *hfsplus_new_inode(struct super_block *, umode_t);
-void hfsplus_delete_inode(struct inode *);
+struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode);
+void hfsplus_delete_inode(struct inode *inode);
+void hfsplus_inode_read_fork(struct inode *inode,
+ struct hfsplus_fork_raw *fork);
+void hfsplus_inode_write_fork(struct inode *inode,
+ struct hfsplus_fork_raw *fork);
+int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd);
+int hfsplus_cat_write_inode(struct inode *inode);
int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
@@ -473,13 +489,17 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
/* options.c */
-int hfsplus_parse_options(char *, struct hfsplus_sb_info *);
+void hfsplus_fill_defaults(struct hfsplus_sb_info *opts);
int hfsplus_parse_options_remount(char *input, int *force);
-void hfsplus_fill_defaults(struct hfsplus_sb_info *);
-int hfsplus_show_options(struct seq_file *, struct dentry *);
+int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi);
+int hfsplus_show_options(struct seq_file *seq, struct dentry *root);
+
+/* part_tbl.c */
+int hfs_part_find(struct super_block *sb, sector_t *part_start,
+ sector_t *part_size);
/* super.c */
-struct inode *hfsplus_iget(struct super_block *, unsigned long);
+struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino);
void hfsplus_mark_mdb_dirty(struct super_block *sb);
/* tables.c */
@@ -488,23 +508,23 @@ extern u16 hfsplus_decompose_table[];
extern u16 hfsplus_compose_table[];
/* unicode.c */
-int hfsplus_strcasecmp(const struct hfsplus_unistr *,
- const struct hfsplus_unistr *);
-int hfsplus_strcmp(const struct hfsplus_unistr *,
- const struct hfsplus_unistr *);
-int hfsplus_uni2asc(struct super_block *,
- const struct hfsplus_unistr *, char *, int *);
-int hfsplus_asc2uni(struct super_block *,
- struct hfsplus_unistr *, int, const char *, int);
+int hfsplus_strcasecmp(const struct hfsplus_unistr *s1,
+ const struct hfsplus_unistr *s2);
+int hfsplus_strcmp(const struct hfsplus_unistr *s1,
+ const struct hfsplus_unistr *s2);
+int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr,
+ char *astr, int *len_p);
+int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
+ int max_unistr_len, const char *astr, int len);
int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str);
-int hfsplus_compare_dentry(const struct dentry *parent, const struct dentry *dentry,
- unsigned int len, const char *str, const struct qstr *name);
+int hfsplus_compare_dentry(const struct dentry *parent,
+ const struct dentry *dentry, unsigned int len,
+ const char *str, const struct qstr *name);
/* wrapper.c */
-int hfsplus_read_wrapper(struct super_block *);
-int hfs_part_find(struct super_block *, sector_t *, sector_t *);
-int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
- void *buf, void **data, int rw);
+int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf,
+ void **data, int rw);
+int hfsplus_read_wrapper(struct super_block *sb);
/* time macros */
#define __hfsp_mt2ut(t) (be32_to_cpu(t) - 2082844800U)
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index 452ede01b03..8298d0985f8 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -144,6 +144,7 @@ struct hfsplus_vh {
#define HFSPLUS_VOL_NODEID_REUSED (1 << 12)
#define HFSPLUS_VOL_JOURNALED (1 << 13)
#define HFSPLUS_VOL_SOFTLOCK (1 << 15)
+#define HFSPLUS_VOL_UNUSED_NODE_FIX (1 << 31)
/* HFS+ BTree node descriptor */
struct hfs_bnode_desc {
@@ -156,10 +157,10 @@ struct hfs_bnode_desc {
} __packed;
/* HFS+ BTree node types */
-#define HFS_NODE_INDEX 0x00
-#define HFS_NODE_HEADER 0x01
-#define HFS_NODE_MAP 0x02
-#define HFS_NODE_LEAF 0xFF
+#define HFS_NODE_INDEX 0x00 /* An internal (index) node */
+#define HFS_NODE_HEADER 0x01 /* The tree header node (node 0) */
+#define HFS_NODE_MAP 0x02 /* Holds part of the bitmap of used nodes */
+#define HFS_NODE_LEAF 0xFF /* A leaf (ndNHeight==1) node */
/* HFS+ BTree header */
struct hfs_btree_header_rec {
@@ -187,6 +188,9 @@ struct hfs_btree_header_rec {
/* HFS+ BTree misc info */
#define HFSPLUS_TREE_HEAD 0
#define HFSPLUS_NODE_MXSZ 32768
+#define HFSPLUS_ATTR_TREE_NODE_SIZE 8192
+#define HFSPLUS_BTREE_HDR_NODE_RECS_COUNT 3
+#define HFSPLUS_BTREE_HDR_USER_BYTES 128
/* Some special File ID numbers (stolen from hfs.h) */
#define HFSPLUS_POR_CNID 1 /* Parent Of the Root */
@@ -258,7 +262,7 @@ struct hfsplus_cat_folder {
struct DInfo user_info;
struct DXInfo finder_info;
__be32 text_encoding;
- u32 reserved;
+ __be32 subfolders; /* Subfolder count in HFSX. Reserved in HFS+. */
} __packed;
/* HFS file info (stolen from hfs.h) */
@@ -298,11 +302,13 @@ struct hfsplus_cat_file {
struct hfsplus_fork_raw rsrc_fork;
} __packed;
-/* File attribute bits */
+/* File and folder flag bits */
#define HFSPLUS_FILE_LOCKED 0x0001
#define HFSPLUS_FILE_THREAD_EXISTS 0x0002
#define HFSPLUS_XATTR_EXISTS 0x0004
#define HFSPLUS_ACL_EXISTS 0x0008
+#define HFSPLUS_HAS_FOLDER_COUNT 0x0010 /* Folder has subfolder count
+ * (HFSX only) */
/* HFS+ catalog thread (part of a cat_entry) */
struct hfsplus_cat_thread {
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 37213d075f3..0cf786f2d04 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -123,14 +123,15 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
}
static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = file_inode(file)->i_mapping->host;
+ size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
hfsplus_get_block);
/*
@@ -139,7 +140,7 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb,
*/
if (unlikely((rw & WRITE) && ret < 0)) {
loff_t isize = i_size_read(inode);
- loff_t end = offset + iov_length(iov, nr_segs);
+ loff_t end = offset + count;
if (end > isize)
hfsplus_write_failed(mapping, end);
@@ -178,64 +179,6 @@ const struct dentry_operations hfsplus_dentry_operations = {
.d_compare = hfsplus_compare_dentry,
};
-static struct dentry *hfsplus_file_lookup(struct inode *dir,
- struct dentry *dentry, unsigned int flags)
-{
- struct hfs_find_data fd;
- struct super_block *sb = dir->i_sb;
- struct inode *inode = NULL;
- struct hfsplus_inode_info *hip;
- int err;
-
- if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
- goto out;
-
- inode = HFSPLUS_I(dir)->rsrc_inode;
- if (inode)
- goto out;
-
- inode = new_inode(sb);
- if (!inode)
- return ERR_PTR(-ENOMEM);
-
- hip = HFSPLUS_I(inode);
- inode->i_ino = dir->i_ino;
- INIT_LIST_HEAD(&hip->open_dir_list);
- mutex_init(&hip->extents_lock);
- hip->extent_state = 0;
- hip->flags = 0;
- hip->userflags = 0;
- set_bit(HFSPLUS_I_RSRC, &hip->flags);
-
- err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
- if (!err) {
- err = hfsplus_find_cat(sb, dir->i_ino, &fd);
- if (!err)
- err = hfsplus_cat_read_inode(inode, &fd);
- hfs_find_exit(&fd);
- }
- if (err) {
- iput(inode);
- return ERR_PTR(err);
- }
- hip->rsrc_inode = dir;
- HFSPLUS_I(dir)->rsrc_inode = inode;
- igrab(dir);
-
- /*
- * __mark_inode_dirty expects inodes to be hashed. Since we don't
- * want resource fork inodes in the regular inode space, we make them
- * appear hashed, but do not put on any lists. hlist_del()
- * will work fine and require no locking.
- */
- hlist_add_fake(&inode->i_hash);
-
- mark_inode_dirty(inode);
-out:
- d_add(dentry, inode);
- return NULL;
-}
-
static void hfsplus_get_perms(struct inode *inode,
struct hfsplus_perm *perms, int dir)
{
@@ -319,7 +262,7 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr)
mark_inode_dirty(inode);
if (attr->ia_valid & ATTR_MODE) {
- error = hfsplus_posix_acl_chmod(inode);
+ error = posix_acl_chmod(inode, inode->i_mode);
if (unlikely(error))
return error;
}
@@ -385,23 +328,23 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
}
static const struct inode_operations hfsplus_file_inode_operations = {
- .lookup = hfsplus_file_lookup,
.setattr = hfsplus_setattr,
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = hfsplus_listxattr,
- .removexattr = hfsplus_removexattr,
+ .removexattr = generic_removexattr,
#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
.get_acl = hfsplus_get_posix_acl,
+ .set_acl = hfsplus_set_posix_acl,
#endif
};
static const struct file_operations hfsplus_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.splice_read = generic_file_splice_read,
.fsync = hfsplus_file_fsync,
@@ -433,6 +376,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode)
hip->extent_state = 0;
hip->flags = 0;
hip->userflags = 0;
+ hip->subfolders = 0;
memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
hip->alloc_blocks = 0;
@@ -552,6 +496,10 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
HFSPLUS_I(inode)->create_date = folder->create_date;
HFSPLUS_I(inode)->fs_blocks = 0;
+ if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) {
+ HFSPLUS_I(inode)->subfolders =
+ be32_to_cpu(folder->subfolders);
+ }
inode->i_op = &hfsplus_dir_inode_operations;
inode->i_fop = &hfsplus_dir_operations;
} else if (type == HFSPLUS_FILE) {
@@ -624,6 +572,10 @@ int hfsplus_cat_write_inode(struct inode *inode)
folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
folder->valence = cpu_to_be32(inode->i_size - 2);
+ if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) {
+ folder->subfolders =
+ cpu_to_be32(HFSPLUS_I(inode)->subfolders);
+ }
hfs_bnode_write(fd.bnode, &entry, fd.entryoffset,
sizeof(struct hfsplus_cat_folder));
} else if (HFSPLUS_IS_RSRC(inode)) {
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 968eab5bc1f..c90b72ee676 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -75,7 +75,7 @@ int hfsplus_parse_options_remount(char *input, int *force)
int token;
if (!input)
- return 0;
+ return 1;
while ((p = strsep(&input, ",")) != NULL) {
if (!*p)
@@ -173,9 +173,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
if (p)
sbi->nls = load_nls(p);
if (!sbi->nls) {
- pr_err("unable to load "
- "nls mapping \"%s\"\n",
- p);
+ pr_err("unable to load nls mapping \"%s\"\n",
+ p);
kfree(p);
return 0;
}
@@ -232,8 +231,8 @@ int hfsplus_show_options(struct seq_file *seq, struct dentry *root)
if (sbi->nls)
seq_printf(seq, ",nls=%s", sbi->nls->charset);
if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags))
- seq_printf(seq, ",nodecompose");
+ seq_puts(seq, ",nodecompose");
if (test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
- seq_printf(seq, ",nobarrier");
+ seq_puts(seq, ",nobarrier");
return 0;
}
diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c
index b609cc14c72..df0c9af68d0 100644
--- a/fs/hfsplus/posix_acl.c
+++ b/fs/hfsplus/posix_acl.c
@@ -17,9 +17,7 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type)
char *value = NULL;
ssize_t size;
- acl = get_cached_acl(inode, type);
- if (acl != ACL_NOT_CACHED)
- return acl;
+ hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino);
switch (type) {
case ACL_TYPE_ACCESS:
@@ -56,17 +54,15 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type)
return acl;
}
-static int hfsplus_set_posix_acl(struct inode *inode,
- int type,
- struct posix_acl *acl)
+int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl,
+ int type)
{
int err;
char *xattr_name;
size_t size = 0;
char *value = NULL;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
+ hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino);
switch (type) {
case ACL_TYPE_ACCESS:
@@ -115,7 +111,7 @@ end_set_acl:
int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir)
{
int err = 0;
- struct posix_acl *acl = NULL;
+ struct posix_acl *default_acl, *acl;
hfs_dbg(ACL_MOD,
"[%s]: ino %lu, dir->ino %lu\n",
@@ -124,151 +120,21 @@ int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir)
if (S_ISLNK(inode->i_mode))
return 0;
- acl = hfsplus_get_posix_acl(dir, ACL_TYPE_DEFAULT);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
-
- if (acl) {
- if (S_ISDIR(inode->i_mode)) {
- err = hfsplus_set_posix_acl(inode,
- ACL_TYPE_DEFAULT,
- acl);
- if (unlikely(err))
- goto init_acl_cleanup;
- }
-
- err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
- if (unlikely(err < 0))
- return err;
-
- if (err > 0)
- err = hfsplus_set_posix_acl(inode,
- ACL_TYPE_ACCESS,
- acl);
- } else
- inode->i_mode &= ~current_umask();
-
-init_acl_cleanup:
- posix_acl_release(acl);
- return err;
-}
-
-int hfsplus_posix_acl_chmod(struct inode *inode)
-{
- int err;
- struct posix_acl *acl;
-
- hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino);
-
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
- acl = hfsplus_get_posix_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
-
- err = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
- if (unlikely(err))
+ err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (err)
return err;
- err = hfsplus_set_posix_acl(inode, ACL_TYPE_ACCESS, acl);
- posix_acl_release(acl);
- return err;
-}
-
-static int hfsplus_xattr_get_posix_acl(struct dentry *dentry,
- const char *name,
- void *buffer,
- size_t size,
- int type)
-{
- int err = 0;
- struct posix_acl *acl;
-
- hfs_dbg(ACL_MOD,
- "[%s]: ino %lu, buffer %p, size %zu, type %#x\n",
- __func__, dentry->d_inode->i_ino, buffer, size, type);
-
- if (strcmp(name, "") != 0)
- return -EINVAL;
-
- acl = hfsplus_get_posix_acl(dentry->d_inode, type);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
-
- err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- posix_acl_release(acl);
-
- return err;
-}
-
-static int hfsplus_xattr_set_posix_acl(struct dentry *dentry,
- const char *name,
- const void *value,
- size_t size,
- int flags,
- int type)
-{
- int err = 0;
- struct inode *inode = dentry->d_inode;
- struct posix_acl *acl = NULL;
-
- hfs_dbg(ACL_MOD,
- "[%s]: ino %lu, value %p, size %zu, flags %#x, type %#x\n",
- __func__, inode->i_ino, value, size, flags, type);
-
- if (strcmp(name, "") != 0)
- return -EINVAL;
-
- if (!inode_owner_or_capable(inode))
- return -EPERM;
-
- if (value) {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- else if (acl) {
- err = posix_acl_valid(acl);
- if (err)
- goto end_xattr_set_acl;
- }
+ if (default_acl) {
+ err = hfsplus_set_posix_acl(inode, default_acl,
+ ACL_TYPE_DEFAULT);
+ posix_acl_release(default_acl);
}
- err = hfsplus_set_posix_acl(inode, type, acl);
-
-end_xattr_set_acl:
- posix_acl_release(acl);
+ if (acl) {
+ if (!err)
+ err = hfsplus_set_posix_acl(inode, acl,
+ ACL_TYPE_ACCESS);
+ posix_acl_release(acl);
+ }
return err;
}
-
-static size_t hfsplus_xattr_list_posix_acl(struct dentry *dentry,
- char *list,
- size_t list_size,
- const char *name,
- size_t name_len,
- int type)
-{
- /*
- * This method is not used.
- * It is used hfsplus_listxattr() instead of generic_listxattr().
- */
- return -EOPNOTSUPP;
-}
-
-const struct xattr_handler hfsplus_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .list = hfsplus_xattr_list_posix_acl,
- .get = hfsplus_xattr_get_posix_acl,
- .set = hfsplus_xattr_set_posix_acl,
-};
-
-const struct xattr_handler hfsplus_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .list = hfsplus_xattr_list_posix_acl,
- .get = hfsplus_xattr_get_posix_acl,
- .set = hfsplus_xattr_set_posix_acl,
-};
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 4c4d142cf89..4cf2024b87d 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -131,9 +131,10 @@ static int hfsplus_system_write_inode(struct inode *inode)
hfsplus_inode_write_fork(inode, fork);
if (tree) {
int err = hfs_btree_write(tree);
+
if (err) {
pr_err("b-tree write err: %d, ino %lu\n",
- err, inode->i_ino);
+ err, inode->i_ino);
return err;
}
}
@@ -161,7 +162,7 @@ static int hfsplus_write_inode(struct inode *inode,
static void hfsplus_evict_inode(struct inode *inode)
{
hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (HFSPLUS_IS_RSRC(inode)) {
HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
@@ -323,6 +324,7 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
{
+ sync_filesystem(sb);
if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
return 0;
if (!(*flags & MS_RDONLY)) {
@@ -474,12 +476,14 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
pr_err("failed to load catalog file\n");
goto out_close_ext_tree;
}
+ atomic_set(&sbi->attr_tree_state, HFSPLUS_EMPTY_ATTR_TREE);
if (vhdr->attr_file.total_blocks != 0) {
sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID);
if (!sbi->attr_tree) {
pr_err("failed to load attributes file\n");
goto out_close_cat_tree;
}
+ atomic_set(&sbi->attr_tree_state, HFSPLUS_VALID_ATTR_TREE);
}
sb->s_xattr = hfsplus_xattr_handlers;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index b51a6079108..cc623567143 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -24,15 +24,8 @@ struct hfsplus_wd {
u16 embed_count;
};
-static void hfsplus_end_io_sync(struct bio *bio, int err)
-{
- if (err)
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
- complete(bio->bi_private);
-}
-
-/*
- * hfsplus_submit_bio - Perfrom block I/O
+/**
+ * hfsplus_submit_bio - Perform block I/O
* @sb: super block of volume for I/O
* @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes
* @buf: buffer for I/O
@@ -53,7 +46,6 @@ static void hfsplus_end_io_sync(struct bio *bio, int err)
int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
void *buf, void **data, int rw)
{
- DECLARE_COMPLETION_ONSTACK(wait);
struct bio *bio;
int ret = 0;
u64 io_size;
@@ -71,10 +63,8 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1);
bio = bio_alloc(GFP_NOIO, 1);
- bio->bi_sector = sector;
+ bio->bi_iter.bi_sector = sector;
bio->bi_bdev = sb->s_bdev;
- bio->bi_end_io = hfsplus_end_io_sync;
- bio->bi_private = &wait;
if (!(rw & WRITE) && data)
*data = (u8 *)buf + offset;
@@ -93,12 +83,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
buf = (u8 *)buf + len;
}
- submit_bio(rw, bio);
- wait_for_completion(&wait);
-
- if (!bio_flagged(bio, BIO_UPTODATE))
- ret = -EIO;
-
+ ret = submit_bio_wait(rw, bio);
out:
bio_put(bio);
return ret < 0 ? ret : 0;
@@ -246,10 +231,8 @@ reread:
if (blocksize < HFSPLUS_SECTOR_SIZE || ((blocksize - 1) & blocksize))
goto out_free_backup_vhdr;
sbi->alloc_blksz = blocksize;
- sbi->alloc_blksz_shift = 0;
- while ((blocksize >>= 1) != 0)
- sbi->alloc_blksz_shift++;
- blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE);
+ sbi->alloc_blksz_shift = ilog2(blocksize);
+ blocksize = min_t(u32, sbi->alloc_blksz, PAGE_SIZE);
/*
* Align block size to block offset.
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index bd8471fb9a6..d98094a9f47 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -7,16 +7,20 @@
*/
#include "hfsplus_fs.h"
+#include <linux/posix_acl_xattr.h>
+#include <linux/nls.h>
#include "xattr.h"
#include "acl.h"
+static int hfsplus_removexattr(struct inode *inode, const char *name);
+
const struct xattr_handler *hfsplus_xattr_handlers[] = {
&hfsplus_xattr_osx_handler,
&hfsplus_xattr_user_handler,
&hfsplus_xattr_trusted_handler,
#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
- &hfsplus_xattr_acl_access_handler,
- &hfsplus_xattr_acl_default_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
&hfsplus_xattr_security_handler,
NULL
@@ -51,80 +55,209 @@ static inline int is_known_namespace(const char *name)
return true;
}
-static int can_set_system_xattr(struct inode *inode, const char *name,
- const void *value, size_t size)
+static void hfsplus_init_header_node(struct inode *attr_file,
+ u32 clump_size,
+ char *buf, u16 node_size)
{
-#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL
- struct posix_acl *acl;
- int err;
+ struct hfs_bnode_desc *desc;
+ struct hfs_btree_header_rec *head;
+ u16 offset;
+ __be16 *rec_offsets;
+ u32 hdr_node_map_rec_bits;
+ char *bmp;
+ u32 used_nodes;
+ u32 used_bmp_bytes;
+ u64 tmp;
+
+ hfs_dbg(ATTR_MOD, "init_hdr_attr_file: clump %u, node_size %u\n",
+ clump_size, node_size);
+
+ /* The end of the node contains list of record offsets */
+ rec_offsets = (__be16 *)(buf + node_size);
+
+ desc = (struct hfs_bnode_desc *)buf;
+ desc->type = HFS_NODE_HEADER;
+ desc->num_recs = cpu_to_be16(HFSPLUS_BTREE_HDR_NODE_RECS_COUNT);
+ offset = sizeof(struct hfs_bnode_desc);
+ *--rec_offsets = cpu_to_be16(offset);
+
+ head = (struct hfs_btree_header_rec *)(buf + offset);
+ head->node_size = cpu_to_be16(node_size);
+ tmp = i_size_read(attr_file);
+ do_div(tmp, node_size);
+ head->node_count = cpu_to_be32(tmp);
+ head->free_nodes = cpu_to_be32(be32_to_cpu(head->node_count) - 1);
+ head->clump_size = cpu_to_be32(clump_size);
+ head->attributes |= cpu_to_be32(HFS_TREE_BIGKEYS | HFS_TREE_VARIDXKEYS);
+ head->max_key_len = cpu_to_be16(HFSPLUS_ATTR_KEYLEN - sizeof(u16));
+ offset += sizeof(struct hfs_btree_header_rec);
+ *--rec_offsets = cpu_to_be16(offset);
+ offset += HFSPLUS_BTREE_HDR_USER_BYTES;
+ *--rec_offsets = cpu_to_be16(offset);
+
+ hdr_node_map_rec_bits = 8 * (node_size - offset - (4 * sizeof(u16)));
+ if (be32_to_cpu(head->node_count) > hdr_node_map_rec_bits) {
+ u32 map_node_bits;
+ u32 map_nodes;
+
+ desc->next = cpu_to_be32(be32_to_cpu(head->leaf_tail) + 1);
+ map_node_bits = 8 * (node_size - sizeof(struct hfs_bnode_desc) -
+ (2 * sizeof(u16)) - 2);
+ map_nodes = (be32_to_cpu(head->node_count) -
+ hdr_node_map_rec_bits +
+ (map_node_bits - 1)) / map_node_bits;
+ be32_add_cpu(&head->free_nodes, 0 - map_nodes);
+ }
- if (!inode_owner_or_capable(inode))
- return -EPERM;
+ bmp = buf + offset;
+ used_nodes =
+ be32_to_cpu(head->node_count) - be32_to_cpu(head->free_nodes);
+ used_bmp_bytes = used_nodes / 8;
+ if (used_bmp_bytes) {
+ memset(bmp, 0xFF, used_bmp_bytes);
+ bmp += used_bmp_bytes;
+ used_nodes %= 8;
+ }
+ *bmp = ~(0xFF >> used_nodes);
+ offset += hdr_node_map_rec_bits / 8;
+ *--rec_offsets = cpu_to_be16(offset);
+}
- /*
- * POSIX_ACL_XATTR_ACCESS is tied to i_mode
- */
- if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl) {
- err = posix_acl_equiv_mode(acl, &inode->i_mode);
- posix_acl_release(acl);
- if (err < 0)
- return err;
- mark_inode_dirty(inode);
- }
+static int hfsplus_create_attributes_file(struct super_block *sb)
+{
+ int err = 0;
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+ struct inode *attr_file;
+ struct hfsplus_inode_info *hip;
+ u32 clump_size;
+ u16 node_size = HFSPLUS_ATTR_TREE_NODE_SIZE;
+ char *buf;
+ int index, written;
+ struct address_space *mapping;
+ struct page *page;
+ int old_state = HFSPLUS_EMPTY_ATTR_TREE;
+
+ hfs_dbg(ATTR_MOD, "create_attr_file: ino %d\n", HFSPLUS_ATTR_CNID);
+
+check_attr_tree_state_again:
+ switch (atomic_read(&sbi->attr_tree_state)) {
+ case HFSPLUS_EMPTY_ATTR_TREE:
+ if (old_state != atomic_cmpxchg(&sbi->attr_tree_state,
+ old_state,
+ HFSPLUS_CREATING_ATTR_TREE))
+ goto check_attr_tree_state_again;
+ break;
+ case HFSPLUS_CREATING_ATTR_TREE:
/*
- * We're changing the ACL. Get rid of the cached one
+ * This state means that another thread is in process
+ * of AttributesFile creation. Theoretically, it is
+ * possible to be here. But really __setxattr() method
+ * first of all calls hfs_find_init() for lookup in
+ * B-tree of CatalogFile. This method locks mutex of
+ * CatalogFile's B-tree. As a result, if some thread
+ * is inside AttributedFile creation operation then
+ * another threads will be waiting unlocking of
+ * CatalogFile's B-tree's mutex. However, if code will
+ * change then we will return error code (-EAGAIN) from
+ * here. Really, it means that first try to set of xattr
+ * fails with error but second attempt will have success.
*/
- forget_cached_acl(inode, ACL_TYPE_ACCESS);
-
+ return -EAGAIN;
+ case HFSPLUS_VALID_ATTR_TREE:
return 0;
- } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- posix_acl_release(acl);
+ case HFSPLUS_FAILED_ATTR_TREE:
+ return -EOPNOTSUPP;
+ default:
+ BUG();
+ }
- /*
- * We're changing the default ACL. Get rid of the cached one
- */
- forget_cached_acl(inode, ACL_TYPE_DEFAULT);
+ attr_file = hfsplus_iget(sb, HFSPLUS_ATTR_CNID);
+ if (IS_ERR(attr_file)) {
+ pr_err("failed to load attributes file\n");
+ return PTR_ERR(attr_file);
+ }
- return 0;
+ BUG_ON(i_size_read(attr_file) != 0);
+
+ hip = HFSPLUS_I(attr_file);
+
+ clump_size = hfsplus_calc_btree_clump_size(sb->s_blocksize,
+ node_size,
+ sbi->sect_count,
+ HFSPLUS_ATTR_CNID);
+
+ mutex_lock(&hip->extents_lock);
+ hip->clump_blocks = clump_size >> sbi->alloc_blksz_shift;
+ mutex_unlock(&hip->extents_lock);
+
+ if (sbi->free_blocks <= (hip->clump_blocks << 1)) {
+ err = -ENOSPC;
+ goto end_attr_file_creation;
}
-#endif /* CONFIG_HFSPLUS_FS_POSIX_ACL */
- return -EOPNOTSUPP;
-}
-static int can_set_xattr(struct inode *inode, const char *name,
- const void *value, size_t value_len)
-{
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return can_set_system_xattr(inode, name, value, value_len);
+ while (hip->alloc_blocks < hip->clump_blocks) {
+ err = hfsplus_file_extend(attr_file, false);
+ if (unlikely(err)) {
+ pr_err("failed to extend attributes file\n");
+ goto end_attr_file_creation;
+ }
+ hip->phys_size = attr_file->i_size =
+ (loff_t)hip->alloc_blocks << sbi->alloc_blksz_shift;
+ hip->fs_blocks = hip->alloc_blocks << sbi->fs_shift;
+ inode_set_bytes(attr_file, attr_file->i_size);
+ }
- if (!strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN)) {
- /*
- * This makes sure that we aren't trying to set an
- * attribute in a different namespace by prefixing it
- * with "osx."
- */
- if (is_known_namespace(name + XATTR_MAC_OSX_PREFIX_LEN))
- return -EOPNOTSUPP;
+ buf = kzalloc(node_size, GFP_NOFS);
+ if (!buf) {
+ pr_err("failed to allocate memory for header node\n");
+ err = -ENOMEM;
+ goto end_attr_file_creation;
+ }
- return 0;
+ hfsplus_init_header_node(attr_file, clump_size, buf, node_size);
+
+ mapping = attr_file->i_mapping;
+
+ index = 0;
+ written = 0;
+ for (; written < node_size; index++, written += PAGE_CACHE_SIZE) {
+ void *kaddr;
+
+ page = read_mapping_page(mapping, index, NULL);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto failed_header_node_init;
+ }
+
+ kaddr = kmap_atomic(page);
+ memcpy(kaddr, buf + written,
+ min_t(size_t, PAGE_CACHE_SIZE, node_size - written));
+ kunmap_atomic(kaddr);
+
+ set_page_dirty(page);
+ page_cache_release(page);
}
- /*
- * Don't allow setting an attribute in an unknown namespace.
- */
- if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) &&
- strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
- strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
- return -EOPNOTSUPP;
+ hfsplus_mark_inode_dirty(attr_file, HFSPLUS_I_ATTR_DIRTY);
+
+ sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID);
+ if (!sbi->attr_tree)
+ pr_err("failed to load attributes file\n");
+
+failed_header_node_init:
+ kfree(buf);
+
+end_attr_file_creation:
+ iput(attr_file);
- return 0;
+ if (!err)
+ atomic_set(&sbi->attr_tree_state, HFSPLUS_VALID_ATTR_TREE);
+ else if (err == -ENOSPC)
+ atomic_set(&sbi->attr_tree_state, HFSPLUS_EMPTY_ATTR_TREE);
+ else
+ atomic_set(&sbi->attr_tree_state, HFSPLUS_FAILED_ATTR_TREE);
+
+ return err;
}
int __hfsplus_setxattr(struct inode *inode, const char *name,
@@ -144,18 +277,8 @@ int __hfsplus_setxattr(struct inode *inode, const char *name,
HFSPLUS_IS_RSRC(inode))
return -EOPNOTSUPP;
- err = can_set_xattr(inode, name, value, size);
- if (err)
- return err;
-
- if (strncmp(name, XATTR_MAC_OSX_PREFIX,
- XATTR_MAC_OSX_PREFIX_LEN) == 0)
- name += XATTR_MAC_OSX_PREFIX_LEN;
-
- if (value == NULL) {
- value = "";
- size = 0;
- }
+ if (value == NULL)
+ return hfsplus_removexattr(inode, name);
err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd);
if (err) {
@@ -211,8 +334,9 @@ int __hfsplus_setxattr(struct inode *inode, const char *name,
}
if (!HFSPLUS_SB(inode->i_sb)->attr_tree) {
- err = -EOPNOTSUPP;
- goto end_setxattr;
+ err = hfsplus_create_attributes_file(inode->i_sb);
+ if (unlikely(err))
+ goto end_setxattr;
}
if (hfsplus_attr_exists(inode, name)) {
@@ -272,16 +396,11 @@ end_setxattr:
return err;
}
-static inline int is_osx_xattr(const char *xattr_name)
-{
- return !is_known_namespace(xattr_name);
-}
-
static int name_len(const char *xattr_name, int xattr_name_len)
{
int len = xattr_name_len + 1;
- if (is_osx_xattr(xattr_name))
+ if (!is_known_namespace(xattr_name))
len += XATTR_MAC_OSX_PREFIX_LEN;
return len;
@@ -292,7 +411,7 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len)
int len = name_len;
int offset = 0;
- if (is_osx_xattr(xattr_name)) {
+ if (!is_known_namespace(xattr_name)) {
strncpy(buffer, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN);
offset += XATTR_MAC_OSX_PREFIX_LEN;
len += XATTR_MAC_OSX_PREFIX_LEN;
@@ -370,18 +489,6 @@ ssize_t __hfsplus_getxattr(struct inode *inode, const char *name,
HFSPLUS_IS_RSRC(inode))
return -EOPNOTSUPP;
- if (strncmp(name, XATTR_MAC_OSX_PREFIX,
- XATTR_MAC_OSX_PREFIX_LEN) == 0) {
- /* skip "osx." prefix */
- name += XATTR_MAC_OSX_PREFIX_LEN;
- /*
- * Don't allow retrieving properly prefixed attributes
- * by prepending them with "osx."
- */
- if (is_known_namespace(name))
- return -EOPNOTSUPP;
- }
-
if (!strcmp_xattr_finder_info(name))
return hfsplus_getxattr_finder_info(inode, value, size);
@@ -539,8 +646,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size)
struct hfs_find_data fd;
u16 key_len = 0;
struct hfsplus_attr_key attr_key;
- char strbuf[HFSPLUS_ATTR_MAX_STRLEN +
- XATTR_MAC_OSX_PREFIX_LEN + 1] = {0};
+ char *strbuf;
int xattr_name_len;
if ((!S_ISREG(inode->i_mode) &&
@@ -560,6 +666,13 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size)
return err;
}
+ strbuf = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN +
+ XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL);
+ if (!strbuf) {
+ res = -ENOMEM;
+ goto out;
+ }
+
err = hfsplus_find_attr(inode->i_sb, inode->i_ino, NULL, &fd);
if (err) {
if (err == -ENOENT) {
@@ -586,7 +699,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size)
if (be32_to_cpu(attr_key.cnid) != inode->i_ino)
goto end_listxattr;
- xattr_name_len = HFSPLUS_ATTR_MAX_STRLEN;
+ xattr_name_len = NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN;
if (hfsplus_uni2asc(inode->i_sb,
(const struct hfsplus_unistr *)&fd.key->attr.key_name,
strbuf, &xattr_name_len)) {
@@ -612,36 +725,24 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size)
}
end_listxattr:
+ kfree(strbuf);
+out:
hfs_find_exit(&fd);
return res;
}
-int hfsplus_removexattr(struct dentry *dentry, const char *name)
+static int hfsplus_removexattr(struct inode *inode, const char *name)
{
int err = 0;
- struct inode *inode = dentry->d_inode;
struct hfs_find_data cat_fd;
u16 flags;
u16 cat_entry_type;
int is_xattr_acl_deleted = 0;
int is_all_xattrs_deleted = 0;
- if ((!S_ISREG(inode->i_mode) &&
- !S_ISDIR(inode->i_mode)) ||
- HFSPLUS_IS_RSRC(inode))
- return -EOPNOTSUPP;
-
if (!HFSPLUS_SB(inode->i_sb)->attr_tree)
return -EOPNOTSUPP;
- err = can_set_xattr(inode, name, NULL, 0);
- if (err)
- return err;
-
- if (strncmp(name, XATTR_MAC_OSX_PREFIX,
- XATTR_MAC_OSX_PREFIX_LEN) == 0)
- name += XATTR_MAC_OSX_PREFIX_LEN;
-
if (!strcmp_xattr_finder_info(name))
return -EOPNOTSUPP;
@@ -705,39 +806,55 @@ end_removexattr:
static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size, int type)
{
- char xattr_name[HFSPLUS_ATTR_MAX_STRLEN +
- XATTR_MAC_OSX_PREFIX_LEN + 1] = {0};
- size_t len = strlen(name);
+ char *xattr_name;
+ int res;
if (!strcmp(name, ""))
return -EINVAL;
- if (len > HFSPLUS_ATTR_MAX_STRLEN)
+ /*
+ * Don't allow retrieving properly prefixed attributes
+ * by prepending them with "osx."
+ */
+ if (is_known_namespace(name))
return -EOPNOTSUPP;
-
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN
+ + XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
strcpy(xattr_name, XATTR_MAC_OSX_PREFIX);
strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name);
- return hfsplus_getxattr(dentry, xattr_name, buffer, size);
+ res = hfsplus_getxattr(dentry, xattr_name, buffer, size);
+ kfree(xattr_name);
+ return res;
}
static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name,
const void *buffer, size_t size, int flags, int type)
{
- char xattr_name[HFSPLUS_ATTR_MAX_STRLEN +
- XATTR_MAC_OSX_PREFIX_LEN + 1] = {0};
- size_t len = strlen(name);
+ char *xattr_name;
+ int res;
if (!strcmp(name, ""))
return -EINVAL;
- if (len > HFSPLUS_ATTR_MAX_STRLEN)
+ /*
+ * Don't allow setting properly prefixed attributes
+ * by prepending them with "osx."
+ */
+ if (is_known_namespace(name))
return -EOPNOTSUPP;
-
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN
+ + XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
strcpy(xattr_name, XATTR_MAC_OSX_PREFIX);
strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name);
- return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
+ res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
+ kfree(xattr_name);
+ return res;
}
static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list,
diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h
index 841b5698c0f..288530cf80b 100644
--- a/fs/hfsplus/xattr.h
+++ b/fs/hfsplus/xattr.h
@@ -14,8 +14,6 @@
extern const struct xattr_handler hfsplus_xattr_osx_handler;
extern const struct xattr_handler hfsplus_xattr_user_handler;
extern const struct xattr_handler hfsplus_xattr_trusted_handler;
-extern const struct xattr_handler hfsplus_xattr_acl_access_handler;
-extern const struct xattr_handler hfsplus_xattr_acl_default_handler;
extern const struct xattr_handler hfsplus_xattr_security_handler;
extern const struct xattr_handler *hfsplus_xattr_handlers[];
@@ -42,8 +40,6 @@ static inline ssize_t hfsplus_getxattr(struct dentry *dentry,
ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
-int hfsplus_removexattr(struct dentry *dentry, const char *name);
-
int hfsplus_init_security(struct inode *inode, struct inode *dir,
const struct qstr *qstr);
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index 00722765ea7..6ec5e107691 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -7,6 +7,8 @@
*/
#include <linux/security.h>
+#include <linux/nls.h>
+
#include "hfsplus_fs.h"
#include "xattr.h"
#include "acl.h"
@@ -14,37 +16,43 @@
static int hfsplus_security_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size, int type)
{
- char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
- size_t len = strlen(name);
+ char *xattr_name;
+ int res;
if (!strcmp(name, ""))
return -EINVAL;
- if (len + XATTR_SECURITY_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN)
- return -EOPNOTSUPP;
-
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
+ GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
strcpy(xattr_name, XATTR_SECURITY_PREFIX);
strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name);
- return hfsplus_getxattr(dentry, xattr_name, buffer, size);
+ res = hfsplus_getxattr(dentry, xattr_name, buffer, size);
+ kfree(xattr_name);
+ return res;
}
static int hfsplus_security_setxattr(struct dentry *dentry, const char *name,
const void *buffer, size_t size, int flags, int type)
{
- char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
- size_t len = strlen(name);
+ char *xattr_name;
+ int res;
if (!strcmp(name, ""))
return -EINVAL;
- if (len + XATTR_SECURITY_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN)
- return -EOPNOTSUPP;
-
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
+ GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
strcpy(xattr_name, XATTR_SECURITY_PREFIX);
strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name);
- return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
+ res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
+ kfree(xattr_name);
+ return res;
}
static size_t hfsplus_security_listxattr(struct dentry *dentry, char *list,
@@ -62,31 +70,30 @@ static int hfsplus_initxattrs(struct inode *inode,
void *fs_info)
{
const struct xattr *xattr;
- char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
- size_t xattr_name_len;
+ char *xattr_name;
int err = 0;
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
+ GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
- xattr_name_len = strlen(xattr->name);
- if (xattr_name_len == 0)
+ if (!strcmp(xattr->name, ""))
continue;
- if (xattr_name_len + XATTR_SECURITY_PREFIX_LEN >
- HFSPLUS_ATTR_MAX_STRLEN)
- return -EOPNOTSUPP;
-
strcpy(xattr_name, XATTR_SECURITY_PREFIX);
strcpy(xattr_name +
XATTR_SECURITY_PREFIX_LEN, xattr->name);
memset(xattr_name +
- XATTR_SECURITY_PREFIX_LEN + xattr_name_len, 0, 1);
+ XATTR_SECURITY_PREFIX_LEN + strlen(xattr->name), 0, 1);
err = __hfsplus_setxattr(inode, xattr_name,
xattr->value, xattr->value_len, 0);
if (err)
break;
}
+ kfree(xattr_name);
return err;
}
diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c
index 426cee27754..3c5f27e4746 100644
--- a/fs/hfsplus/xattr_trusted.c
+++ b/fs/hfsplus/xattr_trusted.c
@@ -6,43 +6,51 @@
* Handler for trusted extended attributes.
*/
+#include <linux/nls.h>
+
#include "hfsplus_fs.h"
#include "xattr.h"
static int hfsplus_trusted_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size, int type)
{
- char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
- size_t len = strlen(name);
+ char *xattr_name;
+ int res;
if (!strcmp(name, ""))
return -EINVAL;
- if (len + XATTR_TRUSTED_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN)
- return -EOPNOTSUPP;
-
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
+ GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
strcpy(xattr_name, XATTR_TRUSTED_PREFIX);
strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name);
- return hfsplus_getxattr(dentry, xattr_name, buffer, size);
+ res = hfsplus_getxattr(dentry, xattr_name, buffer, size);
+ kfree(xattr_name);
+ return res;
}
static int hfsplus_trusted_setxattr(struct dentry *dentry, const char *name,
const void *buffer, size_t size, int flags, int type)
{
- char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
- size_t len = strlen(name);
+ char *xattr_name;
+ int res;
if (!strcmp(name, ""))
return -EINVAL;
- if (len + XATTR_TRUSTED_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN)
- return -EOPNOTSUPP;
-
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
+ GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
strcpy(xattr_name, XATTR_TRUSTED_PREFIX);
strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name);
- return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
+ res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
+ kfree(xattr_name);
+ return res;
}
static size_t hfsplus_trusted_listxattr(struct dentry *dentry, char *list,
diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c
index e34016561ae..2b625a538b6 100644
--- a/fs/hfsplus/xattr_user.c
+++ b/fs/hfsplus/xattr_user.c
@@ -6,43 +6,51 @@
* Handler for user extended attributes.
*/
+#include <linux/nls.h>
+
#include "hfsplus_fs.h"
#include "xattr.h"
static int hfsplus_user_getxattr(struct dentry *dentry, const char *name,
void *buffer, size_t size, int type)
{
- char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
- size_t len = strlen(name);
+ char *xattr_name;
+ int res;
if (!strcmp(name, ""))
return -EINVAL;
- if (len + XATTR_USER_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN)
- return -EOPNOTSUPP;
-
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
+ GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
strcpy(xattr_name, XATTR_USER_PREFIX);
strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name);
- return hfsplus_getxattr(dentry, xattr_name, buffer, size);
+ res = hfsplus_getxattr(dentry, xattr_name, buffer, size);
+ kfree(xattr_name);
+ return res;
}
static int hfsplus_user_setxattr(struct dentry *dentry, const char *name,
const void *buffer, size_t size, int flags, int type)
{
- char xattr_name[HFSPLUS_ATTR_MAX_STRLEN + 1] = {0};
- size_t len = strlen(name);
+ char *xattr_name;
+ int res;
if (!strcmp(name, ""))
return -EINVAL;
- if (len + XATTR_USER_PREFIX_LEN > HFSPLUS_ATTR_MAX_STRLEN)
- return -EOPNOTSUPP;
-
+ xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
+ GFP_KERNEL);
+ if (!xattr_name)
+ return -ENOMEM;
strcpy(xattr_name, XATTR_USER_PREFIX);
strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name);
- return hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
+ res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags);
+ kfree(xattr_name);
+ return res;
}
static size_t hfsplus_user_listxattr(struct dentry *dentry, char *list,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 25437280a20..bb529f3b7f2 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -33,15 +33,6 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
#define FILE_HOSTFS_I(file) HOSTFS_I(file_inode(file))
-static int hostfs_d_delete(const struct dentry *dentry)
-{
- return 1;
-}
-
-static const struct dentry_operations hostfs_dentry_ops = {
- .d_delete = hostfs_d_delete,
-};
-
/* Changed in hostfs_args before the kernel starts running */
static char *root_ino = "";
static int append = 0;
@@ -195,7 +186,7 @@ static struct inode *hostfs_iget(struct super_block *sb)
return inode;
}
-int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
+static int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
{
/*
* do_statfs uses struct statfs64 internally, but the linux kernel
@@ -239,7 +230,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
static void hostfs_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (HOSTFS_I(inode)->fd != -1) {
close_file(&HOSTFS_I(inode)->fd);
@@ -277,7 +268,7 @@ static const struct super_operations hostfs_sbops = {
.show_options = hostfs_show_options,
};
-int hostfs_readdir(struct file *file, struct dir_context *ctx)
+static int hostfs_readdir(struct file *file, struct dir_context *ctx)
{
void *dir;
char *name;
@@ -302,7 +293,7 @@ int hostfs_readdir(struct file *file, struct dir_context *ctx)
return 0;
}
-int hostfs_file_open(struct inode *ino, struct file *file)
+static int hostfs_file_open(struct inode *ino, struct file *file)
{
static DEFINE_MUTEX(open_mutex);
char *name;
@@ -368,7 +359,8 @@ static int hostfs_file_release(struct inode *inode, struct file *file)
return 0;
}
-int hostfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+static int hostfs_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync)
{
struct inode *inode = file->f_mapping->host;
int ret;
@@ -386,11 +378,11 @@ int hostfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
static const struct file_operations hostfs_file_fops = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
+ .read = new_sync_read,
.splice_read = generic_file_splice_read,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
- .write = do_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
+ .write = new_sync_write,
.mmap = generic_file_mmap,
.open = hostfs_file_open,
.release = hostfs_file_release,
@@ -403,7 +395,7 @@ static const struct file_operations hostfs_dir_fops = {
.read = generic_read_dir,
};
-int hostfs_writepage(struct page *page, struct writeback_control *wbc)
+static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
{
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
@@ -439,7 +431,7 @@ int hostfs_writepage(struct page *page, struct writeback_control *wbc)
return err;
}
-int hostfs_readpage(struct file *file, struct page *page)
+static int hostfs_readpage(struct file *file, struct page *page)
{
char *buffer;
long long start;
@@ -464,9 +456,9 @@ int hostfs_readpage(struct file *file, struct page *page)
return err;
}
-int hostfs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
+static int hostfs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
{
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
@@ -476,9 +468,9 @@ int hostfs_write_begin(struct file *file, struct address_space *mapping,
return 0;
}
-int hostfs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
+static int hostfs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
void *buffer;
@@ -558,8 +550,8 @@ static int read_name(struct inode *ino, char *name)
return 0;
}
-int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool excl)
+static int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+ bool excl)
{
struct inode *inode;
char *name;
@@ -600,8 +592,8 @@ int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
return error;
}
-struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
- unsigned int flags)
+static struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
+ unsigned int flags)
{
struct inode *inode;
char *name;
@@ -637,7 +629,8 @@ struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
return ERR_PTR(err);
}
-int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from)
+static int hostfs_link(struct dentry *to, struct inode *ino,
+ struct dentry *from)
{
char *from_name, *to_name;
int err;
@@ -655,7 +648,7 @@ int hostfs_link(struct dentry *to, struct inode *ino, struct dentry *from)
return err;
}
-int hostfs_unlink(struct inode *ino, struct dentry *dentry)
+static int hostfs_unlink(struct inode *ino, struct dentry *dentry)
{
char *file;
int err;
@@ -671,7 +664,8 @@ int hostfs_unlink(struct inode *ino, struct dentry *dentry)
return err;
}
-int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to)
+static int hostfs_symlink(struct inode *ino, struct dentry *dentry,
+ const char *to)
{
char *file;
int err;
@@ -683,7 +677,7 @@ int hostfs_symlink(struct inode *ino, struct dentry *dentry, const char *to)
return err;
}
-int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode)
+static int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode)
{
char *file;
int err;
@@ -695,7 +689,7 @@ int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode)
return err;
}
-int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
+static int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
{
char *file;
int err;
@@ -747,8 +741,8 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
return err;
}
-int hostfs_rename(struct inode *from_ino, struct dentry *from,
- struct inode *to_ino, struct dentry *to)
+static int hostfs_rename(struct inode *from_ino, struct dentry *from,
+ struct inode *to_ino, struct dentry *to)
{
char *from_name, *to_name;
int err;
@@ -765,7 +759,7 @@ int hostfs_rename(struct inode *from_ino, struct dentry *from,
return err;
}
-int hostfs_permission(struct inode *ino, int desired)
+static int hostfs_permission(struct inode *ino, int desired)
{
char *name;
int r = 0, w = 0, x = 0, err;
@@ -791,7 +785,7 @@ int hostfs_permission(struct inode *ino, int desired)
return err;
}
-int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
+static int hostfs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = dentry->d_inode;
struct hostfs_iattr attrs;
@@ -925,7 +919,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
sb->s_blocksize_bits = 10;
sb->s_magic = HOSTFS_SUPER_MAGIC;
sb->s_op = &hostfs_sbops;
- sb->s_d_op = &hostfs_dentry_ops;
+ sb->s_d_op = &simple_dentry_operations;
sb->s_maxbytes = MAX_LFS_FILESIZE;
/* NULL is printed as <NULL> by sprintf: avoid that. */
diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c
index cdb84a83806..f005046e159 100644
--- a/fs/hpfs/alloc.c
+++ b/fs/hpfs/alloc.c
@@ -8,6 +8,58 @@
#include "hpfs_fn.h"
+static void hpfs_claim_alloc(struct super_block *s, secno sec)
+{
+ struct hpfs_sb_info *sbi = hpfs_sb(s);
+ if (sbi->sb_n_free != (unsigned)-1) {
+ if (unlikely(!sbi->sb_n_free)) {
+ hpfs_error(s, "free count underflow, allocating sector %08x", sec);
+ sbi->sb_n_free = -1;
+ return;
+ }
+ sbi->sb_n_free--;
+ }
+}
+
+static void hpfs_claim_free(struct super_block *s, secno sec)
+{
+ struct hpfs_sb_info *sbi = hpfs_sb(s);
+ if (sbi->sb_n_free != (unsigned)-1) {
+ if (unlikely(sbi->sb_n_free >= sbi->sb_fs_size)) {
+ hpfs_error(s, "free count overflow, freeing sector %08x", sec);
+ sbi->sb_n_free = -1;
+ return;
+ }
+ sbi->sb_n_free++;
+ }
+}
+
+static void hpfs_claim_dirband_alloc(struct super_block *s, secno sec)
+{
+ struct hpfs_sb_info *sbi = hpfs_sb(s);
+ if (sbi->sb_n_free_dnodes != (unsigned)-1) {
+ if (unlikely(!sbi->sb_n_free_dnodes)) {
+ hpfs_error(s, "dirband free count underflow, allocating sector %08x", sec);
+ sbi->sb_n_free_dnodes = -1;
+ return;
+ }
+ sbi->sb_n_free_dnodes--;
+ }
+}
+
+static void hpfs_claim_dirband_free(struct super_block *s, secno sec)
+{
+ struct hpfs_sb_info *sbi = hpfs_sb(s);
+ if (sbi->sb_n_free_dnodes != (unsigned)-1) {
+ if (unlikely(sbi->sb_n_free_dnodes >= sbi->sb_dirband_size / 4)) {
+ hpfs_error(s, "dirband free count overflow, freeing sector %08x", sec);
+ sbi->sb_n_free_dnodes = -1;
+ return;
+ }
+ sbi->sb_n_free_dnodes++;
+ }
+}
+
/*
* Check if a sector is allocated in bitmap
* This is really slow. Turned on only if chk==2
@@ -203,9 +255,15 @@ secno hpfs_alloc_sector(struct super_block *s, secno near, unsigned n, int forwa
}
sec = 0;
ret:
+ if (sec) {
+ i = 0;
+ do
+ hpfs_claim_alloc(s, sec + i);
+ while (unlikely(++i < n));
+ }
if (sec && f_p) {
for (i = 0; i < forward; i++) {
- if (!hpfs_alloc_if_possible(s, sec + i + 1)) {
+ if (!hpfs_alloc_if_possible(s, sec + n + i)) {
hpfs_error(s, "Prealloc doesn't work! Wanted %d, allocated at %08x, can't allocate %d", forward, sec, i);
sec = 0;
break;
@@ -228,6 +286,7 @@ static secno alloc_in_dirband(struct super_block *s, secno near)
nr >>= 2;
sec = alloc_in_bmp(s, (~0x3fff) | nr, 1, 0);
if (!sec) return 0;
+ hpfs_claim_dirband_alloc(s, sec);
return ((sec & 0x3fff) << 2) + sbi->sb_dirband_start;
}
@@ -242,6 +301,7 @@ int hpfs_alloc_if_possible(struct super_block *s, secno sec)
bmp[(sec & 0x3fff) >> 5] &= cpu_to_le32(~(1 << (sec & 0x1f)));
hpfs_mark_4buffers_dirty(&qbh);
hpfs_brelse4(&qbh);
+ hpfs_claim_alloc(s, sec);
return 1;
}
hpfs_brelse4(&qbh);
@@ -256,7 +316,7 @@ void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n)
struct quad_buffer_head qbh;
__le32 *bmp;
struct hpfs_sb_info *sbi = hpfs_sb(s);
- /*printk("2 - ");*/
+ /*pr_info("2 - ");*/
if (!n) return;
if (sec < 0x12) {
hpfs_error(s, "Trying to free reserved sector %08x", sec);
@@ -275,6 +335,7 @@ void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n)
return;
}
bmp[(sec & 0x3fff) >> 5] |= cpu_to_le32(1 << (sec & 0x1f));
+ hpfs_claim_free(s, sec);
if (!--n) {
hpfs_mark_4buffers_dirty(&qbh);
hpfs_brelse4(&qbh);
@@ -359,6 +420,7 @@ void hpfs_free_dnode(struct super_block *s, dnode_secno dno)
bmp[ssec >> 5] |= cpu_to_le32(1 << (ssec & 0x1f));
hpfs_mark_4buffers_dirty(&qbh);
hpfs_brelse4(&qbh);
+ hpfs_claim_dirband_free(s, dno);
}
}
@@ -366,7 +428,7 @@ struct dnode *hpfs_alloc_dnode(struct super_block *s, secno near,
dnode_secno *dno, struct quad_buffer_head *qbh)
{
struct dnode *d;
- if (hpfs_count_one_bitmap(s, hpfs_sb(s)->sb_dmap) > FREE_DNODES_ADD) {
+ if (hpfs_get_free_dnodes(s) > FREE_DNODES_ADD) {
if (!(*dno = alloc_in_dirband(s, near)))
if (!(*dno = hpfs_alloc_sector(s, near, 4, 0))) return NULL;
} else {
diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c
index 4d0a1afa058..8057fe4e657 100644
--- a/fs/hpfs/buffer.c
+++ b/fs/hpfs/buffer.c
@@ -55,7 +55,7 @@ void *hpfs_map_sector(struct super_block *s, unsigned secno, struct buffer_head
if (bh != NULL)
return bh->b_data;
else {
- printk("HPFS: hpfs_map_sector: read error\n");
+ pr_err("%s(): read error\n", __func__);
return NULL;
}
}
@@ -76,7 +76,7 @@ void *hpfs_get_sector(struct super_block *s, unsigned secno, struct buffer_head
set_buffer_uptodate(bh);
return bh->b_data;
} else {
- printk("HPFS: hpfs_get_sector: getblk failed\n");
+ pr_err("%s(): getblk failed\n", __func__);
return NULL;
}
}
@@ -86,7 +86,6 @@ void *hpfs_get_sector(struct super_block *s, unsigned secno, struct buffer_head
void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffer_head *qbh,
int ahead)
{
- struct buffer_head *bh;
char *data;
hpfs_lock_assert(s);
@@ -94,40 +93,38 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe
cond_resched();
if (secno & 3) {
- printk("HPFS: hpfs_map_4sectors: unaligned read\n");
+ pr_err("%s(): unaligned read\n", __func__);
return NULL;
}
hpfs_prefetch_sectors(s, secno, 4 + ahead);
+ if (!(qbh->bh[0] = sb_bread(s, secno + 0))) goto bail0;
+ if (!(qbh->bh[1] = sb_bread(s, secno + 1))) goto bail1;
+ if (!(qbh->bh[2] = sb_bread(s, secno + 2))) goto bail2;
+ if (!(qbh->bh[3] = sb_bread(s, secno + 3))) goto bail3;
+
+ if (likely(qbh->bh[1]->b_data == qbh->bh[0]->b_data + 1 * 512) &&
+ likely(qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512) &&
+ likely(qbh->bh[3]->b_data == qbh->bh[0]->b_data + 3 * 512)) {
+ return qbh->data = qbh->bh[0]->b_data;
+ }
+
qbh->data = data = kmalloc(2048, GFP_NOFS);
if (!data) {
- printk("HPFS: hpfs_map_4sectors: out of memory\n");
- goto bail;
+ pr_err("%s(): out of memory\n", __func__);
+ goto bail4;
}
- qbh->bh[0] = bh = sb_bread(s, secno);
- if (!bh)
- goto bail0;
- memcpy(data, bh->b_data, 512);
-
- qbh->bh[1] = bh = sb_bread(s, secno + 1);
- if (!bh)
- goto bail1;
- memcpy(data + 512, bh->b_data, 512);
-
- qbh->bh[2] = bh = sb_bread(s, secno + 2);
- if (!bh)
- goto bail2;
- memcpy(data + 2 * 512, bh->b_data, 512);
-
- qbh->bh[3] = bh = sb_bread(s, secno + 3);
- if (!bh)
- goto bail3;
- memcpy(data + 3 * 512, bh->b_data, 512);
+ memcpy(data + 0 * 512, qbh->bh[0]->b_data, 512);
+ memcpy(data + 1 * 512, qbh->bh[1]->b_data, 512);
+ memcpy(data + 2 * 512, qbh->bh[2]->b_data, 512);
+ memcpy(data + 3 * 512, qbh->bh[3]->b_data, 512);
return data;
+ bail4:
+ brelse(qbh->bh[3]);
bail3:
brelse(qbh->bh[2]);
bail2:
@@ -135,9 +132,6 @@ void *hpfs_map_4sectors(struct super_block *s, unsigned secno, struct quad_buffe
bail1:
brelse(qbh->bh[0]);
bail0:
- kfree(data);
- printk("HPFS: hpfs_map_4sectors: read error\n");
- bail:
return NULL;
}
@@ -151,48 +145,58 @@ void *hpfs_get_4sectors(struct super_block *s, unsigned secno,
hpfs_lock_assert(s);
if (secno & 3) {
- printk("HPFS: hpfs_get_4sectors: unaligned read\n");
+ pr_err("%s(): unaligned read\n", __func__);
return NULL;
}
- /*return hpfs_map_4sectors(s, secno, qbh, 0);*/
+ if (!hpfs_get_sector(s, secno + 0, &qbh->bh[0])) goto bail0;
+ if (!hpfs_get_sector(s, secno + 1, &qbh->bh[1])) goto bail1;
+ if (!hpfs_get_sector(s, secno + 2, &qbh->bh[2])) goto bail2;
+ if (!hpfs_get_sector(s, secno + 3, &qbh->bh[3])) goto bail3;
+
+ if (likely(qbh->bh[1]->b_data == qbh->bh[0]->b_data + 1 * 512) &&
+ likely(qbh->bh[2]->b_data == qbh->bh[0]->b_data + 2 * 512) &&
+ likely(qbh->bh[3]->b_data == qbh->bh[0]->b_data + 3 * 512)) {
+ return qbh->data = qbh->bh[0]->b_data;
+ }
+
if (!(qbh->data = kmalloc(2048, GFP_NOFS))) {
- printk("HPFS: hpfs_get_4sectors: out of memory\n");
- return NULL;
+ pr_err("%s(): out of memory\n", __func__);
+ goto bail4;
}
- if (!(hpfs_get_sector(s, secno, &qbh->bh[0]))) goto bail0;
- if (!(hpfs_get_sector(s, secno + 1, &qbh->bh[1]))) goto bail1;
- if (!(hpfs_get_sector(s, secno + 2, &qbh->bh[2]))) goto bail2;
- if (!(hpfs_get_sector(s, secno + 3, &qbh->bh[3]))) goto bail3;
- memcpy(qbh->data, qbh->bh[0]->b_data, 512);
- memcpy(qbh->data + 512, qbh->bh[1]->b_data, 512);
- memcpy(qbh->data + 2*512, qbh->bh[2]->b_data, 512);
- memcpy(qbh->data + 3*512, qbh->bh[3]->b_data, 512);
return qbh->data;
- bail3: brelse(qbh->bh[2]);
- bail2: brelse(qbh->bh[1]);
- bail1: brelse(qbh->bh[0]);
- bail0:
+bail4:
+ brelse(qbh->bh[3]);
+bail3:
+ brelse(qbh->bh[2]);
+bail2:
+ brelse(qbh->bh[1]);
+bail1:
+ brelse(qbh->bh[0]);
+bail0:
return NULL;
}
void hpfs_brelse4(struct quad_buffer_head *qbh)
{
- brelse(qbh->bh[3]);
- brelse(qbh->bh[2]);
- brelse(qbh->bh[1]);
+ if (unlikely(qbh->data != qbh->bh[0]->b_data))
+ kfree(qbh->data);
brelse(qbh->bh[0]);
- kfree(qbh->data);
+ brelse(qbh->bh[1]);
+ brelse(qbh->bh[2]);
+ brelse(qbh->bh[3]);
}
void hpfs_mark_4buffers_dirty(struct quad_buffer_head *qbh)
{
- memcpy(qbh->bh[0]->b_data, qbh->data, 512);
- memcpy(qbh->bh[1]->b_data, qbh->data + 512, 512);
- memcpy(qbh->bh[2]->b_data, qbh->data + 2 * 512, 512);
- memcpy(qbh->bh[3]->b_data, qbh->data + 3 * 512, 512);
+ if (unlikely(qbh->data != qbh->bh[0]->b_data)) {
+ memcpy(qbh->bh[0]->b_data, qbh->data + 0 * 512, 512);
+ memcpy(qbh->bh[1]->b_data, qbh->data + 1 * 512, 512);
+ memcpy(qbh->bh[2]->b_data, qbh->data + 2 * 512, 512);
+ memcpy(qbh->bh[3]->b_data, qbh->data + 3 * 512, 512);
+ }
mark_buffer_dirty(qbh->bh[0]);
mark_buffer_dirty(qbh->bh[1]);
mark_buffer_dirty(qbh->bh[2]);
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index 292b1acb9b8..2a8e07425de 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -36,7 +36,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
mutex_lock(&i->i_mutex);
hpfs_lock(s);
- /*printk("dir lseek\n");*/
+ /*pr_info("dir lseek\n");*/
if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok;
pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1;
while (pos != new_off) {
@@ -51,7 +51,7 @@ ok:
mutex_unlock(&i->i_mutex);
return new_off;
fail:
- /*printk("illegal lseek: %016llx\n", new_off);*/
+ /*pr_warn("illegal lseek: %016llx\n", new_off);*/
hpfs_unlock(s);
mutex_unlock(&i->i_mutex);
return -ESPIPE;
@@ -127,7 +127,7 @@ static int hpfs_readdir(struct file *file, struct dir_context *ctx)
if (ctx->pos == 12)
goto out;
if (ctx->pos == 3 || ctx->pos == 4 || ctx->pos == 5) {
- printk("HPFS: warning: pos==%d\n",(int)ctx->pos);
+ pr_err("pos==%d\n", (int)ctx->pos);
goto out;
}
if (ctx->pos == 0) {
diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c
index 4364b2a02c5..f36fc010fcc 100644
--- a/fs/hpfs/dnode.c
+++ b/fs/hpfs/dnode.c
@@ -17,7 +17,7 @@ static loff_t get_pos(struct dnode *d, struct hpfs_dirent *fde)
if (de == fde) return ((loff_t) le32_to_cpu(d->self) << 4) | (loff_t)i;
i++;
}
- printk("HPFS: get_pos: not_found\n");
+ pr_info("%s(): not_found\n", __func__);
return ((loff_t)le32_to_cpu(d->self) << 4) | (loff_t)1;
}
@@ -32,7 +32,7 @@ void hpfs_add_pos(struct inode *inode, loff_t *pos)
if (hpfs_inode->i_rddir_off[i] == pos) return;
if (!(i&0x0f)) {
if (!(ppos = kmalloc((i+0x11) * sizeof(loff_t*), GFP_NOFS))) {
- printk("HPFS: out of memory for position list\n");
+ pr_err("out of memory for position list\n");
return;
}
if (hpfs_inode->i_rddir_off) {
@@ -63,7 +63,8 @@ void hpfs_del_pos(struct inode *inode, loff_t *pos)
}
return;
not_f:
- /*printk("HPFS: warning: position pointer %p->%08x not found\n", pos, (int)*pos);*/
+ /*pr_warn("position pointer %p->%08x not found\n",
+ pos, (int)*pos);*/
return;
}
@@ -92,8 +93,11 @@ static void hpfs_pos_ins(loff_t *p, loff_t d, loff_t c)
{
if ((*p & ~0x3f) == (d & ~0x3f) && (*p & 0x3f) >= (d & 0x3f)) {
int n = (*p & 0x3f) + c;
- if (n > 0x3f) printk("HPFS: hpfs_pos_ins: %08x + %d\n", (int)*p, (int)c >> 8);
- else *p = (*p & ~0x3f) | n;
+ if (n > 0x3f)
+ pr_err("%s(): %08x + %d\n",
+ __func__, (int)*p, (int)c >> 8);
+ else
+ *p = (*p & ~0x3f) | n;
}
}
@@ -101,8 +105,11 @@ static void hpfs_pos_del(loff_t *p, loff_t d, loff_t c)
{
if ((*p & ~0x3f) == (d & ~0x3f) && (*p & 0x3f) >= (d & 0x3f)) {
int n = (*p & 0x3f) - c;
- if (n < 1) printk("HPFS: hpfs_pos_ins: %08x - %d\n", (int)*p, (int)c >> 8);
- else *p = (*p & ~0x3f) | n;
+ if (n < 1)
+ pr_err("%s(): %08x - %d\n",
+ __func__, (int)*p, (int)c >> 8);
+ else
+ *p = (*p & ~0x3f) | n;
}
}
@@ -239,12 +246,12 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
struct fnode *fnode;
int c1, c2 = 0;
if (!(nname = kmalloc(256, GFP_NOFS))) {
- printk("HPFS: out of memory, can't add to dnode\n");
+ pr_err("out of memory, can't add to dnode\n");
return 1;
}
go_up:
if (namelen >= 256) {
- hpfs_error(i->i_sb, "hpfs_add_to_dnode: namelen == %d", namelen);
+ hpfs_error(i->i_sb, "%s(): namelen == %d", __func__, namelen);
kfree(nd);
kfree(nname);
return 1;
@@ -281,7 +288,7 @@ static int hpfs_add_to_dnode(struct inode *i, dnode_secno dno,
not be any error while splitting dnodes, otherwise the
whole directory, not only file we're adding, would
be lost. */
- printk("HPFS: out of memory for dnode splitting\n");
+ pr_err("out of memory for dnode splitting\n");
hpfs_brelse4(&qbh);
kfree(nname);
return 1;
@@ -597,7 +604,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
if (!de_next->down) goto endm;
ndown = de_down_pointer(de_next);
if (!(de_cp = kmalloc(le16_to_cpu(de->length), GFP_NOFS))) {
- printk("HPFS: out of memory for dtree balancing\n");
+ pr_err("out of memory for dtree balancing\n");
goto endm;
}
memcpy(de_cp, de, le16_to_cpu(de->length));
@@ -612,7 +619,8 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
hpfs_brelse4(&qbh1);
}
hpfs_add_to_dnode(i, ndown, de_cp->name, de_cp->namelen, de_cp, de_cp->down ? de_down_pointer(de_cp) : 0);
- /*printk("UP-TO-DNODE: %08x (ndown = %08x, down = %08x, dno = %08x)\n", up, ndown, down, dno);*/
+ /*pr_info("UP-TO-DNODE: %08x (ndown = %08x, down = %08x, dno = %08x)\n",
+ up, ndown, down, dno);*/
dno = up;
kfree(de_cp);
goto try_it_again;
@@ -637,15 +645,15 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
if (!dlp && down) {
if (le32_to_cpu(d1->first_free) > 2044) {
if (hpfs_sb(i->i_sb)->sb_chk >= 2) {
- printk("HPFS: warning: unbalanced dnode tree, see hpfs.txt 4 more info\n");
- printk("HPFS: warning: terminating balancing operation\n");
+ pr_err("unbalanced dnode tree, see hpfs.txt 4 more info\n");
+ pr_err("terminating balancing operation\n");
}
hpfs_brelse4(&qbh1);
goto endm;
}
if (hpfs_sb(i->i_sb)->sb_chk >= 2) {
- printk("HPFS: warning: unbalanced dnode tree, see hpfs.txt 4 more info\n");
- printk("HPFS: warning: goin'on\n");
+ pr_err("unbalanced dnode tree, see hpfs.txt 4 more info\n");
+ pr_err("goin'on\n");
}
le16_add_cpu(&del->length, 4);
del->down = 1;
@@ -659,7 +667,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
*(__le32 *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down);
} else goto endm;
if (!(de_cp = kmalloc(le16_to_cpu(de_prev->length), GFP_NOFS))) {
- printk("HPFS: out of memory for dtree balancing\n");
+ pr_err("out of memory for dtree balancing\n");
hpfs_brelse4(&qbh1);
goto endm;
}
@@ -1000,7 +1008,7 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno,
int d1, d2 = 0;
name1 = f->name;
if (!(name2 = kmalloc(256, GFP_NOFS))) {
- printk("HPFS: out of memory, can't map dirent\n");
+ pr_err("out of memory, can't map dirent\n");
return NULL;
}
if (f->len <= 15)
diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c
index bcaafcd2666..ce3f98ba993 100644
--- a/fs/hpfs/ea.c
+++ b/fs/hpfs/ea.c
@@ -51,7 +51,7 @@ static char *get_indirect_ea(struct super_block *s, int ano, secno a, int size)
{
char *ret;
if (!(ret = kmalloc(size + 1, GFP_NOFS))) {
- printk("HPFS: out of memory for EA\n");
+ pr_err("out of memory for EA\n");
return NULL;
}
if (hpfs_ea_read(s, a, ano, 0, size, ret)) {
@@ -139,7 +139,7 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si
if (ea_indirect(ea))
return get_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), *size = ea_len(ea));
if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) {
- printk("HPFS: out of memory for EA\n");
+ pr_err("out of memory for EA\n");
return NULL;
}
memcpy(ret, ea_data(ea), ea_valuelen(ea));
@@ -165,7 +165,7 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si
if (ea_indirect(ea))
return get_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), *size = ea_len(ea));
if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) {
- printk("HPFS: out of memory for EA\n");
+ pr_err("out of memory for EA\n");
return NULL;
}
if (hpfs_ea_read(s, a, ano, pos + 4 + ea->namelen + 1, ea_valuelen(ea), ret)) {
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 67c1a61e095..7f54e5f76ce 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -197,10 +197,10 @@ const struct address_space_operations hpfs_aops = {
const struct file_operations hpfs_file_ops =
{
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.release = hpfs_file_release,
.fsync = hpfs_file_fsync,
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 1b398636e99..b63b75fa00e 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -8,6 +8,11 @@
//#define DBG
//#define DEBUG_LOCKS
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mutex.h>
#include <linux/pagemap.h>
@@ -80,6 +85,7 @@ struct hpfs_sb_info {
unsigned sb_c_bitmap; /* current bitmap */
unsigned sb_max_fwd_alloc; /* max forwad allocation */
int sb_timeshift;
+ struct rcu_head rcu;
};
/* Four 512-byte buffers and the 2k block obtained by concatenating them */
@@ -311,7 +317,7 @@ static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb)
__printf(2, 3)
void hpfs_error(struct super_block *, const char *, ...);
int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *);
-unsigned hpfs_count_one_bitmap(struct super_block *, secno);
+unsigned hpfs_get_free_dnodes(struct super_block *);
/*
* local time (HPFS) to GMT (Unix)
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 9edeeb0ea97..7ce4b74234a 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -183,7 +183,8 @@ void hpfs_write_inode(struct inode *i)
struct inode *parent;
if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return;
if (hpfs_inode->i_rddir_off && !atomic_read(&i->i_count)) {
- if (*hpfs_inode->i_rddir_off) printk("HPFS: write_inode: some position still there\n");
+ if (*hpfs_inode->i_rddir_off)
+ pr_err("write_inode: some position still there\n");
kfree(hpfs_inode->i_rddir_off);
hpfs_inode->i_rddir_off = NULL;
}
@@ -304,7 +305,7 @@ void hpfs_write_if_changed(struct inode *inode)
void hpfs_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (!inode->i_nlink) {
hpfs_lock(inode->i_sb);
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index 3aa66ae1031..442770edcdc 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -65,12 +65,13 @@ unsigned char *hpfs_load_code_page(struct super_block *s, secno cps)
struct code_page_directory *cp = hpfs_map_sector(s, cps, &bh, 0);
if (!cp) return NULL;
if (le32_to_cpu(cp->magic) != CP_DIR_MAGIC) {
- printk("HPFS: Code page directory magic doesn't match (magic = %08x)\n", le32_to_cpu(cp->magic));
+ pr_err("Code page directory magic doesn't match (magic = %08x)\n",
+ le32_to_cpu(cp->magic));
brelse(bh);
return NULL;
}
if (!le32_to_cpu(cp->n_code_pages)) {
- printk("HPFS: n_code_pages == 0\n");
+ pr_err("n_code_pages == 0\n");
brelse(bh);
return NULL;
}
@@ -79,19 +80,19 @@ unsigned char *hpfs_load_code_page(struct super_block *s, secno cps)
brelse(bh);
if (cpi >= 3) {
- printk("HPFS: Code page index out of array\n");
+ pr_err("Code page index out of array\n");
return NULL;
}
if (!(cpd = hpfs_map_sector(s, cpds, &bh, 0))) return NULL;
if (le16_to_cpu(cpd->offs[cpi]) > 0x178) {
- printk("HPFS: Code page index out of sector\n");
+ pr_err("Code page index out of sector\n");
brelse(bh);
return NULL;
}
ptr = (unsigned char *)cpd + le16_to_cpu(cpd->offs[cpi]) + 6;
if (!(cp_table = kmalloc(256, GFP_KERNEL))) {
- printk("HPFS: out of memory for code page table\n");
+ pr_err("out of memory for code page table\n");
brelse(bh);
return NULL;
}
@@ -114,7 +115,7 @@ __le32 *hpfs_load_bitmap_directory(struct super_block *s, secno bmp)
int i;
__le32 *b;
if (!(b = kmalloc(n * 512, GFP_KERNEL))) {
- printk("HPFS: can't allocate memory for bitmap directory\n");
+ pr_err("can't allocate memory for bitmap directory\n");
return NULL;
}
for (i=0;i<n;i++) {
@@ -281,7 +282,9 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno,
hpfs_error(s, "dnode %08x does not end with \\377 entry", secno);
goto bail;
}
- if (b == 3) printk("HPFS: warning: unbalanced dnode tree, dnode %08x; see hpfs.txt 4 more info\n", secno);
+ if (b == 3)
+ pr_err("unbalanced dnode tree, dnode %08x; see hpfs.txt 4 more info\n",
+ secno);
}
return dnode;
bail:
diff --git a/fs/hpfs/name.c b/fs/hpfs/name.c
index 9acdf338def..b00d396d22c 100644
--- a/fs/hpfs/name.c
+++ b/fs/hpfs/name.c
@@ -56,14 +56,15 @@ unsigned char *hpfs_translate_name(struct super_block *s, unsigned char *from,
unsigned char *to;
int i;
if (hpfs_sb(s)->sb_chk >= 2) if (hpfs_is_name_long(from, len) != lng) {
- printk("HPFS: Long name flag mismatch - name ");
- for (i=0; i<len; i++) printk("%c", from[i]);
- printk(" misidentified as %s.\n", lng ? "short" : "long");
- printk("HPFS: It's nothing serious. It could happen because of bug in OS/2.\nHPFS: Set checks=normal to disable this message.\n");
+ pr_err("Long name flag mismatch - name ");
+ for (i = 0; i < len; i++)
+ pr_cont("%c", from[i]);
+ pr_cont(" misidentified as %s.\n", lng ? "short" : "long");
+ pr_err("It's nothing serious. It could happen because of bug in OS/2.\nSet checks=normal to disable this message.\n");
}
if (!lc) return from;
if (!(to = kmalloc(len, GFP_KERNEL))) {
- printk("HPFS: can't allocate memory for name conversion buffer\n");
+ pr_err("can't allocate memory for name conversion buffer\n");
return from;
}
for (i = 0; i < len; i++) to[i] = locase(hpfs_sb(s)->sb_cp_table,from[i]);
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 345713d2f8f..bdbc2c3080a 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -404,10 +404,10 @@ again:
d_rehash(dentry);
} else {
struct iattr newattrs;
- /*printk("HPFS: truncating file before delete.\n");*/
+ /*pr_info("truncating file before delete.\n");*/
newattrs.ia_size = 0;
newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
- err = notify_change(dentry, &newattrs);
+ err = notify_change(dentry, &newattrs, NULL);
put_write_access(inode);
if (!err)
goto again;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 4334cda8dba..7cd00d3a7c9 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -62,22 +62,26 @@ void hpfs_error(struct super_block *s, const char *fmt, ...)
vsnprintf(err_buf, sizeof(err_buf), fmt, args);
va_end(args);
- printk("HPFS: filesystem error: %s", err_buf);
+ pr_err("filesystem error: %s", err_buf);
if (!hpfs_sb(s)->sb_was_error) {
if (hpfs_sb(s)->sb_err == 2) {
- printk("; crashing the system because you wanted it\n");
+ pr_cont("; crashing the system because you wanted it\n");
mark_dirty(s, 0);
panic("HPFS panic");
} else if (hpfs_sb(s)->sb_err == 1) {
- if (s->s_flags & MS_RDONLY) printk("; already mounted read-only\n");
+ if (s->s_flags & MS_RDONLY)
+ pr_cont("; already mounted read-only\n");
else {
- printk("; remounting read-only\n");
+ pr_cont("; remounting read-only\n");
mark_dirty(s, 0);
s->s_flags |= MS_RDONLY;
}
- } else if (s->s_flags & MS_RDONLY) printk("; going on - but anything won't be destroyed because it's read-only\n");
- else printk("; corrupted filesystem mounted read/write - your computer will explode within 20 seconds ... but you wanted it so!\n");
- } else printk("\n");
+ } else if (s->s_flags & MS_RDONLY)
+ pr_cont("; going on - but anything won't be destroyed because it's read-only\n");
+ else
+ pr_cont("; corrupted filesystem mounted read/write - your computer will explode within 20 seconds ... but you wanted it so!\n");
+ } else
+ pr_cont("\n");
hpfs_sb(s)->sb_was_error = 1;
}
@@ -101,21 +105,27 @@ int hpfs_stop_cycles(struct super_block *s, int key, int *c1, int *c2,
return 0;
}
-static void hpfs_put_super(struct super_block *s)
+static void free_sbi(struct hpfs_sb_info *sbi)
{
- struct hpfs_sb_info *sbi = hpfs_sb(s);
+ kfree(sbi->sb_cp_table);
+ kfree(sbi->sb_bmp_dir);
+ kfree(sbi);
+}
+static void lazy_free_sbi(struct rcu_head *rcu)
+{
+ free_sbi(container_of(rcu, struct hpfs_sb_info, rcu));
+}
+
+static void hpfs_put_super(struct super_block *s)
+{
hpfs_lock(s);
unmark_dirty(s);
hpfs_unlock(s);
-
- kfree(sbi->sb_cp_table);
- kfree(sbi->sb_bmp_dir);
- s->s_fs_info = NULL;
- kfree(sbi);
+ call_rcu(&hpfs_sb(s)->rcu, lazy_free_sbi);
}
-unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
+static unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
{
struct quad_buffer_head qbh;
unsigned long *bits;
@@ -123,7 +133,7 @@ unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
bits = hpfs_map_4sectors(s, secno, &qbh, 0);
if (!bits)
- return 0;
+ return (unsigned)-1;
count = bitmap_weight(bits, 2048 * BITS_PER_BYTE);
hpfs_brelse4(&qbh);
return count;
@@ -138,30 +148,45 @@ static unsigned count_bitmaps(struct super_block *s)
hpfs_prefetch_bitmap(s, n);
}
for (n = 0; n < n_bands; n++) {
+ unsigned c;
hpfs_prefetch_bitmap(s, n + COUNT_RD_AHEAD);
- count += hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n]));
+ c = hpfs_count_one_bitmap(s, le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[n]));
+ if (c != (unsigned)-1)
+ count += c;
}
return count;
}
+unsigned hpfs_get_free_dnodes(struct super_block *s)
+{
+ struct hpfs_sb_info *sbi = hpfs_sb(s);
+ if (sbi->sb_n_free_dnodes == (unsigned)-1) {
+ unsigned c = hpfs_count_one_bitmap(s, sbi->sb_dmap);
+ if (c == (unsigned)-1)
+ return 0;
+ sbi->sb_n_free_dnodes = c;
+ }
+ return sbi->sb_n_free_dnodes;
+}
+
static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *s = dentry->d_sb;
struct hpfs_sb_info *sbi = hpfs_sb(s);
u64 id = huge_encode_dev(s->s_bdev->bd_dev);
+
hpfs_lock(s);
- /*if (sbi->sb_n_free == -1) {*/
+ if (sbi->sb_n_free == (unsigned)-1)
sbi->sb_n_free = count_bitmaps(s);
- sbi->sb_n_free_dnodes = hpfs_count_one_bitmap(s, sbi->sb_dmap);
- /*}*/
+
buf->f_type = s->s_magic;
buf->f_bsize = 512;
buf->f_blocks = sbi->sb_fs_size;
buf->f_bfree = sbi->sb_n_free;
buf->f_bavail = sbi->sb_n_free;
buf->f_files = sbi->sb_dirband_size / 4;
- buf->f_ffree = sbi->sb_n_free_dnodes;
+ buf->f_ffree = hpfs_get_free_dnodes(s);
buf->f_fsid.val[0] = (u32)id;
buf->f_fsid.val[1] = (u32)(id >> 32);
buf->f_namelen = 254;
@@ -271,7 +296,7 @@ static int parse_opts(char *opts, kuid_t *uid, kgid_t *gid, umode_t *umask,
if (!opts)
return 1;
- /*printk("Parsing opts: '%s'\n",opts);*/
+ /*pr_info("Parsing opts: '%s'\n",opts);*/
while ((p = strsep(&opts, ",")) != NULL) {
substring_t args[MAX_OPT_ARGS];
@@ -366,7 +391,7 @@ static int parse_opts(char *opts, kuid_t *uid, kgid_t *gid, umode_t *umask,
static inline void hpfs_help(void)
{
- printk("\n\
+ pr_info("\n\
HPFS filesystem options:\n\
help do not mount and display this text\n\
uid=xxx set uid of files that don't have uid specified in eas\n\
@@ -400,6 +425,8 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
struct hpfs_sb_info *sbi = hpfs_sb(s);
char *new_opts = kstrdup(data, GFP_KERNEL);
+ sync_filesystem(s);
+
*flags |= MS_NOATIME;
hpfs_lock(s);
@@ -411,7 +438,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
if (!(o = parse_opts(data, &uid, &gid, &umask, &lowercase,
&eas, &chk, &errs, &chkdsk, &timeshift))) {
- printk("HPFS: bad mount options.\n");
+ pr_err("bad mount options.\n");
goto out_err;
}
if (o == 2) {
@@ -419,7 +446,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
goto out_err;
}
if (timeshift != sbi->sb_timeshift) {
- printk("HPFS: timeshift can't be changed using remount.\n");
+ pr_err("timeshift can't be changed using remount.\n");
goto out_err;
}
@@ -485,9 +512,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
}
s->s_fs_info = sbi;
- sbi->sb_bmp_dir = NULL;
- sbi->sb_cp_table = NULL;
-
mutex_init(&sbi->hpfs_mutex);
hpfs_lock(s);
@@ -503,7 +527,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
if (!(o = parse_opts(options, &uid, &gid, &umask, &lowercase,
&eas, &chk, &errs, &chkdsk, &timeshift))) {
- printk("HPFS: bad mount options.\n");
+ pr_err("bad mount options.\n");
goto bail0;
}
if (o==2) {
@@ -522,16 +546,17 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
if (/*le16_to_cpu(bootblock->magic) != BB_MAGIC
||*/ le32_to_cpu(superblock->magic) != SB_MAGIC
|| le32_to_cpu(spareblock->magic) != SP_MAGIC) {
- if (!silent) printk("HPFS: Bad magic ... probably not HPFS\n");
+ if (!silent)
+ pr_err("Bad magic ... probably not HPFS\n");
goto bail4;
}
/* Check version */
if (!(s->s_flags & MS_RDONLY) &&
superblock->funcversion != 2 && superblock->funcversion != 3) {
- printk("HPFS: Bad version %d,%d. Mount readonly to go around\n",
+ pr_err("Bad version %d,%d. Mount readonly to go around\n",
(int)superblock->version, (int)superblock->funcversion);
- printk("HPFS: please try recent version of HPFS driver at http://artax.karlin.mff.cuni.cz/~mikulas/vyplody/hpfs/index-e.cgi and if it still can't understand this format, contact author - mikulas@artax.karlin.mff.cuni.cz\n");
+ pr_err("please try recent version of HPFS driver at http://artax.karlin.mff.cuni.cz/~mikulas/vyplody/hpfs/index-e.cgi and if it still can't understand this format, contact author - mikulas@artax.karlin.mff.cuni.cz\n");
goto bail4;
}
@@ -577,7 +602,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
/* Check for general fs errors*/
if (spareblock->dirty && !spareblock->old_wrote) {
if (errs == 2) {
- printk("HPFS: Improperly stopped, not mounted\n");
+ pr_err("Improperly stopped, not mounted\n");
goto bail4;
}
hpfs_error(s, "improperly stopped");
@@ -591,22 +616,25 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
if (spareblock->hotfixes_used || spareblock->n_spares_used) {
if (errs >= 2) {
- printk("HPFS: Hotfixes not supported here, try chkdsk\n");
+ pr_err("Hotfixes not supported here, try chkdsk\n");
mark_dirty(s, 0);
goto bail4;
}
hpfs_error(s, "hotfixes not supported here, try chkdsk");
- if (errs == 0) printk("HPFS: Proceeding, but your filesystem will be probably corrupted by this driver...\n");
- else printk("HPFS: This driver may read bad files or crash when operating on disk with hotfixes.\n");
+ if (errs == 0)
+ pr_err("Proceeding, but your filesystem will be probably corrupted by this driver...\n");
+ else
+ pr_err("This driver may read bad files or crash when operating on disk with hotfixes.\n");
}
if (le32_to_cpu(spareblock->n_dnode_spares) != le32_to_cpu(spareblock->n_dnode_spares_free)) {
if (errs >= 2) {
- printk("HPFS: Spare dnodes used, try chkdsk\n");
+ pr_err("Spare dnodes used, try chkdsk\n");
mark_dirty(s, 0);
goto bail4;
}
hpfs_error(s, "warning: spare dnodes used, try chkdsk");
- if (errs == 0) printk("HPFS: Proceeding, but your filesystem could be corrupted if you delete files or directories\n");
+ if (errs == 0)
+ pr_err("Proceeding, but your filesystem could be corrupted if you delete files or directories\n");
}
if (chk) {
unsigned a;
@@ -625,12 +653,13 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
goto bail4;
}
sbi->sb_dirband_size = a;
- } else printk("HPFS: You really don't want any checks? You are crazy...\n");
+ } else
+ pr_err("You really don't want any checks? You are crazy...\n");
/* Load code page table */
if (le32_to_cpu(spareblock->n_code_pages))
if (!(sbi->sb_cp_table = hpfs_load_code_page(s, le32_to_cpu(spareblock->code_page_dir))))
- printk("HPFS: Warning: code page support is disabled\n");
+ pr_err("code page support is disabled\n");
brelse(bh2);
brelse(bh1);
@@ -679,10 +708,7 @@ bail2: brelse(bh0);
bail1:
bail0:
hpfs_unlock(s);
- kfree(sbi->sb_bmp_dir);
- kfree(sbi->sb_cp_table);
- s->s_fs_info = NULL;
- kfree(sbi);
+ free_sbi(sbi);
return -EINVAL;
}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d19b30ababf..1e2872b2534 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -6,6 +6,8 @@
* Copyright (C) 2002 Linus Torvalds.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/thread_info.h>
#include <asm/current.h>
@@ -366,7 +368,13 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
static void hugetlbfs_evict_inode(struct inode *inode)
{
+ struct resv_map *resv_map;
+
truncate_hugepages(inode, 0);
+ resv_map = (struct resv_map *)inode->i_mapping->private_data;
+ /* root inode doesn't have the resv_map, so we should check it */
+ if (resv_map)
+ resv_map_release(&resv_map->refs);
clear_inode(inode);
}
@@ -469,13 +477,18 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
* annotation because huge_pmd_share() does an allocation under
* i_mmap_mutex.
*/
-struct lock_class_key hugetlbfs_i_mmap_mutex_key;
+static struct lock_class_key hugetlbfs_i_mmap_mutex_key;
static struct inode *hugetlbfs_get_inode(struct super_block *sb,
struct inode *dir,
umode_t mode, dev_t dev)
{
struct inode *inode;
+ struct resv_map *resv_map;
+
+ resv_map = resv_map_alloc();
+ if (!resv_map)
+ return NULL;
inode = new_inode(sb);
if (inode) {
@@ -487,7 +500,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
inode->i_mapping->a_ops = &hugetlbfs_aops;
inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- INIT_LIST_HEAD(&inode->i_mapping->private_list);
+ inode->i_mapping->private_data = resv_map;
info = HUGETLBFS_I(inode);
/*
* The policy is initialized here even if we are creating a
@@ -517,7 +530,9 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
break;
}
lockdep_annotate_inode_mutex_key(inode);
- }
+ } else
+ kref_put(&resv_map->refs, resv_map_release);
+
return inode;
}
@@ -810,8 +825,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
ps = memparse(args[0].from, &rest);
pconfig->hstate = size_to_hstate(ps);
if (!pconfig->hstate) {
- printk(KERN_ERR
- "hugetlbfs: Unsupported page size %lu MB\n",
+ pr_err("Unsupported page size %lu MB\n",
ps >> 20);
return -EINVAL;
}
@@ -819,8 +833,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
}
default:
- printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n",
- p);
+ pr_err("Bad mount option: \"%s\"\n", p);
return -EINVAL;
break;
}
@@ -840,8 +853,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
return 0;
bad_val:
- printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n",
- args[0].from, p);
+ pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p);
return -EINVAL;
}
@@ -889,8 +901,7 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent)
goto out_free;
return 0;
out_free:
- if (sbinfo->spool)
- kfree(sbinfo->spool);
+ kfree(sbinfo->spool);
kfree(sbinfo);
return -ENOMEM;
}
@@ -926,7 +937,7 @@ static int get_hstate_idx(int page_size_log)
return h - hstates;
}
-static struct dentry_operations anon_ops = {
+static const struct dentry_operations anon_ops = {
.d_dname = simple_dname
};
@@ -957,8 +968,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
*user = current_user();
if (user_shm_lock(size, *user)) {
task_lock(current);
- printk_once(KERN_WARNING
- "%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
+ pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
current->comm, current->pid);
task_unlock(current);
} else {
@@ -1017,6 +1027,11 @@ static int __init init_hugetlbfs_fs(void)
int error;
int i;
+ if (!hugepages_supported()) {
+ pr_info("disabling because there are no supported hugepage sizes\n");
+ return -ENOTSUPP;
+ }
+
error = bdi_init(&hugetlbfs_backing_dev_info);
if (error)
return error;
@@ -1042,7 +1057,7 @@ static int __init init_hugetlbfs_fs(void)
buf);
if (IS_ERR(hugetlbfs_vfsmount[i])) {
- pr_err("hugetlb: Cannot mount internal hugetlbfs for "
+ pr_err("Cannot mount internal hugetlbfs for "
"page size %uK", ps_kb);
error = PTR_ERR(hugetlbfs_vfsmount[i]);
hugetlbfs_vfsmount[i] = NULL;
diff --git a/fs/inode.c b/fs/inode.c
index b33ba8e021c..6eecb7ff0b9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -105,7 +105,7 @@ long get_nr_dirty_inodes(void)
* Handle nr_inode sysctl
*/
#ifdef CONFIG_SYSCTL
-int proc_nr_inodes(ctl_table *table, int write,
+int proc_nr_inodes(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
inodes_stat.nr_inodes = get_nr_inodes();
@@ -503,6 +503,7 @@ void clear_inode(struct inode *inode)
*/
spin_lock_irq(&inode->i_data.tree_lock);
BUG_ON(inode->i_data.nrpages);
+ BUG_ON(inode->i_data.nrshadows);
spin_unlock_irq(&inode->i_data.tree_lock);
BUG_ON(!list_empty(&inode->i_data.private_list));
BUG_ON(!(inode->i_state & I_FREEING));
@@ -548,8 +549,7 @@ static void evict(struct inode *inode)
if (op->evict_inode) {
op->evict_inode(inode);
} else {
- if (inode->i_data.nrpages)
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
}
if (S_ISBLK(inode->i_mode) && inode->i_bdev)
@@ -773,15 +773,11 @@ static struct inode *find_inode(struct super_block *sb,
repeat:
hlist_for_each_entry(inode, head, i_hash) {
- spin_lock(&inode->i_lock);
- if (inode->i_sb != sb) {
- spin_unlock(&inode->i_lock);
+ if (inode->i_sb != sb)
continue;
- }
- if (!test(inode, data)) {
- spin_unlock(&inode->i_lock);
+ if (!test(inode, data))
continue;
- }
+ spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
__wait_on_freeing_inode(inode);
goto repeat;
@@ -804,15 +800,11 @@ static struct inode *find_inode_fast(struct super_block *sb,
repeat:
hlist_for_each_entry(inode, head, i_hash) {
- spin_lock(&inode->i_lock);
- if (inode->i_ino != ino) {
- spin_unlock(&inode->i_lock);
+ if (inode->i_ino != ino)
continue;
- }
- if (inode->i_sb != sb) {
- spin_unlock(&inode->i_lock);
+ if (inode->i_sb != sb)
continue;
- }
+ spin_lock(&inode->i_lock);
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
__wait_on_freeing_inode(inode);
goto repeat;
@@ -951,6 +943,41 @@ void unlock_new_inode(struct inode *inode)
EXPORT_SYMBOL(unlock_new_inode);
/**
+ * lock_two_nondirectories - take two i_mutexes on non-directory objects
+ *
+ * Lock any non-NULL argument that is not a directory.
+ * Zero, one or two objects may be locked by this function.
+ *
+ * @inode1: first inode to lock
+ * @inode2: second inode to lock
+ */
+void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
+{
+ if (inode1 > inode2)
+ swap(inode1, inode2);
+
+ if (inode1 && !S_ISDIR(inode1->i_mode))
+ mutex_lock(&inode1->i_mutex);
+ if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
+ mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2);
+}
+EXPORT_SYMBOL(lock_two_nondirectories);
+
+/**
+ * unlock_two_nondirectories - release locks from lock_two_nondirectories()
+ * @inode1: first inode to unlock
+ * @inode2: second inode to unlock
+ */
+void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
+{
+ if (inode1 && !S_ISDIR(inode1->i_mode))
+ mutex_unlock(&inode1->i_mutex);
+ if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
+ mutex_unlock(&inode2->i_mutex);
+}
+EXPORT_SYMBOL(unlock_two_nondirectories);
+
+/**
* iget5_locked - obtain an inode from a mounted file system
* @sb: super block of file system
* @hashval: hash value (usually inode number) to get
@@ -1575,7 +1602,11 @@ static int __remove_suid(struct dentry *dentry, int kill)
struct iattr newattrs;
newattrs.ia_valid = ATTR_FORCE | kill;
- return notify_change(dentry, &newattrs);
+ /*
+ * Note we call this on write, so notify_change will not
+ * encounter any conflicting delegations:
+ */
+ return notify_change(dentry, &newattrs, NULL);
}
int file_remove_suid(struct file *file)
@@ -1808,14 +1839,18 @@ EXPORT_SYMBOL(inode_init_owner);
* inode_owner_or_capable - check current task permissions to inode
* @inode: inode being checked
*
- * Return true if current either has CAP_FOWNER to the inode, or
- * owns the file.
+ * Return true if current either has CAP_FOWNER in a namespace with the
+ * inode owner uid mapped, or owns the file.
*/
bool inode_owner_or_capable(const struct inode *inode)
{
+ struct user_namespace *ns;
+
if (uid_eq(current_fsuid(), inode->i_uid))
return true;
- if (inode_capable(inode, CAP_FOWNER))
+
+ ns = current_user_ns();
+ if (ns_capable(ns, CAP_FOWNER) && kuid_has_mapping(ns, inode->i_uid))
return true;
return false;
}
@@ -1867,3 +1902,34 @@ void inode_dio_done(struct inode *inode)
wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
}
EXPORT_SYMBOL(inode_dio_done);
+
+/*
+ * inode_set_flags - atomically set some inode flags
+ *
+ * Note: the caller should be holding i_mutex, or else be sure that
+ * they have exclusive access to the inode structure (i.e., while the
+ * inode is being instantiated). The reason for the cmpxchg() loop
+ * --- which wouldn't be necessary if all code paths which modify
+ * i_flags actually followed this rule, is that there is at least one
+ * code path which doesn't today --- for example,
+ * __generic_file_aio_write() calls file_remove_suid() without holding
+ * i_mutex --- so we use cmpxchg() out of an abundance of caution.
+ *
+ * In the long run, i_mutex is overkill, and we should probably look
+ * at using the i_lock spinlock to protect i_flags, and then make sure
+ * it is so documented in include/linux/fs.h and that all code follows
+ * the locking convention!!
+ */
+void inode_set_flags(struct inode *inode, unsigned int flags,
+ unsigned int mask)
+{
+ unsigned int old_flags, new_flags;
+
+ WARN_ON_ONCE(flags & ~mask);
+ do {
+ old_flags = ACCESS_ONCE(inode->i_flags);
+ new_flags = (old_flags & ~mask) | flags;
+ } while (unlikely(cmpxchg(&inode->i_flags, old_flags,
+ new_flags) != old_flags));
+}
+EXPORT_SYMBOL(inode_set_flags);
diff --git a/fs/internal.h b/fs/internal.h
index 513e0d859a6..46574240746 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -9,8 +9,6 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <linux/lglock.h>
-
struct super_block;
struct file_system_type;
struct linux_binprm;
@@ -62,8 +60,6 @@ extern int sb_prepare_remount_readonly(struct super_block *);
extern void __init mnt_init(void);
-extern struct lglock vfsmount_lock;
-
extern int __mnt_want_write(struct vfsmount *);
extern int __mnt_want_write_file(struct file *);
extern void __mnt_drop_write(struct vfsmount *);
@@ -77,9 +73,6 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
/*
* file_table.c
*/
-extern void file_sb_list_add(struct file *f, struct super_block *sb);
-extern void file_sb_list_del(struct file *f);
-extern void mark_files_ro(struct super_block *);
extern struct file *get_empty_filp(void);
/*
diff --git a/fs/ioctl.c b/fs/ioctl.c
index fd507fb460f..8ac3fad3619 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -37,7 +37,7 @@ static long vfs_ioctl(struct file *filp, unsigned int cmd,
{
int error = -ENOTTY;
- if (!filp->f_op || !filp->f_op->unlocked_ioctl)
+ if (!filp->f_op->unlocked_ioctl)
goto out;
error = filp->f_op->unlocked_ioctl(filp, cmd, arg);
@@ -501,7 +501,7 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp,
/* Did FASYNC state change ? */
if ((flag ^ filp->f_flags) & FASYNC) {
- if (filp->f_op && filp->f_op->fasync)
+ if (filp->f_op->fasync)
/* fasync() adjusts filp->f_flags */
error = filp->f_op->fasync(fd, filp, on);
else
diff --git a/fs/ioprio.c b/fs/ioprio.c
deleted file mode 100644
index e50170ca7c3..00000000000
--- a/fs/ioprio.c
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * fs/ioprio.c
- *
- * Copyright (C) 2004 Jens Axboe <axboe@kernel.dk>
- *
- * Helper functions for setting/querying io priorities of processes. The
- * system calls closely mimmick getpriority/setpriority, see the man page for
- * those. The prio argument is a composite of prio class and prio data, where
- * the data argument has meaning within that class. The standard scheduling
- * classes have 8 distinct prio levels, with 0 being the highest prio and 7
- * being the lowest.
- *
- * IOW, setting BE scheduling class with prio 2 is done ala:
- *
- * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;
- *
- * ioprio_set(PRIO_PROCESS, pid, prio);
- *
- * See also Documentation/block/ioprio.txt
- *
- */
-#include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/ioprio.h>
-#include <linux/blkdev.h>
-#include <linux/capability.h>
-#include <linux/syscalls.h>
-#include <linux/security.h>
-#include <linux/pid_namespace.h>
-
-int set_task_ioprio(struct task_struct *task, int ioprio)
-{
- int err;
- struct io_context *ioc;
- const struct cred *cred = current_cred(), *tcred;
-
- rcu_read_lock();
- tcred = __task_cred(task);
- if (!uid_eq(tcred->uid, cred->euid) &&
- !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
- rcu_read_unlock();
- return -EPERM;
- }
- rcu_read_unlock();
-
- err = security_task_setioprio(task, ioprio);
- if (err)
- return err;
-
- ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
- if (ioc) {
- ioc->ioprio = ioprio;
- put_io_context(ioc);
- }
-
- return err;
-}
-EXPORT_SYMBOL_GPL(set_task_ioprio);
-
-SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
-{
- int class = IOPRIO_PRIO_CLASS(ioprio);
- int data = IOPRIO_PRIO_DATA(ioprio);
- struct task_struct *p, *g;
- struct user_struct *user;
- struct pid *pgrp;
- kuid_t uid;
- int ret;
-
- switch (class) {
- case IOPRIO_CLASS_RT:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- /* fall through, rt has prio field too */
- case IOPRIO_CLASS_BE:
- if (data >= IOPRIO_BE_NR || data < 0)
- return -EINVAL;
-
- break;
- case IOPRIO_CLASS_IDLE:
- break;
- case IOPRIO_CLASS_NONE:
- if (data)
- return -EINVAL;
- break;
- default:
- return -EINVAL;
- }
-
- ret = -ESRCH;
- rcu_read_lock();
- switch (which) {
- case IOPRIO_WHO_PROCESS:
- if (!who)
- p = current;
- else
- p = find_task_by_vpid(who);
- if (p)
- ret = set_task_ioprio(p, ioprio);
- break;
- case IOPRIO_WHO_PGRP:
- if (!who)
- pgrp = task_pgrp(current);
- else
- pgrp = find_vpid(who);
- do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
- ret = set_task_ioprio(p, ioprio);
- if (ret)
- break;
- } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
- break;
- case IOPRIO_WHO_USER:
- uid = make_kuid(current_user_ns(), who);
- if (!uid_valid(uid))
- break;
- if (!who)
- user = current_user();
- else
- user = find_user(uid);
-
- if (!user)
- break;
-
- do_each_thread(g, p) {
- if (!uid_eq(task_uid(p), uid))
- continue;
- ret = set_task_ioprio(p, ioprio);
- if (ret)
- goto free_uid;
- } while_each_thread(g, p);
-free_uid:
- if (who)
- free_uid(user);
- break;
- default:
- ret = -EINVAL;
- }
-
- rcu_read_unlock();
- return ret;
-}
-
-static int get_task_ioprio(struct task_struct *p)
-{
- int ret;
-
- ret = security_task_getioprio(p);
- if (ret)
- goto out;
- ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM);
- if (p->io_context)
- ret = p->io_context->ioprio;
-out:
- return ret;
-}
-
-int ioprio_best(unsigned short aprio, unsigned short bprio)
-{
- unsigned short aclass = IOPRIO_PRIO_CLASS(aprio);
- unsigned short bclass = IOPRIO_PRIO_CLASS(bprio);
-
- if (aclass == IOPRIO_CLASS_NONE)
- aclass = IOPRIO_CLASS_BE;
- if (bclass == IOPRIO_CLASS_NONE)
- bclass = IOPRIO_CLASS_BE;
-
- if (aclass == bclass)
- return min(aprio, bprio);
- if (aclass > bclass)
- return bprio;
- else
- return aprio;
-}
-
-SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
-{
- struct task_struct *g, *p;
- struct user_struct *user;
- struct pid *pgrp;
- kuid_t uid;
- int ret = -ESRCH;
- int tmpio;
-
- rcu_read_lock();
- switch (which) {
- case IOPRIO_WHO_PROCESS:
- if (!who)
- p = current;
- else
- p = find_task_by_vpid(who);
- if (p)
- ret = get_task_ioprio(p);
- break;
- case IOPRIO_WHO_PGRP:
- if (!who)
- pgrp = task_pgrp(current);
- else
- pgrp = find_vpid(who);
- do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
- tmpio = get_task_ioprio(p);
- if (tmpio < 0)
- continue;
- if (ret == -ESRCH)
- ret = tmpio;
- else
- ret = ioprio_best(ret, tmpio);
- } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
- break;
- case IOPRIO_WHO_USER:
- uid = make_kuid(current_user_ns(), who);
- if (!who)
- user = current_user();
- else
- user = find_user(uid);
-
- if (!user)
- break;
-
- do_each_thread(g, p) {
- if (!uid_eq(task_uid(p), user->uid))
- continue;
- tmpio = get_task_ioprio(p);
- if (tmpio < 0)
- continue;
- if (ret == -ESRCH)
- ret = tmpio;
- else
- ret = ioprio_best(ret, tmpio);
- } while_each_thread(g, p);
-
- if (who)
- free_uid(user);
- break;
- default:
- ret = -EINVAL;
- }
-
- rcu_read_unlock();
- return ret;
-}
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index e5d408a7ea4..4556ce1af5b 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -93,7 +93,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
isofs_inode_cachep = kmem_cache_create("isofs_inode_cache",
sizeof(struct iso_inode_info),
@@ -117,6 +117,7 @@ static void destroy_inodecache(void)
static int isofs_remount(struct super_block *sb, int *flags, char *data)
{
+ sync_filesystem(sb);
if (!(*flags & MS_RDONLY))
return -EROFS;
return 0;
@@ -181,7 +182,7 @@ struct iso9660_options{
* Compute the hash for the isofs name corresponding to the dentry.
*/
static int
-isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms)
+isofs_hash_common(struct qstr *qstr, int ms)
{
const char *name;
int len;
@@ -202,7 +203,7 @@ isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms)
* Compute the hash for the isofs name corresponding to the dentry.
*/
static int
-isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms)
+isofs_hashi_common(struct qstr *qstr, int ms)
{
const char *name;
int len;
@@ -259,13 +260,13 @@ static int isofs_dentry_cmp_common(
static int
isofs_hash(const struct dentry *dentry, struct qstr *qstr)
{
- return isofs_hash_common(dentry, qstr, 0);
+ return isofs_hash_common(qstr, 0);
}
static int
isofs_hashi(const struct dentry *dentry, struct qstr *qstr)
{
- return isofs_hashi_common(dentry, qstr, 0);
+ return isofs_hashi_common(qstr, 0);
}
static int
@@ -286,13 +287,13 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry,
static int
isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr)
{
- return isofs_hash_common(dentry, qstr, 1);
+ return isofs_hash_common(qstr, 1);
}
static int
isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr)
{
- return isofs_hashi_common(dentry, qstr, 1);
+ return isofs_hashi_common(qstr, 1);
}
static int
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 2d04f9afafd..06fe11e0abf 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -573,7 +573,7 @@ int log_wait_commit(journal_t *journal, tid_t tid)
#ifdef CONFIG_JBD_DEBUG
spin_lock(&journal->j_state_lock);
if (!tid_geq(journal->j_commit_request, tid)) {
- printk(KERN_EMERG
+ printk(KERN_ERR
"%s: error: j_commit_request=%d, tid=%d\n",
__func__, journal->j_commit_request, tid);
}
@@ -604,10 +604,8 @@ int log_wait_commit(journal_t *journal, tid_t tid)
out_unlock:
spin_unlock(&journal->j_state_lock);
- if (unlikely(is_journal_aborted(journal))) {
- printk(KERN_EMERG "journal commit I/O error\n");
+ if (unlikely(is_journal_aborted(journal)))
err = -EIO;
- }
return err;
}
@@ -2136,7 +2134,7 @@ static void __exit journal_exit(void)
#ifdef CONFIG_JBD_DEBUG
int n = atomic_read(&nr_journal_heads);
if (n)
- printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
+ printk(KERN_ERR "JBD: leaked %d journal_heads!\n", n);
#endif
jbd_remove_debugfs_entry();
journal_destroy_caches();
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index 25c713e7071..8898bbd2b61 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -231,19 +231,15 @@ record_cache_failure:
static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size)
{
- int shift = 0;
- int tmp = hash_size;
+ int i;
struct jbd_revoke_table_s *table;
table = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
if (!table)
goto out;
- while((tmp >>= 1UL) != 0UL)
- shift++;
-
table->hash_size = hash_size;
- table->hash_shift = shift;
+ table->hash_shift = ilog2(hash_size);
table->hash_table =
kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
if (!table->hash_table) {
@@ -252,8 +248,8 @@ static struct jbd_revoke_table_s *journal_init_revoke_table(int hash_size)
goto out;
}
- for (tmp = 0; tmp < hash_size; tmp++)
- INIT_LIST_HEAD(&table->hash_table[tmp]);
+ for (i = 0; i < hash_size; i++)
+ INIT_LIST_HEAD(&table->hash_table[i]);
out:
return table;
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index be0c39b66fe..1695ba8334a 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -26,7 +26,6 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hrtimer.h>
-#include <linux/backing-dev.h>
static void __journal_temp_unlink_buffer(struct journal_head *jh);
@@ -100,10 +99,11 @@ static int start_this_handle(journal_t *journal, handle_t *handle)
alloc_transaction:
if (!journal->j_running_transaction) {
- new_transaction = kzalloc(sizeof(*new_transaction), GFP_NOFS);
+ new_transaction = kzalloc(sizeof(*new_transaction),
+ GFP_NOFS|__GFP_NOFAIL);
if (!new_transaction) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- goto alloc_transaction;
+ ret = -ENOMEM;
+ goto out;
}
}
@@ -675,7 +675,7 @@ repeat:
jbd_alloc(jh2bh(jh)->b_size,
GFP_NOFS);
if (!frozen_buffer) {
- printk(KERN_EMERG
+ printk(KERN_ERR
"%s: OOM for frozen_buffer\n",
__func__);
JBUFFER_TRACE(jh, "oom!");
@@ -898,7 +898,7 @@ repeat:
if (!jh->b_committed_data) {
committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS);
if (!committed_data) {
- printk(KERN_EMERG "%s: No memory for committed data\n",
+ printk(KERN_ERR "%s: No memory for committed data\n",
__func__);
err = -ENOMEM;
goto out;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index cf2fc059406..6fac7434985 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -43,7 +43,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
clear_buffer_uptodate(bh);
if (orig_bh) {
clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&orig_bh->b_state, BH_Shadow);
}
unlock_buffer(bh);
@@ -239,7 +239,7 @@ static int journal_submit_data_buffers(journal_t *journal,
spin_lock(&journal->j_list_lock);
J_ASSERT(jinode->i_transaction == commit_transaction);
clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
spin_unlock(&journal->j_list_lock);
@@ -277,7 +277,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
}
spin_lock(&journal->j_list_lock);
clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
@@ -555,7 +555,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
blk_start_plug(&plug);
jbd2_journal_write_revoke_records(journal, commit_transaction,
&log_bufs, WRITE_SYNC);
- blk_finish_plug(&plug);
jbd_debug(3, "JBD2: commit phase 2b\n");
@@ -582,7 +581,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
err = 0;
bufs = 0;
descriptor = NULL;
- blk_start_plug(&plug);
while (commit_transaction->t_buffers) {
/* Find the next buffer to be journaled... */
@@ -1067,6 +1065,25 @@ restart_loop:
goto restart_loop;
}
+ /* Add the transaction to the checkpoint list
+ * __journal_remove_checkpoint() can not destroy transaction
+ * under us because it is not marked as T_FINISHED yet */
+ if (journal->j_checkpoint_transactions == NULL) {
+ journal->j_checkpoint_transactions = commit_transaction;
+ commit_transaction->t_cpnext = commit_transaction;
+ commit_transaction->t_cpprev = commit_transaction;
+ } else {
+ commit_transaction->t_cpnext =
+ journal->j_checkpoint_transactions;
+ commit_transaction->t_cpprev =
+ commit_transaction->t_cpnext->t_cpprev;
+ commit_transaction->t_cpnext->t_cpprev =
+ commit_transaction;
+ commit_transaction->t_cpprev->t_cpnext =
+ commit_transaction;
+ }
+ spin_unlock(&journal->j_list_lock);
+
/* Done with this transaction! */
jbd_debug(3, "JBD2: commit phase 7\n");
@@ -1085,24 +1102,7 @@ restart_loop:
atomic_read(&commit_transaction->t_handle_count);
trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
commit_transaction->t_tid, &stats.run);
-
- /*
- * Calculate overall stats
- */
- spin_lock(&journal->j_history_lock);
- journal->j_stats.ts_tid++;
- if (commit_transaction->t_requested)
- journal->j_stats.ts_requested++;
- journal->j_stats.run.rs_wait += stats.run.rs_wait;
- journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
- journal->j_stats.run.rs_running += stats.run.rs_running;
- journal->j_stats.run.rs_locked += stats.run.rs_locked;
- journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
- journal->j_stats.run.rs_logging += stats.run.rs_logging;
- journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
- journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
- journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
- spin_unlock(&journal->j_history_lock);
+ stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
commit_transaction->t_state = T_COMMIT_CALLBACK;
J_ASSERT(commit_transaction == journal->j_committing_transaction);
@@ -1122,24 +1122,6 @@ restart_loop:
write_unlock(&journal->j_state_lock);
- if (journal->j_checkpoint_transactions == NULL) {
- journal->j_checkpoint_transactions = commit_transaction;
- commit_transaction->t_cpnext = commit_transaction;
- commit_transaction->t_cpprev = commit_transaction;
- } else {
- commit_transaction->t_cpnext =
- journal->j_checkpoint_transactions;
- commit_transaction->t_cpprev =
- commit_transaction->t_cpnext->t_cpprev;
- commit_transaction->t_cpnext->t_cpprev =
- commit_transaction;
- commit_transaction->t_cpprev->t_cpnext =
- commit_transaction;
- }
- spin_unlock(&journal->j_list_lock);
- /* Drop all spin_locks because commit_callback may be block.
- * __journal_remove_checkpoint() can not destroy transaction
- * under us because it is not marked as T_FINISHED yet */
if (journal->j_commit_callback)
journal->j_commit_callback(journal, commit_transaction);
@@ -1150,7 +1132,7 @@ restart_loop:
write_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
commit_transaction->t_state = T_FINISHED;
- /* Recheck checkpoint lists after j_list_lock was dropped */
+ /* Check if the transaction can be dropped now that we are finished */
if (commit_transaction->t_checkpoint_list == NULL &&
commit_transaction->t_checkpoint_io_list == NULL) {
__jbd2_journal_drop_transaction(journal, commit_transaction);
@@ -1159,4 +1141,21 @@ restart_loop:
spin_unlock(&journal->j_list_lock);
write_unlock(&journal->j_state_lock);
wake_up(&journal->j_wait_done_commit);
+
+ /*
+ * Calculate overall stats
+ */
+ spin_lock(&journal->j_history_lock);
+ journal->j_stats.ts_tid++;
+ journal->j_stats.ts_requested += stats.ts_requested;
+ journal->j_stats.run.rs_wait += stats.run.rs_wait;
+ journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
+ journal->j_stats.run.rs_running += stats.run.rs_running;
+ journal->j_stats.run.rs_locked += stats.run.rs_locked;
+ journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
+ journal->j_stats.run.rs_logging += stats.run.rs_logging;
+ journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
+ journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
+ journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
+ spin_unlock(&journal->j_history_lock);
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 52032647dd4..67b8e303946 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -122,7 +122,7 @@ EXPORT_SYMBOL(__jbd2_debug);
#endif
/* Checksumming functions */
-int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
+static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
{
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return 1;
@@ -143,7 +143,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
return cpu_to_be32(csum);
}
-int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
+static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
{
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return 1;
@@ -151,7 +151,7 @@ int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
return sb->s_checksum == jbd2_superblock_csum(j, sb);
}
-void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
+static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
{
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
return;
@@ -302,8 +302,8 @@ static void journal_kill_thread(journal_t *journal)
journal->j_flags |= JBD2_UNMOUNT;
while (journal->j_task) {
- wake_up(&journal->j_wait_commit);
write_unlock(&journal->j_state_lock);
+ wake_up(&journal->j_wait_commit);
wait_event(journal->j_wait_done_commit, journal->j_task == NULL);
write_lock(&journal->j_state_lock);
}
@@ -702,7 +702,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
read_lock(&journal->j_state_lock);
#ifdef CONFIG_JBD2_DEBUG
if (!tid_geq(journal->j_commit_request, tid)) {
- printk(KERN_EMERG
+ printk(KERN_ERR
"%s: error: j_commit_request=%d, tid=%d\n",
__func__, journal->j_commit_request, tid);
}
@@ -710,18 +710,16 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
while (tid_gt(tid, journal->j_commit_sequence)) {
jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n",
tid, journal->j_commit_sequence);
- wake_up(&journal->j_wait_commit);
read_unlock(&journal->j_state_lock);
+ wake_up(&journal->j_wait_commit);
wait_event(journal->j_wait_done_commit,
!tid_gt(tid, journal->j_commit_sequence));
read_lock(&journal->j_state_lock);
}
read_unlock(&journal->j_state_lock);
- if (unlikely(is_journal_aborted(journal))) {
- printk(KERN_EMERG "journal commit I/O error\n");
+ if (unlikely(is_journal_aborted(journal)))
err = -EIO;
- }
return err;
}
@@ -1527,13 +1525,13 @@ static int journal_get_superblock(journal_t *journal)
if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) &&
JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
/* Can't have checksum v1 and v2 on at the same time! */
- printk(KERN_ERR "JBD: Can't enable checksumming v1 and v2 "
+ printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 "
"at the same time!\n");
goto out;
}
if (!jbd2_verify_csum_type(journal, sb)) {
- printk(KERN_ERR "JBD: Unknown checksum type\n");
+ printk(KERN_ERR "JBD2: Unknown checksum type\n");
goto out;
}
@@ -1541,7 +1539,7 @@ static int journal_get_superblock(journal_t *journal)
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
if (IS_ERR(journal->j_chksum_driver)) {
- printk(KERN_ERR "JBD: Cannot load crc32c driver.\n");
+ printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
err = PTR_ERR(journal->j_chksum_driver);
journal->j_chksum_driver = NULL;
goto out;
@@ -1550,7 +1548,7 @@ static int journal_get_superblock(journal_t *journal)
/* Check superblock checksum */
if (!jbd2_superblock_csum_verify(journal, sb)) {
- printk(KERN_ERR "JBD: journal checksum error\n");
+ printk(KERN_ERR "JBD2: journal checksum error\n");
goto out;
}
@@ -1836,7 +1834,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
journal->j_chksum_driver = crypto_alloc_shash("crc32c",
0, 0);
if (IS_ERR(journal->j_chksum_driver)) {
- printk(KERN_ERR "JBD: Cannot load crc32c "
+ printk(KERN_ERR "JBD2: Cannot load crc32c "
"driver.\n");
journal->j_chksum_driver = NULL;
return 0;
@@ -2645,7 +2643,7 @@ static void __exit journal_exit(void)
#ifdef CONFIG_JBD2_DEBUG
int n = atomic_read(&nr_journal_heads);
if (n)
- printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n);
+ printk(KERN_ERR "JBD2: leaked %d journal_heads!\n", n);
#endif
jbd2_remove_jbd_stats_proc_entry();
jbd2_journal_destroy_caches();
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 3929c50428b..3b6bb19d60b 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -594,7 +594,7 @@ static int do_one_pass(journal_t *journal,
be32_to_cpu(tmp->h_sequence))) {
brelse(obh);
success = -EIO;
- printk(KERN_ERR "JBD: Invalid "
+ printk(KERN_ERR "JBD2: Invalid "
"checksum recovering "
"block %llu in log\n",
blocknr);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 7aa9a32573b..6f0f590cc5a 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -514,11 +514,13 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
* similarly constrained call sites
*/
ret = start_this_handle(journal, handle, GFP_NOFS);
- if (ret < 0)
+ if (ret < 0) {
jbd2_journal_free_reserved(handle);
+ return ret;
+ }
handle->h_type = type;
handle->h_line_no = line_no;
- return ret;
+ return 0;
}
EXPORT_SYMBOL(jbd2_journal_start_reserved);
@@ -932,7 +934,7 @@ repeat:
jbd2_alloc(jh2bh(jh)->b_size,
GFP_NOFS);
if (!frozen_buffer) {
- printk(KERN_EMERG
+ printk(KERN_ERR
"%s: OOM for frozen_buffer\n",
__func__);
JBUFFER_TRACE(jh, "oom!");
@@ -1071,7 +1073,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
* reused here.
*/
jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
jh->b_transaction == NULL ||
(jh->b_transaction == journal->j_committing_transaction &&
@@ -1094,12 +1095,14 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
jh->b_modified = 0;
JBUFFER_TRACE(jh, "file as BJ_Reserved");
+ spin_lock(&journal->j_list_lock);
__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
} else if (jh->b_transaction == journal->j_committing_transaction) {
/* first access by this transaction */
jh->b_modified = 0;
JBUFFER_TRACE(jh, "set next transaction");
+ spin_lock(&journal->j_list_lock);
jh->b_next_transaction = transaction;
}
spin_unlock(&journal->j_list_lock);
@@ -1166,7 +1169,7 @@ repeat:
if (!jh->b_committed_data) {
committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS);
if (!committed_data) {
- printk(KERN_EMERG "%s: No memory for committed data\n",
+ printk(KERN_ERR "%s: No memory for committed data\n",
__func__);
err = -ENOMEM;
goto out;
@@ -1290,7 +1293,10 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
* once a transaction -bzzz
*/
jh->b_modified = 1;
- J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
+ if (handle->h_buffer_credits <= 0) {
+ ret = -ENOSPC;
+ goto out_unlock_bh;
+ }
handle->h_buffer_credits--;
}
@@ -1305,9 +1311,9 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
JBUFFER_TRACE(jh, "fastpath");
if (unlikely(jh->b_transaction !=
journal->j_running_transaction)) {
- printk(KERN_EMERG "JBD: %s: "
+ printk(KERN_ERR "JBD2: %s: "
"jh->b_transaction (%llu, %p, %u) != "
- "journal->j_running_transaction (%p, %u)",
+ "journal->j_running_transaction (%p, %u)\n",
journal->j_devname,
(unsigned long long) bh->b_blocknr,
jh->b_transaction,
@@ -1330,30 +1336,25 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
*/
if (jh->b_transaction != transaction) {
JBUFFER_TRACE(jh, "already on other transaction");
- if (unlikely(jh->b_transaction !=
- journal->j_committing_transaction)) {
- printk(KERN_EMERG "JBD: %s: "
- "jh->b_transaction (%llu, %p, %u) != "
- "journal->j_committing_transaction (%p, %u)",
+ if (unlikely(((jh->b_transaction !=
+ journal->j_committing_transaction)) ||
+ (jh->b_next_transaction != transaction))) {
+ printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: "
+ "bad jh for block %llu: "
+ "transaction (%p, %u), "
+ "jh->b_transaction (%p, %u), "
+ "jh->b_next_transaction (%p, %u), jlist %u\n",
journal->j_devname,
(unsigned long long) bh->b_blocknr,
+ transaction, transaction->t_tid,
jh->b_transaction,
- jh->b_transaction ? jh->b_transaction->t_tid : 0,
- journal->j_committing_transaction,
- journal->j_committing_transaction ?
- journal->j_committing_transaction->t_tid : 0);
- ret = -EINVAL;
- }
- if (unlikely(jh->b_next_transaction != transaction)) {
- printk(KERN_EMERG "JBD: %s: "
- "jh->b_next_transaction (%llu, %p, %u) != "
- "transaction (%p, %u)",
- journal->j_devname,
- (unsigned long long) bh->b_blocknr,
+ jh->b_transaction ?
+ jh->b_transaction->t_tid : 0,
jh->b_next_transaction,
jh->b_next_transaction ?
jh->b_next_transaction->t_tid : 0,
- transaction, transaction->t_tid);
+ jh->b_jlist);
+ WARN_ON(1);
ret = -EINVAL;
}
/* And this case is illegal: we can't reuse another
@@ -1373,7 +1374,6 @@ out_unlock_bh:
jbd2_journal_put_journal_head(jh);
out:
JBUFFER_TRACE(jh, "exit");
- WARN_ON(ret); /* All errors are bugs, so dump the stack */
return ret;
}
@@ -1411,7 +1411,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
BUFFER_TRACE(bh, "entry");
jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
if (!buffer_jbd(bh))
goto not_jbd;
@@ -1464,6 +1463,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
* we know to remove the checkpoint after we commit.
*/
+ spin_lock(&journal->j_list_lock);
if (jh->b_cp_transaction) {
__jbd2_journal_temp_unlink_buffer(jh);
__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
@@ -1476,6 +1476,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
goto drop;
}
}
+ spin_unlock(&journal->j_list_lock);
} else if (jh->b_transaction) {
J_ASSERT_JH(jh, (jh->b_transaction ==
journal->j_committing_transaction));
@@ -1487,7 +1488,9 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
if (jh->b_next_transaction) {
J_ASSERT(jh->b_next_transaction == transaction);
+ spin_lock(&journal->j_list_lock);
jh->b_next_transaction = NULL;
+ spin_unlock(&journal->j_list_lock);
/*
* only drop a reference if this transaction modified
@@ -1499,7 +1502,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
}
not_jbd:
- spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
__brelse(bh);
drop:
@@ -1586,9 +1588,12 @@ int jbd2_journal_stop(handle_t *handle)
* to perform a synchronous write. We do this to detect the
* case where a single process is doing a stream of sync
* writes. No point in waiting for joiners in that case.
+ *
+ * Setting max_batch_time to 0 disables this completely.
*/
pid = current->pid;
- if (handle->h_sync && journal->j_last_sync_writer != pid) {
+ if (handle->h_sync && journal->j_last_sync_writer != pid &&
+ journal->j_max_batch_time) {
u64 commit_time, trans_time;
journal->j_last_sync_writer = pid;
@@ -1817,11 +1822,11 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
if (buffer_locked(bh) || buffer_dirty(bh))
goto out;
- if (jh->b_next_transaction != NULL)
+ if (jh->b_next_transaction != NULL || jh->b_transaction != NULL)
goto out;
spin_lock(&journal->j_list_lock);
- if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
+ if (jh->b_cp_transaction != NULL) {
/* written-back checkpointed metadata buffer */
JBUFFER_TRACE(jh, "remove from checkpoint list");
__jbd2_journal_remove_checkpoint(jh);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 223283c3011..009ec0b5993 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -178,10 +178,6 @@ struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
char *value = NULL;
int rc, xprefix;
- acl = get_cached_acl(inode, type);
- if (acl != ACL_NOT_CACHED)
- return acl;
-
switch (type) {
case ACL_TYPE_ACCESS:
xprefix = JFFS2_XPREFIX_ACL_ACCESS;
@@ -232,13 +228,10 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a
return rc;
}
-static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
int rc, xprefix;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
switch (type) {
case ACL_TYPE_ACCESS:
xprefix = JFFS2_XPREFIX_ACL_ACCESS;
@@ -277,30 +270,21 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, umode_t *i_mode)
{
- struct posix_acl *acl;
+ struct posix_acl *default_acl, *acl;
int rc;
cache_no_acl(inode);
- if (S_ISLNK(*i_mode))
- return 0; /* Symlink always has no-ACL */
-
- acl = jffs2_get_acl(dir_i, ACL_TYPE_DEFAULT);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
-
- if (!acl) {
- *i_mode &= ~current_umask();
- } else {
- if (S_ISDIR(*i_mode))
- set_cached_acl(inode, ACL_TYPE_DEFAULT, acl);
-
- rc = posix_acl_create(&acl, GFP_KERNEL, i_mode);
- if (rc < 0)
- return rc;
- if (rc > 0)
- set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
+ rc = posix_acl_create(dir_i, i_mode, &default_acl, &acl);
+ if (rc)
+ return rc;
+ if (default_acl) {
+ set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl);
+ posix_acl_release(default_acl);
+ }
+ if (acl) {
+ set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
posix_acl_release(acl);
}
return 0;
@@ -324,106 +308,3 @@ int jffs2_init_acl_post(struct inode *inode)
return 0;
}
-
-int jffs2_acl_chmod(struct inode *inode)
-{
- struct posix_acl *acl;
- int rc;
-
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
- acl = jffs2_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
- rc = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
- if (rc)
- return rc;
- rc = jffs2_set_acl(inode, ACL_TYPE_ACCESS, acl);
- posix_acl_release(acl);
- return rc;
-}
-
-static size_t jffs2_acl_access_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- const int retlen = sizeof(POSIX_ACL_XATTR_ACCESS);
-
- if (list && retlen <= list_size)
- strcpy(list, POSIX_ACL_XATTR_ACCESS);
- return retlen;
-}
-
-static size_t jffs2_acl_default_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- const int retlen = sizeof(POSIX_ACL_XATTR_DEFAULT);
-
- if (list && retlen <= list_size)
- strcpy(list, POSIX_ACL_XATTR_DEFAULT);
- return retlen;
-}
-
-static int jffs2_acl_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- struct posix_acl *acl;
- int rc;
-
- if (name[0] != '\0')
- return -EINVAL;
-
- acl = jffs2_get_acl(dentry->d_inode, type);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (!acl)
- return -ENODATA;
- rc = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- posix_acl_release(acl);
-
- return rc;
-}
-
-static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
-{
- struct posix_acl *acl;
- int rc;
-
- if (name[0] != '\0')
- return -EINVAL;
- if (!inode_owner_or_capable(dentry->d_inode))
- return -EPERM;
-
- if (value) {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl) {
- rc = posix_acl_valid(acl);
- if (rc)
- goto out;
- }
- } else {
- acl = NULL;
- }
- rc = jffs2_set_acl(dentry->d_inode, type, acl);
- out:
- posix_acl_release(acl);
- return rc;
-}
-
-const struct xattr_handler jffs2_acl_access_xattr_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
- .flags = ACL_TYPE_DEFAULT,
- .list = jffs2_acl_access_listxattr,
- .get = jffs2_acl_getxattr,
- .set = jffs2_acl_setxattr,
-};
-
-const struct xattr_handler jffs2_acl_default_xattr_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .list = jffs2_acl_default_listxattr,
- .get = jffs2_acl_getxattr,
- .set = jffs2_acl_setxattr,
-};
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index 9b477246f2a..2e2b5745c3b 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -27,17 +27,14 @@ struct jffs2_acl_header {
#ifdef CONFIG_JFFS2_FS_POSIX_ACL
struct posix_acl *jffs2_get_acl(struct inode *inode, int type);
-extern int jffs2_acl_chmod(struct inode *);
+int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type);
extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *);
extern int jffs2_init_acl_post(struct inode *);
-extern const struct xattr_handler jffs2_acl_access_xattr_handler;
-extern const struct xattr_handler jffs2_acl_default_xattr_handler;
-
#else
#define jffs2_get_acl (NULL)
-#define jffs2_acl_chmod(inode) (0)
+#define jffs2_set_acl (NULL)
#define jffs2_init_acl_pre(dir_i,inode,mode) (0)
#define jffs2_init_acl_post(inode) (0)
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 2b60ce1996a..bb9cebc9ca8 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -75,10 +75,13 @@ void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c)
static int jffs2_garbage_collect_thread(void *_c)
{
struct jffs2_sb_info *c = _c;
+ sigset_t hupmask;
+ siginitset(&hupmask, sigmask(SIGHUP));
allow_signal(SIGKILL);
allow_signal(SIGSTOP);
allow_signal(SIGCONT);
+ allow_signal(SIGHUP);
c->gc_task = current;
complete(&c->gc_thread_start);
@@ -87,7 +90,7 @@ static int jffs2_garbage_collect_thread(void *_c)
set_freezable();
for (;;) {
- allow_signal(SIGHUP);
+ sigprocmask(SIG_UNBLOCK, &hupmask, NULL);
again:
spin_lock(&c->erase_completion_lock);
if (!jffs2_thread_should_wake(c)) {
@@ -95,10 +98,9 @@ static int jffs2_garbage_collect_thread(void *_c)
spin_unlock(&c->erase_completion_lock);
jffs2_dbg(1, "%s(): sleeping...\n", __func__);
schedule();
- } else
+ } else {
spin_unlock(&c->erase_completion_lock);
-
-
+ }
/* Problem - immediately after bootup, the GCD spends a lot
* of time in places like jffs2_kill_fragtree(); so much so
* that userspace processes (like gdm and X) are starved
@@ -150,7 +152,7 @@ static int jffs2_garbage_collect_thread(void *_c)
}
}
/* We don't want SIGHUP to interrupt us. STOP and KILL are OK though. */
- disallow_signal(SIGHUP);
+ sigprocmask(SIG_BLOCK, &hupmask, NULL);
jffs2_dbg(1, "%s(): pass\n", __func__);
if (jffs2_garbage_collect_pass(c) == -ENOSPC) {
diff --git a/fs/jffs2/compr_rtime.c b/fs/jffs2/compr_rtime.c
index 16a5047903a..406d9cc84ba 100644
--- a/fs/jffs2/compr_rtime.c
+++ b/fs/jffs2/compr_rtime.c
@@ -33,7 +33,7 @@ static int jffs2_rtime_compress(unsigned char *data_in,
unsigned char *cpage_out,
uint32_t *sourcelen, uint32_t *dstlen)
{
- short positions[256];
+ unsigned short positions[256];
int outpos = 0;
int pos=0;
@@ -74,7 +74,7 @@ static int jffs2_rtime_decompress(unsigned char *data_in,
unsigned char *cpage_out,
uint32_t srclen, uint32_t destlen)
{
- short positions[256];
+ unsigned short positions[256];
int outpos = 0;
int pos=0;
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index e3aac222472..938556025d6 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -59,6 +59,7 @@ const struct inode_operations jffs2_dir_inode_operations =
.mknod = jffs2_mknod,
.rename = jffs2_rename,
.get_acl = jffs2_get_acl,
+ .set_acl = jffs2_set_acl,
.setattr = jffs2_setattr,
.setxattr = jffs2_setxattr,
.getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 1506673c087..64989ca9ba9 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -51,10 +51,10 @@ const struct file_operations jffs2_file_operations =
{
.llseek = generic_file_llseek,
.open = generic_file_open,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.unlocked_ioctl=jffs2_ioctl,
.mmap = generic_file_readonly_mmap,
.fsync = jffs2_fsync,
@@ -66,6 +66,7 @@ const struct file_operations jffs2_file_operations =
const struct inode_operations jffs2_file_inode_operations =
{
.get_acl = jffs2_get_acl,
+ .set_acl = jffs2_set_acl,
.setattr = jffs2_setattr,
.setxattr = jffs2_setxattr,
.getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index fe3c0527545..601afd1afdd 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -190,15 +190,16 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
int jffs2_setattr(struct dentry *dentry, struct iattr *iattr)
{
+ struct inode *inode = dentry->d_inode;
int rc;
- rc = inode_change_ok(dentry->d_inode, iattr);
+ rc = inode_change_ok(inode, iattr);
if (rc)
return rc;
- rc = jffs2_do_setattr(dentry->d_inode, iattr);
+ rc = jffs2_do_setattr(inode, iattr);
if (!rc && (iattr->ia_valid & ATTR_MODE))
- rc = jffs2_acl_chmod(dentry->d_inode);
+ rc = posix_acl_chmod(inode, inode->i_mode);
return rc;
}
@@ -241,7 +242,7 @@ void jffs2_evict_inode (struct inode *inode)
jffs2_dbg(1, "%s(): ino #%lu mode %o\n",
__func__, inode->i_ino, inode->i_mode);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
jffs2_do_clear_inode(c, f);
}
@@ -456,12 +457,14 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
The umask is only applied if there's no default ACL */
ret = jffs2_init_acl_pre(dir_i, inode, &mode);
if (ret) {
- make_bad_inode(inode);
- iput(inode);
- return ERR_PTR(ret);
+ mutex_unlock(&f->sem);
+ make_bad_inode(inode);
+ iput(inode);
+ return ERR_PTR(ret);
}
ret = jffs2_do_new_inode (c, f, mode, ri);
if (ret) {
+ mutex_unlock(&f->sem);
make_bad_inode(inode);
iput(inode);
return ERR_PTR(ret);
@@ -478,6 +481,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r
inode->i_size = 0;
if (insert_inode_locked(inode) < 0) {
+ mutex_unlock(&f->sem);
make_bad_inode(inode);
iput(inode);
return ERR_PTR(-EINVAL);
@@ -515,6 +519,10 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
c = JFFS2_SB_INFO(sb);
+ /* Do not support the MLC nand */
+ if (c->mtd->type == MTD_MLCNANDFLASH)
+ return -EINVAL;
+
#ifndef CONFIG_JFFS2_FS_WRITEBUFFER
if (c->mtd->type == MTD_NANDFLASH) {
pr_err("Cannot operate on NAND flash unless jffs2 NAND support is compiled in\n");
@@ -682,7 +690,7 @@ unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c,
struct inode *inode = OFNI_EDONI_2SFFJ(f);
struct page *pg;
- pg = read_cache_page_async(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
+ pg = read_cache_page(inode->i_mapping, offset >> PAGE_CACHE_SHIFT,
(void *)jffs2_do_readpage_unlock, inode);
if (IS_ERR(pg))
return (void *)pg;
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index 4f47aa24b55..b8fd651307a 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -288,6 +288,8 @@ struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void)
struct jffs2_xattr_datum *xd;
xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL);
dbg_memalloc("%p\n", xd);
+ if (!xd)
+ return NULL;
xd->class = RAWNODE_CLASS_XATTR_DATUM;
xd->node = (void *)xd;
@@ -306,6 +308,8 @@ struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
struct jffs2_xattr_ref *ref;
ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL);
dbg_memalloc("%p\n", ref);
+ if (!ref)
+ return NULL;
ref->class = RAWNODE_CLASS_XATTR_REF;
ref->node = (void *)ref;
diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c
index 975a1f562c1..9a5449bc3af 100644
--- a/fs/jffs2/nodelist.c
+++ b/fs/jffs2/nodelist.c
@@ -564,25 +564,10 @@ struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_
they're killed. */
void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c)
{
- struct jffs2_node_frag *frag;
- struct jffs2_node_frag *parent;
-
- if (!root->rb_node)
- return;
+ struct jffs2_node_frag *frag, *next;
dbg_fragtree("killing\n");
-
- frag = (rb_entry(root->rb_node, struct jffs2_node_frag, rb));
- while(frag) {
- if (frag->rb.rb_left) {
- frag = frag_left(frag);
- continue;
- }
- if (frag->rb.rb_right) {
- frag = frag_right(frag);
- continue;
- }
-
+ rbtree_postorder_for_each_entry_safe(frag, next, root, rb) {
if (frag->node && !(--frag->node->frags)) {
/* Not a hole, and it's the final remaining frag
of this node. Free the node */
@@ -591,17 +576,8 @@ void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c)
jffs2_free_full_dnode(frag->node);
}
- parent = frag_parent(frag);
- if (parent) {
- if (frag_left(parent) == frag)
- parent->rb.rb_left = NULL;
- else
- parent->rb.rb_right = NULL;
- }
jffs2_free_node_frag(frag);
- frag = parent;
-
cond_resched();
}
}
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index e4619b00f7c..fa35ff79ab3 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -231,7 +231,7 @@ struct jffs2_tmp_dnode_info
uint32_t version;
uint32_t data_crc;
uint32_t partial_crc;
- uint16_t csize;
+ uint32_t csize;
uint16_t overlapped;
};
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 03310721712..b6bd4affd9a 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -179,6 +179,7 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize,
spin_unlock(&c->erase_completion_lock);
schedule();
+ remove_wait_queue(&c->erase_wait, &wait);
} else
spin_unlock(&c->erase_completion_lock);
} else if (ret)
@@ -211,20 +212,25 @@ out:
int jffs2_reserve_space_gc(struct jffs2_sb_info *c, uint32_t minsize,
uint32_t *len, uint32_t sumsize)
{
- int ret = -EAGAIN;
+ int ret;
minsize = PAD(minsize);
jffs2_dbg(1, "%s(): Requested 0x%x bytes\n", __func__, minsize);
- spin_lock(&c->erase_completion_lock);
- while(ret == -EAGAIN) {
+ while (true) {
+ spin_lock(&c->erase_completion_lock);
ret = jffs2_do_reserve_space(c, minsize, len, sumsize);
if (ret) {
jffs2_dbg(1, "%s(): looping, ret is %d\n",
__func__, ret);
}
+ spin_unlock(&c->erase_completion_lock);
+
+ if (ret == -EAGAIN)
+ cond_resched();
+ else
+ break;
}
- spin_unlock(&c->erase_completion_lock);
if (!ret)
ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1);
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index ae81b01e6fd..386303dca38 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -543,33 +543,13 @@ static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c,
static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
{
- struct rb_node *this;
- struct jffs2_tmp_dnode_info *tn;
-
- this = list->rb_node;
+ struct jffs2_tmp_dnode_info *tn, *next;
- /* Now at bottom of tree */
- while (this) {
- if (this->rb_left)
- this = this->rb_left;
- else if (this->rb_right)
- this = this->rb_right;
- else {
- tn = rb_entry(this, struct jffs2_tmp_dnode_info, rb);
+ rbtree_postorder_for_each_entry_safe(tn, next, list, rb) {
jffs2_free_full_dnode(tn->fn);
jffs2_free_tmp_dnode_info(tn);
-
- this = rb_parent(this);
- if (!this)
- break;
-
- if (this->rb_left == &tn->rb)
- this->rb_left = NULL;
- else if (this->rb_right == &tn->rb)
- this->rb_right = NULL;
- else BUG();
- }
}
+
*list = RB_ROOT;
}
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 0defb1cc2a3..0918f0e2e26 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -243,6 +243,7 @@ static int jffs2_remount_fs(struct super_block *sb, int *flags, char *data)
struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
int err;
+ sync_filesystem(sb);
err = jffs2_parse_options(c, data);
if (err)
return -EINVAL;
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 6e563332bb2..c7c77b0dfcc 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -22,7 +22,6 @@ const struct inode_operations jffs2_symlink_inode_operations =
{
.readlink = generic_readlink,
.follow_link = jffs2_follow_link,
- .get_acl = jffs2_get_acl,
.setattr = jffs2_setattr,
.setxattr = jffs2_setxattr,
.getxattr = jffs2_getxattr,
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 3034e970eb9..ad0f2e2a170 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -22,6 +22,7 @@
#include <linux/crc32.h>
#include <linux/jffs2.h>
#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
#include <linux/mtd/mtd.h>
#include "nodelist.h"
/* -------- xdatum related functions ----------------
@@ -921,8 +922,8 @@ const struct xattr_handler *jffs2_xattr_handlers[] = {
&jffs2_security_xattr_handler,
#endif
#ifdef CONFIG_JFFS2_FS_POSIX_ACL
- &jffs2_acl_access_xattr_handler,
- &jffs2_acl_default_xattr_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
&jffs2_trusted_xattr_handler,
NULL
@@ -942,10 +943,10 @@ static const struct xattr_handler *xprefix_to_handler(int xprefix) {
#endif
#ifdef CONFIG_JFFS2_FS_POSIX_ACL
case JFFS2_XPREFIX_ACL_ACCESS:
- ret = &jffs2_acl_access_xattr_handler;
+ ret = &posix_acl_access_xattr_handler;
break;
case JFFS2_XPREFIX_ACL_DEFAULT:
- ret = &jffs2_acl_default_xattr_handler;
+ ret = &posix_acl_default_xattr_handler;
break;
#endif
case JFFS2_XPREFIX_TRUSTED:
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index d254d6d3599..0c8ca830b11 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -72,7 +72,7 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type)
return acl;
}
-static int jfs_set_acl(tid_t tid, struct inode *inode, int type,
+static int __jfs_set_acl(tid_t tid, struct inode *inode, int type,
struct posix_acl *acl)
{
char *ea_name;
@@ -80,21 +80,26 @@ static int jfs_set_acl(tid_t tid, struct inode *inode, int type,
int size = 0;
char *value = NULL;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
- switch(type) {
- case ACL_TYPE_ACCESS:
- ea_name = POSIX_ACL_XATTR_ACCESS;
- break;
- case ACL_TYPE_DEFAULT:
- ea_name = POSIX_ACL_XATTR_DEFAULT;
- if (!S_ISDIR(inode->i_mode))
- return acl ? -EACCES : 0;
- break;
- default:
- return -EINVAL;
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ ea_name = POSIX_ACL_XATTR_ACCESS;
+ if (acl) {
+ rc = posix_acl_equiv_mode(acl, &inode->i_mode);
+ if (rc < 0)
+ return rc;
+ inode->i_ctime = CURRENT_TIME;
+ mark_inode_dirty(inode);
+ if (rc == 0)
+ acl = NULL;
+ }
+ break;
+ case ACL_TYPE_DEFAULT:
+ ea_name = POSIX_ACL_XATTR_DEFAULT;
+ break;
+ default:
+ return -EINVAL;
}
+
if (acl) {
size = posix_acl_xattr_size(acl->a_count);
value = kmalloc(size, GFP_KERNEL);
@@ -114,65 +119,43 @@ out:
return rc;
}
+int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+ int rc;
+ tid_t tid;
+
+ tid = txBegin(inode->i_sb, 0);
+ mutex_lock(&JFS_IP(inode)->commit_mutex);
+ rc = __jfs_set_acl(tid, inode, type, acl);
+ if (!rc)
+ rc = txCommit(tid, 1, &inode, 0);
+ txEnd(tid);
+ mutex_unlock(&JFS_IP(inode)->commit_mutex);
+ return rc;
+}
+
int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
{
- struct posix_acl *acl = NULL;
+ struct posix_acl *default_acl, *acl;
int rc = 0;
- if (S_ISLNK(inode->i_mode))
- return 0;
+ rc = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (rc)
+ return rc;
- acl = jfs_get_acl(dir, ACL_TYPE_DEFAULT);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
+ if (default_acl) {
+ rc = __jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, default_acl);
+ posix_acl_release(default_acl);
+ }
if (acl) {
- if (S_ISDIR(inode->i_mode)) {
- rc = jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, acl);
- if (rc)
- goto cleanup;
- }
- rc = posix_acl_create(&acl, GFP_KERNEL, &inode->i_mode);
- if (rc < 0)
- goto cleanup; /* posix_acl_release(NULL) is no-op */
- if (rc > 0)
- rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl);
-cleanup:
+ if (!rc)
+ rc = __jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl);
posix_acl_release(acl);
- } else
- inode->i_mode &= ~current_umask();
+ }
JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) |
inode->i_mode;
return rc;
}
-
-int jfs_acl_chmod(struct inode *inode)
-{
- struct posix_acl *acl;
- int rc;
- tid_t tid;
-
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
- acl = jfs_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
-
- rc = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
- if (rc)
- return rc;
-
- tid = txBegin(inode->i_sb, 0);
- mutex_lock(&JFS_IP(inode)->commit_mutex);
- rc = jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl);
- if (!rc)
- rc = txCommit(tid, 1, &inode, 0);
- txEnd(tid);
- mutex_unlock(&JFS_IP(inode)->commit_mutex);
-
- posix_acl_release(acl);
- return rc;
-}
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index dd7442c5835..33aa0cc1f8b 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -19,6 +19,7 @@
#include <linux/mm.h>
#include <linux/fs.h>
+#include <linux/posix_acl.h>
#include <linux/quotaops.h>
#include "jfs_incore.h"
#include "jfs_inode.h"
@@ -131,7 +132,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
mark_inode_dirty(inode);
if (iattr->ia_valid & ATTR_MODE)
- rc = jfs_acl_chmod(inode);
+ rc = posix_acl_chmod(inode, inode->i_mode);
return rc;
}
@@ -143,19 +144,20 @@ const struct inode_operations jfs_file_inode_operations = {
.setattr = jfs_setattr,
#ifdef CONFIG_JFS_POSIX_ACL
.get_acl = jfs_get_acl,
+ .set_acl = jfs_set_acl,
#endif
};
const struct file_operations jfs_file_operations = {
.open = jfs_open,
.llseek = generic_file_llseek,
- .write = do_sync_write,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .write = new_sync_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
.fsync = jfs_fsync,
.release = jfs_release,
.unlocked_ioctl = jfs_ioctl,
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index f4aab719add..bd3df1ca3c9 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -154,7 +154,7 @@ void jfs_evict_inode(struct inode *inode)
dquot_initialize(inode);
if (JFS_IP(inode)->fileset == FILESYSTEM_I) {
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
if (test_cflag(COMMIT_Freewmap, inode))
jfs_free_zero_link(inode);
@@ -168,7 +168,7 @@ void jfs_evict_inode(struct inode *inode)
dquot_free_inode(inode);
}
} else {
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
}
clear_inode(inode);
dquot_drop(inode);
@@ -331,15 +331,15 @@ static sector_t jfs_bmap(struct address_space *mapping, sector_t block)
}
static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = file->f_mapping->host;
+ size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
- jfs_get_block);
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, jfs_get_block);
/*
* In case of error extending write may have instantiated a few
@@ -347,7 +347,7 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb,
*/
if (unlikely((rw & WRITE) && ret < 0)) {
loff_t isize = i_size_read(inode);
- loff_t end = offset + iov_length(iov, nr_segs);
+ loff_t end = offset + count;
if (end > isize)
jfs_write_failed(mapping, end);
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index ad84fe50ca9..489f993b7b1 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -21,8 +21,8 @@
#ifdef CONFIG_JFS_POSIX_ACL
struct posix_acl *jfs_get_acl(struct inode *inode, int type);
+int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
int jfs_init_acl(tid_t, struct inode *, struct inode *);
-int jfs_acl_chmod(struct inode *inode);
#else
@@ -32,10 +32,5 @@ static inline int jfs_init_acl(tid_t tid, struct inode *inode,
return 0;
}
-static inline int jfs_acl_chmod(struct inode *inode)
-{
- return 0;
-}
-
#endif
#endif /* _H_JFS_ACL */
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 370d7b6c594..2d514c7affc 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -1208,7 +1208,7 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
* by this leaf.
*/
l2size =
- min((int)leaf[word], NLSTOL2BSZ(nwords));
+ min_t(int, leaf[word], NLSTOL2BSZ(nwords));
/* determine how many words were handled.
*/
@@ -1902,7 +1902,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
/* determine how many blocks to allocate from this dmap.
*/
- nb = min(n, (s64)BPERDMAP);
+ nb = min_t(s64, n, BPERDMAP);
/* allocate the blocks from the dmap.
*/
@@ -2260,7 +2260,8 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
* of bits being allocated and the l2 number
* of bits currently described by this leaf.
*/
- size = min((int)leaf[word], NLSTOL2BSZ(nwords));
+ size = min_t(int, leaf[word],
+ NLSTOL2BSZ(nwords));
/* update the leaf to reflect the allocation.
* in addition to setting the leaf value to
@@ -3563,7 +3564,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
if (mp == NULL)
goto errout;
- n = min(nblocks, (s64)BPERDMAP);
+ n = min_t(s64, nblocks, BPERDMAP);
}
dp = (struct dmap *) mp->data;
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index c1a3e603279..6b0f816201a 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -29,20 +29,20 @@
void jfs_set_inode_flags(struct inode *inode)
{
unsigned int flags = JFS_IP(inode)->mode2;
-
- inode->i_flags &= ~(S_IMMUTABLE | S_APPEND |
- S_NOATIME | S_DIRSYNC | S_SYNC);
+ unsigned int new_fl = 0;
if (flags & JFS_IMMUTABLE_FL)
- inode->i_flags |= S_IMMUTABLE;
+ new_fl |= S_IMMUTABLE;
if (flags & JFS_APPEND_FL)
- inode->i_flags |= S_APPEND;
+ new_fl |= S_APPEND;
if (flags & JFS_NOATIME_FL)
- inode->i_flags |= S_NOATIME;
+ new_fl |= S_NOATIME;
if (flags & JFS_DIRSYNC_FL)
- inode->i_flags |= S_DIRSYNC;
+ new_fl |= S_DIRSYNC;
if (flags & JFS_SYNC_FL)
- inode->i_flags |= S_SYNC;
+ new_fl |= S_SYNC;
+ inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND | S_NOATIME |
+ S_DIRSYNC | S_SYNC);
}
void jfs_get_inode_flags(struct jfs_inode_info *jfs_ip)
@@ -95,7 +95,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
if (insert_inode_locked(inode) < 0) {
rc = -EINVAL;
- goto fail_unlock;
+ goto fail_put;
}
inode_init_owner(inode, parent, mode);
@@ -156,7 +156,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
fail_drop:
dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
-fail_unlock:
clear_nlink(inode);
unlock_new_inode(inode);
fail_put:
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 360d27c4888..0acddf60af5 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -167,7 +167,7 @@ do { \
* Global list of active external journals
*/
static LIST_HEAD(jfs_external_logs);
-static struct jfs_log *dummy_log = NULL;
+static struct jfs_log *dummy_log;
static DEFINE_MUTEX(jfs_log_mutex);
/*
@@ -1998,20 +1998,20 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
bio = bio_alloc(GFP_NOFS, 1);
- bio->bi_sector = bp->l_blkno << (log->l2bsize - 9);
+ bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
bio->bi_bdev = log->bdev;
bio->bi_io_vec[0].bv_page = bp->l_page;
bio->bi_io_vec[0].bv_len = LOGPSIZE;
bio->bi_io_vec[0].bv_offset = bp->l_offset;
bio->bi_vcnt = 1;
- bio->bi_size = LOGPSIZE;
+ bio->bi_iter.bi_size = LOGPSIZE;
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
/*check if journaling to disk has been disabled*/
if (log->no_integrity) {
- bio->bi_size = 0;
+ bio->bi_iter.bi_size = 0;
lbmIODone(bio, 0);
} else {
submit_bio(READ_SYNC, bio);
@@ -2144,21 +2144,21 @@ static void lbmStartIO(struct lbuf * bp)
jfs_info("lbmStartIO\n");
bio = bio_alloc(GFP_NOFS, 1);
- bio->bi_sector = bp->l_blkno << (log->l2bsize - 9);
+ bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
bio->bi_bdev = log->bdev;
bio->bi_io_vec[0].bv_page = bp->l_page;
bio->bi_io_vec[0].bv_len = LOGPSIZE;
bio->bi_io_vec[0].bv_offset = bp->l_offset;
bio->bi_vcnt = 1;
- bio->bi_size = LOGPSIZE;
+ bio->bi_iter.bi_size = LOGPSIZE;
bio->bi_end_io = lbmIODone;
bio->bi_private = bp;
/* check if journaling to disk has been disabled */
if (log->no_integrity) {
- bio->bi_size = 0;
+ bio->bi_iter.bi_size = 0;
lbmIODone(bio, 0);
} else {
submit_bio(WRITE_SYNC, bio);
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index d165cde0c68..49ba7ff1bbb 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -416,7 +416,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
* count from hitting zero before we're through
*/
inc_io(page);
- if (!bio->bi_size)
+ if (!bio->bi_iter.bi_size)
goto dump_bio;
submit_bio(WRITE, bio);
nr_underway++;
@@ -438,7 +438,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
bio = bio_alloc(GFP_NOFS, 1);
bio->bi_bdev = inode->i_sb->s_bdev;
- bio->bi_sector = pblock << (inode->i_blkbits - 9);
+ bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9);
bio->bi_end_io = metapage_write_end_io;
bio->bi_private = page;
@@ -452,7 +452,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc)
if (bio) {
if (bio_add_page(bio, page, bio_bytes, bio_offset) < bio_bytes)
goto add_failed;
- if (!bio->bi_size)
+ if (!bio->bi_iter.bi_size)
goto dump_bio;
submit_bio(WRITE, bio);
@@ -517,7 +517,8 @@ static int metapage_readpage(struct file *fp, struct page *page)
bio = bio_alloc(GFP_NOFS, 1);
bio->bi_bdev = inode->i_sb->s_bdev;
- bio->bi_sector = pblock << (inode->i_blkbits - 9);
+ bio->bi_iter.bi_sector =
+ pblock << (inode->i_blkbits - 9);
bio->bi_end_io = metapage_read_end_io;
bio->bi_private = page;
len = xlen << inode->i_blkbits;
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index e9e100fd7c0..e8d717dabca 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -61,6 +61,8 @@ extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t);
extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
extern int jfs_removexattr(struct dentry *, const char *);
+extern const struct xattr_handler *jfs_xattr_handlers[];
+
#ifdef CONFIG_JFS_SECURITY
extern int jfs_init_security(tid_t, struct inode *, struct inode *,
const struct qstr *);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index aa8a3370631..d59c7defb1e 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1524,6 +1524,7 @@ const struct inode_operations jfs_dir_inode_operations = {
.setattr = jfs_setattr,
#ifdef CONFIG_JFS_POSIX_ACL
.get_acl = jfs_get_acl,
+ .set_acl = jfs_set_acl,
#endif
};
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 6669aa2042c..adf8cb045b9 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -44,19 +44,20 @@
#include "jfs_imap.h"
#include "jfs_acl.h"
#include "jfs_debug.h"
+#include "jfs_xattr.h"
MODULE_DESCRIPTION("The Journaled Filesystem (JFS)");
MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM");
MODULE_LICENSE("GPL");
-static struct kmem_cache * jfs_inode_cachep;
+static struct kmem_cache *jfs_inode_cachep;
static const struct super_operations jfs_super_operations;
static const struct export_operations jfs_export_operations;
static struct file_system_type jfs_fs_type;
#define MAX_COMMIT_THREADS 64
-static int commit_threads = 0;
+static int commit_threads;
module_param(commit_threads, int, 0);
MODULE_PARM_DESC(commit_threads, "Number of commit threads");
@@ -83,8 +84,7 @@ static void jfs_handle_error(struct super_block *sb)
panic("JFS (device %s): panic forced after error\n",
sb->s_id);
else if (sbi->flag & JFS_ERR_REMOUNT_RO) {
- jfs_err("ERROR: (device %s): remounting filesystem "
- "as read-only\n",
+ jfs_err("ERROR: (device %s): remounting filesystem as read-only\n",
sb->s_id);
sb->s_flags |= MS_RDONLY;
}
@@ -272,7 +272,10 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
case Opt_resize:
{
char *resize = args[0].from;
- *newLVSize = simple_strtoull(resize, &resize, 0);
+ int rc = kstrtoll(resize, 0, newLVSize);
+
+ if (rc)
+ goto cleanup;
break;
}
case Opt_resize_nosize:
@@ -326,7 +329,11 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
case Opt_uid:
{
char *uid = args[0].from;
- uid_t val = simple_strtoul(uid, &uid, 0);
+ uid_t val;
+ int rc = kstrtouint(uid, 0, &val);
+
+ if (rc)
+ goto cleanup;
sbi->uid = make_kuid(current_user_ns(), val);
if (!uid_valid(sbi->uid))
goto cleanup;
@@ -336,7 +343,11 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
case Opt_gid:
{
char *gid = args[0].from;
- gid_t val = simple_strtoul(gid, &gid, 0);
+ gid_t val;
+ int rc = kstrtouint(gid, 0, &val);
+
+ if (rc)
+ goto cleanup;
sbi->gid = make_kgid(current_user_ns(), val);
if (!gid_valid(sbi->gid))
goto cleanup;
@@ -346,7 +357,10 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
case Opt_umask:
{
char *umask = args[0].from;
- sbi->umask = simple_strtoul(umask, &umask, 8);
+ int rc = kstrtouint(umask, 8, &sbi->umask);
+
+ if (rc)
+ goto cleanup;
if (sbi->umask & ~0777) {
pr_err("JFS: Invalid value of umask\n");
goto cleanup;
@@ -362,12 +376,10 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
* -> user has more control over the online trimming
*/
sbi->minblks_trim = 64;
- if (blk_queue_discard(q)) {
+ if (blk_queue_discard(q))
*flag |= JFS_DISCARD;
- } else {
- pr_err("JFS: discard option " \
- "not supported on device\n");
- }
+ else
+ pr_err("JFS: discard option not supported on device\n");
break;
}
@@ -379,20 +391,21 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
{
struct request_queue *q = bdev_get_queue(sb->s_bdev);
char *minblks_trim = args[0].from;
+ int rc;
if (blk_queue_discard(q)) {
*flag |= JFS_DISCARD;
- sbi->minblks_trim = simple_strtoull(
- minblks_trim, &minblks_trim, 0);
- } else {
- pr_err("JFS: discard option " \
- "not supported on device\n");
- }
+ rc = kstrtouint(minblks_trim, 0,
+ &sbi->minblks_trim);
+ if (rc)
+ goto cleanup;
+ } else
+ pr_err("JFS: discard option not supported on device\n");
break;
}
default:
- printk("jfs: Unrecognized mount option \"%s\" "
- " or missing value\n", p);
+ printk("jfs: Unrecognized mount option \"%s\" or missing value\n",
+ p);
goto cleanup;
}
}
@@ -417,14 +430,13 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
int flag = JFS_SBI(sb)->flag;
int ret;
- if (!parse_options(data, sb, &newLVSize, &flag)) {
+ sync_filesystem(sb);
+ if (!parse_options(data, sb, &newLVSize, &flag))
return -EINVAL;
- }
if (newLVSize) {
if (sb->s_flags & MS_RDONLY) {
- pr_err("JFS: resize requires volume" \
- " to be mounted read-write\n");
+ pr_err("JFS: resize requires volume to be mounted read-write\n");
return -EROFS;
}
rc = jfs_extendfs(sb, newLVSize, 0);
@@ -450,9 +462,8 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
}
if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
rc = dquot_suspend(sb, -1);
- if (rc < 0) {
+ if (rc < 0)
return rc;
- }
rc = jfs_umount_rw(sb);
JFS_SBI(sb)->flag = flag;
return rc;
@@ -485,7 +496,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
if (!new_valid_dev(sb->s_bdev->bd_dev))
return -EOVERFLOW;
- sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL);
+ sbi = kzalloc(sizeof(struct jfs_sb_info), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
@@ -522,6 +533,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
*/
sb->s_op = &jfs_super_operations;
sb->s_export_op = &jfs_export_operations;
+ sb->s_xattr = jfs_xattr_handlers;
#ifdef CONFIG_QUOTA
sb->dq_op = &dquot_operations;
sb->s_qcop = &dquot_quotactl_ops;
@@ -545,9 +557,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
rc = jfs_mount(sb);
if (rc) {
- if (!silent) {
+ if (!silent)
jfs_err("jfs_mount failed w/return code = %d", rc);
- }
goto out_mount_failed;
}
if (sb->s_flags & MS_RDONLY)
@@ -584,7 +595,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
* Page cache is indexed by long.
* I would use MAX_LFS_FILESIZE, but it's only half as big
*/
- sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, (u64)sb->s_maxbytes);
+ sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1,
+ (u64)sb->s_maxbytes);
#endif
sb->s_time_gran = 1;
return 0;
@@ -594,9 +606,8 @@ out_no_root:
out_no_rw:
rc = jfs_umount(sb);
- if (rc) {
+ if (rc)
jfs_err("jfs_umount failed with return code %d", rc);
- }
out_mount_failed:
filemap_write_and_wait(sbi->direct_inode->i_mapping);
truncate_inode_pages(sbi->direct_inode->i_mapping, 0);
@@ -921,7 +932,8 @@ static int __init init_jfs_fs(void)
commit_threads = MAX_COMMIT_THREADS;
for (i = 0; i < commit_threads; i++) {
- jfsCommitThread[i] = kthread_run(jfs_lazycommit, NULL, "jfsCommit");
+ jfsCommitThread[i] = kthread_run(jfs_lazycommit, NULL,
+ "jfsCommit");
if (IS_ERR(jfsCommitThread[i])) {
rc = PTR_ERR(jfsCommitThread[i]);
jfs_err("init_jfs_fs: fork failed w/rc = %d", rc);
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index d3472f4cd53..46325d5c34f 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -666,81 +666,12 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf,
}
/*
- * can_set_system_xattr
- *
- * This code is specific to the system.* namespace. It contains policy
- * which doesn't belong in the main xattr codepath.
- */
-static int can_set_system_xattr(struct inode *inode, const char *name,
- const void *value, size_t value_len)
-{
-#ifdef CONFIG_JFS_POSIX_ACL
- struct posix_acl *acl;
- int rc;
-
- if (!inode_owner_or_capable(inode))
- return -EPERM;
-
- /*
- * POSIX_ACL_XATTR_ACCESS is tied to i_mode
- */
- if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) {
- acl = posix_acl_from_xattr(&init_user_ns, value, value_len);
- if (IS_ERR(acl)) {
- rc = PTR_ERR(acl);
- printk(KERN_ERR "posix_acl_from_xattr returned %d\n",
- rc);
- return rc;
- }
- if (acl) {
- rc = posix_acl_equiv_mode(acl, &inode->i_mode);
- posix_acl_release(acl);
- if (rc < 0) {
- printk(KERN_ERR
- "posix_acl_equiv_mode returned %d\n",
- rc);
- return rc;
- }
- mark_inode_dirty(inode);
- }
- /*
- * We're changing the ACL. Get rid of the cached one
- */
- forget_cached_acl(inode, ACL_TYPE_ACCESS);
-
- return 0;
- } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) {
- acl = posix_acl_from_xattr(&init_user_ns, value, value_len);
- if (IS_ERR(acl)) {
- rc = PTR_ERR(acl);
- printk(KERN_ERR "posix_acl_from_xattr returned %d\n",
- rc);
- return rc;
- }
- posix_acl_release(acl);
-
- /*
- * We're changing the default ACL. Get rid of the cached one
- */
- forget_cached_acl(inode, ACL_TYPE_DEFAULT);
-
- return 0;
- }
-#endif /* CONFIG_JFS_POSIX_ACL */
- return -EOPNOTSUPP;
-}
-
-/*
* Most of the permission checking is done by xattr_permission in the vfs.
- * The local file system is responsible for handling the system.* namespace.
* We also need to verify that this is a namespace that we recognize.
*/
static int can_set_xattr(struct inode *inode, const char *name,
const void *value, size_t value_len)
{
- if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
- return can_set_system_xattr(inode, name, value, value_len);
-
if (!strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)) {
/*
* This makes sure that we aren't trying to set an
@@ -748,7 +679,7 @@ static int can_set_xattr(struct inode *inode, const char *name,
* with "os2."
*/
if (is_known_namespace(name + XATTR_OS2_PREFIX_LEN))
- return -EOPNOTSUPP;
+ return -EOPNOTSUPP;
return 0;
}
@@ -860,6 +791,19 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name,
/* Completely new ea list */
xattr_size = sizeof (struct jfs_ea_list);
+ /*
+ * The size of EA value is limitted by on-disk format up to
+ * __le16, there would be an overflow if the size is equal
+ * to XATTR_SIZE_MAX (65536). In order to avoid this issue,
+ * we can pre-checkup the value size against USHRT_MAX, and
+ * return -E2BIG in this case, which is consistent with the
+ * VFS setxattr interface.
+ */
+ if (value_len >= USHRT_MAX) {
+ rc = -E2BIG;
+ goto release;
+ }
+
ea = (struct jfs_ea *) ((char *) ealist + xattr_size);
ea->flag = 0;
ea->namelen = namelen;
@@ -874,7 +818,7 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name,
/* DEBUG - If we did this right, these number match */
if (xattr_size != new_size) {
printk(KERN_ERR
- "jfs_xsetattr: xattr_size = %d, new_size = %d\n",
+ "__jfs_setxattr: xattr_size = %d, new_size = %d\n",
xattr_size, new_size);
rc = -EINVAL;
@@ -910,6 +854,14 @@ int jfs_setxattr(struct dentry *dentry, const char *name, const void *value,
int rc;
tid_t tid;
+ /*
+ * If this is a request for a synthetic attribute in the system.*
+ * namespace use the generic infrastructure to resolve a handler
+ * for it via sb->s_xattr.
+ */
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_setxattr(dentry, name, value, value_len, flags);
+
if ((rc = can_set_xattr(inode, name, value, value_len)))
return rc;
@@ -986,6 +938,14 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data,
{
int err;
+ /*
+ * If this is a request for a synthetic attribute in the system.*
+ * namespace use the generic infrastructure to resolve a handler
+ * for it via sb->s_xattr.
+ */
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_getxattr(dentry, name, data, buf_size);
+
if (strncmp(name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
/*
* skip past "os2." prefix
@@ -1074,6 +1034,14 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
int rc;
tid_t tid;
+ /*
+ * If this is a request for a synthetic attribute in the system.*
+ * namespace use the generic infrastructure to resolve a handler
+ * for it via sb->s_xattr.
+ */
+ if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
+ return generic_removexattr(dentry, name);
+
if ((rc = can_set_xattr(inode, name, NULL, 0)))
return rc;
@@ -1088,6 +1056,19 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
return rc;
}
+/*
+ * List of handlers for synthetic system.* attributes. All real ondisk
+ * attributes are handled directly.
+ */
+const struct xattr_handler *jfs_xattr_handlers[] = {
+#ifdef CONFIG_JFS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+ NULL,
+};
+
+
#ifdef CONFIG_JFS_SECURITY
static int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
void *fs_info)
diff --git a/fs/kernfs/Kconfig b/fs/kernfs/Kconfig
new file mode 100644
index 00000000000..397b5f7a7a1
--- /dev/null
+++ b/fs/kernfs/Kconfig
@@ -0,0 +1,7 @@
+#
+# KERNFS should be selected by its users
+#
+
+config KERNFS
+ bool
+ default n
diff --git a/fs/kernfs/Makefile b/fs/kernfs/Makefile
new file mode 100644
index 00000000000..674337c7667
--- /dev/null
+++ b/fs/kernfs/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the kernfs pseudo filesystem
+#
+
+obj-y := mount.o inode.o dir.o file.o symlink.o
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
new file mode 100644
index 00000000000..a693f5b01ae
--- /dev/null
+++ b/fs/kernfs/dir.c
@@ -0,0 +1,1432 @@
+/*
+ * fs/kernfs/dir.c - kernfs directory implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include <linux/hash.h>
+
+#include "kernfs-internal.h"
+
+DEFINE_MUTEX(kernfs_mutex);
+static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */
+static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */
+
+#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
+
+static bool kernfs_active(struct kernfs_node *kn)
+{
+ lockdep_assert_held(&kernfs_mutex);
+ return atomic_read(&kn->active) >= 0;
+}
+
+static bool kernfs_lockdep(struct kernfs_node *kn)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ return kn->flags & KERNFS_LOCKDEP;
+#else
+ return false;
+#endif
+}
+
+static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
+{
+ return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
+}
+
+static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf,
+ size_t buflen)
+{
+ char *p = buf + buflen;
+ int len;
+
+ *--p = '\0';
+
+ do {
+ len = strlen(kn->name);
+ if (p - buf < len + 1) {
+ buf[0] = '\0';
+ p = NULL;
+ break;
+ }
+ p -= len;
+ memcpy(p, kn->name, len);
+ *--p = '/';
+ kn = kn->parent;
+ } while (kn && kn->parent);
+
+ return p;
+}
+
+/**
+ * kernfs_name - obtain the name of a given node
+ * @kn: kernfs_node of interest
+ * @buf: buffer to copy @kn's name into
+ * @buflen: size of @buf
+ *
+ * Copies the name of @kn into @buf of @buflen bytes. The behavior is
+ * similar to strlcpy(). It returns the length of @kn's name and if @buf
+ * isn't long enough, it's filled upto @buflen-1 and nul terminated.
+ *
+ * This function can be called from any context.
+ */
+int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&kernfs_rename_lock, flags);
+ ret = kernfs_name_locked(kn, buf, buflen);
+ spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+ return ret;
+}
+
+/**
+ * kernfs_path - build full path of a given node
+ * @kn: kernfs_node of interest
+ * @buf: buffer to copy @kn's name into
+ * @buflen: size of @buf
+ *
+ * Builds and returns the full path of @kn in @buf of @buflen bytes. The
+ * path is built from the end of @buf so the returned pointer usually
+ * doesn't match @buf. If @buf isn't long enough, @buf is nul terminated
+ * and %NULL is returned.
+ */
+char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
+{
+ unsigned long flags;
+ char *p;
+
+ spin_lock_irqsave(&kernfs_rename_lock, flags);
+ p = kernfs_path_locked(kn, buf, buflen);
+ spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+ return p;
+}
+EXPORT_SYMBOL_GPL(kernfs_path);
+
+/**
+ * pr_cont_kernfs_name - pr_cont name of a kernfs_node
+ * @kn: kernfs_node of interest
+ *
+ * This function can be called from any context.
+ */
+void pr_cont_kernfs_name(struct kernfs_node *kn)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&kernfs_rename_lock, flags);
+
+ kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
+ pr_cont("%s", kernfs_pr_cont_buf);
+
+ spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+}
+
+/**
+ * pr_cont_kernfs_path - pr_cont path of a kernfs_node
+ * @kn: kernfs_node of interest
+ *
+ * This function can be called from any context.
+ */
+void pr_cont_kernfs_path(struct kernfs_node *kn)
+{
+ unsigned long flags;
+ char *p;
+
+ spin_lock_irqsave(&kernfs_rename_lock, flags);
+
+ p = kernfs_path_locked(kn, kernfs_pr_cont_buf,
+ sizeof(kernfs_pr_cont_buf));
+ if (p)
+ pr_cont("%s", p);
+ else
+ pr_cont("<name too long>");
+
+ spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+}
+
+/**
+ * kernfs_get_parent - determine the parent node and pin it
+ * @kn: kernfs_node of interest
+ *
+ * Determines @kn's parent, pins and returns it. This function can be
+ * called from any context.
+ */
+struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
+{
+ struct kernfs_node *parent;
+ unsigned long flags;
+
+ spin_lock_irqsave(&kernfs_rename_lock, flags);
+ parent = kn->parent;
+ kernfs_get(parent);
+ spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+
+ return parent;
+}
+
+/**
+ * kernfs_name_hash
+ * @name: Null terminated string to hash
+ * @ns: Namespace tag to hash
+ *
+ * Returns 31 bit hash of ns + name (so it fits in an off_t )
+ */
+static unsigned int kernfs_name_hash(const char *name, const void *ns)
+{
+ unsigned long hash = init_name_hash();
+ unsigned int len = strlen(name);
+ while (len--)
+ hash = partial_name_hash(*name++, hash);
+ hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
+ hash &= 0x7fffffffU;
+ /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
+ if (hash < 2)
+ hash += 2;
+ if (hash >= INT_MAX)
+ hash = INT_MAX - 1;
+ return hash;
+}
+
+static int kernfs_name_compare(unsigned int hash, const char *name,
+ const void *ns, const struct kernfs_node *kn)
+{
+ if (hash != kn->hash)
+ return hash - kn->hash;
+ if (ns != kn->ns)
+ return ns - kn->ns;
+ return strcmp(name, kn->name);
+}
+
+static int kernfs_sd_compare(const struct kernfs_node *left,
+ const struct kernfs_node *right)
+{
+ return kernfs_name_compare(left->hash, left->name, left->ns, right);
+}
+
+/**
+ * kernfs_link_sibling - link kernfs_node into sibling rbtree
+ * @kn: kernfs_node of interest
+ *
+ * Link @kn into its sibling rbtree which starts from
+ * @kn->parent->dir.children.
+ *
+ * Locking:
+ * mutex_lock(kernfs_mutex)
+ *
+ * RETURNS:
+ * 0 on susccess -EEXIST on failure.
+ */
+static int kernfs_link_sibling(struct kernfs_node *kn)
+{
+ struct rb_node **node = &kn->parent->dir.children.rb_node;
+ struct rb_node *parent = NULL;
+
+ while (*node) {
+ struct kernfs_node *pos;
+ int result;
+
+ pos = rb_to_kn(*node);
+ parent = *node;
+ result = kernfs_sd_compare(kn, pos);
+ if (result < 0)
+ node = &pos->rb.rb_left;
+ else if (result > 0)
+ node = &pos->rb.rb_right;
+ else
+ return -EEXIST;
+ }
+
+ /* add new node and rebalance the tree */
+ rb_link_node(&kn->rb, parent, node);
+ rb_insert_color(&kn->rb, &kn->parent->dir.children);
+
+ /* successfully added, account subdir number */
+ if (kernfs_type(kn) == KERNFS_DIR)
+ kn->parent->dir.subdirs++;
+
+ return 0;
+}
+
+/**
+ * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree
+ * @kn: kernfs_node of interest
+ *
+ * Try to unlink @kn from its sibling rbtree which starts from
+ * kn->parent->dir.children. Returns %true if @kn was actually
+ * removed, %false if @kn wasn't on the rbtree.
+ *
+ * Locking:
+ * mutex_lock(kernfs_mutex)
+ */
+static bool kernfs_unlink_sibling(struct kernfs_node *kn)
+{
+ if (RB_EMPTY_NODE(&kn->rb))
+ return false;
+
+ if (kernfs_type(kn) == KERNFS_DIR)
+ kn->parent->dir.subdirs--;
+
+ rb_erase(&kn->rb, &kn->parent->dir.children);
+ RB_CLEAR_NODE(&kn->rb);
+ return true;
+}
+
+/**
+ * kernfs_get_active - get an active reference to kernfs_node
+ * @kn: kernfs_node to get an active reference to
+ *
+ * Get an active reference of @kn. This function is noop if @kn
+ * is NULL.
+ *
+ * RETURNS:
+ * Pointer to @kn on success, NULL on failure.
+ */
+struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
+{
+ if (unlikely(!kn))
+ return NULL;
+
+ if (!atomic_inc_unless_negative(&kn->active))
+ return NULL;
+
+ if (kernfs_lockdep(kn))
+ rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);
+ return kn;
+}
+
+/**
+ * kernfs_put_active - put an active reference to kernfs_node
+ * @kn: kernfs_node to put an active reference to
+ *
+ * Put an active reference to @kn. This function is noop if @kn
+ * is NULL.
+ */
+void kernfs_put_active(struct kernfs_node *kn)
+{
+ struct kernfs_root *root = kernfs_root(kn);
+ int v;
+
+ if (unlikely(!kn))
+ return;
+
+ if (kernfs_lockdep(kn))
+ rwsem_release(&kn->dep_map, 1, _RET_IP_);
+ v = atomic_dec_return(&kn->active);
+ if (likely(v != KN_DEACTIVATED_BIAS))
+ return;
+
+ wake_up_all(&root->deactivate_waitq);
+}
+
+/**
+ * kernfs_drain - drain kernfs_node
+ * @kn: kernfs_node to drain
+ *
+ * Drain existing usages and nuke all existing mmaps of @kn. Mutiple
+ * removers may invoke this function concurrently on @kn and all will
+ * return after draining is complete.
+ */
+static void kernfs_drain(struct kernfs_node *kn)
+ __releases(&kernfs_mutex) __acquires(&kernfs_mutex)
+{
+ struct kernfs_root *root = kernfs_root(kn);
+
+ lockdep_assert_held(&kernfs_mutex);
+ WARN_ON_ONCE(kernfs_active(kn));
+
+ mutex_unlock(&kernfs_mutex);
+
+ if (kernfs_lockdep(kn)) {
+ rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
+ if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS)
+ lock_contended(&kn->dep_map, _RET_IP_);
+ }
+
+ /* but everyone should wait for draining */
+ wait_event(root->deactivate_waitq,
+ atomic_read(&kn->active) == KN_DEACTIVATED_BIAS);
+
+ if (kernfs_lockdep(kn)) {
+ lock_acquired(&kn->dep_map, _RET_IP_);
+ rwsem_release(&kn->dep_map, 1, _RET_IP_);
+ }
+
+ kernfs_unmap_bin_file(kn);
+
+ mutex_lock(&kernfs_mutex);
+}
+
+/**
+ * kernfs_get - get a reference count on a kernfs_node
+ * @kn: the target kernfs_node
+ */
+void kernfs_get(struct kernfs_node *kn)
+{
+ if (kn) {
+ WARN_ON(!atomic_read(&kn->count));
+ atomic_inc(&kn->count);
+ }
+}
+EXPORT_SYMBOL_GPL(kernfs_get);
+
+/**
+ * kernfs_put - put a reference count on a kernfs_node
+ * @kn: the target kernfs_node
+ *
+ * Put a reference count of @kn and destroy it if it reached zero.
+ */
+void kernfs_put(struct kernfs_node *kn)
+{
+ struct kernfs_node *parent;
+ struct kernfs_root *root;
+
+ if (!kn || !atomic_dec_and_test(&kn->count))
+ return;
+ root = kernfs_root(kn);
+ repeat:
+ /*
+ * Moving/renaming is always done while holding reference.
+ * kn->parent won't change beneath us.
+ */
+ parent = kn->parent;
+
+ WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS,
+ "kernfs_put: %s/%s: released with incorrect active_ref %d\n",
+ parent ? parent->name : "", kn->name, atomic_read(&kn->active));
+
+ if (kernfs_type(kn) == KERNFS_LINK)
+ kernfs_put(kn->symlink.target_kn);
+ if (!(kn->flags & KERNFS_STATIC_NAME))
+ kfree(kn->name);
+ if (kn->iattr) {
+ if (kn->iattr->ia_secdata)
+ security_release_secctx(kn->iattr->ia_secdata,
+ kn->iattr->ia_secdata_len);
+ simple_xattrs_free(&kn->iattr->xattrs);
+ }
+ kfree(kn->iattr);
+ ida_simple_remove(&root->ino_ida, kn->ino);
+ kmem_cache_free(kernfs_node_cache, kn);
+
+ kn = parent;
+ if (kn) {
+ if (atomic_dec_and_test(&kn->count))
+ goto repeat;
+ } else {
+ /* just released the root kn, free @root too */
+ ida_destroy(&root->ino_ida);
+ kfree(root);
+ }
+}
+EXPORT_SYMBOL_GPL(kernfs_put);
+
+static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct kernfs_node *kn;
+
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ /* Always perform fresh lookup for negatives */
+ if (!dentry->d_inode)
+ goto out_bad_unlocked;
+
+ kn = dentry->d_fsdata;
+ mutex_lock(&kernfs_mutex);
+
+ /* The kernfs node has been deactivated */
+ if (!kernfs_active(kn))
+ goto out_bad;
+
+ /* The kernfs node has been moved? */
+ if (dentry->d_parent->d_fsdata != kn->parent)
+ goto out_bad;
+
+ /* The kernfs node has been renamed */
+ if (strcmp(dentry->d_name.name, kn->name) != 0)
+ goto out_bad;
+
+ /* The kernfs node has been moved to a different namespace */
+ if (kn->parent && kernfs_ns_enabled(kn->parent) &&
+ kernfs_info(dentry->d_sb)->ns != kn->ns)
+ goto out_bad;
+
+ mutex_unlock(&kernfs_mutex);
+out_valid:
+ return 1;
+out_bad:
+ mutex_unlock(&kernfs_mutex);
+out_bad_unlocked:
+ /*
+ * @dentry doesn't match the underlying kernfs node, drop the
+ * dentry and force lookup. If we have submounts we must allow the
+ * vfs caches to lie about the state of the filesystem to prevent
+ * leaks and other nasty things, so use check_submounts_and_drop()
+ * instead of d_drop().
+ */
+ if (check_submounts_and_drop(dentry) != 0)
+ goto out_valid;
+
+ return 0;
+}
+
+static void kernfs_dop_release(struct dentry *dentry)
+{
+ kernfs_put(dentry->d_fsdata);
+}
+
+const struct dentry_operations kernfs_dops = {
+ .d_revalidate = kernfs_dop_revalidate,
+ .d_release = kernfs_dop_release,
+};
+
+/**
+ * kernfs_node_from_dentry - determine kernfs_node associated with a dentry
+ * @dentry: the dentry in question
+ *
+ * Return the kernfs_node associated with @dentry. If @dentry is not a
+ * kernfs one, %NULL is returned.
+ *
+ * While the returned kernfs_node will stay accessible as long as @dentry
+ * is accessible, the returned node can be in any state and the caller is
+ * fully responsible for determining what's accessible.
+ */
+struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
+{
+ if (dentry->d_sb->s_op == &kernfs_sops)
+ return dentry->d_fsdata;
+ return NULL;
+}
+
+static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
+ const char *name, umode_t mode,
+ unsigned flags)
+{
+ char *dup_name = NULL;
+ struct kernfs_node *kn;
+ int ret;
+
+ if (!(flags & KERNFS_STATIC_NAME)) {
+ name = dup_name = kstrdup(name, GFP_KERNEL);
+ if (!name)
+ return NULL;
+ }
+
+ kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
+ if (!kn)
+ goto err_out1;
+
+ ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL);
+ if (ret < 0)
+ goto err_out2;
+ kn->ino = ret;
+
+ atomic_set(&kn->count, 1);
+ atomic_set(&kn->active, KN_DEACTIVATED_BIAS);
+ RB_CLEAR_NODE(&kn->rb);
+
+ kn->name = name;
+ kn->mode = mode;
+ kn->flags = flags;
+
+ return kn;
+
+ err_out2:
+ kmem_cache_free(kernfs_node_cache, kn);
+ err_out1:
+ kfree(dup_name);
+ return NULL;
+}
+
+struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
+ const char *name, umode_t mode,
+ unsigned flags)
+{
+ struct kernfs_node *kn;
+
+ kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags);
+ if (kn) {
+ kernfs_get(parent);
+ kn->parent = parent;
+ }
+ return kn;
+}
+
+/**
+ * kernfs_add_one - add kernfs_node to parent without warning
+ * @kn: kernfs_node to be added
+ *
+ * The caller must already have initialized @kn->parent. This
+ * function increments nlink of the parent's inode if @kn is a
+ * directory and link into the children list of the parent.
+ *
+ * RETURNS:
+ * 0 on success, -EEXIST if entry with the given name already
+ * exists.
+ */
+int kernfs_add_one(struct kernfs_node *kn)
+{
+ struct kernfs_node *parent = kn->parent;
+ struct kernfs_iattrs *ps_iattr;
+ bool has_ns;
+ int ret;
+
+ mutex_lock(&kernfs_mutex);
+
+ ret = -EINVAL;
+ has_ns = kernfs_ns_enabled(parent);
+ if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
+ has_ns ? "required" : "invalid", parent->name, kn->name))
+ goto out_unlock;
+
+ if (kernfs_type(parent) != KERNFS_DIR)
+ goto out_unlock;
+
+ ret = -ENOENT;
+ if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent))
+ goto out_unlock;
+
+ kn->hash = kernfs_name_hash(kn->name, kn->ns);
+
+ ret = kernfs_link_sibling(kn);
+ if (ret)
+ goto out_unlock;
+
+ /* Update timestamps on the parent */
+ ps_iattr = parent->iattr;
+ if (ps_iattr) {
+ struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
+ ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
+ }
+
+ mutex_unlock(&kernfs_mutex);
+
+ /*
+ * Activate the new node unless CREATE_DEACTIVATED is requested.
+ * If not activated here, the kernfs user is responsible for
+ * activating the node with kernfs_activate(). A node which hasn't
+ * been activated is not visible to userland and its removal won't
+ * trigger deactivation.
+ */
+ if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
+ kernfs_activate(kn);
+ return 0;
+
+out_unlock:
+ mutex_unlock(&kernfs_mutex);
+ return ret;
+}
+
+/**
+ * kernfs_find_ns - find kernfs_node with the given name
+ * @parent: kernfs_node to search under
+ * @name: name to look for
+ * @ns: the namespace tag to use
+ *
+ * Look for kernfs_node with name @name under @parent. Returns pointer to
+ * the found kernfs_node on success, %NULL on failure.
+ */
+static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
+ const unsigned char *name,
+ const void *ns)
+{
+ struct rb_node *node = parent->dir.children.rb_node;
+ bool has_ns = kernfs_ns_enabled(parent);
+ unsigned int hash;
+
+ lockdep_assert_held(&kernfs_mutex);
+
+ if (has_ns != (bool)ns) {
+ WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
+ has_ns ? "required" : "invalid", parent->name, name);
+ return NULL;
+ }
+
+ hash = kernfs_name_hash(name, ns);
+ while (node) {
+ struct kernfs_node *kn;
+ int result;
+
+ kn = rb_to_kn(node);
+ result = kernfs_name_compare(hash, name, ns, kn);
+ if (result < 0)
+ node = node->rb_left;
+ else if (result > 0)
+ node = node->rb_right;
+ else
+ return kn;
+ }
+ return NULL;
+}
+
+/**
+ * kernfs_find_and_get_ns - find and get kernfs_node with the given name
+ * @parent: kernfs_node to search under
+ * @name: name to look for
+ * @ns: the namespace tag to use
+ *
+ * Look for kernfs_node with name @name under @parent and get a reference
+ * if found. This function may sleep and returns pointer to the found
+ * kernfs_node on success, %NULL on failure.
+ */
+struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
+ const char *name, const void *ns)
+{
+ struct kernfs_node *kn;
+
+ mutex_lock(&kernfs_mutex);
+ kn = kernfs_find_ns(parent, name, ns);
+ kernfs_get(kn);
+ mutex_unlock(&kernfs_mutex);
+
+ return kn;
+}
+EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
+
+/**
+ * kernfs_create_root - create a new kernfs hierarchy
+ * @scops: optional syscall operations for the hierarchy
+ * @flags: KERNFS_ROOT_* flags
+ * @priv: opaque data associated with the new directory
+ *
+ * Returns the root of the new hierarchy on success, ERR_PTR() value on
+ * failure.
+ */
+struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
+ unsigned int flags, void *priv)
+{
+ struct kernfs_root *root;
+ struct kernfs_node *kn;
+
+ root = kzalloc(sizeof(*root), GFP_KERNEL);
+ if (!root)
+ return ERR_PTR(-ENOMEM);
+
+ ida_init(&root->ino_ida);
+ INIT_LIST_HEAD(&root->supers);
+
+ kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO,
+ KERNFS_DIR);
+ if (!kn) {
+ ida_destroy(&root->ino_ida);
+ kfree(root);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ kn->priv = priv;
+ kn->dir.root = root;
+
+ root->syscall_ops = scops;
+ root->flags = flags;
+ root->kn = kn;
+ init_waitqueue_head(&root->deactivate_waitq);
+
+ if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED))
+ kernfs_activate(kn);
+
+ return root;
+}
+
+/**
+ * kernfs_destroy_root - destroy a kernfs hierarchy
+ * @root: root of the hierarchy to destroy
+ *
+ * Destroy the hierarchy anchored at @root by removing all existing
+ * directories and destroying @root.
+ */
+void kernfs_destroy_root(struct kernfs_root *root)
+{
+ kernfs_remove(root->kn); /* will also free @root */
+}
+
+/**
+ * kernfs_create_dir_ns - create a directory
+ * @parent: parent in which to create a new directory
+ * @name: name of the new directory
+ * @mode: mode of the new directory
+ * @priv: opaque data associated with the new directory
+ * @ns: optional namespace tag of the directory
+ *
+ * Returns the created node on success, ERR_PTR() value on failure.
+ */
+struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
+ const char *name, umode_t mode,
+ void *priv, const void *ns)
+{
+ struct kernfs_node *kn;
+ int rc;
+
+ /* allocate */
+ kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR);
+ if (!kn)
+ return ERR_PTR(-ENOMEM);
+
+ kn->dir.root = parent->dir.root;
+ kn->ns = ns;
+ kn->priv = priv;
+
+ /* link in */
+ rc = kernfs_add_one(kn);
+ if (!rc)
+ return kn;
+
+ kernfs_put(kn);
+ return ERR_PTR(rc);
+}
+
+static struct dentry *kernfs_iop_lookup(struct inode *dir,
+ struct dentry *dentry,
+ unsigned int flags)
+{
+ struct dentry *ret;
+ struct kernfs_node *parent = dentry->d_parent->d_fsdata;
+ struct kernfs_node *kn;
+ struct inode *inode;
+ const void *ns = NULL;
+
+ mutex_lock(&kernfs_mutex);
+
+ if (kernfs_ns_enabled(parent))
+ ns = kernfs_info(dir->i_sb)->ns;
+
+ kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
+
+ /* no such entry */
+ if (!kn || !kernfs_active(kn)) {
+ ret = NULL;
+ goto out_unlock;
+ }
+ kernfs_get(kn);
+ dentry->d_fsdata = kn;
+
+ /* attach dentry and inode */
+ inode = kernfs_get_inode(dir->i_sb, kn);
+ if (!inode) {
+ ret = ERR_PTR(-ENOMEM);
+ goto out_unlock;
+ }
+
+ /* instantiate and hash dentry */
+ ret = d_materialise_unique(dentry, inode);
+ out_unlock:
+ mutex_unlock(&kernfs_mutex);
+ return ret;
+}
+
+static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry,
+ umode_t mode)
+{
+ struct kernfs_node *parent = dir->i_private;
+ struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops;
+ int ret;
+
+ if (!scops || !scops->mkdir)
+ return -EPERM;
+
+ if (!kernfs_get_active(parent))
+ return -ENODEV;
+
+ ret = scops->mkdir(parent, dentry->d_name.name, mode);
+
+ kernfs_put_active(parent);
+ return ret;
+}
+
+static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct kernfs_node *kn = dentry->d_fsdata;
+ struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
+ int ret;
+
+ if (!scops || !scops->rmdir)
+ return -EPERM;
+
+ if (!kernfs_get_active(kn))
+ return -ENODEV;
+
+ ret = scops->rmdir(kn);
+
+ kernfs_put_active(kn);
+ return ret;
+}
+
+static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct kernfs_node *kn = old_dentry->d_fsdata;
+ struct kernfs_node *new_parent = new_dir->i_private;
+ struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops;
+ int ret;
+
+ if (!scops || !scops->rename)
+ return -EPERM;
+
+ if (!kernfs_get_active(kn))
+ return -ENODEV;
+
+ if (!kernfs_get_active(new_parent)) {
+ kernfs_put_active(kn);
+ return -ENODEV;
+ }
+
+ ret = scops->rename(kn, new_parent, new_dentry->d_name.name);
+
+ kernfs_put_active(new_parent);
+ kernfs_put_active(kn);
+ return ret;
+}
+
+const struct inode_operations kernfs_dir_iops = {
+ .lookup = kernfs_iop_lookup,
+ .permission = kernfs_iop_permission,
+ .setattr = kernfs_iop_setattr,
+ .getattr = kernfs_iop_getattr,
+ .setxattr = kernfs_iop_setxattr,
+ .removexattr = kernfs_iop_removexattr,
+ .getxattr = kernfs_iop_getxattr,
+ .listxattr = kernfs_iop_listxattr,
+
+ .mkdir = kernfs_iop_mkdir,
+ .rmdir = kernfs_iop_rmdir,
+ .rename = kernfs_iop_rename,
+};
+
+static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos)
+{
+ struct kernfs_node *last;
+
+ while (true) {
+ struct rb_node *rbn;
+
+ last = pos;
+
+ if (kernfs_type(pos) != KERNFS_DIR)
+ break;
+
+ rbn = rb_first(&pos->dir.children);
+ if (!rbn)
+ break;
+
+ pos = rb_to_kn(rbn);
+ }
+
+ return last;
+}
+
+/**
+ * kernfs_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: kernfs_node whose descendants to walk
+ *
+ * Find the next descendant to visit for post-order traversal of @root's
+ * descendants. @root is included in the iteration and the last node to be
+ * visited.
+ */
+static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
+ struct kernfs_node *root)
+{
+ struct rb_node *rbn;
+
+ lockdep_assert_held(&kernfs_mutex);
+
+ /* if first iteration, visit leftmost descendant which may be root */
+ if (!pos)
+ return kernfs_leftmost_descendant(root);
+
+ /* if we visited @root, we're done */
+ if (pos == root)
+ return NULL;
+
+ /* if there's an unvisited sibling, visit its leftmost descendant */
+ rbn = rb_next(&pos->rb);
+ if (rbn)
+ return kernfs_leftmost_descendant(rb_to_kn(rbn));
+
+ /* no sibling left, visit parent */
+ return pos->parent;
+}
+
+/**
+ * kernfs_activate - activate a node which started deactivated
+ * @kn: kernfs_node whose subtree is to be activated
+ *
+ * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node
+ * needs to be explicitly activated. A node which hasn't been activated
+ * isn't visible to userland and deactivation is skipped during its
+ * removal. This is useful to construct atomic init sequences where
+ * creation of multiple nodes should either succeed or fail atomically.
+ *
+ * The caller is responsible for ensuring that this function is not called
+ * after kernfs_remove*() is invoked on @kn.
+ */
+void kernfs_activate(struct kernfs_node *kn)
+{
+ struct kernfs_node *pos;
+
+ mutex_lock(&kernfs_mutex);
+
+ pos = NULL;
+ while ((pos = kernfs_next_descendant_post(pos, kn))) {
+ if (!pos || (pos->flags & KERNFS_ACTIVATED))
+ continue;
+
+ WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb));
+ WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS);
+
+ atomic_sub(KN_DEACTIVATED_BIAS, &pos->active);
+ pos->flags |= KERNFS_ACTIVATED;
+ }
+
+ mutex_unlock(&kernfs_mutex);
+}
+
+static void __kernfs_remove(struct kernfs_node *kn)
+{
+ struct kernfs_node *pos;
+
+ lockdep_assert_held(&kernfs_mutex);
+
+ /*
+ * Short-circuit if non-root @kn has already finished removal.
+ * This is for kernfs_remove_self() which plays with active ref
+ * after removal.
+ */
+ if (!kn || (kn->parent && RB_EMPTY_NODE(&kn->rb)))
+ return;
+
+ pr_debug("kernfs %s: removing\n", kn->name);
+
+ /* prevent any new usage under @kn by deactivating all nodes */
+ pos = NULL;
+ while ((pos = kernfs_next_descendant_post(pos, kn)))
+ if (kernfs_active(pos))
+ atomic_add(KN_DEACTIVATED_BIAS, &pos->active);
+
+ /* deactivate and unlink the subtree node-by-node */
+ do {
+ pos = kernfs_leftmost_descendant(kn);
+
+ /*
+ * kernfs_drain() drops kernfs_mutex temporarily and @pos's
+ * base ref could have been put by someone else by the time
+ * the function returns. Make sure it doesn't go away
+ * underneath us.
+ */
+ kernfs_get(pos);
+
+ /*
+ * Drain iff @kn was activated. This avoids draining and
+ * its lockdep annotations for nodes which have never been
+ * activated and allows embedding kernfs_remove() in create
+ * error paths without worrying about draining.
+ */
+ if (kn->flags & KERNFS_ACTIVATED)
+ kernfs_drain(pos);
+ else
+ WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
+
+ /*
+ * kernfs_unlink_sibling() succeeds once per node. Use it
+ * to decide who's responsible for cleanups.
+ */
+ if (!pos->parent || kernfs_unlink_sibling(pos)) {
+ struct kernfs_iattrs *ps_iattr =
+ pos->parent ? pos->parent->iattr : NULL;
+
+ /* update timestamps on the parent */
+ if (ps_iattr) {
+ ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
+ ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
+ }
+
+ kernfs_put(pos);
+ }
+
+ kernfs_put(pos);
+ } while (pos != kn);
+}
+
+/**
+ * kernfs_remove - remove a kernfs_node recursively
+ * @kn: the kernfs_node to remove
+ *
+ * Remove @kn along with all its subdirectories and files.
+ */
+void kernfs_remove(struct kernfs_node *kn)
+{
+ mutex_lock(&kernfs_mutex);
+ __kernfs_remove(kn);
+ mutex_unlock(&kernfs_mutex);
+}
+
+/**
+ * kernfs_break_active_protection - break out of active protection
+ * @kn: the self kernfs_node
+ *
+ * The caller must be running off of a kernfs operation which is invoked
+ * with an active reference - e.g. one of kernfs_ops. Each invocation of
+ * this function must also be matched with an invocation of
+ * kernfs_unbreak_active_protection().
+ *
+ * This function releases the active reference of @kn the caller is
+ * holding. Once this function is called, @kn may be removed at any point
+ * and the caller is solely responsible for ensuring that the objects it
+ * dereferences are accessible.
+ */
+void kernfs_break_active_protection(struct kernfs_node *kn)
+{
+ /*
+ * Take out ourself out of the active ref dependency chain. If
+ * we're called without an active ref, lockdep will complain.
+ */
+ kernfs_put_active(kn);
+}
+
+/**
+ * kernfs_unbreak_active_protection - undo kernfs_break_active_protection()
+ * @kn: the self kernfs_node
+ *
+ * If kernfs_break_active_protection() was called, this function must be
+ * invoked before finishing the kernfs operation. Note that while this
+ * function restores the active reference, it doesn't and can't actually
+ * restore the active protection - @kn may already or be in the process of
+ * being removed. Once kernfs_break_active_protection() is invoked, that
+ * protection is irreversibly gone for the kernfs operation instance.
+ *
+ * While this function may be called at any point after
+ * kernfs_break_active_protection() is invoked, its most useful location
+ * would be right before the enclosing kernfs operation returns.
+ */
+void kernfs_unbreak_active_protection(struct kernfs_node *kn)
+{
+ /*
+ * @kn->active could be in any state; however, the increment we do
+ * here will be undone as soon as the enclosing kernfs operation
+ * finishes and this temporary bump can't break anything. If @kn
+ * is alive, nothing changes. If @kn is being deactivated, the
+ * soon-to-follow put will either finish deactivation or restore
+ * deactivated state. If @kn is already removed, the temporary
+ * bump is guaranteed to be gone before @kn is released.
+ */
+ atomic_inc(&kn->active);
+ if (kernfs_lockdep(kn))
+ rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_);
+}
+
+/**
+ * kernfs_remove_self - remove a kernfs_node from its own method
+ * @kn: the self kernfs_node to remove
+ *
+ * The caller must be running off of a kernfs operation which is invoked
+ * with an active reference - e.g. one of kernfs_ops. This can be used to
+ * implement a file operation which deletes itself.
+ *
+ * For example, the "delete" file for a sysfs device directory can be
+ * implemented by invoking kernfs_remove_self() on the "delete" file
+ * itself. This function breaks the circular dependency of trying to
+ * deactivate self while holding an active ref itself. It isn't necessary
+ * to modify the usual removal path to use kernfs_remove_self(). The
+ * "delete" implementation can simply invoke kernfs_remove_self() on self
+ * before proceeding with the usual removal path. kernfs will ignore later
+ * kernfs_remove() on self.
+ *
+ * kernfs_remove_self() can be called multiple times concurrently on the
+ * same kernfs_node. Only the first one actually performs removal and
+ * returns %true. All others will wait until the kernfs operation which
+ * won self-removal finishes and return %false. Note that the losers wait
+ * for the completion of not only the winning kernfs_remove_self() but also
+ * the whole kernfs_ops which won the arbitration. This can be used to
+ * guarantee, for example, all concurrent writes to a "delete" file to
+ * finish only after the whole operation is complete.
+ */
+bool kernfs_remove_self(struct kernfs_node *kn)
+{
+ bool ret;
+
+ mutex_lock(&kernfs_mutex);
+ kernfs_break_active_protection(kn);
+
+ /*
+ * SUICIDAL is used to arbitrate among competing invocations. Only
+ * the first one will actually perform removal. When the removal
+ * is complete, SUICIDED is set and the active ref is restored
+ * while holding kernfs_mutex. The ones which lost arbitration
+ * waits for SUICDED && drained which can happen only after the
+ * enclosing kernfs operation which executed the winning instance
+ * of kernfs_remove_self() finished.
+ */
+ if (!(kn->flags & KERNFS_SUICIDAL)) {
+ kn->flags |= KERNFS_SUICIDAL;
+ __kernfs_remove(kn);
+ kn->flags |= KERNFS_SUICIDED;
+ ret = true;
+ } else {
+ wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq;
+ DEFINE_WAIT(wait);
+
+ while (true) {
+ prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE);
+
+ if ((kn->flags & KERNFS_SUICIDED) &&
+ atomic_read(&kn->active) == KN_DEACTIVATED_BIAS)
+ break;
+
+ mutex_unlock(&kernfs_mutex);
+ schedule();
+ mutex_lock(&kernfs_mutex);
+ }
+ finish_wait(waitq, &wait);
+ WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb));
+ ret = false;
+ }
+
+ /*
+ * This must be done while holding kernfs_mutex; otherwise, waiting
+ * for SUICIDED && deactivated could finish prematurely.
+ */
+ kernfs_unbreak_active_protection(kn);
+
+ mutex_unlock(&kernfs_mutex);
+ return ret;
+}
+
+/**
+ * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it
+ * @parent: parent of the target
+ * @name: name of the kernfs_node to remove
+ * @ns: namespace tag of the kernfs_node to remove
+ *
+ * Look for the kernfs_node with @name and @ns under @parent and remove it.
+ * Returns 0 on success, -ENOENT if such entry doesn't exist.
+ */
+int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
+ const void *ns)
+{
+ struct kernfs_node *kn;
+
+ if (!parent) {
+ WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n",
+ name);
+ return -ENOENT;
+ }
+
+ mutex_lock(&kernfs_mutex);
+
+ kn = kernfs_find_ns(parent, name, ns);
+ if (kn)
+ __kernfs_remove(kn);
+
+ mutex_unlock(&kernfs_mutex);
+
+ if (kn)
+ return 0;
+ else
+ return -ENOENT;
+}
+
+/**
+ * kernfs_rename_ns - move and rename a kernfs_node
+ * @kn: target node
+ * @new_parent: new parent to put @sd under
+ * @new_name: new name
+ * @new_ns: new namespace tag
+ */
+int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
+ const char *new_name, const void *new_ns)
+{
+ struct kernfs_node *old_parent;
+ const char *old_name = NULL;
+ int error;
+
+ /* can't move or rename root */
+ if (!kn->parent)
+ return -EINVAL;
+
+ mutex_lock(&kernfs_mutex);
+
+ error = -ENOENT;
+ if (!kernfs_active(kn) || !kernfs_active(new_parent))
+ goto out;
+
+ error = 0;
+ if ((kn->parent == new_parent) && (kn->ns == new_ns) &&
+ (strcmp(kn->name, new_name) == 0))
+ goto out; /* nothing to rename */
+
+ error = -EEXIST;
+ if (kernfs_find_ns(new_parent, new_name, new_ns))
+ goto out;
+
+ /* rename kernfs_node */
+ if (strcmp(kn->name, new_name) != 0) {
+ error = -ENOMEM;
+ new_name = kstrdup(new_name, GFP_KERNEL);
+ if (!new_name)
+ goto out;
+ } else {
+ new_name = NULL;
+ }
+
+ /*
+ * Move to the appropriate place in the appropriate directories rbtree.
+ */
+ kernfs_unlink_sibling(kn);
+ kernfs_get(new_parent);
+
+ /* rename_lock protects ->parent and ->name accessors */
+ spin_lock_irq(&kernfs_rename_lock);
+
+ old_parent = kn->parent;
+ kn->parent = new_parent;
+
+ kn->ns = new_ns;
+ if (new_name) {
+ if (!(kn->flags & KERNFS_STATIC_NAME))
+ old_name = kn->name;
+ kn->flags &= ~KERNFS_STATIC_NAME;
+ kn->name = new_name;
+ }
+
+ spin_unlock_irq(&kernfs_rename_lock);
+
+ kn->hash = kernfs_name_hash(kn->name, kn->ns);
+ kernfs_link_sibling(kn);
+
+ kernfs_put(old_parent);
+ kfree(old_name);
+
+ error = 0;
+ out:
+ mutex_unlock(&kernfs_mutex);
+ return error;
+}
+
+/* Relationship between s_mode and the DT_xxx types */
+static inline unsigned char dt_type(struct kernfs_node *kn)
+{
+ return (kn->mode >> 12) & 15;
+}
+
+static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
+{
+ kernfs_put(filp->private_data);
+ return 0;
+}
+
+static struct kernfs_node *kernfs_dir_pos(const void *ns,
+ struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
+{
+ if (pos) {
+ int valid = kernfs_active(pos) &&
+ pos->parent == parent && hash == pos->hash;
+ kernfs_put(pos);
+ if (!valid)
+ pos = NULL;
+ }
+ if (!pos && (hash > 1) && (hash < INT_MAX)) {
+ struct rb_node *node = parent->dir.children.rb_node;
+ while (node) {
+ pos = rb_to_kn(node);
+
+ if (hash < pos->hash)
+ node = node->rb_left;
+ else if (hash > pos->hash)
+ node = node->rb_right;
+ else
+ break;
+ }
+ }
+ /* Skip over entries which are dying/dead or in the wrong namespace */
+ while (pos && (!kernfs_active(pos) || pos->ns != ns)) {
+ struct rb_node *node = rb_next(&pos->rb);
+ if (!node)
+ pos = NULL;
+ else
+ pos = rb_to_kn(node);
+ }
+ return pos;
+}
+
+static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
+ struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
+{
+ pos = kernfs_dir_pos(ns, parent, ino, pos);
+ if (pos) {
+ do {
+ struct rb_node *node = rb_next(&pos->rb);
+ if (!node)
+ pos = NULL;
+ else
+ pos = rb_to_kn(node);
+ } while (pos && (!kernfs_active(pos) || pos->ns != ns));
+ }
+ return pos;
+}
+
+static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
+{
+ struct dentry *dentry = file->f_path.dentry;
+ struct kernfs_node *parent = dentry->d_fsdata;
+ struct kernfs_node *pos = file->private_data;
+ const void *ns = NULL;
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+ mutex_lock(&kernfs_mutex);
+
+ if (kernfs_ns_enabled(parent))
+ ns = kernfs_info(dentry->d_sb)->ns;
+
+ for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos);
+ pos;
+ pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) {
+ const char *name = pos->name;
+ unsigned int type = dt_type(pos);
+ int len = strlen(name);
+ ino_t ino = pos->ino;
+
+ ctx->pos = pos->hash;
+ file->private_data = pos;
+ kernfs_get(pos);
+
+ mutex_unlock(&kernfs_mutex);
+ if (!dir_emit(ctx, name, len, ino, type))
+ return 0;
+ mutex_lock(&kernfs_mutex);
+ }
+ mutex_unlock(&kernfs_mutex);
+ file->private_data = NULL;
+ ctx->pos = INT_MAX;
+ return 0;
+}
+
+static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset,
+ int whence)
+{
+ struct inode *inode = file_inode(file);
+ loff_t ret;
+
+ mutex_lock(&inode->i_mutex);
+ ret = generic_file_llseek(file, offset, whence);
+ mutex_unlock(&inode->i_mutex);
+
+ return ret;
+}
+
+const struct file_operations kernfs_dir_fops = {
+ .read = generic_read_dir,
+ .iterate = kernfs_fop_readdir,
+ .release = kernfs_dir_fop_release,
+ .llseek = kernfs_dir_fop_llseek,
+};
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
new file mode 100644
index 00000000000..d895b4b7b66
--- /dev/null
+++ b/fs/kernfs/file.c
@@ -0,0 +1,952 @@
+/*
+ * fs/kernfs/file.c - kernfs file implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+#include <linux/fsnotify.h>
+
+#include "kernfs-internal.h"
+
+/*
+ * There's one kernfs_open_file for each open file and one kernfs_open_node
+ * for each kernfs_node with one or more open files.
+ *
+ * kernfs_node->attr.open points to kernfs_open_node. attr.open is
+ * protected by kernfs_open_node_lock.
+ *
+ * filp->private_data points to seq_file whose ->private points to
+ * kernfs_open_file. kernfs_open_files are chained at
+ * kernfs_open_node->files, which is protected by kernfs_open_file_mutex.
+ */
+static DEFINE_SPINLOCK(kernfs_open_node_lock);
+static DEFINE_MUTEX(kernfs_open_file_mutex);
+
+struct kernfs_open_node {
+ atomic_t refcnt;
+ atomic_t event;
+ wait_queue_head_t poll;
+ struct list_head files; /* goes through kernfs_open_file.list */
+};
+
+/*
+ * kernfs_notify() may be called from any context and bounces notifications
+ * through a work item. To minimize space overhead in kernfs_node, the
+ * pending queue is implemented as a singly linked list of kernfs_nodes.
+ * The list is terminated with the self pointer so that whether a
+ * kernfs_node is on the list or not can be determined by testing the next
+ * pointer for NULL.
+ */
+#define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list)
+
+static DEFINE_SPINLOCK(kernfs_notify_lock);
+static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
+
+static struct kernfs_open_file *kernfs_of(struct file *file)
+{
+ return ((struct seq_file *)file->private_data)->private;
+}
+
+/*
+ * Determine the kernfs_ops for the given kernfs_node. This function must
+ * be called while holding an active reference.
+ */
+static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn)
+{
+ if (kn->flags & KERNFS_LOCKDEP)
+ lockdep_assert_held(kn);
+ return kn->attr.ops;
+}
+
+/*
+ * As kernfs_seq_stop() is also called after kernfs_seq_start() or
+ * kernfs_seq_next() failure, it needs to distinguish whether it's stopping
+ * a seq_file iteration which is fully initialized with an active reference
+ * or an aborted kernfs_seq_start() due to get_active failure. The
+ * position pointer is the only context for each seq_file iteration and
+ * thus the stop condition should be encoded in it. As the return value is
+ * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable
+ * choice to indicate get_active failure.
+ *
+ * Unfortunately, this is complicated due to the optional custom seq_file
+ * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop()
+ * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or
+ * custom seq_file operations and thus can't decide whether put_active
+ * should be performed or not only on ERR_PTR(-ENODEV).
+ *
+ * This is worked around by factoring out the custom seq_stop() and
+ * put_active part into kernfs_seq_stop_active(), skipping it from
+ * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after
+ * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures
+ * that kernfs_seq_stop_active() is skipped only after get_active failure.
+ */
+static void kernfs_seq_stop_active(struct seq_file *sf, void *v)
+{
+ struct kernfs_open_file *of = sf->private;
+ const struct kernfs_ops *ops = kernfs_ops(of->kn);
+
+ if (ops->seq_stop)
+ ops->seq_stop(sf, v);
+ kernfs_put_active(of->kn);
+}
+
+static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
+{
+ struct kernfs_open_file *of = sf->private;
+ const struct kernfs_ops *ops;
+
+ /*
+ * @of->mutex nests outside active ref and is just to ensure that
+ * the ops aren't called concurrently for the same open file.
+ */
+ mutex_lock(&of->mutex);
+ if (!kernfs_get_active(of->kn))
+ return ERR_PTR(-ENODEV);
+
+ ops = kernfs_ops(of->kn);
+ if (ops->seq_start) {
+ void *next = ops->seq_start(sf, ppos);
+ /* see the comment above kernfs_seq_stop_active() */
+ if (next == ERR_PTR(-ENODEV))
+ kernfs_seq_stop_active(sf, next);
+ return next;
+ } else {
+ /*
+ * The same behavior and code as single_open(). Returns
+ * !NULL if pos is at the beginning; otherwise, NULL.
+ */
+ return NULL + !*ppos;
+ }
+}
+
+static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos)
+{
+ struct kernfs_open_file *of = sf->private;
+ const struct kernfs_ops *ops = kernfs_ops(of->kn);
+
+ if (ops->seq_next) {
+ void *next = ops->seq_next(sf, v, ppos);
+ /* see the comment above kernfs_seq_stop_active() */
+ if (next == ERR_PTR(-ENODEV))
+ kernfs_seq_stop_active(sf, next);
+ return next;
+ } else {
+ /*
+ * The same behavior and code as single_open(), always
+ * terminate after the initial read.
+ */
+ ++*ppos;
+ return NULL;
+ }
+}
+
+static void kernfs_seq_stop(struct seq_file *sf, void *v)
+{
+ struct kernfs_open_file *of = sf->private;
+
+ if (v != ERR_PTR(-ENODEV))
+ kernfs_seq_stop_active(sf, v);
+ mutex_unlock(&of->mutex);
+}
+
+static int kernfs_seq_show(struct seq_file *sf, void *v)
+{
+ struct kernfs_open_file *of = sf->private;
+
+ of->event = atomic_read(&of->kn->attr.open->event);
+
+ return of->kn->attr.ops->seq_show(sf, v);
+}
+
+static const struct seq_operations kernfs_seq_ops = {
+ .start = kernfs_seq_start,
+ .next = kernfs_seq_next,
+ .stop = kernfs_seq_stop,
+ .show = kernfs_seq_show,
+};
+
+/*
+ * As reading a bin file can have side-effects, the exact offset and bytes
+ * specified in read(2) call should be passed to the read callback making
+ * it difficult to use seq_file. Implement simplistic custom buffering for
+ * bin files.
+ */
+static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
+ char __user *user_buf, size_t count,
+ loff_t *ppos)
+{
+ ssize_t len = min_t(size_t, count, PAGE_SIZE);
+ const struct kernfs_ops *ops;
+ char *buf;
+
+ buf = kmalloc(len, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ /*
+ * @of->mutex nests outside active ref and is just to ensure that
+ * the ops aren't called concurrently for the same open file.
+ */
+ mutex_lock(&of->mutex);
+ if (!kernfs_get_active(of->kn)) {
+ len = -ENODEV;
+ mutex_unlock(&of->mutex);
+ goto out_free;
+ }
+
+ ops = kernfs_ops(of->kn);
+ if (ops->read)
+ len = ops->read(of, buf, len, *ppos);
+ else
+ len = -EINVAL;
+
+ kernfs_put_active(of->kn);
+ mutex_unlock(&of->mutex);
+
+ if (len < 0)
+ goto out_free;
+
+ if (copy_to_user(user_buf, buf, len)) {
+ len = -EFAULT;
+ goto out_free;
+ }
+
+ *ppos += len;
+
+ out_free:
+ kfree(buf);
+ return len;
+}
+
+/**
+ * kernfs_fop_read - kernfs vfs read callback
+ * @file: file pointer
+ * @user_buf: data to write
+ * @count: number of bytes
+ * @ppos: starting offset
+ */
+static ssize_t kernfs_fop_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct kernfs_open_file *of = kernfs_of(file);
+
+ if (of->kn->flags & KERNFS_HAS_SEQ_SHOW)
+ return seq_read(file, user_buf, count, ppos);
+ else
+ return kernfs_file_direct_read(of, user_buf, count, ppos);
+}
+
+/**
+ * kernfs_fop_write - kernfs vfs write callback
+ * @file: file pointer
+ * @user_buf: data to write
+ * @count: number of bytes
+ * @ppos: starting offset
+ *
+ * Copy data in from userland and pass it to the matching kernfs write
+ * operation.
+ *
+ * There is no easy way for us to know if userspace is only doing a partial
+ * write, so we don't support them. We expect the entire buffer to come on
+ * the first write. Hint: if you're writing a value, first read the file,
+ * modify only the the value you're changing, then write entire buffer
+ * back.
+ */
+static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct kernfs_open_file *of = kernfs_of(file);
+ const struct kernfs_ops *ops;
+ size_t len;
+ char *buf;
+
+ if (of->atomic_write_len) {
+ len = count;
+ if (len > of->atomic_write_len)
+ return -E2BIG;
+ } else {
+ len = min_t(size_t, count, PAGE_SIZE);
+ }
+
+ buf = kmalloc(len + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, user_buf, len)) {
+ len = -EFAULT;
+ goto out_free;
+ }
+ buf[len] = '\0'; /* guarantee string termination */
+
+ /*
+ * @of->mutex nests outside active ref and is just to ensure that
+ * the ops aren't called concurrently for the same open file.
+ */
+ mutex_lock(&of->mutex);
+ if (!kernfs_get_active(of->kn)) {
+ mutex_unlock(&of->mutex);
+ len = -ENODEV;
+ goto out_free;
+ }
+
+ ops = kernfs_ops(of->kn);
+ if (ops->write)
+ len = ops->write(of, buf, len, *ppos);
+ else
+ len = -EINVAL;
+
+ kernfs_put_active(of->kn);
+ mutex_unlock(&of->mutex);
+
+ if (len > 0)
+ *ppos += len;
+out_free:
+ kfree(buf);
+ return len;
+}
+
+static void kernfs_vma_open(struct vm_area_struct *vma)
+{
+ struct file *file = vma->vm_file;
+ struct kernfs_open_file *of = kernfs_of(file);
+
+ if (!of->vm_ops)
+ return;
+
+ if (!kernfs_get_active(of->kn))
+ return;
+
+ if (of->vm_ops->open)
+ of->vm_ops->open(vma);
+
+ kernfs_put_active(of->kn);
+}
+
+static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct file *file = vma->vm_file;
+ struct kernfs_open_file *of = kernfs_of(file);
+ int ret;
+
+ if (!of->vm_ops)
+ return VM_FAULT_SIGBUS;
+
+ if (!kernfs_get_active(of->kn))
+ return VM_FAULT_SIGBUS;
+
+ ret = VM_FAULT_SIGBUS;
+ if (of->vm_ops->fault)
+ ret = of->vm_ops->fault(vma, vmf);
+
+ kernfs_put_active(of->kn);
+ return ret;
+}
+
+static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ struct file *file = vma->vm_file;
+ struct kernfs_open_file *of = kernfs_of(file);
+ int ret;
+
+ if (!of->vm_ops)
+ return VM_FAULT_SIGBUS;
+
+ if (!kernfs_get_active(of->kn))
+ return VM_FAULT_SIGBUS;
+
+ ret = 0;
+ if (of->vm_ops->page_mkwrite)
+ ret = of->vm_ops->page_mkwrite(vma, vmf);
+ else
+ file_update_time(file);
+
+ kernfs_put_active(of->kn);
+ return ret;
+}
+
+static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr,
+ void *buf, int len, int write)
+{
+ struct file *file = vma->vm_file;
+ struct kernfs_open_file *of = kernfs_of(file);
+ int ret;
+
+ if (!of->vm_ops)
+ return -EINVAL;
+
+ if (!kernfs_get_active(of->kn))
+ return -EINVAL;
+
+ ret = -EINVAL;
+ if (of->vm_ops->access)
+ ret = of->vm_ops->access(vma, addr, buf, len, write);
+
+ kernfs_put_active(of->kn);
+ return ret;
+}
+
+#ifdef CONFIG_NUMA
+static int kernfs_vma_set_policy(struct vm_area_struct *vma,
+ struct mempolicy *new)
+{
+ struct file *file = vma->vm_file;
+ struct kernfs_open_file *of = kernfs_of(file);
+ int ret;
+
+ if (!of->vm_ops)
+ return 0;
+
+ if (!kernfs_get_active(of->kn))
+ return -EINVAL;
+
+ ret = 0;
+ if (of->vm_ops->set_policy)
+ ret = of->vm_ops->set_policy(vma, new);
+
+ kernfs_put_active(of->kn);
+ return ret;
+}
+
+static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct file *file = vma->vm_file;
+ struct kernfs_open_file *of = kernfs_of(file);
+ struct mempolicy *pol;
+
+ if (!of->vm_ops)
+ return vma->vm_policy;
+
+ if (!kernfs_get_active(of->kn))
+ return vma->vm_policy;
+
+ pol = vma->vm_policy;
+ if (of->vm_ops->get_policy)
+ pol = of->vm_ops->get_policy(vma, addr);
+
+ kernfs_put_active(of->kn);
+ return pol;
+}
+
+static int kernfs_vma_migrate(struct vm_area_struct *vma,
+ const nodemask_t *from, const nodemask_t *to,
+ unsigned long flags)
+{
+ struct file *file = vma->vm_file;
+ struct kernfs_open_file *of = kernfs_of(file);
+ int ret;
+
+ if (!of->vm_ops)
+ return 0;
+
+ if (!kernfs_get_active(of->kn))
+ return 0;
+
+ ret = 0;
+ if (of->vm_ops->migrate)
+ ret = of->vm_ops->migrate(vma, from, to, flags);
+
+ kernfs_put_active(of->kn);
+ return ret;
+}
+#endif
+
+static const struct vm_operations_struct kernfs_vm_ops = {
+ .open = kernfs_vma_open,
+ .fault = kernfs_vma_fault,
+ .page_mkwrite = kernfs_vma_page_mkwrite,
+ .access = kernfs_vma_access,
+#ifdef CONFIG_NUMA
+ .set_policy = kernfs_vma_set_policy,
+ .get_policy = kernfs_vma_get_policy,
+ .migrate = kernfs_vma_migrate,
+#endif
+};
+
+static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct kernfs_open_file *of = kernfs_of(file);
+ const struct kernfs_ops *ops;
+ int rc;
+
+ /*
+ * mmap path and of->mutex are prone to triggering spurious lockdep
+ * warnings and we don't want to add spurious locking dependency
+ * between the two. Check whether mmap is actually implemented
+ * without grabbing @of->mutex by testing HAS_MMAP flag. See the
+ * comment in kernfs_file_open() for more details.
+ */
+ if (!(of->kn->flags & KERNFS_HAS_MMAP))
+ return -ENODEV;
+
+ mutex_lock(&of->mutex);
+
+ rc = -ENODEV;
+ if (!kernfs_get_active(of->kn))
+ goto out_unlock;
+
+ ops = kernfs_ops(of->kn);
+ rc = ops->mmap(of, vma);
+ if (rc)
+ goto out_put;
+
+ /*
+ * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
+ * to satisfy versions of X which crash if the mmap fails: that
+ * substitutes a new vm_file, and we don't then want bin_vm_ops.
+ */
+ if (vma->vm_file != file)
+ goto out_put;
+
+ rc = -EINVAL;
+ if (of->mmapped && of->vm_ops != vma->vm_ops)
+ goto out_put;
+
+ /*
+ * It is not possible to successfully wrap close.
+ * So error if someone is trying to use close.
+ */
+ rc = -EINVAL;
+ if (vma->vm_ops && vma->vm_ops->close)
+ goto out_put;
+
+ rc = 0;
+ of->mmapped = 1;
+ of->vm_ops = vma->vm_ops;
+ vma->vm_ops = &kernfs_vm_ops;
+out_put:
+ kernfs_put_active(of->kn);
+out_unlock:
+ mutex_unlock(&of->mutex);
+
+ return rc;
+}
+
+/**
+ * kernfs_get_open_node - get or create kernfs_open_node
+ * @kn: target kernfs_node
+ * @of: kernfs_open_file for this instance of open
+ *
+ * If @kn->attr.open exists, increment its reference count; otherwise,
+ * create one. @of is chained to the files list.
+ *
+ * LOCKING:
+ * Kernel thread context (may sleep).
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int kernfs_get_open_node(struct kernfs_node *kn,
+ struct kernfs_open_file *of)
+{
+ struct kernfs_open_node *on, *new_on = NULL;
+
+ retry:
+ mutex_lock(&kernfs_open_file_mutex);
+ spin_lock_irq(&kernfs_open_node_lock);
+
+ if (!kn->attr.open && new_on) {
+ kn->attr.open = new_on;
+ new_on = NULL;
+ }
+
+ on = kn->attr.open;
+ if (on) {
+ atomic_inc(&on->refcnt);
+ list_add_tail(&of->list, &on->files);
+ }
+
+ spin_unlock_irq(&kernfs_open_node_lock);
+ mutex_unlock(&kernfs_open_file_mutex);
+
+ if (on) {
+ kfree(new_on);
+ return 0;
+ }
+
+ /* not there, initialize a new one and retry */
+ new_on = kmalloc(sizeof(*new_on), GFP_KERNEL);
+ if (!new_on)
+ return -ENOMEM;
+
+ atomic_set(&new_on->refcnt, 0);
+ atomic_set(&new_on->event, 1);
+ init_waitqueue_head(&new_on->poll);
+ INIT_LIST_HEAD(&new_on->files);
+ goto retry;
+}
+
+/**
+ * kernfs_put_open_node - put kernfs_open_node
+ * @kn: target kernfs_nodet
+ * @of: associated kernfs_open_file
+ *
+ * Put @kn->attr.open and unlink @of from the files list. If
+ * reference count reaches zero, disassociate and free it.
+ *
+ * LOCKING:
+ * None.
+ */
+static void kernfs_put_open_node(struct kernfs_node *kn,
+ struct kernfs_open_file *of)
+{
+ struct kernfs_open_node *on = kn->attr.open;
+ unsigned long flags;
+
+ mutex_lock(&kernfs_open_file_mutex);
+ spin_lock_irqsave(&kernfs_open_node_lock, flags);
+
+ if (of)
+ list_del(&of->list);
+
+ if (atomic_dec_and_test(&on->refcnt))
+ kn->attr.open = NULL;
+ else
+ on = NULL;
+
+ spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
+ mutex_unlock(&kernfs_open_file_mutex);
+
+ kfree(on);
+}
+
+static int kernfs_fop_open(struct inode *inode, struct file *file)
+{
+ struct kernfs_node *kn = file->f_path.dentry->d_fsdata;
+ struct kernfs_root *root = kernfs_root(kn);
+ const struct kernfs_ops *ops;
+ struct kernfs_open_file *of;
+ bool has_read, has_write, has_mmap;
+ int error = -EACCES;
+
+ if (!kernfs_get_active(kn))
+ return -ENODEV;
+
+ ops = kernfs_ops(kn);
+
+ has_read = ops->seq_show || ops->read || ops->mmap;
+ has_write = ops->write || ops->mmap;
+ has_mmap = ops->mmap;
+
+ /* see the flag definition for details */
+ if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) {
+ if ((file->f_mode & FMODE_WRITE) &&
+ (!(inode->i_mode & S_IWUGO) || !has_write))
+ goto err_out;
+
+ if ((file->f_mode & FMODE_READ) &&
+ (!(inode->i_mode & S_IRUGO) || !has_read))
+ goto err_out;
+ }
+
+ /* allocate a kernfs_open_file for the file */
+ error = -ENOMEM;
+ of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL);
+ if (!of)
+ goto err_out;
+
+ /*
+ * The following is done to give a different lockdep key to
+ * @of->mutex for files which implement mmap. This is a rather
+ * crude way to avoid false positive lockdep warning around
+ * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and
+ * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under
+ * which mm->mmap_sem nests, while holding @of->mutex. As each
+ * open file has a separate mutex, it's okay as long as those don't
+ * happen on the same file. At this point, we can't easily give
+ * each file a separate locking class. Let's differentiate on
+ * whether the file has mmap or not for now.
+ *
+ * Both paths of the branch look the same. They're supposed to
+ * look that way and give @of->mutex different static lockdep keys.
+ */
+ if (has_mmap)
+ mutex_init(&of->mutex);
+ else
+ mutex_init(&of->mutex);
+
+ of->kn = kn;
+ of->file = file;
+
+ /*
+ * Write path needs to atomic_write_len outside active reference.
+ * Cache it in open_file. See kernfs_fop_write() for details.
+ */
+ of->atomic_write_len = ops->atomic_write_len;
+
+ /*
+ * Always instantiate seq_file even if read access doesn't use
+ * seq_file or is not requested. This unifies private data access
+ * and readable regular files are the vast majority anyway.
+ */
+ if (ops->seq_show)
+ error = seq_open(file, &kernfs_seq_ops);
+ else
+ error = seq_open(file, NULL);
+ if (error)
+ goto err_free;
+
+ ((struct seq_file *)file->private_data)->private = of;
+
+ /* seq_file clears PWRITE unconditionally, restore it if WRITE */
+ if (file->f_mode & FMODE_WRITE)
+ file->f_mode |= FMODE_PWRITE;
+
+ /* make sure we have open node struct */
+ error = kernfs_get_open_node(kn, of);
+ if (error)
+ goto err_close;
+
+ /* open succeeded, put active references */
+ kernfs_put_active(kn);
+ return 0;
+
+err_close:
+ seq_release(inode, file);
+err_free:
+ kfree(of);
+err_out:
+ kernfs_put_active(kn);
+ return error;
+}
+
+static int kernfs_fop_release(struct inode *inode, struct file *filp)
+{
+ struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
+ struct kernfs_open_file *of = kernfs_of(filp);
+
+ kernfs_put_open_node(kn, of);
+ seq_release(inode, filp);
+ kfree(of);
+
+ return 0;
+}
+
+void kernfs_unmap_bin_file(struct kernfs_node *kn)
+{
+ struct kernfs_open_node *on;
+ struct kernfs_open_file *of;
+
+ if (!(kn->flags & KERNFS_HAS_MMAP))
+ return;
+
+ spin_lock_irq(&kernfs_open_node_lock);
+ on = kn->attr.open;
+ if (on)
+ atomic_inc(&on->refcnt);
+ spin_unlock_irq(&kernfs_open_node_lock);
+ if (!on)
+ return;
+
+ mutex_lock(&kernfs_open_file_mutex);
+ list_for_each_entry(of, &on->files, list) {
+ struct inode *inode = file_inode(of->file);
+ unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+ }
+ mutex_unlock(&kernfs_open_file_mutex);
+
+ kernfs_put_open_node(kn, NULL);
+}
+
+/*
+ * Kernfs attribute files are pollable. The idea is that you read
+ * the content and then you use 'poll' or 'select' to wait for
+ * the content to change. When the content changes (assuming the
+ * manager for the kobject supports notification), poll will
+ * return POLLERR|POLLPRI, and select will return the fd whether
+ * it is waiting for read, write, or exceptions.
+ * Once poll/select indicates that the value has changed, you
+ * need to close and re-open the file, or seek to 0 and read again.
+ * Reminder: this only works for attributes which actively support
+ * it, and it is not possible to test an attribute from userspace
+ * to see if it supports poll (Neither 'poll' nor 'select' return
+ * an appropriate error code). When in doubt, set a suitable timeout value.
+ */
+static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait)
+{
+ struct kernfs_open_file *of = kernfs_of(filp);
+ struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
+ struct kernfs_open_node *on = kn->attr.open;
+
+ /* need parent for the kobj, grab both */
+ if (!kernfs_get_active(kn))
+ goto trigger;
+
+ poll_wait(filp, &on->poll, wait);
+
+ kernfs_put_active(kn);
+
+ if (of->event != atomic_read(&on->event))
+ goto trigger;
+
+ return DEFAULT_POLLMASK;
+
+ trigger:
+ return DEFAULT_POLLMASK|POLLERR|POLLPRI;
+}
+
+static void kernfs_notify_workfn(struct work_struct *work)
+{
+ struct kernfs_node *kn;
+ struct kernfs_open_node *on;
+ struct kernfs_super_info *info;
+repeat:
+ /* pop one off the notify_list */
+ spin_lock_irq(&kernfs_notify_lock);
+ kn = kernfs_notify_list;
+ if (kn == KERNFS_NOTIFY_EOL) {
+ spin_unlock_irq(&kernfs_notify_lock);
+ return;
+ }
+ kernfs_notify_list = kn->attr.notify_next;
+ kn->attr.notify_next = NULL;
+ spin_unlock_irq(&kernfs_notify_lock);
+
+ /* kick poll */
+ spin_lock_irq(&kernfs_open_node_lock);
+
+ on = kn->attr.open;
+ if (on) {
+ atomic_inc(&on->event);
+ wake_up_interruptible(&on->poll);
+ }
+
+ spin_unlock_irq(&kernfs_open_node_lock);
+
+ /* kick fsnotify */
+ mutex_lock(&kernfs_mutex);
+
+ list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
+ struct inode *inode;
+ struct dentry *dentry;
+
+ inode = ilookup(info->sb, kn->ino);
+ if (!inode)
+ continue;
+
+ dentry = d_find_any_alias(inode);
+ if (dentry) {
+ fsnotify_parent(NULL, dentry, FS_MODIFY);
+ fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE,
+ NULL, 0);
+ dput(dentry);
+ }
+
+ iput(inode);
+ }
+
+ mutex_unlock(&kernfs_mutex);
+ kernfs_put(kn);
+ goto repeat;
+}
+
+/**
+ * kernfs_notify - notify a kernfs file
+ * @kn: file to notify
+ *
+ * Notify @kn such that poll(2) on @kn wakes up. Maybe be called from any
+ * context.
+ */
+void kernfs_notify(struct kernfs_node *kn)
+{
+ static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn);
+ unsigned long flags;
+
+ if (WARN_ON(kernfs_type(kn) != KERNFS_FILE))
+ return;
+
+ spin_lock_irqsave(&kernfs_notify_lock, flags);
+ if (!kn->attr.notify_next) {
+ kernfs_get(kn);
+ kn->attr.notify_next = kernfs_notify_list;
+ kernfs_notify_list = kn;
+ schedule_work(&kernfs_notify_work);
+ }
+ spin_unlock_irqrestore(&kernfs_notify_lock, flags);
+}
+EXPORT_SYMBOL_GPL(kernfs_notify);
+
+const struct file_operations kernfs_file_fops = {
+ .read = kernfs_fop_read,
+ .write = kernfs_fop_write,
+ .llseek = generic_file_llseek,
+ .mmap = kernfs_fop_mmap,
+ .open = kernfs_fop_open,
+ .release = kernfs_fop_release,
+ .poll = kernfs_fop_poll,
+};
+
+/**
+ * __kernfs_create_file - kernfs internal function to create a file
+ * @parent: directory to create the file in
+ * @name: name of the file
+ * @mode: mode of the file
+ * @size: size of the file
+ * @ops: kernfs operations for the file
+ * @priv: private data for the file
+ * @ns: optional namespace tag of the file
+ * @static_name: don't copy file name
+ * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
+ *
+ * Returns the created node on success, ERR_PTR() value on error.
+ */
+struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
+ const char *name,
+ umode_t mode, loff_t size,
+ const struct kernfs_ops *ops,
+ void *priv, const void *ns,
+ bool name_is_static,
+ struct lock_class_key *key)
+{
+ struct kernfs_node *kn;
+ unsigned flags;
+ int rc;
+
+ flags = KERNFS_FILE;
+ if (name_is_static)
+ flags |= KERNFS_STATIC_NAME;
+
+ kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags);
+ if (!kn)
+ return ERR_PTR(-ENOMEM);
+
+ kn->attr.ops = ops;
+ kn->attr.size = size;
+ kn->ns = ns;
+ kn->priv = priv;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ if (key) {
+ lockdep_init_map(&kn->dep_map, "s_active", key, 0);
+ kn->flags |= KERNFS_LOCKDEP;
+ }
+#endif
+
+ /*
+ * kn->attr.ops is accesible only while holding active ref. We
+ * need to know whether some ops are implemented outside active
+ * ref. Cache their existence in flags.
+ */
+ if (ops->seq_show)
+ kn->flags |= KERNFS_HAS_SEQ_SHOW;
+ if (ops->mmap)
+ kn->flags |= KERNFS_HAS_MMAP;
+
+ rc = kernfs_add_one(kn);
+ if (rc) {
+ kernfs_put(kn);
+ return ERR_PTR(rc);
+ }
+ return kn;
+}
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
new file mode 100644
index 00000000000..985217626e6
--- /dev/null
+++ b/fs/kernfs/inode.c
@@ -0,0 +1,383 @@
+/*
+ * fs/kernfs/inode.c - kernfs inode implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/backing-dev.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+
+#include "kernfs-internal.h"
+
+static const struct address_space_operations kernfs_aops = {
+ .readpage = simple_readpage,
+ .write_begin = simple_write_begin,
+ .write_end = simple_write_end,
+};
+
+static struct backing_dev_info kernfs_bdi = {
+ .name = "kernfs",
+ .ra_pages = 0, /* No readahead */
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
+};
+
+static const struct inode_operations kernfs_iops = {
+ .permission = kernfs_iop_permission,
+ .setattr = kernfs_iop_setattr,
+ .getattr = kernfs_iop_getattr,
+ .setxattr = kernfs_iop_setxattr,
+ .removexattr = kernfs_iop_removexattr,
+ .getxattr = kernfs_iop_getxattr,
+ .listxattr = kernfs_iop_listxattr,
+};
+
+void __init kernfs_inode_init(void)
+{
+ if (bdi_init(&kernfs_bdi))
+ panic("failed to init kernfs_bdi");
+}
+
+static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
+{
+ static DEFINE_MUTEX(iattr_mutex);
+ struct kernfs_iattrs *ret;
+ struct iattr *iattrs;
+
+ mutex_lock(&iattr_mutex);
+
+ if (kn->iattr)
+ goto out_unlock;
+
+ kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL);
+ if (!kn->iattr)
+ goto out_unlock;
+ iattrs = &kn->iattr->ia_iattr;
+
+ /* assign default attributes */
+ iattrs->ia_mode = kn->mode;
+ iattrs->ia_uid = GLOBAL_ROOT_UID;
+ iattrs->ia_gid = GLOBAL_ROOT_GID;
+ iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
+
+ simple_xattrs_init(&kn->iattr->xattrs);
+out_unlock:
+ ret = kn->iattr;
+ mutex_unlock(&iattr_mutex);
+ return ret;
+}
+
+static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
+{
+ struct kernfs_iattrs *attrs;
+ struct iattr *iattrs;
+ unsigned int ia_valid = iattr->ia_valid;
+
+ attrs = kernfs_iattrs(kn);
+ if (!attrs)
+ return -ENOMEM;
+
+ iattrs = &attrs->ia_iattr;
+
+ if (ia_valid & ATTR_UID)
+ iattrs->ia_uid = iattr->ia_uid;
+ if (ia_valid & ATTR_GID)
+ iattrs->ia_gid = iattr->ia_gid;
+ if (ia_valid & ATTR_ATIME)
+ iattrs->ia_atime = iattr->ia_atime;
+ if (ia_valid & ATTR_MTIME)
+ iattrs->ia_mtime = iattr->ia_mtime;
+ if (ia_valid & ATTR_CTIME)
+ iattrs->ia_ctime = iattr->ia_ctime;
+ if (ia_valid & ATTR_MODE) {
+ umode_t mode = iattr->ia_mode;
+ iattrs->ia_mode = kn->mode = mode;
+ }
+ return 0;
+}
+
+/**
+ * kernfs_setattr - set iattr on a node
+ * @kn: target node
+ * @iattr: iattr to set
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
+{
+ int ret;
+
+ mutex_lock(&kernfs_mutex);
+ ret = __kernfs_setattr(kn, iattr);
+ mutex_unlock(&kernfs_mutex);
+ return ret;
+}
+
+int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+ struct inode *inode = dentry->d_inode;
+ struct kernfs_node *kn = dentry->d_fsdata;
+ int error;
+
+ if (!kn)
+ return -EINVAL;
+
+ mutex_lock(&kernfs_mutex);
+ error = inode_change_ok(inode, iattr);
+ if (error)
+ goto out;
+
+ error = __kernfs_setattr(kn, iattr);
+ if (error)
+ goto out;
+
+ /* this ignores size changes */
+ setattr_copy(inode, iattr);
+
+out:
+ mutex_unlock(&kernfs_mutex);
+ return error;
+}
+
+static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata,
+ u32 *secdata_len)
+{
+ struct kernfs_iattrs *attrs;
+ void *old_secdata;
+ size_t old_secdata_len;
+
+ attrs = kernfs_iattrs(kn);
+ if (!attrs)
+ return -ENOMEM;
+
+ old_secdata = attrs->ia_secdata;
+ old_secdata_len = attrs->ia_secdata_len;
+
+ attrs->ia_secdata = *secdata;
+ attrs->ia_secdata_len = *secdata_len;
+
+ *secdata = old_secdata;
+ *secdata_len = old_secdata_len;
+ return 0;
+}
+
+int kernfs_iop_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct kernfs_node *kn = dentry->d_fsdata;
+ struct kernfs_iattrs *attrs;
+ void *secdata;
+ int error;
+ u32 secdata_len = 0;
+
+ attrs = kernfs_iattrs(kn);
+ if (!attrs)
+ return -ENOMEM;
+
+ if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
+ const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
+ error = security_inode_setsecurity(dentry->d_inode, suffix,
+ value, size, flags);
+ if (error)
+ return error;
+ error = security_inode_getsecctx(dentry->d_inode,
+ &secdata, &secdata_len);
+ if (error)
+ return error;
+
+ mutex_lock(&kernfs_mutex);
+ error = kernfs_node_setsecdata(kn, &secdata, &secdata_len);
+ mutex_unlock(&kernfs_mutex);
+
+ if (secdata)
+ security_release_secctx(secdata, secdata_len);
+ return error;
+ } else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) {
+ return simple_xattr_set(&attrs->xattrs, name, value, size,
+ flags);
+ }
+
+ return -EINVAL;
+}
+
+int kernfs_iop_removexattr(struct dentry *dentry, const char *name)
+{
+ struct kernfs_node *kn = dentry->d_fsdata;
+ struct kernfs_iattrs *attrs;
+
+ attrs = kernfs_iattrs(kn);
+ if (!attrs)
+ return -ENOMEM;
+
+ return simple_xattr_remove(&attrs->xattrs, name);
+}
+
+ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
+ size_t size)
+{
+ struct kernfs_node *kn = dentry->d_fsdata;
+ struct kernfs_iattrs *attrs;
+
+ attrs = kernfs_iattrs(kn);
+ if (!attrs)
+ return -ENOMEM;
+
+ return simple_xattr_get(&attrs->xattrs, name, buf, size);
+}
+
+ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
+{
+ struct kernfs_node *kn = dentry->d_fsdata;
+ struct kernfs_iattrs *attrs;
+
+ attrs = kernfs_iattrs(kn);
+ if (!attrs)
+ return -ENOMEM;
+
+ return simple_xattr_list(&attrs->xattrs, buf, size);
+}
+
+static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
+{
+ inode->i_mode = mode;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+}
+
+static inline void set_inode_attr(struct inode *inode, struct iattr *iattr)
+{
+ inode->i_uid = iattr->ia_uid;
+ inode->i_gid = iattr->ia_gid;
+ inode->i_atime = iattr->ia_atime;
+ inode->i_mtime = iattr->ia_mtime;
+ inode->i_ctime = iattr->ia_ctime;
+}
+
+static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
+{
+ struct kernfs_iattrs *attrs = kn->iattr;
+
+ inode->i_mode = kn->mode;
+ if (attrs) {
+ /*
+ * kernfs_node has non-default attributes get them from
+ * persistent copy in kernfs_node.
+ */
+ set_inode_attr(inode, &attrs->ia_iattr);
+ security_inode_notifysecctx(inode, attrs->ia_secdata,
+ attrs->ia_secdata_len);
+ }
+
+ if (kernfs_type(kn) == KERNFS_DIR)
+ set_nlink(inode, kn->dir.subdirs + 2);
+}
+
+int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct kernfs_node *kn = dentry->d_fsdata;
+ struct inode *inode = dentry->d_inode;
+
+ mutex_lock(&kernfs_mutex);
+ kernfs_refresh_inode(kn, inode);
+ mutex_unlock(&kernfs_mutex);
+
+ generic_fillattr(inode, stat);
+ return 0;
+}
+
+static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
+{
+ kernfs_get(kn);
+ inode->i_private = kn;
+ inode->i_mapping->a_ops = &kernfs_aops;
+ inode->i_mapping->backing_dev_info = &kernfs_bdi;
+ inode->i_op = &kernfs_iops;
+
+ set_default_inode_attr(inode, kn->mode);
+ kernfs_refresh_inode(kn, inode);
+
+ /* initialize inode according to type */
+ switch (kernfs_type(kn)) {
+ case KERNFS_DIR:
+ inode->i_op = &kernfs_dir_iops;
+ inode->i_fop = &kernfs_dir_fops;
+ break;
+ case KERNFS_FILE:
+ inode->i_size = kn->attr.size;
+ inode->i_fop = &kernfs_file_fops;
+ break;
+ case KERNFS_LINK:
+ inode->i_op = &kernfs_symlink_iops;
+ break;
+ default:
+ BUG();
+ }
+
+ unlock_new_inode(inode);
+}
+
+/**
+ * kernfs_get_inode - get inode for kernfs_node
+ * @sb: super block
+ * @kn: kernfs_node to allocate inode for
+ *
+ * Get inode for @kn. If such inode doesn't exist, a new inode is
+ * allocated and basics are initialized. New inode is returned
+ * locked.
+ *
+ * LOCKING:
+ * Kernel thread context (may sleep).
+ *
+ * RETURNS:
+ * Pointer to allocated inode on success, NULL on failure.
+ */
+struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
+{
+ struct inode *inode;
+
+ inode = iget_locked(sb, kn->ino);
+ if (inode && (inode->i_state & I_NEW))
+ kernfs_init_inode(kn, inode);
+
+ return inode;
+}
+
+/*
+ * The kernfs_node serves as both an inode and a directory entry for
+ * kernfs. To prevent the kernfs inode numbers from being freed
+ * prematurely we take a reference to kernfs_node from the kernfs inode. A
+ * super_operations.evict_inode() implementation is needed to drop that
+ * reference upon inode destruction.
+ */
+void kernfs_evict_inode(struct inode *inode)
+{
+ struct kernfs_node *kn = inode->i_private;
+
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+ kernfs_put(kn);
+}
+
+int kernfs_iop_permission(struct inode *inode, int mask)
+{
+ struct kernfs_node *kn;
+
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+
+ kn = inode->i_private;
+
+ mutex_lock(&kernfs_mutex);
+ kernfs_refresh_inode(kn, inode);
+ mutex_unlock(&kernfs_mutex);
+
+ return generic_permission(inode, mask);
+}
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
new file mode 100644
index 00000000000..dc84a3ef9ca
--- /dev/null
+++ b/fs/kernfs/kernfs-internal.h
@@ -0,0 +1,120 @@
+/*
+ * fs/kernfs/kernfs-internal.h - kernfs internal header file
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#ifndef __KERNFS_INTERNAL_H
+#define __KERNFS_INTERNAL_H
+
+#include <linux/lockdep.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/xattr.h>
+
+#include <linux/kernfs.h>
+
+struct kernfs_iattrs {
+ struct iattr ia_iattr;
+ void *ia_secdata;
+ u32 ia_secdata_len;
+
+ struct simple_xattrs xattrs;
+};
+
+/* +1 to avoid triggering overflow warning when negating it */
+#define KN_DEACTIVATED_BIAS (INT_MIN + 1)
+
+/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */
+
+/**
+ * kernfs_root - find out the kernfs_root a kernfs_node belongs to
+ * @kn: kernfs_node of interest
+ *
+ * Return the kernfs_root @kn belongs to.
+ */
+static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
+{
+ /* if parent exists, it's always a dir; otherwise, @sd is a dir */
+ if (kn->parent)
+ kn = kn->parent;
+ return kn->dir.root;
+}
+
+/*
+ * mount.c
+ */
+struct kernfs_super_info {
+ struct super_block *sb;
+
+ /*
+ * The root associated with this super_block. Each super_block is
+ * identified by the root and ns it's associated with.
+ */
+ struct kernfs_root *root;
+
+ /*
+ * Each sb is associated with one namespace tag, currently the
+ * network namespace of the task which mounted this kernfs
+ * instance. If multiple tags become necessary, make the following
+ * an array and compare kernfs_node tag against every entry.
+ */
+ const void *ns;
+
+ /* anchored at kernfs_root->supers, protected by kernfs_mutex */
+ struct list_head node;
+};
+#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
+
+extern const struct super_operations kernfs_sops;
+extern struct kmem_cache *kernfs_node_cache;
+
+/*
+ * inode.c
+ */
+struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);
+void kernfs_evict_inode(struct inode *inode);
+int kernfs_iop_permission(struct inode *inode, int mask);
+int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
+int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat);
+int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags);
+int kernfs_iop_removexattr(struct dentry *dentry, const char *name);
+ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
+ size_t size);
+ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
+void kernfs_inode_init(void);
+
+/*
+ * dir.c
+ */
+extern struct mutex kernfs_mutex;
+extern const struct dentry_operations kernfs_dops;
+extern const struct file_operations kernfs_dir_fops;
+extern const struct inode_operations kernfs_dir_iops;
+
+struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
+void kernfs_put_active(struct kernfs_node *kn);
+int kernfs_add_one(struct kernfs_node *kn);
+struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
+ const char *name, umode_t mode,
+ unsigned flags);
+
+/*
+ * file.c
+ */
+extern const struct file_operations kernfs_file_fops;
+
+void kernfs_unmap_bin_file(struct kernfs_node *kn);
+
+/*
+ * symlink.c
+ */
+extern const struct inode_operations kernfs_symlink_iops;
+
+#endif /* __KERNFS_INTERNAL_H */
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
new file mode 100644
index 00000000000..f973ae9b05f
--- /dev/null
+++ b/fs/kernfs/mount.c
@@ -0,0 +1,250 @@
+/*
+ * fs/kernfs/mount.c - kernfs mount implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/init.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+
+#include "kernfs-internal.h"
+
+struct kmem_cache *kernfs_node_cache;
+
+static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+ struct kernfs_root *root = kernfs_info(sb)->root;
+ struct kernfs_syscall_ops *scops = root->syscall_ops;
+
+ if (scops && scops->remount_fs)
+ return scops->remount_fs(root, flags, data);
+ return 0;
+}
+
+static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry)
+{
+ struct kernfs_root *root = kernfs_root(dentry->d_fsdata);
+ struct kernfs_syscall_ops *scops = root->syscall_ops;
+
+ if (scops && scops->show_options)
+ return scops->show_options(sf, root);
+ return 0;
+}
+
+const struct super_operations kernfs_sops = {
+ .statfs = simple_statfs,
+ .drop_inode = generic_delete_inode,
+ .evict_inode = kernfs_evict_inode,
+
+ .remount_fs = kernfs_sop_remount_fs,
+ .show_options = kernfs_sop_show_options,
+};
+
+/**
+ * kernfs_root_from_sb - determine kernfs_root associated with a super_block
+ * @sb: the super_block in question
+ *
+ * Return the kernfs_root associated with @sb. If @sb is not a kernfs one,
+ * %NULL is returned.
+ */
+struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
+{
+ if (sb->s_op == &kernfs_sops)
+ return kernfs_info(sb)->root;
+ return NULL;
+}
+
+static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
+{
+ struct kernfs_super_info *info = kernfs_info(sb);
+ struct inode *inode;
+ struct dentry *root;
+
+ info->sb = sb;
+ sb->s_blocksize = PAGE_CACHE_SIZE;
+ sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_magic = magic;
+ sb->s_op = &kernfs_sops;
+ sb->s_time_gran = 1;
+
+ /* get root inode, initialize and unlock it */
+ mutex_lock(&kernfs_mutex);
+ inode = kernfs_get_inode(sb, info->root->kn);
+ mutex_unlock(&kernfs_mutex);
+ if (!inode) {
+ pr_debug("kernfs: could not get root inode\n");
+ return -ENOMEM;
+ }
+
+ /* instantiate and link root dentry */
+ root = d_make_root(inode);
+ if (!root) {
+ pr_debug("%s: could not get root dentry!\n", __func__);
+ return -ENOMEM;
+ }
+ kernfs_get(info->root->kn);
+ root->d_fsdata = info->root->kn;
+ sb->s_root = root;
+ sb->s_d_op = &kernfs_dops;
+ return 0;
+}
+
+static int kernfs_test_super(struct super_block *sb, void *data)
+{
+ struct kernfs_super_info *sb_info = kernfs_info(sb);
+ struct kernfs_super_info *info = data;
+
+ return sb_info->root == info->root && sb_info->ns == info->ns;
+}
+
+static int kernfs_set_super(struct super_block *sb, void *data)
+{
+ int error;
+ error = set_anon_super(sb, data);
+ if (!error)
+ sb->s_fs_info = data;
+ return error;
+}
+
+/**
+ * kernfs_super_ns - determine the namespace tag of a kernfs super_block
+ * @sb: super_block of interest
+ *
+ * Return the namespace tag associated with kernfs super_block @sb.
+ */
+const void *kernfs_super_ns(struct super_block *sb)
+{
+ struct kernfs_super_info *info = kernfs_info(sb);
+
+ return info->ns;
+}
+
+/**
+ * kernfs_mount_ns - kernfs mount helper
+ * @fs_type: file_system_type of the fs being mounted
+ * @flags: mount flags specified for the mount
+ * @root: kernfs_root of the hierarchy being mounted
+ * @magic: file system specific magic number
+ * @new_sb_created: tell the caller if we allocated a new superblock
+ * @ns: optional namespace tag of the mount
+ *
+ * This is to be called from each kernfs user's file_system_type->mount()
+ * implementation, which should pass through the specified @fs_type and
+ * @flags, and specify the hierarchy and namespace tag to mount via @root
+ * and @ns, respectively.
+ *
+ * The return value can be passed to the vfs layer verbatim.
+ */
+struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
+ struct kernfs_root *root, unsigned long magic,
+ bool *new_sb_created, const void *ns)
+{
+ struct super_block *sb;
+ struct kernfs_super_info *info;
+ int error;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return ERR_PTR(-ENOMEM);
+
+ info->root = root;
+ info->ns = ns;
+
+ sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info);
+ if (IS_ERR(sb) || sb->s_fs_info != info)
+ kfree(info);
+ if (IS_ERR(sb))
+ return ERR_CAST(sb);
+
+ if (new_sb_created)
+ *new_sb_created = !sb->s_root;
+
+ if (!sb->s_root) {
+ struct kernfs_super_info *info = kernfs_info(sb);
+
+ error = kernfs_fill_super(sb, magic);
+ if (error) {
+ deactivate_locked_super(sb);
+ return ERR_PTR(error);
+ }
+ sb->s_flags |= MS_ACTIVE;
+
+ mutex_lock(&kernfs_mutex);
+ list_add(&info->node, &root->supers);
+ mutex_unlock(&kernfs_mutex);
+ }
+
+ return dget(sb->s_root);
+}
+
+/**
+ * kernfs_kill_sb - kill_sb for kernfs
+ * @sb: super_block being killed
+ *
+ * This can be used directly for file_system_type->kill_sb(). If a kernfs
+ * user needs extra cleanup, it can implement its own kill_sb() and call
+ * this function at the end.
+ */
+void kernfs_kill_sb(struct super_block *sb)
+{
+ struct kernfs_super_info *info = kernfs_info(sb);
+ struct kernfs_node *root_kn = sb->s_root->d_fsdata;
+
+ mutex_lock(&kernfs_mutex);
+ list_del(&info->node);
+ mutex_unlock(&kernfs_mutex);
+
+ /*
+ * Remove the superblock from fs_supers/s_instances
+ * so we can't find it, before freeing kernfs_super_info.
+ */
+ kill_anon_super(sb);
+ kfree(info);
+ kernfs_put(root_kn);
+}
+
+/**
+ * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root
+ * @kernfs_root: the kernfs_root in question
+ * @ns: the namespace tag
+ *
+ * Pin the superblock so the superblock won't be destroyed in subsequent
+ * operations. This can be used to block ->kill_sb() which may be useful
+ * for kernfs users which dynamically manage superblocks.
+ *
+ * Returns NULL if there's no superblock associated to this kernfs_root, or
+ * -EINVAL if the superblock is being freed.
+ */
+struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns)
+{
+ struct kernfs_super_info *info;
+ struct super_block *sb = NULL;
+
+ mutex_lock(&kernfs_mutex);
+ list_for_each_entry(info, &root->supers, node) {
+ if (info->ns == ns) {
+ sb = info->sb;
+ if (!atomic_inc_not_zero(&info->sb->s_active))
+ sb = ERR_PTR(-EINVAL);
+ break;
+ }
+ }
+ mutex_unlock(&kernfs_mutex);
+ return sb;
+}
+
+void __init kernfs_init(void)
+{
+ kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
+ sizeof(struct kernfs_node),
+ 0, SLAB_PANIC, NULL);
+ kernfs_inode_init();
+}
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
new file mode 100644
index 00000000000..8a198898e39
--- /dev/null
+++ b/fs/kernfs/symlink.c
@@ -0,0 +1,147 @@
+/*
+ * fs/kernfs/symlink.c - kernfs symlink implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/namei.h>
+
+#include "kernfs-internal.h"
+
+/**
+ * kernfs_create_link - create a symlink
+ * @parent: directory to create the symlink in
+ * @name: name of the symlink
+ * @target: target node for the symlink to point to
+ *
+ * Returns the created node on success, ERR_PTR() value on error.
+ */
+struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
+ const char *name,
+ struct kernfs_node *target)
+{
+ struct kernfs_node *kn;
+ int error;
+
+ kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK);
+ if (!kn)
+ return ERR_PTR(-ENOMEM);
+
+ if (kernfs_ns_enabled(parent))
+ kn->ns = target->ns;
+ kn->symlink.target_kn = target;
+ kernfs_get(target); /* ref owned by symlink */
+
+ error = kernfs_add_one(kn);
+ if (!error)
+ return kn;
+
+ kernfs_put(kn);
+ return ERR_PTR(error);
+}
+
+static int kernfs_get_target_path(struct kernfs_node *parent,
+ struct kernfs_node *target, char *path)
+{
+ struct kernfs_node *base, *kn;
+ char *s = path;
+ int len = 0;
+
+ /* go up to the root, stop at the base */
+ base = parent;
+ while (base->parent) {
+ kn = target->parent;
+ while (kn->parent && base != kn)
+ kn = kn->parent;
+
+ if (base == kn)
+ break;
+
+ strcpy(s, "../");
+ s += 3;
+ base = base->parent;
+ }
+
+ /* determine end of target string for reverse fillup */
+ kn = target;
+ while (kn->parent && kn != base) {
+ len += strlen(kn->name) + 1;
+ kn = kn->parent;
+ }
+
+ /* check limits */
+ if (len < 2)
+ return -EINVAL;
+ len--;
+ if ((s - path) + len > PATH_MAX)
+ return -ENAMETOOLONG;
+
+ /* reverse fillup of target string from target to base */
+ kn = target;
+ while (kn->parent && kn != base) {
+ int slen = strlen(kn->name);
+
+ len -= slen;
+ strncpy(s + len, kn->name, slen);
+ if (len)
+ s[--len] = '/';
+
+ kn = kn->parent;
+ }
+
+ return 0;
+}
+
+static int kernfs_getlink(struct dentry *dentry, char *path)
+{
+ struct kernfs_node *kn = dentry->d_fsdata;
+ struct kernfs_node *parent = kn->parent;
+ struct kernfs_node *target = kn->symlink.target_kn;
+ int error;
+
+ mutex_lock(&kernfs_mutex);
+ error = kernfs_get_target_path(parent, target, path);
+ mutex_unlock(&kernfs_mutex);
+
+ return error;
+}
+
+static void *kernfs_iop_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ int error = -ENOMEM;
+ unsigned long page = get_zeroed_page(GFP_KERNEL);
+ if (page) {
+ error = kernfs_getlink(dentry, (char *) page);
+ if (error < 0)
+ free_page((unsigned long)page);
+ }
+ nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
+ return NULL;
+}
+
+static void kernfs_iop_put_link(struct dentry *dentry, struct nameidata *nd,
+ void *cookie)
+{
+ char *page = nd_get_link(nd);
+ if (!IS_ERR(page))
+ free_page((unsigned long)page);
+}
+
+const struct inode_operations kernfs_symlink_iops = {
+ .setxattr = kernfs_iop_setxattr,
+ .removexattr = kernfs_iop_removexattr,
+ .getxattr = kernfs_iop_getxattr,
+ .listxattr = kernfs_iop_listxattr,
+ .readlink = generic_readlink,
+ .follow_link = kernfs_iop_follow_link,
+ .put_link = kernfs_iop_put_link,
+ .setattr = kernfs_iop_setattr,
+ .getattr = kernfs_iop_getattr,
+ .permission = kernfs_iop_permission,
+};
diff --git a/fs/libfs.c b/fs/libfs.c
index 8c501849315..88e3e00e2ec 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -3,6 +3,7 @@
* Library for filesystems writers.
*/
+#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
@@ -47,10 +48,16 @@ EXPORT_SYMBOL(simple_statfs);
* Retaining negative dentries for an in-memory filesystem just wastes
* memory and lookup time: arrange for them to be deleted immediately.
*/
-static int simple_delete_dentry(const struct dentry *dentry)
+int always_delete_dentry(const struct dentry *dentry)
{
return 1;
}
+EXPORT_SYMBOL(always_delete_dentry);
+
+const struct dentry_operations simple_dentry_operations = {
+ .d_delete = always_delete_dentry,
+};
+EXPORT_SYMBOL(simple_dentry_operations);
/*
* Lookup the data. This is trivial - if the dentry didn't already
@@ -58,10 +65,6 @@ static int simple_delete_dentry(const struct dentry *dentry)
*/
struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
- static const struct dentry_operations simple_dentry_operations = {
- .d_delete = simple_delete_dentry,
- };
-
if (dentry->d_name.len > NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
if (!dentry->d_sb->s_d_op)
@@ -921,16 +924,19 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
EXPORT_SYMBOL_GPL(generic_fh_to_parent);
/**
- * generic_file_fsync - generic fsync implementation for simple filesystems
+ * __generic_file_fsync - generic fsync implementation for simple filesystems
+ *
* @file: file to synchronize
+ * @start: start offset in bytes
+ * @end: end offset in bytes (inclusive)
* @datasync: only synchronize essential metadata if true
*
* This is a generic implementation of the fsync method for simple
* filesystems which track all non-inode metadata in the buffers list
* hanging off the address_space structure.
*/
-int generic_file_fsync(struct file *file, loff_t start, loff_t end,
- int datasync)
+int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync)
{
struct inode *inode = file->f_mapping->host;
int err;
@@ -950,10 +956,34 @@ int generic_file_fsync(struct file *file, loff_t start, loff_t end,
err = sync_inode_metadata(inode, 1);
if (ret == 0)
ret = err;
+
out:
mutex_unlock(&inode->i_mutex);
return ret;
}
+EXPORT_SYMBOL(__generic_file_fsync);
+
+/**
+ * generic_file_fsync - generic fsync implementation for simple filesystems
+ * with flush
+ * @file: file to synchronize
+ * @start: start offset in bytes
+ * @end: end offset in bytes (inclusive)
+ * @datasync: only synchronize essential metadata if true
+ *
+ */
+
+int generic_file_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync)
+{
+ struct inode *inode = file->f_mapping->host;
+ int err;
+
+ err = __generic_file_fsync(file, start, end, datasync);
+ if (err)
+ return err;
+ return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+}
EXPORT_SYMBOL(generic_file_fsync);
/**
@@ -1002,3 +1032,46 @@ void kfree_put_link(struct dentry *dentry, struct nameidata *nd,
kfree(s);
}
EXPORT_SYMBOL(kfree_put_link);
+
+/*
+ * nop .set_page_dirty method so that people can use .page_mkwrite on
+ * anon inodes.
+ */
+static int anon_set_page_dirty(struct page *page)
+{
+ return 0;
+};
+
+/*
+ * A single inode exists for all anon_inode files. Contrary to pipes,
+ * anon_inode inodes have no associated per-instance data, so we need
+ * only allocate one of them.
+ */
+struct inode *alloc_anon_inode(struct super_block *s)
+{
+ static const struct address_space_operations anon_aops = {
+ .set_page_dirty = anon_set_page_dirty,
+ };
+ struct inode *inode = new_inode_pseudo(s);
+
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ inode->i_ino = get_next_ino();
+ inode->i_mapping->a_ops = &anon_aops;
+
+ /*
+ * Mark the inode dirty from the very beginning,
+ * that way it will never be moved to the dirty
+ * list because mark_inode_dirty() will think
+ * that it already _is_ on the dirty list.
+ */
+ inode->i_state = I_DIRTY;
+ inode->i_mode = S_IRUSR | S_IWUSR;
+ inode->i_uid = current_fsuid();
+ inode->i_gid = current_fsgid();
+ inode->i_flags |= S_PRIVATE;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ return inode;
+}
+EXPORT_SYMBOL(alloc_anon_inode);
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
index 00ec0b9c94d..d3e40db2893 100644
--- a/fs/lockd/clnt4xdr.c
+++ b/fs/lockd/clnt4xdr.c
@@ -14,6 +14,8 @@
#include <linux/sunrpc/stats.h>
#include <linux/lockd/lockd.h>
+#include <uapi/linux/nfs3.h>
+
#define NLMDBG_FACILITY NLMDBG_XDR
#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
index 9a55797a1cd..3e9f7874b97 100644
--- a/fs/lockd/clntxdr.c
+++ b/fs/lockd/clntxdr.c
@@ -15,6 +15,8 @@
#include <linux/sunrpc/stats.h>
#include <linux/lockd/lockd.h>
+#include <uapi/linux/nfs2.h>
+
#define NLMDBG_FACILITY NLMDBG_XDR
#if (NLMCLNT_OHSIZE > XDR_MAX_NETOBJ)
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 10d6c41aeca..8f27c93f8d2 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -235,6 +235,7 @@ out_err:
if (warned++ == 0)
printk(KERN_WARNING
"lockd_up: makesock failed, error=%d\n", err);
+ svc_shutdown_net(serv, net);
return err;
}
@@ -435,7 +436,7 @@ EXPORT_SYMBOL_GPL(lockd_down);
* Sysctl parameters (same as module parameters, different interface).
*/
-static ctl_table nlm_sysctls[] = {
+static struct ctl_table nlm_sysctls[] = {
{
.procname = "nlm_grace_period",
.data = &nlm_grace_period,
@@ -489,7 +490,7 @@ static ctl_table nlm_sysctls[] = {
{ }
};
-static ctl_table nlm_sysctl_dir[] = {
+static struct ctl_table nlm_sysctl_dir[] = {
{
.procname = "nfs",
.mode = 0555,
@@ -498,7 +499,7 @@ static ctl_table nlm_sysctl_dir[] = {
{ }
};
-static ctl_table nlm_sysctl_root[] = {
+static struct ctl_table nlm_sysctl_root[] = {
{
.procname = "fs",
.mode = 0555,
@@ -621,8 +622,8 @@ static int __init init_nlm(void)
err_pernet:
#ifdef CONFIG_SYSCTL
unregister_sysctl_table(nlm_sysctl_table);
-#endif
err_sysctl:
+#endif
return err;
}
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index e066a390297..ab798a88ec1 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -779,6 +779,7 @@ nlmsvc_grant_blocked(struct nlm_block *block)
struct nlm_file *file = block->b_file;
struct nlm_lock *lock = &block->b_call->a_args.lock;
int error;
+ loff_t fl_start, fl_end;
dprintk("lockd: grant blocked lock %p\n", block);
@@ -796,9 +797,16 @@ nlmsvc_grant_blocked(struct nlm_block *block)
}
/* Try the lock operation again */
+ /* vfs_lock_file() can mangle fl_start and fl_end, but we need
+ * them unchanged for the GRANT_MSG
+ */
lock->fl.fl_flags |= FL_SLEEP;
+ fl_start = lock->fl.fl_start;
+ fl_end = lock->fl.fl_end;
error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
lock->fl.fl_flags &= ~FL_SLEEP;
+ lock->fl.fl_start = fl_start;
+ lock->fl.fl_end = fl_end;
switch (error) {
case 0:
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index dc5c75930f0..b6f3b84b6e9 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -14,12 +14,11 @@
#include <linux/mutex.h>
#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/addr.h>
-#include <linux/nfsd/nfsfh.h>
-#include <linux/nfsd/export.h>
#include <linux/lockd/lockd.h>
#include <linux/lockd/share.h>
#include <linux/module.h>
#include <linux/mount.h>
+#include <uapi/linux/nfs2.h>
#define NLMDBG_FACILITY NLMDBG_SVCSUBS
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 964666c68a8..9340e7e10ef 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -16,6 +16,8 @@
#include <linux/sunrpc/stats.h>
#include <linux/lockd/lockd.h>
+#include <uapi/linux/nfs2.h>
+
#define NLMDBG_FACILITY NLMDBG_XDR
diff --git a/fs/locks.c b/fs/locks.c
index b27a3005d78..717fbc404e6 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -130,11 +130,15 @@
#include <linux/percpu.h>
#include <linux/lglock.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/filelock.h>
+
#include <asm/uaccess.h>
#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
-#define IS_LEASE(fl) (fl->fl_flags & FL_LEASE)
+#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG))
+#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK)
static bool lease_breaking(struct file_lock *fl)
{
@@ -321,6 +325,7 @@ static int flock_make_lock(struct file *filp, struct file_lock **lock,
return -ENOMEM;
fl->fl_file = filp;
+ fl->fl_owner = (fl_owner_t)filp;
fl->fl_pid = current->tgid;
fl->fl_flags = FL_FLOCK;
fl->fl_type = type;
@@ -344,48 +349,43 @@ static int assign_type(struct file_lock *fl, long type)
return 0;
}
-/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX
- * style lock.
- */
-static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
- struct flock *l)
+static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
+ struct flock64 *l)
{
- off_t start, end;
-
switch (l->l_whence) {
case SEEK_SET:
- start = 0;
+ fl->fl_start = 0;
break;
case SEEK_CUR:
- start = filp->f_pos;
+ fl->fl_start = filp->f_pos;
break;
case SEEK_END:
- start = i_size_read(file_inode(filp));
+ fl->fl_start = i_size_read(file_inode(filp));
break;
default:
return -EINVAL;
}
+ if (l->l_start > OFFSET_MAX - fl->fl_start)
+ return -EOVERFLOW;
+ fl->fl_start += l->l_start;
+ if (fl->fl_start < 0)
+ return -EINVAL;
/* POSIX-1996 leaves the case l->l_len < 0 undefined;
POSIX-2001 defines it. */
- start += l->l_start;
- if (start < 0)
- return -EINVAL;
- fl->fl_end = OFFSET_MAX;
if (l->l_len > 0) {
- end = start + l->l_len - 1;
- fl->fl_end = end;
+ if (l->l_len - 1 > OFFSET_MAX - fl->fl_start)
+ return -EOVERFLOW;
+ fl->fl_end = fl->fl_start + l->l_len - 1;
+
} else if (l->l_len < 0) {
- end = start - 1;
- fl->fl_end = end;
- start += l->l_len;
- if (start < 0)
+ if (fl->fl_start + l->l_len < 0)
return -EINVAL;
- }
- fl->fl_start = start; /* we record the absolute position */
- if (fl->fl_end < fl->fl_start)
- return -EOVERFLOW;
-
+ fl->fl_end = fl->fl_start - 1;
+ fl->fl_start += l->l_len;
+ } else
+ fl->fl_end = OFFSET_MAX;
+
fl->fl_owner = current->files;
fl->fl_pid = current->tgid;
fl->fl_file = filp;
@@ -396,52 +396,21 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
return assign_type(fl, l->l_type);
}
-#if BITS_PER_LONG == 32
-static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
- struct flock64 *l)
+/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX
+ * style lock.
+ */
+static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
+ struct flock *l)
{
- loff_t start;
-
- switch (l->l_whence) {
- case SEEK_SET:
- start = 0;
- break;
- case SEEK_CUR:
- start = filp->f_pos;
- break;
- case SEEK_END:
- start = i_size_read(file_inode(filp));
- break;
- default:
- return -EINVAL;
- }
-
- start += l->l_start;
- if (start < 0)
- return -EINVAL;
- fl->fl_end = OFFSET_MAX;
- if (l->l_len > 0) {
- fl->fl_end = start + l->l_len - 1;
- } else if (l->l_len < 0) {
- fl->fl_end = start - 1;
- start += l->l_len;
- if (start < 0)
- return -EINVAL;
- }
- fl->fl_start = start; /* we record the absolute position */
- if (fl->fl_end < fl->fl_start)
- return -EOVERFLOW;
-
- fl->fl_owner = current->files;
- fl->fl_pid = current->tgid;
- fl->fl_file = filp;
- fl->fl_flags = FL_POSIX;
- fl->fl_ops = NULL;
- fl->fl_lmops = NULL;
+ struct flock64 ll = {
+ .l_type = l->l_type,
+ .l_whence = l->l_whence,
+ .l_start = l->l_start,
+ .l_len = l->l_len,
+ };
- return assign_type(fl, l->l_type);
+ return flock64_to_posix_lock(filp, fl, &ll);
}
-#endif
/* default lease lock manager operations */
static void lease_break_callback(struct file_lock *fl)
@@ -462,7 +431,7 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl)
if (assign_type(fl, type) != 0)
return -EINVAL;
- fl->fl_owner = current->files;
+ fl->fl_owner = (fl_owner_t)current->files;
fl->fl_pid = current->tgid;
fl->fl_file = filp;
@@ -511,8 +480,7 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
}
/* Must be called with the i_lock held! */
-static inline void
-locks_insert_global_locks(struct file_lock *fl)
+static void locks_insert_global_locks(struct file_lock *fl)
{
lg_local_lock(&file_lock_lglock);
fl->fl_link_cpu = smp_processor_id();
@@ -521,8 +489,7 @@ locks_insert_global_locks(struct file_lock *fl)
}
/* Must be called with the i_lock held! */
-static inline void
-locks_delete_global_locks(struct file_lock *fl)
+static void locks_delete_global_locks(struct file_lock *fl)
{
/*
* Avoid taking lock if already unhashed. This is safe since this check
@@ -544,14 +511,12 @@ posix_owner_key(struct file_lock *fl)
return (unsigned long)fl->fl_owner;
}
-static inline void
-locks_insert_global_blocked(struct file_lock *waiter)
+static void locks_insert_global_blocked(struct file_lock *waiter)
{
hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter));
}
-static inline void
-locks_delete_global_blocked(struct file_lock *waiter)
+static void locks_delete_global_blocked(struct file_lock *waiter)
{
hash_del(&waiter->fl_link);
}
@@ -581,7 +546,7 @@ static void locks_delete_block(struct file_lock *waiter)
* it seems like the reasonable thing to do.
*
* Must be called with both the i_lock and blocked_lock_lock held. The fl_block
- * list itself is protected by the file_lock_list, but by ensuring that the
+ * list itself is protected by the blocked_lock_lock, but by ensuring that the
* i_lock is also held on insertions we can avoid taking the blocked_lock_lock
* in some cases when we see that the fl_block list is empty.
*/
@@ -591,7 +556,7 @@ static void __locks_insert_block(struct file_lock *blocker,
BUG_ON(!list_empty(&waiter->fl_block));
waiter->fl_next = blocker;
list_add_tail(&waiter->fl_block, &blocker->fl_block);
- if (IS_POSIX(blocker))
+ if (IS_POSIX(blocker) && !IS_OFDLCK(blocker))
locks_insert_global_blocked(waiter);
}
@@ -652,15 +617,18 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
locks_insert_global_locks(fl);
}
-/*
- * Delete a lock and then free it.
- * Wake up processes that are blocked waiting for this lock,
- * notify the FS that the lock has been cleared and
- * finally free the lock.
+/**
+ * locks_delete_lock - Delete a lock and then free it.
+ * @thisfl_p: pointer that points to the fl_next field of the previous
+ * inode->i_flock list entry
+ *
+ * Unlink a lock from all lists and free the namespace reference, but don't
+ * free it yet. Wake up processes that are blocked waiting for this lock and
+ * notify the FS that the lock has been cleared.
*
* Must be called with the i_lock held!
*/
-static void locks_delete_lock(struct file_lock **thisfl_p)
+static void locks_unlink_lock(struct file_lock **thisfl_p)
{
struct file_lock *fl = *thisfl_p;
@@ -675,6 +643,18 @@ static void locks_delete_lock(struct file_lock **thisfl_p)
}
locks_wake_up_blocks(fl);
+}
+
+/*
+ * Unlink a lock from all lists and free it.
+ *
+ * Must be called with i_lock held!
+ */
+static void locks_delete_lock(struct file_lock **thisfl_p)
+{
+ struct file_lock *fl = *thisfl_p;
+
+ locks_unlink_lock(thisfl_p);
locks_free_lock(fl);
}
@@ -769,8 +749,16 @@ EXPORT_SYMBOL(posix_test_lock);
* Note: the above assumption may not be true when handling lock
* requests from a broken NFS client. It may also fail in the presence
* of tasks (such as posix threads) sharing the same open file table.
- *
* To handle those cases, we just bail out after a few iterations.
+ *
+ * For FL_OFDLCK locks, the owner is the filp, not the files_struct.
+ * Because the owner is not even nominally tied to a thread of
+ * execution, the deadlock detection below can't reasonably work well. Just
+ * skip it for those.
+ *
+ * In principle, we could do a more limited deadlock detection on FL_OFDLCK
+ * locks that just checks for the case where two tasks are attempting to
+ * upgrade from read to write locks on the same inode.
*/
#define MAX_DEADLK_ITERATIONS 10
@@ -793,6 +781,13 @@ static int posix_locks_deadlock(struct file_lock *caller_fl,
{
int i = 0;
+ /*
+ * This deadlock detector can't reasonably detect deadlocks with
+ * FL_OFDLCK locks, since they aren't owned by a process, per-se.
+ */
+ if (IS_OFDLCK(caller_fl))
+ return 0;
+
while ((block_fl = what_owner_is_waiting_for(block_fl))) {
if (i++ > MAX_DEADLK_ITERATIONS)
return 0;
@@ -1152,13 +1147,14 @@ EXPORT_SYMBOL(posix_lock_file_wait);
/**
* locks_mandatory_locked - Check for an active lock
- * @inode: the file to check
+ * @file: the file to check
*
* Searches the inode's list of locks to find any POSIX locks which conflict.
* This function is called from locks_verify_locked() only.
*/
-int locks_mandatory_locked(struct inode *inode)
+int locks_mandatory_locked(struct file *file)
{
+ struct inode *inode = file_inode(file);
fl_owner_t owner = current->files;
struct file_lock *fl;
@@ -1169,7 +1165,7 @@ int locks_mandatory_locked(struct inode *inode)
for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
if (!IS_POSIX(fl))
continue;
- if (fl->fl_owner != owner)
+ if (fl->fl_owner != owner && fl->fl_owner != (fl_owner_t)file)
break;
}
spin_unlock(&inode->i_lock);
@@ -1195,19 +1191,30 @@ int locks_mandatory_area(int read_write, struct inode *inode,
{
struct file_lock fl;
int error;
+ bool sleep = false;
locks_init_lock(&fl);
- fl.fl_owner = current->files;
fl.fl_pid = current->tgid;
fl.fl_file = filp;
fl.fl_flags = FL_POSIX | FL_ACCESS;
if (filp && !(filp->f_flags & O_NONBLOCK))
- fl.fl_flags |= FL_SLEEP;
+ sleep = true;
fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK;
fl.fl_start = offset;
fl.fl_end = offset + count - 1;
for (;;) {
+ if (filp) {
+ fl.fl_owner = (fl_owner_t)filp;
+ fl.fl_flags &= ~FL_SLEEP;
+ error = __posix_lock_file(inode, &fl, NULL);
+ if (!error)
+ break;
+ }
+
+ if (sleep)
+ fl.fl_flags |= FL_SLEEP;
+ fl.fl_owner = current->files;
error = __posix_lock_file(inode, &fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
@@ -1283,6 +1290,7 @@ static void time_out_leases(struct inode *inode)
before = &inode->i_flock;
while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) {
+ trace_time_out_leases(inode, fl);
if (past_time(fl->fl_downgrade_time))
lease_modify(before, F_RDLCK);
if (past_time(fl->fl_break_time))
@@ -1292,28 +1300,40 @@ static void time_out_leases(struct inode *inode)
}
}
+static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
+{
+ if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE))
+ return false;
+ return locks_conflict(breaker, lease);
+}
+
/**
* __break_lease - revoke all outstanding leases on file
* @inode: the inode of the file to return
- * @mode: the open mode (read or write)
+ * @mode: O_RDONLY: break only write leases; O_WRONLY or O_RDWR:
+ * break all leases
+ * @type: FL_LEASE: break leases and delegations; FL_DELEG: break
+ * only delegations
*
* break_lease (inlined for speed) has checked there already is at least
* some kind of lock (maybe a lease) on this file. Leases are broken on
* a call to open() or truncate(). This function can sleep unless you
* specified %O_NONBLOCK to your open().
*/
-int __break_lease(struct inode *inode, unsigned int mode)
+int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
{
int error = 0;
struct file_lock *new_fl, *flock;
struct file_lock *fl;
unsigned long break_time;
int i_have_this_lease = 0;
+ bool lease_conflict = false;
int want_write = (mode & O_ACCMODE) != O_RDONLY;
new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
if (IS_ERR(new_fl))
return PTR_ERR(new_fl);
+ new_fl->fl_flags = type;
spin_lock(&inode->i_lock);
@@ -1323,13 +1343,16 @@ int __break_lease(struct inode *inode, unsigned int mode)
if ((flock == NULL) || !IS_LEASE(flock))
goto out;
- if (!locks_conflict(flock, new_fl))
+ for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) {
+ if (leases_conflict(fl, new_fl)) {
+ lease_conflict = true;
+ if (fl->fl_owner == current->files)
+ i_have_this_lease = 1;
+ }
+ }
+ if (!lease_conflict)
goto out;
- for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next)
- if (fl->fl_owner == current->files)
- i_have_this_lease = 1;
-
break_time = 0;
if (lease_break_time > 0) {
break_time = jiffies + lease_break_time * HZ;
@@ -1338,6 +1361,8 @@ int __break_lease(struct inode *inode, unsigned int mode)
}
for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) {
+ if (!leases_conflict(fl, new_fl))
+ continue;
if (want_write) {
if (fl->fl_flags & FL_UNLOCK_PENDING)
continue;
@@ -1353,22 +1378,24 @@ int __break_lease(struct inode *inode, unsigned int mode)
}
if (i_have_this_lease || (mode & O_NONBLOCK)) {
+ trace_break_lease_noblock(inode, new_fl);
error = -EWOULDBLOCK;
goto out;
}
restart:
break_time = flock->fl_break_time;
- if (break_time != 0) {
+ if (break_time != 0)
break_time -= jiffies;
- if (break_time == 0)
- break_time++;
- }
+ if (break_time == 0)
+ break_time++;
locks_insert_block(flock, new_fl);
+ trace_break_lease_block(inode, new_fl);
spin_unlock(&inode->i_lock);
error = wait_event_interruptible_timeout(new_fl->fl_wait,
!new_fl->fl_next, break_time);
spin_lock(&inode->i_lock);
+ trace_break_lease_unblock(inode, new_fl);
locks_delete_block(new_fl);
if (error >= 0) {
if (error == 0)
@@ -1379,7 +1406,7 @@ restart:
*/
for (flock = inode->i_flock; flock && IS_LEASE(flock);
flock = flock->fl_next) {
- if (locks_conflict(new_fl, flock))
+ if (leases_conflict(new_fl, flock))
goto restart;
}
error = 0;
@@ -1455,21 +1482,63 @@ int fcntl_getlease(struct file *filp)
return type;
}
+/**
+ * check_conflicting_open - see if the given dentry points to a file that has
+ * an existing open that would conflict with the
+ * desired lease.
+ * @dentry: dentry to check
+ * @arg: type of lease that we're trying to acquire
+ *
+ * Check to see if there's an existing open fd on this file that would
+ * conflict with the lease we're trying to set.
+ */
+static int
+check_conflicting_open(const struct dentry *dentry, const long arg)
+{
+ int ret = 0;
+ struct inode *inode = dentry->d_inode;
+
+ if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
+ return -EAGAIN;
+
+ if ((arg == F_WRLCK) && ((d_count(dentry) > 1) ||
+ (atomic_read(&inode->i_count) > 1)))
+ ret = -EAGAIN;
+
+ return ret;
+}
+
static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
{
struct file_lock *fl, **before, **my_before = NULL, *lease;
struct dentry *dentry = filp->f_path.dentry;
struct inode *inode = dentry->d_inode;
+ bool is_deleg = (*flp)->fl_flags & FL_DELEG;
int error;
lease = *flp;
+ trace_generic_add_lease(inode, lease);
- error = -EAGAIN;
- if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
- goto out;
- if ((arg == F_WRLCK)
- && ((d_count(dentry) > 1)
- || (atomic_read(&inode->i_count) > 1)))
+ /*
+ * In the delegation case we need mutual exclusion with
+ * a number of operations that take the i_mutex. We trylock
+ * because delegations are an optional optimization, and if
+ * there's some chance of a conflict--we'd rather not
+ * bother, maybe that's a sign this just isn't a good file to
+ * hand out a delegation on.
+ */
+ if (is_deleg && !mutex_trylock(&inode->i_mutex))
+ return -EAGAIN;
+
+ if (is_deleg && arg == F_WRLCK) {
+ /* Write delegations are not currently supported: */
+ mutex_unlock(&inode->i_mutex);
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+
+ error = check_conflicting_open(dentry, arg);
+ if (error)
goto out;
/*
@@ -1514,9 +1583,22 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
goto out;
locks_insert_lock(before, lease);
- return 0;
-
+ /*
+ * The check in break_lease() is lockless. It's possible for another
+ * open to race in after we did the earlier check for a conflicting
+ * open but before the lease was inserted. Check again for a
+ * conflicting open and cancel the lease if there is one.
+ *
+ * We also add a barrier here to ensure that the insertion of the lock
+ * precedes these checks.
+ */
+ smp_mb();
+ error = check_conflicting_open(dentry, arg);
+ if (error)
+ locks_unlink_lock(flp);
out:
+ if (is_deleg)
+ mutex_unlock(&inode->i_mutex);
return error;
}
@@ -1526,6 +1608,8 @@ static int generic_delete_lease(struct file *filp, struct file_lock **flp)
struct dentry *dentry = filp->f_path.dentry;
struct inode *inode = dentry->d_inode;
+ trace_generic_delete_lease(inode, *flp);
+
for (before = &inode->i_flock;
((fl = *before) != NULL) && IS_LEASE(fl);
before = &fl->fl_next) {
@@ -1579,7 +1663,7 @@ EXPORT_SYMBOL(generic_setlease);
static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
{
- if (filp->f_op && filp->f_op->setlease)
+ if (filp->f_op->setlease)
return filp->f_op->setlease(filp, arg, lease);
else
return generic_setlease(filp, arg, lease);
@@ -1771,7 +1855,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
if (error)
goto out_free;
- if (f.file->f_op && f.file->f_op->flock)
+ if (f.file->f_op->flock)
error = f.file->f_op->flock(f.file,
(can_sleep) ? F_SETLKW : F_SETLK,
lock);
@@ -1797,7 +1881,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
*/
int vfs_test_lock(struct file *filp, struct file_lock *fl)
{
- if (filp->f_op && filp->f_op->lock)
+ if (filp->f_op->lock)
return filp->f_op->lock(filp, F_GETLK, fl);
posix_test_lock(filp, fl);
return 0;
@@ -1806,7 +1890,7 @@ EXPORT_SYMBOL_GPL(vfs_test_lock);
static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
{
- flock->l_pid = fl->fl_pid;
+ flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid;
#if BITS_PER_LONG == 32
/*
* Make sure we can represent the posix lock via
@@ -1828,7 +1912,7 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
#if BITS_PER_LONG == 32
static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
{
- flock->l_pid = fl->fl_pid;
+ flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid;
flock->l_start = fl->fl_start;
flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
fl->fl_end - fl->fl_start + 1;
@@ -1840,7 +1924,7 @@ static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
/* Report the first existing lock that would conflict with l.
* This implements the F_GETLK command of fcntl().
*/
-int fcntl_getlk(struct file *filp, struct flock __user *l)
+int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l)
{
struct file_lock file_lock;
struct flock flock;
@@ -1857,6 +1941,16 @@ int fcntl_getlk(struct file *filp, struct flock __user *l)
if (error)
goto out;
+ if (cmd == F_OFD_GETLK) {
+ error = -EINVAL;
+ if (flock.l_pid != 0)
+ goto out;
+
+ cmd = F_GETLK;
+ file_lock.fl_flags |= FL_OFDLCK;
+ file_lock.fl_owner = (fl_owner_t)filp;
+ }
+
error = vfs_test_lock(filp, &file_lock);
if (error)
goto out;
@@ -1909,7 +2003,7 @@ out:
*/
int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf)
{
- if (filp->f_op && filp->f_op->lock)
+ if (filp->f_op->lock)
return filp->f_op->lock(filp, cmd, fl);
else
return posix_lock_file(filp, fl, conf);
@@ -1940,6 +2034,22 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
return error;
}
+/* Ensure that fl->fl_filp has compatible f_mode for F_SETLK calls */
+static int
+check_fmode_for_setlk(struct file_lock *fl)
+{
+ switch (fl->fl_type) {
+ case F_RDLCK:
+ if (!(fl->fl_file->f_mode & FMODE_READ))
+ return -EBADF;
+ break;
+ case F_WRLCK:
+ if (!(fl->fl_file->f_mode & FMODE_WRITE))
+ return -EBADF;
+ }
+ return 0;
+}
+
/* Apply the lock described by l to an open file descriptor.
* This implements both the F_SETLK and F_SETLKW commands of fcntl().
*/
@@ -1976,25 +2086,36 @@ again:
error = flock_to_posix_lock(filp, file_lock, &flock);
if (error)
goto out;
- if (cmd == F_SETLKW) {
- file_lock->fl_flags |= FL_SLEEP;
- }
-
- error = -EBADF;
- switch (flock.l_type) {
- case F_RDLCK:
- if (!(filp->f_mode & FMODE_READ))
- goto out;
- break;
- case F_WRLCK:
- if (!(filp->f_mode & FMODE_WRITE))
+
+ error = check_fmode_for_setlk(file_lock);
+ if (error)
+ goto out;
+
+ /*
+ * If the cmd is requesting file-private locks, then set the
+ * FL_OFDLCK flag and override the owner.
+ */
+ switch (cmd) {
+ case F_OFD_SETLK:
+ error = -EINVAL;
+ if (flock.l_pid != 0)
goto out;
+
+ cmd = F_SETLK;
+ file_lock->fl_flags |= FL_OFDLCK;
+ file_lock->fl_owner = (fl_owner_t)filp;
break;
- case F_UNLCK:
- break;
- default:
+ case F_OFD_SETLKW:
error = -EINVAL;
- goto out;
+ if (flock.l_pid != 0)
+ goto out;
+
+ cmd = F_SETLKW;
+ file_lock->fl_flags |= FL_OFDLCK;
+ file_lock->fl_owner = (fl_owner_t)filp;
+ /* Fallthrough */
+ case F_SETLKW:
+ file_lock->fl_flags |= FL_SLEEP;
}
error = do_lock_file_wait(filp, cmd, file_lock);
@@ -2025,7 +2146,7 @@ out:
/* Report the first existing lock that would conflict with l.
* This implements the F_GETLK command of fcntl().
*/
-int fcntl_getlk64(struct file *filp, struct flock64 __user *l)
+int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l)
{
struct file_lock file_lock;
struct flock64 flock;
@@ -2042,6 +2163,16 @@ int fcntl_getlk64(struct file *filp, struct flock64 __user *l)
if (error)
goto out;
+ if (cmd == F_OFD_GETLK) {
+ error = -EINVAL;
+ if (flock.l_pid != 0)
+ goto out;
+
+ cmd = F_GETLK64;
+ file_lock.fl_flags |= FL_OFDLCK;
+ file_lock.fl_owner = (fl_owner_t)filp;
+ }
+
error = vfs_test_lock(filp, &file_lock);
if (error)
goto out;
@@ -2094,25 +2225,36 @@ again:
error = flock64_to_posix_lock(filp, file_lock, &flock);
if (error)
goto out;
- if (cmd == F_SETLKW64) {
- file_lock->fl_flags |= FL_SLEEP;
- }
-
- error = -EBADF;
- switch (flock.l_type) {
- case F_RDLCK:
- if (!(filp->f_mode & FMODE_READ))
- goto out;
- break;
- case F_WRLCK:
- if (!(filp->f_mode & FMODE_WRITE))
+
+ error = check_fmode_for_setlk(file_lock);
+ if (error)
+ goto out;
+
+ /*
+ * If the cmd is requesting file-private locks, then set the
+ * FL_OFDLCK flag and override the owner.
+ */
+ switch (cmd) {
+ case F_OFD_SETLK:
+ error = -EINVAL;
+ if (flock.l_pid != 0)
goto out;
+
+ cmd = F_SETLK64;
+ file_lock->fl_flags |= FL_OFDLCK;
+ file_lock->fl_owner = (fl_owner_t)filp;
break;
- case F_UNLCK:
- break;
- default:
+ case F_OFD_SETLKW:
error = -EINVAL;
- goto out;
+ if (flock.l_pid != 0)
+ goto out;
+
+ cmd = F_SETLKW64;
+ file_lock->fl_flags |= FL_OFDLCK;
+ file_lock->fl_owner = (fl_owner_t)filp;
+ /* Fallthrough */
+ case F_SETLKW64:
+ file_lock->fl_flags |= FL_SLEEP;
}
error = do_lock_file_wait(filp, cmd, file_lock);
@@ -2173,7 +2315,7 @@ EXPORT_SYMBOL(locks_remove_posix);
/*
* This function is called on the last close of an open file.
*/
-void locks_remove_flock(struct file *filp)
+void locks_remove_file(struct file *filp)
{
struct inode * inode = file_inode(filp);
struct file_lock *fl;
@@ -2182,8 +2324,11 @@ void locks_remove_flock(struct file *filp)
if (!inode->i_flock)
return;
- if (filp->f_op && filp->f_op->flock) {
+ locks_remove_posix(filp, (fl_owner_t)filp);
+
+ if (filp->f_op->flock) {
struct file_lock fl = {
+ .fl_owner = (fl_owner_t)filp,
.fl_pid = current->tgid,
.fl_file = filp,
.fl_flags = FL_FLOCK,
@@ -2200,16 +2345,28 @@ void locks_remove_flock(struct file *filp)
while ((fl = *before) != NULL) {
if (fl->fl_file == filp) {
- if (IS_FLOCK(fl)) {
- locks_delete_lock(before);
- continue;
- }
if (IS_LEASE(fl)) {
lease_modify(before, F_UNLCK);
continue;
}
- /* What? */
- BUG();
+
+ /*
+ * There's a leftover lock on the list of a type that
+ * we didn't expect to see. Most likely a classic
+ * POSIX lock that ended up not getting released
+ * properly, or that raced onto the list somehow. Log
+ * some info about it and then just remove it from
+ * the list.
+ */
+ WARN(!IS_FLOCK(fl),
+ "leftover lock: dev=%u:%u ino=%lu type=%hhd flags=0x%x start=%lld end=%lld\n",
+ MAJOR(inode->i_sb->s_dev),
+ MINOR(inode->i_sb->s_dev), inode->i_ino,
+ fl->fl_type, fl->fl_flags,
+ fl->fl_start, fl->fl_end);
+
+ locks_delete_lock(before);
+ continue;
}
before = &fl->fl_next;
}
@@ -2246,7 +2403,7 @@ EXPORT_SYMBOL(posix_unblock_lock);
*/
int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
{
- if (filp->f_op && filp->f_op->lock)
+ if (filp->f_op->lock)
return filp->f_op->lock(filp, F_CANCELLK, fl);
return 0;
}
@@ -2278,26 +2435,32 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
seq_printf(f, "%lld:%s ", id, pfx);
if (IS_POSIX(fl)) {
- seq_printf(f, "%6s %s ",
- (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ",
+ if (fl->fl_flags & FL_ACCESS)
+ seq_puts(f, "ACCESS");
+ else if (IS_OFDLCK(fl))
+ seq_puts(f, "OFDLCK");
+ else
+ seq_puts(f, "POSIX ");
+
+ seq_printf(f, " %s ",
(inode == NULL) ? "*NOINODE*" :
mandatory_lock(inode) ? "MANDATORY" : "ADVISORY ");
} else if (IS_FLOCK(fl)) {
if (fl->fl_type & LOCK_MAND) {
- seq_printf(f, "FLOCK MSNFS ");
+ seq_puts(f, "FLOCK MSNFS ");
} else {
- seq_printf(f, "FLOCK ADVISORY ");
+ seq_puts(f, "FLOCK ADVISORY ");
}
} else if (IS_LEASE(fl)) {
- seq_printf(f, "LEASE ");
+ seq_puts(f, "LEASE ");
if (lease_breaking(fl))
- seq_printf(f, "BREAKING ");
+ seq_puts(f, "BREAKING ");
else if (fl->fl_file)
- seq_printf(f, "ACTIVE ");
+ seq_puts(f, "ACTIVE ");
else
- seq_printf(f, "BREAKER ");
+ seq_puts(f, "BREAKER ");
} else {
- seq_printf(f, "UNKNOWN UNKNOWN ");
+ seq_puts(f, "UNKNOWN UNKNOWN ");
}
if (fl->fl_type & LOCK_MAND) {
seq_printf(f, "%s ",
@@ -2329,7 +2492,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
else
seq_printf(f, "%Ld %Ld\n", fl->fl_start, fl->fl_end);
} else {
- seq_printf(f, "0 EOF\n");
+ seq_puts(f, "0 EOF\n");
}
}
@@ -2349,6 +2512,7 @@ static int locks_show(struct seq_file *f, void *v)
}
static void *locks_start(struct seq_file *f, loff_t *pos)
+ __acquires(&blocked_lock_lock)
{
struct locks_iterator *iter = f->private;
@@ -2367,6 +2531,7 @@ static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
}
static void locks_stop(struct seq_file *f, void *v)
+ __releases(&blocked_lock_lock)
{
spin_unlock(&blocked_lock_lock);
lg_global_unlock(&file_lock_lglock);
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 550475ca6a0..76279e11982 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -14,16 +14,10 @@
#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
-static void request_complete(struct bio *bio, int err)
-{
- complete((struct completion *)bio->bi_private);
-}
-
static int sync_request(struct page *page, struct block_device *bdev, int rw)
{
struct bio bio;
struct bio_vec bio_vec;
- struct completion complete;
bio_init(&bio);
bio.bi_max_vecs = 1;
@@ -32,16 +26,11 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
bio_vec.bv_len = PAGE_SIZE;
bio_vec.bv_offset = 0;
bio.bi_vcnt = 1;
- bio.bi_size = PAGE_SIZE;
bio.bi_bdev = bdev;
- bio.bi_sector = page->index * (PAGE_SIZE >> 9);
- init_completion(&complete);
- bio.bi_private = &complete;
- bio.bi_end_io = request_complete;
-
- submit_bio(rw, &bio);
- wait_for_completion(&complete);
- return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
+ bio.bi_iter.bi_sector = page->index * (PAGE_SIZE >> 9);
+ bio.bi_iter.bi_size = PAGE_SIZE;
+
+ return submit_bio_wait(rw, &bio);
}
static int bdev_readpage(void *_sb, struct page *page)
@@ -67,22 +56,18 @@ static DECLARE_WAIT_QUEUE_HEAD(wq);
static void writeseg_end_io(struct bio *bio, int err)
{
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct bio_vec *bvec;
+ int i;
struct super_block *sb = bio->bi_private;
struct logfs_super *super = logfs_super(sb);
- struct page *page;
BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
BUG_ON(err);
- BUG_ON(bio->bi_vcnt == 0);
- do {
- page = bvec->bv_page;
- if (--bvec >= bio->bi_io_vec)
- prefetchw(&bvec->bv_page->flags);
-
- end_page_writeback(page);
- page_cache_release(page);
- } while (bvec >= bio->bi_io_vec);
+
+ bio_for_each_segment_all(bvec, bio, i) {
+ end_page_writeback(bvec->bv_page);
+ page_cache_release(bvec->bv_page);
+ }
bio_put(bio);
if (atomic_dec_and_test(&super->s_pending_writes))
wake_up(&wq);
@@ -107,9 +92,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
if (i >= max_pages) {
/* Block layer cannot split bios :( */
bio->bi_vcnt = i;
- bio->bi_size = i * PAGE_SIZE;
+ bio->bi_iter.bi_size = i * PAGE_SIZE;
bio->bi_bdev = super->s_bdev;
- bio->bi_sector = ofs >> 9;
+ bio->bi_iter.bi_sector = ofs >> 9;
bio->bi_private = sb;
bio->bi_end_io = writeseg_end_io;
atomic_inc(&super->s_pending_writes);
@@ -134,9 +119,9 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
unlock_page(page);
}
bio->bi_vcnt = nr_pages;
- bio->bi_size = nr_pages * PAGE_SIZE;
+ bio->bi_iter.bi_size = nr_pages * PAGE_SIZE;
bio->bi_bdev = super->s_bdev;
- bio->bi_sector = ofs >> 9;
+ bio->bi_iter.bi_sector = ofs >> 9;
bio->bi_private = sb;
bio->bi_end_io = writeseg_end_io;
atomic_inc(&super->s_pending_writes);
@@ -199,9 +184,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
if (i >= max_pages) {
/* Block layer cannot split bios :( */
bio->bi_vcnt = i;
- bio->bi_size = i * PAGE_SIZE;
+ bio->bi_iter.bi_size = i * PAGE_SIZE;
bio->bi_bdev = super->s_bdev;
- bio->bi_sector = ofs >> 9;
+ bio->bi_iter.bi_sector = ofs >> 9;
bio->bi_private = sb;
bio->bi_end_io = erase_end_io;
atomic_inc(&super->s_pending_writes);
@@ -220,9 +205,9 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
bio->bi_io_vec[i].bv_offset = 0;
}
bio->bi_vcnt = nr_pages;
- bio->bi_size = nr_pages * PAGE_SIZE;
+ bio->bi_iter.bi_size = nr_pages * PAGE_SIZE;
bio->bi_bdev = super->s_bdev;
- bio->bi_sector = ofs >> 9;
+ bio->bi_iter.bi_sector = ofs >> 9;
bio->bi_private = sb;
bio->bi_end_io = erase_end_io;
atomic_inc(&super->s_pending_writes);
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 57914fc32b6..8538752df2f 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -264,15 +264,15 @@ const struct inode_operations logfs_reg_iops = {
};
const struct file_operations logfs_reg_fops = {
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.fsync = logfs_fsync,
.unlocked_ioctl = logfs_ioctl,
.llseek = generic_file_llseek,
.mmap = generic_file_readonly_mmap,
.open = generic_file_open,
- .read = do_sync_read,
- .write = do_sync_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
};
const struct address_space_operations logfs_reg_aops = {
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 9a59cbade2f..48140315f62 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -2180,7 +2180,7 @@ void logfs_evict_inode(struct inode *inode)
do_delete_inode(inode);
}
}
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
/* Cheaper version of write_inode. All changes are concealed in
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index d448a777166..7f9b096d8d5 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -62,7 +62,8 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
page = read_cache_page(mapping, index, filler, sb);
else {
page = find_or_create_page(mapping, index, GFP_NOFS);
- unlock_page(page);
+ if (page)
+ unlock_page(page);
}
return page;
}
diff --git a/fs/mbcache.c b/fs/mbcache.c
index e519e45bf67..187477ded6b 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -26,6 +26,41 @@
* back on the lru list.
*/
+/*
+ * Lock descriptions and usage:
+ *
+ * Each hash chain of both the block and index hash tables now contains
+ * a built-in lock used to serialize accesses to the hash chain.
+ *
+ * Accesses to global data structures mb_cache_list and mb_cache_lru_list
+ * are serialized via the global spinlock mb_cache_spinlock.
+ *
+ * Each mb_cache_entry contains a spinlock, e_entry_lock, to serialize
+ * accesses to its local data, such as e_used and e_queued.
+ *
+ * Lock ordering:
+ *
+ * Each block hash chain's lock has the highest lock order, followed by an
+ * index hash chain's lock, mb_cache_bg_lock (used to implement mb_cache_entry's
+ * lock), and mb_cach_spinlock, with the lowest order. While holding
+ * either a block or index hash chain lock, a thread can acquire an
+ * mc_cache_bg_lock, which in turn can also acquire mb_cache_spinlock.
+ *
+ * Synchronization:
+ *
+ * Since both mb_cache_entry_get and mb_cache_entry_find scan the block and
+ * index hash chian, it needs to lock the corresponding hash chain. For each
+ * mb_cache_entry within the chain, it needs to lock the mb_cache_entry to
+ * prevent either any simultaneous release or free on the entry and also
+ * to serialize accesses to either the e_used or e_queued member of the entry.
+ *
+ * To avoid having a dangling reference to an already freed
+ * mb_cache_entry, an mb_cache_entry is only freed when it is not on a
+ * block hash chain and also no longer being referenced, both e_used,
+ * and e_queued are 0's. When an mb_cache_entry is explicitly freed it is
+ * first removed from a block hash chain.
+ */
+
#include <linux/kernel.h>
#include <linux/module.h>
@@ -34,9 +69,11 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/sched.h>
-#include <linux/init.h>
+#include <linux/list_bl.h>
#include <linux/mbcache.h>
-
+#include <linux/init.h>
+#include <linux/blockgroup_lock.h>
+#include <linux/log2.h>
#ifdef MB_CACHE_DEBUG
# define mb_debug(f...) do { \
@@ -57,8 +94,14 @@
#define MB_CACHE_WRITER ((unsigned short)~0U >> 1)
+#define MB_CACHE_ENTRY_LOCK_BITS ilog2(NR_BG_LOCKS)
+#define MB_CACHE_ENTRY_LOCK_INDEX(ce) \
+ (hash_long((unsigned long)ce, MB_CACHE_ENTRY_LOCK_BITS))
+
static DECLARE_WAIT_QUEUE_HEAD(mb_cache_queue);
-
+static struct blockgroup_lock *mb_cache_bg_lock;
+static struct kmem_cache *mb_cache_kmem_cache;
+
MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
MODULE_LICENSE("GPL");
@@ -86,58 +129,110 @@ static LIST_HEAD(mb_cache_list);
static LIST_HEAD(mb_cache_lru_list);
static DEFINE_SPINLOCK(mb_cache_spinlock);
+static inline void
+__spin_lock_mb_cache_entry(struct mb_cache_entry *ce)
+{
+ spin_lock(bgl_lock_ptr(mb_cache_bg_lock,
+ MB_CACHE_ENTRY_LOCK_INDEX(ce)));
+}
+
+static inline void
+__spin_unlock_mb_cache_entry(struct mb_cache_entry *ce)
+{
+ spin_unlock(bgl_lock_ptr(mb_cache_bg_lock,
+ MB_CACHE_ENTRY_LOCK_INDEX(ce)));
+}
+
static inline int
-__mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
+__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce)
{
- return !list_empty(&ce->e_block_list);
+ return !hlist_bl_unhashed(&ce->e_block_list);
}
-static void
-__mb_cache_entry_unhash(struct mb_cache_entry *ce)
+static inline void
+__mb_cache_entry_unhash_block(struct mb_cache_entry *ce)
{
- if (__mb_cache_entry_is_hashed(ce)) {
- list_del_init(&ce->e_block_list);
- list_del(&ce->e_index.o_list);
- }
+ if (__mb_cache_entry_is_block_hashed(ce))
+ hlist_bl_del_init(&ce->e_block_list);
}
+static inline int
+__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce)
+{
+ return !hlist_bl_unhashed(&ce->e_index.o_list);
+}
+
+static inline void
+__mb_cache_entry_unhash_index(struct mb_cache_entry *ce)
+{
+ if (__mb_cache_entry_is_index_hashed(ce))
+ hlist_bl_del_init(&ce->e_index.o_list);
+}
+
+/*
+ * __mb_cache_entry_unhash_unlock()
+ *
+ * This function is called to unhash both the block and index hash
+ * chain.
+ * It assumes both the block and index hash chain is locked upon entry.
+ * It also unlock both hash chains both exit
+ */
+static inline void
+__mb_cache_entry_unhash_unlock(struct mb_cache_entry *ce)
+{
+ __mb_cache_entry_unhash_index(ce);
+ hlist_bl_unlock(ce->e_index_hash_p);
+ __mb_cache_entry_unhash_block(ce);
+ hlist_bl_unlock(ce->e_block_hash_p);
+}
static void
__mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask)
{
struct mb_cache *cache = ce->e_cache;
- mb_assert(!(ce->e_used || ce->e_queued));
+ mb_assert(!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt)));
kmem_cache_free(cache->c_entry_cache, ce);
atomic_dec(&cache->c_entry_count);
}
-
static void
-__mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
- __releases(mb_cache_spinlock)
+__mb_cache_entry_release(struct mb_cache_entry *ce)
{
+ /* First lock the entry to serialize access to its local data. */
+ __spin_lock_mb_cache_entry(ce);
/* Wake up all processes queuing for this cache entry. */
if (ce->e_queued)
wake_up_all(&mb_cache_queue);
if (ce->e_used >= MB_CACHE_WRITER)
ce->e_used -= MB_CACHE_WRITER;
+ /*
+ * Make sure that all cache entries on lru_list have
+ * both e_used and e_qued of 0s.
+ */
ce->e_used--;
- if (!(ce->e_used || ce->e_queued)) {
- if (!__mb_cache_entry_is_hashed(ce))
+ if (!(ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))) {
+ if (!__mb_cache_entry_is_block_hashed(ce)) {
+ __spin_unlock_mb_cache_entry(ce);
goto forget;
- mb_assert(list_empty(&ce->e_lru_list));
- list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
+ }
+ /*
+ * Need access to lru list, first drop entry lock,
+ * then reacquire the lock in the proper order.
+ */
+ spin_lock(&mb_cache_spinlock);
+ if (list_empty(&ce->e_lru_list))
+ list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
+ spin_unlock(&mb_cache_spinlock);
}
- spin_unlock(&mb_cache_spinlock);
+ __spin_unlock_mb_cache_entry(ce);
return;
forget:
- spin_unlock(&mb_cache_spinlock);
+ mb_assert(list_empty(&ce->e_lru_list));
__mb_cache_entry_forget(ce, GFP_KERNEL);
}
-
/*
* mb_cache_shrink_scan() memory pressure callback
*
@@ -160,17 +255,34 @@ mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
mb_debug("trying to free %d entries", nr_to_scan);
spin_lock(&mb_cache_spinlock);
- while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) {
+ while ((nr_to_scan-- > 0) && !list_empty(&mb_cache_lru_list)) {
struct mb_cache_entry *ce =
list_entry(mb_cache_lru_list.next,
- struct mb_cache_entry, e_lru_list);
- list_move_tail(&ce->e_lru_list, &free_list);
- __mb_cache_entry_unhash(ce);
- freed++;
+ struct mb_cache_entry, e_lru_list);
+ list_del_init(&ce->e_lru_list);
+ if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt))
+ continue;
+ spin_unlock(&mb_cache_spinlock);
+ /* Prevent any find or get operation on the entry */
+ hlist_bl_lock(ce->e_block_hash_p);
+ hlist_bl_lock(ce->e_index_hash_p);
+ /* Ignore if it is touched by a find/get */
+ if (ce->e_used || ce->e_queued || atomic_read(&ce->e_refcnt) ||
+ !list_empty(&ce->e_lru_list)) {
+ hlist_bl_unlock(ce->e_index_hash_p);
+ hlist_bl_unlock(ce->e_block_hash_p);
+ spin_lock(&mb_cache_spinlock);
+ continue;
+ }
+ __mb_cache_entry_unhash_unlock(ce);
+ list_add_tail(&ce->e_lru_list, &free_list);
+ spin_lock(&mb_cache_spinlock);
}
spin_unlock(&mb_cache_spinlock);
+
list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
__mb_cache_entry_forget(entry, gfp_mask);
+ freed++;
}
return freed;
}
@@ -215,29 +327,40 @@ mb_cache_create(const char *name, int bucket_bits)
int n, bucket_count = 1 << bucket_bits;
struct mb_cache *cache = NULL;
+ if (!mb_cache_bg_lock) {
+ mb_cache_bg_lock = kmalloc(sizeof(struct blockgroup_lock),
+ GFP_KERNEL);
+ if (!mb_cache_bg_lock)
+ return NULL;
+ bgl_lock_init(mb_cache_bg_lock);
+ }
+
cache = kmalloc(sizeof(struct mb_cache), GFP_KERNEL);
if (!cache)
return NULL;
cache->c_name = name;
atomic_set(&cache->c_entry_count, 0);
cache->c_bucket_bits = bucket_bits;
- cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head),
- GFP_KERNEL);
+ cache->c_block_hash = kmalloc(bucket_count *
+ sizeof(struct hlist_bl_head), GFP_KERNEL);
if (!cache->c_block_hash)
goto fail;
for (n=0; n<bucket_count; n++)
- INIT_LIST_HEAD(&cache->c_block_hash[n]);
- cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head),
- GFP_KERNEL);
+ INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]);
+ cache->c_index_hash = kmalloc(bucket_count *
+ sizeof(struct hlist_bl_head), GFP_KERNEL);
if (!cache->c_index_hash)
goto fail;
for (n=0; n<bucket_count; n++)
- INIT_LIST_HEAD(&cache->c_index_hash[n]);
- cache->c_entry_cache = kmem_cache_create(name,
- sizeof(struct mb_cache_entry), 0,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
- if (!cache->c_entry_cache)
- goto fail2;
+ INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]);
+ if (!mb_cache_kmem_cache) {
+ mb_cache_kmem_cache = kmem_cache_create(name,
+ sizeof(struct mb_cache_entry), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
+ if (!mb_cache_kmem_cache)
+ goto fail2;
+ }
+ cache->c_entry_cache = mb_cache_kmem_cache;
/*
* Set an upper limit on the number of cache entries so that the hash
@@ -273,21 +396,47 @@ void
mb_cache_shrink(struct block_device *bdev)
{
LIST_HEAD(free_list);
- struct list_head *l, *ltmp;
+ struct list_head *l;
+ struct mb_cache_entry *ce, *tmp;
+ l = &mb_cache_lru_list;
spin_lock(&mb_cache_spinlock);
- list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
- struct mb_cache_entry *ce =
- list_entry(l, struct mb_cache_entry, e_lru_list);
+ while (!list_is_last(l, &mb_cache_lru_list)) {
+ l = l->next;
+ ce = list_entry(l, struct mb_cache_entry, e_lru_list);
if (ce->e_bdev == bdev) {
- list_move_tail(&ce->e_lru_list, &free_list);
- __mb_cache_entry_unhash(ce);
+ list_del_init(&ce->e_lru_list);
+ if (ce->e_used || ce->e_queued ||
+ atomic_read(&ce->e_refcnt))
+ continue;
+ spin_unlock(&mb_cache_spinlock);
+ /*
+ * Prevent any find or get operation on the entry.
+ */
+ hlist_bl_lock(ce->e_block_hash_p);
+ hlist_bl_lock(ce->e_index_hash_p);
+ /* Ignore if it is touched by a find/get */
+ if (ce->e_used || ce->e_queued ||
+ atomic_read(&ce->e_refcnt) ||
+ !list_empty(&ce->e_lru_list)) {
+ hlist_bl_unlock(ce->e_index_hash_p);
+ hlist_bl_unlock(ce->e_block_hash_p);
+ l = &mb_cache_lru_list;
+ spin_lock(&mb_cache_spinlock);
+ continue;
+ }
+ __mb_cache_entry_unhash_unlock(ce);
+ mb_assert(!(ce->e_used || ce->e_queued ||
+ atomic_read(&ce->e_refcnt)));
+ list_add_tail(&ce->e_lru_list, &free_list);
+ l = &mb_cache_lru_list;
+ spin_lock(&mb_cache_spinlock);
}
}
spin_unlock(&mb_cache_spinlock);
- list_for_each_safe(l, ltmp, &free_list) {
- __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
- e_lru_list), GFP_KERNEL);
+
+ list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
+ __mb_cache_entry_forget(ce, GFP_KERNEL);
}
}
@@ -303,23 +452,27 @@ void
mb_cache_destroy(struct mb_cache *cache)
{
LIST_HEAD(free_list);
- struct list_head *l, *ltmp;
+ struct mb_cache_entry *ce, *tmp;
spin_lock(&mb_cache_spinlock);
- list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
- struct mb_cache_entry *ce =
- list_entry(l, struct mb_cache_entry, e_lru_list);
- if (ce->e_cache == cache) {
+ list_for_each_entry_safe(ce, tmp, &mb_cache_lru_list, e_lru_list) {
+ if (ce->e_cache == cache)
list_move_tail(&ce->e_lru_list, &free_list);
- __mb_cache_entry_unhash(ce);
- }
}
list_del(&cache->c_cache_list);
spin_unlock(&mb_cache_spinlock);
- list_for_each_safe(l, ltmp, &free_list) {
- __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
- e_lru_list), GFP_KERNEL);
+ list_for_each_entry_safe(ce, tmp, &free_list, e_lru_list) {
+ list_del_init(&ce->e_lru_list);
+ /*
+ * Prevent any find or get operation on the entry.
+ */
+ hlist_bl_lock(ce->e_block_hash_p);
+ hlist_bl_lock(ce->e_index_hash_p);
+ mb_assert(!(ce->e_used || ce->e_queued ||
+ atomic_read(&ce->e_refcnt)));
+ __mb_cache_entry_unhash_unlock(ce);
+ __mb_cache_entry_forget(ce, GFP_KERNEL);
}
if (atomic_read(&cache->c_entry_count) > 0) {
@@ -328,8 +481,10 @@ mb_cache_destroy(struct mb_cache *cache)
atomic_read(&cache->c_entry_count));
}
- kmem_cache_destroy(cache->c_entry_cache);
-
+ if (list_empty(&mb_cache_list)) {
+ kmem_cache_destroy(mb_cache_kmem_cache);
+ mb_cache_kmem_cache = NULL;
+ }
kfree(cache->c_index_hash);
kfree(cache->c_block_hash);
kfree(cache);
@@ -346,28 +501,61 @@ mb_cache_destroy(struct mb_cache *cache)
struct mb_cache_entry *
mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
{
- struct mb_cache_entry *ce = NULL;
+ struct mb_cache_entry *ce;
if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
+ struct list_head *l;
+
+ l = &mb_cache_lru_list;
spin_lock(&mb_cache_spinlock);
- if (!list_empty(&mb_cache_lru_list)) {
- ce = list_entry(mb_cache_lru_list.next,
- struct mb_cache_entry, e_lru_list);
- list_del_init(&ce->e_lru_list);
- __mb_cache_entry_unhash(ce);
+ while (!list_is_last(l, &mb_cache_lru_list)) {
+ l = l->next;
+ ce = list_entry(l, struct mb_cache_entry, e_lru_list);
+ if (ce->e_cache == cache) {
+ list_del_init(&ce->e_lru_list);
+ if (ce->e_used || ce->e_queued ||
+ atomic_read(&ce->e_refcnt))
+ continue;
+ spin_unlock(&mb_cache_spinlock);
+ /*
+ * Prevent any find or get operation on the
+ * entry.
+ */
+ hlist_bl_lock(ce->e_block_hash_p);
+ hlist_bl_lock(ce->e_index_hash_p);
+ /* Ignore if it is touched by a find/get */
+ if (ce->e_used || ce->e_queued ||
+ atomic_read(&ce->e_refcnt) ||
+ !list_empty(&ce->e_lru_list)) {
+ hlist_bl_unlock(ce->e_index_hash_p);
+ hlist_bl_unlock(ce->e_block_hash_p);
+ l = &mb_cache_lru_list;
+ spin_lock(&mb_cache_spinlock);
+ continue;
+ }
+ mb_assert(list_empty(&ce->e_lru_list));
+ mb_assert(!(ce->e_used || ce->e_queued ||
+ atomic_read(&ce->e_refcnt)));
+ __mb_cache_entry_unhash_unlock(ce);
+ goto found;
+ }
}
spin_unlock(&mb_cache_spinlock);
}
- if (!ce) {
- ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
- if (!ce)
- return NULL;
- atomic_inc(&cache->c_entry_count);
- INIT_LIST_HEAD(&ce->e_lru_list);
- INIT_LIST_HEAD(&ce->e_block_list);
- ce->e_cache = cache;
- ce->e_queued = 0;
- }
+
+ ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
+ if (!ce)
+ return NULL;
+ atomic_inc(&cache->c_entry_count);
+ INIT_LIST_HEAD(&ce->e_lru_list);
+ INIT_HLIST_BL_NODE(&ce->e_block_list);
+ INIT_HLIST_BL_NODE(&ce->e_index.o_list);
+ ce->e_cache = cache;
+ ce->e_queued = 0;
+ atomic_set(&ce->e_refcnt, 0);
+found:
+ ce->e_block_hash_p = &cache->c_block_hash[0];
+ ce->e_index_hash_p = &cache->c_index_hash[0];
ce->e_used = 1 + MB_CACHE_WRITER;
return ce;
}
@@ -393,29 +581,38 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev,
{
struct mb_cache *cache = ce->e_cache;
unsigned int bucket;
- struct list_head *l;
- int error = -EBUSY;
+ struct hlist_bl_node *l;
+ struct hlist_bl_head *block_hash_p;
+ struct hlist_bl_head *index_hash_p;
+ struct mb_cache_entry *lce;
+ mb_assert(ce);
bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
cache->c_bucket_bits);
- spin_lock(&mb_cache_spinlock);
- list_for_each_prev(l, &cache->c_block_hash[bucket]) {
- struct mb_cache_entry *ce =
- list_entry(l, struct mb_cache_entry, e_block_list);
- if (ce->e_bdev == bdev && ce->e_block == block)
- goto out;
+ block_hash_p = &cache->c_block_hash[bucket];
+ hlist_bl_lock(block_hash_p);
+ hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) {
+ if (lce->e_bdev == bdev && lce->e_block == block) {
+ hlist_bl_unlock(block_hash_p);
+ return -EBUSY;
+ }
}
- __mb_cache_entry_unhash(ce);
+ mb_assert(!__mb_cache_entry_is_block_hashed(ce));
+ __mb_cache_entry_unhash_block(ce);
+ __mb_cache_entry_unhash_index(ce);
ce->e_bdev = bdev;
ce->e_block = block;
- list_add(&ce->e_block_list, &cache->c_block_hash[bucket]);
+ ce->e_block_hash_p = block_hash_p;
ce->e_index.o_key = key;
+ hlist_bl_add_head(&ce->e_block_list, block_hash_p);
+ hlist_bl_unlock(block_hash_p);
bucket = hash_long(key, cache->c_bucket_bits);
- list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]);
- error = 0;
-out:
- spin_unlock(&mb_cache_spinlock);
- return error;
+ index_hash_p = &cache->c_index_hash[bucket];
+ hlist_bl_lock(index_hash_p);
+ ce->e_index_hash_p = index_hash_p;
+ hlist_bl_add_head(&ce->e_index.o_list, index_hash_p);
+ hlist_bl_unlock(index_hash_p);
+ return 0;
}
@@ -429,24 +626,26 @@ out:
void
mb_cache_entry_release(struct mb_cache_entry *ce)
{
- spin_lock(&mb_cache_spinlock);
- __mb_cache_entry_release_unlock(ce);
+ __mb_cache_entry_release(ce);
}
/*
* mb_cache_entry_free()
*
- * This is equivalent to the sequence mb_cache_entry_takeout() --
- * mb_cache_entry_release().
*/
void
mb_cache_entry_free(struct mb_cache_entry *ce)
{
- spin_lock(&mb_cache_spinlock);
+ mb_assert(ce);
mb_assert(list_empty(&ce->e_lru_list));
- __mb_cache_entry_unhash(ce);
- __mb_cache_entry_release_unlock(ce);
+ hlist_bl_lock(ce->e_index_hash_p);
+ __mb_cache_entry_unhash_index(ce);
+ hlist_bl_unlock(ce->e_index_hash_p);
+ hlist_bl_lock(ce->e_block_hash_p);
+ __mb_cache_entry_unhash_block(ce);
+ hlist_bl_unlock(ce->e_block_hash_p);
+ __mb_cache_entry_release(ce);
}
@@ -463,84 +662,110 @@ mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev,
sector_t block)
{
unsigned int bucket;
- struct list_head *l;
+ struct hlist_bl_node *l;
struct mb_cache_entry *ce;
+ struct hlist_bl_head *block_hash_p;
bucket = hash_long((unsigned long)bdev + (block & 0xffffffff),
cache->c_bucket_bits);
- spin_lock(&mb_cache_spinlock);
- list_for_each(l, &cache->c_block_hash[bucket]) {
- ce = list_entry(l, struct mb_cache_entry, e_block_list);
+ block_hash_p = &cache->c_block_hash[bucket];
+ /* First serialize access to the block corresponding hash chain. */
+ hlist_bl_lock(block_hash_p);
+ hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) {
+ mb_assert(ce->e_block_hash_p == block_hash_p);
if (ce->e_bdev == bdev && ce->e_block == block) {
- DEFINE_WAIT(wait);
+ /*
+ * Prevent a free from removing the entry.
+ */
+ atomic_inc(&ce->e_refcnt);
+ hlist_bl_unlock(block_hash_p);
+ __spin_lock_mb_cache_entry(ce);
+ atomic_dec(&ce->e_refcnt);
+ if (ce->e_used > 0) {
+ DEFINE_WAIT(wait);
+ while (ce->e_used > 0) {
+ ce->e_queued++;
+ prepare_to_wait(&mb_cache_queue, &wait,
+ TASK_UNINTERRUPTIBLE);
+ __spin_unlock_mb_cache_entry(ce);
+ schedule();
+ __spin_lock_mb_cache_entry(ce);
+ ce->e_queued--;
+ }
+ finish_wait(&mb_cache_queue, &wait);
+ }
+ ce->e_used += 1 + MB_CACHE_WRITER;
+ __spin_unlock_mb_cache_entry(ce);
- if (!list_empty(&ce->e_lru_list))
+ if (!list_empty(&ce->e_lru_list)) {
+ spin_lock(&mb_cache_spinlock);
list_del_init(&ce->e_lru_list);
-
- while (ce->e_used > 0) {
- ce->e_queued++;
- prepare_to_wait(&mb_cache_queue, &wait,
- TASK_UNINTERRUPTIBLE);
spin_unlock(&mb_cache_spinlock);
- schedule();
- spin_lock(&mb_cache_spinlock);
- ce->e_queued--;
}
- finish_wait(&mb_cache_queue, &wait);
- ce->e_used += 1 + MB_CACHE_WRITER;
-
- if (!__mb_cache_entry_is_hashed(ce)) {
- __mb_cache_entry_release_unlock(ce);
+ if (!__mb_cache_entry_is_block_hashed(ce)) {
+ __mb_cache_entry_release(ce);
return NULL;
}
- goto cleanup;
+ return ce;
}
}
- ce = NULL;
-
-cleanup:
- spin_unlock(&mb_cache_spinlock);
- return ce;
+ hlist_bl_unlock(block_hash_p);
+ return NULL;
}
#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
static struct mb_cache_entry *
-__mb_cache_entry_find(struct list_head *l, struct list_head *head,
+__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head,
struct block_device *bdev, unsigned int key)
{
- while (l != head) {
+
+ /* The index hash chain is alredy acquire by caller. */
+ while (l != NULL) {
struct mb_cache_entry *ce =
- list_entry(l, struct mb_cache_entry, e_index.o_list);
+ hlist_bl_entry(l, struct mb_cache_entry,
+ e_index.o_list);
+ mb_assert(ce->e_index_hash_p == head);
if (ce->e_bdev == bdev && ce->e_index.o_key == key) {
- DEFINE_WAIT(wait);
-
- if (!list_empty(&ce->e_lru_list))
- list_del_init(&ce->e_lru_list);
-
+ /*
+ * Prevent a free from removing the entry.
+ */
+ atomic_inc(&ce->e_refcnt);
+ hlist_bl_unlock(head);
+ __spin_lock_mb_cache_entry(ce);
+ atomic_dec(&ce->e_refcnt);
+ ce->e_used++;
/* Incrementing before holding the lock gives readers
priority over writers. */
- ce->e_used++;
- while (ce->e_used >= MB_CACHE_WRITER) {
- ce->e_queued++;
- prepare_to_wait(&mb_cache_queue, &wait,
- TASK_UNINTERRUPTIBLE);
- spin_unlock(&mb_cache_spinlock);
- schedule();
- spin_lock(&mb_cache_spinlock);
- ce->e_queued--;
+ if (ce->e_used >= MB_CACHE_WRITER) {
+ DEFINE_WAIT(wait);
+
+ while (ce->e_used >= MB_CACHE_WRITER) {
+ ce->e_queued++;
+ prepare_to_wait(&mb_cache_queue, &wait,
+ TASK_UNINTERRUPTIBLE);
+ __spin_unlock_mb_cache_entry(ce);
+ schedule();
+ __spin_lock_mb_cache_entry(ce);
+ ce->e_queued--;
+ }
+ finish_wait(&mb_cache_queue, &wait);
}
- finish_wait(&mb_cache_queue, &wait);
-
- if (!__mb_cache_entry_is_hashed(ce)) {
- __mb_cache_entry_release_unlock(ce);
+ __spin_unlock_mb_cache_entry(ce);
+ if (!list_empty(&ce->e_lru_list)) {
spin_lock(&mb_cache_spinlock);
+ list_del_init(&ce->e_lru_list);
+ spin_unlock(&mb_cache_spinlock);
+ }
+ if (!__mb_cache_entry_is_block_hashed(ce)) {
+ __mb_cache_entry_release(ce);
return ERR_PTR(-EAGAIN);
}
return ce;
}
l = l->next;
}
+ hlist_bl_unlock(head);
return NULL;
}
@@ -562,13 +787,17 @@ mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev,
unsigned int key)
{
unsigned int bucket = hash_long(key, cache->c_bucket_bits);
- struct list_head *l;
- struct mb_cache_entry *ce;
-
- spin_lock(&mb_cache_spinlock);
- l = cache->c_index_hash[bucket].next;
- ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
- spin_unlock(&mb_cache_spinlock);
+ struct hlist_bl_node *l;
+ struct mb_cache_entry *ce = NULL;
+ struct hlist_bl_head *index_hash_p;
+
+ index_hash_p = &cache->c_index_hash[bucket];
+ hlist_bl_lock(index_hash_p);
+ if (!hlist_bl_empty(index_hash_p)) {
+ l = hlist_bl_first(index_hash_p);
+ ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
+ } else
+ hlist_bl_unlock(index_hash_p);
return ce;
}
@@ -597,13 +826,17 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev,
{
struct mb_cache *cache = prev->e_cache;
unsigned int bucket = hash_long(key, cache->c_bucket_bits);
- struct list_head *l;
+ struct hlist_bl_node *l;
struct mb_cache_entry *ce;
+ struct hlist_bl_head *index_hash_p;
- spin_lock(&mb_cache_spinlock);
+ index_hash_p = &cache->c_index_hash[bucket];
+ mb_assert(prev->e_index_hash_p == index_hash_p);
+ hlist_bl_lock(index_hash_p);
+ mb_assert(!hlist_bl_empty(index_hash_p));
l = prev->e_index.o_list.next;
- ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key);
- __mb_cache_entry_release_unlock(prev);
+ ce = __mb_cache_entry_find(l, index_hash_p, bdev, key);
+ __mb_cache_entry_release(prev);
return ce;
}
diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig
index 6624684dd5d..f2a0cfcef11 100644
--- a/fs/minix/Kconfig
+++ b/fs/minix/Kconfig
@@ -18,7 +18,7 @@ config MINIX_FS
config MINIX_FS_NATIVE_ENDIAN
def_bool MINIX_FS
- depends on H8300 || M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU)
+ depends on M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU)
config MINIX_FS_BIG_ENDIAN_16BIT_INDEXED
def_bool MINIX_FS
diff --git a/fs/minix/file.c b/fs/minix/file.c
index adc6f549423..a967de085ac 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -14,10 +14,10 @@
*/
const struct file_operations minix_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.fsync = generic_file_fsync,
.splice_read = generic_file_splice_read,
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 0332109162a..f007a335557 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -26,7 +26,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data);
static void minix_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
if (!inode->i_nlink) {
inode->i_size = 0;
minix_truncate(inode);
@@ -86,7 +86,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
minix_inode_cachep = kmem_cache_create("minix_inode_cache",
sizeof(struct minix_inode_info),
@@ -123,6 +123,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data)
struct minix_sb_info * sbi = minix_sb(sb);
struct minix_super_block * ms;
+ sync_filesystem(sb);
ms = sbi->s_ms;
if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
return 0;
diff --git a/fs/mount.h b/fs/mount.h
index 64a858143ff..d55297f2fa0 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -10,7 +10,7 @@ struct mnt_namespace {
struct user_namespace *user_ns;
u64 seq; /* Sequence number to prevent loops */
wait_queue_head_t poll;
- int event;
+ u64 event;
};
struct mnt_pcp {
@@ -19,16 +19,17 @@ struct mnt_pcp {
};
struct mountpoint {
- struct list_head m_hash;
+ struct hlist_node m_hash;
struct dentry *m_dentry;
int m_count;
};
struct mount {
- struct list_head mnt_hash;
+ struct hlist_node mnt_hash;
struct mount *mnt_parent;
struct dentry *mnt_mountpoint;
struct vfsmount mnt;
+ struct rcu_head mnt_rcu;
#ifdef CONFIG_SMP
struct mnt_pcp __percpu *mnt_pcp;
#else
@@ -55,7 +56,7 @@ struct mount {
int mnt_group_id; /* peer group identifier */
int mnt_expiry_mark; /* true if marked for expiry */
int mnt_pinned;
- int mnt_ghosts;
+ struct path mnt_ex_mountpoint;
};
#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
@@ -73,21 +74,39 @@ static inline int mnt_has_parent(struct mount *mnt)
static inline int is_mounted(struct vfsmount *mnt)
{
/* neither detached nor internal? */
- return !IS_ERR_OR_NULL(real_mount(mnt));
+ return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns);
}
-extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
+extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
+extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
+
+extern bool legitimize_mnt(struct vfsmount *, unsigned);
static inline void get_mnt_ns(struct mnt_namespace *ns)
{
atomic_inc(&ns->count);
}
+extern seqlock_t mount_lock;
+
+static inline void lock_mount_hash(void)
+{
+ write_seqlock(&mount_lock);
+}
+
+static inline void unlock_mount_hash(void)
+{
+ write_sequnlock(&mount_lock);
+}
+
struct proc_mounts {
struct seq_file m;
struct mnt_namespace *ns;
struct path root;
int (*show)(struct seq_file *, struct vfsmount *);
+ void *cached_mount;
+ u64 cached_event;
+ loff_t cached_index;
};
#define proc_mounts(p) (container_of((p), struct proc_mounts, m))
diff --git a/fs/mpage.c b/fs/mpage.c
index 0face1c4d4c..5f9ed622274 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -43,31 +43,14 @@
*/
static void mpage_end_io(struct bio *bio, int err)
{
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct bio_vec *bv;
+ int i;
+
+ bio_for_each_segment_all(bv, bio, i) {
+ struct page *page = bv->bv_page;
+ page_endio(page, bio_data_dir(bio), err);
+ }
- do {
- struct page *page = bvec->bv_page;
-
- if (--bvec >= bio->bi_io_vec)
- prefetchw(&bvec->bv_page->flags);
- if (bio_data_dir(bio) == READ) {
- if (uptodate) {
- SetPageUptodate(page);
- } else {
- ClearPageUptodate(page);
- SetPageError(page);
- }
- unlock_page(page);
- } else { /* bio_data_dir(bio) == WRITE */
- if (!uptodate) {
- SetPageError(page);
- if (page->mapping)
- set_bit(AS_EIO, &page->mapping->flags);
- }
- end_page_writeback(page);
- }
- } while (bvec >= bio->bi_io_vec);
bio_put(bio);
}
@@ -94,7 +77,7 @@ mpage_alloc(struct block_device *bdev,
if (bio) {
bio->bi_bdev = bdev;
- bio->bi_sector = first_sector;
+ bio->bi_iter.bi_sector = first_sector;
}
return bio;
}
@@ -286,6 +269,11 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
alloc_new:
if (bio == NULL) {
+ if (first_hole == blocks_per_page) {
+ if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9),
+ page))
+ goto out;
+ }
bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
GFP_KERNEL);
@@ -440,6 +428,35 @@ struct mpage_data {
unsigned use_writepage;
};
+/*
+ * We have our BIO, so we can now mark the buffers clean. Make
+ * sure to only clean buffers which we know we'll be writing.
+ */
+static void clean_buffers(struct page *page, unsigned first_unmapped)
+{
+ unsigned buffer_counter = 0;
+ struct buffer_head *bh, *head;
+ if (!page_has_buffers(page))
+ return;
+ head = page_buffers(page);
+ bh = head;
+
+ do {
+ if (buffer_counter++ == first_unmapped)
+ break;
+ clear_buffer_dirty(bh);
+ bh = bh->b_this_page;
+ } while (bh != head);
+
+ /*
+ * we cannot drop the bh if the page is not uptodate or a concurrent
+ * readpage would fail to serialize with the bh and it would read from
+ * disk before we reach the platter.
+ */
+ if (buffer_heads_over_limit && PageUptodate(page))
+ try_to_free_buffers(page);
+}
+
static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
void *data)
{
@@ -575,6 +592,13 @@ page_is_mapped:
alloc_new:
if (bio == NULL) {
+ if (first_unmapped == blocks_per_page) {
+ if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9),
+ page, wbc)) {
+ clean_buffers(page, first_unmapped);
+ goto out;
+ }
+ }
bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH);
if (bio == NULL)
@@ -592,30 +616,7 @@ alloc_new:
goto alloc_new;
}
- /*
- * OK, we have our BIO, so we can now mark the buffers clean. Make
- * sure to only clean buffers which we know we'll be writing.
- */
- if (page_has_buffers(page)) {
- struct buffer_head *head = page_buffers(page);
- struct buffer_head *bh = head;
- unsigned buffer_counter = 0;
-
- do {
- if (buffer_counter++ == first_unmapped)
- break;
- clear_buffer_dirty(bh);
- bh = bh->b_this_page;
- } while (bh != head);
-
- /*
- * we cannot drop the bh if the page is not uptodate
- * or a concurrent readpage would fail to serialize with the bh
- * and it would read from disk before we reach the platter.
- */
- if (buffer_heads_over_limit && PageUptodate(page))
- try_to_free_buffers(page);
- }
+ clean_buffers(page, first_unmapped);
BUG_ON(PageWriteback(page));
set_page_writeback(page);
diff --git a/fs/namei.c b/fs/namei.c
index 645268f23eb..9eb787e5c16 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -196,6 +196,7 @@ recopy:
goto error;
result->uptr = filename;
+ result->aname = NULL;
audit_getname(result);
return result;
@@ -209,7 +210,35 @@ getname(const char __user * filename)
{
return getname_flags(filename, 0, NULL);
}
-EXPORT_SYMBOL(getname);
+
+/*
+ * The "getname_kernel()" interface doesn't do pathnames longer
+ * than EMBEDDED_NAME_MAX. Deal with it - you're a kernel user.
+ */
+struct filename *
+getname_kernel(const char * filename)
+{
+ struct filename *result;
+ char *kname;
+ int len;
+
+ len = strlen(filename);
+ if (len >= EMBEDDED_NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ result = __getname();
+ if (unlikely(!result))
+ return ERR_PTR(-ENOMEM);
+
+ kname = (char *)result + sizeof(*result);
+ result->name = kname;
+ result->uptr = NULL;
+ result->aname = NULL;
+ result->separate = false;
+
+ strlcpy(kname, filename, EMBEDDED_NAME_MAX);
+ return result;
+}
#ifdef CONFIG_AUDITSYSCALL
void putname(struct filename *name)
@@ -235,27 +264,9 @@ static int check_acl(struct inode *inode, int mask)
return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
}
- acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
-
- /*
- * A filesystem can force a ACL callback by just never filling the
- * ACL cache. But normally you'd fill the cache either at inode
- * instantiation time, or on the first ->get_acl call.
- *
- * If the filesystem doesn't have a get_acl() function at all, we'll
- * just create the negative cache entry.
- */
- if (acl == ACL_NOT_CACHED) {
- if (inode->i_op->get_acl) {
- acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- } else {
- set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
- return -EAGAIN;
- }
- }
-
+ acl = get_acl(inode, ACL_TYPE_ACCESS);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
if (acl) {
int error = posix_acl_permission(inode, acl, mask);
posix_acl_release(acl);
@@ -321,10 +332,11 @@ int generic_permission(struct inode *inode, int mask)
if (S_ISDIR(inode->i_mode)) {
/* DACs are overridable for directories */
- if (inode_capable(inode, CAP_DAC_OVERRIDE))
+ if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
return 0;
if (!(mask & MAY_WRITE))
- if (inode_capable(inode, CAP_DAC_READ_SEARCH))
+ if (capable_wrt_inode_uidgid(inode,
+ CAP_DAC_READ_SEARCH))
return 0;
return -EACCES;
}
@@ -334,7 +346,7 @@ int generic_permission(struct inode *inode, int mask)
* at least one exec bit set.
*/
if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
- if (inode_capable(inode, CAP_DAC_OVERRIDE))
+ if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
return 0;
/*
@@ -342,11 +354,12 @@ int generic_permission(struct inode *inode, int mask)
*/
mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
if (mask == MAY_READ)
- if (inode_capable(inode, CAP_DAC_READ_SEARCH))
+ if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
return 0;
return -EACCES;
}
+EXPORT_SYMBOL(generic_permission);
/*
* We _really_ want to just do "generic_permission()" without
@@ -444,6 +457,7 @@ int inode_permission(struct inode *inode, int mask)
return retval;
return __inode_permission(inode, mask);
}
+EXPORT_SYMBOL(inode_permission);
/**
* path_get - get a reference to a path
@@ -482,18 +496,6 @@ EXPORT_SYMBOL(path_put);
* to restart the path walk from the beginning in ref-walk mode.
*/
-static inline void lock_rcu_walk(void)
-{
- br_read_lock(&vfsmount_lock);
- rcu_read_lock();
-}
-
-static inline void unlock_rcu_walk(void)
-{
- rcu_read_unlock();
- br_read_unlock(&vfsmount_lock);
-}
-
/**
* unlazy_walk - try to switch to ref-walk mode.
* @nd: nameidata pathwalk data
@@ -512,26 +514,22 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
BUG_ON(!(nd->flags & LOOKUP_RCU));
/*
- * Get a reference to the parent first: we're
- * going to make "path_put(nd->path)" valid in
- * non-RCU context for "terminate_walk()".
- *
- * If this doesn't work, return immediately with
- * RCU walking still active (and then we will do
- * the RCU walk cleanup in terminate_walk()).
+ * After legitimizing the bastards, terminate_walk()
+ * will do the right thing for non-RCU mode, and all our
+ * subsequent exit cases should rcu_read_unlock()
+ * before returning. Do vfsmount first; if dentry
+ * can't be legitimized, just set nd->path.dentry to NULL
+ * and rely on dput(NULL) being a no-op.
*/
- if (!lockref_get_not_dead(&parent->d_lockref))
+ if (!legitimize_mnt(nd->path.mnt, nd->m_seq))
return -ECHILD;
-
- /*
- * After the mntget(), we terminate_walk() will do
- * the right thing for non-RCU mode, and all our
- * subsequent exit cases should unlock_rcu_walk()
- * before returning.
- */
- mntget(nd->path.mnt);
nd->flags &= ~LOOKUP_RCU;
+ if (!lockref_get_not_dead(&parent->d_lockref)) {
+ nd->path.dentry = NULL;
+ goto out;
+ }
+
/*
* For a negative lookup, the lookup sequence point is the parents
* sequence point, and it only needs to revalidate the parent dentry.
@@ -566,17 +564,17 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
spin_unlock(&fs->lock);
}
- unlock_rcu_walk();
+ rcu_read_unlock();
return 0;
unlock_and_drop_dentry:
spin_unlock(&fs->lock);
drop_dentry:
- unlock_rcu_walk();
+ rcu_read_unlock();
dput(dentry);
goto drop_root_mnt;
out:
- unlock_rcu_walk();
+ rcu_read_unlock();
drop_root_mnt:
if (!(nd->flags & LOOKUP_ROOT))
nd->root.mnt = NULL;
@@ -608,17 +606,22 @@ static int complete_walk(struct nameidata *nd)
if (!(nd->flags & LOOKUP_ROOT))
nd->root.mnt = NULL;
+ if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) {
+ rcu_read_unlock();
+ return -ECHILD;
+ }
if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) {
- unlock_rcu_walk();
+ rcu_read_unlock();
+ mntput(nd->path.mnt);
return -ECHILD;
}
if (read_seqcount_retry(&dentry->d_seq, nd->seq)) {
- unlock_rcu_walk();
+ rcu_read_unlock();
dput(dentry);
+ mntput(nd->path.mnt);
return -ECHILD;
}
- mntget(nd->path.mnt);
- unlock_rcu_walk();
+ rcu_read_unlock();
}
if (likely(!(nd->flags & LOOKUP_JUMPED)))
@@ -909,21 +912,22 @@ int follow_up(struct path *path)
struct mount *parent;
struct dentry *mountpoint;
- br_read_lock(&vfsmount_lock);
+ read_seqlock_excl(&mount_lock);
parent = mnt->mnt_parent;
if (parent == mnt) {
- br_read_unlock(&vfsmount_lock);
+ read_sequnlock_excl(&mount_lock);
return 0;
}
mntget(&parent->mnt);
mountpoint = dget(mnt->mnt_mountpoint);
- br_read_unlock(&vfsmount_lock);
+ read_sequnlock_excl(&mount_lock);
dput(path->dentry);
path->dentry = mountpoint;
mntput(path->mnt);
path->mnt = &parent->mnt;
return 1;
}
+EXPORT_SYMBOL(follow_up);
/*
* Perform an automount
@@ -1048,8 +1052,8 @@ static int follow_managed(struct path *path, unsigned flags)
/* Something is mounted on this dentry in another
* namespace and/or whatever was mounted there in this
- * namespace got unmounted before we managed to get the
- * vfsmount_lock */
+ * namespace got unmounted before lookup_mnt() could
+ * get it */
}
/* Handle an automount point */
@@ -1085,6 +1089,7 @@ int follow_down_one(struct path *path)
}
return 0;
}
+EXPORT_SYMBOL(follow_down_one);
static inline bool managed_dentry_might_block(struct dentry *dentry)
{
@@ -1109,9 +1114,9 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
return false;
if (!d_mountpoint(path->dentry))
- break;
+ return true;
- mounted = __lookup_mnt(path->mnt, path->dentry, 1);
+ mounted = __lookup_mnt(path->mnt, path->dentry);
if (!mounted)
break;
path->mnt = &mounted->mnt;
@@ -1125,20 +1130,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
*/
*inode = path->dentry->d_inode;
}
- return true;
-}
-
-static void follow_mount_rcu(struct nameidata *nd)
-{
- while (d_mountpoint(nd->path.dentry)) {
- struct mount *mounted;
- mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
- if (!mounted)
- break;
- nd->path.mnt = &mounted->mnt;
- nd->path.dentry = mounted->mnt.mnt_root;
- nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
- }
+ return read_seqretry(&mount_lock, nd->m_seq);
}
static int follow_dotdot_rcu(struct nameidata *nd)
@@ -1166,7 +1158,17 @@ static int follow_dotdot_rcu(struct nameidata *nd)
break;
nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
}
- follow_mount_rcu(nd);
+ while (d_mountpoint(nd->path.dentry)) {
+ struct mount *mounted;
+ mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
+ if (!mounted)
+ break;
+ nd->path.mnt = &mounted->mnt;
+ nd->path.dentry = mounted->mnt.mnt_root;
+ nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
+ if (!read_seqretry(&mount_lock, nd->m_seq))
+ goto failed;
+ }
nd->inode = nd->path.dentry->d_inode;
return 0;
@@ -1174,7 +1176,7 @@ failed:
nd->flags &= ~LOOKUP_RCU;
if (!(nd->flags & LOOKUP_ROOT))
nd->root.mnt = NULL;
- unlock_rcu_walk();
+ rcu_read_unlock();
return -ECHILD;
}
@@ -1226,6 +1228,7 @@ int follow_down(struct path *path)
}
return 0;
}
+EXPORT_SYMBOL(follow_down);
/*
* Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
@@ -1308,8 +1311,8 @@ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
}
/*
- * Call i_op->lookup on the dentry. The dentry must be negative but may be
- * hashed if it was pouplated with DCACHE_NEED_LOOKUP.
+ * Call i_op->lookup on the dentry. The dentry must be negative and
+ * unhashed.
*
* dir->d_inode->i_mutex must be held
*/
@@ -1501,7 +1504,7 @@ static void terminate_walk(struct nameidata *nd)
nd->flags &= ~LOOKUP_RCU;
if (!(nd->flags & LOOKUP_ROOT))
nd->root.mnt = NULL;
- unlock_rcu_walk();
+ rcu_read_unlock();
}
}
@@ -1511,18 +1514,9 @@ static void terminate_walk(struct nameidata *nd)
* so we keep a cache of "no, this doesn't need follow_link"
* for the common case.
*/
-static inline int should_follow_link(struct inode *inode, int follow)
+static inline int should_follow_link(struct dentry *dentry, int follow)
{
- if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
- if (likely(inode->i_op->follow_link))
- return follow;
-
- /* This gets set once for the inode lifetime */
- spin_lock(&inode->i_lock);
- inode->i_opflags |= IOP_NOFOLLOW;
- spin_unlock(&inode->i_lock);
- }
- return 0;
+ return unlikely(d_is_symlink(dentry)) ? follow : 0;
}
static inline int walk_component(struct nameidata *nd, struct path *path,
@@ -1549,10 +1543,10 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
inode = path->dentry->d_inode;
}
err = -ENOENT;
- if (!inode)
+ if (!inode || d_is_negative(path->dentry))
goto out_path_put;
- if (should_follow_link(inode, follow)) {
+ if (should_follow_link(path->dentry, follow)) {
if (nd->flags & LOOKUP_RCU) {
if (unlikely(unlazy_walk(nd, path->dentry))) {
err = -ECHILD;
@@ -1611,26 +1605,6 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
}
/*
- * We really don't want to look at inode->i_op->lookup
- * when we don't have to. So we keep a cache bit in
- * the inode ->i_opflags field that says "yes, we can
- * do lookup on this inode".
- */
-static inline int can_lookup(struct inode *inode)
-{
- if (likely(inode->i_opflags & IOP_LOOKUP))
- return 1;
- if (likely(!inode->i_op->lookup))
- return 0;
-
- /* We do this once for the lifetime of the inode */
- spin_lock(&inode->i_lock);
- inode->i_opflags |= IOP_LOOKUP;
- spin_unlock(&inode->i_lock);
- return 1;
-}
-
-/*
* We can do the critical dentry name comparison and hashing
* operations one word at a time, but we are limited to:
*
@@ -1638,11 +1612,6 @@ static inline int can_lookup(struct inode *inode)
* do a "get_unaligned()" if this helps and is sufficiently
* fast.
*
- * - Little-endian machines (so that we can generate the mask
- * of low bytes efficiently). Again, we *could* do a byte
- * swapping load on big-endian architectures if that is not
- * expensive enough to make the optimization worthless.
- *
* - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
* do not trap on the (extremely unlikely) case of a page
* crossing operation.
@@ -1686,7 +1655,7 @@ unsigned int full_name_hash(const unsigned char *name, unsigned int len)
if (!len)
goto done;
}
- mask = ~(~0ul << len*8);
+ mask = bytemask_from_count(len);
hash += mask & a;
done:
return fold_hash(hash);
@@ -1833,7 +1802,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
if (err)
return err;
}
- if (!can_lookup(nd->inode)) {
+ if (!d_can_lookup(nd->path.dentry)) {
err = -ENOTDIR;
break;
}
@@ -1851,9 +1820,10 @@ static int path_init(int dfd, const char *name, unsigned int flags,
nd->flags = flags | LOOKUP_JUMPED;
nd->depth = 0;
if (flags & LOOKUP_ROOT) {
- struct inode *inode = nd->root.dentry->d_inode;
+ struct dentry *root = nd->root.dentry;
+ struct inode *inode = root->d_inode;
if (*name) {
- if (!can_lookup(inode))
+ if (!d_can_lookup(root))
return -ENOTDIR;
retval = inode_permission(inode, MAY_EXEC);
if (retval)
@@ -1862,8 +1832,9 @@ static int path_init(int dfd, const char *name, unsigned int flags,
nd->path = nd->root;
nd->inode = inode;
if (flags & LOOKUP_RCU) {
- lock_rcu_walk();
+ rcu_read_lock();
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+ nd->m_seq = read_seqbegin(&mount_lock);
} else {
path_get(&nd->path);
}
@@ -1872,9 +1843,10 @@ static int path_init(int dfd, const char *name, unsigned int flags,
nd->root.mnt = NULL;
+ nd->m_seq = read_seqbegin(&mount_lock);
if (*name=='/') {
if (flags & LOOKUP_RCU) {
- lock_rcu_walk();
+ rcu_read_lock();
set_root_rcu(nd);
} else {
set_root(nd);
@@ -1886,7 +1858,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
struct fs_struct *fs = current->fs;
unsigned seq;
- lock_rcu_walk();
+ rcu_read_lock();
do {
seq = read_seqcount_begin(&fs->seq);
@@ -1907,7 +1879,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
dentry = f.file->f_path.dentry;
if (*name) {
- if (!can_lookup(dentry->d_inode)) {
+ if (!d_can_lookup(dentry)) {
fdput(f);
return -ENOTDIR;
}
@@ -1915,10 +1887,10 @@ static int path_init(int dfd, const char *name, unsigned int flags,
nd->path = f.file->f_path;
if (flags & LOOKUP_RCU) {
- if (f.need_put)
+ if (f.flags & FDPUT_FPUT)
*fp = f.file;
nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
- lock_rcu_walk();
+ rcu_read_lock();
} else {
path_get(&nd->path);
fdput(f);
@@ -1989,7 +1961,7 @@ static int path_lookupat(int dfd, const char *name,
err = complete_walk(nd);
if (!err && nd->flags & LOOKUP_DIRECTORY) {
- if (!can_lookup(nd->inode)) {
+ if (!d_can_lookup(nd->path.dentry)) {
path_put(&nd->path);
err = -ENOTDIR;
}
@@ -2059,6 +2031,7 @@ int kern_path(const char *name, unsigned int flags, struct path *path)
*path = nd.path;
return res;
}
+EXPORT_SYMBOL(kern_path);
/**
* vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
@@ -2083,6 +2056,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
*path = nd.path;
return err;
}
+EXPORT_SYMBOL(vfs_path_lookup);
/*
* Restricted form of lookup. Doesn't follow links, single-component only,
@@ -2145,6 +2119,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
return __lookup_hash(&this, base, 0);
}
+EXPORT_SYMBOL(lookup_one_len);
int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
struct path *path, int *empty)
@@ -2169,6 +2144,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
{
return user_path_at_empty(dfd, name, flags, path, NULL);
}
+EXPORT_SYMBOL(user_path_at);
/*
* NB: most callers don't do anything directly with the reference to the
@@ -2274,15 +2250,16 @@ mountpoint_last(struct nameidata *nd, struct path *path)
mutex_unlock(&dir->d_inode->i_mutex);
done:
- if (!dentry->d_inode) {
+ if (!dentry->d_inode || d_is_negative(dentry)) {
error = -ENOENT;
dput(dentry);
goto out;
}
path->dentry = dentry;
- path->mnt = mntget(nd->path.mnt);
- if (should_follow_link(dentry->d_inode, nd->flags & LOOKUP_FOLLOW))
+ path->mnt = nd->path.mnt;
+ if (should_follow_link(dentry, nd->flags & LOOKUP_FOLLOW))
return 1;
+ mntget(path->mnt);
follow_mount(path);
error = 0;
out:
@@ -2294,10 +2271,11 @@ out:
* path_mountpoint - look up a path to be umounted
* @dfd: directory file descriptor to start walk from
* @name: full pathname to walk
+ * @path: pointer to container for result
* @flags: lookup flags
*
* Look up the given name, but don't attempt to revalidate the last component.
- * Returns 0 and "path" will be valid on success; Retuns error otherwise.
+ * Returns 0 and "path" will be valid on success; Returns error otherwise.
*/
static int
path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags)
@@ -2403,7 +2381,7 @@ static inline int check_sticky(struct inode *dir, struct inode *inode)
return 0;
if (uid_eq(dir->i_uid, fsuid))
return 0;
- return !inode_capable(inode, CAP_FOWNER);
+ return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
}
/*
@@ -2425,12 +2403,14 @@ static inline int check_sticky(struct inode *dir, struct inode *inode)
* 10. We don't allow removal of NFS sillyrenamed files; it's handled by
* nfs_async_unlink().
*/
-static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
+static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
{
+ struct inode *inode = victim->d_inode;
int error;
- if (!victim->d_inode)
+ if (d_is_negative(victim))
return -ENOENT;
+ BUG_ON(!inode);
BUG_ON(victim->d_parent->d_inode != dir);
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
@@ -2440,15 +2420,16 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
return error;
if (IS_APPEND(dir))
return -EPERM;
- if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||
- IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
+
+ if (check_sticky(dir, inode) || IS_APPEND(inode) ||
+ IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
return -EPERM;
if (isdir) {
- if (!S_ISDIR(victim->d_inode->i_mode))
+ if (!d_is_dir(victim))
return -ENOTDIR;
if (IS_ROOT(victim))
return -EBUSY;
- } else if (S_ISDIR(victim->d_inode->i_mode))
+ } else if (d_is_dir(victim))
return -EISDIR;
if (IS_DEADDIR(dir))
return -ENOENT;
@@ -2467,6 +2448,7 @@ static int may_delete(struct inode *dir,struct dentry *victim,int isdir)
*/
static inline int may_create(struct inode *dir, struct dentry *child)
{
+ audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
if (child->d_inode)
return -EEXIST;
if (IS_DEADDIR(dir))
@@ -2506,6 +2488,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
return NULL;
}
+EXPORT_SYMBOL(lock_rename);
void unlock_rename(struct dentry *p1, struct dentry *p2)
{
@@ -2515,6 +2498,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
}
}
+EXPORT_SYMBOL(unlock_rename);
int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool want_excl)
@@ -2535,6 +2519,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
fsnotify_create(dir, dentry);
return error;
}
+EXPORT_SYMBOL(vfs_create);
static int may_open(struct path *path, int acc_mode, int flag)
{
@@ -2598,7 +2583,7 @@ static int handle_truncate(struct file *filp)
/*
* Refuse to truncate files with mandatory locks held on them.
*/
- error = locks_verify_locked(inode);
+ error = locks_verify_locked(filp);
if (!error)
error = security_path_truncate(path);
if (!error) {
@@ -2982,7 +2967,7 @@ retry_lookup:
/*
* create/update audit record if it already exists.
*/
- if (path->dentry->d_inode)
+ if (d_is_positive(path->dentry))
audit_inode(name, path->dentry, 0);
/*
@@ -3011,12 +2996,12 @@ retry_lookup:
finish_lookup:
/* we _can_ be in RCU mode here */
error = -ENOENT;
- if (!inode) {
+ if (!inode || d_is_negative(path->dentry)) {
path_to_nameidata(path, nd);
goto out;
}
- if (should_follow_link(inode, !symlink_ok)) {
+ if (should_follow_link(path->dentry, !symlink_ok)) {
if (nd->flags & LOOKUP_RCU) {
if (unlikely(unlazy_walk(nd, path->dentry))) {
error = -ECHILD;
@@ -3045,10 +3030,10 @@ finish_open:
}
audit_inode(name, nd->path.dentry, 0);
error = -EISDIR;
- if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode))
+ if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry))
goto out;
error = -ENOTDIR;
- if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode))
+ if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
goto out;
if (!S_ISREG(nd->inode->i_mode))
will_truncate = false;
@@ -3274,7 +3259,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
nd.root.mnt = mnt;
nd.root.dentry = dentry;
- if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
+ if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
return ERR_PTR(-ELOOP);
file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU);
@@ -3324,8 +3309,9 @@ struct dentry *kern_path_create(int dfd, const char *pathname,
goto unlock;
error = -EEXIST;
- if (dentry->d_inode)
+ if (d_is_positive(dentry))
goto fail;
+
/*
* Special case - lookup gave negative, but... we had foo/bar/
* From the vfs_mknod() POV we just have a negative dentry -
@@ -3403,6 +3389,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
fsnotify_create(dir, dentry);
return error;
}
+EXPORT_SYMBOL(vfs_mknod);
static int may_mknod(umode_t mode)
{
@@ -3492,6 +3479,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
fsnotify_mkdir(dir, dentry);
return error;
}
+EXPORT_SYMBOL(vfs_mkdir);
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
@@ -3546,6 +3534,7 @@ void dentry_unhash(struct dentry *dentry)
__d_drop(dentry);
spin_unlock(&dentry->d_lock);
}
+EXPORT_SYMBOL(dentry_unhash);
int vfs_rmdir(struct inode *dir, struct dentry *dentry)
{
@@ -3583,6 +3572,7 @@ out:
d_delete(dentry);
return error;
}
+EXPORT_SYMBOL(vfs_rmdir);
static long do_rmdir(int dfd, const char __user *pathname)
{
@@ -3646,8 +3636,27 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
return do_rmdir(AT_FDCWD, pathname);
}
-int vfs_unlink(struct inode *dir, struct dentry *dentry)
+/**
+ * vfs_unlink - unlink a filesystem object
+ * @dir: parent directory
+ * @dentry: victim
+ * @delegated_inode: returns victim inode, if the inode is delegated.
+ *
+ * The caller must hold dir->i_mutex.
+ *
+ * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
+ * return a reference to the inode in delegated_inode. The caller
+ * should then break the delegation on that inode and retry. Because
+ * breaking a delegation may take a long time, the caller should drop
+ * dir->i_mutex before doing so.
+ *
+ * Alternatively, a caller may pass NULL for delegated_inode. This may
+ * be appropriate for callers that expect the underlying filesystem not
+ * to be NFS exported.
+ */
+int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
{
+ struct inode *target = dentry->d_inode;
int error = may_delete(dir, dentry, 0);
if (error)
@@ -3656,27 +3665,32 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
if (!dir->i_op->unlink)
return -EPERM;
- mutex_lock(&dentry->d_inode->i_mutex);
+ mutex_lock(&target->i_mutex);
if (d_mountpoint(dentry))
error = -EBUSY;
else {
error = security_inode_unlink(dir, dentry);
if (!error) {
+ error = try_break_deleg(target, delegated_inode);
+ if (error)
+ goto out;
error = dir->i_op->unlink(dir, dentry);
if (!error)
dont_mount(dentry);
}
}
- mutex_unlock(&dentry->d_inode->i_mutex);
+out:
+ mutex_unlock(&target->i_mutex);
/* We don't d_delete() NFS sillyrenamed files--they still exist. */
if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
- fsnotify_link_count(dentry->d_inode);
+ fsnotify_link_count(target);
d_delete(dentry);
}
return error;
}
+EXPORT_SYMBOL(vfs_unlink);
/*
* Make sure that the actual truncation of the file will occur outside its
@@ -3691,6 +3705,7 @@ static long do_unlinkat(int dfd, const char __user *pathname)
struct dentry *dentry;
struct nameidata nd;
struct inode *inode = NULL;
+ struct inode *delegated_inode = NULL;
unsigned int lookup_flags = 0;
retry:
name = user_path_parent(dfd, pathname, &nd, lookup_flags);
@@ -3705,7 +3720,7 @@ retry:
error = mnt_want_write(nd.path.mnt);
if (error)
goto exit1;
-
+retry_deleg:
mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
dentry = lookup_hash(&nd);
error = PTR_ERR(dentry);
@@ -3714,19 +3729,25 @@ retry:
if (nd.last.name[nd.last.len])
goto slashes;
inode = dentry->d_inode;
- if (!inode)
+ if (d_is_negative(dentry))
goto slashes;
ihold(inode);
error = security_path_unlink(&nd.path, dentry);
if (error)
goto exit2;
- error = vfs_unlink(nd.path.dentry->d_inode, dentry);
+ error = vfs_unlink(nd.path.dentry->d_inode, dentry, &delegated_inode);
exit2:
dput(dentry);
}
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
if (inode)
iput(inode); /* truncate the inode here */
+ inode = NULL;
+ if (delegated_inode) {
+ error = break_deleg_wait(&delegated_inode);
+ if (!error)
+ goto retry_deleg;
+ }
mnt_drop_write(nd.path.mnt);
exit1:
path_put(&nd.path);
@@ -3739,8 +3760,12 @@ exit1:
return error;
slashes:
- error = !dentry->d_inode ? -ENOENT :
- S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
+ if (d_is_negative(dentry))
+ error = -ENOENT;
+ else if (d_is_dir(dentry))
+ error = -EISDIR;
+ else
+ error = -ENOTDIR;
goto exit2;
}
@@ -3779,6 +3804,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
fsnotify_create(dir, dentry);
return error;
}
+EXPORT_SYMBOL(vfs_symlink);
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
int, newdfd, const char __user *, newname)
@@ -3816,7 +3842,26 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
return sys_symlinkat(oldname, AT_FDCWD, newname);
}
-int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
+/**
+ * vfs_link - create a new link
+ * @old_dentry: object to be linked
+ * @dir: new parent
+ * @new_dentry: where to create the new link
+ * @delegated_inode: returns inode needing a delegation break
+ *
+ * The caller must hold dir->i_mutex
+ *
+ * If vfs_link discovers a delegation on the to-be-linked file in need
+ * of breaking, it will return -EWOULDBLOCK and return a reference to the
+ * inode in delegated_inode. The caller should then break the delegation
+ * and retry. Because breaking a delegation may take a long time, the
+ * caller should drop the i_mutex before doing so.
+ *
+ * Alternatively, a caller may pass NULL for delegated_inode. This may
+ * be appropriate for callers that expect the underlying filesystem not
+ * to be NFS exported.
+ */
+int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
{
struct inode *inode = old_dentry->d_inode;
unsigned max_links = dir->i_sb->s_max_links;
@@ -3852,8 +3897,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
error = -ENOENT;
else if (max_links && inode->i_nlink >= max_links)
error = -EMLINK;
- else
- error = dir->i_op->link(old_dentry, dir, new_dentry);
+ else {
+ error = try_break_deleg(inode, delegated_inode);
+ if (!error)
+ error = dir->i_op->link(old_dentry, dir, new_dentry);
+ }
if (!error && (inode->i_state & I_LINKABLE)) {
spin_lock(&inode->i_lock);
@@ -3865,6 +3913,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
fsnotify_link(dir, inode, new_dentry);
return error;
}
+EXPORT_SYMBOL(vfs_link);
/*
* Hardlinks are often used in delicate situations. We avoid
@@ -3880,6 +3929,7 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
{
struct dentry *new_dentry;
struct path old_path, new_path;
+ struct inode *delegated_inode = NULL;
int how = 0;
int error;
@@ -3918,10 +3968,18 @@ retry:
error = security_path_link(old_path.dentry, &new_path, new_dentry);
if (error)
goto out_dput;
- error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry);
+ error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
out_dput:
done_path_create(&new_path, new_dentry);
+ if (delegated_inode) {
+ error = break_deleg_wait(&delegated_inode);
+ if (!error) {
+ path_put(&old_path);
+ goto retry;
+ }
+ }
if (retry_estale(error, how)) {
+ path_put(&old_path);
how |= LOOKUP_REVAL;
goto retry;
}
@@ -3936,7 +3994,28 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
}
-/*
+/**
+ * vfs_rename - rename a filesystem object
+ * @old_dir: parent of source
+ * @old_dentry: source
+ * @new_dir: parent of destination
+ * @new_dentry: destination
+ * @delegated_inode: returns an inode needing a delegation break
+ * @flags: rename flags
+ *
+ * The caller must hold multiple mutexes--see lock_rename()).
+ *
+ * If vfs_rename discovers a delegation in need of breaking at either
+ * the source or destination, it will return -EWOULDBLOCK and return a
+ * reference to the inode in delegated_inode. The caller should then
+ * break the delegation and retry. Because breaking a delegation may
+ * take a long time, the caller should drop all locks before doing
+ * so.
+ *
+ * Alternatively, a caller may pass NULL for delegated_inode. This may
+ * be appropriate for callers that expect the underlying filesystem not
+ * to be NFS exported.
+ *
* The worst of all namespace operations - renaming directory. "Perverted"
* doesn't even start to describe it. Somebody in UCB had a heck of a trip...
* Problems:
@@ -3945,7 +4024,8 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
* That's where 4.4 screws up. Current fix: serialization on
* sb->s_vfs_rename_mutex. We might be more accurate, but that's another
* story.
- * c) we have to lock _three_ objects - parents and victim (if it exists).
+ * c) we have to lock _four_ objects - parents and victim (if it exists),
+ * and source (if it is not a directory).
* And that - after we got ->i_mutex on parents (until then we don't know
* whether the target exists). Solution: try to be smart with locking
* order for inodes. We rely on the fact that tree topology may change
@@ -3963,143 +4043,158 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
* ->i_mutex on parents, which works but leads to some truly excessive
* locking].
*/
-static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
+int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ struct inode **delegated_inode, unsigned int flags)
{
- int error = 0;
+ int error;
+ bool is_dir = d_is_dir(old_dentry);
+ const unsigned char *old_name;
+ struct inode *source = old_dentry->d_inode;
struct inode *target = new_dentry->d_inode;
+ bool new_is_dir = false;
unsigned max_links = new_dir->i_sb->s_max_links;
- /*
- * If we are going to change the parent - check write permissions,
- * we'll need to flip '..'.
- */
- if (new_dir != old_dir) {
- error = inode_permission(old_dentry->d_inode, MAY_WRITE);
- if (error)
- return error;
- }
+ if (source == target)
+ return 0;
- error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
+ error = may_delete(old_dir, old_dentry, is_dir);
if (error)
return error;
- dget(new_dentry);
- if (target)
- mutex_lock(&target->i_mutex);
+ if (!target) {
+ error = may_create(new_dir, new_dentry);
+ } else {
+ new_is_dir = d_is_dir(new_dentry);
- error = -EBUSY;
- if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
- goto out;
+ if (!(flags & RENAME_EXCHANGE))
+ error = may_delete(new_dir, new_dentry, is_dir);
+ else
+ error = may_delete(new_dir, new_dentry, new_is_dir);
+ }
+ if (error)
+ return error;
- error = -EMLINK;
- if (max_links && !target && new_dir != old_dir &&
- new_dir->i_nlink >= max_links)
- goto out;
+ if (!old_dir->i_op->rename)
+ return -EPERM;
- if (target)
- shrink_dcache_parent(new_dentry);
- error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
- if (error)
- goto out;
+ if (flags && !old_dir->i_op->rename2)
+ return -EINVAL;
- if (target) {
- target->i_flags |= S_DEAD;
- dont_mount(new_dentry);
+ /*
+ * If we are going to change the parent - check write permissions,
+ * we'll need to flip '..'.
+ */
+ if (new_dir != old_dir) {
+ if (is_dir) {
+ error = inode_permission(source, MAY_WRITE);
+ if (error)
+ return error;
+ }
+ if ((flags & RENAME_EXCHANGE) && new_is_dir) {
+ error = inode_permission(target, MAY_WRITE);
+ if (error)
+ return error;
+ }
}
-out:
- if (target)
- mutex_unlock(&target->i_mutex);
- dput(new_dentry);
- if (!error)
- if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
- d_move(old_dentry,new_dentry);
- return error;
-}
-static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
-{
- struct inode *target = new_dentry->d_inode;
- int error;
-
- error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
+ error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
+ flags);
if (error)
return error;
+ old_name = fsnotify_oldname_init(old_dentry->d_name.name);
dget(new_dentry);
- if (target)
+ if (!is_dir || (flags & RENAME_EXCHANGE))
+ lock_two_nondirectories(source, target);
+ else if (target)
mutex_lock(&target->i_mutex);
error = -EBUSY;
- if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
+ if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry))
goto out;
- error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
+ if (max_links && new_dir != old_dir) {
+ error = -EMLINK;
+ if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
+ goto out;
+ if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
+ old_dir->i_nlink >= max_links)
+ goto out;
+ }
+ if (is_dir && !(flags & RENAME_EXCHANGE) && target)
+ shrink_dcache_parent(new_dentry);
+ if (!is_dir) {
+ error = try_break_deleg(source, delegated_inode);
+ if (error)
+ goto out;
+ }
+ if (target && !new_is_dir) {
+ error = try_break_deleg(target, delegated_inode);
+ if (error)
+ goto out;
+ }
+ if (!flags) {
+ error = old_dir->i_op->rename(old_dir, old_dentry,
+ new_dir, new_dentry);
+ } else {
+ error = old_dir->i_op->rename2(old_dir, old_dentry,
+ new_dir, new_dentry, flags);
+ }
if (error)
goto out;
- if (target)
+ if (!(flags & RENAME_EXCHANGE) && target) {
+ if (is_dir)
+ target->i_flags |= S_DEAD;
dont_mount(new_dentry);
- if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
- d_move(old_dentry, new_dentry);
+ }
+ if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
+ if (!(flags & RENAME_EXCHANGE))
+ d_move(old_dentry, new_dentry);
+ else
+ d_exchange(old_dentry, new_dentry);
+ }
out:
- if (target)
+ if (!is_dir || (flags & RENAME_EXCHANGE))
+ unlock_two_nondirectories(source, target);
+ else if (target)
mutex_unlock(&target->i_mutex);
dput(new_dentry);
- return error;
-}
-
-int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry)
-{
- int error;
- int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
- const unsigned char *old_name;
-
- if (old_dentry->d_inode == new_dentry->d_inode)
- return 0;
-
- error = may_delete(old_dir, old_dentry, is_dir);
- if (error)
- return error;
-
- if (!new_dentry->d_inode)
- error = may_create(new_dir, new_dentry);
- else
- error = may_delete(new_dir, new_dentry, is_dir);
- if (error)
- return error;
-
- if (!old_dir->i_op->rename)
- return -EPERM;
-
- old_name = fsnotify_oldname_init(old_dentry->d_name.name);
-
- if (is_dir)
- error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
- else
- error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
- if (!error)
+ if (!error) {
fsnotify_move(old_dir, new_dir, old_name, is_dir,
- new_dentry->d_inode, old_dentry);
+ !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
+ if (flags & RENAME_EXCHANGE) {
+ fsnotify_move(new_dir, old_dir, old_dentry->d_name.name,
+ new_is_dir, NULL, new_dentry);
+ }
+ }
fsnotify_oldname_free(old_name);
return error;
}
+EXPORT_SYMBOL(vfs_rename);
-SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
- int, newdfd, const char __user *, newname)
+SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
+ int, newdfd, const char __user *, newname, unsigned int, flags)
{
struct dentry *old_dir, *new_dir;
struct dentry *old_dentry, *new_dentry;
struct dentry *trap;
struct nameidata oldnd, newnd;
+ struct inode *delegated_inode = NULL;
struct filename *from;
struct filename *to;
unsigned int lookup_flags = 0;
bool should_retry = false;
int error;
+
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ return -EINVAL;
+
+ if ((flags & RENAME_NOREPLACE) && (flags & RENAME_EXCHANGE))
+ return -EINVAL;
+
retry:
from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags);
if (IS_ERR(from)) {
@@ -4123,6 +4218,8 @@ retry:
goto exit2;
new_dir = newnd.path.dentry;
+ if (flags & RENAME_NOREPLACE)
+ error = -EEXIST;
if (newnd.last_type != LAST_NORM)
goto exit2;
@@ -4132,8 +4229,10 @@ retry:
oldnd.flags &= ~LOOKUP_PARENT;
newnd.flags &= ~LOOKUP_PARENT;
- newnd.flags |= LOOKUP_RENAME_TARGET;
+ if (!(flags & RENAME_EXCHANGE))
+ newnd.flags |= LOOKUP_RENAME_TARGET;
+retry_deleg:
trap = lock_rename(new_dir, old_dir);
old_dentry = lookup_hash(&oldnd);
@@ -4142,41 +4241,62 @@ retry:
goto exit3;
/* source must exist */
error = -ENOENT;
- if (!old_dentry->d_inode)
+ if (d_is_negative(old_dentry))
+ goto exit4;
+ new_dentry = lookup_hash(&newnd);
+ error = PTR_ERR(new_dentry);
+ if (IS_ERR(new_dentry))
goto exit4;
+ error = -EEXIST;
+ if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
+ goto exit5;
+ if (flags & RENAME_EXCHANGE) {
+ error = -ENOENT;
+ if (d_is_negative(new_dentry))
+ goto exit5;
+
+ if (!d_is_dir(new_dentry)) {
+ error = -ENOTDIR;
+ if (newnd.last.name[newnd.last.len])
+ goto exit5;
+ }
+ }
/* unless the source is a directory trailing slashes give -ENOTDIR */
- if (!S_ISDIR(old_dentry->d_inode->i_mode)) {
+ if (!d_is_dir(old_dentry)) {
error = -ENOTDIR;
if (oldnd.last.name[oldnd.last.len])
- goto exit4;
- if (newnd.last.name[newnd.last.len])
- goto exit4;
+ goto exit5;
+ if (!(flags & RENAME_EXCHANGE) && newnd.last.name[newnd.last.len])
+ goto exit5;
}
/* source should not be ancestor of target */
error = -EINVAL;
if (old_dentry == trap)
- goto exit4;
- new_dentry = lookup_hash(&newnd);
- error = PTR_ERR(new_dentry);
- if (IS_ERR(new_dentry))
- goto exit4;
+ goto exit5;
/* target should not be an ancestor of source */
- error = -ENOTEMPTY;
+ if (!(flags & RENAME_EXCHANGE))
+ error = -ENOTEMPTY;
if (new_dentry == trap)
goto exit5;
error = security_path_rename(&oldnd.path, old_dentry,
- &newnd.path, new_dentry);
+ &newnd.path, new_dentry, flags);
if (error)
goto exit5;
error = vfs_rename(old_dir->d_inode, old_dentry,
- new_dir->d_inode, new_dentry);
+ new_dir->d_inode, new_dentry,
+ &delegated_inode, flags);
exit5:
dput(new_dentry);
exit4:
dput(old_dentry);
exit3:
unlock_rename(new_dir, old_dir);
+ if (delegated_inode) {
+ error = break_deleg_wait(&delegated_inode);
+ if (!error)
+ goto retry_deleg;
+ }
mnt_drop_write(oldnd.path.mnt);
exit2:
if (retry_estale(error, lookup_flags))
@@ -4195,16 +4315,20 @@ exit:
return error;
}
-SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
+SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
+ int, newdfd, const char __user *, newname)
{
- return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
+ return sys_renameat2(olddfd, oldname, newdfd, newname, 0);
}
-int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link)
+SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
- int len;
+ return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
+}
- len = PTR_ERR(link);
+int readlink_copy(char __user *buffer, int buflen, const char *link)
+{
+ int len = PTR_ERR(link);
if (IS_ERR(link))
goto out;
@@ -4216,6 +4340,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const c
out:
return len;
}
+EXPORT_SYMBOL(readlink_copy);
/*
* A helper for ->readlink(). This should be used *ONLY* for symlinks that
@@ -4233,11 +4358,12 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
if (IS_ERR(cookie))
return PTR_ERR(cookie);
- res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
+ res = readlink_copy(buffer, buflen, nd_get_link(&nd));
if (dentry->d_inode->i_op->put_link)
dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
return res;
}
+EXPORT_SYMBOL(generic_readlink);
/* get the link contents into pagecache */
static char *page_getlink(struct dentry * dentry, struct page **ppage)
@@ -4257,14 +4383,14 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage)
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
struct page *page = NULL;
- char *s = page_getlink(dentry, &page);
- int res = vfs_readlink(dentry,buffer,buflen,s);
+ int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page));
if (page) {
kunmap(page);
page_cache_release(page);
}
return res;
}
+EXPORT_SYMBOL(page_readlink);
void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
{
@@ -4272,6 +4398,7 @@ void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
nd_set_link(nd, page_getlink(dentry, &page));
return page;
}
+EXPORT_SYMBOL(page_follow_link_light);
void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
{
@@ -4282,6 +4409,7 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
page_cache_release(page);
}
}
+EXPORT_SYMBOL(page_put_link);
/*
* The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
@@ -4319,45 +4447,18 @@ retry:
fail:
return err;
}
+EXPORT_SYMBOL(__page_symlink);
int page_symlink(struct inode *inode, const char *symname, int len)
{
return __page_symlink(inode, symname, len,
!(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
}
+EXPORT_SYMBOL(page_symlink);
const struct inode_operations page_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = page_follow_link_light,
.put_link = page_put_link,
};
-
-EXPORT_SYMBOL(user_path_at);
-EXPORT_SYMBOL(follow_down_one);
-EXPORT_SYMBOL(follow_down);
-EXPORT_SYMBOL(follow_up);
-EXPORT_SYMBOL(get_write_access); /* nfsd */
-EXPORT_SYMBOL(lock_rename);
-EXPORT_SYMBOL(lookup_one_len);
-EXPORT_SYMBOL(page_follow_link_light);
-EXPORT_SYMBOL(page_put_link);
-EXPORT_SYMBOL(page_readlink);
-EXPORT_SYMBOL(__page_symlink);
-EXPORT_SYMBOL(page_symlink);
EXPORT_SYMBOL(page_symlink_inode_operations);
-EXPORT_SYMBOL(kern_path);
-EXPORT_SYMBOL(vfs_path_lookup);
-EXPORT_SYMBOL(inode_permission);
-EXPORT_SYMBOL(unlock_rename);
-EXPORT_SYMBOL(vfs_create);
-EXPORT_SYMBOL(vfs_link);
-EXPORT_SYMBOL(vfs_mkdir);
-EXPORT_SYMBOL(vfs_mknod);
-EXPORT_SYMBOL(generic_permission);
-EXPORT_SYMBOL(vfs_readlink);
-EXPORT_SYMBOL(vfs_rename);
-EXPORT_SYMBOL(vfs_rmdir);
-EXPORT_SYMBOL(vfs_symlink);
-EXPORT_SYMBOL(vfs_unlink);
-EXPORT_SYMBOL(dentry_unhash);
-EXPORT_SYMBOL(generic_readlink);
diff --git a/fs/namespace.c b/fs/namespace.c
index 5918fc31a63..182bc41cd88 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -23,23 +23,46 @@
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
+#include <linux/bootmem.h>
#include "pnode.h"
#include "internal.h"
-#define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
-#define HASH_SIZE (1UL << HASH_SHIFT)
+static unsigned int m_hash_mask __read_mostly;
+static unsigned int m_hash_shift __read_mostly;
+static unsigned int mp_hash_mask __read_mostly;
+static unsigned int mp_hash_shift __read_mostly;
-static int event;
+static __initdata unsigned long mhash_entries;
+static int __init set_mhash_entries(char *str)
+{
+ if (!str)
+ return 0;
+ mhash_entries = simple_strtoul(str, &str, 0);
+ return 1;
+}
+__setup("mhash_entries=", set_mhash_entries);
+
+static __initdata unsigned long mphash_entries;
+static int __init set_mphash_entries(char *str)
+{
+ if (!str)
+ return 0;
+ mphash_entries = simple_strtoul(str, &str, 0);
+ return 1;
+}
+__setup("mphash_entries=", set_mphash_entries);
+
+static u64 event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
static DEFINE_SPINLOCK(mnt_id_lock);
static int mnt_id_start = 0;
static int mnt_group_start = 1;
-static struct list_head *mount_hashtable __read_mostly;
-static struct list_head *mountpoint_hashtable __read_mostly;
+static struct hlist_head *mount_hashtable __read_mostly;
+static struct hlist_head *mountpoint_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
-static struct rw_semaphore namespace_sem;
+static DECLARE_RWSEM(namespace_sem);
/* /sys/fs */
struct kobject *fs_kobj;
@@ -53,17 +76,22 @@ EXPORT_SYMBOL_GPL(fs_kobj);
* It should be taken for write in all cases where the vfsmount
* tree or hash is modified or when a vfsmount structure is modified.
*/
-DEFINE_BRLOCK(vfsmount_lock);
+__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
-static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
+static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
{
unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
- tmp = tmp + (tmp >> HASH_SHIFT);
- return tmp & (HASH_SIZE - 1);
+ tmp = tmp + (tmp >> m_hash_shift);
+ return &mount_hashtable[tmp & m_hash_mask];
}
-#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
+static inline struct hlist_head *mp_hash(struct dentry *dentry)
+{
+ unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
+ tmp = tmp + (tmp >> mp_hash_shift);
+ return &mountpoint_hashtable[tmp & mp_hash_mask];
+}
/*
* allocation is serialized by namespace_sem, but we need the spinlock to
@@ -189,7 +217,7 @@ static struct mount *alloc_vfsmnt(const char *name)
mnt->mnt_writers = 0;
#endif
- INIT_LIST_HEAD(&mnt->mnt_hash);
+ INIT_HLIST_NODE(&mnt->mnt_hash);
INIT_LIST_HEAD(&mnt->mnt_child);
INIT_LIST_HEAD(&mnt->mnt_mounts);
INIT_LIST_HEAD(&mnt->mnt_list);
@@ -386,9 +414,7 @@ EXPORT_SYMBOL_GPL(mnt_clone_write);
*/
int __mnt_want_write_file(struct file *file)
{
- struct inode *inode = file_inode(file);
-
- if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
+ if (!(file->f_mode & FMODE_WRITER))
return __mnt_want_write(file->f_path.mnt);
else
return mnt_clone_write(file->f_path.mnt);
@@ -458,7 +484,7 @@ static int mnt_make_readonly(struct mount *mnt)
{
int ret = 0;
- br_write_lock(&vfsmount_lock);
+ lock_mount_hash();
mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
/*
* After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -492,15 +518,15 @@ static int mnt_make_readonly(struct mount *mnt)
*/
smp_wmb();
mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
- br_write_unlock(&vfsmount_lock);
+ unlock_mount_hash();
return ret;
}
static void __mnt_unmake_readonly(struct mount *mnt)
{
- br_write_lock(&vfsmount_lock);
+ lock_mount_hash();
mnt->mnt.mnt_flags &= ~MNT_READONLY;
- br_write_unlock(&vfsmount_lock);
+ unlock_mount_hash();
}
int sb_prepare_remount_readonly(struct super_block *sb)
@@ -512,7 +538,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
if (atomic_long_read(&sb->s_remove_count))
return -EBUSY;
- br_write_lock(&vfsmount_lock);
+ lock_mount_hash();
list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
@@ -534,7 +560,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
}
- br_write_unlock(&vfsmount_lock);
+ unlock_mount_hash();
return err;
}
@@ -542,37 +568,71 @@ int sb_prepare_remount_readonly(struct super_block *sb)
static void free_vfsmnt(struct mount *mnt)
{
kfree(mnt->mnt_devname);
- mnt_free_id(mnt);
#ifdef CONFIG_SMP
free_percpu(mnt->mnt_pcp);
#endif
kmem_cache_free(mnt_cache, mnt);
}
+static void delayed_free_vfsmnt(struct rcu_head *head)
+{
+ free_vfsmnt(container_of(head, struct mount, mnt_rcu));
+}
+
+/* call under rcu_read_lock */
+bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
+{
+ struct mount *mnt;
+ if (read_seqretry(&mount_lock, seq))
+ return false;
+ if (bastard == NULL)
+ return true;
+ mnt = real_mount(bastard);
+ mnt_add_count(mnt, 1);
+ if (likely(!read_seqretry(&mount_lock, seq)))
+ return true;
+ if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
+ mnt_add_count(mnt, -1);
+ return false;
+ }
+ rcu_read_unlock();
+ mntput(bastard);
+ rcu_read_lock();
+ return false;
+}
+
/*
- * find the first or last mount at @dentry on vfsmount @mnt depending on
- * @dir. If @dir is set return the first mount else return the last mount.
- * vfsmount_lock must be held for read or write.
+ * find the first mount at @dentry on vfsmount @mnt.
+ * call under rcu_read_lock()
*/
-struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
- int dir)
+struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
- struct list_head *head = mount_hashtable + hash(mnt, dentry);
- struct list_head *tmp = head;
- struct mount *p, *found = NULL;
+ struct hlist_head *head = m_hash(mnt, dentry);
+ struct mount *p;
- for (;;) {
- tmp = dir ? tmp->next : tmp->prev;
- p = NULL;
- if (tmp == head)
- break;
- p = list_entry(tmp, struct mount, mnt_hash);
- if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) {
- found = p;
+ hlist_for_each_entry_rcu(p, head, mnt_hash)
+ if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
+ return p;
+ return NULL;
+}
+
+/*
+ * find the last mount at @dentry on vfsmount @mnt.
+ * mount_lock must be held.
+ */
+struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
+{
+ struct mount *p, *res;
+ res = p = __lookup_mnt(mnt, dentry);
+ if (!p)
+ goto out;
+ hlist_for_each_entry_continue(p, mnt_hash) {
+ if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
break;
- }
+ res = p;
}
- return found;
+out:
+ return res;
}
/*
@@ -594,26 +654,26 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
struct vfsmount *lookup_mnt(struct path *path)
{
struct mount *child_mnt;
+ struct vfsmount *m;
+ unsigned seq;
- br_read_lock(&vfsmount_lock);
- child_mnt = __lookup_mnt(path->mnt, path->dentry, 1);
- if (child_mnt) {
- mnt_add_count(child_mnt, 1);
- br_read_unlock(&vfsmount_lock);
- return &child_mnt->mnt;
- } else {
- br_read_unlock(&vfsmount_lock);
- return NULL;
- }
+ rcu_read_lock();
+ do {
+ seq = read_seqbegin(&mount_lock);
+ child_mnt = __lookup_mnt(path->mnt, path->dentry);
+ m = child_mnt ? &child_mnt->mnt : NULL;
+ } while (!legitimize_mnt(m, seq));
+ rcu_read_unlock();
+ return m;
}
static struct mountpoint *new_mountpoint(struct dentry *dentry)
{
- struct list_head *chain = mountpoint_hashtable + hash(NULL, dentry);
+ struct hlist_head *chain = mp_hash(dentry);
struct mountpoint *mp;
int ret;
- list_for_each_entry(mp, chain, m_hash) {
+ hlist_for_each_entry(mp, chain, m_hash) {
if (mp->m_dentry == dentry) {
/* might be worth a WARN_ON() */
if (d_unlinked(dentry))
@@ -635,7 +695,7 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry)
mp->m_dentry = dentry;
mp->m_count = 1;
- list_add(&mp->m_hash, chain);
+ hlist_add_head(&mp->m_hash, chain);
return mp;
}
@@ -646,7 +706,7 @@ static void put_mountpoint(struct mountpoint *mp)
spin_lock(&dentry->d_lock);
dentry->d_flags &= ~DCACHE_MOUNTED;
spin_unlock(&dentry->d_lock);
- list_del(&mp->m_hash);
+ hlist_del(&mp->m_hash);
kfree(mp);
}
}
@@ -688,7 +748,7 @@ static void detach_mnt(struct mount *mnt, struct path *old_path)
mnt->mnt_parent = mnt;
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
list_del_init(&mnt->mnt_child);
- list_del_init(&mnt->mnt_hash);
+ hlist_del_init_rcu(&mnt->mnt_hash);
put_mountpoint(mnt->mnt_mp);
mnt->mnt_mp = NULL;
}
@@ -715,15 +775,14 @@ static void attach_mnt(struct mount *mnt,
struct mountpoint *mp)
{
mnt_set_mountpoint(parent, mp, mnt);
- list_add_tail(&mnt->mnt_hash, mount_hashtable +
- hash(&parent->mnt, mp->m_dentry));
+ hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}
/*
* vfsmount lock must be held for write
*/
-static void commit_tree(struct mount *mnt)
+static void commit_tree(struct mount *mnt, struct mount *shadows)
{
struct mount *parent = mnt->mnt_parent;
struct mount *m;
@@ -738,8 +797,11 @@ static void commit_tree(struct mount *mnt)
list_splice(&head, n->list.prev);
- list_add_tail(&mnt->mnt_hash, mount_hashtable +
- hash(&parent->mnt, mnt->mnt_mountpoint));
+ if (shadows)
+ hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash);
+ else
+ hlist_add_head_rcu(&mnt->mnt_hash,
+ m_hash(&parent->mnt, mnt->mnt_mountpoint));
list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
touch_mnt_namespace(n);
}
@@ -788,6 +850,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
root = mount_fs(type, flags, name, data);
if (IS_ERR(root)) {
+ mnt_free_id(mnt);
free_vfsmnt(mnt);
return ERR_CAST(root);
}
@@ -796,9 +859,9 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
mnt->mnt.mnt_sb = root->d_sb;
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
mnt->mnt_parent = mnt;
- br_write_lock(&vfsmount_lock);
+ lock_mount_hash();
list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
- br_write_unlock(&vfsmount_lock);
+ unlock_mount_hash();
return &mnt->mnt;
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);
@@ -825,7 +888,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
goto out_free;
}
- mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD;
+ mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
/* Don't allow unprivileged users to change mount flags */
if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY))
mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
@@ -839,9 +902,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
mnt->mnt.mnt_root = dget(root);
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
mnt->mnt_parent = mnt;
- br_write_lock(&vfsmount_lock);
+ lock_mount_hash();
list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
- br_write_unlock(&vfsmount_lock);
+ unlock_mount_hash();
if ((flag & CL_SLAVE) ||
((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
@@ -868,68 +931,61 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
return mnt;
out_free:
+ mnt_free_id(mnt);
free_vfsmnt(mnt);
return ERR_PTR(err);
}
-static inline void mntfree(struct mount *mnt)
-{
- struct vfsmount *m = &mnt->mnt;
- struct super_block *sb = m->mnt_sb;
-
- /*
- * This probably indicates that somebody messed
- * up a mnt_want/drop_write() pair. If this
- * happens, the filesystem was probably unable
- * to make r/w->r/o transitions.
- */
- /*
- * The locking used to deal with mnt_count decrement provides barriers,
- * so mnt_get_writers() below is safe.
- */
- WARN_ON(mnt_get_writers(mnt));
- fsnotify_vfsmount_delete(m);
- dput(m->mnt_root);
- free_vfsmnt(mnt);
- deactivate_super(sb);
-}
-
static void mntput_no_expire(struct mount *mnt)
{
put_again:
-#ifdef CONFIG_SMP
- br_read_lock(&vfsmount_lock);
- if (likely(mnt->mnt_ns)) {
- /* shouldn't be the last one */
- mnt_add_count(mnt, -1);
- br_read_unlock(&vfsmount_lock);
+ rcu_read_lock();
+ mnt_add_count(mnt, -1);
+ if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
+ rcu_read_unlock();
return;
}
- br_read_unlock(&vfsmount_lock);
-
- br_write_lock(&vfsmount_lock);
- mnt_add_count(mnt, -1);
+ lock_mount_hash();
if (mnt_get_count(mnt)) {
- br_write_unlock(&vfsmount_lock);
+ rcu_read_unlock();
+ unlock_mount_hash();
return;
}
-#else
- mnt_add_count(mnt, -1);
- if (likely(mnt_get_count(mnt)))
- return;
- br_write_lock(&vfsmount_lock);
-#endif
if (unlikely(mnt->mnt_pinned)) {
mnt_add_count(mnt, mnt->mnt_pinned + 1);
mnt->mnt_pinned = 0;
- br_write_unlock(&vfsmount_lock);
+ rcu_read_unlock();
+ unlock_mount_hash();
acct_auto_close_mnt(&mnt->mnt);
goto put_again;
}
+ if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
+ rcu_read_unlock();
+ unlock_mount_hash();
+ return;
+ }
+ mnt->mnt.mnt_flags |= MNT_DOOMED;
+ rcu_read_unlock();
list_del(&mnt->mnt_instance);
- br_write_unlock(&vfsmount_lock);
- mntfree(mnt);
+ unlock_mount_hash();
+
+ /*
+ * This probably indicates that somebody messed
+ * up a mnt_want/drop_write() pair. If this
+ * happens, the filesystem was probably unable
+ * to make r/w->r/o transitions.
+ */
+ /*
+ * The locking used to deal with mnt_count decrement provides barriers,
+ * so mnt_get_writers() below is safe.
+ */
+ WARN_ON(mnt_get_writers(mnt));
+ fsnotify_vfsmount_delete(&mnt->mnt);
+ dput(mnt->mnt.mnt_root);
+ deactivate_super(mnt->mnt.mnt_sb);
+ mnt_free_id(mnt);
+ call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
}
void mntput(struct vfsmount *mnt)
@@ -954,21 +1010,21 @@ EXPORT_SYMBOL(mntget);
void mnt_pin(struct vfsmount *mnt)
{
- br_write_lock(&vfsmount_lock);
+ lock_mount_hash();
real_mount(mnt)->mnt_pinned++;
- br_write_unlock(&vfsmount_lock);
+ unlock_mount_hash();
}
EXPORT_SYMBOL(mnt_pin);
void mnt_unpin(struct vfsmount *m)
{
struct mount *mnt = real_mount(m);
- br_write_lock(&vfsmount_lock);
+ lock_mount_hash();
if (mnt->mnt_pinned) {
mnt_add_count(mnt, 1);
mnt->mnt_pinned--;
}
- br_write_unlock(&vfsmount_lock);
+ unlock_mount_hash();
}
EXPORT_SYMBOL(mnt_unpin);
@@ -1038,14 +1094,29 @@ static void *m_start(struct seq_file *m, loff_t *pos)
struct proc_mounts *p = proc_mounts(m);
down_read(&namespace_sem);
- return seq_list_start(&p->ns->list, *pos);
+ if (p->cached_event == p->ns->event) {
+ void *v = p->cached_mount;
+ if (*pos == p->cached_index)
+ return v;
+ if (*pos == p->cached_index + 1) {
+ v = seq_list_next(v, &p->ns->list, &p->cached_index);
+ return p->cached_mount = v;
+ }
+ }
+
+ p->cached_event = p->ns->event;
+ p->cached_mount = seq_list_start(&p->ns->list, *pos);
+ p->cached_index = *pos;
+ return p->cached_mount;
}
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
struct proc_mounts *p = proc_mounts(m);
- return seq_list_next(v, &p->ns->list, pos);
+ p->cached_mount = seq_list_next(v, &p->ns->list, pos);
+ p->cached_index = *pos;
+ return p->cached_mount;
}
static void m_stop(struct seq_file *m, void *v)
@@ -1085,12 +1156,12 @@ int may_umount_tree(struct vfsmount *m)
BUG_ON(!m);
/* write lock needed for mnt_get_count */
- br_write_lock(&vfsmount_lock);
+ lock_mount_hash();
for (p = mnt; p; p = next_mnt(p, mnt)) {
actual_refs += mnt_get_count(p);
minimum_refs += 2;
}
- br_write_unlock(&vfsmount_lock);
+ unlock_mount_hash();
if (actual_refs > minimum_refs)
return 0;
@@ -1117,48 +1188,40 @@ int may_umount(struct vfsmount *mnt)
{
int ret = 1;
down_read(&namespace_sem);
- br_write_lock(&vfsmount_lock);
+ lock_mount_hash();
if (propagate_mount_busy(real_mount(mnt), 2))
ret = 0;
- br_write_unlock(&vfsmount_lock);
+ unlock_mount_hash();
up_read(&namespace_sem);
return ret;
}
EXPORT_SYMBOL(may_umount);
-static LIST_HEAD(unmounted); /* protected by namespace_sem */
+static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static void namespace_unlock(void)
{
struct mount *mnt;
- LIST_HEAD(head);
+ struct hlist_head head = unmounted;
- if (likely(list_empty(&unmounted))) {
+ if (likely(hlist_empty(&head))) {
up_write(&namespace_sem);
return;
}
- list_splice_init(&unmounted, &head);
+ head.first->pprev = &head.first;
+ INIT_HLIST_HEAD(&unmounted);
+
up_write(&namespace_sem);
- while (!list_empty(&head)) {
- mnt = list_first_entry(&head, struct mount, mnt_hash);
- list_del_init(&mnt->mnt_hash);
- if (mnt_has_parent(mnt)) {
- struct dentry *dentry;
- struct mount *m;
-
- br_write_lock(&vfsmount_lock);
- dentry = mnt->mnt_mountpoint;
- m = mnt->mnt_parent;
- mnt->mnt_mountpoint = mnt->mnt.mnt_root;
- mnt->mnt_parent = mnt;
- m->mnt_ghosts--;
- br_write_unlock(&vfsmount_lock);
- dput(dentry);
- mntput(&m->mnt);
- }
+ synchronize_rcu();
+
+ while (!hlist_empty(&head)) {
+ mnt = hlist_entry(head.first, struct mount, mnt_hash);
+ hlist_del_init(&mnt->mnt_hash);
+ if (mnt->mnt_ex_mountpoint.mnt)
+ path_put(&mnt->mnt_ex_mountpoint);
mntput(&mnt->mnt);
}
}
@@ -1169,34 +1232,51 @@ static inline void namespace_lock(void)
}
/*
- * vfsmount lock must be held for write
+ * mount_lock must be held
* namespace_sem must be held for write
+ * how = 0 => just this tree, don't propagate
+ * how = 1 => propagate; we know that nobody else has reference to any victims
+ * how = 2 => lazy umount
*/
-void umount_tree(struct mount *mnt, int propagate)
+void umount_tree(struct mount *mnt, int how)
{
- LIST_HEAD(tmp_list);
+ HLIST_HEAD(tmp_list);
struct mount *p;
+ struct mount *last = NULL;
- for (p = mnt; p; p = next_mnt(p, mnt))
- list_move(&p->mnt_hash, &tmp_list);
+ for (p = mnt; p; p = next_mnt(p, mnt)) {
+ hlist_del_init_rcu(&p->mnt_hash);
+ hlist_add_head(&p->mnt_hash, &tmp_list);
+ }
- if (propagate)
+ if (how)
propagate_umount(&tmp_list);
- list_for_each_entry(p, &tmp_list, mnt_hash) {
+ hlist_for_each_entry(p, &tmp_list, mnt_hash) {
list_del_init(&p->mnt_expire);
list_del_init(&p->mnt_list);
__touch_mnt_namespace(p->mnt_ns);
p->mnt_ns = NULL;
+ if (how < 2)
+ p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
list_del_init(&p->mnt_child);
if (mnt_has_parent(p)) {
- p->mnt_parent->mnt_ghosts++;
put_mountpoint(p->mnt_mp);
+ /* move the reference to mountpoint into ->mnt_ex_mountpoint */
+ p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
+ p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
+ p->mnt_mountpoint = p->mnt.mnt_root;
+ p->mnt_parent = p;
p->mnt_mp = NULL;
}
change_mnt_propagation(p, MS_PRIVATE);
+ last = p;
+ }
+ if (last) {
+ last->mnt_hash.next = unmounted.first;
+ unmounted.first = tmp_list.first;
+ unmounted.first->pprev = &unmounted.first;
}
- list_splice(&tmp_list, &unmounted);
}
static void shrink_submounts(struct mount *mnt);
@@ -1225,12 +1305,12 @@ static int do_umount(struct mount *mnt, int flags)
* probably don't strictly need the lock here if we examined
* all race cases, but it's a slowpath.
*/
- br_write_lock(&vfsmount_lock);
+ lock_mount_hash();
if (mnt_get_count(mnt) != 2) {
- br_write_unlock(&vfsmount_lock);
+ unlock_mount_hash();
return -EBUSY;
}
- br_write_unlock(&vfsmount_lock);
+ unlock_mount_hash();
if (!xchg(&mnt->mnt_expiry_mark, 1))
return -EAGAIN;
@@ -1272,19 +1352,23 @@ static int do_umount(struct mount *mnt, int flags)
}
namespace_lock();
- br_write_lock(&vfsmount_lock);
+ lock_mount_hash();
event++;
- if (!(flags & MNT_DETACH))
- shrink_submounts(mnt);
-
- retval = -EBUSY;
- if (flags & MNT_DETACH || !propagate_mount_busy(mnt, 2)) {
+ if (flags & MNT_DETACH) {
if (!list_empty(&mnt->mnt_list))
- umount_tree(mnt, 1);
+ umount_tree(mnt, 2);
retval = 0;
+ } else {
+ shrink_submounts(mnt);
+ retval = -EBUSY;
+ if (!propagate_mount_busy(mnt, 2)) {
+ if (!list_empty(&mnt->mnt_list))
+ umount_tree(mnt, 1);
+ retval = 0;
+ }
}
- br_write_unlock(&vfsmount_lock);
+ unlock_mount_hash();
namespace_unlock();
return retval;
}
@@ -1427,18 +1511,18 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
q = clone_mnt(p, p->mnt.mnt_root, flag);
if (IS_ERR(q))
goto out;
- br_write_lock(&vfsmount_lock);
+ lock_mount_hash();
list_add_tail(&q->mnt_list, &res->mnt_list);
attach_mnt(q, parent, p->mnt_mp);
- br_write_unlock(&vfsmount_lock);
+ unlock_mount_hash();
}
}
return res;
out:
if (res) {
- br_write_lock(&vfsmount_lock);
+ lock_mount_hash();
umount_tree(res, 0);
- br_write_unlock(&vfsmount_lock);
+ unlock_mount_hash();
}
return q;
}
@@ -1460,9 +1544,9 @@ struct vfsmount *collect_mounts(struct path *path)
void drop_collected_mounts(struct vfsmount *mnt)
{
namespace_lock();
- br_write_lock(&vfsmount_lock);
+ lock_mount_hash();
umount_tree(real_mount(mnt), 0);
- br_write_unlock(&vfsmount_lock);
+ unlock_mount_hash();
namespace_unlock();
}
@@ -1576,24 +1660,23 @@ static int attach_recursive_mnt(struct mount *source_mnt,
struct mountpoint *dest_mp,
struct path *parent_path)
{
- LIST_HEAD(tree_list);
+ HLIST_HEAD(tree_list);
struct mount *child, *p;
+ struct hlist_node *n;
int err;
if (IS_MNT_SHARED(dest_mnt)) {
err = invent_group_ids(source_mnt, true);
if (err)
goto out;
- }
- err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
- if (err)
- goto out_cleanup_ids;
-
- br_write_lock(&vfsmount_lock);
-
- if (IS_MNT_SHARED(dest_mnt)) {
+ err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
+ lock_mount_hash();
+ if (err)
+ goto out_cleanup_ids;
for (p = source_mnt; p; p = next_mnt(p, source_mnt))
set_mnt_shared(p);
+ } else {
+ lock_mount_hash();
}
if (parent_path) {
detach_mnt(source_mnt, parent_path);
@@ -1601,20 +1684,27 @@ static int attach_recursive_mnt(struct mount *source_mnt,
touch_mnt_namespace(source_mnt->mnt_ns);
} else {
mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
- commit_tree(source_mnt);
+ commit_tree(source_mnt, NULL);
}
- list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
- list_del_init(&child->mnt_hash);
- commit_tree(child);
+ hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
+ struct mount *q;
+ hlist_del_init(&child->mnt_hash);
+ q = __lookup_mnt_last(&child->mnt_parent->mnt,
+ child->mnt_mountpoint);
+ commit_tree(child, q);
}
- br_write_unlock(&vfsmount_lock);
+ unlock_mount_hash();
return 0;
out_cleanup_ids:
- if (IS_MNT_SHARED(dest_mnt))
- cleanup_group_ids(source_mnt, NULL);
+ while (!hlist_empty(&tree_list)) {
+ child = hlist_entry(tree_list.first, struct mount, mnt_hash);
+ umount_tree(child, 0);
+ }
+ unlock_mount_hash();
+ cleanup_group_ids(source_mnt, NULL);
out:
return err;
}
@@ -1710,10 +1800,10 @@ static int do_change_type(struct path *path, int flag)
goto out_unlock;
}
- br_write_lock(&vfsmount_lock);
+ lock_mount_hash();
for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
change_mnt_propagation(m, type);
- br_write_unlock(&vfsmount_lock);
+ unlock_mount_hash();
out_unlock:
namespace_unlock();
@@ -1785,9 +1875,9 @@ static int do_loopback(struct path *path, const char *old_name,
err = graft_tree(mnt, parent, mp);
if (err) {
- br_write_lock(&vfsmount_lock);
+ lock_mount_hash();
umount_tree(mnt, 0);
- br_write_unlock(&vfsmount_lock);
+ unlock_mount_hash();
}
out2:
unlock_mount(mp);
@@ -1846,17 +1936,13 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
else
err = do_remount_sb(sb, flags, data, 0);
if (!err) {
- br_write_lock(&vfsmount_lock);
+ lock_mount_hash();
mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK;
mnt->mnt.mnt_flags = mnt_flags;
- br_write_unlock(&vfsmount_lock);
- }
- up_write(&sb->s_umount);
- if (!err) {
- br_write_lock(&vfsmount_lock);
touch_mnt_namespace(mnt->mnt_ns);
- br_write_unlock(&vfsmount_lock);
+ unlock_mount_hash();
}
+ up_write(&sb->s_umount);
return err;
}
@@ -1972,7 +2058,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
struct mount *parent;
int err;
- mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
+ mnt_flags &= ~MNT_INTERNAL_FLAGS;
mp = lock_mount(path);
if (IS_ERR(mp))
@@ -2077,9 +2163,7 @@ fail:
/* remove m from any expiration list it may be on */
if (!list_empty(&mnt->mnt_expire)) {
namespace_lock();
- br_write_lock(&vfsmount_lock);
list_del_init(&mnt->mnt_expire);
- br_write_unlock(&vfsmount_lock);
namespace_unlock();
}
mntput(m);
@@ -2095,11 +2179,9 @@ fail:
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
{
namespace_lock();
- br_write_lock(&vfsmount_lock);
list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
- br_write_unlock(&vfsmount_lock);
namespace_unlock();
}
EXPORT_SYMBOL(mnt_set_expiry);
@@ -2118,7 +2200,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
return;
namespace_lock();
- br_write_lock(&vfsmount_lock);
+ lock_mount_hash();
/* extract from the expiration list every vfsmount that matches the
* following criteria:
@@ -2137,7 +2219,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
touch_mnt_namespace(mnt->mnt_ns);
umount_tree(mnt, 1);
}
- br_write_unlock(&vfsmount_lock);
+ unlock_mount_hash();
namespace_unlock();
}
@@ -2193,7 +2275,7 @@ resume:
* process a list of expirable mountpoints with the intent of discarding any
* submounts of a specific parent mountpoint
*
- * vfsmount_lock must be held for write
+ * mount_lock must be held for write
*/
static void shrink_submounts(struct mount *mnt)
{
@@ -2414,20 +2496,25 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns)
return new_ns;
}
-/*
- * Allocate a new namespace structure and populate it with contents
- * copied from the namespace of the passed in task structure.
- */
-static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
- struct user_namespace *user_ns, struct fs_struct *fs)
+struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
+ struct user_namespace *user_ns, struct fs_struct *new_fs)
{
struct mnt_namespace *new_ns;
struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
struct mount *p, *q;
- struct mount *old = mnt_ns->root;
+ struct mount *old;
struct mount *new;
int copy_flags;
+ BUG_ON(!ns);
+
+ if (likely(!(flags & CLONE_NEWNS))) {
+ get_mnt_ns(ns);
+ return ns;
+ }
+
+ old = ns->root;
+
new_ns = alloc_mnt_ns(user_ns);
if (IS_ERR(new_ns))
return new_ns;
@@ -2435,7 +2522,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
namespace_lock();
/* First pass: copy the tree topology */
copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
- if (user_ns != mnt_ns->user_ns)
+ if (user_ns != ns->user_ns)
copy_flags |= CL_SHARED_TO_SLAVE | CL_UNPRIVILEGED;
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
if (IS_ERR(new)) {
@@ -2444,9 +2531,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
return ERR_CAST(new);
}
new_ns->root = new;
- br_write_lock(&vfsmount_lock);
list_add_tail(&new_ns->list, &new->mnt_list);
- br_write_unlock(&vfsmount_lock);
/*
* Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2457,13 +2542,13 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
q = new;
while (p) {
q->mnt_ns = new_ns;
- if (fs) {
- if (&p->mnt == fs->root.mnt) {
- fs->root.mnt = mntget(&q->mnt);
+ if (new_fs) {
+ if (&p->mnt == new_fs->root.mnt) {
+ new_fs->root.mnt = mntget(&q->mnt);
rootmnt = &p->mnt;
}
- if (&p->mnt == fs->pwd.mnt) {
- fs->pwd.mnt = mntget(&q->mnt);
+ if (&p->mnt == new_fs->pwd.mnt) {
+ new_fs->pwd.mnt = mntget(&q->mnt);
pwdmnt = &p->mnt;
}
}
@@ -2484,23 +2569,6 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
return new_ns;
}
-struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
- struct user_namespace *user_ns, struct fs_struct *new_fs)
-{
- struct mnt_namespace *new_ns;
-
- BUG_ON(!ns);
- get_mnt_ns(ns);
-
- if (!(flags & CLONE_NEWNS))
- return ns;
-
- new_ns = dup_mnt_ns(ns, user_ns, new_fs);
-
- put_mnt_ns(ns);
- return new_ns;
-}
-
/**
* create_mnt_ns - creates a private namespace and adds a root filesystem
* @mnt: pointer to the new root filesystem mountpoint
@@ -2593,7 +2661,7 @@ out_type:
/*
* Return true if path is reachable from root
*
- * namespace_sem or vfsmount_lock is held
+ * namespace_sem or mount_lock is held
*/
bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
const struct path *root)
@@ -2608,9 +2676,9 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
int path_is_under(struct path *path1, struct path *path2)
{
int res;
- br_read_lock(&vfsmount_lock);
+ read_seqlock_excl(&mount_lock);
res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
- br_read_unlock(&vfsmount_lock);
+ read_sequnlock_excl(&mount_lock);
return res;
}
EXPORT_SYMBOL(path_is_under);
@@ -2701,7 +2769,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
if (!is_path_reachable(old_mnt, old.dentry, &new))
goto out4;
root_mp->m_count++; /* pin it so it won't go away */
- br_write_lock(&vfsmount_lock);
+ lock_mount_hash();
detach_mnt(new_mnt, &parent_path);
detach_mnt(root_mnt, &root_parent);
if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
@@ -2713,7 +2781,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
/* mount new_root on / */
attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
touch_mnt_namespace(current->nsproxy->mnt_ns);
- br_write_unlock(&vfsmount_lock);
+ unlock_mount_hash();
chroot_fs_refs(&root, &new);
put_mountpoint(root_mp);
error = 0;
@@ -2767,25 +2835,29 @@ void __init mnt_init(void)
unsigned u;
int err;
- init_rwsem(&namespace_sem);
-
mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
- mount_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
- mountpoint_hashtable = (struct list_head *)__get_free_page(GFP_ATOMIC);
+ mount_hashtable = alloc_large_system_hash("Mount-cache",
+ sizeof(struct hlist_head),
+ mhash_entries, 19,
+ 0,
+ &m_hash_shift, &m_hash_mask, 0, 0);
+ mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
+ sizeof(struct hlist_head),
+ mphash_entries, 19,
+ 0,
+ &mp_hash_shift, &mp_hash_mask, 0, 0);
if (!mount_hashtable || !mountpoint_hashtable)
panic("Failed to allocate mount hash table\n");
- printk(KERN_INFO "Mount-cache hash table entries: %lu\n", HASH_SIZE);
-
- for (u = 0; u < HASH_SIZE; u++)
- INIT_LIST_HEAD(&mount_hashtable[u]);
- for (u = 0; u < HASH_SIZE; u++)
- INIT_LIST_HEAD(&mountpoint_hashtable[u]);
+ for (u = 0; u <= m_hash_mask; u++)
+ INIT_HLIST_HEAD(&mount_hashtable[u]);
+ for (u = 0; u <= mp_hash_mask; u++)
+ INIT_HLIST_HEAD(&mountpoint_hashtable[u]);
- br_lock_init(&vfsmount_lock);
+ kernfs_init();
err = sysfs_init();
if (err)
@@ -2825,9 +2897,8 @@ void kern_unmount(struct vfsmount *mnt)
{
/* release long term mount so mount point can be released */
if (!IS_ERR_OR_NULL(mnt)) {
- br_write_lock(&vfsmount_lock);
real_mount(mnt)->mnt_ns = NULL;
- br_write_unlock(&vfsmount_lock);
+ synchronize_rcu(); /* yecchhh... */
mntput(mnt);
}
}
@@ -2871,7 +2942,7 @@ bool fs_fully_visible(struct file_system_type *type)
if (unlikely(!ns))
return false;
- namespace_lock();
+ down_read(&namespace_sem);
list_for_each_entry(mnt, &ns->list, mnt_list) {
struct mount *child;
if (mnt->mnt.mnt_sb->s_type != type)
@@ -2884,7 +2955,7 @@ bool fs_fully_visible(struct file_system_type *type)
struct inode *inode = child->mnt_mountpoint->d_inode;
if (!S_ISDIR(inode->i_mode))
goto next;
- if (inode->i_nlink != 2)
+ if (inode->i_nlink > 2)
goto next;
}
visible = true;
@@ -2892,7 +2963,7 @@ bool fs_fully_visible(struct file_system_type *type)
next: ;
}
found:
- namespace_unlock();
+ up_read(&namespace_sem);
return visible;
}
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index c320ac52353..08b8ea8c353 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -339,7 +339,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
if (val)
goto finished;
- DDPRINTK("ncp_lookup_validate: %pd2 not valid, age=%ld, server lookup\n",
+ ncp_dbg(2, "%pd2 not valid, age=%ld, server lookup\n",
dentry, NCP_GET_AGE(dentry));
len = sizeof(__name);
@@ -358,7 +358,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
res = ncp_obtain_info(server, dir, __name, &(finfo.i));
}
finfo.volume = finfo.i.volNumber;
- DDPRINTK("ncp_lookup_validate: looked for %pd/%s, res=%d\n",
+ ncp_dbg(2, "looked for %pd/%s, res=%d\n",
dentry->d_parent, __name, res);
/*
* If we didn't find it, or if it has a different dirEntNum to
@@ -372,14 +372,14 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
ncp_new_dentry(dentry);
val=1;
} else
- DDPRINTK("ncp_lookup_validate: found, but dirEntNum changed\n");
+ ncp_dbg(2, "found, but dirEntNum changed\n");
ncp_update_inode2(inode, &finfo);
mutex_unlock(&inode->i_mutex);
}
finished:
- DDPRINTK("ncp_lookup_validate: result=%d\n", val);
+ ncp_dbg(2, "result=%d\n", val);
dput(parent);
return val;
}
@@ -453,8 +453,7 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx)
ctl.page = NULL;
ctl.cache = NULL;
- DDPRINTK("ncp_readdir: reading %pD2, pos=%d\n", file,
- (int) ctx->pos);
+ ncp_dbg(2, "reading %pD2, pos=%d\n", file, (int)ctx->pos);
result = -EIO;
/* Do not generate '.' and '..' when server is dead. */
@@ -697,8 +696,7 @@ ncp_read_volume_list(struct file *file, struct dir_context *ctx,
struct ncp_entry_info entry;
int i;
- DPRINTK("ncp_read_volume_list: pos=%ld\n",
- (unsigned long) ctx->pos);
+ ncp_dbg(1, "pos=%ld\n", (unsigned long)ctx->pos);
for (i = 0; i < NCP_NUMBER_OF_VOLUMES; i++) {
int inval_dentry;
@@ -708,12 +706,11 @@ ncp_read_volume_list(struct file *file, struct dir_context *ctx,
if (!strlen(info.volume_name))
continue;
- DPRINTK("ncp_read_volume_list: found vol: %s\n",
- info.volume_name);
+ ncp_dbg(1, "found vol: %s\n", info.volume_name);
if (ncp_lookup_volume(server, info.volume_name,
&entry.i)) {
- DPRINTK("ncpfs: could not lookup vol %s\n",
+ ncp_dbg(1, "could not lookup vol %s\n",
info.volume_name);
continue;
}
@@ -738,14 +735,13 @@ ncp_do_readdir(struct file *file, struct dir_context *ctx,
int more;
size_t bufsize;
- DPRINTK("ncp_do_readdir: %pD2, fpos=%ld\n", file,
- (unsigned long) ctx->pos);
- PPRINTK("ncp_do_readdir: init %pD, volnum=%d, dirent=%u\n",
- file, NCP_FINFO(dir)->volNumber, NCP_FINFO(dir)->dirEntNum);
+ ncp_dbg(1, "%pD2, fpos=%ld\n", file, (unsigned long)ctx->pos);
+ ncp_vdbg("init %pD, volnum=%d, dirent=%u\n",
+ file, NCP_FINFO(dir)->volNumber, NCP_FINFO(dir)->dirEntNum);
err = ncp_initialize_search(server, dir, &seq);
if (err) {
- DPRINTK("ncp_do_readdir: init failed, err=%d\n", err);
+ ncp_dbg(1, "init failed, err=%d\n", err);
return;
}
/* We MUST NOT use server->buffer_size handshaked with server if we are
@@ -808,8 +804,7 @@ int ncp_conn_logged_in(struct super_block *sb)
goto out;
result = -ENOENT;
if (ncp_get_volume_root(server, __name, &volNumber, &dirEntNum, &DosDirNum)) {
- PPRINTK("ncp_conn_logged_in: %s not found\n",
- server->m.mounted_vol);
+ ncp_vdbg("%s not found\n", server->m.mounted_vol);
goto out;
}
dent = sb->s_root;
@@ -822,10 +817,10 @@ int ncp_conn_logged_in(struct super_block *sb)
NCP_FINFO(ino)->DosDirNum = DosDirNum;
result = 0;
} else {
- DPRINTK("ncpfs: sb->s_root->d_inode == NULL!\n");
+ ncp_dbg(1, "sb->s_root->d_inode == NULL!\n");
}
} else {
- DPRINTK("ncpfs: sb->s_root == NULL!\n");
+ ncp_dbg(1, "sb->s_root == NULL!\n");
}
} else
result = 0;
@@ -846,7 +841,7 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsig
if (!ncp_conn_valid(server))
goto finished;
- PPRINTK("ncp_lookup: server lookup for %pd2\n", dentry);
+ ncp_vdbg("server lookup for %pd2\n", dentry);
len = sizeof(__name);
if (ncp_is_server_root(dir)) {
@@ -854,15 +849,15 @@ static struct dentry *ncp_lookup(struct inode *dir, struct dentry *dentry, unsig
dentry->d_name.len, 1);
if (!res)
res = ncp_lookup_volume(server, __name, &(finfo.i));
- if (!res)
- ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
+ if (!res)
+ ncp_update_known_namespace(server, finfo.i.volNumber, NULL);
} else {
res = ncp_io2vol(server, __name, &len, dentry->d_name.name,
dentry->d_name.len, !ncp_preserve_case(dir));
if (!res)
res = ncp_obtain_info(server, dir, __name, &(finfo.i));
}
- PPRINTK("ncp_lookup: looked for %pd2, res=%d\n", dentry, res);
+ ncp_vdbg("looked for %pd2, res=%d\n", dentry, res);
/*
* If we didn't find an entry, make a negative dentry.
*/
@@ -886,7 +881,7 @@ add_entry:
}
finished:
- PPRINTK("ncp_lookup: result=%d\n", error);
+ ncp_vdbg("result=%d\n", error);
return ERR_PTR(error);
}
@@ -909,7 +904,7 @@ out:
return error;
out_close:
- PPRINTK("ncp_instantiate: %pd2 failed, closing file\n", dentry);
+ ncp_vdbg("%pd2 failed, closing file\n", dentry);
ncp_close_file(NCP_SERVER(dir), finfo->file_handle);
goto out;
}
@@ -923,7 +918,7 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode,
int opmode;
__u8 __name[NCP_MAXPATHLEN + 1];
- PPRINTK("ncp_create_new: creating %pd2, mode=%hx\n", dentry, mode);
+ ncp_vdbg("creating %pd2, mode=%hx\n", dentry, mode);
ncp_age_dentry(server, dentry);
len = sizeof(__name);
@@ -952,7 +947,7 @@ int ncp_create_new(struct inode *dir, struct dentry *dentry, umode_t mode,
error = -ENAMETOOLONG;
else if (result < 0)
error = result;
- DPRINTK("ncp_create: %pd2 failed\n", dentry);
+ ncp_dbg(1, "%pd2 failed\n", dentry);
goto out;
}
opmode = O_WRONLY;
@@ -985,7 +980,7 @@ static int ncp_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
int error, len;
__u8 __name[NCP_MAXPATHLEN + 1];
- DPRINTK("ncp_mkdir: making %pd2\n", dentry);
+ ncp_dbg(1, "making %pd2\n", dentry);
ncp_age_dentry(server, dentry);
len = sizeof(__name);
@@ -1022,7 +1017,7 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry)
int error, result, len;
__u8 __name[NCP_MAXPATHLEN + 1];
- DPRINTK("ncp_rmdir: removing %pd2\n", dentry);
+ ncp_dbg(1, "removing %pd2\n", dentry);
len = sizeof(__name);
error = ncp_io2vol(server, __name, &len, dentry->d_name.name,
@@ -1067,13 +1062,13 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
int error;
server = NCP_SERVER(dir);
- DPRINTK("ncp_unlink: unlinking %pd2\n", dentry);
+ ncp_dbg(1, "unlinking %pd2\n", dentry);
/*
* Check whether to close the file ...
*/
if (inode) {
- PPRINTK("ncp_unlink: closing file\n");
+ ncp_vdbg("closing file\n");
ncp_make_closed(inode);
}
@@ -1087,7 +1082,7 @@ static int ncp_unlink(struct inode *dir, struct dentry *dentry)
#endif
switch (error) {
case 0x00:
- DPRINTK("ncp: removed %pd2\n", dentry);
+ ncp_dbg(1, "removed %pd2\n", dentry);
break;
case 0x85:
case 0x8A:
@@ -1120,7 +1115,7 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
int old_len, new_len;
__u8 __old_name[NCP_MAXPATHLEN + 1], __new_name[NCP_MAXPATHLEN + 1];
- DPRINTK("ncp_rename: %pd2 to %pd2\n", old_dentry, new_dentry);
+ ncp_dbg(1, "%pd2 to %pd2\n", old_dentry, new_dentry);
ncp_age_dentry(server, old_dentry);
ncp_age_dentry(server, new_dentry);
@@ -1150,8 +1145,8 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry,
#endif
switch (error) {
case 0x00:
- DPRINTK("ncp renamed %pd -> %pd.\n",
- old_dentry, new_dentry);
+ ncp_dbg(1, "renamed %pd -> %pd\n",
+ old_dentry, new_dentry);
break;
case 0x9E:
error = -ENAMETOOLONG;
@@ -1173,7 +1168,7 @@ static int ncp_mknod(struct inode * dir, struct dentry *dentry,
if (!new_valid_dev(rdev))
return -EINVAL;
if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) {
- DPRINTK(KERN_DEBUG "ncp_mknod: mode = 0%ho\n", mode);
+ ncp_dbg(1, "mode = 0%ho\n", mode);
return ncp_create_new(dir, dentry, mode, rdev, 0);
}
return -EPERM; /* Strange, but true */
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 8f5074e1ecb..77640a8bfb8 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -6,6 +6,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <asm/uaccess.h>
#include <linux/time.h>
@@ -34,11 +36,11 @@ int ncp_make_open(struct inode *inode, int right)
error = -EINVAL;
if (!inode) {
- printk(KERN_ERR "ncp_make_open: got NULL inode\n");
+ pr_err("%s: got NULL inode\n", __func__);
goto out;
}
- DPRINTK("ncp_make_open: opened=%d, volume # %u, dir entry # %u\n",
+ ncp_dbg(1, "opened=%d, volume # %u, dir entry # %u\n",
atomic_read(&NCP_FINFO(inode)->opened),
NCP_FINFO(inode)->volNumber,
NCP_FINFO(inode)->dirEntNum);
@@ -71,7 +73,7 @@ int ncp_make_open(struct inode *inode, int right)
break;
}
if (result) {
- PPRINTK("ncp_make_open: failed, result=%d\n", result);
+ ncp_vdbg("failed, result=%d\n", result);
goto out_unlock;
}
/*
@@ -83,7 +85,7 @@ int ncp_make_open(struct inode *inode, int right)
}
access = NCP_FINFO(inode)->access;
- PPRINTK("ncp_make_open: file open, access=%x\n", access);
+ ncp_vdbg("file open, access=%x\n", access);
if (access == right || access == O_RDWR) {
atomic_inc(&NCP_FINFO(inode)->opened);
error = 0;
@@ -107,7 +109,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
void* freepage;
size_t freelen;
- DPRINTK("ncp_file_read: enter %pd2\n", dentry);
+ ncp_dbg(1, "enter %pd2\n", dentry);
pos = *ppos;
@@ -124,7 +126,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
error = ncp_make_open(inode, O_RDONLY);
if (error) {
- DPRINTK(KERN_ERR "ncp_file_read: open failed, error=%d\n", error);
+ ncp_dbg(1, "open failed, error=%d\n", error);
return error;
}
@@ -165,7 +167,7 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
file_accessed(file);
- DPRINTK("ncp_file_read: exit %pd2\n", dentry);
+ ncp_dbg(1, "exit %pd2\n", dentry);
outrel:
ncp_inode_close(inode);
return already_read ? already_read : error;
@@ -182,7 +184,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
int errno;
void* bouncebuffer;
- DPRINTK("ncp_file_write: enter %pd2\n", dentry);
+ ncp_dbg(1, "enter %pd2\n", dentry);
if ((ssize_t) count < 0)
return -EINVAL;
pos = *ppos;
@@ -211,7 +213,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
return 0;
errno = ncp_make_open(inode, O_WRONLY);
if (errno) {
- DPRINTK(KERN_ERR "ncp_file_write: open failed, error=%d\n", errno);
+ ncp_dbg(1, "open failed, error=%d\n", errno);
return errno;
}
bufsize = NCP_SERVER(inode)->buffer_size;
@@ -261,7 +263,7 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
i_size_write(inode, pos);
mutex_unlock(&inode->i_mutex);
}
- DPRINTK("ncp_file_write: exit %pd2\n", dentry);
+ ncp_dbg(1, "exit %pd2\n", dentry);
outrel:
ncp_inode_close(inode);
return already_written ? already_written : errno;
@@ -269,7 +271,7 @@ outrel:
static int ncp_release(struct inode *inode, struct file *file) {
if (ncp_make_closed(inode)) {
- DPRINTK("ncp_release: failed to close\n");
+ ncp_dbg(1, "failed to close\n");
}
return 0;
}
diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c
index 0af3349de85..344889cd120 100644
--- a/fs/ncpfs/getopt.c
+++ b/fs/ncpfs/getopt.c
@@ -2,6 +2,8 @@
* getopt.c
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/string.h>
@@ -46,29 +48,28 @@ int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts
if (opts->has_arg & OPT_NOPARAM) {
return opts->val;
}
- printk(KERN_INFO "%s: the %s option requires an argument\n",
- caller, token);
+ pr_info("%s: the %s option requires an argument\n",
+ caller, token);
return -EINVAL;
}
if (opts->has_arg & OPT_INT) {
- char* v;
+ int rc = kstrtoul(val, 0, value);
- *value = simple_strtoul(val, &v, 0);
- if (!*v) {
- return opts->val;
+ if (rc) {
+ pr_info("%s: invalid numeric value in %s=%s\n",
+ caller, token, val);
+ return rc;
}
- printk(KERN_INFO "%s: invalid numeric value in %s=%s\n",
- caller, token, val);
- return -EDOM;
+ return opts->val;
}
if (opts->has_arg & OPT_STRING) {
return opts->val;
}
- printk(KERN_INFO "%s: unexpected argument %s to the %s option\n",
+ pr_info("%s: unexpected argument %s to the %s option\n",
caller, val, token);
return -EINVAL;
}
}
- printk(KERN_INFO "%s: Unrecognized mount option %s\n", caller, token);
+ pr_info("%s: Unrecognized mount option %s\n", caller, token);
return -EOPNOTSUPP;
}
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 4659da67e7f..e31e589369a 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -9,6 +9,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <asm/uaccess.h>
@@ -99,6 +101,7 @@ static void destroy_inodecache(void)
static int ncp_remount(struct super_block *sb, int *flags, char* data)
{
+ sync_filesystem(sb);
*flags |= MS_NODIRATIME;
return 0;
}
@@ -132,7 +135,7 @@ void ncp_update_inode(struct inode *inode, struct ncp_entry_info *nwinfo)
NCP_FINFO(inode)->access = nwinfo->access;
memcpy(NCP_FINFO(inode)->file_handle, nwinfo->file_handle,
sizeof(nwinfo->file_handle));
- DPRINTK("ncp_update_inode: updated %s, volnum=%d, dirent=%u\n",
+ ncp_dbg(1, "updated %s, volnum=%d, dirent=%u\n",
nwinfo->i.entryName, NCP_FINFO(inode)->volNumber,
NCP_FINFO(inode)->dirEntNum);
}
@@ -140,8 +143,7 @@ void ncp_update_inode(struct inode *inode, struct ncp_entry_info *nwinfo)
static void ncp_update_dates(struct inode *inode, struct nw_info_struct *nwi)
{
/* NFS namespace mode overrides others if it's set. */
- DPRINTK(KERN_DEBUG "ncp_update_dates_and_mode: (%s) nfs.mode=0%o\n",
- nwi->entryName, nwi->nfs.mode);
+ ncp_dbg(1, "(%s) nfs.mode=0%o\n", nwi->entryName, nwi->nfs.mode);
if (nwi->nfs.mode) {
/* XXX Security? */
inode->i_mode = nwi->nfs.mode;
@@ -229,7 +231,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
ncp_update_attrs(inode, nwinfo);
- DDPRINTK("ncp_read_inode: inode->i_mode = %u\n", inode->i_mode);
+ ncp_dbg(2, "inode->i_mode = %u\n", inode->i_mode);
set_nlink(inode, 1);
inode->i_uid = server->m.uid;
@@ -257,7 +259,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
struct inode *inode;
if (info == NULL) {
- printk(KERN_ERR "ncp_iget: info is NULL\n");
+ pr_err("%s: info is NULL\n", __func__);
return NULL;
}
@@ -289,23 +291,23 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
}
insert_inode_hash(inode);
} else
- printk(KERN_ERR "ncp_iget: iget failed!\n");
+ pr_err("%s: iget failed!\n", __func__);
return inode;
}
static void
ncp_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (S_ISDIR(inode->i_mode)) {
- DDPRINTK("ncp_evict_inode: put directory %ld\n", inode->i_ino);
+ ncp_dbg(2, "put directory %ld\n", inode->i_ino);
}
if (ncp_make_closed(inode) != 0) {
/* We can't do anything but complain. */
- printk(KERN_ERR "ncp_evict_inode: could not close\n");
+ pr_err("%s: could not close\n", __func__);
}
}
@@ -468,9 +470,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
{
struct ncp_mount_data_kernel data;
struct ncp_server *server;
- struct file *ncp_filp;
struct inode *root_inode;
- struct inode *sock_inode;
struct socket *sock;
int error;
int default_bufsize;
@@ -539,18 +539,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
if (!uid_valid(data.mounted_uid) || !uid_valid(data.uid) ||
!gid_valid(data.gid))
goto out;
- error = -EBADF;
- ncp_filp = fget(data.ncp_fd);
- if (!ncp_filp)
- goto out;
- error = -ENOTSOCK;
- sock_inode = file_inode(ncp_filp);
- if (!S_ISSOCK(sock_inode->i_mode))
- goto out_fput;
- sock = SOCKET_I(sock_inode);
+ sock = sockfd_lookup(data.ncp_fd, &error);
if (!sock)
- goto out_fput;
-
+ goto out;
+
if (sock->type == SOCK_STREAM)
default_bufsize = 0xF000;
else
@@ -572,27 +564,16 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
if (error)
goto out_fput;
- server->ncp_filp = ncp_filp;
server->ncp_sock = sock;
if (data.info_fd != -1) {
- struct socket *info_sock;
-
- error = -EBADF;
- server->info_filp = fget(data.info_fd);
- if (!server->info_filp)
- goto out_bdi;
- error = -ENOTSOCK;
- sock_inode = file_inode(server->info_filp);
- if (!S_ISSOCK(sock_inode->i_mode))
- goto out_fput2;
- info_sock = SOCKET_I(sock_inode);
+ struct socket *info_sock = sockfd_lookup(data.info_fd, &error);
if (!info_sock)
- goto out_fput2;
+ goto out_bdi;
+ server->info_sock = info_sock;
error = -EBADFD;
if (info_sock->type != SOCK_STREAM)
goto out_fput2;
- server->info_sock = info_sock;
}
/* server->lock = 0; */
@@ -620,7 +601,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
now because of PATH_MAX changes.. */
if (server->m.time_out < 1) {
server->m.time_out = 10;
- printk(KERN_INFO "You need to recompile your ncpfs utils..\n");
+ pr_info("You need to recompile your ncpfs utils..\n");
}
server->m.time_out = server->m.time_out * HZ / 100;
server->m.file_mode = (server->m.file_mode & S_IRWXUGO) | S_IFREG;
@@ -681,7 +662,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
ncp_unlock_server(server);
if (error < 0)
goto out_rxbuf;
- DPRINTK("ncp_fill_super: NCP_SBP(sb) = %x\n", (int) NCP_SBP(sb));
+ ncp_dbg(1, "NCP_SBP(sb) = %p\n", NCP_SBP(sb));
error = -EMSGSIZE; /* -EREMOTESIDEINCOMPATIBLE */
#ifdef CONFIG_NCPFS_PACKET_SIGNING
@@ -709,7 +690,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
if (ncp_negotiate_buffersize(server, default_bufsize,
&(server->buffer_size)) != 0)
goto out_disconnect;
- DPRINTK("ncpfs: bufsize = %d\n", server->buffer_size);
+ ncp_dbg(1, "bufsize = %d\n", server->buffer_size);
memset(&finfo, 0, sizeof(finfo));
finfo.i.attributes = aDIR;
@@ -738,7 +719,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
root_inode = ncp_iget(sb, &finfo);
if (!root_inode)
goto out_disconnect;
- DPRINTK("ncp_fill_super: root vol=%d\n", NCP_FINFO(root_inode)->volNumber);
+ ncp_dbg(1, "root vol=%d\n", NCP_FINFO(root_inode)->volNumber);
sb->s_root = d_make_root(root_inode);
if (!sb->s_root)
goto out_disconnect;
@@ -764,17 +745,12 @@ out_nls:
mutex_destroy(&server->root_setup_lock);
mutex_destroy(&server->mutex);
out_fput2:
- if (server->info_filp)
- fput(server->info_filp);
+ if (server->info_sock)
+ sockfd_put(server->info_sock);
out_bdi:
bdi_destroy(&server->bdi);
out_fput:
- /* 23/12/1998 Marcin Dalecki <dalecki@cs.net.pl>:
- *
- * The previously used put_filp(ncp_filp); was bogus, since
- * it doesn't perform proper unlocking.
- */
- fput(ncp_filp);
+ sockfd_put(sock);
out:
put_pid(data.wdog_pid);
sb->s_fs_info = NULL;
@@ -782,6 +758,17 @@ out:
return error;
}
+static void delayed_free(struct rcu_head *p)
+{
+ struct ncp_server *server = container_of(p, struct ncp_server, rcu);
+#ifdef CONFIG_NCPFS_NLS
+ /* unload the NLS charsets */
+ unload_nls(server->nls_vol);
+ unload_nls(server->nls_io);
+#endif /* CONFIG_NCPFS_NLS */
+ kfree(server);
+}
+
static void ncp_put_super(struct super_block *sb)
{
struct ncp_server *server = NCP_SBP(sb);
@@ -792,18 +779,13 @@ static void ncp_put_super(struct super_block *sb)
ncp_stop_tasks(server);
-#ifdef CONFIG_NCPFS_NLS
- /* unload the NLS charsets */
- unload_nls(server->nls_vol);
- unload_nls(server->nls_io);
-#endif /* CONFIG_NCPFS_NLS */
mutex_destroy(&server->rcv.creq_mutex);
mutex_destroy(&server->root_setup_lock);
mutex_destroy(&server->mutex);
- if (server->info_filp)
- fput(server->info_filp);
- fput(server->ncp_filp);
+ if (server->info_sock)
+ sockfd_put(server->info_sock);
+ sockfd_put(server->ncp_sock);
kill_pid(server->m.wdog_pid, SIGTERM, 1);
put_pid(server->m.wdog_pid);
@@ -813,8 +795,7 @@ static void ncp_put_super(struct super_block *sb)
vfree(server->rxbuf);
vfree(server->txbuf);
vfree(server->packet);
- sb->s_fs_info = NULL;
- kfree(server);
+ call_rcu(&server->rcu, delayed_free);
}
static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -979,8 +960,7 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr)
if ((attr->ia_valid & ATTR_SIZE) != 0) {
int written;
- DPRINTK("ncpfs: trying to change size to %ld\n",
- attr->ia_size);
+ ncp_dbg(1, "trying to change size to %llu\n", attr->ia_size);
if ((result = ncp_make_open(inode, O_WRONLY)) < 0) {
result = -EACCES;
@@ -1066,7 +1046,7 @@ MODULE_ALIAS_FS("ncpfs");
static int __init init_ncp_fs(void)
{
int err;
- DPRINTK("ncpfs: init_ncp_fs called\n");
+ ncp_dbg(1, "called\n");
err = init_inodecache();
if (err)
@@ -1083,7 +1063,7 @@ out1:
static void __exit exit_ncp_fs(void)
{
- DPRINTK("ncpfs: exit_ncp_fs called\n");
+ ncp_dbg(1, "called\n");
unregister_filesystem(&ncp_fs_type);
destroy_inodecache();
}
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 60426ccb3b6..d5659d96ee7 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -41,7 +41,7 @@ ncp_get_fs_info(struct ncp_server * server, struct inode *inode,
return -EFAULT;
if (info.version != NCP_GET_FS_INFO_VERSION) {
- DPRINTK("info.version invalid: %d\n", info.version);
+ ncp_dbg(1, "info.version invalid: %d\n", info.version);
return -EINVAL;
}
/* TODO: info.addr = server->m.serv_addr; */
@@ -66,7 +66,7 @@ ncp_get_fs_info_v2(struct ncp_server * server, struct inode *inode,
return -EFAULT;
if (info2.version != NCP_GET_FS_INFO_VERSION_V2) {
- DPRINTK("info.version invalid: %d\n", info2.version);
+ ncp_dbg(1, "info.version invalid: %d\n", info2.version);
return -EINVAL;
}
info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid);
@@ -132,7 +132,7 @@ ncp_get_compat_fs_info_v2(struct ncp_server * server, struct inode *inode,
return -EFAULT;
if (info2.version != NCP_GET_FS_INFO_VERSION_V2) {
- DPRINTK("info.version invalid: %d\n", info2.version);
+ ncp_dbg(1, "info.version invalid: %d\n", info2.version);
return -EINVAL;
}
info2.mounted_uid = from_kuid_munged(current_user_ns(), server->m.mounted_uid);
@@ -308,8 +308,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
else
result = server->reply_size;
ncp_unlock_server(server);
- DPRINTK("ncp_ioctl: copy %d bytes\n",
- result);
+ ncp_dbg(1, "copy %d bytes\n", result);
if (result >= 0)
if (copy_to_user(request.data, bouncebuffer, result))
result = -EFAULT;
@@ -385,9 +384,9 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
sr.namespace = server->name_space[sr.volNumber];
result = 0;
} else
- DPRINTK("ncpfs: s_root->d_inode==NULL\n");
+ ncp_dbg(1, "s_root->d_inode==NULL\n");
} else
- DPRINTK("ncpfs: s_root==NULL\n");
+ ncp_dbg(1, "s_root==NULL\n");
} else {
sr.volNumber = -1;
sr.namespace = 0;
@@ -440,11 +439,11 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg
NCP_FINFO(s_inode)->DosDirNum = dosde;
server->root_setuped = 1;
} else {
- DPRINTK("ncpfs: s_root->d_inode==NULL\n");
+ ncp_dbg(1, "s_root->d_inode==NULL\n");
result = -EIO;
}
} else {
- DPRINTK("ncpfs: s_root==NULL\n");
+ ncp_dbg(1, "s_root==NULL\n");
result = -EIO;
}
}
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 3c5dd55d284..b359d12eb35 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -107,7 +107,7 @@ int ncp_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file_inode(file);
- DPRINTK("ncp_mmap: called\n");
+ ncp_dbg(1, "called\n");
if (!ncp_conn_valid(NCP_SERVER(inode)))
return -EIO;
diff --git a/fs/ncpfs/ncp_fs.h b/fs/ncpfs/ncp_fs.h
index 31831afe1c3..b9f69e1b1f4 100644
--- a/fs/ncpfs/ncp_fs.h
+++ b/fs/ncpfs/ncp_fs.h
@@ -2,30 +2,32 @@
#include "ncp_fs_i.h"
#include "ncp_fs_sb.h"
-/* define because it is easy to change PRINTK to {*}PRINTK */
-#define PRINTK(format, args...) printk(KERN_DEBUG format , ## args)
-
#undef NCPFS_PARANOIA
#ifdef NCPFS_PARANOIA
-#define PPRINTK(format, args...) PRINTK(format , ## args)
+#define ncp_vdbg(fmt, ...) \
+ pr_debug(fmt, ##__VA_ARGS__)
#else
-#define PPRINTK(format, args...)
+#define ncp_vdbg(fmt, ...) \
+do { \
+ if (0) \
+ pr_debug(fmt, ##__VA_ARGS__); \
+} while (0)
#endif
#ifndef DEBUG_NCP
#define DEBUG_NCP 0
#endif
-#if DEBUG_NCP > 0
-#define DPRINTK(format, args...) PRINTK(format , ## args)
-#else
-#define DPRINTK(format, args...)
-#endif
-#if DEBUG_NCP > 1
-#define DDPRINTK(format, args...) PRINTK(format , ## args)
-#else
-#define DDPRINTK(format, args...)
+
+#if DEBUG_NCP > 0 && !defined(DEBUG)
+#define DEBUG
#endif
+#define ncp_dbg(level, fmt, ...) \
+do { \
+ if (level <= DEBUG_NCP) \
+ pr_debug(fmt, ##__VA_ARGS__); \
+} while (0)
+
#define NCP_MAX_RPC_TIMEOUT (6*HZ)
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h
index c51b2c54353..55e26fd8088 100644
--- a/fs/ncpfs/ncp_fs_sb.h
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -38,16 +38,14 @@ struct ncp_mount_data_kernel {
};
struct ncp_server {
-
+ struct rcu_head rcu;
struct ncp_mount_data_kernel m; /* Nearly all of the mount data is of
interest for us later, so we store
it completely. */
__u8 name_space[NCP_NUMBER_OF_VOLUMES + 2];
- struct file *ncp_filp; /* File pointer to ncp socket */
struct socket *ncp_sock;/* ncp socket */
- struct file *info_filp;
struct socket *info_sock;
u8 sequence;
@@ -111,7 +109,7 @@ struct ncp_server {
spinlock_t requests_lock; /* Lock accesses to tx.requests, tx.creq and rcv.creq when STREAM mode */
- void (*data_ready)(struct sock* sk, int len);
+ void (*data_ready)(struct sock* sk);
void (*error_report)(struct sock* sk);
void (*write_space)(struct sock* sk); /* STREAM mode only */
struct {
@@ -153,7 +151,7 @@ extern void ncp_tcp_tx_proc(struct work_struct *work);
extern void ncpdgram_rcv_proc(struct work_struct *work);
extern void ncpdgram_timeout_proc(struct work_struct *work);
extern void ncpdgram_timeout_call(unsigned long server);
-extern void ncp_tcp_data_ready(struct sock* sk, int len);
+extern void ncp_tcp_data_ready(struct sock* sk);
extern void ncp_tcp_write_space(struct sock* sk);
extern void ncp_tcp_error_report(struct sock* sk);
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c
index 981a95617fc..482387532f5 100644
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -9,14 +9,14 @@
*
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "ncp_fs.h"
static inline void assert_server_locked(struct ncp_server *server)
{
if (server->lock == 0) {
- DPRINTK("ncpfs: server not locked!\n");
+ ncp_dbg(1, "server not locked!\n");
}
}
@@ -75,7 +75,7 @@ static void ncp_add_pstring(struct ncp_server *server, const char *s)
int len = strlen(s);
assert_server_locked(server);
if (len > 255) {
- DPRINTK("ncpfs: string too long: %s\n", s);
+ ncp_dbg(1, "string too long: %s\n", s);
len = 255;
}
ncp_add_byte(server, len);
@@ -225,7 +225,7 @@ int ncp_get_volume_info_with_number(struct ncp_server* server,
result = -EIO;
len = ncp_reply_byte(server, 29);
if (len > NCP_VOLNAME_LEN) {
- DPRINTK("ncpfs: volume name too long: %d\n", len);
+ ncp_dbg(1, "volume name too long: %d\n", len);
goto out;
}
memcpy(&(target->volume_name), ncp_reply_data(server, 30), len);
@@ -259,7 +259,7 @@ int ncp_get_directory_info(struct ncp_server* server, __u8 n,
result = -EIO;
len = ncp_reply_byte(server, 21);
if (len > NCP_VOLNAME_LEN) {
- DPRINTK("ncpfs: volume name too long: %d\n", len);
+ ncp_dbg(1, "volume name too long: %d\n", len);
goto out;
}
memcpy(&(target->volume_name), ncp_reply_data(server, 22), len);
@@ -295,9 +295,9 @@ ncp_make_closed(struct inode *inode)
err = ncp_close_file(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle);
if (!err)
- PPRINTK("ncp_make_closed: volnum=%d, dirent=%u, error=%d\n",
- NCP_FINFO(inode)->volNumber,
- NCP_FINFO(inode)->dirEntNum, err);
+ ncp_vdbg("volnum=%d, dirent=%u, error=%d\n",
+ NCP_FINFO(inode)->volNumber,
+ NCP_FINFO(inode)->dirEntNum, err);
}
mutex_unlock(&NCP_FINFO(inode)->open_mutex);
return err;
@@ -394,8 +394,7 @@ int ncp_obtain_nfs_info(struct ncp_server *server,
if ((result = ncp_request(server, 87)) == 0) {
ncp_extract_nfs_info(ncp_reply_data(server, 0), &target->nfs);
- DPRINTK(KERN_DEBUG
- "ncp_obtain_nfs_info: (%s) mode=0%o, rdev=0x%x\n",
+ ncp_dbg(1, "(%s) mode=0%o, rdev=0x%x\n",
target->entryName, target->nfs.mode,
target->nfs.rdev);
} else {
@@ -425,7 +424,7 @@ int ncp_obtain_info(struct ncp_server *server, struct inode *dir, const char *pa
int result;
if (target == NULL) {
- printk(KERN_ERR "ncp_obtain_info: invalid call\n");
+ pr_err("%s: invalid call\n", __func__);
return -EINVAL;
}
ncp_init_request(server);
@@ -498,7 +497,7 @@ ncp_get_known_namespace(struct ncp_server *server, __u8 volume)
namespace = ncp_reply_data(server, 2);
while (no_namespaces > 0) {
- DPRINTK("get_namespaces: found %d on %d\n", *namespace, volume);
+ ncp_dbg(1, "found %d on %d\n", *namespace, volume);
#ifdef CONFIG_NCPFS_NFS_NS
if ((*namespace == NW_NS_NFS) && !(server->m.flags&NCP_MOUNT_NO_NFS))
@@ -531,8 +530,7 @@ ncp_update_known_namespace(struct ncp_server *server, __u8 volume, int *ret_ns)
if (ret_ns)
*ret_ns = ns;
- DPRINTK("lookup_vol: namespace[%d] = %d\n",
- volume, server->name_space[volume]);
+ ncp_dbg(1, "namespace[%d] = %d\n", volume, server->name_space[volume]);
if (server->name_space[volume] == ns)
return 0;
@@ -596,7 +594,7 @@ ncp_get_volume_root(struct ncp_server *server,
{
int result;
- DPRINTK("ncp_get_volume_root: looking up vol %s\n", volname);
+ ncp_dbg(1, "looking up vol %s\n", volname);
ncp_init_request(server);
ncp_add_byte(server, 22); /* Subfunction: Generate dir handle */
diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c
index 3a1587222c8..471bc3d1139 100644
--- a/fs/ncpfs/sock.c
+++ b/fs/ncpfs/sock.c
@@ -8,6 +8,7 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/time.h>
#include <linux/errno.h>
@@ -96,11 +97,11 @@ static void ncp_req_put(struct ncp_request_reply *req)
kfree(req);
}
-void ncp_tcp_data_ready(struct sock *sk, int len)
+void ncp_tcp_data_ready(struct sock *sk)
{
struct ncp_server *server = sk->sk_user_data;
- server->data_ready(sk, len);
+ server->data_ready(sk);
schedule_work(&server->rcv.tq);
}
@@ -231,7 +232,7 @@ static void __ncptcp_try_send(struct ncp_server *server)
return;
if (result < 0) {
- printk(KERN_ERR "ncpfs: tcp: Send failed: %d\n", result);
+ pr_err("tcp: Send failed: %d\n", result);
__ncp_abort_request(server, rq, result);
return;
}
@@ -332,7 +333,7 @@ static int ncp_add_request(struct ncp_server *server, struct ncp_request_reply *
mutex_lock(&server->rcv.creq_mutex);
if (!ncp_conn_valid(server)) {
mutex_unlock(&server->rcv.creq_mutex);
- printk(KERN_ERR "ncpfs: tcp: Server died\n");
+ pr_err("tcp: Server died\n");
return -EIO;
}
ncp_req_get(req);
@@ -405,15 +406,15 @@ void ncpdgram_rcv_proc(struct work_struct *work)
}
result = _recv(sock, buf, sizeof(buf), MSG_DONTWAIT);
if (result < 0) {
- DPRINTK("recv failed with %d\n", result);
+ ncp_dbg(1, "recv failed with %d\n", result);
continue;
}
if (result < 10) {
- DPRINTK("too short (%u) watchdog packet\n", result);
+ ncp_dbg(1, "too short (%u) watchdog packet\n", result);
continue;
}
if (buf[9] != '?') {
- DPRINTK("bad signature (%02X) in watchdog packet\n", buf[9]);
+ ncp_dbg(1, "bad signature (%02X) in watchdog packet\n", buf[9]);
continue;
}
buf[9] = 'Y';
@@ -448,7 +449,7 @@ void ncpdgram_rcv_proc(struct work_struct *work)
result -= 8;
hdrl = sock->sk->sk_family == AF_INET ? 8 : 6;
if (sign_verify_reply(server, server->rxbuf + hdrl, result - hdrl, cpu_to_le32(result), server->rxbuf + result)) {
- printk(KERN_INFO "ncpfs: Signature violation\n");
+ pr_info("Signature violation\n");
result = -EIO;
}
}
@@ -524,7 +525,7 @@ static int do_tcp_rcv(struct ncp_server *server, void *buffer, size_t len)
return result;
}
if (result > len) {
- printk(KERN_ERR "ncpfs: tcp: bug in recvmsg (%u > %Zu)\n", result, len);
+ pr_err("tcp: bug in recvmsg (%u > %Zu)\n", result, len);
return -EIO;
}
return result;
@@ -552,9 +553,9 @@ static int __ncptcp_rcv_proc(struct ncp_server *server)
__ncptcp_abort(server);
}
if (result < 0) {
- printk(KERN_ERR "ncpfs: tcp: error in recvmsg: %d\n", result);
+ pr_err("tcp: error in recvmsg: %d\n", result);
} else {
- DPRINTK(KERN_ERR "ncpfs: tcp: EOF\n");
+ ncp_dbg(1, "tcp: EOF\n");
}
return -EIO;
}
@@ -566,20 +567,20 @@ static int __ncptcp_rcv_proc(struct ncp_server *server)
switch (server->rcv.state) {
case 0:
if (server->rcv.buf.magic != htonl(NCP_TCP_RCVD_MAGIC)) {
- printk(KERN_ERR "ncpfs: tcp: Unexpected reply type %08X\n", ntohl(server->rcv.buf.magic));
+ pr_err("tcp: Unexpected reply type %08X\n", ntohl(server->rcv.buf.magic));
__ncptcp_abort(server);
return -EIO;
}
datalen = ntohl(server->rcv.buf.len) & 0x0FFFFFFF;
if (datalen < 10) {
- printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d\n", datalen);
+ pr_err("tcp: Unexpected reply len %d\n", datalen);
__ncptcp_abort(server);
return -EIO;
}
#ifdef CONFIG_NCPFS_PACKET_SIGNING
if (server->sign_active) {
if (datalen < 18) {
- printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d\n", datalen);
+ pr_err("tcp: Unexpected reply len %d\n", datalen);
__ncptcp_abort(server);
return -EIO;
}
@@ -604,7 +605,7 @@ cont:;
server->rcv.len = datalen - 10;
break;
}
- DPRINTK("ncpfs: tcp: Unexpected NCP type %02X\n", type);
+ ncp_dbg(1, "tcp: Unexpected NCP type %02X\n", type);
skipdata2:;
server->rcv.state = 2;
skipdata:;
@@ -614,11 +615,11 @@ skipdata:;
}
req = server->rcv.creq;
if (!req) {
- DPRINTK(KERN_ERR "ncpfs: Reply without appropriate request\n");
+ ncp_dbg(1, "Reply without appropriate request\n");
goto skipdata2;
}
if (datalen > req->datalen + 8) {
- printk(KERN_ERR "ncpfs: tcp: Unexpected reply len %d (expected at most %Zd)\n", datalen, req->datalen + 8);
+ pr_err("tcp: Unexpected reply len %d (expected at most %Zd)\n", datalen, req->datalen + 8);
server->rcv.state = 3;
goto skipdata;
}
@@ -638,12 +639,12 @@ skipdata:;
req = server->rcv.creq;
if (req->tx_type != NCP_ALLOC_SLOT_REQUEST) {
if (((struct ncp_reply_header*)server->rxbuf)->sequence != server->sequence) {
- printk(KERN_ERR "ncpfs: tcp: Bad sequence number\n");
+ pr_err("tcp: Bad sequence number\n");
__ncp_abort_request(server, req, -EIO);
return -EIO;
}
if ((((struct ncp_reply_header*)server->rxbuf)->conn_low | (((struct ncp_reply_header*)server->rxbuf)->conn_high << 8)) != server->connection) {
- printk(KERN_ERR "ncpfs: tcp: Connection number mismatch\n");
+ pr_err("tcp: Connection number mismatch\n");
__ncp_abort_request(server, req, -EIO);
return -EIO;
}
@@ -651,7 +652,7 @@ skipdata:;
#ifdef CONFIG_NCPFS_PACKET_SIGNING
if (server->sign_active && req->tx_type != NCP_DEALLOC_SLOT_REQUEST) {
if (sign_verify_reply(server, server->rxbuf + 6, req->datalen - 6, cpu_to_be32(req->datalen + 16), &server->rcv.buf.type)) {
- printk(KERN_ERR "ncpfs: tcp: Signature violation\n");
+ pr_err("tcp: Signature violation\n");
__ncp_abort_request(server, req, -EIO);
return -EIO;
}
@@ -742,7 +743,7 @@ static int ncp_do_request(struct ncp_server *server, int size,
int result;
if (server->lock == 0) {
- printk(KERN_ERR "ncpfs: Server not locked!\n");
+ pr_err("Server not locked!\n");
return -EIO;
}
if (!ncp_conn_valid(server)) {
@@ -781,7 +782,7 @@ static int ncp_do_request(struct ncp_server *server, int size,
spin_unlock_irqrestore(&current->sighand->siglock, flags);
}
- DDPRINTK("do_ncp_rpc_call returned %d\n", result);
+ ncp_dbg(2, "do_ncp_rpc_call returned %d\n", result);
return result;
}
@@ -811,7 +812,7 @@ int ncp_request2(struct ncp_server *server, int function,
result = ncp_do_request(server, server->current_size, reply, size);
if (result < 0) {
- DPRINTK("ncp_request_error: %d\n", result);
+ ncp_dbg(1, "ncp_request_error: %d\n", result);
goto out;
}
server->completion = reply->completion_code;
@@ -822,7 +823,7 @@ int ncp_request2(struct ncp_server *server, int function,
result = reply->completion_code;
if (result != 0)
- PPRINTK("ncp_request: completion code=%x\n", result);
+ ncp_vdbg("completion code=%x\n", result);
out:
return result;
}
@@ -865,14 +866,14 @@ void ncp_lock_server(struct ncp_server *server)
{
mutex_lock(&server->mutex);
if (server->lock)
- printk(KERN_WARNING "ncp_lock_server: was locked!\n");
+ pr_warn("%s: was locked!\n", __func__);
server->lock = 1;
}
void ncp_unlock_server(struct ncp_server *server)
{
if (!server->lock) {
- printk(KERN_WARNING "ncp_unlock_server: was not locked!\n");
+ pr_warn("%s: was not locked!\n", __func__);
return;
}
server->lock = 0;
diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c
index 52439ddc8de..1a63bfdb4a6 100644
--- a/fs/ncpfs/symlink.c
+++ b/fs/ncpfs/symlink.c
@@ -112,7 +112,7 @@ int ncp_symlink(struct inode *dir, struct dentry *dentry, const char *symname) {
__le32 attr;
unsigned int hdr;
- DPRINTK("ncp_symlink(dir=%p,dentry=%p,symname=%s)\n",dir,dentry,symname);
+ ncp_dbg(1, "dir=%p, dentry=%p, symname=%s\n", dir, dentry, symname);
if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber))
kludge = 0;
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index b5e80b0af31..3dece03f2fc 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -116,17 +116,17 @@ config NFS_V4_2
config PNFS_FILE_LAYOUT
tristate
depends on NFS_V4_1
- default m
+ default NFS_V4
config PNFS_BLOCK
tristate
depends on NFS_V4_1 && BLK_DEV_DM
- default m
+ default NFS_V4
config PNFS_OBJLAYOUT
tristate
depends on NFS_V4_1 && SCSI_OSD_ULD
- default m
+ default NFS_V4
config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
string "NFSv4.1 Implementation ID Domain"
@@ -140,6 +140,17 @@ config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
If the NFS client is unchanged from the upstream kernel, this
option should be set to the default "kernel.org".
+config NFS_V4_1_MIGRATION
+ bool "NFSv4.1 client support for migration"
+ depends on NFS_V4_1
+ default n
+ help
+ This option makes the NFS client advertise to NFSv4.1 servers that
+ it can support NFSv4 migration.
+
+ The NFSv4.1 pieces of the Linux NFSv4 migration implementation are
+ still experimental. If you are not an NFSv4 developer, say N here.
+
config NFS_V4_SECURITY_LABEL
bool
depends on NFS_V4_2 && SECURITY
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 03192a66c14..4782e0840dc 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -29,8 +29,6 @@ nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o
-obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
-nfs_layout_nfsv41_files-y := nfs4filelayout.o nfs4filelayoutdev.o
-
+obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/
obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index e242bbf7297..9b431f44fad 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -134,8 +134,8 @@ bl_submit_bio(int rw, struct bio *bio)
if (bio) {
get_parallel(bio->bi_private);
dprintk("%s submitting %s bio %u@%llu\n", __func__,
- rw == READ ? "read" : "write",
- bio->bi_size, (unsigned long long)bio->bi_sector);
+ rw == READ ? "read" : "write", bio->bi_iter.bi_size,
+ (unsigned long long)bio->bi_iter.bi_sector);
submit_bio(rw, bio);
}
return NULL;
@@ -156,7 +156,8 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
}
if (bio) {
- bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
+ bio->bi_iter.bi_sector = isect - be->be_f_offset +
+ be->be_v_offset;
bio->bi_bdev = be->be_mdev;
bio->bi_end_io = end_io;
bio->bi_private = par;
@@ -201,19 +202,15 @@ static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
static void bl_end_io_read(struct bio *bio, int err)
{
struct parallel_io *par = bio->bi_private;
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+ struct bio_vec *bvec;
+ int i;
- do {
- struct page *page = bvec->bv_page;
+ if (!err)
+ bio_for_each_segment_all(bvec, bio, i)
+ SetPageUptodate(bvec->bv_page);
- if (--bvec >= bio->bi_io_vec)
- prefetchw(&bvec->bv_page->flags);
- if (uptodate)
- SetPageUptodate(page);
- } while (bvec >= bio->bi_io_vec);
- if (!uptodate) {
- struct nfs_read_data *rdata = par->data;
+ if (err) {
+ struct nfs_pgio_data *rdata = par->data;
struct nfs_pgio_header *header = rdata->header;
if (!header->pnfs_error)
@@ -227,17 +224,17 @@ static void bl_end_io_read(struct bio *bio, int err)
static void bl_read_cleanup(struct work_struct *work)
{
struct rpc_task *task;
- struct nfs_read_data *rdata;
+ struct nfs_pgio_data *rdata;
dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work);
- rdata = container_of(task, struct nfs_read_data, task);
+ rdata = container_of(task, struct nfs_pgio_data, task);
pnfs_ld_read_done(rdata);
}
static void
bl_end_par_io_read(void *data, int unused)
{
- struct nfs_read_data *rdata = data;
+ struct nfs_pgio_data *rdata = data;
rdata->task.tk_status = rdata->header->pnfs_error;
INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
@@ -245,7 +242,7 @@ bl_end_par_io_read(void *data, int unused)
}
static enum pnfs_try_status
-bl_read_pagelist(struct nfs_read_data *rdata)
+bl_read_pagelist(struct nfs_pgio_data *rdata)
{
struct nfs_pgio_header *header = rdata->header;
int i, hole;
@@ -383,21 +380,17 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
static void bl_end_io_write_zero(struct bio *bio, int err)
{
struct parallel_io *par = bio->bi_private;
- const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
-
- do {
- struct page *page = bvec->bv_page;
+ struct bio_vec *bvec;
+ int i;
- if (--bvec >= bio->bi_io_vec)
- prefetchw(&bvec->bv_page->flags);
+ bio_for_each_segment_all(bvec, bio, i) {
/* This is the zeroing page we added */
- end_page_writeback(page);
- page_cache_release(page);
- } while (bvec >= bio->bi_io_vec);
+ end_page_writeback(bvec->bv_page);
+ page_cache_release(bvec->bv_page);
+ }
- if (unlikely(!uptodate)) {
- struct nfs_write_data *data = par->data;
+ if (unlikely(err)) {
+ struct nfs_pgio_data *data = par->data;
struct nfs_pgio_header *header = data->header;
if (!header->pnfs_error)
@@ -412,7 +405,7 @@ static void bl_end_io_write(struct bio *bio, int err)
{
struct parallel_io *par = bio->bi_private;
const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
- struct nfs_write_data *data = par->data;
+ struct nfs_pgio_data *data = par->data;
struct nfs_pgio_header *header = data->header;
if (!uptodate) {
@@ -430,10 +423,10 @@ static void bl_end_io_write(struct bio *bio, int err)
static void bl_write_cleanup(struct work_struct *work)
{
struct rpc_task *task;
- struct nfs_write_data *wdata;
+ struct nfs_pgio_data *wdata;
dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work);
- wdata = container_of(task, struct nfs_write_data, task);
+ wdata = container_of(task, struct nfs_pgio_data, task);
if (likely(!wdata->header->pnfs_error)) {
/* Marks for LAYOUTCOMMIT */
mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg),
@@ -445,7 +438,7 @@ static void bl_write_cleanup(struct work_struct *work)
/* Called when last of bios associated with a bl_write_pagelist call finishes */
static void bl_end_par_io_write(void *data, int num_se)
{
- struct nfs_write_data *wdata = data;
+ struct nfs_pgio_data *wdata = data;
if (unlikely(wdata->header->pnfs_error)) {
bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval,
@@ -519,7 +512,7 @@ bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
(offset / SECTOR_SIZE);
- bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
+ bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset;
bio->bi_bdev = be->be_mdev;
bio->bi_end_io = bl_read_single_end_io;
@@ -680,7 +673,7 @@ check_page:
}
static enum pnfs_try_status
-bl_write_pagelist(struct nfs_write_data *wdata, int sync)
+bl_write_pagelist(struct nfs_pgio_data *wdata, int sync)
{
struct nfs_pgio_header *header = wdata->header;
int i, ret, npg_zero, pg_index, last = 0;
@@ -1196,13 +1189,17 @@ bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
pnfs_generic_pg_init_read(pgio, req);
}
-static bool
+/*
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t
bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
if (pgio->pg_dreq != NULL &&
!is_aligned_req(req, SECTOR_SIZE))
- return false;
+ return 0;
return pnfs_generic_pg_test(pgio, prev, req);
}
@@ -1220,7 +1217,7 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
end = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
if (end != NFS_I(inode)->npages) {
rcu_read_lock();
- end = radix_tree_next_hole(&mapping->page_tree, idx + 1, ULONG_MAX);
+ end = page_cache_next_hole(mapping, idx + 1, ULONG_MAX);
rcu_read_unlock();
}
@@ -1248,13 +1245,17 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
}
}
-static bool
+/*
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t
bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
if (pgio->pg_dreq != NULL &&
!is_aligned_req(req, PAGE_CACHE_SIZE))
- return false;
+ return 0;
return pnfs_generic_pg_test(pgio, prev, req);
}
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 8485978993e..9838fb02047 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -36,6 +36,7 @@
#include <linux/nfs_fs.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
+#include "../nfs4_fs.h"
#include "../pnfs.h"
#include "../netns.h"
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 9c3e117c3ed..4d016144256 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -44,7 +44,7 @@
static inline sector_t normalize(sector_t s, int base)
{
sector_t tmp = s; /* Since do_div modifies its argument */
- return s - do_div(tmp, base);
+ return s - sector_div(tmp, base);
}
static inline sector_t normalize_up(sector_t s, int base)
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 67cd7321316..073b4cf67ed 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -164,8 +164,7 @@ nfs41_callback_up(struct svc_serv *serv)
svc_xprt_put(serv->sv_bc_xprt);
serv->sv_bc_xprt = NULL;
}
- dprintk("--> %s return %ld\n", __func__,
- IS_ERR(rqstp) ? PTR_ERR(rqstp) : 0);
+ dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp));
return rqstp;
}
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index ae2e87b9545..41db5258e7a 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -112,7 +112,8 @@ out:
* TODO: keep track of all layouts (and delegations) in a hash table
* hashed by filehandle.
*/
-static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, struct nfs_fh *fh)
+static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
+ struct nfs_fh *fh, nfs4_stateid *stateid)
{
struct nfs_server *server;
struct inode *ino;
@@ -120,17 +121,19 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
list_for_each_entry(lo, &server->layouts, plh_layouts) {
+ if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid))
+ continue;
if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh))
continue;
ino = igrab(lo->plh_inode);
if (!ino)
- continue;
+ break;
spin_lock(&ino->i_lock);
/* Is this layout in the process of being freed? */
if (NFS_I(ino)->layout != lo) {
spin_unlock(&ino->i_lock);
iput(ino);
- continue;
+ break;
}
pnfs_get_layout_hdr(lo);
spin_unlock(&ino->i_lock);
@@ -141,13 +144,14 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
return NULL;
}
-static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, struct nfs_fh *fh)
+static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
+ struct nfs_fh *fh, nfs4_stateid *stateid)
{
struct pnfs_layout_hdr *lo;
spin_lock(&clp->cl_lock);
rcu_read_lock();
- lo = get_layout_by_fh_locked(clp, fh);
+ lo = get_layout_by_fh_locked(clp, fh, stateid);
rcu_read_unlock();
spin_unlock(&clp->cl_lock);
@@ -162,9 +166,9 @@ static u32 initiate_file_draining(struct nfs_client *clp,
u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
LIST_HEAD(free_me_list);
- lo = get_layout_by_fh(clp, &args->cbl_fh);
+ lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid);
if (!lo)
- return NFS4ERR_NOMATCHING_LAYOUT;
+ goto out;
ino = lo->plh_inode;
spin_lock(&ino->i_lock);
@@ -179,6 +183,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
pnfs_free_lseg_list(&free_me_list);
pnfs_put_layout_hdr(lo);
iput(ino);
+out:
return rv;
}
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2dceee4db07..1d09289c8f0 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -590,6 +590,8 @@ int nfs_create_rpc_client(struct nfs_client *clp,
if (test_bit(NFS_CS_DISCRTRY, &clp->cl_flags))
args.flags |= RPC_CLNT_CREATE_DISCRTRY;
+ if (test_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags))
+ args.flags |= RPC_CLNT_CREATE_NO_RETRANS_TIMEOUT;
if (test_bit(NFS_CS_NORESVPORT, &clp->cl_flags))
args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
if (test_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags))
@@ -784,8 +786,10 @@ static int nfs_init_server(struct nfs_server *server,
goto error;
server->port = data->nfs_server.port;
+ server->auth_info = data->auth_info;
- error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]);
+ error = nfs_init_server_rpcclient(server, &timeparms,
+ data->selected_flavor);
if (error < 0)
goto error;
@@ -926,6 +930,7 @@ void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *sour
target->acdirmax = source->acdirmax;
target->caps = source->caps;
target->options = source->options;
+ target->auth_info = source->auth_info;
}
EXPORT_SYMBOL_GPL(nfs_server_copy_userdata);
@@ -943,7 +948,7 @@ void nfs_server_insert_lists(struct nfs_server *server)
}
EXPORT_SYMBOL_GPL(nfs_server_insert_lists);
-static void nfs_server_remove_lists(struct nfs_server *server)
+void nfs_server_remove_lists(struct nfs_server *server)
{
struct nfs_client *clp = server->nfs_client;
struct nfs_net *nn;
@@ -960,6 +965,7 @@ static void nfs_server_remove_lists(struct nfs_server *server)
synchronize_rcu();
}
+EXPORT_SYMBOL_GPL(nfs_server_remove_lists);
/*
* Allocate and initialise a server record
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index ef792f29f83..5d8ccecf5f5 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -659,16 +659,19 @@ int nfs_async_inode_return_delegation(struct inode *inode,
rcu_read_lock();
delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (delegation == NULL)
+ goto out_enoent;
- if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid)) {
- rcu_read_unlock();
- return -ENOENT;
- }
+ if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid))
+ goto out_enoent;
nfs_mark_return_delegation(server, delegation);
rcu_read_unlock();
nfs_delegation_run_state_manager(clp);
return 0;
+out_enoent:
+ rcu_read_unlock();
+ return -ENOENT;
}
static struct inode *
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 8b3dd7174fa..4a3d4ef7612 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -69,21 +69,28 @@ const struct address_space_operations nfs_dir_aops = {
static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir, struct rpc_cred *cred)
{
+ struct nfs_inode *nfsi = NFS_I(dir);
struct nfs_open_dir_context *ctx;
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
if (ctx != NULL) {
ctx->duped = 0;
- ctx->attr_gencount = NFS_I(dir)->attr_gencount;
+ ctx->attr_gencount = nfsi->attr_gencount;
ctx->dir_cookie = 0;
ctx->dup_cookie = 0;
ctx->cred = get_rpccred(cred);
+ spin_lock(&dir->i_lock);
+ list_add(&ctx->list, &nfsi->open_files);
+ spin_unlock(&dir->i_lock);
return ctx;
}
return ERR_PTR(-ENOMEM);
}
-static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx)
+static void put_nfs_open_dir_context(struct inode *dir, struct nfs_open_dir_context *ctx)
{
+ spin_lock(&dir->i_lock);
+ list_del(&ctx->list);
+ spin_unlock(&dir->i_lock);
put_rpccred(ctx->cred);
kfree(ctx);
}
@@ -126,7 +133,7 @@ out:
static int
nfs_closedir(struct inode *inode, struct file *filp)
{
- put_nfs_open_dir_context(filp->private_data);
+ put_nfs_open_dir_context(filp->f_path.dentry->d_inode, filp->private_data);
return 0;
}
@@ -274,6 +281,15 @@ out_eof:
return -EBADCOOKIE;
}
+static bool
+nfs_readdir_inode_mapping_valid(struct nfs_inode *nfsi)
+{
+ if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
+ return false;
+ smp_rmb();
+ return !test_bit(NFS_INO_INVALIDATING, &nfsi->flags);
+}
+
static
int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
{
@@ -287,8 +303,8 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
struct nfs_open_dir_context *ctx = desc->file->private_data;
new_pos = desc->current_index + i;
- if (ctx->attr_gencount != nfsi->attr_gencount
- || (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))) {
+ if (ctx->attr_gencount != nfsi->attr_gencount ||
+ !nfs_readdir_inode_mapping_valid(nfsi)) {
ctx->duped = 0;
ctx->attr_gencount = nfsi->attr_gencount;
} else if (new_pos < desc->ctx->pos) {
@@ -297,10 +313,9 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
if (printk_ratelimit()) {
pr_notice("NFS: directory %pD2 contains a readdir loop."
"Please contact your server vendor. "
- "The file: %s has duplicate cookie %llu\n",
- desc->file,
- array->array[i].string.name,
- *desc->dir_cookie);
+ "The file: %.*s has duplicate cookie %llu\n",
+ desc->file, array->array[i].string.len,
+ array->array[i].string.name, *desc->dir_cookie);
}
status = -ELOOP;
goto out;
@@ -428,6 +443,22 @@ void nfs_advise_use_readdirplus(struct inode *dir)
set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags);
}
+/*
+ * This function is mainly for use by nfs_getattr().
+ *
+ * If this is an 'ls -l', we want to force use of readdirplus.
+ * Do this by checking if there is an active file descriptor
+ * and calling nfs_advise_use_readdirplus, then forcing a
+ * cache flush.
+ */
+void nfs_force_use_readdirplus(struct inode *dir)
+{
+ if (!list_empty(&NFS_I(dir)->open_files)) {
+ nfs_advise_use_readdirplus(dir);
+ nfs_zap_mapping(dir, dir->i_mapping);
+ }
+}
+
static
void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
{
@@ -806,6 +837,17 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc)
goto out;
}
+static bool nfs_dir_mapping_need_revalidate(struct inode *dir)
+{
+ struct nfs_inode *nfsi = NFS_I(dir);
+
+ if (nfs_attribute_cache_expired(dir))
+ return true;
+ if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+ return true;
+ return false;
+}
+
/* The file offset position represents the dirent entry number. A
last cookie cache takes care of the common case of reading the
whole directory.
@@ -838,7 +880,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
desc->plus = nfs_use_readdirplus(inode, ctx) ? 1 : 0;
nfs_block_sillyrename(dentry);
- if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
+ if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode))
res = nfs_revalidate_mapping(inode, file->f_mapping);
if (res < 0)
goto out;
@@ -1125,7 +1167,13 @@ out_zap_parent:
if (inode && S_ISDIR(inode->i_mode)) {
/* Purge readdir caches. */
nfs_zap_caches(inode);
- if (dentry->d_flags & DCACHE_DISCONNECTED)
+ /*
+ * We can't d_drop the root of a disconnected tree:
+ * its d_hash is on the s_anon list and d_drop() would hide
+ * it from shrink_dcache_for_unmount(), leading to busy
+ * inodes on unmount and further oopses.
+ */
+ if (IS_ROOT(dentry))
goto out_valid;
}
/* If we have submounts, don't unhash ! */
@@ -1361,7 +1409,7 @@ static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, i
static int do_open(struct inode *inode, struct file *filp)
{
- nfs_fscache_set_inode_cookie(inode, filp);
+ nfs_fscache_open_file(inode, filp);
return 0;
}
@@ -1398,7 +1446,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
/* Expect a negative dentry */
BUG_ON(dentry->d_inode);
- dfprintk(VFS, "NFS: atomic_open(%s/%ld), %pd\n",
+ dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n",
dir->i_sb->s_id, dir->i_ino, dentry);
err = nfs_check_flags(open_flags);
@@ -1588,7 +1636,7 @@ int nfs_create(struct inode *dir, struct dentry *dentry,
int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT;
int error;
- dfprintk(VFS, "NFS: create(%s/%ld), %pd\n",
+ dfprintk(VFS, "NFS: create(%s/%lu), %pd\n",
dir->i_sb->s_id, dir->i_ino, dentry);
attr.ia_mode = mode;
@@ -1615,7 +1663,7 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
struct iattr attr;
int status;
- dfprintk(VFS, "NFS: mknod(%s/%ld), %pd\n",
+ dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n",
dir->i_sb->s_id, dir->i_ino, dentry);
if (!new_valid_dev(rdev))
@@ -1644,7 +1692,7 @@ int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct iattr attr;
int error;
- dfprintk(VFS, "NFS: mkdir(%s/%ld), %pd\n",
+ dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n",
dir->i_sb->s_id, dir->i_ino, dentry);
attr.ia_valid = ATTR_MODE;
@@ -1672,7 +1720,7 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry)
{
int error;
- dfprintk(VFS, "NFS: rmdir(%s/%ld), %pd\n",
+ dfprintk(VFS, "NFS: rmdir(%s/%lu), %pd\n",
dir->i_sb->s_id, dir->i_ino, dentry);
trace_nfs_rmdir_enter(dir, dentry);
@@ -1741,7 +1789,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry)
int error;
int need_rehash = 0;
- dfprintk(VFS, "NFS: unlink(%s/%ld, %pd)\n", dir->i_sb->s_id,
+ dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id,
dir->i_ino, dentry);
trace_nfs_unlink_enter(dir, dentry);
@@ -1792,7 +1840,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
unsigned int pathlen = strlen(symname);
int error;
- dfprintk(VFS, "NFS: symlink(%s/%ld, %pd, %s)\n", dir->i_sb->s_id,
+ dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s)\n", dir->i_sb->s_id,
dir->i_ino, dentry, symname);
if (pathlen > PAGE_SIZE)
@@ -1815,7 +1863,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
trace_nfs_symlink_exit(dir, dentry, error);
if (error != 0) {
- dfprintk(VFS, "NFS: symlink(%s/%ld, %pd, %s) error %d\n",
+ dfprintk(VFS, "NFS: symlink(%s/%lu, %pd, %s) error %d\n",
dir->i_sb->s_id, dir->i_ino,
dentry, symname, error);
d_drop(dentry);
@@ -1831,6 +1879,11 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
GFP_KERNEL)) {
SetPageUptodate(page);
unlock_page(page);
+ /*
+ * add_to_page_cache_lru() grabs an extra page refcount.
+ * Drop it here to avoid leaking this page later.
+ */
+ page_cache_release(page);
} else
__free_page(page);
@@ -1891,6 +1944,7 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *old_inode = old_dentry->d_inode;
struct inode *new_inode = new_dentry->d_inode;
struct dentry *dentry = NULL, *rehash = NULL;
+ struct rpc_task *task;
int error = -EBUSY;
dfprintk(VFS, "NFS: rename(%pd2 -> %pd2, ct=%d)\n",
@@ -1938,8 +1992,16 @@ int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (new_inode != NULL)
NFS_PROTO(new_inode)->return_delegation(new_inode);
- error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
- new_dir, &new_dentry->d_name);
+ task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL);
+ if (IS_ERR(task)) {
+ error = PTR_ERR(task);
+ goto out;
+ }
+
+ error = rpc_wait_for_completion_task(task);
+ if (error == 0)
+ error = task->tk_status;
+ rpc_put_task(task);
nfs_mark_for_revalidate(old_inode);
out:
if (rehash)
@@ -1970,9 +2032,9 @@ static void nfs_access_free_entry(struct nfs_access_entry *entry)
{
put_rpccred(entry->cred);
kfree(entry);
- smp_mb__before_atomic_dec();
+ smp_mb__before_atomic();
atomic_long_dec(&nfs_access_nr_entries);
- smp_mb__after_atomic_dec();
+ smp_mb__after_atomic();
}
static void nfs_access_free_list(struct list_head *head)
@@ -2020,9 +2082,9 @@ nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
else {
remove_lru_entry:
list_del_init(&nfsi->access_cache_inode_lru);
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
}
spin_unlock(&inode->i_lock);
}
@@ -2170,9 +2232,9 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
nfs_access_add_rbtree(inode, cache);
/* Update accounting */
- smp_mb__before_atomic_inc();
+ smp_mb__before_atomic();
atomic_long_inc(&nfs_access_nr_entries);
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
/* Add inode to global LRU list */
if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
@@ -2298,7 +2360,7 @@ out:
if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
res = -EACCES;
- dfprintk(VFS, "NFS: permission(%s/%ld), mask=0x%x, res=%d\n",
+ dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n",
inode->i_sb->s_id, inode->i_ino, mask, res);
return res;
out_notsup:
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index d71d66c9e0a..f11b9eed0de 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -108,6 +108,97 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
return atomic_dec_and_test(&dreq->io_count);
}
+/*
+ * nfs_direct_select_verf - select the right verifier
+ * @dreq - direct request possibly spanning multiple servers
+ * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs
+ * @ds_idx - index of data server in data server list, only valid if ds_clp set
+ *
+ * returns the correct verifier to use given the role of the server
+ */
+static struct nfs_writeverf *
+nfs_direct_select_verf(struct nfs_direct_req *dreq,
+ struct nfs_client *ds_clp,
+ int ds_idx)
+{
+ struct nfs_writeverf *verfp = &dreq->verf;
+
+#ifdef CONFIG_NFS_V4_1
+ if (ds_clp) {
+ /* pNFS is in use, use the DS verf */
+ if (ds_idx >= 0 && ds_idx < dreq->ds_cinfo.nbuckets)
+ verfp = &dreq->ds_cinfo.buckets[ds_idx].direct_verf;
+ else
+ WARN_ON_ONCE(1);
+ }
+#endif
+ return verfp;
+}
+
+
+/*
+ * nfs_direct_set_hdr_verf - set the write/commit verifier
+ * @dreq - direct request possibly spanning multiple servers
+ * @hdr - pageio header to validate against previously seen verfs
+ *
+ * Set the server's (MDS or DS) "seen" verifier
+ */
+static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
+ struct nfs_pgio_header *hdr)
+{
+ struct nfs_writeverf *verfp;
+
+ verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp,
+ hdr->data->ds_idx);
+ WARN_ON_ONCE(verfp->committed >= 0);
+ memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
+ WARN_ON_ONCE(verfp->committed < 0);
+}
+
+/*
+ * nfs_direct_cmp_hdr_verf - compare verifier for pgio header
+ * @dreq - direct request possibly spanning multiple servers
+ * @hdr - pageio header to validate against previously seen verf
+ *
+ * set the server's "seen" verf if not initialized.
+ * returns result of comparison between @hdr->verf and the "seen"
+ * verf of the server used by @hdr (DS or MDS)
+ */
+static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
+ struct nfs_pgio_header *hdr)
+{
+ struct nfs_writeverf *verfp;
+
+ verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp,
+ hdr->data->ds_idx);
+ if (verfp->committed < 0) {
+ nfs_direct_set_hdr_verf(dreq, hdr);
+ return 0;
+ }
+ return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
+}
+
+#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
+/*
+ * nfs_direct_cmp_commit_data_verf - compare verifier for commit data
+ * @dreq - direct request possibly spanning multiple servers
+ * @data - commit data to validate against previously seen verf
+ *
+ * returns result of comparison between @data->verf and the verf of
+ * the server used by @data (DS or MDS)
+ */
+static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
+ struct nfs_commit_data *data)
+{
+ struct nfs_writeverf *verfp;
+
+ verfp = nfs_direct_select_verf(dreq, data->ds_clp,
+ data->ds_commit_index);
+ WARN_ON_ONCE(verfp->committed < 0);
+ return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
+}
+#endif
+
/**
* nfs_direct_IO - NFS address space operation for direct I/O
* @rw: direction (read or write)
@@ -121,20 +212,20 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
* shunt off direct read and write requests before the VFS gets them,
* so this method is only ever called for swap.
*/
-ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
+ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
{
#ifndef CONFIG_NFS_SWAP
dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n",
- iocb->ki_filp, (long long) pos, nr_segs);
+ iocb->ki_filp, (long long) pos, iter->nr_segs);
return -EINVAL;
#else
VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
if (rw == READ || rw == KERNEL_READ)
- return nfs_file_direct_read(iocb, iov, nr_segs, pos,
+ return nfs_file_direct_read(iocb, iter, pos,
rw == READ ? true : false);
- return nfs_file_direct_write(iocb, iov, nr_segs, pos,
+ return nfs_file_direct_write(iocb, iter, pos,
rw == WRITE ? true : false);
#endif /* CONFIG_NFS_SWAP */
}
@@ -168,6 +259,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
kref_get(&dreq->kref);
init_completion(&dreq->completion);
INIT_LIST_HEAD(&dreq->mds_cinfo.list);
+ dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */
INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
spin_lock_init(&dreq->lock);
@@ -222,14 +314,31 @@ out:
* Synchronous I/O uses a stack-allocated iocb. Thus we can't trust
* the iocb is still valid here if this is a synchronous request.
*/
-static void nfs_direct_complete(struct nfs_direct_req *dreq)
+static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
{
+ struct inode *inode = dreq->inode;
+
+ if (dreq->iocb && write) {
+ loff_t pos = dreq->iocb->ki_pos + dreq->count;
+
+ spin_lock(&inode->i_lock);
+ if (i_size_read(inode) < pos)
+ i_size_write(inode, pos);
+ spin_unlock(&inode->i_lock);
+ }
+
+ if (write)
+ nfs_zap_mapping(inode, inode->i_mapping);
+
+ inode_dio_done(inode);
+
if (dreq->iocb) {
long res = (long) dreq->error;
if (!res)
res = (long) dreq->count;
aio_complete(dreq->iocb, res, 0);
}
+
complete_all(&dreq->completion);
nfs_direct_req_release(dreq);
@@ -237,9 +346,9 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
static void nfs_direct_readpage_release(struct nfs_page *req)
{
- dprintk("NFS: direct read done (%s/%lld %d@%lld)\n",
+ dprintk("NFS: direct read done (%s/%llu %d@%lld)\n",
req->wb_context->dentry->d_inode->i_sb->s_id,
- (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+ (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
req->wb_bytes,
(long long)req_offset(req));
nfs_release_request(req);
@@ -272,7 +381,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
}
out_put:
if (put_dreq(dreq))
- nfs_direct_complete(dreq);
+ nfs_direct_complete(dreq, false);
hdr->release(hdr);
}
@@ -305,66 +414,42 @@ static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
* handled automatically by nfs_direct_read_result(). Otherwise, if
* no requests have been sent, just return an error.
*/
-static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc,
- const struct iovec *iov,
- loff_t pos, bool uio)
-{
- struct nfs_direct_req *dreq = desc->pg_dreq;
- struct nfs_open_context *ctx = dreq->ctx;
- struct inode *inode = ctx->dentry->d_inode;
- unsigned long user_addr = (unsigned long)iov->iov_base;
- size_t count = iov->iov_len;
- size_t rsize = NFS_SERVER(inode)->rsize;
- unsigned int pgbase;
- int result;
- ssize_t started = 0;
- struct page **pagevec = NULL;
- unsigned int npages;
-
- do {
- size_t bytes;
- int i;
- pgbase = user_addr & ~PAGE_MASK;
- bytes = min(max_t(size_t, rsize, PAGE_SIZE), count);
+static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
+ struct iov_iter *iter,
+ loff_t pos)
+{
+ struct nfs_pageio_descriptor desc;
+ struct inode *inode = dreq->inode;
+ ssize_t result = -EINVAL;
+ size_t requested_bytes = 0;
+ size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE);
- result = -ENOMEM;
- npages = nfs_page_array_len(pgbase, bytes);
- if (!pagevec)
- pagevec = kmalloc(npages * sizeof(struct page *),
- GFP_KERNEL);
- if (!pagevec)
- break;
- if (uio) {
- down_read(&current->mm->mmap_sem);
- result = get_user_pages(current, current->mm, user_addr,
- npages, 1, 0, pagevec, NULL);
- up_read(&current->mm->mmap_sem);
- if (result < 0)
- break;
- } else {
- WARN_ON(npages != 1);
- result = get_kernel_page(user_addr, 1, pagevec);
- if (WARN_ON(result != 1))
- break;
- }
+ nfs_pageio_init_read(&desc, dreq->inode, false,
+ &nfs_direct_read_completion_ops);
+ get_dreq(dreq);
+ desc.pg_dreq = dreq;
+ atomic_inc(&inode->i_dio_count);
- if ((unsigned)result < npages) {
- bytes = result * PAGE_SIZE;
- if (bytes <= pgbase) {
- nfs_direct_release_pages(pagevec, result);
- break;
- }
- bytes -= pgbase;
- npages = result;
- }
+ while (iov_iter_count(iter)) {
+ struct page **pagevec;
+ size_t bytes;
+ size_t pgbase;
+ unsigned npages, i;
+ result = iov_iter_get_pages_alloc(iter, &pagevec,
+ rsize, &pgbase);
+ if (result < 0)
+ break;
+
+ bytes = result;
+ iov_iter_advance(iter, bytes);
+ npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
for (i = 0; i < npages; i++) {
struct nfs_page *req;
unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
/* XXX do we need to do the eof zeroing found in async_filler? */
- req = nfs_create_request(dreq->ctx, dreq->inode,
- pagevec[i],
+ req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
pgbase, req_len);
if (IS_ERR(req)) {
result = PTR_ERR(req);
@@ -372,54 +457,21 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
}
req->wb_index = pos >> PAGE_SHIFT;
req->wb_offset = pos & ~PAGE_MASK;
- if (!nfs_pageio_add_request(desc, req)) {
- result = desc->pg_error;
+ if (!nfs_pageio_add_request(&desc, req)) {
+ result = desc.pg_error;
nfs_release_request(req);
break;
}
pgbase = 0;
bytes -= req_len;
- started += req_len;
- user_addr += req_len;
+ requested_bytes += req_len;
pos += req_len;
- count -= req_len;
dreq->bytes_left -= req_len;
}
- /* The nfs_page now hold references to these pages */
nfs_direct_release_pages(pagevec, npages);
- } while (count != 0 && result >= 0);
-
- kfree(pagevec);
-
- if (started)
- return started;
- return result < 0 ? (ssize_t) result : -EFAULT;
-}
-
-static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos, bool uio)
-{
- struct nfs_pageio_descriptor desc;
- ssize_t result = -EINVAL;
- size_t requested_bytes = 0;
- unsigned long seg;
-
- NFS_PROTO(dreq->inode)->read_pageio_init(&desc, dreq->inode,
- &nfs_direct_read_completion_ops);
- get_dreq(dreq);
- desc.pg_dreq = dreq;
-
- for (seg = 0; seg < nr_segs; seg++) {
- const struct iovec *vec = &iov[seg];
- result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio);
+ kvfree(pagevec);
if (result < 0)
break;
- requested_bytes += result;
- if ((size_t)result < vec->iov_len)
- break;
- pos += vec->iov_len;
}
nfs_pageio_complete(&desc);
@@ -429,29 +481,69 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
* generic layer handle the completion.
*/
if (requested_bytes == 0) {
+ inode_dio_done(inode);
nfs_direct_req_release(dreq);
return result < 0 ? result : -EIO;
}
if (put_dreq(dreq))
- nfs_direct_complete(dreq);
+ nfs_direct_complete(dreq, false);
return 0;
}
-static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos, bool uio)
+/**
+ * nfs_file_direct_read - file direct read operation for NFS files
+ * @iocb: target I/O control block
+ * @iter: vector of user buffers into which to read data
+ * @pos: byte offset in file where reading starts
+ *
+ * We use this function for direct reads instead of calling
+ * generic_file_aio_read() in order to avoid gfar's check to see if
+ * the request starts before the end of the file. For that check
+ * to work, we must generate a GETATTR before each direct read, and
+ * even then there is a window between the GETATTR and the subsequent
+ * READ where the file size could change. Our preference is simply
+ * to do all reads the application wants, and the server will take
+ * care of managing the end of file boundary.
+ *
+ * This function also eliminates unnecessarily updating the file's
+ * atime locally, as the NFS server sets the file's atime, and this
+ * client must read the updated atime from the server back into its
+ * cache.
+ */
+ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t pos, bool uio)
{
- ssize_t result = -ENOMEM;
- struct inode *inode = iocb->ki_filp->f_mapping->host;
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
struct nfs_direct_req *dreq;
struct nfs_lock_context *l_ctx;
+ ssize_t result = -EINVAL;
+ size_t count = iov_iter_count(iter);
+ nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
+
+ dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
+ file, count, (long long) pos);
+
+ result = 0;
+ if (!count)
+ goto out;
+ mutex_lock(&inode->i_mutex);
+ result = nfs_sync_mapping(mapping);
+ if (result)
+ goto out_unlock;
+
+ task_io_account_read(count);
+
+ result = -ENOMEM;
dreq = nfs_direct_req_alloc();
if (dreq == NULL)
- goto out;
+ goto out_unlock;
dreq->inode = inode;
- dreq->bytes_left = iov_length(iov, nr_segs);
+ dreq->bytes_left = count;
dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
l_ctx = nfs_get_lock_context(dreq->ctx);
if (IS_ERR(l_ctx)) {
@@ -462,22 +554,28 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
if (!is_sync_kiocb(iocb))
dreq->iocb = iocb;
- NFS_I(inode)->read_io += iov_length(iov, nr_segs);
- result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio);
- if (!result)
+ NFS_I(inode)->read_io += count;
+ result = nfs_direct_read_schedule_iovec(dreq, iter, pos);
+
+ mutex_unlock(&inode->i_mutex);
+
+ if (!result) {
result = nfs_direct_wait(dreq);
+ if (result > 0)
+ iocb->ki_pos = pos + result;
+ }
+
+ nfs_direct_req_release(dreq);
+ return result;
+
out_release:
nfs_direct_req_release(dreq);
+out_unlock:
+ mutex_unlock(&inode->i_mutex);
out:
return result;
}
-static void nfs_inode_dio_write_done(struct inode *inode)
-{
- nfs_zap_mapping(inode, inode->i_mapping);
- inode_dio_done(inode);
-}
-
#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
@@ -496,7 +594,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
dreq->count = 0;
get_dreq(dreq);
- NFS_PROTO(dreq->inode)->write_pageio_init(&desc, dreq->inode, FLUSH_STABLE,
+ nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
&nfs_direct_write_completion_ops);
desc.pg_dreq = dreq;
@@ -535,7 +633,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
dprintk("NFS: %5u commit failed with error %d.\n",
data->task.tk_pid, status);
dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
- } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
+ } else if (nfs_direct_cmp_commit_data_verf(dreq, data)) {
dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid);
dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
}
@@ -593,8 +691,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
nfs_direct_write_reschedule(dreq);
break;
default:
- nfs_inode_dio_write_done(dreq->inode);
- nfs_direct_complete(dreq);
+ nfs_direct_complete(dreq, true);
}
}
@@ -610,114 +707,10 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
- nfs_inode_dio_write_done(inode);
- nfs_direct_complete(dreq);
+ nfs_direct_complete(dreq, true);
}
#endif
-/*
- * NB: Return the value of the first error return code. Subsequent
- * errors after the first one are ignored.
- */
-/*
- * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
- * operation. If nfs_writedata_alloc() or get_user_pages() fails,
- * bail and stop sending more writes. Write length accounting is
- * handled automatically by nfs_direct_write_result(). Otherwise, if
- * no requests have been sent, just return an error.
- */
-static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc,
- const struct iovec *iov,
- loff_t pos, bool uio)
-{
- struct nfs_direct_req *dreq = desc->pg_dreq;
- struct nfs_open_context *ctx = dreq->ctx;
- struct inode *inode = ctx->dentry->d_inode;
- unsigned long user_addr = (unsigned long)iov->iov_base;
- size_t count = iov->iov_len;
- size_t wsize = NFS_SERVER(inode)->wsize;
- unsigned int pgbase;
- int result;
- ssize_t started = 0;
- struct page **pagevec = NULL;
- unsigned int npages;
-
- do {
- size_t bytes;
- int i;
-
- pgbase = user_addr & ~PAGE_MASK;
- bytes = min(max_t(size_t, wsize, PAGE_SIZE), count);
-
- result = -ENOMEM;
- npages = nfs_page_array_len(pgbase, bytes);
- if (!pagevec)
- pagevec = kmalloc(npages * sizeof(struct page *), GFP_KERNEL);
- if (!pagevec)
- break;
-
- if (uio) {
- down_read(&current->mm->mmap_sem);
- result = get_user_pages(current, current->mm, user_addr,
- npages, 0, 0, pagevec, NULL);
- up_read(&current->mm->mmap_sem);
- if (result < 0)
- break;
- } else {
- WARN_ON(npages != 1);
- result = get_kernel_page(user_addr, 0, pagevec);
- if (WARN_ON(result != 1))
- break;
- }
-
- if ((unsigned)result < npages) {
- bytes = result * PAGE_SIZE;
- if (bytes <= pgbase) {
- nfs_direct_release_pages(pagevec, result);
- break;
- }
- bytes -= pgbase;
- npages = result;
- }
-
- for (i = 0; i < npages; i++) {
- struct nfs_page *req;
- unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
-
- req = nfs_create_request(dreq->ctx, dreq->inode,
- pagevec[i],
- pgbase, req_len);
- if (IS_ERR(req)) {
- result = PTR_ERR(req);
- break;
- }
- nfs_lock_request(req);
- req->wb_index = pos >> PAGE_SHIFT;
- req->wb_offset = pos & ~PAGE_MASK;
- if (!nfs_pageio_add_request(desc, req)) {
- result = desc->pg_error;
- nfs_unlock_and_release_request(req);
- break;
- }
- pgbase = 0;
- bytes -= req_len;
- started += req_len;
- user_addr += req_len;
- pos += req_len;
- count -= req_len;
- dreq->bytes_left -= req_len;
- }
- /* The nfs_page now hold references to these pages */
- nfs_direct_release_pages(pagevec, npages);
- } while (count != 0 && result >= 0);
-
- kfree(pagevec);
-
- if (started)
- return started;
- return result < 0 ? (ssize_t) result : -EFAULT;
-}
-
static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
{
struct nfs_direct_req *dreq = hdr->dreq;
@@ -747,13 +740,13 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
bit = NFS_IOHDR_NEED_RESCHED;
else if (dreq->flags == 0) {
- memcpy(&dreq->verf, hdr->verf,
- sizeof(dreq->verf));
+ nfs_direct_set_hdr_verf(dreq, hdr);
bit = NFS_IOHDR_NEED_COMMIT;
dreq->flags = NFS_ODIRECT_DO_COMMIT;
} else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
- if (memcmp(&dreq->verf, hdr->verf, sizeof(dreq->verf))) {
- dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) {
+ dreq->flags =
+ NFS_ODIRECT_RESCHED_WRITES;
bit = NFS_IOHDR_NEED_RESCHED;
} else
bit = NFS_IOHDR_NEED_COMMIT;
@@ -763,6 +756,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
spin_unlock(&dreq->lock);
while (!list_empty(&hdr->pages)) {
+
req = nfs_list_entry(hdr->pages.next);
nfs_list_remove_request(req);
switch (bit) {
@@ -797,33 +791,77 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
.completion = nfs_direct_write_completion,
};
+
+/*
+ * NB: Return the value of the first error return code. Subsequent
+ * errors after the first one are ignored.
+ */
+/*
+ * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
+ * operation. If nfs_writedata_alloc() or get_user_pages() fails,
+ * bail and stop sending more writes. Write length accounting is
+ * handled automatically by nfs_direct_write_result(). Otherwise, if
+ * no requests have been sent, just return an error.
+ */
static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos, bool uio)
+ struct iov_iter *iter,
+ loff_t pos)
{
struct nfs_pageio_descriptor desc;
struct inode *inode = dreq->inode;
ssize_t result = 0;
size_t requested_bytes = 0;
- unsigned long seg;
+ size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
- NFS_PROTO(inode)->write_pageio_init(&desc, inode, FLUSH_COND_STABLE,
+ nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false,
&nfs_direct_write_completion_ops);
desc.pg_dreq = dreq;
get_dreq(dreq);
atomic_inc(&inode->i_dio_count);
- NFS_I(dreq->inode)->write_io += iov_length(iov, nr_segs);
- for (seg = 0; seg < nr_segs; seg++) {
- const struct iovec *vec = &iov[seg];
- result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio);
+ NFS_I(inode)->write_io += iov_iter_count(iter);
+ while (iov_iter_count(iter)) {
+ struct page **pagevec;
+ size_t bytes;
+ size_t pgbase;
+ unsigned npages, i;
+
+ result = iov_iter_get_pages_alloc(iter, &pagevec,
+ wsize, &pgbase);
if (result < 0)
break;
- requested_bytes += result;
- if ((size_t)result < vec->iov_len)
+
+ bytes = result;
+ iov_iter_advance(iter, bytes);
+ npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
+ for (i = 0; i < npages; i++) {
+ struct nfs_page *req;
+ unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
+
+ req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
+ pgbase, req_len);
+ if (IS_ERR(req)) {
+ result = PTR_ERR(req);
+ break;
+ }
+ nfs_lock_request(req);
+ req->wb_index = pos >> PAGE_SHIFT;
+ req->wb_offset = pos & ~PAGE_MASK;
+ if (!nfs_pageio_add_request(&desc, req)) {
+ result = desc.pg_error;
+ nfs_unlock_and_release_request(req);
+ break;
+ }
+ pgbase = 0;
+ bytes -= req_len;
+ requested_bytes += req_len;
+ pos += req_len;
+ dreq->bytes_left -= req_len;
+ }
+ nfs_direct_release_pages(pagevec, npages);
+ kvfree(pagevec);
+ if (result < 0)
break;
- pos += vec->iov_len;
}
nfs_pageio_complete(&desc);
@@ -842,98 +880,10 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
return 0;
}
-static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos,
- size_t count, bool uio)
-{
- ssize_t result = -ENOMEM;
- struct inode *inode = iocb->ki_filp->f_mapping->host;
- struct nfs_direct_req *dreq;
- struct nfs_lock_context *l_ctx;
-
- dreq = nfs_direct_req_alloc();
- if (!dreq)
- goto out;
-
- dreq->inode = inode;
- dreq->bytes_left = count;
- dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
- l_ctx = nfs_get_lock_context(dreq->ctx);
- if (IS_ERR(l_ctx)) {
- result = PTR_ERR(l_ctx);
- goto out_release;
- }
- dreq->l_ctx = l_ctx;
- if (!is_sync_kiocb(iocb))
- dreq->iocb = iocb;
-
- result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
- if (!result)
- result = nfs_direct_wait(dreq);
-out_release:
- nfs_direct_req_release(dreq);
-out:
- return result;
-}
-
-/**
- * nfs_file_direct_read - file direct read operation for NFS files
- * @iocb: target I/O control block
- * @iov: vector of user buffers into which to read data
- * @nr_segs: size of iov vector
- * @pos: byte offset in file where reading starts
- *
- * We use this function for direct reads instead of calling
- * generic_file_aio_read() in order to avoid gfar's check to see if
- * the request starts before the end of the file. For that check
- * to work, we must generate a GETATTR before each direct read, and
- * even then there is a window between the GETATTR and the subsequent
- * READ where the file size could change. Our preference is simply
- * to do all reads the application wants, and the server will take
- * care of managing the end of file boundary.
- *
- * This function also eliminates unnecessarily updating the file's
- * atime locally, as the NFS server sets the file's atime, and this
- * client must read the updated atime from the server back into its
- * cache.
- */
-ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos, bool uio)
-{
- ssize_t retval = -EINVAL;
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- size_t count;
-
- count = iov_length(iov, nr_segs);
- nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
-
- dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
- file, count, (long long) pos);
-
- retval = 0;
- if (!count)
- goto out;
-
- retval = nfs_sync_mapping(mapping);
- if (retval)
- goto out;
-
- task_io_account_read(count);
-
- retval = nfs_direct_read(iocb, iov, nr_segs, pos, uio);
- if (retval > 0)
- iocb->ki_pos = pos + retval;
-
-out:
- return retval;
-}
-
/**
* nfs_file_direct_write - file direct write operation for NFS files
* @iocb: target I/O control block
- * @iov: vector of user buffers from which to write data
- * @nr_segs: size of iov vector
+ * @iter: vector of user buffers from which to write data
* @pos: byte offset in file where writing starts
*
* We use this function for direct writes instead of calling
@@ -951,49 +901,97 @@ out:
* Note that O_APPEND is not supported for NFS direct writes, as there
* is no atomic O_APPEND write facility in the NFS protocol.
*/
-ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos, bool uio)
+ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t pos, bool uio)
{
- ssize_t retval = -EINVAL;
+ ssize_t result = -EINVAL;
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
- size_t count;
+ struct inode *inode = mapping->host;
+ struct nfs_direct_req *dreq;
+ struct nfs_lock_context *l_ctx;
+ loff_t end;
+ size_t count = iov_iter_count(iter);
+ end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
- count = iov_length(iov, nr_segs);
nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
file, count, (long long) pos);
- retval = generic_write_checks(file, &pos, &count, 0);
- if (retval)
+ result = generic_write_checks(file, &pos, &count, 0);
+ if (result)
goto out;
- retval = -EINVAL;
+ result = -EINVAL;
if ((ssize_t) count < 0)
goto out;
- retval = 0;
+ result = 0;
if (!count)
goto out;
- retval = nfs_sync_mapping(mapping);
- if (retval)
- goto out;
+ mutex_lock(&inode->i_mutex);
+
+ result = nfs_sync_mapping(mapping);
+ if (result)
+ goto out_unlock;
+
+ if (mapping->nrpages) {
+ result = invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_CACHE_SHIFT, end);
+ if (result)
+ goto out_unlock;
+ }
task_io_account_write(count);
- retval = nfs_direct_write(iocb, iov, nr_segs, pos, count, uio);
- if (retval > 0) {
- struct inode *inode = mapping->host;
+ result = -ENOMEM;
+ dreq = nfs_direct_req_alloc();
+ if (!dreq)
+ goto out_unlock;
- iocb->ki_pos = pos + retval;
- spin_lock(&inode->i_lock);
- if (i_size_read(inode) < iocb->ki_pos)
- i_size_write(inode, iocb->ki_pos);
- spin_unlock(&inode->i_lock);
+ dreq->inode = inode;
+ dreq->bytes_left = count;
+ dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
+ l_ctx = nfs_get_lock_context(dreq->ctx);
+ if (IS_ERR(l_ctx)) {
+ result = PTR_ERR(l_ctx);
+ goto out_release;
+ }
+ dreq->l_ctx = l_ctx;
+ if (!is_sync_kiocb(iocb))
+ dreq->iocb = iocb;
+
+ result = nfs_direct_write_schedule_iovec(dreq, iter, pos);
+
+ if (mapping->nrpages) {
+ invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_CACHE_SHIFT, end);
}
+
+ mutex_unlock(&inode->i_mutex);
+
+ if (!result) {
+ result = nfs_direct_wait(dreq);
+ if (result > 0) {
+ struct inode *inode = mapping->host;
+
+ iocb->ki_pos = pos + result;
+ spin_lock(&inode->i_lock);
+ if (i_size_read(inode) < iocb->ki_pos)
+ i_size_write(inode, iocb->ki_pos);
+ spin_unlock(&inode->i_lock);
+ }
+ }
+ nfs_direct_req_release(dreq);
+ return result;
+
+out_release:
+ nfs_direct_req_release(dreq);
+out_unlock:
+ mutex_unlock(&inode->i_mutex);
out:
- return retval;
+ return result;
}
/**
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index fc0f95ec735..d25f10fb492 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -46,7 +46,9 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
#include <linux/sunrpc/cache.h>
#include <linux/sunrpc/svcauth.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/nfs_fs.h>
+#include "nfs4_fs.h"
#include "dns_resolve.h"
#include "cache_lib.h"
#include "netns.h"
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index e2fcacf07de..4042ff58fe3 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -165,22 +165,21 @@ nfs_file_flush(struct file *file, fl_owner_t id)
EXPORT_SYMBOL_GPL(nfs_file_flush);
ssize_t
-nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
{
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t result;
if (iocb->ki_filp->f_flags & O_DIRECT)
- return nfs_file_direct_read(iocb, iov, nr_segs, pos, true);
+ return nfs_file_direct_read(iocb, to, iocb->ki_pos, true);
- dprintk("NFS: read(%pD2, %lu@%lu)\n",
+ dprintk("NFS: read(%pD2, %zu@%lu)\n",
iocb->ki_filp,
- (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos);
+ iov_iter_count(to), (unsigned long) iocb->ki_pos);
result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
if (!result) {
- result = generic_file_aio_read(iocb, iov, nr_segs, pos);
+ result = generic_file_read_iter(iocb, to);
if (result > 0)
nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
}
@@ -354,7 +353,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
struct page *page;
int once_thru = 0;
- dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%ld), %u@%lld)\n",
+ dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n",
file, mapping->host->i_ino, len, (long long) pos);
start:
@@ -395,7 +394,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
struct nfs_open_context *ctx = nfs_file_open_context(file);
int status;
- dfprintk(PAGECACHE, "NFS: write_end(%pD2(%ld), %u@%lld)\n",
+ dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n",
file, mapping->host->i_ino, len, (long long) pos);
/*
@@ -585,7 +584,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
int ret = VM_FAULT_NOPAGE;
struct address_space *mapping;
- dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%ld), offset %lld)\n",
+ dfprintk(PAGECACHE, "NFS: vm_page_mkwrite(%pD2(%lu), offset %lld)\n",
filp, filp->f_mapping->host->i_ino,
(long long)page_offset(page));
@@ -617,6 +616,7 @@ out:
static const struct vm_operations_struct nfs_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = nfs_vm_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
@@ -634,24 +634,24 @@ static int nfs_need_sync_write(struct file *filp, struct inode *inode)
return 0;
}
-ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
unsigned long written = 0;
ssize_t result;
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(from);
+ loff_t pos = iocb->ki_pos;
result = nfs_key_timeout_notify(file, inode);
if (result)
return result;
if (file->f_flags & O_DIRECT)
- return nfs_file_direct_write(iocb, iov, nr_segs, pos, true);
+ return nfs_file_direct_write(iocb, from, pos, true);
- dprintk("NFS: write(%pD2, %lu@%Ld)\n",
- file, (unsigned long) count, (long long) pos);
+ dprintk("NFS: write(%pD2, %zu@%Ld)\n",
+ file, count, (long long) pos);
result = -EBUSY;
if (IS_SWAPFILE(inode))
@@ -669,7 +669,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
if (!count)
goto out;
- result = generic_file_aio_write(iocb, iov, nr_segs, pos);
+ result = generic_file_write_iter(iocb, from);
if (result > 0)
written = result;
@@ -690,36 +690,6 @@ out_swapfile:
}
EXPORT_SYMBOL_GPL(nfs_file_write);
-ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
- struct file *filp, loff_t *ppos,
- size_t count, unsigned int flags)
-{
- struct inode *inode = file_inode(filp);
- unsigned long written = 0;
- ssize_t ret;
-
- dprintk("NFS splice_write(%pD2, %lu@%llu)\n",
- filp, (unsigned long) count, (unsigned long long) *ppos);
-
- /*
- * The combination of splice and an O_APPEND destination is disallowed.
- */
-
- ret = generic_file_splice_write(pipe, filp, ppos, count, flags);
- if (ret > 0)
- written = ret;
-
- if (ret >= 0 && nfs_need_sync_write(filp, inode)) {
- int err = vfs_fsync(filp, 0);
- if (err < 0)
- ret = err;
- }
- if (ret > 0)
- nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
- return ret;
-}
-EXPORT_SYMBOL_GPL(nfs_file_splice_write);
-
static int
do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
{
@@ -915,10 +885,6 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
is_local = 1;
/* We're simulating flock() locks using posix locks on the server */
- fl->fl_owner = (fl_owner_t)filp;
- fl->fl_start = 0;
- fl->fl_end = OFFSET_MAX;
-
if (fl->fl_type == F_UNLCK)
return do_unlk(filp, cmd, fl, is_local);
return do_setlk(filp, cmd, fl, is_local);
@@ -938,10 +904,10 @@ EXPORT_SYMBOL_GPL(nfs_setlease);
const struct file_operations nfs_file_operations = {
.llseek = nfs_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = nfs_file_read,
- .aio_write = nfs_file_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = nfs_file_read,
+ .write_iter = nfs_file_write,
.mmap = nfs_file_mmap,
.open = nfs_file_open,
.flush = nfs_file_flush,
@@ -950,7 +916,7 @@ const struct file_operations nfs_file_operations = {
.lock = nfs_lock,
.flock = nfs_flock,
.splice_read = nfs_file_splice_read,
- .splice_write = nfs_file_splice_write,
+ .splice_write = iter_file_splice_write,
.check_flags = nfs_check_flags,
.setlease = nfs_setlease,
};
diff --git a/fs/nfs/filelayout/Makefile b/fs/nfs/filelayout/Makefile
new file mode 100644
index 00000000000..8516cdffb9e
--- /dev/null
+++ b/fs/nfs/filelayout/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the pNFS Files Layout Driver kernel module
+#
+obj-$(CONFIG_PNFS_FILE_LAYOUT) += nfs_layout_nfsv41_files.o
+nfs_layout_nfsv41_files-y := filelayout.o filelayoutdev.o
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/filelayout/filelayout.c
index b86464ba25e..d2eba1c13b7 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -35,11 +35,11 @@
#include <linux/sunrpc/metrics.h>
-#include "nfs4session.h"
-#include "internal.h"
-#include "delegation.h"
-#include "nfs4filelayout.h"
-#include "nfs4trace.h"
+#include "../nfs4session.h"
+#include "../internal.h"
+#include "../delegation.h"
+#include "filelayout.h"
+#include "../nfs4trace.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
@@ -84,17 +84,17 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
BUG();
}
-static void filelayout_reset_write(struct nfs_write_data *data)
+static void filelayout_reset_write(struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
struct rpc_task *task = &data->task;
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
dprintk("%s Reset task %5u for i/o through MDS "
- "(req %s/%lld, %u bytes @ offset %llu)\n", __func__,
+ "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
data->task.tk_pid,
hdr->inode->i_sb->s_id,
- (long long)NFS_FILEID(hdr->inode),
+ (unsigned long long)NFS_FILEID(hdr->inode),
data->args.count,
(unsigned long long)data->args.offset);
@@ -105,17 +105,17 @@ static void filelayout_reset_write(struct nfs_write_data *data)
}
}
-static void filelayout_reset_read(struct nfs_read_data *data)
+static void filelayout_reset_read(struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
struct rpc_task *task = &data->task;
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
dprintk("%s Reset task %5u for i/o through MDS "
- "(req %s/%lld, %u bytes @ offset %llu)\n", __func__,
+ "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
data->task.tk_pid,
hdr->inode->i_sb->s_id,
- (long long)NFS_FILEID(hdr->inode),
+ (unsigned long long)NFS_FILEID(hdr->inode),
data->args.count,
(unsigned long long)data->args.offset);
@@ -243,7 +243,7 @@ wait_on_recovery:
/* NFS_PROTO call done callback routines */
static int filelayout_read_done_cb(struct rpc_task *task,
- struct nfs_read_data *data)
+ struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
int err;
@@ -270,7 +270,7 @@ static int filelayout_read_done_cb(struct rpc_task *task,
* rfc5661 is not clear about which credential should be used.
*/
static void
-filelayout_set_layoutcommit(struct nfs_write_data *wdata)
+filelayout_set_layoutcommit(struct nfs_pgio_data *wdata)
{
struct nfs_pgio_header *hdr = wdata->header;
@@ -279,7 +279,7 @@ filelayout_set_layoutcommit(struct nfs_write_data *wdata)
return;
pnfs_set_layoutcommit(wdata);
- dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
+ dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
}
@@ -305,7 +305,7 @@ filelayout_reset_to_mds(struct pnfs_layout_segment *lseg)
*/
static void filelayout_read_prepare(struct rpc_task *task, void *data)
{
- struct nfs_read_data *rdata = data;
+ struct nfs_pgio_data *rdata = data;
if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) {
rpc_exit(task, -EIO);
@@ -317,26 +317,29 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data)
rpc_exit(task, 0);
return;
}
- rdata->read_done_cb = filelayout_read_done_cb;
+ rdata->pgio_done_cb = filelayout_read_done_cb;
if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
&rdata->args.seq_args,
&rdata->res.seq_res,
task))
return;
- nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context,
- rdata->args.lock_context, FMODE_READ);
+ if (nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context,
+ rdata->args.lock_context, FMODE_READ) == -EIO)
+ rpc_exit(task, -EIO); /* lost lock, terminate I/O */
}
static void filelayout_read_call_done(struct rpc_task *task, void *data)
{
- struct nfs_read_data *rdata = data;
+ struct nfs_pgio_data *rdata = data;
dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) &&
- task->tk_status == 0)
+ task->tk_status == 0) {
+ nfs41_sequence_done(task, &rdata->res.seq_res);
return;
+ }
/* Note this may cause RPC to be resent */
rdata->header->mds_ops->rpc_call_done(task, data);
@@ -344,14 +347,14 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data)
static void filelayout_read_count_stats(struct rpc_task *task, void *data)
{
- struct nfs_read_data *rdata = data;
+ struct nfs_pgio_data *rdata = data;
rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics);
}
static void filelayout_read_release(void *data)
{
- struct nfs_read_data *rdata = data;
+ struct nfs_pgio_data *rdata = data;
struct pnfs_layout_hdr *lo = rdata->header->lseg->pls_layout;
filelayout_fenceme(lo->plh_inode, lo);
@@ -360,7 +363,7 @@ static void filelayout_read_release(void *data)
}
static int filelayout_write_done_cb(struct rpc_task *task,
- struct nfs_write_data *data)
+ struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
int err;
@@ -416,7 +419,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
static void filelayout_write_prepare(struct rpc_task *task, void *data)
{
- struct nfs_write_data *wdata = data;
+ struct nfs_pgio_data *wdata = data;
if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) {
rpc_exit(task, -EIO);
@@ -433,17 +436,20 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data)
&wdata->res.seq_res,
task))
return;
- nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context,
- wdata->args.lock_context, FMODE_WRITE);
+ if (nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context,
+ wdata->args.lock_context, FMODE_WRITE) == -EIO)
+ rpc_exit(task, -EIO); /* lost lock, terminate I/O */
}
static void filelayout_write_call_done(struct rpc_task *task, void *data)
{
- struct nfs_write_data *wdata = data;
+ struct nfs_pgio_data *wdata = data;
if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) &&
- task->tk_status == 0)
+ task->tk_status == 0) {
+ nfs41_sequence_done(task, &wdata->res.seq_res);
return;
+ }
/* Note this may cause RPC to be resent */
wdata->header->mds_ops->rpc_call_done(task, data);
@@ -451,14 +457,14 @@ static void filelayout_write_call_done(struct rpc_task *task, void *data)
static void filelayout_write_count_stats(struct rpc_task *task, void *data)
{
- struct nfs_write_data *wdata = data;
+ struct nfs_pgio_data *wdata = data;
rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics);
}
static void filelayout_write_release(void *data)
{
- struct nfs_write_data *wdata = data;
+ struct nfs_pgio_data *wdata = data;
struct pnfs_layout_hdr *lo = wdata->header->lseg->pls_layout;
filelayout_fenceme(lo->plh_inode, lo);
@@ -523,7 +529,7 @@ static const struct rpc_call_ops filelayout_commit_call_ops = {
};
static enum pnfs_try_status
-filelayout_read_pagelist(struct nfs_read_data *data)
+filelayout_read_pagelist(struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
struct pnfs_layout_segment *lseg = hdr->lseg;
@@ -554,6 +560,7 @@ filelayout_read_pagelist(struct nfs_read_data *data)
/* No multipath support. Use first DS */
atomic_inc(&ds->ds_clp->cl_count);
data->ds_clp = ds->ds_clp;
+ data->ds_idx = idx;
fh = nfs4_fl_select_ds_fh(lseg, j);
if (fh)
data->args.fh = fh;
@@ -562,14 +569,14 @@ filelayout_read_pagelist(struct nfs_read_data *data)
data->mds_offset = offset;
/* Perform an asynchronous read to ds */
- nfs_initiate_read(ds_clnt, data,
- &filelayout_read_call_ops, RPC_TASK_SOFTCONN);
+ nfs_initiate_pgio(ds_clnt, data,
+ &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN);
return PNFS_ATTEMPTED;
}
/* Perform async writes. */
static enum pnfs_try_status
-filelayout_write_pagelist(struct nfs_write_data *data, int sync)
+filelayout_write_pagelist(struct nfs_pgio_data *data, int sync)
{
struct nfs_pgio_header *hdr = data->header;
struct pnfs_layout_segment *lseg = hdr->lseg;
@@ -594,20 +601,18 @@ filelayout_write_pagelist(struct nfs_write_data *data, int sync)
__func__, hdr->inode->i_ino, sync, (size_t) data->args.count,
offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
- data->write_done_cb = filelayout_write_done_cb;
+ data->pgio_done_cb = filelayout_write_done_cb;
atomic_inc(&ds->ds_clp->cl_count);
data->ds_clp = ds->ds_clp;
+ data->ds_idx = idx;
fh = nfs4_fl_select_ds_fh(lseg, j);
if (fh)
data->args.fh = fh;
- /*
- * Get the file offset on the dserver. Set the write offset to
- * this offset and save the original offset.
- */
+
data->args.offset = filelayout_get_dserver_offset(lseg, offset);
/* Perform an asynchronous write */
- nfs_initiate_write(ds_clnt, data,
+ nfs_initiate_pgio(ds_clnt, data,
&filelayout_write_call_ops, sync,
RPC_TASK_SOFTCONN);
return PNFS_ATTEMPTED;
@@ -631,7 +636,6 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
struct nfs4_deviceid_node *d;
struct nfs4_file_layout_dsaddr *dsaddr;
int status = -EINVAL;
- struct nfs_server *nfss = NFS_SERVER(lo->plh_inode);
dprintk("--> %s\n", __func__);
@@ -649,7 +653,7 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
goto out;
}
- if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) {
+ if (!fl->stripe_unit) {
dprintk("%s Invalid stripe unit (%u)\n",
__func__, fl->stripe_unit);
goto out;
@@ -686,12 +690,6 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
goto out_put;
}
- if (fl->stripe_unit % nfss->rsize || fl->stripe_unit % nfss->wsize) {
- dprintk("%s Stripe unit (%u) not aligned with rsize %u "
- "wsize %u\n", __func__, fl->stripe_unit, nfss->rsize,
- nfss->wsize);
- }
-
status = 0;
out:
dprintk("--> %s returns %d\n", __func__, status);
@@ -844,11 +842,15 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
{
struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
struct pnfs_commit_bucket *buckets;
- int size;
+ int size, i;
if (fl->commit_through_mds)
return 0;
- if (cinfo->ds->nbuckets != 0) {
+
+ size = (fl->stripe_type == STRIPE_SPARSE) ?
+ fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
+
+ if (cinfo->ds->nbuckets >= size) {
/* This assumes there is only one IOMODE_RW lseg. What
* we really want to do is have a layout_hdr level
* dictionary of <multipath_list4, fh> keys, each
@@ -858,30 +860,36 @@ filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
return 0;
}
- size = (fl->stripe_type == STRIPE_SPARSE) ?
- fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
-
buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
gfp_flags);
if (!buckets)
return -ENOMEM;
- else {
- int i;
+ for (i = 0; i < size; i++) {
+ INIT_LIST_HEAD(&buckets[i].written);
+ INIT_LIST_HEAD(&buckets[i].committing);
+ /* mark direct verifier as unset */
+ buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
+ }
- spin_lock(cinfo->lock);
- if (cinfo->ds->nbuckets != 0)
- kfree(buckets);
- else {
- cinfo->ds->buckets = buckets;
- cinfo->ds->nbuckets = size;
- for (i = 0; i < size; i++) {
- INIT_LIST_HEAD(&buckets[i].written);
- INIT_LIST_HEAD(&buckets[i].committing);
- }
- }
- spin_unlock(cinfo->lock);
- return 0;
+ spin_lock(cinfo->lock);
+ if (cinfo->ds->nbuckets >= size)
+ goto out;
+ for (i = 0; i < cinfo->ds->nbuckets; i++) {
+ list_splice(&cinfo->ds->buckets[i].written,
+ &buckets[i].written);
+ list_splice(&cinfo->ds->buckets[i].committing,
+ &buckets[i].committing);
+ buckets[i].direct_verf.committed =
+ cinfo->ds->buckets[i].direct_verf.committed;
+ buckets[i].wlseg = cinfo->ds->buckets[i].wlseg;
+ buckets[i].clseg = cinfo->ds->buckets[i].clseg;
}
+ swap(cinfo->ds->buckets, buckets);
+ cinfo->ds->nbuckets = size;
+out:
+ spin_unlock(cinfo->lock);
+ kfree(buckets);
+ return 0;
}
static struct pnfs_layout_segment *
@@ -909,47 +917,51 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
/*
* filelayout_pg_test(). Called by nfs_can_coalesce_requests()
*
- * return true : coalesce page
- * return false : don't coalesce page
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
*/
-static bool
+static size_t
filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
+ unsigned int size;
u64 p_stripe, r_stripe;
- u32 stripe_unit;
+ u32 stripe_offset;
+ u64 segment_offset = pgio->pg_lseg->pls_range.offset;
+ u32 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
- if (!pnfs_generic_pg_test(pgio, prev, req) ||
- !nfs_generic_pg_test(pgio, prev, req))
- return false;
+ /* calls nfs_generic_pg_test */
+ size = pnfs_generic_pg_test(pgio, prev, req);
+ if (!size)
+ return 0;
- p_stripe = (u64)req_offset(prev);
- r_stripe = (u64)req_offset(req);
- stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
+ /* see if req and prev are in the same stripe */
+ if (prev) {
+ p_stripe = (u64)req_offset(prev) - segment_offset;
+ r_stripe = (u64)req_offset(req) - segment_offset;
+ do_div(p_stripe, stripe_unit);
+ do_div(r_stripe, stripe_unit);
- do_div(p_stripe, stripe_unit);
- do_div(r_stripe, stripe_unit);
+ if (p_stripe != r_stripe)
+ return 0;
+ }
- return (p_stripe == r_stripe);
+ /* calculate remaining bytes in the current stripe */
+ div_u64_rem((u64)req_offset(req) - segment_offset,
+ stripe_unit,
+ &stripe_offset);
+ WARN_ON_ONCE(stripe_offset > stripe_unit);
+ if (stripe_offset >= stripe_unit)
+ return 0;
+ return min(stripe_unit - (unsigned int)stripe_offset, size);
}
static void
filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
- WARN_ON_ONCE(pgio->pg_lseg != NULL);
-
- if (req->wb_offset != req->wb_pgbase) {
- /*
- * Handling unaligned pages is difficult, because have to
- * somehow split a req in two in certain cases in the
- * pg.test code. Avoid this by just not using pnfs
- * in this case.
- */
- nfs_pageio_reset_read_mds(pgio);
- return;
- }
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ if (!pgio->pg_lseg)
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
@@ -967,11 +979,8 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_commit_info cinfo;
int status;
- WARN_ON_ONCE(pgio->pg_lseg != NULL);
-
- if (req->wb_offset != req->wb_pgbase)
- goto out_mds;
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ if (!pgio->pg_lseg)
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
0,
NFS4_MAX_UINT64,
@@ -1061,6 +1070,7 @@ filelayout_choose_commit_list(struct nfs_page *req,
*/
j = nfs4_fl_calc_j_index(lseg, req_offset(req));
i = select_bucket_index(fl, j);
+ spin_lock(cinfo->lock);
buckets = cinfo->ds->buckets;
list = &buckets[i].written;
if (list_empty(list)) {
@@ -1074,6 +1084,7 @@ filelayout_choose_commit_list(struct nfs_page *req,
}
set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
cinfo->ds->nwritten++;
+ spin_unlock(cinfo->lock);
return list;
}
@@ -1170,6 +1181,7 @@ transfer_commit_list(struct list_head *src, struct list_head *dst,
return ret;
}
+/* Note called with cinfo->lock held. */
static int
filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
struct nfs_commit_info *cinfo,
@@ -1214,19 +1226,22 @@ static void filelayout_recover_commit_reqs(struct list_head *dst,
struct nfs_commit_info *cinfo)
{
struct pnfs_commit_bucket *b;
+ struct pnfs_layout_segment *freeme;
int i;
- /* NOTE cinfo->lock is NOT held, relying on fact that this is
- * only called on single thread per dreq.
- * Can't take the lock because need to do pnfs_put_lseg
- */
+restart:
+ spin_lock(cinfo->lock);
for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
- pnfs_put_lseg(b->wlseg);
+ freeme = b->wlseg;
b->wlseg = NULL;
+ spin_unlock(cinfo->lock);
+ pnfs_put_lseg(freeme);
+ goto restart;
}
}
cinfo->ds->nwritten = 0;
+ spin_unlock(cinfo->lock);
}
static unsigned int
@@ -1237,6 +1252,7 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
struct nfs_commit_data *data;
int i, j;
unsigned int nreq = 0;
+ struct pnfs_layout_segment *freeme;
fl_cinfo = cinfo->ds;
bucket = fl_cinfo->buckets;
@@ -1247,8 +1263,10 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
if (!data)
break;
data->ds_commit_index = i;
+ spin_lock(cinfo->lock);
data->lseg = bucket->clseg;
bucket->clseg = NULL;
+ spin_unlock(cinfo->lock);
list_add(&data->pages, list);
nreq++;
}
@@ -1258,8 +1276,11 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
if (list_empty(&bucket->committing))
continue;
nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
- pnfs_put_lseg(bucket->clseg);
+ spin_lock(cinfo->lock);
+ freeme = bucket->clseg;
bucket->clseg = NULL;
+ spin_unlock(cinfo->lock);
+ pnfs_put_lseg(freeme);
}
/* Caller will clean up entries put on list */
return nreq;
@@ -1324,7 +1345,7 @@ filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
struct nfs4_filelayout *flo;
flo = kzalloc(sizeof(*flo), gfp_flags);
- return &flo->generic_hdr;
+ return flo != NULL ? &flo->generic_hdr : NULL;
}
static void
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/filelayout/filelayout.h
index cebd20e7e92..ffbddf2219e 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -30,7 +30,7 @@
#ifndef FS_NFS_NFS4FILELAYOUT_H
#define FS_NFS_NFS4FILELAYOUT_H
-#include "pnfs.h"
+#include "../pnfs.h"
/*
* Default data server connection timeout and retrans vaules.
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index c7c295e556e..44bf0140a4c 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -33,9 +33,9 @@
#include <linux/module.h>
#include <linux/sunrpc/addr.h>
-#include "internal.h"
-#include "nfs4session.h"
-#include "nfs4filelayout.h"
+#include "../internal.h"
+#include "../nfs4session.h"
+#include "filelayout.h"
#define NFSDBG_FACILITY NFSDBG_PNFS_LD
@@ -95,7 +95,7 @@ same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
b6 = (struct sockaddr_in6 *)addr2;
/* LINKLOCAL addresses must have matching scope_id */
- if (ipv6_addr_scope(&a6->sin6_addr) ==
+ if (ipv6_addr_src_scope(&a6->sin6_addr) ==
IPV6_ADDR_SCOPE_LINKLOCAL &&
a6->sin6_scope_id != b6->sin6_scope_id)
return false;
@@ -789,9 +789,9 @@ static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
{
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
}
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 24d1d1c5fca..3ef01f0ba0b 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -39,7 +39,7 @@ void nfs_fscache_get_client_cookie(struct nfs_client *clp)
/* create a cache index for looking up filehandles */
clp->fscache = fscache_acquire_cookie(nfs_fscache_netfs.primary_index,
&nfs_fscache_server_index_def,
- clp);
+ clp, true);
dfprintk(FSCACHE, "NFS: get client cookie (0x%p/0x%p)\n",
clp, clp->fscache);
}
@@ -139,7 +139,7 @@ void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int
/* create a cache index for looking up filehandles */
nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,
&nfs_fscache_super_index_def,
- nfss);
+ nfss, true);
dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
nfss, nfss->fscache);
return;
@@ -178,163 +178,79 @@ void nfs_fscache_release_super_cookie(struct super_block *sb)
/*
* Initialise the per-inode cache cookie pointer for an NFS inode.
*/
-void nfs_fscache_init_inode_cookie(struct inode *inode)
+void nfs_fscache_init_inode(struct inode *inode)
{
- NFS_I(inode)->fscache = NULL;
- if (S_ISREG(inode->i_mode))
- set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
-}
-
-/*
- * Get the per-inode cache cookie for an NFS inode.
- */
-static void nfs_fscache_enable_inode_cookie(struct inode *inode)
-{
- struct super_block *sb = inode->i_sb;
struct nfs_inode *nfsi = NFS_I(inode);
- if (nfsi->fscache || !NFS_FSCACHE(inode))
+ nfsi->fscache = NULL;
+ if (!S_ISREG(inode->i_mode))
return;
-
- if ((NFS_SB(sb)->options & NFS_OPTION_FSCACHE)) {
- nfsi->fscache = fscache_acquire_cookie(
- NFS_SB(sb)->fscache,
- &nfs_fscache_inode_object_def,
- nfsi);
-
- dfprintk(FSCACHE, "NFS: get FH cookie (0x%p/0x%p/0x%p)\n",
- sb, nfsi, nfsi->fscache);
- }
+ nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache,
+ &nfs_fscache_inode_object_def,
+ nfsi, false);
}
/*
* Release a per-inode cookie.
*/
-void nfs_fscache_release_inode_cookie(struct inode *inode)
+void nfs_fscache_clear_inode(struct inode *inode)
{
struct nfs_inode *nfsi = NFS_I(inode);
+ struct fscache_cookie *cookie = nfs_i_fscache(inode);
- dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n",
- nfsi, nfsi->fscache);
+ dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie);
- fscache_relinquish_cookie(nfsi->fscache, 0);
+ fscache_relinquish_cookie(cookie, false);
nfsi->fscache = NULL;
}
-/*
- * Retire a per-inode cookie, destroying the data attached to it.
- */
-void nfs_fscache_zap_inode_cookie(struct inode *inode)
+static bool nfs_fscache_can_enable(void *data)
{
- struct nfs_inode *nfsi = NFS_I(inode);
+ struct inode *inode = data;
- dfprintk(FSCACHE, "NFS: zapping cookie (0x%p/0x%p)\n",
- nfsi, nfsi->fscache);
-
- fscache_relinquish_cookie(nfsi->fscache, 1);
- nfsi->fscache = NULL;
+ return !inode_is_open_for_write(inode);
}
/*
- * Turn off the cache with regard to a per-inode cookie if opened for writing,
- * invalidating all the pages in the page cache relating to the associated
- * inode to clear the per-page caching.
- */
-static void nfs_fscache_disable_inode_cookie(struct inode *inode)
-{
- clear_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
-
- if (NFS_I(inode)->fscache) {
- dfprintk(FSCACHE,
- "NFS: nfsi 0x%p turning cache off\n", NFS_I(inode));
-
- /* Need to uncache any pages attached to this inode that
- * fscache knows about before turning off the cache.
- */
- fscache_uncache_all_inode_pages(NFS_I(inode)->fscache, inode);
- nfs_fscache_zap_inode_cookie(inode);
- }
-}
-
-/*
- * wait_on_bit() sleep function for uninterruptible waiting
- */
-static int nfs_fscache_wait_bit(void *flags)
-{
- schedule();
- return 0;
-}
-
-/*
- * Lock against someone else trying to also acquire or relinquish a cookie
- */
-static inline void nfs_fscache_inode_lock(struct inode *inode)
-{
- struct nfs_inode *nfsi = NFS_I(inode);
-
- while (test_and_set_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags))
- wait_on_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK,
- nfs_fscache_wait_bit, TASK_UNINTERRUPTIBLE);
-}
-
-/*
- * Unlock cookie management lock
- */
-static inline void nfs_fscache_inode_unlock(struct inode *inode)
-{
- struct nfs_inode *nfsi = NFS_I(inode);
-
- smp_mb__before_clear_bit();
- clear_bit(NFS_INO_FSCACHE_LOCK, &nfsi->flags);
- smp_mb__after_clear_bit();
- wake_up_bit(&nfsi->flags, NFS_INO_FSCACHE_LOCK);
-}
-
-/*
- * Decide if we should enable or disable local caching for this inode.
- * - For now, with NFS, only regular files that are open read-only will be able
- * to use the cache.
- * - May be invoked multiple times in parallel by parallel nfs_open() functions.
- */
-void nfs_fscache_set_inode_cookie(struct inode *inode, struct file *filp)
-{
- if (NFS_FSCACHE(inode)) {
- nfs_fscache_inode_lock(inode);
- if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
- nfs_fscache_disable_inode_cookie(inode);
- else
- nfs_fscache_enable_inode_cookie(inode);
- nfs_fscache_inode_unlock(inode);
- }
-}
-EXPORT_SYMBOL_GPL(nfs_fscache_set_inode_cookie);
-
-/*
- * Replace a per-inode cookie due to revalidation detecting a file having
- * changed on the server.
+ * Enable or disable caching for a file that is being opened as appropriate.
+ * The cookie is allocated when the inode is initialised, but is not enabled at
+ * that time. Enablement is deferred to file-open time to avoid stat() and
+ * access() thrashing the cache.
+ *
+ * For now, with NFS, only regular files that are open read-only will be able
+ * to use the cache.
+ *
+ * We enable the cache for an inode if we open it read-only and it isn't
+ * currently open for writing. We disable the cache if the inode is open
+ * write-only.
+ *
+ * The caller uses the file struct to pin i_writecount on the inode before
+ * calling us when a file is opened for writing, so we can make use of that.
+ *
+ * Note that this may be invoked multiple times in parallel by parallel
+ * nfs_open() functions.
*/
-void nfs_fscache_reset_inode_cookie(struct inode *inode)
+void nfs_fscache_open_file(struct inode *inode, struct file *filp)
{
struct nfs_inode *nfsi = NFS_I(inode);
- struct nfs_server *nfss = NFS_SERVER(inode);
- NFS_IFDEBUG(struct fscache_cookie *old = nfsi->fscache);
+ struct fscache_cookie *cookie = nfs_i_fscache(inode);
- nfs_fscache_inode_lock(inode);
- if (nfsi->fscache) {
- /* retire the current fscache cache and get a new one */
- fscache_relinquish_cookie(nfsi->fscache, 1);
-
- nfsi->fscache = fscache_acquire_cookie(
- nfss->nfs_client->fscache,
- &nfs_fscache_inode_object_def,
- nfsi);
+ if (!fscache_cookie_valid(cookie))
+ return;
- dfprintk(FSCACHE,
- "NFS: revalidation new cookie (0x%p/0x%p/0x%p/0x%p)\n",
- nfss, nfsi, old, nfsi->fscache);
+ if (inode_is_open_for_write(inode)) {
+ dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi);
+ clear_bit(NFS_INO_FSCACHE, &nfsi->flags);
+ fscache_disable_cookie(cookie, true);
+ fscache_uncache_all_inode_pages(cookie, inode);
+ } else {
+ dfprintk(FSCACHE, "NFS: nfsi 0x%p enabling cache\n", nfsi);
+ fscache_enable_cookie(cookie, nfs_fscache_can_enable, inode);
+ if (fscache_cookie_enabled(cookie))
+ set_bit(NFS_INO_FSCACHE, &NFS_I(inode)->flags);
}
- nfs_fscache_inode_unlock(inode);
}
+EXPORT_SYMBOL_GPL(nfs_fscache_open_file);
/*
* Release the caching state associated with a page, if the page isn't busy
@@ -344,12 +260,11 @@ void nfs_fscache_reset_inode_cookie(struct inode *inode)
int nfs_fscache_release_page(struct page *page, gfp_t gfp)
{
if (PageFsCache(page)) {
- struct nfs_inode *nfsi = NFS_I(page->mapping->host);
- struct fscache_cookie *cookie = nfsi->fscache;
+ struct fscache_cookie *cookie = nfs_i_fscache(page->mapping->host);
BUG_ON(!cookie);
dfprintk(FSCACHE, "NFS: fscache releasepage (0x%p/0x%p/0x%p)\n",
- cookie, page, nfsi);
+ cookie, page, NFS_I(page->mapping->host));
if (!fscache_maybe_release_page(cookie, page, gfp))
return 0;
@@ -367,13 +282,12 @@ int nfs_fscache_release_page(struct page *page, gfp_t gfp)
*/
void __nfs_fscache_invalidate_page(struct page *page, struct inode *inode)
{
- struct nfs_inode *nfsi = NFS_I(inode);
- struct fscache_cookie *cookie = nfsi->fscache;
+ struct fscache_cookie *cookie = nfs_i_fscache(inode);
BUG_ON(!cookie);
dfprintk(FSCACHE, "NFS: fscache invalidatepage (0x%p/0x%p/0x%p)\n",
- cookie, page, nfsi);
+ cookie, page, NFS_I(inode));
fscache_wait_on_page_write(cookie, page);
@@ -417,9 +331,9 @@ int __nfs_readpage_from_fscache(struct nfs_open_context *ctx,
dfprintk(FSCACHE,
"NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n",
- NFS_I(inode)->fscache, page, page->index, page->flags, inode);
+ nfs_i_fscache(inode), page, page->index, page->flags, inode);
- ret = fscache_read_or_alloc_page(NFS_I(inode)->fscache,
+ ret = fscache_read_or_alloc_page(nfs_i_fscache(inode),
page,
nfs_readpage_from_fscache_complete,
ctx,
@@ -459,9 +373,9 @@ int __nfs_readpages_from_fscache(struct nfs_open_context *ctx,
int ret;
dfprintk(FSCACHE, "NFS: nfs_getpages_from_fscache (0x%p/%u/0x%p)\n",
- NFS_I(inode)->fscache, npages, inode);
+ nfs_i_fscache(inode), npages, inode);
- ret = fscache_read_or_alloc_pages(NFS_I(inode)->fscache,
+ ret = fscache_read_or_alloc_pages(nfs_i_fscache(inode),
mapping, pages, nr_pages,
nfs_readpage_from_fscache_complete,
ctx,
@@ -506,15 +420,15 @@ void __nfs_readpage_to_fscache(struct inode *inode, struct page *page, int sync)
dfprintk(FSCACHE,
"NFS: readpage_to_fscache(fsc:%p/p:%p(i:%lx f:%lx)/%d)\n",
- NFS_I(inode)->fscache, page, page->index, page->flags, sync);
+ nfs_i_fscache(inode), page, page->index, page->flags, sync);
- ret = fscache_write_page(NFS_I(inode)->fscache, page, GFP_KERNEL);
+ ret = fscache_write_page(nfs_i_fscache(inode), page, GFP_KERNEL);
dfprintk(FSCACHE,
"NFS: readpage_to_fscache: p:%p(i:%lu f:%lx) ret %d\n",
page, page->index, page->flags, ret);
if (ret != 0) {
- fscache_uncache_page(NFS_I(inode)->fscache, page);
+ fscache_uncache_page(nfs_i_fscache(inode), page);
nfs_add_fscache_stats(inode,
NFSIOS_FSCACHE_PAGES_WRITTEN_FAIL, 1);
nfs_add_fscache_stats(inode, NFSIOS_FSCACHE_PAGES_UNCACHED, 1);
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 4ecb76652eb..d7fe3e799f2 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -76,11 +76,9 @@ extern void nfs_fscache_release_client_cookie(struct nfs_client *);
extern void nfs_fscache_get_super_cookie(struct super_block *, const char *, int);
extern void nfs_fscache_release_super_cookie(struct super_block *);
-extern void nfs_fscache_init_inode_cookie(struct inode *);
-extern void nfs_fscache_release_inode_cookie(struct inode *);
-extern void nfs_fscache_zap_inode_cookie(struct inode *);
-extern void nfs_fscache_set_inode_cookie(struct inode *, struct file *);
-extern void nfs_fscache_reset_inode_cookie(struct inode *);
+extern void nfs_fscache_init_inode(struct inode *);
+extern void nfs_fscache_clear_inode(struct inode *);
+extern void nfs_fscache_open_file(struct inode *, struct file *);
extern void __nfs_fscache_invalidate_page(struct page *, struct inode *);
extern int nfs_fscache_release_page(struct page *, gfp_t);
@@ -187,12 +185,10 @@ static inline void nfs_fscache_release_client_cookie(struct nfs_client *clp) {}
static inline void nfs_fscache_release_super_cookie(struct super_block *sb) {}
-static inline void nfs_fscache_init_inode_cookie(struct inode *inode) {}
-static inline void nfs_fscache_release_inode_cookie(struct inode *inode) {}
-static inline void nfs_fscache_zap_inode_cookie(struct inode *inode) {}
-static inline void nfs_fscache_set_inode_cookie(struct inode *inode,
- struct file *filp) {}
-static inline void nfs_fscache_reset_inode_cookie(struct inode *inode) {}
+static inline void nfs_fscache_init_inode(struct inode *inode) {}
+static inline void nfs_fscache_clear_inode(struct inode *inode) {}
+static inline void nfs_fscache_open_file(struct inode *inode,
+ struct file *filp) {}
static inline int nfs_fscache_release_page(struct page *page, gfp_t gfp)
{
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index 66984a9aafa..b94f80420a5 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -120,7 +120,8 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
security_d_instantiate(ret, inode);
spin_lock(&ret->d_lock);
- if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {
+ if (IS_ROOT(ret) && !ret->d_fsdata &&
+ !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {
ret->d_fsdata = name;
name = NULL;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index eda8879171c..9927913c97c 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -122,13 +122,13 @@ void nfs_clear_inode(struct inode *inode)
WARN_ON_ONCE(!list_empty(&NFS_I(inode)->open_files));
nfs_zap_acl_cache(inode);
nfs_access_zap_cache(inode);
- nfs_fscache_release_inode_cookie(inode);
+ nfs_fscache_clear_inode(inode);
}
EXPORT_SYMBOL_GPL(nfs_clear_inode);
void nfs_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
nfs_clear_inode(inode);
}
@@ -147,6 +147,17 @@ int nfs_sync_mapping(struct address_space *mapping)
return ret;
}
+static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ if (inode->i_mapping->nrpages == 0)
+ flags &= ~NFS_INO_INVALID_DATA;
+ nfsi->cache_validity |= flags;
+ if (flags & NFS_INO_INVALID_DATA)
+ nfs_fscache_invalidate(inode);
+}
+
/*
* Invalidate the local caches
*/
@@ -162,19 +173,17 @@ static void nfs_zap_caches_locked(struct inode *inode)
memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf));
if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
- nfs_fscache_invalidate(inode);
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR
- | NFS_INO_INVALID_LABEL
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_DATA
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
- | NFS_INO_REVAL_PAGECACHE;
+ | NFS_INO_REVAL_PAGECACHE);
} else
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR
- | NFS_INO_INVALID_LABEL
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
- | NFS_INO_REVAL_PAGECACHE;
+ | NFS_INO_REVAL_PAGECACHE);
+ nfs_zap_label_cache_locked(nfsi);
}
void nfs_zap_caches(struct inode *inode)
@@ -188,8 +197,7 @@ void nfs_zap_mapping(struct inode *inode, struct address_space *mapping)
{
if (mapping->nrpages != 0) {
spin_lock(&inode->i_lock);
- NFS_I(inode)->cache_validity |= NFS_INO_INVALID_DATA;
- nfs_fscache_invalidate(inode);
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
spin_unlock(&inode->i_lock);
}
}
@@ -210,7 +218,7 @@ EXPORT_SYMBOL_GPL(nfs_zap_acl_cache);
void nfs_invalidate_atime(struct inode *inode)
{
spin_lock(&inode->i_lock);
- NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME);
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL_GPL(nfs_invalidate_atime);
@@ -266,6 +274,13 @@ nfs_init_locked(struct inode *inode, void *opaque)
}
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+static void nfs_clear_label_invalid(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_LABEL;
+ spin_unlock(&inode->i_lock);
+}
+
void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
struct nfs4_label *label)
{
@@ -274,12 +289,6 @@ void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
if (label == NULL)
return;
- if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL) == 0)
- return;
-
- if (NFS_SERVER(inode)->nfs_client->cl_minorversion < 2)
- return;
-
if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) {
error = security_inode_notifysecctx(inode, label->label,
label->len);
@@ -289,6 +298,7 @@ void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
__func__,
(char *)label->label,
label->len, error);
+ nfs_clear_label_invalid(inode);
}
}
@@ -318,7 +328,7 @@ struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags)
}
EXPORT_SYMBOL_GPL(nfs4_label_alloc);
#else
-void inline nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
+void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
struct nfs4_label *label)
{
}
@@ -368,7 +378,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
inode->i_mode = fattr->mode;
if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0
&& nfs_server_capable(inode, NFS_CAP_MODE))
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
/* Why so? Because we want revalidate for devices/FIFOs, and
* that's precisely what we have in nfs_file_inode_operations.
*/
@@ -414,36 +424,36 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
inode->i_atime = fattr->atime;
else if (nfs_server_capable(inode, NFS_CAP_ATIME))
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
if (fattr->valid & NFS_ATTR_FATTR_MTIME)
inode->i_mtime = fattr->mtime;
else if (nfs_server_capable(inode, NFS_CAP_MTIME))
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
if (fattr->valid & NFS_ATTR_FATTR_CTIME)
inode->i_ctime = fattr->ctime;
else if (nfs_server_capable(inode, NFS_CAP_CTIME))
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
inode->i_version = fattr->change_attr;
else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR))
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
if (fattr->valid & NFS_ATTR_FATTR_SIZE)
inode->i_size = nfs_size_to_loff_t(fattr->size);
else
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR
- | NFS_INO_REVAL_PAGECACHE;
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR
+ | NFS_INO_REVAL_PAGECACHE);
if (fattr->valid & NFS_ATTR_FATTR_NLINK)
set_nlink(inode, fattr->nlink);
else if (nfs_server_capable(inode, NFS_CAP_NLINK))
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
if (fattr->valid & NFS_ATTR_FATTR_OWNER)
inode->i_uid = fattr->uid;
else if (nfs_server_capable(inode, NFS_CAP_OWNER))
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
if (fattr->valid & NFS_ATTR_FATTR_GROUP)
inode->i_gid = fattr->gid;
else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP))
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR);
if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
inode->i_blocks = fattr->du.nfs2.blocks;
if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
@@ -459,14 +469,14 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
nfsi->attrtimeo_timestamp = now;
nfsi->access_cache = RB_ROOT;
- nfs_fscache_init_inode_cookie(inode);
+ nfs_fscache_init_inode(inode);
unlock_new_inode(inode);
} else
nfs_refresh_inode(inode, fattr);
- dprintk("NFS: nfs_fhget(%s/%Ld fh_crc=0x%08x ct=%d)\n",
+ dprintk("NFS: nfs_fhget(%s/%Lu fh_crc=0x%08x ct=%d)\n",
inode->i_sb->s_id,
- (long long)NFS_FILEID(inode),
+ (unsigned long long)NFS_FILEID(inode),
nfs_display_fhandle_hash(fh),
atomic_read(&inode->i_count));
@@ -549,6 +559,9 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)
spin_lock(&inode->i_lock);
i_size_write(inode, offset);
+ /* Optimisation */
+ if (offset == 0)
+ NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_DATA;
spin_unlock(&inode->i_lock);
truncate_pagecache(inode, offset);
@@ -577,7 +590,8 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
inode->i_uid = attr->ia_uid;
if ((attr->ia_valid & ATTR_GID) != 0)
inode->i_gid = attr->ia_gid;
- NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL);
spin_unlock(&inode->i_lock);
}
if ((attr->ia_valid & ATTR_SIZE) != 0) {
@@ -587,6 +601,25 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
}
EXPORT_SYMBOL_GPL(nfs_setattr_update_inode);
+static void nfs_request_parent_use_readdirplus(struct dentry *dentry)
+{
+ struct dentry *parent;
+
+ parent = dget_parent(dentry);
+ nfs_force_use_readdirplus(parent->d_inode);
+ dput(parent);
+}
+
+static bool nfs_need_revalidate_inode(struct inode *inode)
+{
+ if (NFS_I(inode)->cache_validity &
+ (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
+ return true;
+ if (nfs_attribute_cache_expired(inode))
+ return true;
+ return false;
+}
+
int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
@@ -615,10 +648,13 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)))
need_atime = 0;
- if (need_atime)
- err = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
- else
- err = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ if (need_atime || nfs_need_revalidate_inode(inode)) {
+ struct nfs_server *server = NFS_SERVER(inode);
+
+ if (server->caps & NFS_CAP_READDIRPLUS)
+ nfs_request_parent_use_readdirplus(dentry);
+ err = __nfs_revalidate_inode(server, inode);
+ }
if (!err) {
generic_fillattr(inode, stat);
stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode));
@@ -854,7 +890,7 @@ int nfs_open(struct inode *inode, struct file *filp)
return PTR_ERR(ctx);
nfs_file_set_open_context(filp, ctx);
put_nfs_open_context(ctx);
- nfs_fscache_set_inode_cookie(inode, filp);
+ nfs_fscache_open_file(inode, filp);
return 0;
}
@@ -876,8 +912,8 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
struct nfs_fattr *fattr = NULL;
struct nfs_inode *nfsi = NFS_I(inode);
- dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
- inode->i_sb->s_id, (long long)NFS_FILEID(inode));
+ dfprintk(PAGECACHE, "NFS: revalidating (%s/%Lu)\n",
+ inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode));
trace_nfs_revalidate_inode_enter(inode);
@@ -901,9 +937,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label);
if (status != 0) {
- dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n",
+ dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n",
inode->i_sb->s_id,
- (long long)NFS_FILEID(inode), status);
+ (unsigned long long)NFS_FILEID(inode), status);
if (status == -ESTALE) {
nfs_zap_caches(inode);
if (!S_ISDIR(inode->i_mode))
@@ -914,18 +950,20 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
status = nfs_refresh_inode(inode, fattr);
if (status) {
- dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n",
+ dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) refresh failed, error=%d\n",
inode->i_sb->s_id,
- (long long)NFS_FILEID(inode), status);
+ (unsigned long long)NFS_FILEID(inode), status);
goto err_out;
}
if (nfsi->cache_validity & NFS_INO_INVALID_ACL)
nfs_zap_acl_cache(inode);
- dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n",
+ nfs_setsecurity(inode, fattr, label);
+
+ dfprintk(PAGECACHE, "NFS: (%s/%Lu) revalidation complete\n",
inode->i_sb->s_id,
- (long long)NFS_FILEID(inode));
+ (unsigned long long)NFS_FILEID(inode));
err_out:
nfs4_label_free(label);
@@ -958,9 +996,7 @@ int nfs_attribute_cache_expired(struct inode *inode)
*/
int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
{
- if (!(NFS_I(inode)->cache_validity &
- (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
- && !nfs_attribute_cache_expired(inode))
+ if (!nfs_need_revalidate_inode(inode))
return NFS_STALE(inode) ? -ESTALE : 0;
return __nfs_revalidate_inode(server, inode);
}
@@ -981,16 +1017,17 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
if (ret < 0)
return ret;
}
- spin_lock(&inode->i_lock);
- nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
- if (S_ISDIR(inode->i_mode))
+ if (S_ISDIR(inode->i_mode)) {
+ spin_lock(&inode->i_lock);
memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
- spin_unlock(&inode->i_lock);
+ spin_unlock(&inode->i_lock);
+ }
nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE);
nfs_fscache_wait_on_invalidate(inode);
- dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
- inode->i_sb->s_id, (long long)NFS_FILEID(inode));
+ dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n",
+ inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(inode));
return 0;
}
@@ -1011,6 +1048,7 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
{
struct nfs_inode *nfsi = NFS_I(inode);
+ unsigned long *bitlock = &nfsi->flags;
int ret = 0;
/* swapfiles are not supposed to be shared. */
@@ -1022,12 +1060,46 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
if (ret < 0)
goto out;
}
- if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
- trace_nfs_invalidate_mapping_enter(inode);
- ret = nfs_invalidate_mapping(inode, mapping);
- trace_nfs_invalidate_mapping_exit(inode, ret);
+
+ /*
+ * We must clear NFS_INO_INVALID_DATA first to ensure that
+ * invalidations that come in while we're shooting down the mappings
+ * are respected. But, that leaves a race window where one revalidator
+ * can clear the flag, and then another checks it before the mapping
+ * gets invalidated. Fix that by serializing access to this part of
+ * the function.
+ *
+ * At the same time, we need to allow other tasks to see whether we
+ * might be in the middle of invalidating the pages, so we only set
+ * the bit lock here if it looks like we're going to be doing that.
+ */
+ for (;;) {
+ ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING,
+ nfs_wait_bit_killable, TASK_KILLABLE);
+ if (ret)
+ goto out;
+ spin_lock(&inode->i_lock);
+ if (test_bit(NFS_INO_INVALIDATING, bitlock)) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+ if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+ break;
+ spin_unlock(&inode->i_lock);
+ goto out;
}
+ set_bit(NFS_INO_INVALIDATING, bitlock);
+ smp_wmb();
+ nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
+ spin_unlock(&inode->i_lock);
+ trace_nfs_invalidate_mapping_enter(inode);
+ ret = nfs_invalidate_mapping(inode, mapping);
+ trace_nfs_invalidate_mapping_exit(inode, ret);
+
+ clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
+ smp_mb__after_atomic();
+ wake_up_bit(bitlock, NFS_INO_INVALIDATING);
out:
return ret;
}
@@ -1042,7 +1114,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
&& inode->i_version == fattr->pre_change_attr) {
inode->i_version = fattr->change_attr;
if (S_ISDIR(inode->i_mode))
- nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
ret |= NFS_INO_INVALID_ATTR;
}
/* If we have atomic WCC data, we may update some attributes */
@@ -1058,7 +1130,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
&& timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
if (S_ISDIR(inode->i_mode))
- nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+ nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA);
ret |= NFS_INO_INVALID_ATTR;
}
if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
@@ -1069,9 +1141,6 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr
ret |= NFS_INO_INVALID_ATTR;
}
- if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
- nfs_fscache_invalidate(inode);
-
return ret;
}
@@ -1130,7 +1199,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
invalid |= NFS_INO_INVALID_ATIME;
if (invalid != 0)
- nfsi->cache_validity |= invalid;
+ nfs_set_cache_invalid(inode, invalid);
nfsi->read_cache_jiffies = fattr->time_start;
return 0;
@@ -1209,6 +1278,7 @@ u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh)
* not on the result */
return nfs_fhandle_hash(fh);
}
+EXPORT_SYMBOL_GPL(_nfs_display_fhandle_hash);
/*
* _nfs_display_fhandle - display an NFS file handle on the console
@@ -1253,6 +1323,7 @@ void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption)
}
}
}
+EXPORT_SYMBOL_GPL(_nfs_display_fhandle);
#endif
/**
@@ -1284,12 +1355,28 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
}
+/*
+ * Don't trust the change_attribute, mtime, ctime or size if
+ * a pnfs LAYOUTCOMMIT is outstanding
+ */
+static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode,
+ struct nfs_fattr *fattr)
+{
+ if (pnfs_layoutcommit_outstanding(inode))
+ fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE |
+ NFS_ATTR_FATTR_MTIME |
+ NFS_ATTR_FATTR_CTIME |
+ NFS_ATTR_FATTR_SIZE);
+}
+
static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
{
int ret;
trace_nfs_refresh_inode_enter(inode);
+ nfs_inode_attrs_handle_layoutcommit(inode, fattr);
+
if (nfs_inode_attrs_need_update(inode, fattr))
ret = nfs_update_inode(inode, fattr);
else
@@ -1325,13 +1412,11 @@ EXPORT_SYMBOL_GPL(nfs_refresh_inode);
static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
{
- struct nfs_inode *nfsi = NFS_I(inode);
+ unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
- nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
- if (S_ISDIR(inode->i_mode)) {
- nfsi->cache_validity |= NFS_INO_INVALID_DATA;
- nfs_fscache_invalidate(inode);
- }
+ if (S_ISDIR(inode->i_mode))
+ invalid |= NFS_INO_INVALID_DATA;
+ nfs_set_cache_invalid(inode, invalid);
if ((fattr->valid & NFS_ATTR_FATTR) == 0)
return 0;
return nfs_refresh_inode_locked(inode, fattr);
@@ -1436,7 +1521,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
unsigned long now = jiffies;
unsigned long save_cache_validity;
- dfprintk(VFS, "NFS: %s(%s/%ld fh_crc=0x%08x ct=%d info=0x%x)\n",
+ dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",
__func__, inode->i_sb->s_id, inode->i_ino,
nfs_display_fhandle_hash(NFS_FH(inode)),
atomic_read(&inode->i_count), fattr->valid);
@@ -1457,7 +1542,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/*
* Big trouble! The inode has become a different object.
*/
- printk(KERN_DEBUG "NFS: %s: inode %ld mode changed, %07o to %07o\n",
+ printk(KERN_DEBUG "NFS: %s: inode %lu mode changed, %07o to %07o\n",
__func__, inode->i_ino, inode->i_mode, fattr->mode);
goto out_err;
}
@@ -1498,18 +1583,20 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode->i_version = fattr->change_attr;
}
} else if (server->caps & NFS_CAP_CHANGE_ATTR)
- invalid |= save_cache_validity;
+ nfsi->cache_validity |= save_cache_validity;
if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
} else if (server->caps & NFS_CAP_MTIME)
- invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ nfsi->cache_validity |= save_cache_validity &
+ (NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_FORCED);
if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
} else if (server->caps & NFS_CAP_CTIME)
- invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ nfsi->cache_validity |= save_cache_validity &
+ (NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_FORCED);
/* Check if our cached file size is stale */
@@ -1519,10 +1606,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
if (new_isize != cur_isize) {
/* Do we perhaps have any outstanding writes, or has
* the file grown beyond our last write? */
- if ((nfsi->npages == 0 && !test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) ||
- new_isize > cur_isize) {
+ if ((nfsi->npages == 0) || new_isize > cur_isize) {
i_size_write(inode, new_isize);
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+ invalid &= ~NFS_INO_REVAL_PAGECACHE;
}
dprintk("NFS: isize change on server for file %s/%ld "
"(%Ld to %Ld)\n",
@@ -1532,7 +1619,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
(long long)new_isize);
}
} else
- invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ nfsi->cache_validity |= save_cache_validity &
+ (NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_PAGECACHE
| NFS_INO_REVAL_FORCED);
@@ -1540,7 +1628,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
if (fattr->valid & NFS_ATTR_FATTR_ATIME)
memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
else if (server->caps & NFS_CAP_ATIME)
- invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME
+ nfsi->cache_validity |= save_cache_validity &
+ (NFS_INO_INVALID_ATIME
| NFS_INO_REVAL_FORCED);
if (fattr->valid & NFS_ATTR_FATTR_MODE) {
@@ -1551,7 +1640,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
}
} else if (server->caps & NFS_CAP_MODE)
- invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ nfsi->cache_validity |= save_cache_validity &
+ (NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
| NFS_INO_REVAL_FORCED);
@@ -1562,7 +1652,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode->i_uid = fattr->uid;
}
} else if (server->caps & NFS_CAP_OWNER)
- invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ nfsi->cache_validity |= save_cache_validity &
+ (NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
| NFS_INO_REVAL_FORCED);
@@ -1573,7 +1664,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode->i_gid = fattr->gid;
}
} else if (server->caps & NFS_CAP_OWNER_GROUP)
- invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ nfsi->cache_validity |= save_cache_validity &
+ (NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ACCESS
| NFS_INO_INVALID_ACL
| NFS_INO_REVAL_FORCED);
@@ -1586,7 +1678,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
set_nlink(inode, fattr->nlink);
}
} else if (server->caps & NFS_CAP_NLINK)
- invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
+ nfsi->cache_validity |= save_cache_validity &
+ (NFS_INO_INVALID_ATTR
| NFS_INO_REVAL_FORCED);
if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
@@ -1599,7 +1692,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode->i_blocks = fattr->du.nfs2.blocks;
/* Update attrtimeo value if we're out of the unstable period */
- if (invalid & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) {
+ if (invalid & NFS_INO_INVALID_ATTR) {
nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE);
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = now;
@@ -1612,17 +1705,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
}
}
invalid &= ~NFS_INO_INVALID_ATTR;
- invalid &= ~NFS_INO_INVALID_LABEL;
/* Don't invalidate the data if we were to blame */
if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
|| S_ISLNK(inode->i_mode)))
invalid &= ~NFS_INO_INVALID_DATA;
if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) ||
(save_cache_validity & NFS_INO_REVAL_FORCED))
- nfsi->cache_validity |= invalid;
-
- if (invalid & NFS_INO_INVALID_DATA)
- nfs_fscache_invalidate(inode);
+ nfs_set_cache_invalid(inode, invalid);
return 0;
out_err:
@@ -1643,10 +1732,6 @@ struct inode *nfs_alloc_inode(struct super_block *sb)
return NULL;
nfsi->flags = 0UL;
nfsi->cache_validity = 0UL;
-#ifdef CONFIG_NFS_V3_ACL
- nfsi->acl_access = ERR_PTR(-EAGAIN);
- nfsi->acl_default = ERR_PTR(-EAGAIN);
-#endif
#if IS_ENABLED(CONFIG_NFS_V4)
nfsi->nfs4_acl = NULL;
#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 38da8c2b81a..f415cbf9f6c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -88,8 +88,8 @@ struct nfs_parsed_mount_data {
unsigned int namlen;
unsigned int options;
unsigned int bsize;
- unsigned int auth_flavor_len;
- rpc_authflavor_t auth_flavors[1];
+ struct nfs_auth_info auth_info;
+ rpc_authflavor_t selected_flavor;
char *client_address;
unsigned int version;
unsigned int minorversion;
@@ -154,6 +154,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *,
rpc_authflavor_t);
int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
void nfs_server_insert_lists(struct nfs_server *);
+void nfs_server_remove_lists(struct nfs_server *);
void nfs_init_timeout_values(struct rpc_timeout *, int, unsigned int, unsigned int);
int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t,
rpc_authflavor_t);
@@ -174,6 +175,9 @@ extern struct nfs_server *nfs4_create_server(
struct nfs_subversion *);
extern struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *,
struct nfs_fh *);
+extern int nfs4_update_server(struct nfs_server *server, const char *hostname,
+ struct sockaddr *sap, size_t salen,
+ struct net *net);
extern void nfs_free_server(struct nfs_server *server);
extern struct nfs_server *nfs_clone_server(struct nfs_server *,
struct nfs_fh *,
@@ -227,13 +231,21 @@ extern void nfs_destroy_writepagecache(void);
extern int __init nfs_init_directcache(void);
extern void nfs_destroy_directcache(void);
-extern bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount);
extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr,
void (*release)(struct nfs_pgio_header *hdr));
void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
int nfs_iocounter_wait(struct nfs_io_counter *c);
+extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
+struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *);
+void nfs_rw_header_free(struct nfs_pgio_header *);
+void nfs_pgio_data_release(struct nfs_pgio_data *);
+int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
+int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *,
+ const struct rpc_call_ops *, int, int);
+void nfs_free_request(struct nfs_page *req);
+
static inline void nfs_iocounter_init(struct nfs_io_counter *c)
{
c->flags = 0;
@@ -266,6 +278,30 @@ extern const u32 nfs41_maxgetdevinfo_overhead;
extern struct rpc_procinfo nfs4_procedures[];
#endif
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags);
+static inline void nfs4_label_free(struct nfs4_label *label)
+{
+ if (label) {
+ kfree(label->label);
+ kfree(label);
+ }
+ return;
+}
+
+static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
+{
+ if (nfs_server_capable(&nfsi->vfs_inode, NFS_CAP_SECURITY_LABEL))
+ nfsi->cache_validity |= NFS_INO_INVALID_LABEL;
+}
+#else
+static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; }
+static inline void nfs4_label_free(void *label) {}
+static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi)
+{
+}
+#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
+
/* proc.c */
void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
@@ -273,6 +309,7 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
const char *ip_addr);
/* dir.c */
+extern void nfs_force_use_readdirplus(struct inode *dir);
extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
struct shrink_control *sc);
extern unsigned long nfs_access_cache_scan(struct shrinker *shrink,
@@ -291,16 +328,14 @@ int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *)
int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int);
loff_t nfs_file_llseek(struct file *, loff_t, int);
int nfs_file_flush(struct file *, fl_owner_t);
-ssize_t nfs_file_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
+ssize_t nfs_file_read(struct kiocb *, struct iov_iter *);
ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,
size_t, unsigned int);
int nfs_file_mmap(struct file *, struct vm_area_struct *);
-ssize_t nfs_file_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
+ssize_t nfs_file_write(struct kiocb *, struct iov_iter *);
int nfs_file_release(struct inode *, struct file *);
int nfs_lock(struct file *, int, struct file_lock *);
int nfs_flock(struct file *, int, struct file_lock *);
-ssize_t nfs_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *,
- size_t, unsigned int);
int nfs_check_flags(int);
int nfs_setlease(struct file *, long, struct file_lock **);
@@ -323,6 +358,7 @@ extern struct file_system_type nfs_xdev_fs_type;
extern struct file_system_type nfs4_xdev_fs_type;
extern struct file_system_type nfs4_referral_fs_type;
#endif
+bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t);
struct dentry *nfs_try_mount(int, const char *, struct nfs_mount_info *,
struct nfs_subversion *);
void nfs_initialise_sb(struct super_block *);
@@ -365,19 +401,11 @@ extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool
struct nfs_pgio_completion_ops;
/* read.c */
-extern struct nfs_read_header *nfs_readhdr_alloc(void);
-extern void nfs_readhdr_free(struct nfs_pgio_header *hdr);
extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
- struct inode *inode,
+ struct inode *inode, bool force_mds,
const struct nfs_pgio_completion_ops *compl_ops);
-extern int nfs_initiate_read(struct rpc_clnt *clnt,
- struct nfs_read_data *data,
- const struct rpc_call_ops *call_ops, int flags);
extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
-extern int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr);
extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio);
-extern void nfs_readdata_release(struct nfs_read_data *rdata);
/* super.c */
void nfs_clone_super(struct super_block *, struct nfs_mount_info *);
@@ -392,19 +420,10 @@ int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
/* write.c */
extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
- struct inode *inode, int ioflags,
+ struct inode *inode, int ioflags, bool force_mds,
const struct nfs_pgio_completion_ops *compl_ops);
-extern struct nfs_write_header *nfs_writehdr_alloc(void);
-extern void nfs_writehdr_free(struct nfs_pgio_header *hdr);
-extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr);
extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
-extern void nfs_writedata_release(struct nfs_write_data *wdata);
extern void nfs_commit_free(struct nfs_commit_data *p);
-extern int nfs_initiate_write(struct rpc_clnt *clnt,
- struct nfs_write_data *data,
- const struct rpc_call_ops *call_ops,
- int how, int flags);
extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);
extern int nfs_initiate_commit(struct rpc_clnt *clnt,
@@ -417,6 +436,7 @@ extern void nfs_init_commit(struct nfs_commit_data *data,
struct nfs_commit_info *cinfo);
int nfs_scan_commit_list(struct list_head *src, struct list_head *dst,
struct nfs_commit_info *cinfo, int max);
+unsigned long nfs_reqs_to_commit(struct nfs_commit_info *);
int nfs_scan_commit(struct inode *inode, struct list_head *dst,
struct nfs_commit_info *cinfo);
void nfs_mark_request_commit(struct nfs_page *req,
@@ -445,6 +465,13 @@ extern int nfs_migrate_page(struct address_space *,
#define nfs_migrate_page NULL
#endif
+/* unlink.c */
+extern struct rpc_task *
+nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
+ struct dentry *old_dentry, struct dentry *new_dentry,
+ void (*complete)(struct rpc_task *, struct nfs_renamedata *));
+extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry);
+
/* direct.c */
void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
struct nfs_direct_req *dreq);
@@ -455,7 +482,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
/* nfs4proc.c */
-extern void __nfs4_read_done_cb(struct nfs_read_data *);
+extern void __nfs4_read_done_cb(struct nfs_pgio_data *);
extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms,
const char *ip_addr);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 62db136339e..5f61b83f4a1 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -103,7 +103,7 @@ static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
/*
* typedef opaque nfsdata<>;
*/
-static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_readres *result)
+static int decode_nfsdata(struct xdr_stream *xdr, struct nfs_pgio_res *result)
{
u32 recvd, count;
__be32 *p;
@@ -613,7 +613,7 @@ static void nfs2_xdr_enc_readlinkargs(struct rpc_rqst *req,
* };
*/
static void encode_readargs(struct xdr_stream *xdr,
- const struct nfs_readargs *args)
+ const struct nfs_pgio_args *args)
{
u32 offset = args->offset;
u32 count = args->count;
@@ -629,7 +629,7 @@ static void encode_readargs(struct xdr_stream *xdr,
static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
struct xdr_stream *xdr,
- const struct nfs_readargs *args)
+ const struct nfs_pgio_args *args)
{
encode_readargs(xdr, args);
prepare_reply_buffer(req, args->pages, args->pgbase,
@@ -649,7 +649,7 @@ static void nfs2_xdr_enc_readargs(struct rpc_rqst *req,
* };
*/
static void encode_writeargs(struct xdr_stream *xdr,
- const struct nfs_writeargs *args)
+ const struct nfs_pgio_args *args)
{
u32 offset = args->offset;
u32 count = args->count;
@@ -669,7 +669,7 @@ static void encode_writeargs(struct xdr_stream *xdr,
static void nfs2_xdr_enc_writeargs(struct rpc_rqst *req,
struct xdr_stream *xdr,
- const struct nfs_writeargs *args)
+ const struct nfs_pgio_args *args)
{
encode_writeargs(xdr, args);
xdr->buf->flags |= XDRBUF_WRITE;
@@ -857,7 +857,7 @@ out_default:
* };
*/
static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr,
- struct nfs_readres *result)
+ struct nfs_pgio_res *result)
{
enum nfs_stat status;
int error;
@@ -878,7 +878,7 @@ out_default:
}
static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr,
- struct nfs_writeres *result)
+ struct nfs_pgio_res *result)
{
/* All NFSv2 writes are "file sync" writes */
result->verf->committed = NFS_FILE_SYNC;
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 4a1aafba6a2..8f854dde415 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -10,179 +10,7 @@
#define NFSDBG_FACILITY NFSDBG_PROC
-ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size)
-{
- struct inode *inode = dentry->d_inode;
- struct posix_acl *acl;
- int pos=0, len=0;
-
-# define output(s) do { \
- if (pos + sizeof(s) <= size) { \
- memcpy(buffer + pos, s, sizeof(s)); \
- pos += sizeof(s); \
- } \
- len += sizeof(s); \
- } while(0)
-
- acl = nfs3_proc_getacl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl) {
- output("system.posix_acl_access");
- posix_acl_release(acl);
- }
-
- if (S_ISDIR(inode->i_mode)) {
- acl = nfs3_proc_getacl(inode, ACL_TYPE_DEFAULT);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl) {
- output("system.posix_acl_default");
- posix_acl_release(acl);
- }
- }
-
-# undef output
-
- if (!buffer || len <= size)
- return len;
- return -ERANGE;
-}
-
-ssize_t nfs3_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size)
-{
- struct inode *inode = dentry->d_inode;
- struct posix_acl *acl;
- int type, error = 0;
-
- if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0)
- type = ACL_TYPE_ACCESS;
- else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0)
- type = ACL_TYPE_DEFAULT;
- else
- return -EOPNOTSUPP;
-
- acl = nfs3_proc_getacl(inode, type);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- else if (acl) {
- if (type == ACL_TYPE_ACCESS && acl->a_count == 0)
- error = -ENODATA;
- else
- error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- posix_acl_release(acl);
- } else
- error = -ENODATA;
-
- return error;
-}
-
-int nfs3_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
-{
- struct inode *inode = dentry->d_inode;
- struct posix_acl *acl;
- int type, error;
-
- if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0)
- type = ACL_TYPE_ACCESS;
- else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0)
- type = ACL_TYPE_DEFAULT;
- else
- return -EOPNOTSUPP;
-
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- error = nfs3_proc_setacl(inode, type, acl);
- posix_acl_release(acl);
-
- return error;
-}
-
-int nfs3_removexattr(struct dentry *dentry, const char *name)
-{
- struct inode *inode = dentry->d_inode;
- int type;
-
- if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0)
- type = ACL_TYPE_ACCESS;
- else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0)
- type = ACL_TYPE_DEFAULT;
- else
- return -EOPNOTSUPP;
-
- return nfs3_proc_setacl(inode, type, NULL);
-}
-
-static void __nfs3_forget_cached_acls(struct nfs_inode *nfsi)
-{
- if (!IS_ERR(nfsi->acl_access)) {
- posix_acl_release(nfsi->acl_access);
- nfsi->acl_access = ERR_PTR(-EAGAIN);
- }
- if (!IS_ERR(nfsi->acl_default)) {
- posix_acl_release(nfsi->acl_default);
- nfsi->acl_default = ERR_PTR(-EAGAIN);
- }
-}
-
-void nfs3_forget_cached_acls(struct inode *inode)
-{
- dprintk("NFS: nfs3_forget_cached_acls(%s/%ld)\n", inode->i_sb->s_id,
- inode->i_ino);
- spin_lock(&inode->i_lock);
- __nfs3_forget_cached_acls(NFS_I(inode));
- spin_unlock(&inode->i_lock);
-}
-
-static struct posix_acl *nfs3_get_cached_acl(struct inode *inode, int type)
-{
- struct nfs_inode *nfsi = NFS_I(inode);
- struct posix_acl *acl = ERR_PTR(-EINVAL);
-
- spin_lock(&inode->i_lock);
- switch(type) {
- case ACL_TYPE_ACCESS:
- acl = nfsi->acl_access;
- break;
-
- case ACL_TYPE_DEFAULT:
- acl = nfsi->acl_default;
- break;
-
- default:
- goto out;
- }
- if (IS_ERR(acl))
- acl = ERR_PTR(-EAGAIN);
- else
- acl = posix_acl_dup(acl);
-out:
- spin_unlock(&inode->i_lock);
- dprintk("NFS: nfs3_get_cached_acl(%s/%ld, %d) = %p\n", inode->i_sb->s_id,
- inode->i_ino, type, acl);
- return acl;
-}
-
-static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl,
- struct posix_acl *dfacl)
-{
- struct nfs_inode *nfsi = NFS_I(inode);
-
- dprintk("nfs3_cache_acls(%s/%ld, %p, %p)\n", inode->i_sb->s_id,
- inode->i_ino, acl, dfacl);
- spin_lock(&inode->i_lock);
- __nfs3_forget_cached_acls(NFS_I(inode));
- if (!IS_ERR(acl))
- nfsi->acl_access = posix_acl_dup(acl);
- if (!IS_ERR(dfacl))
- nfsi->acl_default = posix_acl_dup(dfacl);
- spin_unlock(&inode->i_lock);
-}
-
-struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
+struct posix_acl *nfs3_get_acl(struct inode *inode, int type)
{
struct nfs_server *server = NFS_SERVER(inode);
struct page *pages[NFSACL_MAXPAGES] = { };
@@ -198,7 +26,6 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
.rpc_argp = &args,
.rpc_resp = &res,
};
- struct posix_acl *acl;
int status, count;
if (!nfs_server_capable(inode, NFS_CAP_ACLS))
@@ -207,10 +34,6 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
status = nfs_revalidate_inode(server, inode);
if (status < 0)
return ERR_PTR(status);
- acl = nfs3_get_cached_acl(inode, type);
- if (acl != ERR_PTR(-EAGAIN))
- return acl;
- acl = NULL;
/*
* Only get the access acl when explicitly requested: We don't
@@ -257,40 +80,41 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
}
if (res.acl_access != NULL) {
- if (posix_acl_equiv_mode(res.acl_access, NULL) == 0) {
+ if ((posix_acl_equiv_mode(res.acl_access, NULL) == 0) ||
+ res.acl_access->a_count == 0) {
posix_acl_release(res.acl_access);
res.acl_access = NULL;
}
}
- nfs3_cache_acls(inode,
- (res.mask & NFS_ACL) ? res.acl_access : ERR_PTR(-EINVAL),
- (res.mask & NFS_DFACL) ? res.acl_default : ERR_PTR(-EINVAL));
- switch(type) {
- case ACL_TYPE_ACCESS:
- acl = res.acl_access;
- res.acl_access = NULL;
- break;
+ if (res.mask & NFS_ACL)
+ set_cached_acl(inode, ACL_TYPE_ACCESS, res.acl_access);
+ else
+ forget_cached_acl(inode, ACL_TYPE_ACCESS);
- case ACL_TYPE_DEFAULT:
- acl = res.acl_default;
- res.acl_default = NULL;
+ if (res.mask & NFS_DFACL)
+ set_cached_acl(inode, ACL_TYPE_DEFAULT, res.acl_default);
+ else
+ forget_cached_acl(inode, ACL_TYPE_DEFAULT);
+
+ nfs_free_fattr(res.fattr);
+ if (type == ACL_TYPE_ACCESS) {
+ posix_acl_release(res.acl_default);
+ return res.acl_access;
+ } else {
+ posix_acl_release(res.acl_access);
+ return res.acl_default;
}
getout:
posix_acl_release(res.acl_access);
posix_acl_release(res.acl_default);
nfs_free_fattr(res.fattr);
-
- if (status != 0) {
- posix_acl_release(acl);
- acl = ERR_PTR(status);
- }
- return acl;
+ return ERR_PTR(status);
}
-static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
- struct posix_acl *dfacl)
+static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+ struct posix_acl *dfacl)
{
struct nfs_server *server = NFS_SERVER(inode);
struct nfs_fattr *fattr;
@@ -353,7 +177,8 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
switch (status) {
case 0:
status = nfs_refresh_inode(inode, fattr);
- nfs3_cache_acls(inode, acl, dfacl);
+ set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
+ set_cached_acl(inode, ACL_TYPE_DEFAULT, dfacl);
break;
case -EPFNOSUPPORT:
case -EPROTONOSUPPORT:
@@ -373,40 +198,43 @@ out:
return status;
}
-int nfs3_proc_setacl(struct inode *inode, int type, struct posix_acl *acl)
+int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+ struct posix_acl *dfacl)
+{
+ int ret;
+ ret = __nfs3_proc_setacls(inode, acl, dfacl);
+ return (ret == -EOPNOTSUPP) ? 0 : ret;
+
+}
+
+int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
struct posix_acl *alloc = NULL, *dfacl = NULL;
int status;
if (S_ISDIR(inode->i_mode)) {
switch(type) {
- case ACL_TYPE_ACCESS:
- alloc = dfacl = nfs3_proc_getacl(inode,
- ACL_TYPE_DEFAULT);
- if (IS_ERR(alloc))
- goto fail;
- break;
-
- case ACL_TYPE_DEFAULT:
- dfacl = acl;
- alloc = acl = nfs3_proc_getacl(inode,
- ACL_TYPE_ACCESS);
- if (IS_ERR(alloc))
- goto fail;
- break;
-
- default:
- return -EINVAL;
+ case ACL_TYPE_ACCESS:
+ alloc = dfacl = get_acl(inode, ACL_TYPE_DEFAULT);
+ if (IS_ERR(alloc))
+ goto fail;
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ dfacl = acl;
+ alloc = acl = get_acl(inode, ACL_TYPE_ACCESS);
+ if (IS_ERR(alloc))
+ goto fail;
+ break;
}
- } else if (type != ACL_TYPE_ACCESS)
- return -EINVAL;
+ }
if (acl == NULL) {
alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
if (IS_ERR(alloc))
goto fail;
}
- status = nfs3_proc_setacls(inode, acl, dfacl);
+ status = __nfs3_proc_setacls(inode, acl, dfacl);
posix_acl_release(alloc);
return status;
@@ -414,27 +242,51 @@ fail:
return PTR_ERR(alloc);
}
-int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode,
- umode_t mode)
+const struct xattr_handler *nfs3_xattr_handlers[] = {
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+ NULL,
+};
+
+static int
+nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data,
+ size_t size, ssize_t *result)
{
- struct posix_acl *dfacl, *acl;
- int error = 0;
+ struct posix_acl *acl;
+ char *p = data + *result;
- dfacl = nfs3_proc_getacl(dir, ACL_TYPE_DEFAULT);
- if (IS_ERR(dfacl)) {
- error = PTR_ERR(dfacl);
- return (error == -EOPNOTSUPP) ? 0 : error;
- }
- if (!dfacl)
+ acl = get_acl(inode, type);
+ if (!acl)
return 0;
- acl = posix_acl_dup(dfacl);
- error = posix_acl_create(&acl, GFP_KERNEL, &mode);
- if (error < 0)
- goto out_release_dfacl;
- error = nfs3_proc_setacls(inode, acl, S_ISDIR(inode->i_mode) ?
- dfacl : NULL);
+
posix_acl_release(acl);
-out_release_dfacl:
- posix_acl_release(dfacl);
- return error;
+
+ *result += strlen(name);
+ *result += 1;
+ if (!size)
+ return 0;
+ if (*result > size)
+ return -ERANGE;
+
+ strcpy(p, name);
+ return 0;
+}
+
+ssize_t
+nfs3_listxattr(struct dentry *dentry, char *data, size_t size)
+{
+ struct inode *inode = dentry->d_inode;
+ ssize_t result = 0;
+ int error;
+
+ error = nfs3_list_one_acl(inode, ACL_TYPE_ACCESS,
+ POSIX_ACL_XATTR_ACCESS, data, size, &result);
+ if (error)
+ return error;
+
+ error = nfs3_list_one_acl(inode, ACL_TYPE_DEFAULT,
+ POSIX_ACL_XATTR_DEFAULT, data, size, &result);
+ if (error)
+ return error;
+ return result;
}
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 01b6f6a49d1..f0afa291fd5 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -18,6 +18,7 @@
#include <linux/lockd/bind.h>
#include <linux/nfs_mount.h>
#include <linux/freezer.h>
+#include <linux/xattr.h>
#include "iostat.h"
#include "internal.h"
@@ -317,8 +318,8 @@ static int
nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
int flags)
{
+ struct posix_acl *default_acl, *acl;
struct nfs3_createdata *data;
- umode_t mode = sattr->ia_mode;
int status = -ENOMEM;
dprintk("NFS call create %pd\n", dentry);
@@ -340,7 +341,9 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
data->arg.create.verifier[1] = cpu_to_be32(current->pid);
}
- sattr->ia_mode &= ~current_umask();
+ status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
+ if (status)
+ goto out;
for (;;) {
status = nfs3_do_create(dir, dentry, data);
@@ -366,7 +369,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
}
if (status != 0)
- goto out;
+ goto out_release_acls;
/* When we created the file with exclusive semantics, make
* sure we set the attributes afterwards. */
@@ -385,9 +388,14 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
nfs_post_op_update_inode(dentry->d_inode, data->res.fattr);
dprintk("NFS reply setattr (post-create): %d\n", status);
if (status != 0)
- goto out;
+ goto out_release_acls;
}
- status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
+
+ status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl);
+
+out_release_acls:
+ posix_acl_release(acl);
+ posix_acl_release(default_acl);
out:
nfs3_free_createdata(data);
dprintk("NFS reply create: %d\n", status);
@@ -471,41 +479,6 @@ nfs3_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
}
static int
-nfs3_proc_rename(struct inode *old_dir, struct qstr *old_name,
- struct inode *new_dir, struct qstr *new_name)
-{
- struct nfs_renameargs arg = {
- .old_dir = NFS_FH(old_dir),
- .old_name = old_name,
- .new_dir = NFS_FH(new_dir),
- .new_name = new_name,
- };
- struct nfs_renameres res;
- struct rpc_message msg = {
- .rpc_proc = &nfs3_procedures[NFS3PROC_RENAME],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
- int status = -ENOMEM;
-
- dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
-
- res.old_fattr = nfs_alloc_fattr();
- res.new_fattr = nfs_alloc_fattr();
- if (res.old_fattr == NULL || res.new_fattr == NULL)
- goto out;
-
- status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
- nfs_post_op_update_inode(old_dir, res.old_fattr);
- nfs_post_op_update_inode(new_dir, res.new_fattr);
-out:
- nfs_free_fattr(res.old_fattr);
- nfs_free_fattr(res.new_fattr);
- dprintk("NFS reply rename: %d\n", status);
- return status;
-}
-
-static int
nfs3_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
{
struct nfs3_linkargs arg = {
@@ -572,18 +545,20 @@ out:
static int
nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
{
+ struct posix_acl *default_acl, *acl;
struct nfs3_createdata *data;
- umode_t mode = sattr->ia_mode;
int status = -ENOMEM;
dprintk("NFS call mkdir %pd\n", dentry);
- sattr->ia_mode &= ~current_umask();
-
data = nfs3_alloc_createdata();
if (data == NULL)
goto out;
+ status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
+ if (status)
+ goto out;
+
data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR];
data->arg.mkdir.fh = NFS_FH(dir);
data->arg.mkdir.name = dentry->d_name.name;
@@ -592,9 +567,13 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
status = nfs3_do_create(dir, dentry, data);
if (status != 0)
- goto out;
+ goto out_release_acls;
- status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
+ status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl);
+
+out_release_acls:
+ posix_acl_release(acl);
+ posix_acl_release(default_acl);
out:
nfs3_free_createdata(data);
dprintk("NFS reply mkdir: %d\n", status);
@@ -691,19 +670,21 @@ static int
nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
dev_t rdev)
{
+ struct posix_acl *default_acl, *acl;
struct nfs3_createdata *data;
- umode_t mode = sattr->ia_mode;
int status = -ENOMEM;
dprintk("NFS call mknod %pd %u:%u\n", dentry,
MAJOR(rdev), MINOR(rdev));
- sattr->ia_mode &= ~current_umask();
-
data = nfs3_alloc_createdata();
if (data == NULL)
goto out;
+ status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
+ if (status)
+ goto out;
+
data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKNOD];
data->arg.mknod.fh = NFS_FH(dir);
data->arg.mknod.name = dentry->d_name.name;
@@ -731,8 +712,13 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
status = nfs3_do_create(dir, dentry, data);
if (status != 0)
- goto out;
- status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
+ goto out_release_acls;
+
+ status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl);
+
+out_release_acls:
+ posix_acl_release(acl);
+ posix_acl_release(default_acl);
out:
nfs3_free_createdata(data);
dprintk("NFS reply mknod: %d\n", status);
@@ -809,7 +795,7 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
return status;
}
-static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
{
struct inode *inode = data->header->inode;
@@ -821,18 +807,18 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_read_data *data)
return 0;
}
-static void nfs3_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
+static void nfs3_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
{
msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
}
-static int nfs3_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
{
rpc_call_start(task);
return 0;
}
-static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
{
struct inode *inode = data->header->inode;
@@ -843,17 +829,11 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_write_data *data)
return 0;
}
-static void nfs3_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
+static void nfs3_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
{
msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
}
-static int nfs3_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
-{
- rpc_call_start(task);
- return 0;
-}
-
static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
{
rpc_call_start(task);
@@ -904,20 +884,28 @@ static const struct inode_operations nfs3_dir_inode_operations = {
.permission = nfs_permission,
.getattr = nfs_getattr,
.setattr = nfs_setattr,
+#ifdef CONFIG_NFS_V3_ACL
.listxattr = nfs3_listxattr,
- .getxattr = nfs3_getxattr,
- .setxattr = nfs3_setxattr,
- .removexattr = nfs3_removexattr,
+ .getxattr = generic_getxattr,
+ .setxattr = generic_setxattr,
+ .removexattr = generic_removexattr,
+ .get_acl = nfs3_get_acl,
+ .set_acl = nfs3_set_acl,
+#endif
};
static const struct inode_operations nfs3_file_inode_operations = {
.permission = nfs_permission,
.getattr = nfs_getattr,
.setattr = nfs_setattr,
+#ifdef CONFIG_NFS_V3_ACL
.listxattr = nfs3_listxattr,
- .getxattr = nfs3_getxattr,
- .setxattr = nfs3_setxattr,
- .removexattr = nfs3_removexattr,
+ .getxattr = generic_getxattr,
+ .setxattr = generic_setxattr,
+ .removexattr = generic_removexattr,
+ .get_acl = nfs3_get_acl,
+ .set_acl = nfs3_set_acl,
+#endif
};
const struct nfs_rpc_ops nfs_v3_clientops = {
@@ -939,7 +927,6 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
.unlink_setup = nfs3_proc_unlink_setup,
.unlink_rpc_prepare = nfs3_proc_unlink_rpc_prepare,
.unlink_done = nfs3_proc_unlink_done,
- .rename = nfs3_proc_rename,
.rename_setup = nfs3_proc_rename_setup,
.rename_rpc_prepare = nfs3_proc_rename_rpc_prepare,
.rename_done = nfs3_proc_rename_done,
@@ -953,19 +940,16 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
.fsinfo = nfs3_proc_fsinfo,
.pathconf = nfs3_proc_pathconf,
.decode_dirent = nfs3_decode_dirent,
+ .pgio_rpc_prepare = nfs3_proc_pgio_rpc_prepare,
.read_setup = nfs3_proc_read_setup,
- .read_pageio_init = nfs_pageio_init_read,
- .read_rpc_prepare = nfs3_proc_read_rpc_prepare,
.read_done = nfs3_read_done,
.write_setup = nfs3_proc_write_setup,
- .write_pageio_init = nfs_pageio_init_write,
- .write_rpc_prepare = nfs3_proc_write_rpc_prepare,
.write_done = nfs3_write_done,
.commit_setup = nfs3_proc_commit_setup,
.commit_rpc_prepare = nfs3_proc_commit_rpc_prepare,
.commit_done = nfs3_commit_done,
.lock = nfs3_proc_lock,
- .clear_acl_cache = nfs3_forget_cached_acls,
+ .clear_acl_cache = forget_all_cached_acls,
.close_context = nfs_close_context,
.have_delegation = nfs3_have_delegation,
.return_delegation = nfs3_return_delegation,
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
index cc471c72523..d6a98949af1 100644
--- a/fs/nfs/nfs3super.c
+++ b/fs/nfs/nfs3super.c
@@ -12,6 +12,9 @@ static struct nfs_subversion nfs_v3 = {
.rpc_vers = &nfs_version3,
.rpc_ops = &nfs_v3_clientops,
.sops = &nfs_sops,
+#ifdef CONFIG_NFS_V3_ACL
+ .xattr = nfs3_xattr_handlers,
+#endif
};
static int __init init_nfs_v3(void)
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index fa6d72131c1..8f4cbe7f4aa 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -953,7 +953,7 @@ static void nfs3_xdr_enc_readlink3args(struct rpc_rqst *req,
* };
*/
static void encode_read3args(struct xdr_stream *xdr,
- const struct nfs_readargs *args)
+ const struct nfs_pgio_args *args)
{
__be32 *p;
@@ -966,7 +966,7 @@ static void encode_read3args(struct xdr_stream *xdr,
static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
struct xdr_stream *xdr,
- const struct nfs_readargs *args)
+ const struct nfs_pgio_args *args)
{
encode_read3args(xdr, args);
prepare_reply_buffer(req, args->pages, args->pgbase,
@@ -992,7 +992,7 @@ static void nfs3_xdr_enc_read3args(struct rpc_rqst *req,
* };
*/
static void encode_write3args(struct xdr_stream *xdr,
- const struct nfs_writeargs *args)
+ const struct nfs_pgio_args *args)
{
__be32 *p;
@@ -1008,7 +1008,7 @@ static void encode_write3args(struct xdr_stream *xdr,
static void nfs3_xdr_enc_write3args(struct rpc_rqst *req,
struct xdr_stream *xdr,
- const struct nfs_writeargs *args)
+ const struct nfs_pgio_args *args)
{
encode_write3args(xdr, args);
xdr->buf->flags |= XDRBUF_WRITE;
@@ -1589,7 +1589,7 @@ out_default:
* };
*/
static int decode_read3resok(struct xdr_stream *xdr,
- struct nfs_readres *result)
+ struct nfs_pgio_res *result)
{
u32 eof, count, ocount, recvd;
__be32 *p;
@@ -1625,7 +1625,7 @@ out_overflow:
}
static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
- struct nfs_readres *result)
+ struct nfs_pgio_res *result)
{
enum nfs_stat status;
int error;
@@ -1673,7 +1673,7 @@ out_status:
* };
*/
static int decode_write3resok(struct xdr_stream *xdr,
- struct nfs_writeres *result)
+ struct nfs_pgio_res *result)
{
__be32 *p;
@@ -1697,7 +1697,7 @@ out_eio:
}
static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
- struct nfs_writeres *result)
+ struct nfs_pgio_res *result)
{
enum nfs_stat status;
int error;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 28842abafab..ba2affa5194 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -9,6 +9,14 @@
#ifndef __LINUX_FS_NFS_NFS4_FS_H
#define __LINUX_FS_NFS_NFS4_FS_H
+#if defined(CONFIG_NFS_V4_2)
+#define NFS4_MAX_MINOR_VERSION 2
+#elif defined(CONFIG_NFS_V4_1)
+#define NFS4_MAX_MINOR_VERSION 1
+#else
+#define NFS4_MAX_MINOR_VERSION 0
+#endif
+
#if IS_ENABLED(CONFIG_NFS_V4)
#define NFS4_MAX_LOOP_ON_RECOVER (10)
@@ -29,6 +37,8 @@ enum nfs4_client_state {
NFS4CLNT_SERVER_SCOPE_MISMATCH,
NFS4CLNT_PURGE_STATE,
NFS4CLNT_BIND_CONN_TO_SESSION,
+ NFS4CLNT_MOVED,
+ NFS4CLNT_LEASE_MOVED,
};
#define NFS4_RENEW_TIMEOUT 0x01
@@ -50,6 +60,7 @@ struct nfs4_minor_version_ops {
const struct nfs4_state_recovery_ops *reboot_recovery_ops;
const struct nfs4_state_recovery_ops *nograce_recovery_ops;
const struct nfs4_state_maintenance_ops *state_renewal_ops;
+ const struct nfs4_mig_recovery_ops *mig_recovery_ops;
};
#define NFS_SEQID_CONFIRMED 1
@@ -203,6 +214,12 @@ struct nfs4_state_maintenance_ops {
int (*renew_lease)(struct nfs_client *, struct rpc_cred *);
};
+struct nfs4_mig_recovery_ops {
+ int (*get_locations)(struct inode *, struct nfs4_fs_locations *,
+ struct page *, struct rpc_cred *);
+ int (*fsid_present)(struct inode *, struct rpc_cred *);
+};
+
extern const struct dentry_operations nfs4_dentry_operations;
/* dir.c */
@@ -213,10 +230,11 @@ int nfs_atomic_open(struct inode *, struct dentry *, struct file *,
extern struct file_system_type nfs4_fs_type;
/* nfs4namespace.c */
-rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *);
-struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *, struct inode *, struct qstr *);
+struct rpc_clnt *nfs4_negotiate_security(struct rpc_clnt *, struct inode *, struct qstr *);
struct vfsmount *nfs4_submount(struct nfs_server *, struct dentry *,
struct nfs_fh *, struct nfs_fattr *);
+int nfs4_replace_transport(struct nfs_server *server,
+ const struct nfs4_fs_locations *locations);
/* nfs4proc.c */
extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
@@ -231,6 +249,9 @@ extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait);
extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle);
extern int nfs4_proc_fs_locations(struct rpc_clnt *, struct inode *, const struct qstr *,
struct nfs4_fs_locations *, struct page *);
+extern int nfs4_proc_get_locations(struct inode *, struct nfs4_fs_locations *,
+ struct page *page, struct rpc_cred *);
+extern int nfs4_proc_fsid_present(struct inode *, struct rpc_cred *);
extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr *,
struct nfs_fh *, struct nfs_fattr *);
extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
@@ -249,6 +270,7 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
extern int nfs41_setup_sequence(struct nfs4_session *session,
struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
struct rpc_task *task);
+extern int nfs41_sequence_done(struct rpc_task *, struct nfs4_sequence_res *);
extern int nfs4_proc_create_session(struct nfs_client *, struct rpc_cred *);
extern int nfs4_proc_destroy_session(struct nfs4_session *, struct rpc_cred *);
extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
@@ -315,7 +337,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
*/
static inline void
nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
- struct rpc_message *msg, struct nfs_write_data *wdata)
+ struct rpc_message *msg, struct nfs_pgio_data *wdata)
{
if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) &&
!test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags))
@@ -347,7 +369,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags,
static inline void
nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
- struct rpc_message *msg, struct nfs_write_data *wdata)
+ struct rpc_message *msg, struct nfs_pgio_data *wdata)
{
}
#endif /* CONFIG_NFS_V4_1 */
@@ -405,12 +427,15 @@ extern void nfs4_close_sync(struct nfs4_state *, fmode_t);
extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
extern void nfs_inode_find_state_and_recover(struct inode *inode,
const nfs4_stateid *stateid);
+extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *, struct nfs4_state *);
extern void nfs4_schedule_lease_recovery(struct nfs_client *);
extern int nfs4_wait_clnt_recover(struct nfs_client *clp);
extern int nfs4_client_recover_expired_lease(struct nfs_client *clp);
extern void nfs4_schedule_state_manager(struct nfs_client *);
extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
+extern int nfs4_schedule_migration_recovery(const struct nfs_server *);
+extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *);
extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
extern void nfs41_handle_server_scope(struct nfs_client *,
struct nfs41_server_scope **);
@@ -476,6 +501,16 @@ static inline bool nfs4_stateid_match(const nfs4_stateid *dst, const nfs4_statei
return memcmp(dst, src, sizeof(*dst)) == 0;
}
+static inline bool nfs4_stateid_match_other(const nfs4_stateid *dst, const nfs4_stateid *src)
+{
+ return memcmp(dst->other, src->other, NFS4_STATEID_OTHER_SIZE) == 0;
+}
+
+static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stateid *s2)
+{
+ return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0;
+}
+
static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state)
{
return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0;
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index a860ab566d6..aa9ef487604 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -10,6 +10,7 @@
#include <linux/sunrpc/auth.h>
#include <linux/sunrpc/xprt.h>
#include <linux/sunrpc/bc_xprt.h>
+#include <linux/sunrpc/rpc_pipe_fs.h>
#include "internal.h"
#include "callback.h"
#include "delegation.h"
@@ -169,7 +170,7 @@ void nfs41_shutdown_client(struct nfs_client *clp)
void nfs40_shutdown_client(struct nfs_client *clp)
{
if (clp->cl_slot_tbl) {
- nfs4_release_slot_table(clp->cl_slot_tbl);
+ nfs4_shutdown_slot_table(clp->cl_slot_tbl);
kfree(clp->cl_slot_tbl);
}
}
@@ -197,6 +198,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
clp->cl_minorversion = cl_init->minorversion;
clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
+ clp->cl_mig_gen = 1;
return clp;
error:
@@ -368,6 +370,8 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
if (clp->cl_minorversion != 0)
__set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags);
__set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
+ __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
+
error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I);
if (error == -EINVAL)
error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
@@ -407,13 +411,11 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
error = nfs4_discover_server_trunking(clp, &old);
if (error < 0)
goto error;
- nfs_put_client(clp);
- if (clp != old) {
- clp->cl_preserve_clid = true;
- clp = old;
- }
- return clp;
+ if (clp != old)
+ clp->cl_preserve_clid = true;
+ nfs_put_client(clp);
+ return old;
error:
nfs_mark_client_ready(clp, error);
@@ -491,9 +493,10 @@ int nfs40_walk_client_list(struct nfs_client *new,
prev = pos;
status = nfs_wait_client_init_complete(pos);
- spin_lock(&nn->nfs_client_lock);
if (status < 0)
- continue;
+ goto out;
+ status = -NFS4ERR_STALE_CLIENTID;
+ spin_lock(&nn->nfs_client_lock);
}
if (pos->cl_cons_state != NFS_CS_READY)
continue;
@@ -528,6 +531,13 @@ int nfs40_walk_client_list(struct nfs_client *new,
*result = pos;
dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n",
__func__, pos, atomic_read(&pos->cl_count));
+ goto out;
+ case -ERESTARTSYS:
+ case -ETIMEDOUT:
+ /* The callback path may have been inadvertently
+ * changed. Schedule recovery!
+ */
+ nfs4_schedule_path_down_recovery(pos);
default:
goto out;
}
@@ -631,7 +641,8 @@ int nfs41_walk_client_list(struct nfs_client *new,
}
spin_lock(&nn->nfs_client_lock);
if (status < 0)
- continue;
+ break;
+ status = -NFS4ERR_STALE_CLIENTID;
}
if (pos->cl_cons_state != NFS_CS_READY)
continue;
@@ -924,7 +935,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
dprintk("Server FSID: %llx:%llx\n",
(unsigned long long) server->fsid.major,
(unsigned long long) server->fsid.minor);
- dprintk("Mount FH: %d\n", mntfh->size);
+ nfs_display_fhandle(mntfh, "Pseudo-fs root FH");
nfs4_session_set_rwsize(server);
@@ -947,9 +958,8 @@ out:
* Create a version 4 volume record
*/
static int nfs4_init_server(struct nfs_server *server,
- const struct nfs_parsed_mount_data *data)
+ struct nfs_parsed_mount_data *data)
{
- rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX;
struct rpc_timeout timeparms;
int error;
@@ -961,9 +971,15 @@ static int nfs4_init_server(struct nfs_server *server,
/* Initialise the client representation from the mount data */
server->flags = data->flags;
server->options = data->options;
+ server->auth_info = data->auth_info;
- if (data->auth_flavor_len >= 1)
- pseudoflavor = data->auth_flavors[0];
+ /* Use the first specified auth flavor. If this flavor isn't
+ * allowed by the server, use the SECINFO path to try the
+ * other specified flavors */
+ if (data->auth_info.flavor_len >= 1)
+ data->selected_flavor = data->auth_info.flavors[0];
+ else
+ data->selected_flavor = RPC_AUTH_UNIX;
/* Get a client record */
error = nfs4_set_client(server,
@@ -971,7 +987,7 @@ static int nfs4_init_server(struct nfs_server *server,
(const struct sockaddr *)&data->nfs_server.address,
data->nfs_server.addrlen,
data->client_address,
- pseudoflavor,
+ data->selected_flavor,
data->nfs_server.protocol,
&timeparms,
data->minorversion,
@@ -991,7 +1007,8 @@ static int nfs4_init_server(struct nfs_server *server,
server->port = data->nfs_server.port;
- error = nfs_init_server_rpcclient(server, &timeparms, pseudoflavor);
+ error = nfs_init_server_rpcclient(server, &timeparms,
+ data->selected_flavor);
error:
/* Done */
@@ -1018,7 +1035,7 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info,
if (!server)
return ERR_PTR(-ENOMEM);
- auth_probe = mount_info->parsed->auth_flavor_len < 1;
+ auth_probe = mount_info->parsed->auth_info.flavor_len < 1;
/* set up the general RPC client */
error = nfs4_init_server(server, mount_info->parsed);
@@ -1046,6 +1063,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
{
struct nfs_client *parent_client;
struct nfs_server *server, *parent_server;
+ bool auth_probe;
int error;
dprintk("--> nfs4_create_referral_server()\n");
@@ -1078,8 +1096,9 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
if (error < 0)
goto error;
- error = nfs4_server_common_setup(server, mntfh,
- !(parent_server->flags & NFS_MOUNT_SECFLAVOUR));
+ auth_probe = parent_server->auth_info.flavor_len < 1;
+
+ error = nfs4_server_common_setup(server, mntfh, auth_probe);
if (error < 0)
goto error;
@@ -1091,3 +1110,112 @@ error:
dprintk("<-- nfs4_create_referral_server() = error %d\n", error);
return ERR_PTR(error);
}
+
+/*
+ * Grab the destination's particulars, including lease expiry time.
+ *
+ * Returns zero if probe succeeded and retrieved FSID matches the FSID
+ * we have cached.
+ */
+static int nfs_probe_destination(struct nfs_server *server)
+{
+ struct inode *inode = server->super->s_root->d_inode;
+ struct nfs_fattr *fattr;
+ int error;
+
+ fattr = nfs_alloc_fattr();
+ if (fattr == NULL)
+ return -ENOMEM;
+
+ /* Sanity: the probe won't work if the destination server
+ * does not recognize the migrated FH. */
+ error = nfs_probe_fsinfo(server, NFS_FH(inode), fattr);
+
+ nfs_free_fattr(fattr);
+ return error;
+}
+
+/**
+ * nfs4_update_server - Move an nfs_server to a different nfs_client
+ *
+ * @server: represents FSID to be moved
+ * @hostname: new end-point's hostname
+ * @sap: new end-point's socket address
+ * @salen: size of "sap"
+ * @net: net namespace
+ *
+ * The nfs_server must be quiescent before this function is invoked.
+ * Either its session is drained (NFSv4.1+), or its transport is
+ * plugged and drained (NFSv4.0).
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs4_update_server(struct nfs_server *server, const char *hostname,
+ struct sockaddr *sap, size_t salen, struct net *net)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct rpc_clnt *clnt = server->client;
+ struct xprt_create xargs = {
+ .ident = clp->cl_proto,
+ .net = net,
+ .dstaddr = sap,
+ .addrlen = salen,
+ .servername = hostname,
+ };
+ char buf[INET6_ADDRSTRLEN + 1];
+ struct sockaddr_storage address;
+ struct sockaddr *localaddr = (struct sockaddr *)&address;
+ int error;
+
+ dprintk("--> %s: move FSID %llx:%llx to \"%s\")\n", __func__,
+ (unsigned long long)server->fsid.major,
+ (unsigned long long)server->fsid.minor,
+ hostname);
+
+ error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout);
+ if (error != 0) {
+ dprintk("<-- %s(): rpc_switch_client_transport returned %d\n",
+ __func__, error);
+ goto out;
+ }
+
+ error = rpc_localaddr(clnt, localaddr, sizeof(address));
+ if (error != 0) {
+ dprintk("<-- %s(): rpc_localaddr returned %d\n",
+ __func__, error);
+ goto out;
+ }
+
+ error = -EAFNOSUPPORT;
+ if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0) {
+ dprintk("<-- %s(): rpc_ntop returned %d\n",
+ __func__, error);
+ goto out;
+ }
+
+ nfs_server_remove_lists(server);
+ error = nfs4_set_client(server, hostname, sap, salen, buf,
+ clp->cl_rpcclient->cl_auth->au_flavor,
+ clp->cl_proto, clnt->cl_timeout,
+ clp->cl_minorversion, net);
+ nfs_put_client(clp);
+ if (error != 0) {
+ nfs_server_insert_lists(server);
+ dprintk("<-- %s(): nfs4_set_client returned %d\n",
+ __func__, error);
+ goto out;
+ }
+
+ if (server->nfs_client->cl_hostname == NULL)
+ server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL);
+ nfs_server_insert_lists(server);
+
+ error = nfs_probe_destination(server);
+ if (error < 0)
+ goto out;
+
+ dprintk("<-- %s() succeeded\n", __func__);
+
+out:
+ return error;
+}
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 9c8f09a2156..a816f0627a6 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -73,7 +73,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
nfs_file_set_open_context(filp, ctx);
- nfs_fscache_set_inode_cookie(inode, filp);
+ nfs_fscache_open_file(inode, filp);
err = 0;
out_put_ctx:
@@ -100,8 +100,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
break;
mutex_lock(&inode->i_mutex);
ret = nfs_file_fsync_commit(file, start, end, datasync);
- if (!ret && !datasync)
- /* application has asked for meta-data sync */
+ if (!ret)
ret = pnfs_layoutcommit_inode(inode, true);
mutex_unlock(&inode->i_mutex);
/*
@@ -118,10 +117,10 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
const struct file_operations nfs4_file_operations = {
.llseek = nfs_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = nfs_file_read,
- .aio_write = nfs_file_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = nfs_file_read,
+ .write_iter = nfs_file_write,
.mmap = nfs_file_mmap,
.open = nfs4_file_open,
.flush = nfs_file_flush,
@@ -130,7 +129,7 @@ const struct file_operations nfs4_file_operations = {
.lock = nfs_lock,
.flock = nfs_flock,
.splice_read = nfs_file_splice_read,
- .splice_write = nfs_file_splice_write,
+ .splice_write = iter_file_splice_write,
.check_flags = nfs_check_flags,
.setlease = nfs_setlease,
};
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 049b9fb0d2c..3d83cb1fdc7 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -121,9 +121,8 @@ static int nfs4_validate_fspath(struct dentry *dentry,
}
static size_t nfs_parse_server_name(char *string, size_t len,
- struct sockaddr *sa, size_t salen, struct nfs_server *server)
+ struct sockaddr *sa, size_t salen, struct net *net)
{
- struct net *net = rpc_net_ns(server->client);
ssize_t ret;
ret = rpc_pton(net, string, len, sa, salen);
@@ -137,17 +136,25 @@ static size_t nfs_parse_server_name(char *string, size_t len,
/**
* nfs_find_best_sec - Find a security mechanism supported locally
+ * @server: NFS server struct
* @flavors: List of security tuples returned by SECINFO procedure
*
- * Return the pseudoflavor of the first security mechanism in
- * "flavors" that is locally supported. Return RPC_AUTH_UNIX if
- * no matching flavor is found in the array. The "flavors" array
+ * Return an rpc client that uses the first security mechanism in
+ * "flavors" that is locally supported. The "flavors" array
* is searched in the order returned from the server, per RFC 3530
- * recommendation.
+ * recommendation and each flavor is checked for membership in the
+ * sec= mount option list if it exists.
+ *
+ * Return -EPERM if no matching flavor is found in the array.
+ *
+ * Please call rpc_shutdown_client() when you are done with this rpc client.
+ *
*/
-rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
+static struct rpc_clnt *nfs_find_best_sec(struct rpc_clnt *clnt,
+ struct nfs_server *server,
+ struct nfs4_secinfo_flavors *flavors)
{
- rpc_authflavor_t pseudoflavor;
+ rpc_authflavor_t pflavor;
struct nfs4_secinfo4 *secinfo;
unsigned int i;
@@ -158,55 +165,73 @@ rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors)
case RPC_AUTH_NULL:
case RPC_AUTH_UNIX:
case RPC_AUTH_GSS:
- pseudoflavor = rpcauth_get_pseudoflavor(secinfo->flavor,
+ pflavor = rpcauth_get_pseudoflavor(secinfo->flavor,
&secinfo->flavor_info);
- if (pseudoflavor != RPC_AUTH_MAXFLAVOR)
- return pseudoflavor;
- break;
+ /* does the pseudoflavor match a sec= mount opt? */
+ if (pflavor != RPC_AUTH_MAXFLAVOR &&
+ nfs_auth_info_match(&server->auth_info, pflavor)) {
+ struct rpc_clnt *new;
+ struct rpc_cred *cred;
+
+ /* Cloning creates an rpc_auth for the flavor */
+ new = rpc_clone_client_set_auth(clnt, pflavor);
+ if (IS_ERR(new))
+ continue;
+ /**
+ * Check that the user actually can use the
+ * flavor. This is mostly for RPC_AUTH_GSS
+ * where cr_init obtains a gss context
+ */
+ cred = rpcauth_lookupcred(new->cl_auth, 0);
+ if (IS_ERR(cred)) {
+ rpc_shutdown_client(new);
+ continue;
+ }
+ put_rpccred(cred);
+ return new;
+ }
}
}
-
- return RPC_AUTH_UNIX;
+ return ERR_PTR(-EPERM);
}
-static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr *name)
+/**
+ * nfs4_negotiate_security - in response to an NFS4ERR_WRONGSEC on lookup,
+ * return an rpc_clnt that uses the best available security flavor with
+ * respect to the secinfo flavor list and the sec= mount options.
+ *
+ * @clnt: RPC client to clone
+ * @inode: directory inode
+ * @name: lookup name
+ *
+ * Please call rpc_shutdown_client() when you are done with this rpc client.
+ */
+struct rpc_clnt *
+nfs4_negotiate_security(struct rpc_clnt *clnt, struct inode *inode,
+ struct qstr *name)
{
struct page *page;
struct nfs4_secinfo_flavors *flavors;
- rpc_authflavor_t flavor;
+ struct rpc_clnt *new;
int err;
page = alloc_page(GFP_KERNEL);
if (!page)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
+
flavors = page_address(page);
err = nfs4_proc_secinfo(inode, name, flavors);
if (err < 0) {
- flavor = err;
+ new = ERR_PTR(err);
goto out;
}
- flavor = nfs_find_best_sec(flavors);
+ new = nfs_find_best_sec(clnt, NFS_SERVER(inode), flavors);
out:
put_page(page);
- return flavor;
-}
-
-/*
- * Please call rpc_shutdown_client() when you are done with this client.
- */
-struct rpc_clnt *nfs4_create_sec_client(struct rpc_clnt *clnt, struct inode *inode,
- struct qstr *name)
-{
- rpc_authflavor_t flavor;
-
- flavor = nfs4_negotiate_security(inode, name);
- if ((int)flavor < 0)
- return ERR_PTR((int)flavor);
-
- return rpc_clone_client_set_auth(clnt, flavor);
+ return new;
}
static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
@@ -214,6 +239,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
const struct nfs4_fs_location *location)
{
const size_t addr_bufsize = sizeof(struct sockaddr_storage);
+ struct net *net = rpc_net_ns(NFS_SB(mountdata->sb)->client);
struct vfsmount *mnt = ERR_PTR(-ENOENT);
char *mnt_path;
unsigned int maxbuflen;
@@ -239,8 +265,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
continue;
mountdata->addrlen = nfs_parse_server_name(buf->data, buf->len,
- mountdata->addr, addr_bufsize,
- NFS_SB(mountdata->sb));
+ mountdata->addr, addr_bufsize, net);
if (mountdata->addrlen == 0)
continue;
@@ -389,13 +414,110 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry,
if (client->cl_auth->au_flavor != flavor)
flavor = client->cl_auth->au_flavor;
- else if (!(server->flags & NFS_MOUNT_SECFLAVOUR)) {
- rpc_authflavor_t new = nfs4_negotiate_security(dir, name);
- if ((int)new >= 0)
- flavor = new;
- }
mnt = nfs_do_submount(dentry, fh, fattr, flavor);
out:
rpc_shutdown_client(client);
return mnt;
}
+
+/*
+ * Try one location from the fs_locations array.
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+static int nfs4_try_replacing_one_location(struct nfs_server *server,
+ char *page, char *page2,
+ const struct nfs4_fs_location *location)
+{
+ const size_t addr_bufsize = sizeof(struct sockaddr_storage);
+ struct net *net = rpc_net_ns(server->client);
+ struct sockaddr *sap;
+ unsigned int s;
+ size_t salen;
+ int error;
+
+ sap = kmalloc(addr_bufsize, GFP_KERNEL);
+ if (sap == NULL)
+ return -ENOMEM;
+
+ error = -ENOENT;
+ for (s = 0; s < location->nservers; s++) {
+ const struct nfs4_string *buf = &location->servers[s];
+ char *hostname;
+
+ if (buf->len <= 0 || buf->len > PAGE_SIZE)
+ continue;
+
+ if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len) != NULL)
+ continue;
+
+ salen = nfs_parse_server_name(buf->data, buf->len,
+ sap, addr_bufsize, net);
+ if (salen == 0)
+ continue;
+ rpc_set_port(sap, NFS_PORT);
+
+ error = -ENOMEM;
+ hostname = kstrndup(buf->data, buf->len, GFP_KERNEL);
+ if (hostname == NULL)
+ break;
+
+ error = nfs4_update_server(server, hostname, sap, salen, net);
+ kfree(hostname);
+ if (error == 0)
+ break;
+ }
+
+ kfree(sap);
+ return error;
+}
+
+/**
+ * nfs4_replace_transport - set up transport to destination server
+ *
+ * @server: export being migrated
+ * @locations: fs_locations array
+ *
+ * Returns zero on success, or a negative errno value.
+ *
+ * The client tries all the entries in the "locations" array, in the
+ * order returned by the server, until one works or the end of the
+ * array is reached.
+ */
+int nfs4_replace_transport(struct nfs_server *server,
+ const struct nfs4_fs_locations *locations)
+{
+ char *page = NULL, *page2 = NULL;
+ int loc, error;
+
+ error = -ENOENT;
+ if (locations == NULL || locations->nlocations <= 0)
+ goto out;
+
+ error = -ENOMEM;
+ page = (char *) __get_free_page(GFP_USER);
+ if (!page)
+ goto out;
+ page2 = (char *) __get_free_page(GFP_USER);
+ if (!page2)
+ goto out;
+
+ for (loc = 0; loc < locations->nlocations; loc++) {
+ const struct nfs4_fs_location *location =
+ &locations->locations[loc];
+
+ if (location == NULL || location->nservers <= 0 ||
+ location->rootpath.ncomponents == 0)
+ continue;
+
+ error = nfs4_try_replacing_one_location(server, page,
+ page2, location);
+ if (error == 0)
+ break;
+ }
+
+out:
+ free_page((unsigned long)page);
+ free_page((unsigned long)page2);
+ return error;
+}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 75e46bbf7f4..4bf3d97cc5a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -105,9 +105,6 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0)
return NULL;
- if (NFS_SERVER(dir)->nfs_client->cl_minorversion < 2)
- return NULL;
-
err = security_dentry_init_security(dentry, sattr->ia_mode,
&dentry->d_name, (void **)&label->label, &label->len);
if (err == 0)
@@ -384,6 +381,14 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
case -NFS4ERR_STALE_CLIENTID:
nfs4_schedule_lease_recovery(clp);
goto wait_on_recovery;
+ case -NFS4ERR_MOVED:
+ ret = nfs4_schedule_migration_recovery(server);
+ if (ret < 0)
+ break;
+ goto wait_on_recovery;
+ case -NFS4ERR_LEASE_MOVED:
+ nfs4_schedule_lease_moved_recovery(clp);
+ goto wait_on_recovery;
#if defined(CONFIG_NFS_V4_1)
case -NFS4ERR_BADSESSION:
case -NFS4ERR_BADSLOT:
@@ -431,6 +436,8 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
return nfs4_map_errors(ret);
wait_on_recovery:
ret = nfs4_wait_clnt_recover(clp);
+ if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+ return -EIO;
if (ret == 0)
exception->retry = 1;
return ret;
@@ -532,7 +539,7 @@ static int nfs40_sequence_done(struct rpc_task *task,
struct nfs4_slot *slot = res->sr_slot;
struct nfs4_slot_table *tbl;
- if (!RPC_WAS_SENT(task))
+ if (slot == NULL)
goto out;
tbl = slot->table;
@@ -552,15 +559,10 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
{
struct nfs4_session *session;
struct nfs4_slot_table *tbl;
+ struct nfs4_slot *slot = res->sr_slot;
bool send_new_highest_used_slotid = false;
- if (!res->sr_slot) {
- /* just wake up the next guy waiting since
- * we may have not consumed a slot after all */
- dprintk("%s: No slot\n", __func__);
- return;
- }
- tbl = res->sr_slot->table;
+ tbl = slot->table;
session = tbl->session;
spin_lock(&tbl->slot_tbl_lock);
@@ -570,11 +572,11 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
if (tbl->highest_used_slotid > tbl->target_highest_slotid)
send_new_highest_used_slotid = true;
- if (nfs41_wake_and_assign_slot(tbl, res->sr_slot)) {
+ if (nfs41_wake_and_assign_slot(tbl, slot)) {
send_new_highest_used_slotid = false;
goto out_unlock;
}
- nfs4_free_slot(tbl, res->sr_slot);
+ nfs4_free_slot(tbl, slot);
if (tbl->highest_used_slotid != NFS4_NO_SLOT)
send_new_highest_used_slotid = false;
@@ -585,19 +587,20 @@ out_unlock:
nfs41_server_notify_highest_slotid_update(session->clp);
}
-static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
+int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
{
struct nfs4_session *session;
- struct nfs4_slot *slot;
+ struct nfs4_slot *slot = res->sr_slot;
struct nfs_client *clp;
bool interrupted = false;
int ret = 1;
+ if (slot == NULL)
+ goto out_noaction;
/* don't increment the sequence number if the task wasn't sent */
if (!RPC_WAS_SENT(task))
goto out;
- slot = res->sr_slot;
session = slot->table->session;
if (slot->interrupted) {
@@ -672,6 +675,7 @@ out:
/* The session may be reset by one of the error handlers. */
dprintk("%s: Error %d free the slot \n", __func__, res->sr_status);
nfs41_sequence_free_slot(res);
+out_noaction:
return ret;
retry_nowait:
if (rpc_restart_call_prepare(task)) {
@@ -685,6 +689,7 @@ out_retry:
rpc_delay(task, NFS4_POLL_RETRY_MAX);
return 0;
}
+EXPORT_SYMBOL_GPL(nfs41_sequence_done);
static int nfs4_sequence_done(struct rpc_task *task,
struct nfs4_sequence_res *res)
@@ -1063,6 +1068,7 @@ static void nfs4_opendata_free(struct kref *kref)
dput(p->dentry);
nfs_sb_deactive(sb);
nfs_fattr_free_names(&p->f_attr);
+ kfree(p->f_attr.mdsthreshold);
kfree(p);
}
@@ -1132,12 +1138,71 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)
nfs4_state_set_mode_locked(state, state->state | fmode);
}
-static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
+static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
+{
+ struct nfs_client *clp = state->owner->so_server->nfs_client;
+ bool need_recover = false;
+
+ if (test_and_clear_bit(NFS_O_RDONLY_STATE, &state->flags) && state->n_rdonly)
+ need_recover = true;
+ if (test_and_clear_bit(NFS_O_WRONLY_STATE, &state->flags) && state->n_wronly)
+ need_recover = true;
+ if (test_and_clear_bit(NFS_O_RDWR_STATE, &state->flags) && state->n_rdwr)
+ need_recover = true;
+ if (need_recover)
+ nfs4_state_mark_reclaim_nograce(clp, state);
+}
+
+static bool nfs_need_update_open_stateid(struct nfs4_state *state,
+ nfs4_stateid *stateid)
+{
+ if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0)
+ return true;
+ if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) {
+ nfs_test_and_clear_all_open_stateid(state);
+ return true;
+ }
+ if (nfs4_stateid_is_newer(stateid, &state->open_stateid))
+ return true;
+ return false;
+}
+
+static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
+ nfs4_stateid *stateid, fmode_t fmode)
{
+ clear_bit(NFS_O_RDWR_STATE, &state->flags);
+ switch (fmode & (FMODE_READ|FMODE_WRITE)) {
+ case FMODE_WRITE:
+ clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+ break;
+ case FMODE_READ:
+ clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+ break;
+ case 0:
+ clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+ clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+ clear_bit(NFS_OPEN_STATE, &state->flags);
+ }
+ if (stateid == NULL)
+ return;
+ if (!nfs_need_update_open_stateid(state, stateid))
+ return;
if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
nfs4_stateid_copy(&state->stateid, stateid);
nfs4_stateid_copy(&state->open_stateid, stateid);
- set_bit(NFS_OPEN_STATE, &state->flags);
+}
+
+static void nfs_clear_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
+{
+ write_seqlock(&state->seqlock);
+ nfs_clear_open_stateid_locked(state, stateid, fmode);
+ write_sequnlock(&state->seqlock);
+ if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
+ nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
+}
+
+static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
+{
switch (fmode) {
case FMODE_READ:
set_bit(NFS_O_RDONLY_STATE, &state->flags);
@@ -1148,13 +1213,11 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *
case FMODE_READ|FMODE_WRITE:
set_bit(NFS_O_RDWR_STATE, &state->flags);
}
-}
-
-static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
-{
- write_seqlock(&state->seqlock);
- nfs_set_open_stateid_locked(state, stateid, fmode);
- write_sequnlock(&state->seqlock);
+ if (!nfs_need_update_open_stateid(state, stateid))
+ return;
+ if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
+ nfs4_stateid_copy(&state->stateid, stateid);
+ nfs4_stateid_copy(&state->open_stateid, stateid);
}
static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode)
@@ -1212,6 +1275,8 @@ no_delegation:
__update_open_stateid(state, open_stateid, NULL, fmode);
ret = 1;
}
+ if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
+ nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
return ret;
}
@@ -1318,31 +1383,24 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
int ret;
if (!data->rpc_done) {
- ret = data->rpc_status;
- goto err;
+ if (data->rpc_status) {
+ ret = data->rpc_status;
+ goto err;
+ }
+ /* cached opens have already been processed */
+ goto update;
}
- ret = -ESTALE;
- if (!(data->f_attr.valid & NFS_ATTR_FATTR_TYPE) ||
- !(data->f_attr.valid & NFS_ATTR_FATTR_FILEID) ||
- !(data->f_attr.valid & NFS_ATTR_FATTR_CHANGE))
- goto err;
-
- ret = -ENOMEM;
- state = nfs4_get_open_state(inode, data->owner);
- if (state == NULL)
- goto err;
-
ret = nfs_refresh_inode(inode, &data->f_attr);
if (ret)
goto err;
- nfs_setsecurity(inode, &data->f_attr, data->f_label);
-
if (data->o_res.delegation_type != 0)
nfs4_opendata_check_deleg(data, state);
+update:
update_open_stateid(state, &data->o_res.stateid, NULL,
data->o_arg.fmode);
+ atomic_inc(&state->count);
return state;
err:
@@ -1452,12 +1510,15 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
struct nfs4_state *newstate;
int ret;
+ /* Don't trigger recovery in nfs_test_and_clear_all_open_stateid */
+ clear_bit(NFS_O_RDWR_STATE, &state->flags);
+ clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+ clear_bit(NFS_O_RDONLY_STATE, &state->flags);
/* memory barrier prior to reading state->n_* */
clear_bit(NFS_DELEGATED_STATE, &state->flags);
clear_bit(NFS_OPEN_STATE, &state->flags);
smp_rmb();
if (state->n_rdwr != 0) {
- clear_bit(NFS_O_RDWR_STATE, &state->flags);
ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate);
if (ret != 0)
return ret;
@@ -1465,7 +1526,6 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
return -ESTALE;
}
if (state->n_wronly != 0) {
- clear_bit(NFS_O_WRONLY_STATE, &state->flags);
ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate);
if (ret != 0)
return ret;
@@ -1473,7 +1533,6 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
return -ESTALE;
}
if (state->n_rdonly != 0) {
- clear_bit(NFS_O_RDONLY_STATE, &state->flags);
ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate);
if (ret != 0)
return ret;
@@ -1575,6 +1634,12 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
/* Don't recall a delegation if it was lost */
nfs4_schedule_lease_recovery(server->nfs_client);
return -EAGAIN;
+ case -NFS4ERR_MOVED:
+ nfs4_schedule_migration_recovery(server);
+ return -EAGAIN;
+ case -NFS4ERR_LEASE_MOVED:
+ nfs4_schedule_lease_moved_recovery(server->nfs_client);
+ return -EAGAIN;
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
@@ -1616,15 +1681,15 @@ static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)
{
struct nfs4_opendata *data = calldata;
- nfs40_setup_sequence(data->o_arg.server, &data->o_arg.seq_args,
- &data->o_res.seq_res, task);
+ nfs40_setup_sequence(data->o_arg.server, &data->c_arg.seq_args,
+ &data->c_res.seq_res, task);
}
static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
{
struct nfs4_opendata *data = calldata;
- nfs40_sequence_done(task, &data->o_res.seq_res);
+ nfs40_sequence_done(task, &data->c_res.seq_res);
data->rpc_status = task->tk_status;
if (data->rpc_status == 0) {
@@ -1682,7 +1747,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
};
int status;
- nfs4_init_sequence(&data->o_arg.seq_args, &data->o_res.seq_res, 1);
+ nfs4_init_sequence(&data->c_arg.seq_args, &data->c_res.seq_res, 1);
kref_get(&data->kref);
data->rpc_done = 0;
data->rpc_status = 0;
@@ -1962,7 +2027,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
return status;
}
if (!(o_res->f_attr->valid & NFS_ATTR_FATTR))
- _nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label);
+ nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label);
return 0;
}
@@ -2240,10 +2305,12 @@ static int _nfs4_do_open(struct inode *dir,
}
}
- if (ctx_th && server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
- opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
- if (!opendata->f_attr.mdsthreshold)
- goto err_free_label;
+ if (server->attr_bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD) {
+ if (!opendata->f_attr.mdsthreshold) {
+ opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc();
+ if (!opendata->f_attr.mdsthreshold)
+ goto err_free_label;
+ }
opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0];
}
if (dentry->d_inode != NULL)
@@ -2271,11 +2338,10 @@ static int _nfs4_do_open(struct inode *dir,
if (opendata->file_created)
*opened |= FILE_CREATED;
- if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server))
+ if (pnfs_use_threshold(ctx_th, opendata->f_attr.mdsthreshold, server)) {
*ctx_th = opendata->f_attr.mdsthreshold;
- else
- kfree(opendata->f_attr.mdsthreshold);
- opendata->f_attr.mdsthreshold = NULL;
+ opendata->f_attr.mdsthreshold = NULL;
+ }
nfs4_label_free(olabel);
@@ -2285,7 +2351,6 @@ static int _nfs4_do_open(struct inode *dir,
err_free_label:
nfs4_label_free(olabel);
err_opendata_put:
- kfree(opendata->f_attr.mdsthreshold);
nfs4_opendata_put(opendata);
err_put_state_owner:
nfs4_put_state_owner(sp);
@@ -2394,13 +2459,16 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
if (nfs4_copy_delegation_stateid(&arg.stateid, inode, fmode)) {
/* Use that stateid */
- } else if (truncate && state != NULL && nfs4_valid_open_stateid(state)) {
+ } else if (truncate && state != NULL) {
struct nfs_lockowner lockowner = {
.l_owner = current->files,
.l_pid = current->tgid,
};
- nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE,
- &lockowner);
+ if (!nfs4_valid_open_stateid(state))
+ return -EBADF;
+ if (nfs4_select_rw_stateid(&arg.stateid, state, FMODE_WRITE,
+ &lockowner) == -EIO)
+ return -EBADF;
} else
nfs4_stateid_copy(&arg.stateid, &zero_stateid);
@@ -2472,26 +2540,6 @@ static void nfs4_free_closedata(void *data)
kfree(calldata);
}
-static void nfs4_close_clear_stateid_flags(struct nfs4_state *state,
- fmode_t fmode)
-{
- spin_lock(&state->owner->so_lock);
- clear_bit(NFS_O_RDWR_STATE, &state->flags);
- switch (fmode & (FMODE_READ|FMODE_WRITE)) {
- case FMODE_WRITE:
- clear_bit(NFS_O_RDONLY_STATE, &state->flags);
- break;
- case FMODE_READ:
- clear_bit(NFS_O_WRONLY_STATE, &state->flags);
- break;
- case 0:
- clear_bit(NFS_O_RDONLY_STATE, &state->flags);
- clear_bit(NFS_O_WRONLY_STATE, &state->flags);
- clear_bit(NFS_OPEN_STATE, &state->flags);
- }
- spin_unlock(&state->owner->so_lock);
-}
-
static void nfs4_close_done(struct rpc_task *task, void *data)
{
struct nfs4_closedata *calldata = data;
@@ -2510,11 +2558,10 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
if (calldata->roc)
pnfs_roc_set_barrier(state->inode,
calldata->roc_barrier);
- nfs_set_open_stateid(state, &calldata->res.stateid, 0);
+ nfs_clear_open_stateid(state, &calldata->res.stateid, 0);
renew_lease(server, calldata->timestamp);
- nfs4_close_clear_stateid_flags(state,
- calldata->arg.fmode);
- break;
+ goto out_release;
+ case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_OLD_STATEID:
case -NFS4ERR_BAD_STATEID:
@@ -2522,9 +2569,13 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
if (calldata->arg.fmode == 0)
break;
default:
- if (nfs4_async_handle_error(task, server, state) == -EAGAIN)
+ if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
rpc_restart_call_prepare(task);
+ goto out_release;
+ }
}
+ nfs_clear_open_stateid(state, NULL, calldata->arg.fmode);
+out_release:
nfs_release_seqid(calldata->arg.seqid);
nfs_refresh_inode(calldata->inode, calldata->res.fattr);
dprintk("%s: done, ret = %d!\n", __func__, task->tk_status);
@@ -2697,6 +2748,10 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
nfs4_close_state(ctx->state, ctx->mode);
}
+#define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL)
+#define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL)
+#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_SECURITY_LABEL - 1UL)
+
static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
{
struct nfs4_server_caps_arg args = {
@@ -2712,13 +2767,27 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
if (status == 0) {
+ /* Sanity check the server answers */
+ switch (server->nfs_client->cl_minorversion) {
+ case 0:
+ res.attr_bitmask[1] &= FATTR4_WORD1_NFS40_MASK;
+ res.attr_bitmask[2] = 0;
+ break;
+ case 1:
+ res.attr_bitmask[2] &= FATTR4_WORD2_NFS41_MASK;
+ break;
+ case 2:
+ res.attr_bitmask[2] &= FATTR4_WORD2_NFS42_MASK;
+ }
memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS|
NFS_CAP_SYMLINKS|NFS_CAP_FILEID|
NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|
NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME|
- NFS_CAP_CTIME|NFS_CAP_MTIME);
- if (res.attr_bitmask[0] & FATTR4_WORD0_ACL)
+ NFS_CAP_CTIME|NFS_CAP_MTIME|
+ NFS_CAP_SECURITY_LABEL);
+ if (res.attr_bitmask[0] & FATTR4_WORD0_ACL &&
+ res.acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
server->caps |= NFS_CAP_ACLS;
if (res.has_links != 0)
server->caps |= NFS_CAP_HARDLINKS;
@@ -2746,14 +2815,12 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
#endif
memcpy(server->attr_bitmask_nl, res.attr_bitmask,
sizeof(server->attr_bitmask));
+ server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
- if (server->caps & NFS_CAP_SECURITY_LABEL) {
- server->attr_bitmask_nl[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
- res.attr_bitmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
- }
memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+ server->cache_consistency_bitmask[2] = 0;
server->acl_bitmask = res.acl_bitmask;
server->fh_expire_type = res.fh_expire_type;
}
@@ -2864,11 +2931,24 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
int status = -EPERM;
size_t i;
- for (i = 0; i < ARRAY_SIZE(flav_array); i++) {
- status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]);
- if (status == -NFS4ERR_WRONGSEC || status == -EACCES)
- continue;
- break;
+ if (server->auth_info.flavor_len > 0) {
+ /* try each flavor specified by user */
+ for (i = 0; i < server->auth_info.flavor_len; i++) {
+ status = nfs4_lookup_root_sec(server, fhandle, info,
+ server->auth_info.flavors[i]);
+ if (status == -NFS4ERR_WRONGSEC || status == -EACCES)
+ continue;
+ break;
+ }
+ } else {
+ /* no flavors specified by user, try default list */
+ for (i = 0; i < ARRAY_SIZE(flav_array); i++) {
+ status = nfs4_lookup_root_sec(server, fhandle, info,
+ flav_array[i]);
+ if (status == -NFS4ERR_WRONGSEC || status == -EACCES)
+ continue;
+ break;
+ }
}
/*
@@ -2910,9 +2990,6 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
status = nfs4_lookup_root(server, fhandle, info);
if (status != -NFS4ERR_WRONGSEC)
break;
- /* Did user force a 'sec=' mount option? */
- if (server->flags & NFS_MOUNT_SECFLAVOUR)
- break;
default:
status = nfs4_do_find_root_sec(server, fhandle, info);
}
@@ -2981,11 +3058,16 @@ static int nfs4_get_referral(struct rpc_clnt *client, struct inode *dir,
status = nfs4_proc_fs_locations(client, dir, name, locations, page);
if (status != 0)
goto out;
- /* Make sure server returned a different fsid for the referral */
+
+ /*
+ * If the fsid didn't change, this is a migration event, not a
+ * referral. Cause us to drop into the exception handler, which
+ * will kick off migration recovery.
+ */
if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) {
dprintk("%s: server did not return a different fsid for"
" a referral at %s\n", __func__, name->name);
- status = -EIO;
+ status = -NFS4ERR_MOVED;
goto out;
}
/* Fixup attributes for the nfs_lookup() call to nfs_fhget() */
@@ -3165,10 +3247,7 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir,
err = -EPERM;
if (client != *clnt)
goto out;
- /* No security negotiation if the user specified 'sec=' */
- if (NFS_SERVER(dir)->flags & NFS_MOUNT_SECFLAVOUR)
- goto out;
- client = nfs4_create_sec_client(client, dir, name);
+ client = nfs4_negotiate_security(client, dir, name);
if (IS_ERR(client))
return PTR_ERR(client);
@@ -3469,49 +3548,6 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
return 1;
}
-static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
- struct inode *new_dir, struct qstr *new_name)
-{
- struct nfs_server *server = NFS_SERVER(old_dir);
- struct nfs_renameargs arg = {
- .old_dir = NFS_FH(old_dir),
- .new_dir = NFS_FH(new_dir),
- .old_name = old_name,
- .new_name = new_name,
- };
- struct nfs_renameres res = {
- .server = server,
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENAME],
- .rpc_argp = &arg,
- .rpc_resp = &res,
- };
- int status = -ENOMEM;
-
- status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
- if (!status) {
- update_changeattr(old_dir, &res.old_cinfo);
- update_changeattr(new_dir, &res.new_cinfo);
- }
- return status;
-}
-
-static int nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
- struct inode *new_dir, struct qstr *new_name)
-{
- struct nfs4_exception exception = { };
- int err;
- do {
- err = _nfs4_proc_rename(old_dir, old_name,
- new_dir, new_name);
- trace_nfs4_rename(old_dir, old_name, new_dir, new_name, err);
- err = nfs4_handle_exception(NFS_SERVER(old_dir), err,
- &exception);
- } while (exception.retry);
- return err;
-}
-
static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
{
struct nfs_server *server = NFS_SERVER(inode);
@@ -3976,8 +4012,9 @@ static bool nfs4_stateid_is_current(nfs4_stateid *stateid,
{
nfs4_stateid current_stateid;
- if (nfs4_set_rw_stateid(&current_stateid, ctx, l_ctx, fmode))
- return false;
+ /* If the current stateid represents a lost lock, then exit */
+ if (nfs4_set_rw_stateid(&current_stateid, ctx, l_ctx, fmode) == -EIO)
+ return true;
return nfs4_stateid_match(stateid, &current_stateid);
}
@@ -3996,12 +4033,12 @@ static bool nfs4_error_stateid_expired(int err)
return false;
}
-void __nfs4_read_done_cb(struct nfs_read_data *data)
+void __nfs4_read_done_cb(struct nfs_pgio_data *data)
{
nfs_invalidate_atime(data->header->inode);
}
-static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_data *data)
{
struct nfs_server *server = NFS_SERVER(data->header->inode);
@@ -4018,7 +4055,7 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
}
static bool nfs4_read_stateid_changed(struct rpc_task *task,
- struct nfs_readargs *args)
+ struct nfs_pgio_args *args)
{
if (!nfs4_error_stateid_expired(task->tk_status) ||
@@ -4031,7 +4068,7 @@ static bool nfs4_read_stateid_changed(struct rpc_task *task,
return true;
}
-static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
{
dprintk("--> %s\n", __func__);
@@ -4040,19 +4077,19 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
return -EAGAIN;
if (nfs4_read_stateid_changed(task, &data->args))
return -EAGAIN;
- return data->read_done_cb ? data->read_done_cb(task, data) :
+ return data->pgio_done_cb ? data->pgio_done_cb(task, data) :
nfs4_read_done_cb(task, data);
}
-static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
+static void nfs4_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
{
data->timestamp = jiffies;
- data->read_done_cb = nfs4_read_done_cb;
+ data->pgio_done_cb = nfs4_read_done_cb;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
}
-static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
{
if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
&data->args.seq_args,
@@ -4060,14 +4097,14 @@ static int nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_dat
task))
return 0;
if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
- data->args.lock_context, FMODE_READ) == -EIO)
+ data->args.lock_context, data->header->rw_ops->rw_mode) == -EIO)
return -EIO;
if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
return -EIO;
return 0;
}
-static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_pgio_data *data)
{
struct inode *inode = data->header->inode;
@@ -4084,7 +4121,7 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data
}
static bool nfs4_write_stateid_changed(struct rpc_task *task,
- struct nfs_writeargs *args)
+ struct nfs_pgio_args *args)
{
if (!nfs4_error_stateid_expired(task->tk_status) ||
@@ -4097,18 +4134,18 @@ static bool nfs4_write_stateid_changed(struct rpc_task *task,
return true;
}
-static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
{
if (!nfs4_sequence_done(task, &data->res.seq_res))
return -EAGAIN;
if (nfs4_write_stateid_changed(task, &data->args))
return -EAGAIN;
- return data->write_done_cb ? data->write_done_cb(task, data) :
+ return data->pgio_done_cb ? data->pgio_done_cb(task, data) :
nfs4_write_done_cb(task, data);
}
static
-bool nfs4_write_need_cache_consistency_data(const struct nfs_write_data *data)
+bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data)
{
const struct nfs_pgio_header *hdr = data->header;
@@ -4121,7 +4158,7 @@ bool nfs4_write_need_cache_consistency_data(const struct nfs_write_data *data)
return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0;
}
-static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
+static void nfs4_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
{
struct nfs_server *server = NFS_SERVER(data->header->inode);
@@ -4131,8 +4168,8 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
} else
data->args.bitmask = server->cache_consistency_bitmask;
- if (!data->write_done_cb)
- data->write_done_cb = nfs4_write_done_cb;
+ if (!data->pgio_done_cb)
+ data->pgio_done_cb = nfs4_write_done_cb;
data->res.server = server;
data->timestamp = jiffies;
@@ -4140,21 +4177,6 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
}
-static int nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
-{
- if (nfs4_setup_sequence(NFS_SERVER(data->header->inode),
- &data->args.seq_args,
- &data->res.seq_res,
- task))
- return 0;
- if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context,
- data->args.lock_context, FMODE_WRITE) == -EIO)
- return -EIO;
- if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags)))
- return -EIO;
- return 0;
-}
-
static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
{
nfs4_setup_sequence(NFS_SERVER(data->inode),
@@ -4220,7 +4242,13 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
unsigned long timestamp = data->timestamp;
trace_nfs4_renew_async(clp, task->tk_status);
- if (task->tk_status < 0) {
+ switch (task->tk_status) {
+ case 0:
+ break;
+ case -NFS4ERR_LEASE_MOVED:
+ nfs4_schedule_lease_moved_recovery(clp);
+ break;
+ default:
/* Unless we're shutting down, schedule state recovery! */
if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0)
return;
@@ -4279,9 +4307,7 @@ static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
static inline int nfs4_server_supports_acls(struct nfs_server *server)
{
- return (server->caps & NFS_CAP_ACLS)
- && (server->acl_bitmask & ACL4_SUPPORT_ALLOW_ACL)
- && (server->acl_bitmask & ACL4_SUPPORT_DENY_ACL);
+ return server->caps & NFS_CAP_ACLS;
}
/* Assuming that XATTR_SIZE_MAX is a multiple of PAGE_SIZE, and that
@@ -4574,7 +4600,7 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf,
struct nfs4_label label = {0, 0, buflen, buf};
u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
- struct nfs4_getattr_arg args = {
+ struct nfs4_getattr_arg arg = {
.fh = NFS_FH(inode),
.bitmask = bitmask,
};
@@ -4585,14 +4611,14 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf,
};
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETATTR],
- .rpc_argp = &args,
+ .rpc_argp = &arg,
.rpc_resp = &res,
};
int ret;
nfs_fattr_init(&fattr);
- ret = rpc_call_sync(server->client, &msg, 0);
+ ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 0);
if (ret)
return ret;
if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL))
@@ -4629,7 +4655,7 @@ static int _nfs4_do_set_security_label(struct inode *inode,
struct iattr sattr = {0};
struct nfs_server *server = NFS_SERVER(inode);
const u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
- struct nfs_setattrargs args = {
+ struct nfs_setattrargs arg = {
.fh = NFS_FH(inode),
.iap = &sattr,
.server = server,
@@ -4643,14 +4669,14 @@ static int _nfs4_do_set_security_label(struct inode *inode,
};
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
- .rpc_argp = &args,
+ .rpc_argp = &arg,
.rpc_resp = &res,
};
int status;
- nfs4_stateid_copy(&args.stateid, &zero_stateid);
+ nfs4_stateid_copy(&arg.stateid, &zero_stateid);
- status = rpc_call_sync(server->client, &msg, 0);
+ status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
if (status)
dprintk("%s failed: %d\n", __func__, status);
@@ -4734,17 +4760,24 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
if (state == NULL)
break;
if (nfs4_schedule_stateid_recovery(server, state) < 0)
- goto stateid_invalid;
+ goto recovery_failed;
goto wait_on_recovery;
case -NFS4ERR_EXPIRED:
if (state != NULL) {
if (nfs4_schedule_stateid_recovery(server, state) < 0)
- goto stateid_invalid;
+ goto recovery_failed;
}
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_STALE_CLIENTID:
nfs4_schedule_lease_recovery(clp);
goto wait_on_recovery;
+ case -NFS4ERR_MOVED:
+ if (nfs4_schedule_migration_recovery(server) < 0)
+ goto recovery_failed;
+ goto wait_on_recovery;
+ case -NFS4ERR_LEASE_MOVED:
+ nfs4_schedule_lease_moved_recovery(clp);
+ goto wait_on_recovery;
#if defined(CONFIG_NFS_V4_1)
case -NFS4ERR_BADSESSION:
case -NFS4ERR_BADSLOT:
@@ -4756,29 +4789,28 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
dprintk("%s ERROR %d, Reset session\n", __func__,
task->tk_status);
nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
- task->tk_status = 0;
- return -EAGAIN;
+ goto wait_on_recovery;
#endif /* CONFIG_NFS_V4_1 */
case -NFS4ERR_DELAY:
nfs_inc_server_stats(server, NFSIOS_DELAY);
case -NFS4ERR_GRACE:
rpc_delay(task, NFS4_POLL_RETRY_MAX);
- task->tk_status = 0;
- return -EAGAIN;
case -NFS4ERR_RETRY_UNCACHED_REP:
case -NFS4ERR_OLD_STATEID:
- task->tk_status = 0;
- return -EAGAIN;
+ goto restart_call;
}
task->tk_status = nfs4_map_errors(task->tk_status);
return 0;
-stateid_invalid:
+recovery_failed:
task->tk_status = -EIO;
return 0;
wait_on_recovery:
rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
+ if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+ goto recovery_failed;
+restart_call:
task->tk_status = 0;
return -EAGAIN;
}
@@ -4835,6 +4867,20 @@ nfs4_init_uniform_client_string(const struct nfs_client *clp,
nodename);
}
+/*
+ * nfs4_callback_up_net() starts only "tcp" and "tcp6" callback
+ * services. Advertise one based on the address family of the
+ * clientaddr.
+ */
+static unsigned int
+nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len)
+{
+ if (strchr(clp->cl_ipaddr, ':') != NULL)
+ return scnprintf(buf, len, "tcp6");
+ else
+ return scnprintf(buf, len, "tcp");
+}
+
/**
* nfs4_proc_setclientid - Negotiate client ID
* @clp: state data structure
@@ -4876,12 +4922,10 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
setclientid.sc_name,
sizeof(setclientid.sc_name));
/* cb_client4 */
- rcu_read_lock();
- setclientid.sc_netid_len = scnprintf(setclientid.sc_netid,
- sizeof(setclientid.sc_netid), "%s",
- rpc_peeraddr2str(clp->cl_rpcclient,
- RPC_DISPLAY_NETID));
- rcu_read_unlock();
+ setclientid.sc_netid_len =
+ nfs4_init_callback_netid(clp,
+ setclientid.sc_netid,
+ sizeof(setclientid.sc_netid));
setclientid.sc_uaddr_len = scnprintf(setclientid.sc_uaddr,
sizeof(setclientid.sc_uaddr), "%s.%u.%u",
clp->cl_ipaddr, port >> 8, port & 255);
@@ -4942,11 +4986,17 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status);
switch (task->tk_status) {
- case -NFS4ERR_STALE_STATEID:
- case -NFS4ERR_EXPIRED:
case 0:
renew_lease(data->res.server, data->timestamp);
break;
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_OLD_STATEID:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_EXPIRED:
+ task->tk_status = 0;
+ break;
default:
if (nfs4_async_handle_error(task, data->res.server, NULL) ==
-EAGAIN) {
@@ -5105,6 +5155,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
status = 0;
}
request->fl_ops->fl_release_private(request);
+ request->fl_ops = NULL;
out:
return status;
}
@@ -5776,21 +5827,36 @@ struct nfs_release_lockowner_data {
struct nfs4_lock_state *lsp;
struct nfs_server *server;
struct nfs_release_lockowner_args args;
- struct nfs4_sequence_args seq_args;
- struct nfs4_sequence_res seq_res;
+ struct nfs_release_lockowner_res res;
+ unsigned long timestamp;
};
static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata)
{
struct nfs_release_lockowner_data *data = calldata;
nfs40_setup_sequence(data->server,
- &data->seq_args, &data->seq_res, task);
+ &data->args.seq_args, &data->res.seq_res, task);
+ data->timestamp = jiffies;
}
static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
{
struct nfs_release_lockowner_data *data = calldata;
- nfs40_sequence_done(task, &data->seq_res);
+ struct nfs_server *server = data->server;
+
+ nfs40_sequence_done(task, &data->res.seq_res);
+
+ switch (task->tk_status) {
+ case 0:
+ renew_lease(server, data->timestamp);
+ break;
+ case -NFS4ERR_STALE_CLIENTID:
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_LEASE_MOVED:
+ case -NFS4ERR_DELAY:
+ if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN)
+ rpc_restart_call_prepare(task);
+ }
}
static void nfs4_release_lockowner_release(void *calldata)
@@ -5819,7 +5885,6 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
data = kmalloc(sizeof(*data), GFP_NOFS);
if (!data)
return -ENOMEM;
- nfs4_init_sequence(&data->seq_args, &data->seq_res, 0);
data->lsp = lsp;
data->server = server;
data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
@@ -5827,6 +5892,8 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
data->args.lock_owner.s_dev = server->s_dev;
msg.rpc_argp = &data->args;
+ msg.rpc_resp = &data->res;
+ nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);
return 0;
}
@@ -5989,6 +6056,283 @@ int nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir,
return err;
}
+/*
+ * This operation also signals the server that this client is
+ * performing migration recovery. The server can stop returning
+ * NFS4ERR_LEASE_MOVED to this client. A RENEW operation is
+ * appended to this compound to identify the client ID which is
+ * performing recovery.
+ */
+static int _nfs40_proc_get_locations(struct inode *inode,
+ struct nfs4_fs_locations *locations,
+ struct page *page, struct rpc_cred *cred)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct rpc_clnt *clnt = server->client;
+ u32 bitmask[2] = {
+ [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
+ };
+ struct nfs4_fs_locations_arg args = {
+ .clientid = server->nfs_client->cl_clientid,
+ .fh = NFS_FH(inode),
+ .page = page,
+ .bitmask = bitmask,
+ .migration = 1, /* skip LOOKUP */
+ .renew = 1, /* append RENEW */
+ };
+ struct nfs4_fs_locations_res res = {
+ .fs_locations = locations,
+ .migration = 1,
+ .renew = 1,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ .rpc_cred = cred,
+ };
+ unsigned long now = jiffies;
+ int status;
+
+ nfs_fattr_init(&locations->fattr);
+ locations->server = server;
+ locations->nlocations = 0;
+
+ nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
+ nfs4_set_sequence_privileged(&args.seq_args);
+ status = nfs4_call_sync_sequence(clnt, server, &msg,
+ &args.seq_args, &res.seq_res);
+ if (status)
+ return status;
+
+ renew_lease(server, now);
+ return 0;
+}
+
+#ifdef CONFIG_NFS_V4_1
+
+/*
+ * This operation also signals the server that this client is
+ * performing migration recovery. The server can stop asserting
+ * SEQ4_STATUS_LEASE_MOVED for this client. The client ID
+ * performing this operation is identified in the SEQUENCE
+ * operation in this compound.
+ *
+ * When the client supports GETATTR(fs_locations_info), it can
+ * be plumbed in here.
+ */
+static int _nfs41_proc_get_locations(struct inode *inode,
+ struct nfs4_fs_locations *locations,
+ struct page *page, struct rpc_cred *cred)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct rpc_clnt *clnt = server->client;
+ u32 bitmask[2] = {
+ [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS,
+ };
+ struct nfs4_fs_locations_arg args = {
+ .fh = NFS_FH(inode),
+ .page = page,
+ .bitmask = bitmask,
+ .migration = 1, /* skip LOOKUP */
+ };
+ struct nfs4_fs_locations_res res = {
+ .fs_locations = locations,
+ .migration = 1,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FS_LOCATIONS],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ .rpc_cred = cred,
+ };
+ int status;
+
+ nfs_fattr_init(&locations->fattr);
+ locations->server = server;
+ locations->nlocations = 0;
+
+ nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
+ nfs4_set_sequence_privileged(&args.seq_args);
+ status = nfs4_call_sync_sequence(clnt, server, &msg,
+ &args.seq_args, &res.seq_res);
+ if (status == NFS4_OK &&
+ res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED)
+ status = -NFS4ERR_LEASE_MOVED;
+ return status;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+/**
+ * nfs4_proc_get_locations - discover locations for a migrated FSID
+ * @inode: inode on FSID that is migrating
+ * @locations: result of query
+ * @page: buffer
+ * @cred: credential to use for this operation
+ *
+ * Returns NFS4_OK on success, a negative NFS4ERR status code if the
+ * operation failed, or a negative errno if a local error occurred.
+ *
+ * On success, "locations" is filled in, but if the server has
+ * no locations information, NFS_ATTR_FATTR_V4_LOCATIONS is not
+ * asserted.
+ *
+ * -NFS4ERR_LEASE_MOVED is returned if the server still has leases
+ * from this client that require migration recovery.
+ */
+int nfs4_proc_get_locations(struct inode *inode,
+ struct nfs4_fs_locations *locations,
+ struct page *page, struct rpc_cred *cred)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_client *clp = server->nfs_client;
+ const struct nfs4_mig_recovery_ops *ops =
+ clp->cl_mvops->mig_recovery_ops;
+ struct nfs4_exception exception = { };
+ int status;
+
+ dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__,
+ (unsigned long long)server->fsid.major,
+ (unsigned long long)server->fsid.minor,
+ clp->cl_hostname);
+ nfs_display_fhandle(NFS_FH(inode), __func__);
+
+ do {
+ status = ops->get_locations(inode, locations, page, cred);
+ if (status != -NFS4ERR_DELAY)
+ break;
+ nfs4_handle_exception(server, status, &exception);
+ } while (exception.retry);
+ return status;
+}
+
+/*
+ * This operation also signals the server that this client is
+ * performing "lease moved" recovery. The server can stop
+ * returning NFS4ERR_LEASE_MOVED to this client. A RENEW operation
+ * is appended to this compound to identify the client ID which is
+ * performing recovery.
+ */
+static int _nfs40_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+ struct rpc_clnt *clnt = server->client;
+ struct nfs4_fsid_present_arg args = {
+ .fh = NFS_FH(inode),
+ .clientid = clp->cl_clientid,
+ .renew = 1, /* append RENEW */
+ };
+ struct nfs4_fsid_present_res res = {
+ .renew = 1,
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ .rpc_cred = cred,
+ };
+ unsigned long now = jiffies;
+ int status;
+
+ res.fh = nfs_alloc_fhandle();
+ if (res.fh == NULL)
+ return -ENOMEM;
+
+ nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
+ nfs4_set_sequence_privileged(&args.seq_args);
+ status = nfs4_call_sync_sequence(clnt, server, &msg,
+ &args.seq_args, &res.seq_res);
+ nfs_free_fhandle(res.fh);
+ if (status)
+ return status;
+
+ do_renew_lease(clp, now);
+ return 0;
+}
+
+#ifdef CONFIG_NFS_V4_1
+
+/*
+ * This operation also signals the server that this client is
+ * performing "lease moved" recovery. The server can stop asserting
+ * SEQ4_STATUS_LEASE_MOVED for this client. The client ID performing
+ * this operation is identified in the SEQUENCE operation in this
+ * compound.
+ */
+static int _nfs41_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct rpc_clnt *clnt = server->client;
+ struct nfs4_fsid_present_arg args = {
+ .fh = NFS_FH(inode),
+ };
+ struct nfs4_fsid_present_res res = {
+ };
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_FSID_PRESENT],
+ .rpc_argp = &args,
+ .rpc_resp = &res,
+ .rpc_cred = cred,
+ };
+ int status;
+
+ res.fh = nfs_alloc_fhandle();
+ if (res.fh == NULL)
+ return -ENOMEM;
+
+ nfs4_init_sequence(&args.seq_args, &res.seq_res, 0);
+ nfs4_set_sequence_privileged(&args.seq_args);
+ status = nfs4_call_sync_sequence(clnt, server, &msg,
+ &args.seq_args, &res.seq_res);
+ nfs_free_fhandle(res.fh);
+ if (status == NFS4_OK &&
+ res.seq_res.sr_status_flags & SEQ4_STATUS_LEASE_MOVED)
+ status = -NFS4ERR_LEASE_MOVED;
+ return status;
+}
+
+#endif /* CONFIG_NFS_V4_1 */
+
+/**
+ * nfs4_proc_fsid_present - Is this FSID present or absent on server?
+ * @inode: inode on FSID to check
+ * @cred: credential to use for this operation
+ *
+ * Server indicates whether the FSID is present, moved, or not
+ * recognized. This operation is necessary to clear a LEASE_MOVED
+ * condition for this client ID.
+ *
+ * Returns NFS4_OK if the FSID is present on this server,
+ * -NFS4ERR_MOVED if the FSID is no longer present, a negative
+ * NFS4ERR code if some error occurred on the server, or a
+ * negative errno if a local failure occurred.
+ */
+int nfs4_proc_fsid_present(struct inode *inode, struct rpc_cred *cred)
+{
+ struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_client *clp = server->nfs_client;
+ const struct nfs4_mig_recovery_ops *ops =
+ clp->cl_mvops->mig_recovery_ops;
+ struct nfs4_exception exception = { };
+ int status;
+
+ dprintk("%s: FSID %llx:%llx on \"%s\"\n", __func__,
+ (unsigned long long)server->fsid.major,
+ (unsigned long long)server->fsid.minor,
+ clp->cl_hostname);
+ nfs_display_fhandle(NFS_FH(inode), __func__);
+
+ do {
+ status = ops->fsid_present(inode, cred);
+ if (status != -NFS4ERR_DELAY)
+ break;
+ nfs4_handle_exception(server, status, &exception);
+ } while (exception.retry);
+ return status;
+}
+
/**
* If 'use_integrity' is true and the state managment nfs_client
* cl_rpcclient is using krb5i/p, use the integrity protected cl_rpcclient
@@ -6275,8 +6619,14 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
struct nfs41_exchange_id_args args = {
.verifier = &verifier,
.client = clp,
+#ifdef CONFIG_NFS_V4_1_MIGRATION
+ .flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+ EXCHGID4_FLAG_BIND_PRINC_STATEID |
+ EXCHGID4_FLAG_SUPP_MOVED_MIGR,
+#else
.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
- EXCHGID4_FLAG_BIND_PRINC_STATEID,
+ EXCHGID4_FLAG_BIND_PRINC_STATEID,
+#endif
};
struct nfs41_exchange_id_res res = {
0
@@ -7055,9 +7405,9 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
struct nfs_server *server = NFS_SERVER(inode);
struct pnfs_layout_hdr *lo;
struct nfs4_state *state = NULL;
- unsigned long timeo, giveup;
+ unsigned long timeo, now, giveup;
- dprintk("--> %s\n", __func__);
+ dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status);
if (!nfs41_sequence_done(task, &lgp->res.seq_res))
goto out;
@@ -7065,12 +7415,38 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
switch (task->tk_status) {
case 0:
goto out;
+ /*
+ * NFS4ERR_LAYOUTTRYLATER is a conflict with another client
+ * (or clients) writing to the same RAID stripe
+ */
case -NFS4ERR_LAYOUTTRYLATER:
+ /*
+ * NFS4ERR_RECALLCONFLICT is when conflict with self (must recall
+ * existing layout before getting a new one).
+ */
case -NFS4ERR_RECALLCONFLICT:
timeo = rpc_get_timeout(task->tk_client);
giveup = lgp->args.timestamp + timeo;
- if (time_after(giveup, jiffies))
- task->tk_status = -NFS4ERR_DELAY;
+ now = jiffies;
+ if (time_after(giveup, now)) {
+ unsigned long delay;
+
+ /* Delay for:
+ * - Not less then NFS4_POLL_RETRY_MIN.
+ * - One last time a jiffie before we give up
+ * - exponential backoff (time_now minus start_attempt)
+ */
+ delay = max_t(unsigned long, NFS4_POLL_RETRY_MIN,
+ min((giveup - now - 1),
+ now - lgp->args.timestamp));
+
+ dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n",
+ __func__, delay);
+ rpc_delay(task, delay);
+ task->tk_status = 0;
+ rpc_restart_call_prepare(task);
+ goto out; /* Do not call nfs4_async_handle_error() */
+ }
break;
case -NFS4ERR_EXPIRED:
case -NFS4ERR_BAD_STATEID:
@@ -7244,7 +7620,14 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
return;
server = NFS_SERVER(lrp->args.inode);
- if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+ switch (task->tk_status) {
+ default:
+ task->tk_status = 0;
+ case 0:
+ break;
+ case -NFS4ERR_DELAY:
+ if (nfs4_async_handle_error(task, server, NULL) != -EAGAIN)
+ break;
rpc_restart_call_prepare(task);
return;
}
@@ -7419,10 +7802,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
case -NFS4ERR_BADLAYOUT: /* no layout */
case -NFS4ERR_GRACE: /* loca_recalim always false */
task->tk_status = 0;
- break;
case 0:
- nfs_post_op_update_inode_force_wcc(data->args.inode,
- data->res.fattr);
break;
default:
if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
@@ -7437,6 +7817,8 @@ static void nfs4_layoutcommit_release(void *calldata)
struct nfs4_layoutcommit_data *data = calldata;
pnfs_cleanup_layoutcommit(data);
+ nfs_post_op_update_inode_force_wcc(data->args.inode,
+ data->res.fattr);
put_rpccred(data->cred);
kfree(data);
}
@@ -7559,7 +7941,7 @@ nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
switch (err) {
case 0:
case -NFS4ERR_WRONGSEC:
- case -NFS4ERR_NOTSUPP:
+ case -ENOTSUPP:
goto out;
default:
err = nfs4_handle_exception(server, err, &exception);
@@ -7593,7 +7975,7 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
* Fall back on "guess and check" method if
* the server doesn't support SECINFO_NO_NAME
*/
- if (err == -NFS4ERR_WRONGSEC || err == -NFS4ERR_NOTSUPP) {
+ if (err == -NFS4ERR_WRONGSEC || err == -ENOTSUPP) {
err = nfs4_find_root_sec(server, fhandle, info);
goto out_freepage;
}
@@ -7615,6 +7997,9 @@ nfs41_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
break;
}
+ if (!nfs_auth_info_match(&server->auth_info, flavor))
+ flavor = RPC_AUTH_MAXFLAVOR;
+
if (flavor != RPC_AUTH_MAXFLAVOR) {
err = nfs4_lookup_root_sec(server, fhandle,
info, flavor);
@@ -7886,6 +8271,18 @@ static const struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = {
};
#endif
+static const struct nfs4_mig_recovery_ops nfs40_mig_recovery_ops = {
+ .get_locations = _nfs40_proc_get_locations,
+ .fsid_present = _nfs40_proc_fsid_present,
+};
+
+#if defined(CONFIG_NFS_V4_1)
+static const struct nfs4_mig_recovery_ops nfs41_mig_recovery_ops = {
+ .get_locations = _nfs41_proc_get_locations,
+ .fsid_present = _nfs41_proc_fsid_present,
+};
+#endif /* CONFIG_NFS_V4_1 */
+
static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
.minor_version = 0,
.init_caps = NFS_CAP_READDIRPLUS
@@ -7901,6 +8298,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
.nograce_recovery_ops = &nfs40_nograce_recovery_ops,
.state_renewal_ops = &nfs40_state_renewal_ops,
+ .mig_recovery_ops = &nfs40_mig_recovery_ops,
};
#if defined(CONFIG_NFS_V4_1)
@@ -7921,6 +8319,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
.state_renewal_ops = &nfs41_state_renewal_ops,
+ .mig_recovery_ops = &nfs41_mig_recovery_ops,
};
#endif
@@ -8004,7 +8403,6 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
.unlink_setup = nfs4_proc_unlink_setup,
.unlink_rpc_prepare = nfs4_proc_unlink_rpc_prepare,
.unlink_done = nfs4_proc_unlink_done,
- .rename = nfs4_proc_rename,
.rename_setup = nfs4_proc_rename_setup,
.rename_rpc_prepare = nfs4_proc_rename_rpc_prepare,
.rename_done = nfs4_proc_rename_done,
@@ -8019,13 +8417,10 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
.pathconf = nfs4_proc_pathconf,
.set_capabilities = nfs4_server_capabilities,
.decode_dirent = nfs4_decode_dirent,
+ .pgio_rpc_prepare = nfs4_proc_pgio_rpc_prepare,
.read_setup = nfs4_proc_read_setup,
- .read_pageio_init = pnfs_pageio_init_read,
- .read_rpc_prepare = nfs4_proc_read_rpc_prepare,
.read_done = nfs4_read_done,
.write_setup = nfs4_proc_write_setup,
- .write_pageio_init = pnfs_pageio_init_write,
- .write_rpc_prepare = nfs4_proc_write_rpc_prepare,
.write_done = nfs4_write_done,
.commit_setup = nfs4_proc_commit_setup,
.commit_rpc_prepare = nfs4_proc_commit_rpc_prepare,
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index cf883c7ae05..e799dc3c3b1 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -231,14 +231,23 @@ out:
return ret;
}
+/*
+ * nfs4_release_slot_table - release all slot table entries
+ */
+static void nfs4_release_slot_table(struct nfs4_slot_table *tbl)
+{
+ nfs4_shrink_slot_table(tbl, 0);
+}
+
/**
- * nfs4_release_slot_table - release resources attached to a slot table
+ * nfs4_shutdown_slot_table - release resources attached to a slot table
* @tbl: slot table to shut down
*
*/
-void nfs4_release_slot_table(struct nfs4_slot_table *tbl)
+void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl)
{
- nfs4_shrink_slot_table(tbl, 0);
+ nfs4_release_slot_table(tbl);
+ rpc_destroy_wait_queue(&tbl->slot_tbl_waitq);
}
/**
@@ -422,7 +431,7 @@ void nfs41_update_target_slotid(struct nfs4_slot_table *tbl,
spin_unlock(&tbl->slot_tbl_lock);
}
-static void nfs4_destroy_session_slot_tables(struct nfs4_session *session)
+static void nfs4_release_session_slot_tables(struct nfs4_session *session)
{
nfs4_release_slot_table(&session->fc_slot_table);
nfs4_release_slot_table(&session->bc_slot_table);
@@ -450,7 +459,7 @@ int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
if (status && tbl->slots == NULL)
/* Fore and back channel share a connection so get
* both slot tables or neither */
- nfs4_destroy_session_slot_tables(ses);
+ nfs4_release_session_slot_tables(ses);
return status;
}
@@ -470,6 +479,12 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp)
return session;
}
+static void nfs4_destroy_session_slot_tables(struct nfs4_session *session)
+{
+ nfs4_shutdown_slot_table(&session->fc_slot_table);
+ nfs4_shutdown_slot_table(&session->bc_slot_table);
+}
+
void nfs4_destroy_session(struct nfs4_session *session)
{
struct rpc_xprt *xprt;
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 23230610065..b34ada9bc6a 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -74,7 +74,7 @@ enum nfs4_session_state {
extern int nfs4_setup_slot_table(struct nfs4_slot_table *tbl,
unsigned int max_reqs, const char *queue);
-extern void nfs4_release_slot_table(struct nfs4_slot_table *tbl);
+extern void nfs4_shutdown_slot_table(struct nfs4_slot_table *tbl);
extern struct nfs4_slot *nfs4_alloc_slot(struct nfs4_slot_table *tbl);
extern void nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot);
extern void nfs4_slot_tbl_drain_complete(struct nfs4_slot_table *tbl);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index cc14cbb78b7..848f6853c59 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -239,14 +239,12 @@ static void nfs4_end_drain_session(struct nfs_client *clp)
}
}
-#if defined(CONFIG_NFS_V4_1)
-
static int nfs4_drain_slot_tbl(struct nfs4_slot_table *tbl)
{
set_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state);
spin_lock(&tbl->slot_tbl_lock);
if (tbl->highest_used_slotid != NFS4_NO_SLOT) {
- INIT_COMPLETION(tbl->complete);
+ reinit_completion(&tbl->complete);
spin_unlock(&tbl->slot_tbl_lock);
return wait_for_completion_interruptible(&tbl->complete);
}
@@ -270,6 +268,8 @@ static int nfs4_begin_drain_session(struct nfs_client *clp)
return nfs4_drain_slot_tbl(&ses->fc_slot_table);
}
+#if defined(CONFIG_NFS_V4_1)
+
static int nfs41_setup_state_renewal(struct nfs_client *clp)
{
int status;
@@ -974,9 +974,6 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
nfs4_stateid_copy(dst, &lsp->ls_stateid);
ret = 0;
- smp_rmb();
- if (!list_empty(&lsp->ls_seqid.list))
- ret = -EWOULDBLOCK;
}
spin_unlock(&state->state_lock);
nfs4_put_lock_state(lsp);
@@ -984,10 +981,9 @@ out:
return ret;
}
-static int nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
+static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
{
const nfs4_stateid *src;
- int ret;
int seq;
do {
@@ -996,12 +992,7 @@ static int nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state)
if (test_bit(NFS_OPEN_STATE, &state->flags))
src = &state->open_stateid;
nfs4_stateid_copy(dst, src);
- ret = 0;
- smp_rmb();
- if (!list_empty(&state->owner->so_seqid.list))
- ret = -EWOULDBLOCK;
} while (read_seqretry(&state->seqlock, seq));
- return ret;
}
/*
@@ -1015,15 +1006,19 @@ int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state,
if (ret == -EIO)
/* A lost lock - don't even consider delegations */
goto out;
- if (nfs4_copy_delegation_stateid(dst, state->inode, fmode))
+ /* returns true if delegation stateid found and copied */
+ if (nfs4_copy_delegation_stateid(dst, state->inode, fmode)) {
+ ret = 0;
goto out;
+ }
if (ret != -ENOENT)
/* nfs4_copy_delegation_stateid() didn't over-write
* dst, so it still has the lock stateid which we now
* choose to use.
*/
goto out;
- ret = nfs4_copy_open_stateid(dst, state);
+ nfs4_copy_open_stateid(dst, state);
+ ret = 0;
out:
if (nfs_server_capable(state->inode, NFS_CAP_STATEID_NFSV41))
dst->seqid = 0;
@@ -1071,7 +1066,7 @@ void nfs_free_seqid(struct nfs_seqid *seqid)
/*
* Increment the seqid if the OPEN/OPEN_DOWNGRADE/CLOSE succeeded, or
* failed with a seqid incrementing error -
- * see comments nfs_fs.h:seqid_mutating_error()
+ * see comments nfs4.h:seqid_mutating_error()
*/
static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
{
@@ -1116,7 +1111,7 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
/*
* Increment the seqid if the LOCK/LOCKU succeeded, or
* failed with a seqid incrementing error -
- * see comments nfs_fs.h:seqid_mutating_error()
+ * see comments nfs4.h:seqid_mutating_error()
*/
void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)
{
@@ -1145,9 +1140,9 @@ static int nfs4_run_state_manager(void *);
static void nfs4_clear_state_manager_bit(struct nfs_client *clp)
{
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING);
rpc_wake_up(&clp->cl_rpcwaitq);
}
@@ -1197,20 +1192,74 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp)
}
EXPORT_SYMBOL_GPL(nfs4_schedule_lease_recovery);
+/**
+ * nfs4_schedule_migration_recovery - trigger migration recovery
+ *
+ * @server: FSID that is migrating
+ *
+ * Returns zero if recovery has started, otherwise a negative NFS4ERR
+ * value is returned.
+ */
+int nfs4_schedule_migration_recovery(const struct nfs_server *server)
+{
+ struct nfs_client *clp = server->nfs_client;
+
+ if (server->fh_expire_type != NFS4_FH_PERSISTENT) {
+ pr_err("NFS: volatile file handles not supported (server %s)\n",
+ clp->cl_hostname);
+ return -NFS4ERR_IO;
+ }
+
+ if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+ return -NFS4ERR_IO;
+
+ dprintk("%s: scheduling migration recovery for (%llx:%llx) on %s\n",
+ __func__,
+ (unsigned long long)server->fsid.major,
+ (unsigned long long)server->fsid.minor,
+ clp->cl_hostname);
+
+ set_bit(NFS_MIG_IN_TRANSITION,
+ &((struct nfs_server *)server)->mig_status);
+ set_bit(NFS4CLNT_MOVED, &clp->cl_state);
+
+ nfs4_schedule_state_manager(clp);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_migration_recovery);
+
+/**
+ * nfs4_schedule_lease_moved_recovery - start lease-moved recovery
+ *
+ * @clp: server to check for moved leases
+ *
+ */
+void nfs4_schedule_lease_moved_recovery(struct nfs_client *clp)
+{
+ dprintk("%s: scheduling lease-moved recovery for client ID %llx on %s\n",
+ __func__, clp->cl_clientid, clp->cl_hostname);
+
+ set_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state);
+ nfs4_schedule_state_manager(clp);
+}
+EXPORT_SYMBOL_GPL(nfs4_schedule_lease_moved_recovery);
+
int nfs4_wait_clnt_recover(struct nfs_client *clp)
{
int res;
might_sleep();
+ atomic_inc(&clp->cl_count);
res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
nfs_wait_bit_killable, TASK_KILLABLE);
if (res)
- return res;
-
+ goto out;
if (clp->cl_cons_state < 0)
- return clp->cl_cons_state;
- return 0;
+ res = clp->cl_cons_state;
+out:
+ nfs_put_client(clp);
+ return res;
}
int nfs4_client_recover_expired_lease(struct nfs_client *clp)
@@ -1267,7 +1316,7 @@ static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_st
return 1;
}
-static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
+int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
{
set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -1375,8 +1424,8 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
goto out;
default:
- printk(KERN_ERR "NFS: %s: unhandled error %d. "
- "Zeroing state\n", __func__, status);
+ printk(KERN_ERR "NFS: %s: unhandled error %d\n",
+ __func__, status);
case -ENOMEM:
case -NFS4ERR_DENIED:
case -NFS4ERR_RECLAIM_BAD:
@@ -1407,7 +1456,7 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs
* server that doesn't support a grace period.
*/
spin_lock(&sp->so_lock);
- write_seqcount_begin(&sp->so_reclaim_seqcount);
+ raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
restart:
list_for_each_entry(state, &sp->so_states, open_states) {
if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
@@ -1422,7 +1471,7 @@ restart:
if (status >= 0) {
status = nfs4_reclaim_locks(state, ops);
if (status >= 0) {
- if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) {
+ if (!test_bit(NFS_DELEGATED_STATE, &state->flags)) {
spin_lock(&state->state_lock);
list_for_each_entry(lock, &state->lock_states, ls_locks) {
if (!test_bit(NFS_LOCK_INITIALIZED, &lock->ls_flags))
@@ -1439,15 +1488,12 @@ restart:
}
switch (status) {
default:
- printk(KERN_ERR "NFS: %s: unhandled error %d. "
- "Zeroing state\n", __func__, status);
+ printk(KERN_ERR "NFS: %s: unhandled error %d\n",
+ __func__, status);
case -ENOENT:
case -ENOMEM:
case -ESTALE:
- /*
- * Open state on this file cannot be recovered
- * All we can do is revert to using the zero stateid.
- */
+ /* Open state on this file cannot be recovered */
nfs4_state_mark_recovery_failed(state, status);
break;
case -EAGAIN:
@@ -1473,13 +1519,13 @@ restart:
spin_lock(&sp->so_lock);
goto restart;
}
- write_seqcount_end(&sp->so_reclaim_seqcount);
+ raw_write_seqcount_end(&sp->so_reclaim_seqcount);
spin_unlock(&sp->so_lock);
return 0;
out_err:
nfs4_put_open_state(state);
spin_lock(&sp->so_lock);
- write_seqcount_end(&sp->so_reclaim_seqcount);
+ raw_write_seqcount_end(&sp->so_reclaim_seqcount);
spin_unlock(&sp->so_lock);
return status;
}
@@ -1628,7 +1674,6 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
nfs4_state_end_reclaim_reboot(clp);
break;
case -NFS4ERR_STALE_CLIENTID:
- case -NFS4ERR_LEASE_MOVED:
set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
nfs4_state_clear_reclaim_reboot(clp);
nfs4_state_start_reclaim_reboot(clp);
@@ -1829,6 +1874,168 @@ static int nfs4_purge_lease(struct nfs_client *clp)
return 0;
}
+/*
+ * Try remote migration of one FSID from a source server to a
+ * destination server. The source server provides a list of
+ * potential destinations.
+ *
+ * Returns zero or a negative NFS4ERR status code.
+ */
+static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_fs_locations *locations = NULL;
+ struct inode *inode;
+ struct page *page;
+ int status, result;
+
+ dprintk("--> %s: FSID %llx:%llx on \"%s\"\n", __func__,
+ (unsigned long long)server->fsid.major,
+ (unsigned long long)server->fsid.minor,
+ clp->cl_hostname);
+
+ result = 0;
+ page = alloc_page(GFP_KERNEL);
+ locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL);
+ if (page == NULL || locations == NULL) {
+ dprintk("<-- %s: no memory\n", __func__);
+ goto out;
+ }
+
+ inode = server->super->s_root->d_inode;
+ result = nfs4_proc_get_locations(inode, locations, page, cred);
+ if (result) {
+ dprintk("<-- %s: failed to retrieve fs_locations: %d\n",
+ __func__, result);
+ goto out;
+ }
+
+ result = -NFS4ERR_NXIO;
+ if (!(locations->fattr.valid & NFS_ATTR_FATTR_V4_LOCATIONS)) {
+ dprintk("<-- %s: No fs_locations data, migration skipped\n",
+ __func__);
+ goto out;
+ }
+
+ nfs4_begin_drain_session(clp);
+
+ status = nfs4_replace_transport(server, locations);
+ if (status != 0) {
+ dprintk("<-- %s: failed to replace transport: %d\n",
+ __func__, status);
+ goto out;
+ }
+
+ result = 0;
+ dprintk("<-- %s: migration succeeded\n", __func__);
+
+out:
+ if (page != NULL)
+ __free_page(page);
+ kfree(locations);
+ if (result) {
+ pr_err("NFS: migration recovery failed (server %s)\n",
+ clp->cl_hostname);
+ set_bit(NFS_MIG_FAILED, &server->mig_status);
+ }
+ return result;
+}
+
+/*
+ * Returns zero or a negative NFS4ERR status code.
+ */
+static int nfs4_handle_migration(struct nfs_client *clp)
+{
+ const struct nfs4_state_maintenance_ops *ops =
+ clp->cl_mvops->state_renewal_ops;
+ struct nfs_server *server;
+ struct rpc_cred *cred;
+
+ dprintk("%s: migration reported on \"%s\"\n", __func__,
+ clp->cl_hostname);
+
+ spin_lock(&clp->cl_lock);
+ cred = ops->get_state_renewal_cred_locked(clp);
+ spin_unlock(&clp->cl_lock);
+ if (cred == NULL)
+ return -NFS4ERR_NOENT;
+
+ clp->cl_mig_gen++;
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ int status;
+
+ if (server->mig_gen == clp->cl_mig_gen)
+ continue;
+ server->mig_gen = clp->cl_mig_gen;
+
+ if (!test_and_clear_bit(NFS_MIG_IN_TRANSITION,
+ &server->mig_status))
+ continue;
+
+ rcu_read_unlock();
+ status = nfs4_try_migration(server, cred);
+ if (status < 0) {
+ put_rpccred(cred);
+ return status;
+ }
+ goto restart;
+ }
+ rcu_read_unlock();
+ put_rpccred(cred);
+ return 0;
+}
+
+/*
+ * Test each nfs_server on the clp's cl_superblocks list to see
+ * if it's moved to another server. Stop when the server no longer
+ * returns NFS4ERR_LEASE_MOVED.
+ */
+static int nfs4_handle_lease_moved(struct nfs_client *clp)
+{
+ const struct nfs4_state_maintenance_ops *ops =
+ clp->cl_mvops->state_renewal_ops;
+ struct nfs_server *server;
+ struct rpc_cred *cred;
+
+ dprintk("%s: lease moved reported on \"%s\"\n", __func__,
+ clp->cl_hostname);
+
+ spin_lock(&clp->cl_lock);
+ cred = ops->get_state_renewal_cred_locked(clp);
+ spin_unlock(&clp->cl_lock);
+ if (cred == NULL)
+ return -NFS4ERR_NOENT;
+
+ clp->cl_mig_gen++;
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+ struct inode *inode;
+ int status;
+
+ if (server->mig_gen == clp->cl_mig_gen)
+ continue;
+ server->mig_gen = clp->cl_mig_gen;
+
+ rcu_read_unlock();
+
+ inode = server->super->s_root->d_inode;
+ status = nfs4_proc_fsid_present(inode, cred);
+ if (status != -NFS4ERR_MOVED)
+ goto restart; /* wasn't this one */
+ if (nfs4_try_migration(server, cred) == -NFS4ERR_LEASE_MOVED)
+ goto restart; /* there are more */
+ goto out;
+ }
+ rcu_read_unlock();
+
+out:
+ put_rpccred(cred);
+ return 0;
+}
+
/**
* nfs4_discover_server_trunking - Detect server IP address trunking
*
@@ -1868,8 +2075,10 @@ again:
switch (status) {
case 0:
break;
- case -NFS4ERR_DELAY:
case -ETIMEDOUT:
+ if (clnt->cl_softrtry)
+ break;
+ case -NFS4ERR_DELAY:
case -EAGAIN:
ssleep(1);
case -NFS4ERR_STALE_CLIENTID:
@@ -1881,10 +2090,15 @@ again:
nfs4_root_machine_cred(clp);
goto again;
}
- if (i > 2)
+ if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX)
break;
case -NFS4ERR_CLID_INUSE:
case -NFS4ERR_WRONGSEC:
+ /* No point in retrying if we already used RPC_AUTH_UNIX */
+ if (clnt->cl_auth->au_flavor == RPC_AUTH_UNIX) {
+ status = -EPERM;
+ break;
+ }
clnt = rpc_clone_client_set_auth(clnt, RPC_AUTH_UNIX);
if (IS_ERR(clnt)) {
status = PTR_ERR(clnt);
@@ -2017,9 +2231,10 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
nfs41_handle_server_reboot(clp);
if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED |
SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED |
- SEQ4_STATUS_ADMIN_STATE_REVOKED |
- SEQ4_STATUS_LEASE_MOVED))
+ SEQ4_STATUS_ADMIN_STATE_REVOKED))
nfs41_handle_state_revoked(clp);
+ if (flags & SEQ4_STATUS_LEASE_MOVED)
+ nfs4_schedule_lease_moved_recovery(clp);
if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
nfs41_handle_recallable_state_revoked(clp);
if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT)
@@ -2157,7 +2372,20 @@ static void nfs4_state_manager(struct nfs_client *clp)
status = nfs4_check_lease(clp);
if (status < 0)
goto out_error;
- continue;
+ }
+
+ if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) {
+ section = "migration";
+ status = nfs4_handle_migration(clp);
+ if (status < 0)
+ goto out_error;
+ }
+
+ if (test_and_clear_bit(NFS4CLNT_LEASE_MOVED, &clp->cl_state)) {
+ section = "lease moved";
+ status = nfs4_handle_lease_moved(clp);
+ if (status < 0)
+ goto out_error;
}
/* First recover reboot state... */
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index e26acdd1a64..6f340f02f2b 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -77,17 +77,9 @@ static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc)
{
int ret = nfs_write_inode(inode, wbc);
- if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) {
- int status;
- bool sync = true;
-
- if (wbc->sync_mode == WB_SYNC_NONE)
- sync = false;
-
- status = pnfs_layoutcommit_inode(inode, sync);
- if (status < 0)
- return status;
- }
+ if (ret == 0)
+ ret = pnfs_layoutcommit_inode(inode,
+ wbc->sync_mode == WB_SYNC_ALL);
return ret;
}
@@ -98,7 +90,7 @@ static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc)
*/
static void nfs4_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
pnfs_return_layout(inode);
pnfs_destroy_layout(NFS_I(inode));
@@ -261,9 +253,9 @@ struct dentry *nfs4_try_mount(int flags, const char *dev_name,
res = nfs_follow_remote_path(root_mnt, export_path);
- dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n",
- IS_ERR(res) ? PTR_ERR(res) : 0,
- IS_ERR(res) ? " [error]" : "");
+ dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n",
+ PTR_ERR_OR_ZERO(res),
+ IS_ERR(res) ? " [error]" : "");
return res;
}
@@ -319,9 +311,9 @@ static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,
data->mnt_path = export_path;
res = nfs_follow_remote_path(root_mnt, export_path);
- dprintk("<-- nfs4_referral_mount() = %ld%s\n",
- IS_ERR(res) ? PTR_ERR(res) : 0,
- IS_ERR(res) ? " [error]" : "");
+ dprintk("<-- nfs4_referral_mount() = %d%s\n",
+ PTR_ERR_OR_ZERO(res),
+ IS_ERR(res) ? " [error]" : "");
return res;
}
diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c
index 2628d921b7e..b6ebe7e445f 100644
--- a/fs/nfs/nfs4sysctl.c
+++ b/fs/nfs/nfs4sysctl.c
@@ -16,7 +16,7 @@ static const int nfs_set_port_min = 0;
static const int nfs_set_port_max = 65535;
static struct ctl_table_header *nfs4_callback_sysctl_table;
-static ctl_table nfs4_cb_sysctls[] = {
+static struct ctl_table nfs4_cb_sysctls[] = {
{
.procname = "nfs_callback_tcpport",
.data = &nfs_callback_set_tcpport,
@@ -36,7 +36,7 @@ static ctl_table nfs4_cb_sysctls[] = {
{ }
};
-static ctl_table nfs4_cb_sysctl_dir[] = {
+static struct ctl_table nfs4_cb_sysctl_dir[] = {
{
.procname = "nfs",
.mode = 0555,
@@ -45,7 +45,7 @@ static ctl_table nfs4_cb_sysctl_dir[] = {
{ }
};
-static ctl_table nfs4_cb_sysctl_root[] = {
+static struct ctl_table nfs4_cb_sysctl_root[] = {
{
.procname = "fs",
.mode = 0555,
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 849cf146db3..0a744f3a86f 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -932,7 +932,7 @@ DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group);
DECLARE_EVENT_CLASS(nfs4_read_event,
TP_PROTO(
- const struct nfs_read_data *data,
+ const struct nfs_pgio_data *data,
int error
),
@@ -972,7 +972,7 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
#define DEFINE_NFS4_READ_EVENT(name) \
DEFINE_EVENT(nfs4_read_event, name, \
TP_PROTO( \
- const struct nfs_read_data *data, \
+ const struct nfs_pgio_data *data, \
int error \
), \
TP_ARGS(data, error))
@@ -983,7 +983,7 @@ DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
DECLARE_EVENT_CLASS(nfs4_write_event,
TP_PROTO(
- const struct nfs_write_data *data,
+ const struct nfs_pgio_data *data,
int error
),
@@ -1024,7 +1024,7 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
#define DEFINE_NFS4_WRITE_EVENT(name) \
DEFINE_EVENT(nfs4_write_event, name, \
TP_PROTO( \
- const struct nfs_write_data *data, \
+ const struct nfs_pgio_data *data, \
int error \
), \
TP_ARGS(data, error))
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 79210d23f60..939ae606cfa 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -105,12 +105,8 @@ static int nfs4_stat_to_errno(int);
#ifdef CONFIG_NFS_V4_SECURITY_LABEL
/* PI(4 bytes) + LFS(4 bytes) + 1(for null terminator?) + MAXLABELLEN */
#define nfs4_label_maxsz (4 + 4 + 1 + XDR_QUADLEN(NFS4_MAXLABELLEN))
-#define encode_readdir_space 24
-#define encode_readdir_bitmask_sz 3
#else
#define nfs4_label_maxsz 0
-#define encode_readdir_space 20
-#define encode_readdir_bitmask_sz 2
#endif
/* We support only one layout type per file system */
#define decode_mdsthreshold_maxsz (1 + 1 + nfs4_fattr_bitmap_maxsz + 1 + 8)
@@ -207,8 +203,7 @@ static int nfs4_stat_to_errno(int);
2 + encode_verifier_maxsz + 5 + \
nfs4_label_maxsz)
#define decode_readdir_maxsz (op_decode_hdr_maxsz + \
- decode_verifier_maxsz + \
- nfs4_label_maxsz + nfs4_fattr_maxsz)
+ decode_verifier_maxsz)
#define encode_readlink_maxsz (op_encode_hdr_maxsz)
#define decode_readlink_maxsz (op_decode_hdr_maxsz + 1)
#define encode_write_maxsz (op_encode_hdr_maxsz + \
@@ -595,11 +590,13 @@ static int nfs4_stat_to_errno(int);
#define NFS4_enc_getattr_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
- encode_getattr_maxsz)
+ encode_getattr_maxsz + \
+ encode_renew_maxsz)
#define NFS4_dec_getattr_sz (compound_decode_hdr_maxsz + \
decode_sequence_maxsz + \
decode_putfh_maxsz + \
- decode_getattr_maxsz)
+ decode_getattr_maxsz + \
+ decode_renew_maxsz)
#define NFS4_enc_lookup_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
@@ -736,13 +733,15 @@ static int nfs4_stat_to_errno(int);
encode_sequence_maxsz + \
encode_putfh_maxsz + \
encode_lookup_maxsz + \
- encode_fs_locations_maxsz)
+ encode_fs_locations_maxsz + \
+ encode_renew_maxsz)
#define NFS4_dec_fs_locations_sz \
(compound_decode_hdr_maxsz + \
decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_lookup_maxsz + \
- decode_fs_locations_maxsz)
+ decode_fs_locations_maxsz + \
+ decode_renew_maxsz)
#define NFS4_enc_secinfo_sz (compound_encode_hdr_maxsz + \
encode_sequence_maxsz + \
encode_putfh_maxsz + \
@@ -751,6 +750,18 @@ static int nfs4_stat_to_errno(int);
decode_sequence_maxsz + \
decode_putfh_maxsz + \
decode_secinfo_maxsz)
+#define NFS4_enc_fsid_present_sz \
+ (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_getfh_maxsz + \
+ encode_renew_maxsz)
+#define NFS4_dec_fsid_present_sz \
+ (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_getfh_maxsz + \
+ decode_renew_maxsz)
#if defined(CONFIG_NFS_V4_1)
#define NFS4_enc_bind_conn_to_session_sz \
(compound_encode_hdr_maxsz + \
@@ -1545,7 +1556,8 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr);
}
-static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
+static void encode_read(struct xdr_stream *xdr, const struct nfs_pgio_args *args,
+ struct compound_hdr *hdr)
{
__be32 *p;
@@ -1565,6 +1577,8 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
};
uint32_t dircount = readdir->count >> 1;
__be32 *p, verf[2];
+ uint32_t attrlen = 0;
+ unsigned int i;
if (readdir->plus) {
attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
@@ -1573,26 +1587,27 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|
FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+ attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
dircount >>= 1;
}
/* Use mounted_on_fileid only if the server supports it */
if (!(readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID))
attrs[0] |= FATTR4_WORD0_FILEID;
+ for (i = 0; i < ARRAY_SIZE(attrs); i++) {
+ attrs[i] &= readdir->bitmask[i];
+ if (attrs[i] != 0)
+ attrlen = i+1;
+ }
encode_op_hdr(xdr, OP_READDIR, decode_readdir_maxsz, hdr);
encode_uint64(xdr, readdir->cookie);
encode_nfs4_verifier(xdr, &readdir->verifier);
- p = reserve_space(xdr, encode_readdir_space);
+ p = reserve_space(xdr, 12 + (attrlen << 2));
*p++ = cpu_to_be32(dircount);
*p++ = cpu_to_be32(readdir->count);
- *p++ = cpu_to_be32(encode_readdir_bitmask_sz);
- *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]);
- *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]);
- if (encode_readdir_bitmask_sz > 2) {
- if (hdr->minorversion > 1)
- attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
- p++, *p++ = cpu_to_be32(attrs[2] & readdir->bitmask[2]);
- }
+ *p++ = cpu_to_be32(attrlen);
+ for (i = 0; i < attrlen; i++)
+ *p++ = cpu_to_be32(attrs[i]);
memcpy(verf, readdir->verifier.data, sizeof(verf));
dprintk("%s: cookie = %llu, verifier = %08x:%08x, bitmap = %08x:%08x:%08x\n",
@@ -1687,7 +1702,8 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4
encode_nfs4_verifier(xdr, &arg->confirm);
}
-static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
+static void encode_write(struct xdr_stream *xdr, const struct nfs_pgio_args *args,
+ struct compound_hdr *hdr)
{
__be32 *p;
@@ -2437,7 +2453,7 @@ static void nfs4_xdr_enc_readdir(struct rpc_rqst *req, struct xdr_stream *xdr,
* Encode a READ request
*/
static void nfs4_xdr_enc_read(struct rpc_rqst *req, struct xdr_stream *xdr,
- struct nfs_readargs *args)
+ struct nfs_pgio_args *args)
{
struct compound_hdr hdr = {
.minorversion = nfs4_xdr_minorversion(&args->seq_args),
@@ -2499,7 +2515,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
* Encode a WRITE request
*/
static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
- struct nfs_writeargs *args)
+ struct nfs_pgio_args *args)
{
struct compound_hdr hdr = {
.minorversion = nfs4_xdr_minorversion(&args->seq_args),
@@ -2687,11 +2703,20 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
encode_compound_hdr(xdr, req, &hdr);
encode_sequence(xdr, &args->seq_args, &hdr);
- encode_putfh(xdr, args->dir_fh, &hdr);
- encode_lookup(xdr, args->name, &hdr);
- replen = hdr.replen; /* get the attribute into args->page */
- encode_fs_locations(xdr, args->bitmask, &hdr);
+ if (args->migration) {
+ encode_putfh(xdr, args->fh, &hdr);
+ replen = hdr.replen;
+ encode_fs_locations(xdr, args->bitmask, &hdr);
+ if (args->renew)
+ encode_renew(xdr, args->clientid, &hdr);
+ } else {
+ encode_putfh(xdr, args->dir_fh, &hdr);
+ encode_lookup(xdr, args->name, &hdr);
+ replen = hdr.replen;
+ encode_fs_locations(xdr, args->bitmask, &hdr);
+ }
+ /* Set up reply kvec to capture returned fs_locations array. */
xdr_inline_pages(&req->rq_rcv_buf, replen << 2, &args->page,
0, PAGE_SIZE);
encode_nops(&hdr);
@@ -2715,6 +2740,26 @@ static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req,
encode_nops(&hdr);
}
+/*
+ * Encode FSID_PRESENT request
+ */
+static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs4_fsid_present_arg *args)
+{
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->fh, &hdr);
+ encode_getfh(xdr, &hdr);
+ if (args->renew)
+ encode_renew(xdr, args->clientid, &hdr);
+ encode_nops(&hdr);
+}
+
#if defined(CONFIG_NFS_V4_1)
/*
* BIND_CONN_TO_SESSION request
@@ -3053,7 +3098,8 @@ out_overflow:
return -EIO;
}
-static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
+static bool __decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected,
+ int *nfs_retval)
{
__be32 *p;
uint32_t opnum;
@@ -3063,19 +3109,32 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
if (unlikely(!p))
goto out_overflow;
opnum = be32_to_cpup(p++);
- if (opnum != expected) {
- dprintk("nfs: Server returned operation"
- " %d but we issued a request for %d\n",
- opnum, expected);
- return -EIO;
- }
+ if (unlikely(opnum != expected))
+ goto out_bad_operation;
nfserr = be32_to_cpup(p);
- if (nfserr != NFS_OK)
- return nfs4_stat_to_errno(nfserr);
- return 0;
+ if (nfserr == NFS_OK)
+ *nfs_retval = 0;
+ else
+ *nfs_retval = nfs4_stat_to_errno(nfserr);
+ return true;
+out_bad_operation:
+ dprintk("nfs: Server returned operation"
+ " %d but we issued a request for %d\n",
+ opnum, expected);
+ *nfs_retval = -EREMOTEIO;
+ return false;
out_overflow:
print_overflow_msg(__func__, xdr);
- return -EIO;
+ *nfs_retval = -EIO;
+ return false;
+}
+
+static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected)
+{
+ int retval;
+
+ __decode_op_hdr(xdr, expected, &retval);
+ return retval;
}
/* Dummy routine */
@@ -3391,7 +3450,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
{
__be32 *p;
- *res = ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL;
+ *res = 0;
if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U)))
return -EIO;
if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) {
@@ -4957,11 +5016,12 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
uint32_t savewords, bmlen, i;
int status;
- status = decode_op_hdr(xdr, OP_OPEN);
- if (status != -EIO)
- nfs_increment_open_seqid(status, res->seqid);
- if (!status)
- status = decode_stateid(xdr, &res->stateid);
+ if (!__decode_op_hdr(xdr, OP_OPEN, &status))
+ return status;
+ nfs_increment_open_seqid(status, res->seqid);
+ if (status)
+ return status;
+ status = decode_stateid(xdr, &res->stateid);
if (unlikely(status))
return status;
@@ -5027,7 +5087,8 @@ static int decode_putrootfh(struct xdr_stream *xdr)
return decode_op_hdr(xdr, OP_PUTROOTFH);
}
-static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_readres *res)
+static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req,
+ struct nfs_pgio_res *res)
{
__be32 *p;
uint32_t count, eof, recvd;
@@ -5281,7 +5342,7 @@ static int decode_setclientid_confirm(struct xdr_stream *xdr)
return decode_op_hdr(xdr, OP_SETCLIENTID_CONFIRM);
}
-static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res)
+static int decode_write(struct xdr_stream *xdr, struct nfs_pgio_res *res)
{
__be32 *p;
int status;
@@ -6578,7 +6639,7 @@ out:
* Decode Read response
*/
static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
- struct nfs_readres *res)
+ struct nfs_pgio_res *res)
{
struct compound_hdr hdr;
int status;
@@ -6603,7 +6664,7 @@ out:
* Decode WRITE response
*/
static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
- struct nfs_writeres *res)
+ struct nfs_pgio_res *res)
{
struct compound_hdr hdr;
int status;
@@ -6824,13 +6885,26 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
status = decode_putfh(xdr);
if (status)
goto out;
- status = decode_lookup(xdr);
- if (status)
- goto out;
- xdr_enter_page(xdr, PAGE_SIZE);
- status = decode_getfattr_generic(xdr, &res->fs_locations->fattr,
+ if (res->migration) {
+ xdr_enter_page(xdr, PAGE_SIZE);
+ status = decode_getfattr_generic(xdr,
+ &res->fs_locations->fattr,
NULL, res->fs_locations,
NULL, res->fs_locations->server);
+ if (status)
+ goto out;
+ if (res->renew)
+ status = decode_renew(xdr);
+ } else {
+ status = decode_lookup(xdr);
+ if (status)
+ goto out;
+ xdr_enter_page(xdr, PAGE_SIZE);
+ status = decode_getfattr_generic(xdr,
+ &res->fs_locations->fattr,
+ NULL, res->fs_locations,
+ NULL, res->fs_locations->server);
+ }
out:
return status;
}
@@ -6859,6 +6933,34 @@ out:
return status;
}
+/*
+ * Decode FSID_PRESENT response
+ */
+static int nfs4_xdr_dec_fsid_present(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs4_fsid_present_res *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_getfh(xdr, res->fh);
+ if (status)
+ goto out;
+ if (res->renew)
+ status = decode_renew(xdr);
+out:
+ return status;
+}
+
#if defined(CONFIG_NFS_V4_1)
/*
* Decode BIND_CONN_TO_SESSION response
@@ -7373,6 +7475,7 @@ struct rpc_procinfo nfs4_procedures[] = {
PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations),
PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
PROC(SECINFO, enc_secinfo, dec_secinfo),
+ PROC(FSID_PRESENT, enc_fsid_present, dec_fsid_present),
#if defined(CONFIG_NFS_V4_1)
PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id),
PROC(CREATE_SESSION, enc_create_session, dec_create_session),
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 89fe741e58b..59f838cdc00 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -36,6 +36,7 @@
__print_flags(v, "|", \
{ 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \
{ 1 << NFS_INO_STALE, "STALE" }, \
+ { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \
{ 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
{ 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
{ 1 << NFS_INO_COMMIT, "COMMIT" }, \
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 5457745dd4f..611320753db 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -439,7 +439,7 @@ static void _read_done(struct ore_io_state *ios, void *private)
objlayout_read_done(&objios->oir, status, objios->sync);
}
-int objio_read_pagelist(struct nfs_read_data *rdata)
+int objio_read_pagelist(struct nfs_pgio_data *rdata)
{
struct nfs_pgio_header *hdr = rdata->header;
struct objio_state *objios;
@@ -487,7 +487,7 @@ static void _write_done(struct ore_io_state *ios, void *private)
static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
{
struct objio_state *objios = priv;
- struct nfs_write_data *wdata = objios->oir.rpcdata;
+ struct nfs_pgio_data *wdata = objios->oir.rpcdata;
struct address_space *mapping = wdata->header->inode->i_mapping;
pgoff_t index = offset / PAGE_SIZE;
struct page *page;
@@ -531,7 +531,7 @@ static const struct _ore_r4w_op _r4w_op = {
.put_page = &__r4w_put_page,
};
-int objio_write_pagelist(struct nfs_write_data *wdata, int how)
+int objio_write_pagelist(struct nfs_pgio_data *wdata, int how)
{
struct nfs_pgio_header *hdr = wdata->header;
struct objio_state *objios;
@@ -564,14 +564,22 @@ int objio_write_pagelist(struct nfs_write_data *wdata, int how)
return 0;
}
-static bool objio_pg_test(struct nfs_pageio_descriptor *pgio,
+/*
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio,
struct nfs_page *prev, struct nfs_page *req)
{
- if (!pnfs_generic_pg_test(pgio, prev, req))
- return false;
+ unsigned int size;
+
+ size = pnfs_generic_pg_test(pgio, prev, req);
+
+ if (!size || pgio->pg_count + req->wb_bytes >
+ (unsigned long)pgio->pg_layout_private)
+ return 0;
- return pgio->pg_count + req->wb_bytes <=
- (unsigned long)pgio->pg_layout_private;
+ return min(size, req->wb_bytes);
}
static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index e4f9cbfec67..765d3f54e98 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -53,10 +53,10 @@ objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
struct objlayout *objlay;
objlay = kzalloc(sizeof(struct objlayout), gfp_flags);
- if (objlay) {
- spin_lock_init(&objlay->lock);
- INIT_LIST_HEAD(&objlay->err_list);
- }
+ if (!objlay)
+ return NULL;
+ spin_lock_init(&objlay->lock);
+ INIT_LIST_HEAD(&objlay->err_list);
dprintk("%s: Return %p\n", __func__, objlay);
return &objlay->pnfs_layout;
}
@@ -229,11 +229,11 @@ objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
static void _rpc_read_complete(struct work_struct *work)
{
struct rpc_task *task;
- struct nfs_read_data *rdata;
+ struct nfs_pgio_data *rdata;
dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work);
- rdata = container_of(task, struct nfs_read_data, task);
+ rdata = container_of(task, struct nfs_pgio_data, task);
pnfs_ld_read_done(rdata);
}
@@ -241,7 +241,7 @@ static void _rpc_read_complete(struct work_struct *work)
void
objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
{
- struct nfs_read_data *rdata = oir->rpcdata;
+ struct nfs_pgio_data *rdata = oir->rpcdata;
oir->status = rdata->task.tk_status = status;
if (status >= 0)
@@ -266,7 +266,7 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
* Perform sync or async reads.
*/
enum pnfs_try_status
-objlayout_read_pagelist(struct nfs_read_data *rdata)
+objlayout_read_pagelist(struct nfs_pgio_data *rdata)
{
struct nfs_pgio_header *hdr = rdata->header;
struct inode *inode = hdr->inode;
@@ -312,11 +312,11 @@ objlayout_read_pagelist(struct nfs_read_data *rdata)
static void _rpc_write_complete(struct work_struct *work)
{
struct rpc_task *task;
- struct nfs_write_data *wdata;
+ struct nfs_pgio_data *wdata;
dprintk("%s enter\n", __func__);
task = container_of(work, struct rpc_task, u.tk_work);
- wdata = container_of(task, struct nfs_write_data, task);
+ wdata = container_of(task, struct nfs_pgio_data, task);
pnfs_ld_write_done(wdata);
}
@@ -324,7 +324,7 @@ static void _rpc_write_complete(struct work_struct *work)
void
objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
{
- struct nfs_write_data *wdata = oir->rpcdata;
+ struct nfs_pgio_data *wdata = oir->rpcdata;
oir->status = wdata->task.tk_status = status;
if (status >= 0) {
@@ -351,7 +351,7 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
* Perform sync or async writes.
*/
enum pnfs_try_status
-objlayout_write_pagelist(struct nfs_write_data *wdata,
+objlayout_write_pagelist(struct nfs_pgio_data *wdata,
int how)
{
struct nfs_pgio_header *hdr = wdata->header;
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 87aa1dec612..01e041029a6 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -119,8 +119,8 @@ extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
*/
extern void objio_free_result(struct objlayout_io_res *oir);
-extern int objio_read_pagelist(struct nfs_read_data *rdata);
-extern int objio_write_pagelist(struct nfs_write_data *wdata, int how);
+extern int objio_read_pagelist(struct nfs_pgio_data *rdata);
+extern int objio_write_pagelist(struct nfs_pgio_data *wdata, int how);
/*
* callback API
@@ -168,10 +168,10 @@ extern struct pnfs_layout_segment *objlayout_alloc_lseg(
extern void objlayout_free_lseg(struct pnfs_layout_segment *);
extern enum pnfs_try_status objlayout_read_pagelist(
- struct nfs_read_data *);
+ struct nfs_pgio_data *);
extern enum pnfs_try_status objlayout_write_pagelist(
- struct nfs_write_data *,
+ struct nfs_pgio_data *,
int how);
extern void objlayout_encode_layoutcommit(
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 2ffebf2081c..17fab89f635 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -24,9 +24,12 @@
#include "internal.h"
#include "pnfs.h"
+#define NFSDBG_FACILITY NFSDBG_PAGECACHE
+
static struct kmem_cache *nfs_page_cachep;
+static const struct rpc_call_ops nfs_pgio_common_ops;
-bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
+static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
{
p->npages = pagecount;
if (pagecount <= ARRAY_SIZE(p->page_array))
@@ -95,7 +98,7 @@ nfs_iocounter_dec(struct nfs_io_counter *c)
{
if (atomic_dec_and_test(&c->io_count)) {
clear_bit(NFS_IO_INPROGRESS, &c->flags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&c->flags, NFS_IO_INPROGRESS);
}
}
@@ -133,11 +136,168 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
return __nfs_iocounter_wait(c);
}
+static int nfs_wait_bit_uninterruptible(void *word)
+{
+ io_schedule();
+ return 0;
+}
+
+/*
+ * nfs_page_group_lock - lock the head of the page group
+ * @req - request in group that is to be locked
+ *
+ * this lock must be held if modifying the page group list
+ */
+void
+nfs_page_group_lock(struct nfs_page *req)
+{
+ struct nfs_page *head = req->wb_head;
+
+ WARN_ON_ONCE(head != head->wb_head);
+
+ wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
+ nfs_wait_bit_uninterruptible,
+ TASK_UNINTERRUPTIBLE);
+}
+
+/*
+ * nfs_page_group_unlock - unlock the head of the page group
+ * @req - request in group that is to be unlocked
+ */
+void
+nfs_page_group_unlock(struct nfs_page *req)
+{
+ struct nfs_page *head = req->wb_head;
+
+ WARN_ON_ONCE(head != head->wb_head);
+
+ smp_mb__before_atomic();
+ clear_bit(PG_HEADLOCK, &head->wb_flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&head->wb_flags, PG_HEADLOCK);
+}
+
+/*
+ * nfs_page_group_sync_on_bit_locked
+ *
+ * must be called with page group lock held
+ */
+static bool
+nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
+{
+ struct nfs_page *head = req->wb_head;
+ struct nfs_page *tmp;
+
+ WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags));
+ WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags));
+
+ tmp = req->wb_this_page;
+ while (tmp != req) {
+ if (!test_bit(bit, &tmp->wb_flags))
+ return false;
+ tmp = tmp->wb_this_page;
+ }
+
+ /* true! reset all bits */
+ tmp = req;
+ do {
+ clear_bit(bit, &tmp->wb_flags);
+ tmp = tmp->wb_this_page;
+ } while (tmp != req);
+
+ return true;
+}
+
+/*
+ * nfs_page_group_sync_on_bit - set bit on current request, but only
+ * return true if the bit is set for all requests in page group
+ * @req - request in page group
+ * @bit - PG_* bit that is used to sync page group
+ */
+bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
+{
+ bool ret;
+
+ nfs_page_group_lock(req);
+ ret = nfs_page_group_sync_on_bit_locked(req, bit);
+ nfs_page_group_unlock(req);
+
+ return ret;
+}
+
+/*
+ * nfs_page_group_init - Initialize the page group linkage for @req
+ * @req - a new nfs request
+ * @prev - the previous request in page group, or NULL if @req is the first
+ * or only request in the group (the head).
+ */
+static inline void
+nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
+{
+ WARN_ON_ONCE(prev == req);
+
+ if (!prev) {
+ /* a head request */
+ req->wb_head = req;
+ req->wb_this_page = req;
+ } else {
+ /* a subrequest */
+ WARN_ON_ONCE(prev->wb_this_page != prev->wb_head);
+ WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags));
+ req->wb_head = prev->wb_head;
+ req->wb_this_page = prev->wb_this_page;
+ prev->wb_this_page = req;
+
+ /* All subrequests take a ref on the head request until
+ * nfs_page_group_destroy is called */
+ kref_get(&req->wb_head->wb_kref);
+
+ /* grab extra ref if head request has extra ref from
+ * the write/commit path to handle handoff between write
+ * and commit lists */
+ if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) {
+ set_bit(PG_INODE_REF, &req->wb_flags);
+ kref_get(&req->wb_kref);
+ }
+ }
+}
+
+/*
+ * nfs_page_group_destroy - sync the destruction of page groups
+ * @req - request that no longer needs the page group
+ *
+ * releases the page group reference from each member once all
+ * members have called this function.
+ */
+static void
+nfs_page_group_destroy(struct kref *kref)
+{
+ struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
+ struct nfs_page *tmp, *next;
+
+ /* subrequests must release the ref on the head request */
+ if (req->wb_head != req)
+ nfs_release_request(req->wb_head);
+
+ if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN))
+ return;
+
+ tmp = req;
+ do {
+ next = tmp->wb_this_page;
+ /* unlink and free */
+ tmp->wb_this_page = tmp;
+ tmp->wb_head = tmp;
+ nfs_free_request(tmp);
+ tmp = next;
+ } while (tmp != req);
+}
+
/**
* nfs_create_request - Create an NFS read/write request.
* @ctx: open context to use
- * @inode: inode to which the request is attached
* @page: page to write
+ * @last: last nfs request created for this page group or NULL if head
* @offset: starting offset within the page for the write
* @count: number of bytes to read/write
*
@@ -146,9 +306,9 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
* User should ensure it is safe to sleep in this function.
*/
struct nfs_page *
-nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
- struct page *page,
- unsigned int offset, unsigned int count)
+nfs_create_request(struct nfs_open_context *ctx, struct page *page,
+ struct nfs_page *last, unsigned int offset,
+ unsigned int count)
{
struct nfs_page *req;
struct nfs_lock_context *l_ctx;
@@ -180,6 +340,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode,
req->wb_bytes = count;
req->wb_context = get_nfs_open_context(ctx);
kref_init(&req->wb_kref);
+ nfs_page_group_init(req, last);
return req;
}
@@ -193,9 +354,9 @@ void nfs_unlock_request(struct nfs_page *req)
printk(KERN_ERR "NFS: Invalid unlock attempted\n");
BUG();
}
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(PG_BUSY, &req->wb_flags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&req->wb_flags, PG_BUSY);
}
@@ -237,16 +398,22 @@ static void nfs_clear_request(struct nfs_page *req)
}
}
-
/**
* nfs_release_request - Release the count on an NFS read/write request
* @req: request to release
*
* Note: Should never be called with the spinlock held!
*/
-static void nfs_free_request(struct kref *kref)
+void nfs_free_request(struct nfs_page *req)
{
- struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
+ WARN_ON_ONCE(req->wb_this_page != req);
+
+ /* extra debug: make sure no sync bits are still set */
+ WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
+ WARN_ON_ONCE(test_bit(PG_UNLOCKPAGE, &req->wb_flags));
+ WARN_ON_ONCE(test_bit(PG_UPTODATE, &req->wb_flags));
+ WARN_ON_ONCE(test_bit(PG_WB_END, &req->wb_flags));
+ WARN_ON_ONCE(test_bit(PG_REMOVE, &req->wb_flags));
/* Release struct file and open context */
nfs_clear_request(req);
@@ -255,13 +422,7 @@ static void nfs_free_request(struct kref *kref)
void nfs_release_request(struct nfs_page *req)
{
- kref_put(&req->wb_kref, nfs_free_request);
-}
-
-static int nfs_wait_bit_uninterruptible(void *word)
-{
- io_schedule();
- return 0;
+ kref_put(&req->wb_kref, nfs_page_group_destroy);
}
/**
@@ -279,22 +440,249 @@ nfs_wait_on_request(struct nfs_page *req)
TASK_UNINTERRUPTIBLE);
}
-bool nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, struct nfs_page *prev, struct nfs_page *req)
+/*
+ * nfs_generic_pg_test - determine if requests can be coalesced
+ * @desc: pointer to descriptor
+ * @prev: previous request in desc, or NULL
+ * @req: this request
+ *
+ * Returns zero if @req can be coalesced into @desc, otherwise it returns
+ * the size of the request.
+ */
+size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
+ struct nfs_page *prev, struct nfs_page *req)
{
- /*
- * FIXME: ideally we should be able to coalesce all requests
- * that are not block boundary aligned, but currently this
- * is problematic for the case of bsize < PAGE_CACHE_SIZE,
- * since nfs_flush_multi and nfs_pagein_multi assume you
- * can have only one struct nfs_page.
- */
- if (desc->pg_bsize < PAGE_SIZE)
+ if (desc->pg_count > desc->pg_bsize) {
+ /* should never happen */
+ WARN_ON_ONCE(1);
return 0;
+ }
- return desc->pg_count + req->wb_bytes <= desc->pg_bsize;
+ return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes);
}
EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
+static inline struct nfs_rw_header *NFS_RW_HEADER(struct nfs_pgio_header *hdr)
+{
+ return container_of(hdr, struct nfs_rw_header, header);
+}
+
+/**
+ * nfs_rw_header_alloc - Allocate a header for a read or write
+ * @ops: Read or write function vector
+ */
+struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *ops)
+{
+ struct nfs_rw_header *header = ops->rw_alloc_header();
+
+ if (header) {
+ struct nfs_pgio_header *hdr = &header->header;
+
+ INIT_LIST_HEAD(&hdr->pages);
+ spin_lock_init(&hdr->lock);
+ atomic_set(&hdr->refcnt, 0);
+ hdr->rw_ops = ops;
+ }
+ return header;
+}
+EXPORT_SYMBOL_GPL(nfs_rw_header_alloc);
+
+/*
+ * nfs_rw_header_free - Free a read or write header
+ * @hdr: The header to free
+ */
+void nfs_rw_header_free(struct nfs_pgio_header *hdr)
+{
+ hdr->rw_ops->rw_free_header(NFS_RW_HEADER(hdr));
+}
+EXPORT_SYMBOL_GPL(nfs_rw_header_free);
+
+/**
+ * nfs_pgio_data_alloc - Allocate pageio data
+ * @hdr: The header making a request
+ * @pagecount: Number of pages to create
+ */
+static struct nfs_pgio_data *nfs_pgio_data_alloc(struct nfs_pgio_header *hdr,
+ unsigned int pagecount)
+{
+ struct nfs_pgio_data *data, *prealloc;
+
+ prealloc = &NFS_RW_HEADER(hdr)->rpc_data;
+ if (prealloc->header == NULL)
+ data = prealloc;
+ else
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ goto out;
+
+ if (nfs_pgarray_set(&data->pages, pagecount)) {
+ data->header = hdr;
+ atomic_inc(&hdr->refcnt);
+ } else {
+ if (data != prealloc)
+ kfree(data);
+ data = NULL;
+ }
+out:
+ return data;
+}
+
+/**
+ * nfs_pgio_data_release - Properly free pageio data
+ * @data: The data to release
+ */
+void nfs_pgio_data_release(struct nfs_pgio_data *data)
+{
+ struct nfs_pgio_header *hdr = data->header;
+ struct nfs_rw_header *pageio_header = NFS_RW_HEADER(hdr);
+
+ put_nfs_open_context(data->args.context);
+ if (data->pages.pagevec != data->pages.page_array)
+ kfree(data->pages.pagevec);
+ if (data == &pageio_header->rpc_data) {
+ data->header = NULL;
+ data = NULL;
+ }
+ if (atomic_dec_and_test(&hdr->refcnt))
+ hdr->completion_ops->completion(hdr);
+ /* Note: we only free the rpc_task after callbacks are done.
+ * See the comment in rpc_free_task() for why
+ */
+ kfree(data);
+}
+EXPORT_SYMBOL_GPL(nfs_pgio_data_release);
+
+/**
+ * nfs_pgio_rpcsetup - Set up arguments for a pageio call
+ * @data: The pageio data
+ * @count: Number of bytes to read
+ * @offset: Initial offset
+ * @how: How to commit data (writes only)
+ * @cinfo: Commit information for the call (writes only)
+ */
+static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data,
+ unsigned int count, unsigned int offset,
+ int how, struct nfs_commit_info *cinfo)
+{
+ struct nfs_page *req = data->header->req;
+
+ /* Set up the RPC argument and reply structs
+ * NB: take care not to mess about with data->commit et al. */
+
+ data->args.fh = NFS_FH(data->header->inode);
+ data->args.offset = req_offset(req) + offset;
+ /* pnfs_set_layoutcommit needs this */
+ data->mds_offset = data->args.offset;
+ data->args.pgbase = req->wb_pgbase + offset;
+ data->args.pages = data->pages.pagevec;
+ data->args.count = count;
+ data->args.context = get_nfs_open_context(req->wb_context);
+ data->args.lock_context = req->wb_lock_context;
+ data->args.stable = NFS_UNSTABLE;
+ switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
+ case 0:
+ break;
+ case FLUSH_COND_STABLE:
+ if (nfs_reqs_to_commit(cinfo))
+ break;
+ default:
+ data->args.stable = NFS_FILE_SYNC;
+ }
+
+ data->res.fattr = &data->fattr;
+ data->res.count = count;
+ data->res.eof = 0;
+ data->res.verf = &data->verf;
+ nfs_fattr_init(&data->fattr);
+}
+
+/**
+ * nfs_pgio_prepare - Prepare pageio data to go over the wire
+ * @task: The current task
+ * @calldata: pageio data to prepare
+ */
+static void nfs_pgio_prepare(struct rpc_task *task, void *calldata)
+{
+ struct nfs_pgio_data *data = calldata;
+ int err;
+ err = NFS_PROTO(data->header->inode)->pgio_rpc_prepare(task, data);
+ if (err)
+ rpc_exit(task, err);
+}
+
+int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_data *data,
+ const struct rpc_call_ops *call_ops, int how, int flags)
+{
+ struct rpc_task *task;
+ struct rpc_message msg = {
+ .rpc_argp = &data->args,
+ .rpc_resp = &data->res,
+ .rpc_cred = data->header->cred,
+ };
+ struct rpc_task_setup task_setup_data = {
+ .rpc_client = clnt,
+ .task = &data->task,
+ .rpc_message = &msg,
+ .callback_ops = call_ops,
+ .callback_data = data,
+ .workqueue = nfsiod_workqueue,
+ .flags = RPC_TASK_ASYNC | flags,
+ };
+ int ret = 0;
+
+ data->header->rw_ops->rw_initiate(data, &msg, &task_setup_data, how);
+
+ dprintk("NFS: %5u initiated pgio call "
+ "(req %s/%llu, %u bytes @ offset %llu)\n",
+ data->task.tk_pid,
+ data->header->inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(data->header->inode),
+ data->args.count,
+ (unsigned long long)data->args.offset);
+
+ task = rpc_run_task(&task_setup_data);
+ if (IS_ERR(task)) {
+ ret = PTR_ERR(task);
+ goto out;
+ }
+ if (how & FLUSH_SYNC) {
+ ret = rpc_wait_for_completion_task(task);
+ if (ret == 0)
+ ret = task->tk_status;
+ }
+ rpc_put_task(task);
+out:
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
+
+/**
+ * nfs_pgio_error - Clean up from a pageio error
+ * @desc: IO descriptor
+ * @hdr: pageio header
+ */
+static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr)
+{
+ set_bit(NFS_IOHDR_REDO, &hdr->flags);
+ nfs_pgio_data_release(hdr->data);
+ hdr->data = NULL;
+ desc->pg_completion_ops->error_cleanup(&desc->pg_list);
+ return -ENOMEM;
+}
+
+/**
+ * nfs_pgio_release - Release pageio data
+ * @calldata: The pageio data to release
+ */
+static void nfs_pgio_release(void *calldata)
+{
+ struct nfs_pgio_data *data = calldata;
+ if (data->header->rw_ops->rw_release)
+ data->header->rw_ops->rw_release(data);
+ nfs_pgio_data_release(data);
+}
+
/**
* nfs_pageio_init - initialise a page io descriptor
* @desc: pointer to descriptor
@@ -307,6 +695,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
struct inode *inode,
const struct nfs_pageio_ops *pg_ops,
const struct nfs_pgio_completion_ops *compl_ops,
+ const struct nfs_rw_ops *rw_ops,
size_t bsize,
int io_flags)
{
@@ -320,6 +709,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
desc->pg_inode = inode;
desc->pg_ops = pg_ops;
desc->pg_completion_ops = compl_ops;
+ desc->pg_rw_ops = rw_ops;
desc->pg_ioflags = io_flags;
desc->pg_error = 0;
desc->pg_lseg = NULL;
@@ -328,6 +718,94 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
}
EXPORT_SYMBOL_GPL(nfs_pageio_init);
+/**
+ * nfs_pgio_result - Basic pageio error handling
+ * @task: The task that ran
+ * @calldata: Pageio data to check
+ */
+static void nfs_pgio_result(struct rpc_task *task, void *calldata)
+{
+ struct nfs_pgio_data *data = calldata;
+ struct inode *inode = data->header->inode;
+
+ dprintk("NFS: %s: %5u, (status %d)\n", __func__,
+ task->tk_pid, task->tk_status);
+
+ if (data->header->rw_ops->rw_done(task, data, inode) != 0)
+ return;
+ if (task->tk_status < 0)
+ nfs_set_pgio_error(data->header, task->tk_status, data->args.offset);
+ else
+ data->header->rw_ops->rw_result(task, data);
+}
+
+/*
+ * Create an RPC task for the given read or write request and kick it.
+ * The page must have been locked by the caller.
+ *
+ * It may happen that the page we're passed is not marked dirty.
+ * This is the case if nfs_updatepage detects a conflicting request
+ * that has been written but not committed.
+ */
+int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr)
+{
+ struct nfs_page *req;
+ struct page **pages;
+ struct nfs_pgio_data *data;
+ struct list_head *head = &desc->pg_list;
+ struct nfs_commit_info cinfo;
+
+ data = nfs_pgio_data_alloc(hdr, nfs_page_array_len(desc->pg_base,
+ desc->pg_count));
+ if (!data)
+ return nfs_pgio_error(desc, hdr);
+
+ nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
+ pages = data->pages.pagevec;
+ while (!list_empty(head)) {
+ req = nfs_list_entry(head->next);
+ nfs_list_remove_request(req);
+ nfs_list_add_request(req, &hdr->pages);
+ *pages++ = req->wb_page;
+ }
+
+ if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
+ (desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
+ desc->pg_ioflags &= ~FLUSH_COND_STABLE;
+
+ /* Set up the argument struct */
+ nfs_pgio_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
+ hdr->data = data;
+ desc->pg_rpc_callops = &nfs_pgio_common_ops;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nfs_generic_pgio);
+
+static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
+{
+ struct nfs_rw_header *rw_hdr;
+ struct nfs_pgio_header *hdr;
+ int ret;
+
+ rw_hdr = nfs_rw_header_alloc(desc->pg_rw_ops);
+ if (!rw_hdr) {
+ desc->pg_completion_ops->error_cleanup(&desc->pg_list);
+ return -ENOMEM;
+ }
+ hdr = &rw_hdr->header;
+ nfs_pgheader_init(desc, hdr, nfs_rw_header_free);
+ atomic_inc(&hdr->refcnt);
+ ret = nfs_generic_pgio(desc, hdr);
+ if (ret == 0)
+ ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
+ hdr->data, desc->pg_rpc_callops,
+ desc->pg_ioflags, 0);
+ if (atomic_dec_and_test(&hdr->refcnt))
+ hdr->completion_ops->completion(hdr);
+ return ret;
+}
+
static bool nfs_match_open_context(const struct nfs_open_context *ctx1,
const struct nfs_open_context *ctx2)
{
@@ -356,18 +834,23 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
struct nfs_page *req,
struct nfs_pageio_descriptor *pgio)
{
- if (!nfs_match_open_context(req->wb_context, prev->wb_context))
- return false;
- if (req->wb_context->dentry->d_inode->i_flock != NULL &&
- !nfs_match_lock_context(req->wb_lock_context, prev->wb_lock_context))
- return false;
- if (req->wb_pgbase != 0)
- return false;
- if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
- return false;
- if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
- return false;
- return pgio->pg_ops->pg_test(pgio, prev, req);
+ size_t size;
+
+ if (prev) {
+ if (!nfs_match_open_context(req->wb_context, prev->wb_context))
+ return false;
+ if (req->wb_context->dentry->d_inode->i_flock != NULL &&
+ !nfs_match_lock_context(req->wb_lock_context,
+ prev->wb_lock_context))
+ return false;
+ if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
+ return false;
+ }
+ size = pgio->pg_ops->pg_test(pgio, prev, req);
+ WARN_ON_ONCE(size > req->wb_bytes);
+ if (size && size < req->wb_bytes)
+ req->wb_bytes = size;
+ return size > 0;
}
/**
@@ -381,17 +864,16 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
struct nfs_page *req)
{
+ struct nfs_page *prev = NULL;
if (desc->pg_count != 0) {
- struct nfs_page *prev;
-
prev = nfs_list_entry(desc->pg_list.prev);
- if (!nfs_can_coalesce_requests(prev, req, desc))
- return 0;
} else {
if (desc->pg_ops->pg_init)
desc->pg_ops->pg_init(desc, req);
desc->pg_base = req->wb_pgbase;
}
+ if (!nfs_can_coalesce_requests(prev, req, desc))
+ return 0;
nfs_list_remove_request(req);
nfs_list_add_request(req, &desc->pg_list);
desc->pg_count += req->wb_bytes;
@@ -421,22 +903,72 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
* @desc: destination io descriptor
* @req: request
*
+ * This may split a request into subrequests which are all part of the
+ * same page group.
+ *
* Returns true if the request 'req' was successfully coalesced into the
* existing list of pages 'desc'.
*/
static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
struct nfs_page *req)
{
- while (!nfs_pageio_do_add_request(desc, req)) {
- desc->pg_moreio = 1;
- nfs_pageio_doio(desc);
- if (desc->pg_error < 0)
- return 0;
- desc->pg_moreio = 0;
- if (desc->pg_recoalesce)
- return 0;
- }
+ struct nfs_page *subreq;
+ unsigned int bytes_left = 0;
+ unsigned int offset, pgbase;
+
+ nfs_page_group_lock(req);
+
+ subreq = req;
+ bytes_left = subreq->wb_bytes;
+ offset = subreq->wb_offset;
+ pgbase = subreq->wb_pgbase;
+
+ do {
+ if (!nfs_pageio_do_add_request(desc, subreq)) {
+ /* make sure pg_test call(s) did nothing */
+ WARN_ON_ONCE(subreq->wb_bytes != bytes_left);
+ WARN_ON_ONCE(subreq->wb_offset != offset);
+ WARN_ON_ONCE(subreq->wb_pgbase != pgbase);
+
+ nfs_page_group_unlock(req);
+ desc->pg_moreio = 1;
+ nfs_pageio_doio(desc);
+ if (desc->pg_error < 0)
+ return 0;
+ if (desc->pg_recoalesce)
+ return 0;
+ /* retry add_request for this subreq */
+ nfs_page_group_lock(req);
+ continue;
+ }
+
+ /* check for buggy pg_test call(s) */
+ WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE);
+ WARN_ON_ONCE(subreq->wb_bytes > bytes_left);
+ WARN_ON_ONCE(subreq->wb_bytes == 0);
+
+ bytes_left -= subreq->wb_bytes;
+ offset += subreq->wb_bytes;
+ pgbase += subreq->wb_bytes;
+
+ if (bytes_left) {
+ subreq = nfs_create_request(req->wb_context,
+ req->wb_page,
+ subreq, pgbase, bytes_left);
+ if (IS_ERR(subreq))
+ goto err_ptr;
+ nfs_lock_request(subreq);
+ subreq->wb_offset = offset;
+ subreq->wb_index = req->wb_index;
+ }
+ } while (bytes_left > 0);
+
+ nfs_page_group_unlock(req);
return 1;
+err_ptr:
+ desc->pg_error = PTR_ERR(subreq);
+ nfs_page_group_unlock(req);
+ return 0;
}
static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
@@ -449,6 +981,7 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
desc->pg_count = 0;
desc->pg_base = 0;
desc->pg_recoalesce = 0;
+ desc->pg_moreio = 0;
while (!list_empty(&head)) {
struct nfs_page *req;
@@ -535,3 +1068,13 @@ void nfs_destroy_nfspagecache(void)
kmem_cache_destroy(nfs_page_cachep);
}
+static const struct rpc_call_ops nfs_pgio_common_ops = {
+ .rpc_call_prepare = nfs_pgio_prepare,
+ .rpc_call_done = nfs_pgio_result,
+ .rpc_release = nfs_pgio_release,
+};
+
+const struct nfs_pageio_ops nfs_pgio_rw_ops = {
+ .pg_test = nfs_generic_pg_test,
+ .pg_doio = nfs_generic_pg_pgios,
+};
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index d75d938d36c..6fdcd233d6f 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -662,7 +662,18 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
*/
static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
{
- return (s32)s1 - (s32)s2 > 0;
+ return (s32)(s1 - s2) > 0;
+}
+
+static void
+pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *new,
+ struct list_head *free_me_list)
+{
+ if (nfs4_stateid_match_other(&lo->plh_stateid, new))
+ return;
+ /* Layout is new! Kill existing layout segments */
+ pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL);
}
/* update lo->plh_stateid with new if is more recent */
@@ -1315,6 +1326,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
struct nfs4_layoutget_res *res = &lgp->res;
struct pnfs_layout_segment *lseg;
struct inode *ino = lo->plh_inode;
+ LIST_HEAD(free_me);
int status = 0;
/* Inject layout blob into I/O device driver */
@@ -1341,6 +1353,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
goto out_forget_reply;
}
+ /* Check that the new stateid matches the old stateid */
+ pnfs_verify_layout_stateid(lo, &res->stateid, &free_me);
/* Done processing layoutget. Set the layout stateid */
pnfs_set_layout_stateid(lo, &res->stateid, false);
@@ -1355,6 +1369,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
}
spin_unlock(&ino->i_lock);
+ pnfs_free_lseg_list(&free_me);
return lseg;
out:
return ERR_PTR(status);
@@ -1373,11 +1388,6 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
WARN_ON_ONCE(pgio->pg_lseg != NULL);
- if (req->wb_offset != req->wb_pgbase) {
- nfs_pageio_reset_read_mds(pgio);
- return;
- }
-
if (pgio->pg_dreq == NULL)
rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
else
@@ -1402,11 +1412,6 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
{
WARN_ON_ONCE(pgio->pg_lseg != NULL);
- if (req->wb_offset != req->wb_pgbase) {
- nfs_pageio_reset_write_mds(pgio);
- return;
- }
-
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
req->wb_context,
req_offset(req),
@@ -1419,56 +1424,49 @@ pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
-void
-pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode,
- const struct nfs_pgio_completion_ops *compl_ops)
-{
- struct nfs_server *server = NFS_SERVER(inode);
- struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
-
- if (ld == NULL)
- nfs_pageio_init_read(pgio, inode, compl_ops);
- else
- nfs_pageio_init(pgio, inode, ld->pg_read_ops, compl_ops, server->rsize, 0);
-}
-
-void
-pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode,
- int ioflags,
- const struct nfs_pgio_completion_ops *compl_ops)
-{
- struct nfs_server *server = NFS_SERVER(inode);
- struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
-
- if (ld == NULL)
- nfs_pageio_init_write(pgio, inode, ioflags, compl_ops);
- else
- nfs_pageio_init(pgio, inode, ld->pg_write_ops, compl_ops, server->wsize, ioflags);
-}
-
-bool
+/*
+ * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
+ * of bytes (maximum @req->wb_bytes) that can be coalesced.
+ */
+size_t
pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
struct nfs_page *req)
{
- if (pgio->pg_lseg == NULL)
- return nfs_generic_pg_test(pgio, prev, req);
+ unsigned int size;
+ u64 seg_end, req_start, seg_left;
+
+ size = nfs_generic_pg_test(pgio, prev, req);
+ if (!size)
+ return 0;
/*
- * Test if a nfs_page is fully contained in the pnfs_layout_range.
- * Note that this test makes several assumptions:
- * - that the previous nfs_page in the struct nfs_pageio_descriptor
- * is known to lie within the range.
- * - that the nfs_page being tested is known to be contiguous with the
- * previous nfs_page.
- * - Layout ranges are page aligned, so we only have to test the
- * start offset of the request.
+ * 'size' contains the number of bytes left in the current page (up
+ * to the original size asked for in @req->wb_bytes).
+ *
+ * Calculate how many bytes are left in the layout segment
+ * and if there are less bytes than 'size', return that instead.
*
* Please also note that 'end_offset' is actually the offset of the
* first byte that lies outside the pnfs_layout_range. FIXME?
*
*/
- return req_offset(req) < end_offset(pgio->pg_lseg->pls_range.offset,
- pgio->pg_lseg->pls_range.length);
+ if (pgio->pg_lseg) {
+ seg_end = end_offset(pgio->pg_lseg->pls_range.offset,
+ pgio->pg_lseg->pls_range.length);
+ req_start = req_offset(req);
+ WARN_ON_ONCE(req_start > seg_end);
+ /* start of request is past the last byte of this segment */
+ if (req_start >= seg_end)
+ return 0;
+
+ /* adjust 'size' iff there are fewer bytes left in the
+ * segment than what nfs_generic_pg_test returned */
+ seg_left = seg_end - req_start;
+ if (seg_left < size)
+ size = (unsigned int)seg_left;
+ }
+
+ return size;
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
@@ -1481,7 +1479,7 @@ int pnfs_write_done_resend_to_mds(struct inode *inode,
LIST_HEAD(failed);
/* Resend all requests through the MDS */
- nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, compl_ops);
+ nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, true, compl_ops);
pgio.pg_dreq = dreq;
while (!list_empty(head)) {
struct nfs_page *req = nfs_list_entry(head->next);
@@ -1504,7 +1502,7 @@ int pnfs_write_done_resend_to_mds(struct inode *inode,
}
EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
-static void pnfs_ld_handle_write_error(struct nfs_write_data *data)
+static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
@@ -1523,7 +1521,7 @@ static void pnfs_ld_handle_write_error(struct nfs_write_data *data)
/*
* Called by non rpc-based layout drivers
*/
-void pnfs_ld_write_done(struct nfs_write_data *data)
+void pnfs_ld_write_done(struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
@@ -1539,7 +1537,7 @@ EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
static void
pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
- struct nfs_write_data *data)
+ struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
@@ -1548,11 +1546,11 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
nfs_pageio_reset_write_mds(desc);
desc->pg_recoalesce = 1;
}
- nfs_writedata_release(data);
+ nfs_pgio_data_release(data);
}
static enum pnfs_try_status
-pnfs_try_to_write_data(struct nfs_write_data *wdata,
+pnfs_try_to_write_data(struct nfs_pgio_data *wdata,
const struct rpc_call_ops *call_ops,
struct pnfs_layout_segment *lseg,
int how)
@@ -1574,41 +1572,36 @@ pnfs_try_to_write_data(struct nfs_write_data *wdata,
}
static void
-pnfs_do_multiple_writes(struct nfs_pageio_descriptor *desc, struct list_head *head, int how)
+pnfs_do_write(struct nfs_pageio_descriptor *desc,
+ struct nfs_pgio_header *hdr, int how)
{
- struct nfs_write_data *data;
+ struct nfs_pgio_data *data = hdr->data;
const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
struct pnfs_layout_segment *lseg = desc->pg_lseg;
+ enum pnfs_try_status trypnfs;
desc->pg_lseg = NULL;
- while (!list_empty(head)) {
- enum pnfs_try_status trypnfs;
-
- data = list_first_entry(head, struct nfs_write_data, list);
- list_del_init(&data->list);
-
- trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
- if (trypnfs == PNFS_NOT_ATTEMPTED)
- pnfs_write_through_mds(desc, data);
- }
+ trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how);
+ if (trypnfs == PNFS_NOT_ATTEMPTED)
+ pnfs_write_through_mds(desc, data);
pnfs_put_lseg(lseg);
}
static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
{
pnfs_put_lseg(hdr->lseg);
- nfs_writehdr_free(hdr);
+ nfs_rw_header_free(hdr);
}
EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
int
pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
{
- struct nfs_write_header *whdr;
+ struct nfs_rw_header *whdr;
struct nfs_pgio_header *hdr;
int ret;
- whdr = nfs_writehdr_alloc();
+ whdr = nfs_rw_header_alloc(desc->pg_rw_ops);
if (!whdr) {
desc->pg_completion_ops->error_cleanup(&desc->pg_list);
pnfs_put_lseg(desc->pg_lseg);
@@ -1619,12 +1612,12 @@ pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
atomic_inc(&hdr->refcnt);
- ret = nfs_generic_flush(desc, hdr);
+ ret = nfs_generic_pgio(desc, hdr);
if (ret != 0) {
pnfs_put_lseg(desc->pg_lseg);
desc->pg_lseg = NULL;
} else
- pnfs_do_multiple_writes(desc, &hdr->rpc_list, desc->pg_ioflags);
+ pnfs_do_write(desc, hdr, desc->pg_ioflags);
if (atomic_dec_and_test(&hdr->refcnt))
hdr->completion_ops->completion(hdr);
return ret;
@@ -1640,7 +1633,7 @@ int pnfs_read_done_resend_to_mds(struct inode *inode,
LIST_HEAD(failed);
/* Resend all requests through the MDS */
- nfs_pageio_init_read(&pgio, inode, compl_ops);
+ nfs_pageio_init_read(&pgio, inode, true, compl_ops);
pgio.pg_dreq = dreq;
while (!list_empty(head)) {
struct nfs_page *req = nfs_list_entry(head->next);
@@ -1659,7 +1652,7 @@ int pnfs_read_done_resend_to_mds(struct inode *inode,
}
EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
-static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
+static void pnfs_ld_handle_read_error(struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
@@ -1678,7 +1671,7 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
/*
* Called by non rpc-based layout drivers
*/
-void pnfs_ld_read_done(struct nfs_read_data *data)
+void pnfs_ld_read_done(struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
@@ -1694,7 +1687,7 @@ EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
static void
pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
- struct nfs_read_data *data)
+ struct nfs_pgio_data *data)
{
struct nfs_pgio_header *hdr = data->header;
@@ -1703,14 +1696,14 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
nfs_pageio_reset_read_mds(desc);
desc->pg_recoalesce = 1;
}
- nfs_readdata_release(data);
+ nfs_pgio_data_release(data);
}
/*
* Call the appropriate parallel I/O subsystem read function.
*/
static enum pnfs_try_status
-pnfs_try_to_read_data(struct nfs_read_data *rdata,
+pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
const struct rpc_call_ops *call_ops,
struct pnfs_layout_segment *lseg)
{
@@ -1732,41 +1725,35 @@ pnfs_try_to_read_data(struct nfs_read_data *rdata,
}
static void
-pnfs_do_multiple_reads(struct nfs_pageio_descriptor *desc, struct list_head *head)
+pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
{
- struct nfs_read_data *data;
+ struct nfs_pgio_data *data = hdr->data;
const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
struct pnfs_layout_segment *lseg = desc->pg_lseg;
+ enum pnfs_try_status trypnfs;
desc->pg_lseg = NULL;
- while (!list_empty(head)) {
- enum pnfs_try_status trypnfs;
-
- data = list_first_entry(head, struct nfs_read_data, list);
- list_del_init(&data->list);
-
- trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
- if (trypnfs == PNFS_NOT_ATTEMPTED)
- pnfs_read_through_mds(desc, data);
- }
+ trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
+ if (trypnfs == PNFS_NOT_ATTEMPTED)
+ pnfs_read_through_mds(desc, data);
pnfs_put_lseg(lseg);
}
static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
{
pnfs_put_lseg(hdr->lseg);
- nfs_readhdr_free(hdr);
+ nfs_rw_header_free(hdr);
}
EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
int
pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
{
- struct nfs_read_header *rhdr;
+ struct nfs_rw_header *rhdr;
struct nfs_pgio_header *hdr;
int ret;
- rhdr = nfs_readhdr_alloc();
+ rhdr = nfs_rw_header_alloc(desc->pg_rw_ops);
if (!rhdr) {
desc->pg_completion_ops->error_cleanup(&desc->pg_list);
ret = -ENOMEM;
@@ -1778,18 +1765,27 @@ pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
atomic_inc(&hdr->refcnt);
- ret = nfs_generic_pagein(desc, hdr);
+ ret = nfs_generic_pgio(desc, hdr);
if (ret != 0) {
pnfs_put_lseg(desc->pg_lseg);
desc->pg_lseg = NULL;
} else
- pnfs_do_multiple_reads(desc, &hdr->rpc_list);
+ pnfs_do_read(desc, hdr);
if (atomic_dec_and_test(&hdr->refcnt))
hdr->completion_ops->completion(hdr);
return ret;
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
+static void pnfs_clear_layoutcommitting(struct inode *inode)
+{
+ unsigned long *bitlock = &NFS_I(inode)->flags;
+
+ clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
+ smp_mb__after_atomic();
+ wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
+}
+
/*
* There can be multiple RW segments.
*/
@@ -1807,7 +1803,6 @@ static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp)
{
struct pnfs_layout_segment *lseg, *tmp;
- unsigned long *bitlock = &NFS_I(inode)->flags;
/* Matched by references in pnfs_set_layoutcommit */
list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) {
@@ -1815,9 +1810,7 @@ static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *lis
pnfs_put_lseg(lseg);
}
- clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
- smp_mb__after_clear_bit();
- wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
+ pnfs_clear_layoutcommitting(inode);
}
void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
@@ -1827,7 +1820,7 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
void
-pnfs_set_layoutcommit(struct nfs_write_data *wdata)
+pnfs_set_layoutcommit(struct nfs_pgio_data *wdata)
{
struct nfs_pgio_header *hdr = wdata->header;
struct inode *inode = hdr->inode;
@@ -1881,43 +1874,37 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
struct nfs4_layoutcommit_data *data;
struct nfs_inode *nfsi = NFS_I(inode);
loff_t end_pos;
- int status = 0;
-
- dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
+ int status;
- if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
+ if (!pnfs_layoutcommit_outstanding(inode))
return 0;
- /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
- data = kzalloc(sizeof(*data), GFP_NOFS);
- if (!data) {
- status = -ENOMEM;
- goto out;
- }
-
- if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
- goto out_free;
+ dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
+ status = -EAGAIN;
if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
- if (!sync) {
- status = -EAGAIN;
- goto out_free;
- }
- status = wait_on_bit_lock(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING,
- nfs_wait_bit_killable, TASK_KILLABLE);
+ if (!sync)
+ goto out;
+ status = wait_on_bit_lock(&nfsi->flags,
+ NFS_INO_LAYOUTCOMMITTING,
+ nfs_wait_bit_killable,
+ TASK_KILLABLE);
if (status)
- goto out_free;
+ goto out;
}
- INIT_LIST_HEAD(&data->lseg_list);
+ status = -ENOMEM;
+ /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
+ data = kzalloc(sizeof(*data), GFP_NOFS);
+ if (!data)
+ goto clear_layoutcommitting;
+
+ status = 0;
spin_lock(&inode->i_lock);
- if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
- clear_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags);
- spin_unlock(&inode->i_lock);
- wake_up_bit(&nfsi->flags, NFS_INO_LAYOUTCOMMITTING);
- goto out_free;
- }
+ if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
+ goto out_unlock;
+ INIT_LIST_HEAD(&data->lseg_list);
pnfs_list_write_lseg(inode, &data->lseg_list);
end_pos = nfsi->layout->plh_lwb;
@@ -1940,8 +1927,11 @@ out:
mark_inode_dirty_sync(inode);
dprintk("<-- %s status %d\n", __func__, status);
return status;
-out_free:
+out_unlock:
+ spin_unlock(&inode->i_lock);
kfree(data);
+clear_layoutcommitting:
+ pnfs_clear_layoutcommitting(inode);
goto out;
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index a4f41810a7f..4fb309a2b4c 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -113,8 +113,8 @@ struct pnfs_layoutdriver_type {
* Return PNFS_ATTEMPTED to indicate the layout code has attempted
* I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
*/
- enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
- enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
+ enum pnfs_try_status (*read_pagelist) (struct nfs_pgio_data *nfs_data);
+ enum pnfs_try_status (*write_pagelist) (struct nfs_pgio_data *nfs_data, int how);
void (*free_deviceid_node) (struct nfs4_deviceid_node *);
@@ -180,11 +180,6 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
-void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *,
- const struct nfs_pgio_completion_ops *);
-void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *,
- int, const struct nfs_pgio_completion_ops *);
-
void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
void unset_pnfs_layoutdriver(struct nfs_server *);
void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
@@ -192,7 +187,8 @@ int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req, u64 wb_size);
int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
-bool pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req);
+size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *prev, struct nfs_page *req);
void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg);
struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp);
void pnfs_free_lseg_list(struct list_head *tmp_list);
@@ -217,13 +213,13 @@ bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
-void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
+void pnfs_set_layoutcommit(struct nfs_pgio_data *wdata);
void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
int _pnfs_return_layout(struct inode *);
int pnfs_commit_and_return_layout(struct inode *);
-void pnfs_ld_write_done(struct nfs_write_data *);
-void pnfs_ld_read_done(struct nfs_read_data *);
+void pnfs_ld_write_done(struct nfs_pgio_data *);
+void pnfs_ld_read_done(struct nfs_pgio_data *);
struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
struct nfs_open_context *ctx,
loff_t pos,
@@ -275,7 +271,7 @@ pnfs_get_lseg(struct pnfs_layout_segment *lseg)
{
if (lseg) {
atomic_inc(&lseg->pls_refcount);
- smp_mb__after_atomic_inc();
+ smp_mb__after_atomic();
}
return lseg;
}
@@ -359,6 +355,15 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
PNFS_LAYOUTRET_ON_SETATTR;
}
+static inline bool
+pnfs_layoutcommit_outstanding(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
+ return test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags) != 0 ||
+ test_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags) != 0;
+}
+
static inline int pnfs_return_layout(struct inode *ino)
{
struct nfs_inode *nfsi = NFS_I(ino);
@@ -452,18 +457,6 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
{
}
-static inline void pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode,
- const struct nfs_pgio_completion_ops *compl_ops)
-{
- nfs_pageio_init_read(pgio, inode, compl_ops);
-}
-
-static inline void pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags,
- const struct nfs_pgio_completion_ops *compl_ops)
-{
- nfs_pageio_init_write(pgio, inode, ioflags, compl_ops);
-}
-
static inline int
pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how,
struct nfs_commit_info *cinfo)
@@ -515,6 +508,13 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
return false;
}
+static inline bool
+pnfs_layoutcommit_outstanding(struct inode *inode)
+{
+ return false;
+}
+
+
static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
{
return NULL;
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index fddbba2d9ef..c171ce1a8a3 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -357,30 +357,6 @@ nfs_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
}
static int
-nfs_proc_rename(struct inode *old_dir, struct qstr *old_name,
- struct inode *new_dir, struct qstr *new_name)
-{
- struct nfs_renameargs arg = {
- .old_dir = NFS_FH(old_dir),
- .old_name = old_name,
- .new_dir = NFS_FH(new_dir),
- .new_name = new_name,
- };
- struct rpc_message msg = {
- .rpc_proc = &nfs_procedures[NFSPROC_RENAME],
- .rpc_argp = &arg,
- };
- int status;
-
- dprintk("NFS call rename %s -> %s\n", old_name->name, new_name->name);
- status = rpc_call_sync(NFS_CLIENT(old_dir), &msg, 0);
- nfs_mark_for_revalidate(old_dir);
- nfs_mark_for_revalidate(new_dir);
- dprintk("NFS reply rename: %d\n", status);
- return status;
-}
-
-static int
nfs_proc_link(struct inode *inode, struct inode *dir, struct qstr *name)
{
struct nfs_linkargs arg = {
@@ -602,7 +578,7 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
return 0;
}
-static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
{
struct inode *inode = data->header->inode;
@@ -618,18 +594,18 @@ static int nfs_read_done(struct rpc_task *task, struct nfs_read_data *data)
return 0;
}
-static void nfs_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
+static void nfs_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
{
msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
}
-static int nfs_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
{
rpc_call_start(task);
return 0;
}
-static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
{
struct inode *inode = data->header->inode;
@@ -638,19 +614,13 @@ static int nfs_write_done(struct rpc_task *task, struct nfs_write_data *data)
return 0;
}
-static void nfs_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
+static void nfs_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
{
/* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */
data->args.stable = NFS_FILE_SYNC;
msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
}
-static int nfs_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data)
-{
- rpc_call_start(task);
- return 0;
-}
-
static void nfs_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
{
BUG();
@@ -745,7 +715,6 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
.unlink_setup = nfs_proc_unlink_setup,
.unlink_rpc_prepare = nfs_proc_unlink_rpc_prepare,
.unlink_done = nfs_proc_unlink_done,
- .rename = nfs_proc_rename,
.rename_setup = nfs_proc_rename_setup,
.rename_rpc_prepare = nfs_proc_rename_rpc_prepare,
.rename_done = nfs_proc_rename_done,
@@ -759,13 +728,10 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
.fsinfo = nfs_proc_fsinfo,
.pathconf = nfs_proc_pathconf,
.decode_dirent = nfs2_decode_dirent,
+ .pgio_rpc_prepare = nfs_proc_pgio_rpc_prepare,
.read_setup = nfs_proc_read_setup,
- .read_pageio_init = nfs_pageio_init_read,
- .read_rpc_prepare = nfs_proc_read_rpc_prepare,
.read_done = nfs_read_done,
.write_setup = nfs_proc_write_setup,
- .write_pageio_init = nfs_pageio_init_write,
- .write_rpc_prepare = nfs_proc_write_rpc_prepare,
.write_done = nfs_write_done,
.commit_setup = nfs_proc_commit_setup,
.commit_rpc_prepare = nfs_proc_commit_rpc_prepare,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 31db5c366b8..e818a475ca6 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -24,85 +24,24 @@
#include "internal.h"
#include "iostat.h"
#include "fscache.h"
+#include "pnfs.h"
#define NFSDBG_FACILITY NFSDBG_PAGECACHE
-static const struct nfs_pageio_ops nfs_pageio_read_ops;
-static const struct rpc_call_ops nfs_read_common_ops;
static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops;
+static const struct nfs_rw_ops nfs_rw_read_ops;
static struct kmem_cache *nfs_rdata_cachep;
-struct nfs_read_header *nfs_readhdr_alloc(void)
+static struct nfs_rw_header *nfs_readhdr_alloc(void)
{
- struct nfs_read_header *rhdr;
-
- rhdr = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
- if (rhdr) {
- struct nfs_pgio_header *hdr = &rhdr->header;
-
- INIT_LIST_HEAD(&hdr->pages);
- INIT_LIST_HEAD(&hdr->rpc_list);
- spin_lock_init(&hdr->lock);
- atomic_set(&hdr->refcnt, 0);
- }
- return rhdr;
+ return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
}
-EXPORT_SYMBOL_GPL(nfs_readhdr_alloc);
-static struct nfs_read_data *nfs_readdata_alloc(struct nfs_pgio_header *hdr,
- unsigned int pagecount)
+static void nfs_readhdr_free(struct nfs_rw_header *rhdr)
{
- struct nfs_read_data *data, *prealloc;
-
- prealloc = &container_of(hdr, struct nfs_read_header, header)->rpc_data;
- if (prealloc->header == NULL)
- data = prealloc;
- else
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- goto out;
-
- if (nfs_pgarray_set(&data->pages, pagecount)) {
- data->header = hdr;
- atomic_inc(&hdr->refcnt);
- } else {
- if (data != prealloc)
- kfree(data);
- data = NULL;
- }
-out:
- return data;
-}
-
-void nfs_readhdr_free(struct nfs_pgio_header *hdr)
-{
- struct nfs_read_header *rhdr = container_of(hdr, struct nfs_read_header, header);
-
kmem_cache_free(nfs_rdata_cachep, rhdr);
}
-EXPORT_SYMBOL_GPL(nfs_readhdr_free);
-
-void nfs_readdata_release(struct nfs_read_data *rdata)
-{
- struct nfs_pgio_header *hdr = rdata->header;
- struct nfs_read_header *read_header = container_of(hdr, struct nfs_read_header, header);
-
- put_nfs_open_context(rdata->args.context);
- if (rdata->pages.pagevec != rdata->pages.page_array)
- kfree(rdata->pages.pagevec);
- if (rdata == &read_header->rpc_data) {
- rdata->header = NULL;
- rdata = NULL;
- }
- if (atomic_dec_and_test(&hdr->refcnt))
- hdr->completion_ops->completion(hdr);
- /* Note: we only free the rpc_task after callbacks are done.
- * See the comment in rpc_free_task() for why
- */
- kfree(rdata);
-}
-EXPORT_SYMBOL_GPL(nfs_readdata_release);
static
int nfs_return_empty_page(struct page *page)
@@ -114,17 +53,24 @@ int nfs_return_empty_page(struct page *page)
}
void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio,
- struct inode *inode,
+ struct inode *inode, bool force_mds,
const struct nfs_pgio_completion_ops *compl_ops)
{
- nfs_pageio_init(pgio, inode, &nfs_pageio_read_ops, compl_ops,
- NFS_SERVER(inode)->rsize, 0);
+ struct nfs_server *server = NFS_SERVER(inode);
+ const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops;
+
+#ifdef CONFIG_NFS_V4_1
+ if (server->pnfs_curr_ld && !force_mds)
+ pg_ops = server->pnfs_curr_ld->pg_read_ops;
+#endif
+ nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops,
+ server->rsize, 0);
}
EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
{
- pgio->pg_ops = &nfs_pageio_read_ops;
+ pgio->pg_ops = &nfs_pgio_rw_ops;
pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize;
}
EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
@@ -139,7 +85,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
len = nfs_page_length(page);
if (len == 0)
return nfs_return_empty_page(page);
- new = nfs_create_request(ctx, inode, page, 0, len);
+ new = nfs_create_request(ctx, page, NULL, 0, len);
if (IS_ERR(new)) {
unlock_page(page);
return PTR_ERR(new);
@@ -147,7 +93,8 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
if (len < PAGE_CACHE_SIZE)
zero_user_segment(page, len, PAGE_CACHE_SIZE);
- NFS_PROTO(inode)->read_pageio_init(&pgio, inode, &nfs_async_read_completion_ops);
+ nfs_pageio_init_read(&pgio, inode, false,
+ &nfs_async_read_completion_ops);
nfs_pageio_add_request(&pgio, new);
nfs_pageio_complete(&pgio);
NFS_I(inode)->read_io += pgio.pg_bytes_written;
@@ -158,20 +105,31 @@ static void nfs_readpage_release(struct nfs_page *req)
{
struct inode *d_inode = req->wb_context->dentry->d_inode;
- if (PageUptodate(req->wb_page))
- nfs_readpage_to_fscache(d_inode, req->wb_page, 0);
+ dprintk("NFS: read done (%s/%llu %d@%lld)\n", d_inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(d_inode), req->wb_bytes,
+ (long long)req_offset(req));
- unlock_page(req->wb_page);
+ if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
+ if (PageUptodate(req->wb_page))
+ nfs_readpage_to_fscache(d_inode, req->wb_page, 0);
- dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
+ unlock_page(req->wb_page);
+ }
+
+ dprintk("NFS: read done (%s/%Lu %d@%Ld)\n",
req->wb_context->dentry->d_inode->i_sb->s_id,
- (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+ (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
req->wb_bytes,
(long long)req_offset(req));
nfs_release_request(req);
}
-/* Note io was page aligned */
+static void nfs_page_group_set_uptodate(struct nfs_page *req)
+{
+ if (nfs_page_group_sync_on_bit(req, PG_UPTODATE))
+ SetPageUptodate(req->wb_page);
+}
+
static void nfs_read_completion(struct nfs_pgio_header *hdr)
{
unsigned long bytes = 0;
@@ -181,21 +139,32 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr)
while (!list_empty(&hdr->pages)) {
struct nfs_page *req = nfs_list_entry(hdr->pages.next);
struct page *page = req->wb_page;
+ unsigned long start = req->wb_pgbase;
+ unsigned long end = req->wb_pgbase + req->wb_bytes;
if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
- if (bytes > hdr->good_bytes)
- zero_user(page, 0, PAGE_SIZE);
- else if (hdr->good_bytes - bytes < PAGE_SIZE)
- zero_user_segment(page,
- hdr->good_bytes & ~PAGE_MASK,
- PAGE_SIZE);
+ /* note: regions of the page not covered by a
+ * request are zeroed in nfs_readpage_async /
+ * readpage_async_filler */
+ if (bytes > hdr->good_bytes) {
+ /* nothing in this request was good, so zero
+ * the full extent of the request */
+ zero_user_segment(page, start, end);
+
+ } else if (hdr->good_bytes - bytes < req->wb_bytes) {
+ /* part of this request has good bytes, but
+ * not all. zero the bad bytes */
+ start += hdr->good_bytes - bytes;
+ WARN_ON(start < req->wb_pgbase);
+ zero_user_segment(page, start, end);
+ }
}
bytes += req->wb_bytes;
if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
if (bytes <= hdr->good_bytes)
- SetPageUptodate(page);
+ nfs_page_group_set_uptodate(req);
} else
- SetPageUptodate(page);
+ nfs_page_group_set_uptodate(req);
nfs_list_remove_request(req);
nfs_readpage_release(req);
}
@@ -203,95 +172,14 @@ out:
hdr->release(hdr);
}
-int nfs_initiate_read(struct rpc_clnt *clnt,
- struct nfs_read_data *data,
- const struct rpc_call_ops *call_ops, int flags)
+static void nfs_initiate_read(struct nfs_pgio_data *data, struct rpc_message *msg,
+ struct rpc_task_setup *task_setup_data, int how)
{
struct inode *inode = data->header->inode;
int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
- struct rpc_task *task;
- struct rpc_message msg = {
- .rpc_argp = &data->args,
- .rpc_resp = &data->res,
- .rpc_cred = data->header->cred,
- };
- struct rpc_task_setup task_setup_data = {
- .task = &data->task,
- .rpc_client = clnt,
- .rpc_message = &msg,
- .callback_ops = call_ops,
- .callback_data = data,
- .workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC | swap_flags | flags,
- };
-
- /* Set up the initial task struct. */
- NFS_PROTO(inode)->read_setup(data, &msg);
-
- dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ "
- "offset %llu)\n",
- data->task.tk_pid,
- inode->i_sb->s_id,
- (long long)NFS_FILEID(inode),
- data->args.count,
- (unsigned long long)data->args.offset);
-
- task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task))
- return PTR_ERR(task);
- rpc_put_task(task);
- return 0;
-}
-EXPORT_SYMBOL_GPL(nfs_initiate_read);
-/*
- * Set up the NFS read request struct
- */
-static void nfs_read_rpcsetup(struct nfs_read_data *data,
- unsigned int count, unsigned int offset)
-{
- struct nfs_page *req = data->header->req;
-
- data->args.fh = NFS_FH(data->header->inode);
- data->args.offset = req_offset(req) + offset;
- data->args.pgbase = req->wb_pgbase + offset;
- data->args.pages = data->pages.pagevec;
- data->args.count = count;
- data->args.context = get_nfs_open_context(req->wb_context);
- data->args.lock_context = req->wb_lock_context;
-
- data->res.fattr = &data->fattr;
- data->res.count = count;
- data->res.eof = 0;
- nfs_fattr_init(&data->fattr);
-}
-
-static int nfs_do_read(struct nfs_read_data *data,
- const struct rpc_call_ops *call_ops)
-{
- struct inode *inode = data->header->inode;
-
- return nfs_initiate_read(NFS_CLIENT(inode), data, call_ops, 0);
-}
-
-static int
-nfs_do_multiple_reads(struct list_head *head,
- const struct rpc_call_ops *call_ops)
-{
- struct nfs_read_data *data;
- int ret = 0;
-
- while (!list_empty(head)) {
- int ret2;
-
- data = list_first_entry(head, struct nfs_read_data, list);
- list_del_init(&data->list);
-
- ret2 = nfs_do_read(data, call_ops);
- if (ret == 0)
- ret = ret2;
- }
- return ret;
+ task_setup_data->flags |= swap_flags;
+ NFS_PROTO(inode)->read_setup(data, msg);
}
static void
@@ -311,143 +199,14 @@ static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
.completion = nfs_read_completion,
};
-static void nfs_pagein_error(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
-{
- set_bit(NFS_IOHDR_REDO, &hdr->flags);
- while (!list_empty(&hdr->rpc_list)) {
- struct nfs_read_data *data = list_first_entry(&hdr->rpc_list,
- struct nfs_read_data, list);
- list_del(&data->list);
- nfs_readdata_release(data);
- }
- desc->pg_completion_ops->error_cleanup(&desc->pg_list);
-}
-
-/*
- * Generate multiple requests to fill a single page.
- *
- * We optimize to reduce the number of read operations on the wire. If we
- * detect that we're reading a page, or an area of a page, that is past the
- * end of file, we do not generate NFS read operations but just clear the
- * parts of the page that would have come back zero from the server anyway.
- *
- * We rely on the cached value of i_size to make this determination; another
- * client can fill pages on the server past our cached end-of-file, but we
- * won't see the new data until our attribute cache is updated. This is more
- * or less conventional NFS client behavior.
- */
-static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
-{
- struct nfs_page *req = hdr->req;
- struct page *page = req->wb_page;
- struct nfs_read_data *data;
- size_t rsize = desc->pg_bsize, nbytes;
- unsigned int offset;
-
- offset = 0;
- nbytes = desc->pg_count;
- do {
- size_t len = min(nbytes,rsize);
-
- data = nfs_readdata_alloc(hdr, 1);
- if (!data) {
- nfs_pagein_error(desc, hdr);
- return -ENOMEM;
- }
- data->pages.pagevec[0] = page;
- nfs_read_rpcsetup(data, len, offset);
- list_add(&data->list, &hdr->rpc_list);
- nbytes -= len;
- offset += len;
- } while (nbytes != 0);
-
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &hdr->pages);
- desc->pg_rpc_callops = &nfs_read_common_ops;
- return 0;
-}
-
-static int nfs_pagein_one(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
-{
- struct nfs_page *req;
- struct page **pages;
- struct nfs_read_data *data;
- struct list_head *head = &desc->pg_list;
-
- data = nfs_readdata_alloc(hdr, nfs_page_array_len(desc->pg_base,
- desc->pg_count));
- if (!data) {
- nfs_pagein_error(desc, hdr);
- return -ENOMEM;
- }
-
- pages = data->pages.pagevec;
- while (!list_empty(head)) {
- req = nfs_list_entry(head->next);
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &hdr->pages);
- *pages++ = req->wb_page;
- }
-
- nfs_read_rpcsetup(data, desc->pg_count, 0);
- list_add(&data->list, &hdr->rpc_list);
- desc->pg_rpc_callops = &nfs_read_common_ops;
- return 0;
-}
-
-int nfs_generic_pagein(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
-{
- if (desc->pg_bsize < PAGE_CACHE_SIZE)
- return nfs_pagein_multi(desc, hdr);
- return nfs_pagein_one(desc, hdr);
-}
-EXPORT_SYMBOL_GPL(nfs_generic_pagein);
-
-static int nfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
-{
- struct nfs_read_header *rhdr;
- struct nfs_pgio_header *hdr;
- int ret;
-
- rhdr = nfs_readhdr_alloc();
- if (!rhdr) {
- desc->pg_completion_ops->error_cleanup(&desc->pg_list);
- return -ENOMEM;
- }
- hdr = &rhdr->header;
- nfs_pgheader_init(desc, hdr, nfs_readhdr_free);
- atomic_inc(&hdr->refcnt);
- ret = nfs_generic_pagein(desc, hdr);
- if (ret == 0)
- ret = nfs_do_multiple_reads(&hdr->rpc_list,
- desc->pg_rpc_callops);
- if (atomic_dec_and_test(&hdr->refcnt))
- hdr->completion_ops->completion(hdr);
- return ret;
-}
-
-static const struct nfs_pageio_ops nfs_pageio_read_ops = {
- .pg_test = nfs_generic_pg_test,
- .pg_doio = nfs_generic_pg_readpages,
-};
-
/*
* This is the callback from RPC telling us whether a reply was
* received or some error occurred (timeout or socket shutdown).
*/
-int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)
+static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data,
+ struct inode *inode)
{
- struct inode *inode = data->header->inode;
- int status;
-
- dprintk("NFS: %s: %5u, (status %d)\n", __func__, task->tk_pid,
- task->tk_status);
-
- status = NFS_PROTO(inode)->read_done(task, data);
+ int status = NFS_PROTO(inode)->read_done(task, data);
if (status != 0)
return status;
@@ -460,10 +219,10 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)
return 0;
}
-static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data)
+static void nfs_readpage_retry(struct rpc_task *task, struct nfs_pgio_data *data)
{
- struct nfs_readargs *argp = &data->args;
- struct nfs_readres *resp = &data->res;
+ struct nfs_pgio_args *argp = &data->args;
+ struct nfs_pgio_res *resp = &data->res;
/* This is a short read! */
nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD);
@@ -480,17 +239,11 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
rpc_restart_call_prepare(task);
}
-static void nfs_readpage_result_common(struct rpc_task *task, void *calldata)
+static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *data)
{
- struct nfs_read_data *data = calldata;
struct nfs_pgio_header *hdr = data->header;
- /* Note the only returns of nfs_readpage_result are 0 and -EAGAIN */
- if (nfs_readpage_result(task, data) != 0)
- return;
- if (task->tk_status < 0)
- nfs_set_pgio_error(hdr, task->tk_status, data->args.offset);
- else if (data->res.eof) {
+ if (data->res.eof) {
loff_t bound;
bound = data->args.offset + data->res.count;
@@ -505,26 +258,6 @@ static void nfs_readpage_result_common(struct rpc_task *task, void *calldata)
nfs_readpage_retry(task, data);
}
-static void nfs_readpage_release_common(void *calldata)
-{
- nfs_readdata_release(calldata);
-}
-
-void nfs_read_prepare(struct rpc_task *task, void *calldata)
-{
- struct nfs_read_data *data = calldata;
- int err;
- err = NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data);
- if (err)
- rpc_exit(task, err);
-}
-
-static const struct rpc_call_ops nfs_read_common_ops = {
- .rpc_call_prepare = nfs_read_prepare,
- .rpc_call_done = nfs_readpage_result_common,
- .rpc_release = nfs_readpage_release_common,
-};
-
/*
* Read a page over NFS.
* We read the page synchronously in the following case:
@@ -592,7 +325,6 @@ static int
readpage_async_filler(void *data, struct page *page)
{
struct nfs_readdesc *desc = (struct nfs_readdesc *)data;
- struct inode *inode = page_file_mapping(page)->host;
struct nfs_page *new;
unsigned int len;
int error;
@@ -601,7 +333,7 @@ readpage_async_filler(void *data, struct page *page)
if (len == 0)
return nfs_return_empty_page(page);
- new = nfs_create_request(desc->ctx, inode, page, 0, len);
+ new = nfs_create_request(desc->ctx, page, NULL, 0, len);
if (IS_ERR(new))
goto out_error;
@@ -630,9 +362,9 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
unsigned long npages;
int ret = -ESTALE;
- dprintk("NFS: nfs_readpages (%s/%Ld %d)\n",
+ dprintk("NFS: nfs_readpages (%s/%Lu %d)\n",
inode->i_sb->s_id,
- (long long)NFS_FILEID(inode),
+ (unsigned long long)NFS_FILEID(inode),
nr_pages);
nfs_inc_stats(inode, NFSIOS_VFSREADPAGES);
@@ -654,7 +386,8 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
if (ret == 0)
goto read_complete; /* all pages were read */
- NFS_PROTO(inode)->read_pageio_init(&pgio, inode, &nfs_async_read_completion_ops);
+ nfs_pageio_init_read(&pgio, inode, false,
+ &nfs_async_read_completion_ops);
ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
@@ -671,7 +404,7 @@ out:
int __init nfs_init_readpagecache(void)
{
nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
- sizeof(struct nfs_read_header),
+ sizeof(struct nfs_rw_header),
0, SLAB_HWCACHE_ALIGN,
NULL);
if (nfs_rdata_cachep == NULL)
@@ -684,3 +417,12 @@ void nfs_destroy_readpagecache(void)
{
kmem_cache_destroy(nfs_rdata_cachep);
}
+
+static const struct nfs_rw_ops nfs_rw_read_ops = {
+ .rw_mode = FMODE_READ,
+ .rw_alloc_header = nfs_readhdr_alloc,
+ .rw_free_header = nfs_readhdr_free,
+ .rw_done = nfs_readpage_done,
+ .rw_result = nfs_readpage_result,
+ .rw_initiate = nfs_initiate_read,
+};
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index a03b9c6f948..084af1060d7 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -497,7 +497,8 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour)
static const struct {
rpc_authflavor_t flavour;
const char *str;
- } sec_flavours[] = {
+ } sec_flavours[NFS_AUTH_INFO_MAX_FLAVORS] = {
+ /* update NFS_AUTH_INFO_MAX_FLAVORS when this list changes! */
{ RPC_AUTH_NULL, "null" },
{ RPC_AUTH_UNIX, "sys" },
{ RPC_AUTH_GSS_KRB5, "krb5" },
@@ -923,8 +924,7 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(void)
data->mount_server.port = NFS_UNSPEC_PORT;
data->nfs_server.port = NFS_UNSPEC_PORT;
data->nfs_server.protocol = XPRT_TRANSPORT_TCP;
- data->auth_flavors[0] = RPC_AUTH_MAXFLAVOR;
- data->auth_flavor_len = 0;
+ data->selected_flavor = RPC_AUTH_MAXFLAVOR;
data->minorversion = 0;
data->need_mount = true;
data->net = current->nsproxy->net_ns;
@@ -1019,14 +1019,53 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt)
}
}
-static void nfs_set_auth_parsed_mount_data(struct nfs_parsed_mount_data *data,
- rpc_authflavor_t pseudoflavor)
+/*
+ * Add 'flavor' to 'auth_info' if not already present.
+ * Returns true if 'flavor' ends up in the list, false otherwise
+ */
+static bool nfs_auth_info_add(struct nfs_auth_info *auth_info,
+ rpc_authflavor_t flavor)
{
- data->auth_flavors[0] = pseudoflavor;
- data->auth_flavor_len = 1;
+ unsigned int i;
+ unsigned int max_flavor_len = (sizeof(auth_info->flavors) /
+ sizeof(auth_info->flavors[0]));
+
+ /* make sure this flavor isn't already in the list */
+ for (i = 0; i < auth_info->flavor_len; i++) {
+ if (flavor == auth_info->flavors[i])
+ return true;
+ }
+
+ if (auth_info->flavor_len + 1 >= max_flavor_len) {
+ dfprintk(MOUNT, "NFS: too many sec= flavors\n");
+ return false;
+ }
+
+ auth_info->flavors[auth_info->flavor_len++] = flavor;
+ return true;
}
/*
+ * Return true if 'match' is in auth_info or auth_info is empty.
+ * Return false otherwise.
+ */
+bool nfs_auth_info_match(const struct nfs_auth_info *auth_info,
+ rpc_authflavor_t match)
+{
+ int i;
+
+ if (!auth_info->flavor_len)
+ return true;
+
+ for (i = 0; i < auth_info->flavor_len; i++) {
+ if (auth_info->flavors[i] == match)
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(nfs_auth_info_match);
+
+/*
* Parse the value of the 'sec=' option.
*/
static int nfs_parse_security_flavors(char *value,
@@ -1034,49 +1073,55 @@ static int nfs_parse_security_flavors(char *value,
{
substring_t args[MAX_OPT_ARGS];
rpc_authflavor_t pseudoflavor;
+ char *p;
dfprintk(MOUNT, "NFS: parsing sec=%s option\n", value);
- switch (match_token(value, nfs_secflavor_tokens, args)) {
- case Opt_sec_none:
- pseudoflavor = RPC_AUTH_NULL;
- break;
- case Opt_sec_sys:
- pseudoflavor = RPC_AUTH_UNIX;
- break;
- case Opt_sec_krb5:
- pseudoflavor = RPC_AUTH_GSS_KRB5;
- break;
- case Opt_sec_krb5i:
- pseudoflavor = RPC_AUTH_GSS_KRB5I;
- break;
- case Opt_sec_krb5p:
- pseudoflavor = RPC_AUTH_GSS_KRB5P;
- break;
- case Opt_sec_lkey:
- pseudoflavor = RPC_AUTH_GSS_LKEY;
- break;
- case Opt_sec_lkeyi:
- pseudoflavor = RPC_AUTH_GSS_LKEYI;
- break;
- case Opt_sec_lkeyp:
- pseudoflavor = RPC_AUTH_GSS_LKEYP;
- break;
- case Opt_sec_spkm:
- pseudoflavor = RPC_AUTH_GSS_SPKM;
- break;
- case Opt_sec_spkmi:
- pseudoflavor = RPC_AUTH_GSS_SPKMI;
- break;
- case Opt_sec_spkmp:
- pseudoflavor = RPC_AUTH_GSS_SPKMP;
- break;
- default:
- return 0;
+ while ((p = strsep(&value, ":")) != NULL) {
+ switch (match_token(p, nfs_secflavor_tokens, args)) {
+ case Opt_sec_none:
+ pseudoflavor = RPC_AUTH_NULL;
+ break;
+ case Opt_sec_sys:
+ pseudoflavor = RPC_AUTH_UNIX;
+ break;
+ case Opt_sec_krb5:
+ pseudoflavor = RPC_AUTH_GSS_KRB5;
+ break;
+ case Opt_sec_krb5i:
+ pseudoflavor = RPC_AUTH_GSS_KRB5I;
+ break;
+ case Opt_sec_krb5p:
+ pseudoflavor = RPC_AUTH_GSS_KRB5P;
+ break;
+ case Opt_sec_lkey:
+ pseudoflavor = RPC_AUTH_GSS_LKEY;
+ break;
+ case Opt_sec_lkeyi:
+ pseudoflavor = RPC_AUTH_GSS_LKEYI;
+ break;
+ case Opt_sec_lkeyp:
+ pseudoflavor = RPC_AUTH_GSS_LKEYP;
+ break;
+ case Opt_sec_spkm:
+ pseudoflavor = RPC_AUTH_GSS_SPKM;
+ break;
+ case Opt_sec_spkmi:
+ pseudoflavor = RPC_AUTH_GSS_SPKMI;
+ break;
+ case Opt_sec_spkmp:
+ pseudoflavor = RPC_AUTH_GSS_SPKMP;
+ break;
+ default:
+ dfprintk(MOUNT,
+ "NFS: sec= option '%s' not recognized\n", p);
+ return 0;
+ }
+
+ if (!nfs_auth_info_add(&mnt->auth_info, pseudoflavor))
+ return 0;
}
- mnt->flags |= NFS_MOUNT_SECFLAVOUR;
- nfs_set_auth_parsed_mount_data(mnt, pseudoflavor);
return 1;
}
@@ -1569,7 +1614,7 @@ static int nfs_parse_mount_options(char *raw,
goto out_minorversion_mismatch;
if (mnt->options & NFS_OPTION_MIGRATION &&
- mnt->version != 4 && mnt->minorversion != 0)
+ (mnt->version != 4 || mnt->minorversion != 0))
goto out_migration_misuse;
/*
@@ -1623,12 +1668,14 @@ out_security_failure:
}
/*
- * Ensure that the specified authtype in args->auth_flavors[0] is supported by
- * the server. Returns 0 if it's ok, and -EACCES if not.
+ * Ensure that a specified authtype in args->auth_info is supported by
+ * the server. Returns 0 and sets args->selected_flavor if it's ok, and
+ * -EACCES if not.
*/
-static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,
+static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
rpc_authflavor_t *server_authlist, unsigned int count)
{
+ rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;
unsigned int i;
/*
@@ -1640,17 +1687,20 @@ static int nfs_verify_authflavor(struct nfs_parsed_mount_data *args,
* can be used.
*/
for (i = 0; i < count; i++) {
- if (args->auth_flavors[0] == server_authlist[i] ||
- server_authlist[i] == RPC_AUTH_NULL)
+ flavor = server_authlist[i];
+
+ if (nfs_auth_info_match(&args->auth_info, flavor) ||
+ flavor == RPC_AUTH_NULL)
goto out;
}
- dfprintk(MOUNT, "NFS: auth flavor %u not supported by server\n",
- args->auth_flavors[0]);
+ dfprintk(MOUNT,
+ "NFS: specified auth flavors not supported by server\n");
return -EACCES;
out:
- dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
+ args->selected_flavor = flavor;
+ dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->selected_flavor);
return 0;
}
@@ -1738,9 +1788,10 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf
* Was a sec= authflavor specified in the options? First, verify
* whether the server supports it, and then just try to use it if so.
*/
- if (args->auth_flavor_len > 0) {
- status = nfs_verify_authflavor(args, authlist, authlist_len);
- dfprintk(MOUNT, "NFS: using auth flavor %u\n", args->auth_flavors[0]);
+ if (args->auth_info.flavor_len > 0) {
+ status = nfs_verify_authflavors(args, authlist, authlist_len);
+ dfprintk(MOUNT, "NFS: using auth flavor %u\n",
+ args->selected_flavor);
if (status)
return ERR_PTR(status);
return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
@@ -1769,7 +1820,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf
/* Fallthrough */
}
dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", flavor);
- nfs_set_auth_parsed_mount_data(args, flavor);
+ args->selected_flavor = flavor;
server = nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
if (!IS_ERR(server))
return server;
@@ -1785,7 +1836,7 @@ static struct nfs_server *nfs_try_mount_request(struct nfs_mount_info *mount_inf
/* Last chance! Try AUTH_UNIX */
dfprintk(MOUNT, "NFS: attempting to use auth flavor %u\n", RPC_AUTH_UNIX);
- nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX);
+ args->selected_flavor = RPC_AUTH_UNIX;
return nfs_mod->rpc_ops->create_server(mount_info, nfs_mod);
}
@@ -1972,9 +2023,9 @@ static int nfs23_validate_mount_data(void *options,
args->bsize = data->bsize;
if (data->flags & NFS_MOUNT_SECFLAVOUR)
- nfs_set_auth_parsed_mount_data(args, data->pseudoflavor);
+ args->selected_flavor = data->pseudoflavor;
else
- nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX);
+ args->selected_flavor = RPC_AUTH_UNIX;
if (!args->nfs_server.hostname)
goto out_nomem;
@@ -2108,9 +2159,6 @@ static int nfs_validate_text_mount_data(void *options,
nfs_set_port(sap, &args->nfs_server.port, port);
- if (args->auth_flavor_len > 1)
- goto out_bad_auth;
-
return nfs_parse_devname(dev_name,
&args->nfs_server.hostname,
max_namelen,
@@ -2130,21 +2178,31 @@ out_invalid_transport_udp:
out_no_address:
dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n");
return -EINVAL;
-
-out_bad_auth:
- dfprintk(MOUNT, "NFS: Too many RPC auth flavours specified\n");
- return -EINVAL;
}
+#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
+ | NFS_MOUNT_SECURE \
+ | NFS_MOUNT_TCP \
+ | NFS_MOUNT_VER3 \
+ | NFS_MOUNT_KERBEROS \
+ | NFS_MOUNT_NONLM \
+ | NFS_MOUNT_BROKEN_SUID \
+ | NFS_MOUNT_STRICTLOCK \
+ | NFS_MOUNT_UNSHARED \
+ | NFS_MOUNT_NORESVPORT \
+ | NFS_MOUNT_LEGACY_INTERFACE)
+
static int
nfs_compare_remount_data(struct nfs_server *nfss,
struct nfs_parsed_mount_data *data)
{
- if (data->flags != nfss->flags ||
+ if ((data->flags ^ nfss->flags) & NFS_MOUNT_CMP_FLAGMASK ||
data->rsize != nfss->rsize ||
data->wsize != nfss->wsize ||
+ data->version != nfss->nfs_client->rpc_ops->version ||
+ data->minorversion != nfss->nfs_client->cl_minorversion ||
data->retrans != nfss->client->cl_timeout->to_retries ||
- data->auth_flavors[0] != nfss->client->cl_auth->au_flavor ||
+ data->selected_flavor != nfss->client->cl_auth->au_flavor ||
data->acregmin != nfss->acregmin / HZ ||
data->acregmax != nfss->acregmax / HZ ||
data->acdirmin != nfss->acdirmin / HZ ||
@@ -2169,6 +2227,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
struct nfs4_mount_data *options4 = (struct nfs4_mount_data *)raw_data;
u32 nfsvers = nfss->nfs_client->rpc_ops->version;
+ sync_filesystem(sb);
+
/*
* Userspace mount programs that send binary options generally send
* them populated with default values. We have no way to know which
@@ -2189,7 +2249,8 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
data->rsize = nfss->rsize;
data->wsize = nfss->wsize;
data->retrans = nfss->client->cl_timeout->to_retries;
- nfs_set_auth_parsed_mount_data(data, nfss->client->cl_auth->au_flavor);
+ data->selected_flavor = nfss->client->cl_auth->au_flavor;
+ data->auth_info = nfss->auth_info;
data->acregmin = nfss->acregmin / HZ;
data->acregmax = nfss->acregmax / HZ;
data->acdirmin = nfss->acdirmin / HZ;
@@ -2197,12 +2258,15 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
data->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ;
data->nfs_server.port = nfss->port;
data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen;
+ data->version = nfsvers;
+ data->minorversion = nfss->nfs_client->cl_minorversion;
+ data->net = current->nsproxy->net_ns;
memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
data->nfs_server.addrlen);
/* overwrite those values with any that were specified */
- error = nfs_parse_mount_options((char *)options, data);
- if (error < 0)
+ error = -EINVAL;
+ if (!nfs_parse_mount_options((char *)options, data))
goto out;
/*
@@ -2296,18 +2360,6 @@ void nfs_clone_super(struct super_block *sb, struct nfs_mount_info *mount_info)
nfs_initialise_sb(sb);
}
-#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
- | NFS_MOUNT_SECURE \
- | NFS_MOUNT_TCP \
- | NFS_MOUNT_VER3 \
- | NFS_MOUNT_KERBEROS \
- | NFS_MOUNT_NONLM \
- | NFS_MOUNT_BROKEN_SUID \
- | NFS_MOUNT_STRICTLOCK \
- | NFS_MOUNT_UNSHARED \
- | NFS_MOUNT_NORESVPORT \
- | NFS_MOUNT_LEGACY_INTERFACE)
-
static int nfs_compare_mount_options(const struct super_block *s, const struct nfs_server *b, int flags)
{
const struct nfs_server *a = s->s_fs_info;
@@ -2332,7 +2384,7 @@ static int nfs_compare_mount_options(const struct super_block *s, const struct n
goto Ebusy;
if (a->acdirmax != b->acdirmax)
goto Ebusy;
- if (b->flags & NFS_MOUNT_SECFLAVOUR &&
+ if (b->auth_info.flavor_len > 0 &&
clnt_a->cl_auth->au_flavor != clnt_b->cl_auth->au_flavor)
goto Ebusy;
return 1;
@@ -2530,6 +2582,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
mntroot = ERR_PTR(error);
goto error_splat_bdi;
}
+ server->super = s;
}
if (!s->s_root) {
@@ -2713,9 +2766,9 @@ static int nfs4_validate_mount_data(void *options,
data->auth_flavours,
sizeof(pseudoflavor)))
return -EFAULT;
- nfs_set_auth_parsed_mount_data(args, pseudoflavor);
+ args->selected_flavor = pseudoflavor;
} else
- nfs_set_auth_parsed_mount_data(args, RPC_AUTH_UNIX);
+ args->selected_flavor = RPC_AUTH_UNIX;
c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN);
if (IS_ERR(c))
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index 6b3f2535a3e..bb6ed810fa6 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -13,7 +13,7 @@
static struct ctl_table_header *nfs_callback_sysctl_table;
-static ctl_table nfs_cb_sysctls[] = {
+static struct ctl_table nfs_cb_sysctls[] = {
{
.procname = "nfs_mountpoint_timeout",
.data = &nfs_mountpoint_expiry_timeout,
@@ -31,7 +31,7 @@ static ctl_table nfs_cb_sysctls[] = {
{ }
};
-static ctl_table nfs_cb_sysctl_dir[] = {
+static struct ctl_table nfs_cb_sysctl_dir[] = {
{
.procname = "nfs",
.mode = 0555,
@@ -40,7 +40,7 @@ static ctl_table nfs_cb_sysctl_dir[] = {
{ }
};
-static ctl_table nfs_cb_sysctl_root[] = {
+static struct ctl_table nfs_cb_sysctl_root[] = {
{
.procname = "fs",
.mode = 0555,
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 8285de9eaad..de54129336c 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -14,6 +14,7 @@
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/namei.h>
+#include <linux/fsnotify.h>
#include "internal.h"
#include "nfs4_fs.h"
@@ -353,8 +354,8 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
return;
}
- if (task->tk_status != 0)
- nfs_cancel_async_unlink(old_dentry);
+ if (data->complete)
+ data->complete(task, data);
}
/**
@@ -399,9 +400,10 @@ static const struct rpc_call_ops nfs_rename_ops = {
*
* It's expected that valid references to the dentries and inodes are held
*/
-static struct rpc_task *
+struct rpc_task *
nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
- struct dentry *old_dentry, struct dentry *new_dentry)
+ struct dentry *old_dentry, struct dentry *new_dentry,
+ void (*complete)(struct rpc_task *, struct nfs_renamedata *))
{
struct nfs_renamedata *data;
struct rpc_message msg = { };
@@ -438,6 +440,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
data->new_dentry = dget(new_dentry);
nfs_fattr_init(&data->old_fattr);
nfs_fattr_init(&data->new_fattr);
+ data->complete = complete;
/* set up nfs_renameargs */
data->args.old_dir = NFS_FH(old_dir);
@@ -456,6 +459,27 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
return rpc_run_task(&task_setup_data);
}
+/*
+ * Perform tasks needed when a sillyrename is done such as cancelling the
+ * queued async unlink if it failed.
+ */
+static void
+nfs_complete_sillyrename(struct rpc_task *task, struct nfs_renamedata *data)
+{
+ struct dentry *dentry = data->old_dentry;
+
+ if (task->tk_status != 0) {
+ nfs_cancel_async_unlink(dentry);
+ return;
+ }
+
+ /*
+ * vfs_unlink and the like do not issue this when a file is
+ * sillyrenamed, so do it here.
+ */
+ fsnotify_nameremove(dentry, 0);
+}
+
#define SILLYNAME_PREFIX ".nfs"
#define SILLYNAME_PREFIX_LEN ((unsigned)sizeof(SILLYNAME_PREFIX) - 1)
#define SILLYNAME_FILEID_LEN ((unsigned)sizeof(u64) << 1)
@@ -493,7 +517,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
unsigned long long fileid;
struct dentry *sdentry;
struct rpc_task *task;
- int error = -EIO;
+ int error = -EBUSY;
dfprintk(VFS, "NFS: silly-rename(%pd2, ct=%d)\n",
dentry, d_count(dentry));
@@ -502,7 +526,6 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
/*
* We don't allow a dentry to be silly-renamed twice.
*/
- error = -EBUSY;
if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
goto out;
@@ -549,7 +572,8 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry)
}
/* run the rename task, undo unlink if it fails */
- task = nfs_async_rename(dir, dir, dentry, sdentry);
+ task = nfs_async_rename(dir, dir, dentry, sdentry,
+ nfs_complete_sillyrename);
if (IS_ERR(task)) {
error = -EBUSY;
nfs_cancel_async_unlink(dentry);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c1d548211c3..5e2f1030454 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -42,10 +42,11 @@
* Local function declarations
*/
static void nfs_redirty_request(struct nfs_page *req);
-static const struct rpc_call_ops nfs_write_common_ops;
static const struct rpc_call_ops nfs_commit_ops;
static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
+static const struct nfs_rw_ops nfs_rw_write_ops;
+static void nfs_clear_request_commit(struct nfs_page *req);
static struct kmem_cache *nfs_wdata_cachep;
static mempool_t *nfs_wdata_mempool;
@@ -70,76 +71,19 @@ void nfs_commit_free(struct nfs_commit_data *p)
}
EXPORT_SYMBOL_GPL(nfs_commit_free);
-struct nfs_write_header *nfs_writehdr_alloc(void)
+static struct nfs_rw_header *nfs_writehdr_alloc(void)
{
- struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
-
- if (p) {
- struct nfs_pgio_header *hdr = &p->header;
+ struct nfs_rw_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
+ if (p)
memset(p, 0, sizeof(*p));
- INIT_LIST_HEAD(&hdr->pages);
- INIT_LIST_HEAD(&hdr->rpc_list);
- spin_lock_init(&hdr->lock);
- atomic_set(&hdr->refcnt, 0);
- hdr->verf = &p->verf;
- }
return p;
}
-EXPORT_SYMBOL_GPL(nfs_writehdr_alloc);
-static struct nfs_write_data *nfs_writedata_alloc(struct nfs_pgio_header *hdr,
- unsigned int pagecount)
+static void nfs_writehdr_free(struct nfs_rw_header *whdr)
{
- struct nfs_write_data *data, *prealloc;
-
- prealloc = &container_of(hdr, struct nfs_write_header, header)->rpc_data;
- if (prealloc->header == NULL)
- data = prealloc;
- else
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- goto out;
-
- if (nfs_pgarray_set(&data->pages, pagecount)) {
- data->header = hdr;
- atomic_inc(&hdr->refcnt);
- } else {
- if (data != prealloc)
- kfree(data);
- data = NULL;
- }
-out:
- return data;
-}
-
-void nfs_writehdr_free(struct nfs_pgio_header *hdr)
-{
- struct nfs_write_header *whdr = container_of(hdr, struct nfs_write_header, header);
mempool_free(whdr, nfs_wdata_mempool);
}
-EXPORT_SYMBOL_GPL(nfs_writehdr_free);
-
-void nfs_writedata_release(struct nfs_write_data *wdata)
-{
- struct nfs_pgio_header *hdr = wdata->header;
- struct nfs_write_header *write_header = container_of(hdr, struct nfs_write_header, header);
-
- put_nfs_open_context(wdata->args.context);
- if (wdata->pages.pagevec != wdata->pages.page_array)
- kfree(wdata->pages.pagevec);
- if (wdata == &write_header->rpc_data) {
- wdata->header = NULL;
- wdata = NULL;
- }
- if (atomic_dec_and_test(&hdr->refcnt))
- hdr->completion_ops->completion(hdr);
- /* Note: we only free the rpc_task after callbacks are done.
- * See the comment in rpc_free_task() for why
- */
- kfree(wdata);
-}
-EXPORT_SYMBOL_GPL(nfs_writedata_release);
static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
{
@@ -148,8 +92,15 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
}
+/*
+ * nfs_page_find_head_request_locked - find head request associated with @page
+ *
+ * must be called while holding the inode lock.
+ *
+ * returns matching head request with reference held, or NULL if not found.
+ */
static struct nfs_page *
-nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page)
+nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page)
{
struct nfs_page *req = NULL;
@@ -161,25 +112,33 @@ nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page)
/* Linearly search the commit list for the correct req */
list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) {
if (freq->wb_page == page) {
- req = freq;
+ req = freq->wb_head;
break;
}
}
}
- if (req)
+ if (req) {
+ WARN_ON_ONCE(req->wb_head != req);
+
kref_get(&req->wb_kref);
+ }
return req;
}
-static struct nfs_page *nfs_page_find_request(struct page *page)
+/*
+ * nfs_page_find_head_request - find head request associated with @page
+ *
+ * returns matching head request with reference held, or NULL if not found.
+ */
+static struct nfs_page *nfs_page_find_head_request(struct page *page)
{
struct inode *inode = page_file_mapping(page)->host;
struct nfs_page *req = NULL;
spin_lock(&inode->i_lock);
- req = nfs_page_find_request_locked(NFS_I(inode), page);
+ req = nfs_page_find_head_request_locked(NFS_I(inode), page);
spin_unlock(&inode->i_lock);
return req;
}
@@ -211,18 +170,78 @@ static void nfs_set_pageerror(struct page *page)
nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page));
}
+/*
+ * nfs_page_group_search_locked
+ * @head - head request of page group
+ * @page_offset - offset into page
+ *
+ * Search page group with head @head to find a request that contains the
+ * page offset @page_offset.
+ *
+ * Returns a pointer to the first matching nfs request, or NULL if no
+ * match is found.
+ *
+ * Must be called with the page group lock held
+ */
+static struct nfs_page *
+nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset)
+{
+ struct nfs_page *req;
+
+ WARN_ON_ONCE(head != head->wb_head);
+ WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_head->wb_flags));
+
+ req = head;
+ do {
+ if (page_offset >= req->wb_pgbase &&
+ page_offset < (req->wb_pgbase + req->wb_bytes))
+ return req;
+
+ req = req->wb_this_page;
+ } while (req != head);
+
+ return NULL;
+}
+
+/*
+ * nfs_page_group_covers_page
+ * @head - head request of page group
+ *
+ * Return true if the page group with head @head covers the whole page,
+ * returns false otherwise
+ */
+static bool nfs_page_group_covers_page(struct nfs_page *req)
+{
+ struct nfs_page *tmp;
+ unsigned int pos = 0;
+ unsigned int len = nfs_page_length(req->wb_page);
+
+ nfs_page_group_lock(req);
+
+ do {
+ tmp = nfs_page_group_search_locked(req->wb_head, pos);
+ if (tmp) {
+ /* no way this should happen */
+ WARN_ON_ONCE(tmp->wb_pgbase != pos);
+ pos += tmp->wb_bytes - (pos - tmp->wb_pgbase);
+ }
+ } while (tmp && pos < len);
+
+ nfs_page_group_unlock(req);
+ WARN_ON_ONCE(pos > len);
+ return pos == len;
+}
+
/* We can set the PG_uptodate flag if we see that a write request
* covers the full page.
*/
-static void nfs_mark_uptodate(struct page *page, unsigned int base, unsigned int count)
+static void nfs_mark_uptodate(struct nfs_page *req)
{
- if (PageUptodate(page))
- return;
- if (base != 0)
+ if (PageUptodate(req->wb_page))
return;
- if (count != nfs_page_length(page))
+ if (!nfs_page_group_covers_page(req))
return;
- SetPageUptodate(page);
+ SetPageUptodate(req->wb_page);
}
static int wb_priority(struct writeback_control *wbc)
@@ -258,46 +277,259 @@ static void nfs_set_page_writeback(struct page *page)
}
}
-static void nfs_end_page_writeback(struct page *page)
+static void nfs_end_page_writeback(struct nfs_page *req)
{
- struct inode *inode = page_file_mapping(page)->host;
+ struct inode *inode = page_file_mapping(req->wb_page)->host;
struct nfs_server *nfss = NFS_SERVER(inode);
- end_page_writeback(page);
+ if (!nfs_page_group_sync_on_bit(req, PG_WB_END))
+ return;
+
+ end_page_writeback(req->wb_page);
if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
}
-static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock)
+
+/* nfs_page_group_clear_bits
+ * @req - an nfs request
+ * clears all page group related bits from @req
+ */
+static void
+nfs_page_group_clear_bits(struct nfs_page *req)
+{
+ clear_bit(PG_TEARDOWN, &req->wb_flags);
+ clear_bit(PG_UNLOCKPAGE, &req->wb_flags);
+ clear_bit(PG_UPTODATE, &req->wb_flags);
+ clear_bit(PG_WB_END, &req->wb_flags);
+ clear_bit(PG_REMOVE, &req->wb_flags);
+}
+
+
+/*
+ * nfs_unroll_locks_and_wait - unlock all newly locked reqs and wait on @req
+ *
+ * this is a helper function for nfs_lock_and_join_requests
+ *
+ * @inode - inode associated with request page group, must be holding inode lock
+ * @head - head request of page group, must be holding head lock
+ * @req - request that couldn't lock and needs to wait on the req bit lock
+ * @nonblock - if true, don't actually wait
+ *
+ * NOTE: this must be called holding page_group bit lock and inode spin lock
+ * and BOTH will be released before returning.
+ *
+ * returns 0 on success, < 0 on error.
+ */
+static int
+nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head,
+ struct nfs_page *req, bool nonblock)
+ __releases(&inode->i_lock)
+{
+ struct nfs_page *tmp;
+ int ret;
+
+ /* relinquish all the locks successfully grabbed this run */
+ for (tmp = head ; tmp != req; tmp = tmp->wb_this_page)
+ nfs_unlock_request(tmp);
+
+ WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
+
+ /* grab a ref on the request that will be waited on */
+ kref_get(&req->wb_kref);
+
+ nfs_page_group_unlock(head);
+ spin_unlock(&inode->i_lock);
+
+ /* release ref from nfs_page_find_head_request_locked */
+ nfs_release_request(head);
+
+ if (!nonblock)
+ ret = nfs_wait_on_request(req);
+ else
+ ret = -EAGAIN;
+ nfs_release_request(req);
+
+ return ret;
+}
+
+/*
+ * nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests
+ *
+ * @destroy_list - request list (using wb_this_page) terminated by @old_head
+ * @old_head - the old head of the list
+ *
+ * All subrequests must be locked and removed from all lists, so at this point
+ * they are only "active" in this function, and possibly in nfs_wait_on_request
+ * with a reference held by some other context.
+ */
+static void
+nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
+ struct nfs_page *old_head)
+{
+ while (destroy_list) {
+ struct nfs_page *subreq = destroy_list;
+
+ destroy_list = (subreq->wb_this_page == old_head) ?
+ NULL : subreq->wb_this_page;
+
+ WARN_ON_ONCE(old_head != subreq->wb_head);
+
+ /* make sure old group is not used */
+ subreq->wb_head = subreq;
+ subreq->wb_this_page = subreq;
+
+ nfs_clear_request_commit(subreq);
+
+ /* subreq is now totally disconnected from page group or any
+ * write / commit lists. last chance to wake any waiters */
+ nfs_unlock_request(subreq);
+
+ if (!test_bit(PG_TEARDOWN, &subreq->wb_flags)) {
+ /* release ref on old head request */
+ nfs_release_request(old_head);
+
+ nfs_page_group_clear_bits(subreq);
+
+ /* release the PG_INODE_REF reference */
+ if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags))
+ nfs_release_request(subreq);
+ else
+ WARN_ON_ONCE(1);
+ } else {
+ WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags));
+ /* zombie requests have already released the last
+ * reference and were waiting on the rest of the
+ * group to complete. Since it's no longer part of a
+ * group, simply free the request */
+ nfs_page_group_clear_bits(subreq);
+ nfs_free_request(subreq);
+ }
+ }
+}
+
+/*
+ * nfs_lock_and_join_requests - join all subreqs to the head req and return
+ * a locked reference, cancelling any pending
+ * operations for this page.
+ *
+ * @page - the page used to lookup the "page group" of nfs_page structures
+ * @nonblock - if true, don't block waiting for request locks
+ *
+ * This function joins all sub requests to the head request by first
+ * locking all requests in the group, cancelling any pending operations
+ * and finally updating the head request to cover the whole range covered by
+ * the (former) group. All subrequests are removed from any write or commit
+ * lists, unlinked from the group and destroyed.
+ *
+ * Returns a locked, referenced pointer to the head request - which after
+ * this call is guaranteed to be the only request associated with the page.
+ * Returns NULL if no requests are found for @page, or a ERR_PTR if an
+ * error was encountered.
+ */
+static struct nfs_page *
+nfs_lock_and_join_requests(struct page *page, bool nonblock)
{
struct inode *inode = page_file_mapping(page)->host;
- struct nfs_page *req;
+ struct nfs_page *head, *subreq;
+ struct nfs_page *destroy_list = NULL;
+ unsigned int total_bytes;
int ret;
+try_again:
+ total_bytes = 0;
+
+ WARN_ON_ONCE(destroy_list);
+
spin_lock(&inode->i_lock);
- for (;;) {
- req = nfs_page_find_request_locked(NFS_I(inode), page);
- if (req == NULL)
- break;
- if (nfs_lock_request(req))
- break;
- /* Note: If we hold the page lock, as is the case in nfs_writepage,
- * then the call to nfs_lock_request() will always
- * succeed provided that someone hasn't already marked the
- * request as dirty (in which case we don't care).
- */
+
+ /*
+ * A reference is taken only on the head request which acts as a
+ * reference to the whole page group - the group will not be destroyed
+ * until the head reference is released.
+ */
+ head = nfs_page_find_head_request_locked(NFS_I(inode), page);
+
+ if (!head) {
spin_unlock(&inode->i_lock);
- if (!nonblock)
- ret = nfs_wait_on_request(req);
- else
- ret = -EAGAIN;
- nfs_release_request(req);
- if (ret != 0)
+ return NULL;
+ }
+
+ /* lock each request in the page group */
+ nfs_page_group_lock(head);
+ subreq = head;
+ do {
+ /*
+ * Subrequests are always contiguous, non overlapping
+ * and in order. If not, it's a programming error.
+ */
+ WARN_ON_ONCE(subreq->wb_offset !=
+ (head->wb_offset + total_bytes));
+
+ /* keep track of how many bytes this group covers */
+ total_bytes += subreq->wb_bytes;
+
+ if (!nfs_lock_request(subreq)) {
+ /* releases page group bit lock and
+ * inode spin lock and all references */
+ ret = nfs_unroll_locks_and_wait(inode, head,
+ subreq, nonblock);
+
+ if (ret == 0)
+ goto try_again;
+
return ERR_PTR(ret);
- spin_lock(&inode->i_lock);
+ }
+
+ subreq = subreq->wb_this_page;
+ } while (subreq != head);
+
+ /* Now that all requests are locked, make sure they aren't on any list.
+ * Commit list removal accounting is done after locks are dropped */
+ subreq = head;
+ do {
+ nfs_list_remove_request(subreq);
+ subreq = subreq->wb_this_page;
+ } while (subreq != head);
+
+ /* unlink subrequests from head, destroy them later */
+ if (head->wb_this_page != head) {
+ /* destroy list will be terminated by head */
+ destroy_list = head->wb_this_page;
+ head->wb_this_page = head;
+
+ /* change head request to cover whole range that
+ * the former page group covered */
+ head->wb_bytes = total_bytes;
}
+
+ /*
+ * prepare head request to be added to new pgio descriptor
+ */
+ nfs_page_group_clear_bits(head);
+
+ /*
+ * some part of the group was still on the inode list - otherwise
+ * the group wouldn't be involved in async write.
+ * grab a reference for the head request, iff it needs one.
+ */
+ if (!test_and_set_bit(PG_INODE_REF, &head->wb_flags))
+ kref_get(&head->wb_kref);
+
+ nfs_page_group_unlock(head);
+
+ /* drop lock to clear_request_commit the head req and clean up
+ * requests on destroy list */
spin_unlock(&inode->i_lock);
- return req;
+
+ nfs_destroy_unlinked_subrequests(destroy_list, head);
+
+ /* clean up commit list state */
+ nfs_clear_request_commit(head);
+
+ /* still holds ref on head from nfs_page_find_head_request_locked
+ * and still has lock on head from lock loop */
+ return head;
}
/*
@@ -310,7 +542,7 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req;
int ret = 0;
- req = nfs_find_and_lock_request(page, nonblock);
+ req = nfs_lock_and_join_requests(page, nonblock);
if (!req)
goto out;
ret = PTR_ERR(req);
@@ -354,10 +586,8 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
struct nfs_pageio_descriptor pgio;
int err;
- NFS_PROTO(page_file_mapping(page)->host)->write_pageio_init(&pgio,
- page->mapping->host,
- wb_priority(wbc),
- &nfs_async_write_completion_ops);
+ nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc),
+ false, &nfs_async_write_completion_ops);
err = nfs_do_writepage(page, wbc, &pgio);
nfs_pageio_complete(&pgio);
if (err < 0)
@@ -400,12 +630,13 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
- NFS_PROTO(inode)->write_pageio_init(&pgio, inode, wb_priority(wbc), &nfs_async_write_completion_ops);
+ nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false,
+ &nfs_async_write_completion_ops);
err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
nfs_pageio_complete(&pgio);
clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(bitlock, NFS_INO_FLUSHING);
if (err < 0)
@@ -425,6 +656,8 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
{
struct nfs_inode *nfsi = NFS_I(inode);
+ WARN_ON_ONCE(req->wb_this_page != req);
+
/* Lock the request! */
nfs_lock_request(req);
@@ -441,6 +674,9 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
set_page_private(req->wb_page, (unsigned long)req);
}
nfsi->npages++;
+ /* this a head request for a page group - mark it as having an
+ * extra reference so sub groups can follow suit */
+ WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags));
kref_get(&req->wb_kref);
spin_unlock(&inode->i_lock);
}
@@ -452,16 +688,23 @@ static void nfs_inode_remove_request(struct nfs_page *req)
{
struct inode *inode = req->wb_context->dentry->d_inode;
struct nfs_inode *nfsi = NFS_I(inode);
+ struct nfs_page *head;
- spin_lock(&inode->i_lock);
- if (likely(!PageSwapCache(req->wb_page))) {
- set_page_private(req->wb_page, 0);
- ClearPagePrivate(req->wb_page);
- clear_bit(PG_MAPPED, &req->wb_flags);
+ if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
+ head = req->wb_head;
+
+ spin_lock(&inode->i_lock);
+ if (likely(!PageSwapCache(head->wb_page))) {
+ set_page_private(head->wb_page, 0);
+ ClearPagePrivate(head->wb_page);
+ clear_bit(PG_MAPPED, &head->wb_flags);
+ }
+ nfsi->npages--;
+ spin_unlock(&inode->i_lock);
}
- nfsi->npages--;
- spin_unlock(&inode->i_lock);
- nfs_release_request(req);
+
+ if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
+ nfs_release_request(req);
}
static void
@@ -583,7 +826,7 @@ nfs_clear_request_commit(struct nfs_page *req)
}
static inline
-int nfs_write_need_commit(struct nfs_write_data *data)
+int nfs_write_need_commit(struct nfs_pgio_data *data)
{
if (data->verf.committed == NFS_DATA_SYNC)
return data->header->lseg == NULL;
@@ -614,7 +857,7 @@ nfs_clear_request_commit(struct nfs_page *req)
}
static inline
-int nfs_write_need_commit(struct nfs_write_data *data)
+int nfs_write_need_commit(struct nfs_pgio_data *data)
{
return 0;
}
@@ -645,7 +888,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
goto next;
}
if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
- memcpy(&req->wb_verf, &hdr->verf->verifier, sizeof(req->wb_verf));
+ memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
nfs_mark_request_commit(req, hdr->lseg, &cinfo);
goto next;
}
@@ -653,7 +896,7 @@ remove_req:
nfs_inode_remove_request(req);
next:
nfs_unlock_request(req);
- nfs_end_page_writeback(req->wb_page);
+ nfs_end_page_writeback(req);
nfs_release_request(req);
}
out:
@@ -661,7 +904,7 @@ out:
}
#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
-static unsigned long
+unsigned long
nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
{
return cinfo->mds->ncommit;
@@ -718,7 +961,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
}
#else
-static unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
+unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
{
return 0;
}
@@ -754,10 +997,14 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
spin_lock(&inode->i_lock);
for (;;) {
- req = nfs_page_find_request_locked(NFS_I(inode), page);
+ req = nfs_page_find_head_request_locked(NFS_I(inode), page);
if (req == NULL)
goto out_unlock;
+ /* should be handled by nfs_flush_incompatible */
+ WARN_ON_ONCE(req->wb_head != req);
+ WARN_ON_ONCE(req->wb_this_page != req);
+
rqend = req->wb_offset + req->wb_bytes;
/*
* Tell the caller to flush out the request if
@@ -819,7 +1066,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
req = nfs_try_to_update_request(inode, page, offset, bytes);
if (req != NULL)
goto out;
- req = nfs_create_request(ctx, inode, page, offset, bytes);
+ req = nfs_create_request(ctx, page, NULL, offset, bytes);
if (IS_ERR(req))
goto out;
nfs_inode_add_request(inode, req);
@@ -837,7 +1084,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page,
return PTR_ERR(req);
/* Update file length */
nfs_grow_file(page, offset, count);
- nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes);
+ nfs_mark_uptodate(req);
nfs_mark_request_dirty(req);
nfs_unlock_and_release_request(req);
return 0;
@@ -858,11 +1105,13 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
* dropped page.
*/
do {
- req = nfs_page_find_request(page);
+ req = nfs_page_find_head_request(page);
if (req == NULL)
return 0;
l_ctx = req->wb_lock_context;
do_flush = req->wb_page != page || req->wb_context != ctx;
+ /* for now, flush if more than 1 request in page_group */
+ do_flush |= req->wb_this_page != req;
if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
do_flush |= l_ctx->lockowner.l_owner != current->files
|| l_ctx->lockowner.l_pid != current->tgid;
@@ -909,11 +1158,18 @@ bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx)
*/
static bool nfs_write_pageuptodate(struct page *page, struct inode *inode)
{
+ struct nfs_inode *nfsi = NFS_I(inode);
+
if (nfs_have_delegated_attributes(inode))
goto out;
- if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_DATA|NFS_INO_REVAL_PAGECACHE))
+ if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE)
+ return false;
+ smp_rmb();
+ if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags))
return false;
out:
+ if (nfsi->cache_validity & NFS_INO_INVALID_DATA)
+ return false;
return PageUptodate(page) != 0;
}
@@ -922,19 +1178,20 @@ out:
* extend the write to cover the entire page in order to avoid fragmentation
* inefficiencies.
*
- * If the file is opened for synchronous writes or if we have a write delegation
- * from the server then we can just skip the rest of the checks.
+ * If the file is opened for synchronous writes then we can just skip the rest
+ * of the checks.
*/
static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)
{
if (file->f_flags & O_DSYNC)
return 0;
+ if (!nfs_write_pageuptodate(page, inode))
+ return 0;
if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
return 1;
- if (nfs_write_pageuptodate(page, inode) && (inode->i_flock == NULL ||
- (inode->i_flock->fl_start == 0 &&
+ if (inode->i_flock == NULL || (inode->i_flock->fl_start == 0 &&
inode->i_flock->fl_end == OFFSET_MAX &&
- inode->i_flock->fl_type != F_RDLCK)))
+ inode->i_flock->fl_type != F_RDLCK))
return 1;
return 0;
}
@@ -984,126 +1241,17 @@ static int flush_task_priority(int how)
return RPC_PRIORITY_NORMAL;
}
-int nfs_initiate_write(struct rpc_clnt *clnt,
- struct nfs_write_data *data,
- const struct rpc_call_ops *call_ops,
- int how, int flags)
+static void nfs_initiate_write(struct nfs_pgio_data *data, struct rpc_message *msg,
+ struct rpc_task_setup *task_setup_data, int how)
{
struct inode *inode = data->header->inode;
int priority = flush_task_priority(how);
- struct rpc_task *task;
- struct rpc_message msg = {
- .rpc_argp = &data->args,
- .rpc_resp = &data->res,
- .rpc_cred = data->header->cred,
- };
- struct rpc_task_setup task_setup_data = {
- .rpc_client = clnt,
- .task = &data->task,
- .rpc_message = &msg,
- .callback_ops = call_ops,
- .callback_data = data,
- .workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC | flags,
- .priority = priority,
- };
- int ret = 0;
-
- /* Set up the initial task struct. */
- NFS_PROTO(inode)->write_setup(data, &msg);
- dprintk("NFS: %5u initiated write call "
- "(req %s/%lld, %u bytes @ offset %llu)\n",
- data->task.tk_pid,
- inode->i_sb->s_id,
- (long long)NFS_FILEID(inode),
- data->args.count,
- (unsigned long long)data->args.offset);
+ task_setup_data->priority = priority;
+ NFS_PROTO(inode)->write_setup(data, msg);
nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client,
- &task_setup_data.rpc_client, &msg, data);
-
- task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task)) {
- ret = PTR_ERR(task);
- goto out;
- }
- if (how & FLUSH_SYNC) {
- ret = rpc_wait_for_completion_task(task);
- if (ret == 0)
- ret = task->tk_status;
- }
- rpc_put_task(task);
-out:
- return ret;
-}
-EXPORT_SYMBOL_GPL(nfs_initiate_write);
-
-/*
- * Set up the argument/result storage required for the RPC call.
- */
-static void nfs_write_rpcsetup(struct nfs_write_data *data,
- unsigned int count, unsigned int offset,
- int how, struct nfs_commit_info *cinfo)
-{
- struct nfs_page *req = data->header->req;
-
- /* Set up the RPC argument and reply structs
- * NB: take care not to mess about with data->commit et al. */
-
- data->args.fh = NFS_FH(data->header->inode);
- data->args.offset = req_offset(req) + offset;
- /* pnfs_set_layoutcommit needs this */
- data->mds_offset = data->args.offset;
- data->args.pgbase = req->wb_pgbase + offset;
- data->args.pages = data->pages.pagevec;
- data->args.count = count;
- data->args.context = get_nfs_open_context(req->wb_context);
- data->args.lock_context = req->wb_lock_context;
- data->args.stable = NFS_UNSTABLE;
- switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
- case 0:
- break;
- case FLUSH_COND_STABLE:
- if (nfs_reqs_to_commit(cinfo))
- break;
- default:
- data->args.stable = NFS_FILE_SYNC;
- }
-
- data->res.fattr = &data->fattr;
- data->res.count = count;
- data->res.verf = &data->verf;
- nfs_fattr_init(&data->fattr);
-}
-
-static int nfs_do_write(struct nfs_write_data *data,
- const struct rpc_call_ops *call_ops,
- int how)
-{
- struct inode *inode = data->header->inode;
-
- return nfs_initiate_write(NFS_CLIENT(inode), data, call_ops, how, 0);
-}
-
-static int nfs_do_multiple_writes(struct list_head *head,
- const struct rpc_call_ops *call_ops,
- int how)
-{
- struct nfs_write_data *data;
- int ret = 0;
-
- while (!list_empty(head)) {
- int ret2;
-
- data = list_first_entry(head, struct nfs_write_data, list);
- list_del_init(&data->list);
-
- ret2 = nfs_do_write(data, call_ops, how);
- if (ret == 0)
- ret = ret2;
- }
- return ret;
+ &task_setup_data->rpc_client, msg, data);
}
/* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -1114,7 +1262,7 @@ static void nfs_redirty_request(struct nfs_page *req)
{
nfs_mark_request_dirty(req);
nfs_unlock_request(req);
- nfs_end_page_writeback(req->wb_page);
+ nfs_end_page_writeback(req);
nfs_release_request(req);
}
@@ -1134,173 +1282,30 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = {
.completion = nfs_write_completion,
};
-static void nfs_flush_error(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
-{
- set_bit(NFS_IOHDR_REDO, &hdr->flags);
- while (!list_empty(&hdr->rpc_list)) {
- struct nfs_write_data *data = list_first_entry(&hdr->rpc_list,
- struct nfs_write_data, list);
- list_del(&data->list);
- nfs_writedata_release(data);
- }
- desc->pg_completion_ops->error_cleanup(&desc->pg_list);
-}
-
-/*
- * Generate multiple small requests to write out a single
- * contiguous dirty area on one page.
- */
-static int nfs_flush_multi(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
-{
- struct nfs_page *req = hdr->req;
- struct page *page = req->wb_page;
- struct nfs_write_data *data;
- size_t wsize = desc->pg_bsize, nbytes;
- unsigned int offset;
- int requests = 0;
- struct nfs_commit_info cinfo;
-
- nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
-
- if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
- (desc->pg_moreio || nfs_reqs_to_commit(&cinfo) ||
- desc->pg_count > wsize))
- desc->pg_ioflags &= ~FLUSH_COND_STABLE;
-
-
- offset = 0;
- nbytes = desc->pg_count;
- do {
- size_t len = min(nbytes, wsize);
-
- data = nfs_writedata_alloc(hdr, 1);
- if (!data) {
- nfs_flush_error(desc, hdr);
- return -ENOMEM;
- }
- data->pages.pagevec[0] = page;
- nfs_write_rpcsetup(data, len, offset, desc->pg_ioflags, &cinfo);
- list_add(&data->list, &hdr->rpc_list);
- requests++;
- nbytes -= len;
- offset += len;
- } while (nbytes != 0);
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &hdr->pages);
- desc->pg_rpc_callops = &nfs_write_common_ops;
- return 0;
-}
-
-/*
- * Create an RPC task for the given write request and kick it.
- * The page must have been locked by the caller.
- *
- * It may happen that the page we're passed is not marked dirty.
- * This is the case if nfs_updatepage detects a conflicting request
- * that has been written but not committed.
- */
-static int nfs_flush_one(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
-{
- struct nfs_page *req;
- struct page **pages;
- struct nfs_write_data *data;
- struct list_head *head = &desc->pg_list;
- struct nfs_commit_info cinfo;
-
- data = nfs_writedata_alloc(hdr, nfs_page_array_len(desc->pg_base,
- desc->pg_count));
- if (!data) {
- nfs_flush_error(desc, hdr);
- return -ENOMEM;
- }
-
- nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
- pages = data->pages.pagevec;
- while (!list_empty(head)) {
- req = nfs_list_entry(head->next);
- nfs_list_remove_request(req);
- nfs_list_add_request(req, &hdr->pages);
- *pages++ = req->wb_page;
- }
-
- if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
- (desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
- desc->pg_ioflags &= ~FLUSH_COND_STABLE;
-
- /* Set up the argument struct */
- nfs_write_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
- list_add(&data->list, &hdr->rpc_list);
- desc->pg_rpc_callops = &nfs_write_common_ops;
- return 0;
-}
-
-int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
- struct nfs_pgio_header *hdr)
-{
- if (desc->pg_bsize < PAGE_CACHE_SIZE)
- return nfs_flush_multi(desc, hdr);
- return nfs_flush_one(desc, hdr);
-}
-EXPORT_SYMBOL_GPL(nfs_generic_flush);
-
-static int nfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
-{
- struct nfs_write_header *whdr;
- struct nfs_pgio_header *hdr;
- int ret;
-
- whdr = nfs_writehdr_alloc();
- if (!whdr) {
- desc->pg_completion_ops->error_cleanup(&desc->pg_list);
- return -ENOMEM;
- }
- hdr = &whdr->header;
- nfs_pgheader_init(desc, hdr, nfs_writehdr_free);
- atomic_inc(&hdr->refcnt);
- ret = nfs_generic_flush(desc, hdr);
- if (ret == 0)
- ret = nfs_do_multiple_writes(&hdr->rpc_list,
- desc->pg_rpc_callops,
- desc->pg_ioflags);
- if (atomic_dec_and_test(&hdr->refcnt))
- hdr->completion_ops->completion(hdr);
- return ret;
-}
-
-static const struct nfs_pageio_ops nfs_pageio_write_ops = {
- .pg_test = nfs_generic_pg_test,
- .pg_doio = nfs_generic_pg_writepages,
-};
-
void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
- struct inode *inode, int ioflags,
+ struct inode *inode, int ioflags, bool force_mds,
const struct nfs_pgio_completion_ops *compl_ops)
{
- nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops, compl_ops,
- NFS_SERVER(inode)->wsize, ioflags);
+ struct nfs_server *server = NFS_SERVER(inode);
+ const struct nfs_pageio_ops *pg_ops = &nfs_pgio_rw_ops;
+
+#ifdef CONFIG_NFS_V4_1
+ if (server->pnfs_curr_ld && !force_mds)
+ pg_ops = server->pnfs_curr_ld->pg_write_ops;
+#endif
+ nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops,
+ server->wsize, ioflags);
}
EXPORT_SYMBOL_GPL(nfs_pageio_init_write);
void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
{
- pgio->pg_ops = &nfs_pageio_write_ops;
+ pgio->pg_ops = &nfs_pgio_rw_ops;
pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
}
EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
-void nfs_write_prepare(struct rpc_task *task, void *calldata)
-{
- struct nfs_write_data *data = calldata;
- int err;
- err = NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data);
- if (err)
- rpc_exit(task, err);
-}
-
void nfs_commit_prepare(struct rpc_task *task, void *calldata)
{
struct nfs_commit_data *data = calldata;
@@ -1308,23 +1313,8 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
}
-/*
- * Handle a write reply that flushes a whole page.
- *
- * FIXME: There is an inherent race with invalidate_inode_pages and
- * writebacks since the page->count is kept > 1 for as long
- * as the page has a write request pending.
- */
-static void nfs_writeback_done_common(struct rpc_task *task, void *calldata)
-{
- struct nfs_write_data *data = calldata;
-
- nfs_writeback_done(task, data);
-}
-
-static void nfs_writeback_release_common(void *calldata)
+static void nfs_writeback_release_common(struct nfs_pgio_data *data)
{
- struct nfs_write_data *data = calldata;
struct nfs_pgio_header *hdr = data->header;
int status = data->task.tk_status;
@@ -1333,34 +1323,46 @@ static void nfs_writeback_release_common(void *calldata)
if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags))
; /* Do nothing */
else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags))
- memcpy(hdr->verf, &data->verf, sizeof(*hdr->verf));
- else if (memcmp(hdr->verf, &data->verf, sizeof(*hdr->verf)))
+ memcpy(&hdr->verf, &data->verf, sizeof(hdr->verf));
+ else if (memcmp(&hdr->verf, &data->verf, sizeof(hdr->verf)))
set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags);
spin_unlock(&hdr->lock);
}
- nfs_writedata_release(data);
}
-static const struct rpc_call_ops nfs_write_common_ops = {
- .rpc_call_prepare = nfs_write_prepare,
- .rpc_call_done = nfs_writeback_done_common,
- .rpc_release = nfs_writeback_release_common,
-};
+/*
+ * Special version of should_remove_suid() that ignores capabilities.
+ */
+static int nfs_should_remove_suid(const struct inode *inode)
+{
+ umode_t mode = inode->i_mode;
+ int kill = 0;
+ /* suid always must be killed */
+ if (unlikely(mode & S_ISUID))
+ kill = ATTR_KILL_SUID;
+
+ /*
+ * sgid without any exec bits is just a mandatory locking mark; leave
+ * it alone. If some exec bits are set, it's a real sgid; kill it.
+ */
+ if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
+ kill |= ATTR_KILL_SGID;
+
+ if (unlikely(kill && S_ISREG(mode)))
+ return kill;
+
+ return 0;
+}
/*
* This function is called when the WRITE call is complete.
*/
-void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
+static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
+ struct inode *inode)
{
- struct nfs_writeargs *argp = &data->args;
- struct nfs_writeres *resp = &data->res;
- struct inode *inode = data->header->inode;
int status;
- dprintk("NFS: %5u nfs_writeback_done (status %d)\n",
- task->tk_pid, task->tk_status);
-
/*
* ->write_done will attempt to use post-op attributes to detect
* conflicting writes by other clients. A strict interpretation
@@ -1370,11 +1372,11 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
*/
status = NFS_PROTO(inode)->write_done(task, data);
if (status != 0)
- return;
- nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
+ return status;
+ nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, data->res.count);
#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
- if (resp->verf->committed < argp->stable && task->tk_status >= 0) {
+ if (data->res.verf->committed < data->args.stable && task->tk_status >= 0) {
/* We tried a write call, but the server did not
* commit data to stable storage even though we
* requested it.
@@ -1390,18 +1392,31 @@ void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
dprintk("NFS: faulty NFS server %s:"
" (committed = %d) != (stable = %d)\n",
NFS_SERVER(inode)->nfs_client->cl_hostname,
- resp->verf->committed, argp->stable);
+ data->res.verf->committed, data->args.stable);
complain = jiffies + 300 * HZ;
}
}
#endif
- if (task->tk_status < 0)
- nfs_set_pgio_error(data->header, task->tk_status, argp->offset);
- else if (resp->count < argp->count) {
+
+ /* Deal with the suid/sgid bit corner case */
+ if (nfs_should_remove_suid(inode))
+ nfs_mark_for_revalidate(inode);
+ return 0;
+}
+
+/*
+ * This function is called when the WRITE call is complete.
+ */
+static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *data)
+{
+ struct nfs_pgio_args *argp = &data->args;
+ struct nfs_pgio_res *resp = &data->res;
+
+ if (resp->count < argp->count) {
static unsigned long complain;
/* This a short write! */
- nfs_inc_stats(inode, NFSIOS_SHORTWRITE);
+ nfs_inc_stats(data->header->inode, NFSIOS_SHORTWRITE);
/* Has the server at least made some progress? */
if (resp->count == 0) {
@@ -1452,7 +1467,7 @@ static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
static void nfs_commit_clear_lock(struct nfs_inode *nfsi)
{
clear_bit(NFS_INO_COMMIT, &nfsi->flags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
}
@@ -1606,9 +1621,9 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
nfs_list_remove_request(req);
nfs_clear_page_commit(req->wb_page);
- dprintk("NFS: commit (%s/%lld %d@%lld)",
+ dprintk("NFS: commit (%s/%llu %d@%lld)",
req->wb_context->dentry->d_sb->s_id,
- (long long)NFS_FILEID(req->wb_context->dentry->d_inode),
+ (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
req->wb_bytes,
(long long)req_offset(req));
if (status < 0) {
@@ -1782,27 +1797,28 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
struct nfs_page *req;
int ret = 0;
- for (;;) {
- wait_on_page_writeback(page);
- req = nfs_page_find_request(page);
- if (req == NULL)
- break;
- if (nfs_lock_request(req)) {
- nfs_clear_request_commit(req);
- nfs_inode_remove_request(req);
- /*
- * In case nfs_inode_remove_request has marked the
- * page as being dirty
- */
- cancel_dirty_page(page, PAGE_CACHE_SIZE);
- nfs_unlock_and_release_request(req);
- break;
- }
- ret = nfs_wait_on_request(req);
- nfs_release_request(req);
- if (ret < 0)
- break;
+ wait_on_page_writeback(page);
+
+ /* blocking call to cancel all requests and join to a single (head)
+ * request */
+ req = nfs_lock_and_join_requests(page, false);
+
+ if (IS_ERR(req)) {
+ ret = PTR_ERR(req);
+ } else if (req) {
+ /* all requests from this page have been cancelled by
+ * nfs_lock_and_join_requests, so just remove the head
+ * request from the inode / page_private pointer and
+ * release it */
+ nfs_inode_remove_request(req);
+ /*
+ * In case nfs_inode_remove_request has marked the
+ * page as being dirty
+ */
+ cancel_dirty_page(page, PAGE_CACHE_SIZE);
+ nfs_unlock_and_release_request(req);
}
+
return ret;
}
@@ -1868,7 +1884,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
int __init nfs_init_writepagecache(void)
{
nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
- sizeof(struct nfs_write_header),
+ sizeof(struct nfs_rw_header),
0, SLAB_HWCACHE_ALIGN,
NULL);
if (nfs_wdata_cachep == NULL)
@@ -1930,3 +1946,12 @@ void nfs_destroy_writepagecache(void)
kmem_cache_destroy(nfs_wdata_cachep);
}
+static const struct nfs_rw_ops nfs_rw_write_ops = {
+ .rw_mode = FMODE_WRITE,
+ .rw_alloc_header = nfs_writehdr_alloc,
+ .rw_free_header = nfs_writehdr_free,
+ .rw_release = nfs_writeback_release_common,
+ .rw_done = nfs_writeback_done,
+ .rw_result = nfs_writeback_result,
+ .rw_initiate = nfs_initiate_write,
+};
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index dc8f1ef665c..f994e750e0d 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -95,7 +95,7 @@ config NFSD_V4_SECURITY_LABEL
Smack policies on NFSv4 files, say N.
WARNING: there is still a chance of backwards-incompatible protocol changes.
- For now we recommend "Y" only for developers and testers."
+ For now we recommend "Y" only for developers and testers.
config NFSD_FAULT_INJECTION
bool "NFS server manual fault injection"
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
index 8b186a4955c..a986ceb6fd0 100644
--- a/fs/nfsd/acl.h
+++ b/fs/nfsd/acl.h
@@ -35,23 +35,25 @@
#ifndef LINUX_NFS4_ACL_H
#define LINUX_NFS4_ACL_H
-#include <linux/posix_acl.h>
+struct nfs4_acl;
+struct svc_fh;
+struct svc_rqst;
-/* Maximum ACL we'll accept from client; chosen (somewhat arbitrarily) to
- * fit in a page: */
-#define NFS4_ACL_MAX 170
+/*
+ * Maximum ACL we'll accept from a client; chosen (somewhat
+ * arbitrarily) so that kmalloc'ing the ACL shouldn't require a
+ * high-order allocation. This allows 204 ACEs on x86_64:
+ */
+#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \
+ / sizeof(struct nfs4_ace))
struct nfs4_acl *nfs4_acl_new(int);
int nfs4_acl_get_whotype(char *, u32);
-int nfs4_acl_write_who(int who, char *p);
-
-#define NFS4_ACL_TYPE_DEFAULT 0x01
-#define NFS4_ACL_DIR 0x02
-#define NFS4_ACL_OWNER 0x04
+__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who);
-struct nfs4_acl *nfs4_acl_posix_to_nfsv4(struct posix_acl *,
- struct posix_acl *, unsigned int flags);
-int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *, struct posix_acl **,
- struct posix_acl **, unsigned int flags);
+int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
+ struct nfs4_acl **acl);
+__be32 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfs4_acl *acl);
#endif /* LINUX_NFS4_ACL_H */
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 06cddd57226..72f44823adb 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -1,7 +1,6 @@
/* Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> */
#include <linux/sched.h>
-#include <linux/user_namespace.h>
#include "nfsd.h"
#include "auth.h"
@@ -25,7 +24,6 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
struct cred *new;
int i;
int flags = nfsexp_flags(rqstp, exp);
- int ret;
validate_process_creds();
@@ -71,10 +69,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
if (gid_eq(new->fsgid, INVALID_GID))
new->fsgid = exp->ex_anon_gid;
- ret = set_groups(new, gi);
+ set_groups(new, gi);
put_group_info(gi);
- if (ret < 0)
- goto error;
if (!uid_eq(new->fsuid, GLOBAL_ROOT_UID))
new->cap_effective = cap_drop_nfsd_set(new->cap_effective);
@@ -88,9 +84,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
return 0;
oom:
- ret = -ENOMEM;
-error:
abort_creds(new);
- return ret;
+ return -ENOMEM;
}
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index d5c5b3e0026..b582f9ab6b2 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -84,12 +84,4 @@ int nfsd_cache_lookup(struct svc_rqst *);
void nfsd_cache_update(struct svc_rqst *, int, __be32 *);
int nfsd_reply_cache_stats_open(struct inode *, struct file *);
-#ifdef CONFIG_NFSD_V4
-void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp);
-#else /* CONFIG_NFSD_V4 */
-static inline void nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
-{
-}
-#endif /* CONFIG_NFSD_V4 */
-
#endif /* NFSCACHE_H */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 5f38ea36e26..13b85f94d9e 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -17,17 +17,12 @@
#include <linux/exportfs.h>
#include <linux/sunrpc/svc_xprt.h>
-#include <net/ipv6.h>
-
#include "nfsd.h"
#include "nfsfh.h"
#include "netns.h"
#define NFSDDBG_FACILITY NFSDDBG_EXPORT
-typedef struct auth_domain svc_client;
-typedef struct svc_export svc_export;
-
/*
* We have two caches.
* One maps client+vfsmnt+dentry to export options - the export map
@@ -73,7 +68,7 @@ static struct svc_expkey *svc_expkey_lookup(struct cache_detail *cd, struct svc_
static int expkey_parse(struct cache_detail *cd, char *mesg, int mlen)
{
- /* client fsidtype fsid [path] */
+ /* client fsidtype fsid expiry [path] */
char *buf;
int len;
struct auth_domain *dom = NULL;
@@ -295,13 +290,19 @@ svc_expkey_update(struct cache_detail *cd, struct svc_expkey *new,
static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc)
{
+ struct nfsd4_fs_location *locations = fsloc->locations;
int i;
+ if (!locations)
+ return;
+
for (i = 0; i < fsloc->locations_count; i++) {
- kfree(fsloc->locations[i].path);
- kfree(fsloc->locations[i].hosts);
+ kfree(locations[i].path);
+ kfree(locations[i].hosts);
}
- kfree(fsloc->locations);
+
+ kfree(locations);
+ fsloc->locations = NULL;
}
static void svc_export_put(struct kref *ref)
@@ -388,6 +389,10 @@ fsloc_parse(char **mesg, char *buf, struct nfsd4_fs_locations *fsloc)
int len;
int migrated, i, err;
+ /* more than one fsloc */
+ if (fsloc->locations)
+ return -EINVAL;
+
/* listsize */
err = get_uint(mesg, &fsloc->locations_count);
if (err)
@@ -437,13 +442,18 @@ out_free_all:
static int secinfo_parse(char **mesg, char *buf, struct svc_export *exp)
{
- int listsize, err;
struct exp_flavor_info *f;
+ u32 listsize;
+ int err;
- err = get_int(mesg, &listsize);
+ /* more than one secinfo */
+ if (exp->ex_nflavors)
+ return -EINVAL;
+
+ err = get_uint(mesg, &listsize);
if (err)
return err;
- if (listsize < 0 || listsize > MAX_SECINFO_LIST)
+ if (listsize > MAX_SECINFO_LIST)
return -EINVAL;
for (f = exp->ex_flavors; f < exp->ex_flavors + listsize; f++) {
@@ -474,6 +484,27 @@ static inline int
secinfo_parse(char **mesg, char *buf, struct svc_export *exp) { return 0; }
#endif
+static inline int
+uuid_parse(char **mesg, char *buf, unsigned char **puuid)
+{
+ int len;
+
+ /* more than one uuid */
+ if (*puuid)
+ return -EINVAL;
+
+ /* expect a 16 byte uuid encoded as \xXXXX... */
+ len = qword_get(mesg, buf, PAGE_SIZE);
+ if (len != EX_UUID_LEN)
+ return -EINVAL;
+
+ *puuid = kmemdup(buf, EX_UUID_LEN, GFP_KERNEL);
+ if (*puuid == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
{
/* client path expiry [flags anonuid anongid fsid] */
@@ -536,16 +567,12 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
if (err)
goto out3;
exp.ex_anon_uid= make_kuid(&init_user_ns, an_int);
- if (!uid_valid(exp.ex_anon_uid))
- goto out3;
/* anon gid */
err = get_int(&mesg, &an_int);
if (err)
goto out3;
exp.ex_anon_gid= make_kgid(&init_user_ns, an_int);
- if (!gid_valid(exp.ex_anon_gid))
- goto out3;
/* fsid */
err = get_int(&mesg, &an_int);
@@ -556,18 +583,9 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
while ((len = qword_get(&mesg, buf, PAGE_SIZE)) > 0) {
if (strcmp(buf, "fsloc") == 0)
err = fsloc_parse(&mesg, buf, &exp.ex_fslocs);
- else if (strcmp(buf, "uuid") == 0) {
- /* expect a 16 byte uuid encoded as \xXXXX... */
- len = qword_get(&mesg, buf, PAGE_SIZE);
- if (len != 16)
- err = -EINVAL;
- else {
- exp.ex_uuid =
- kmemdup(buf, 16, GFP_KERNEL);
- if (exp.ex_uuid == NULL)
- err = -ENOMEM;
- }
- } else if (strcmp(buf, "secinfo") == 0)
+ else if (strcmp(buf, "uuid") == 0)
+ err = uuid_parse(&mesg, buf, &exp.ex_uuid);
+ else if (strcmp(buf, "secinfo") == 0)
err = secinfo_parse(&mesg, buf, &exp);
else
/* quietly ignore unknown words and anything
@@ -583,6 +601,26 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
exp.ex_uuid);
if (err)
goto out4;
+ /*
+ * No point caching this if it would immediately expire.
+ * Also, this protects exportfs's dummy export from the
+ * anon_uid/anon_gid checks:
+ */
+ if (exp.h.expiry_time < seconds_since_boot())
+ goto out4;
+ /*
+ * For some reason exportfs has been passing down an
+ * invalid (-1) uid & gid on the "dummy" export which it
+ * uses to test export support. To make sure exportfs
+ * sees errors from check_export we therefore need to
+ * delay these checks till after check_export:
+ */
+ err = -EINVAL;
+ if (!uid_valid(exp.ex_anon_uid))
+ goto out4;
+ if (!gid_valid(exp.ex_anon_gid))
+ goto out4;
+ err = 0;
}
expp = svc_export_lookup(&exp);
@@ -633,7 +671,7 @@ static int svc_export_show(struct seq_file *m,
if (exp->ex_uuid) {
int i;
seq_puts(m, ",uuid=");
- for (i=0; i<16; i++) {
+ for (i = 0; i < EX_UUID_LEN; i++) {
if ((i&3) == 0 && i)
seq_putc(m, ':');
seq_printf(m, "%02x", exp->ex_uuid[i]);
@@ -755,7 +793,7 @@ svc_export_update(struct svc_export *new, struct svc_export *old)
static struct svc_expkey *
-exp_find_key(struct cache_detail *cd, svc_client *clp, int fsid_type,
+exp_find_key(struct cache_detail *cd, struct auth_domain *clp, int fsid_type,
u32 *fsidv, struct cache_req *reqp)
{
struct svc_expkey key, *ek;
@@ -777,9 +815,9 @@ exp_find_key(struct cache_detail *cd, svc_client *clp, int fsid_type,
return ek;
}
-
-static svc_export *exp_get_by_name(struct cache_detail *cd, svc_client *clp,
- const struct path *path, struct cache_req *reqp)
+static struct svc_export *
+exp_get_by_name(struct cache_detail *cd, struct auth_domain *clp,
+ const struct path *path, struct cache_req *reqp)
{
struct svc_export *exp, key;
int err;
@@ -803,11 +841,11 @@ static svc_export *exp_get_by_name(struct cache_detail *cd, svc_client *clp,
/*
* Find the export entry for a given dentry.
*/
-static struct svc_export *exp_parent(struct cache_detail *cd, svc_client *clp,
- struct path *path)
+static struct svc_export *
+exp_parent(struct cache_detail *cd, struct auth_domain *clp, struct path *path)
{
struct dentry *saved = dget(path->dentry);
- svc_export *exp = exp_get_by_name(cd, clp, path, NULL);
+ struct svc_export *exp = exp_get_by_name(cd, clp, path, NULL);
while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) {
struct dentry *parent = dget_parent(path->dentry);
@@ -828,7 +866,7 @@ static struct svc_export *exp_parent(struct cache_detail *cd, svc_client *clp,
* since its harder to fool a kernel module than a user space program.
*/
int
-exp_rootfh(struct net *net, svc_client *clp, char *name,
+exp_rootfh(struct net *net, struct auth_domain *clp, char *name,
struct knfsd_fh *f, int maxsize)
{
struct svc_export *exp;
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
new file mode 100644
index 00000000000..cfeea85c5be
--- /dev/null
+++ b/fs/nfsd/export.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
+ */
+#ifndef NFSD_EXPORT_H
+#define NFSD_EXPORT_H
+
+#include <linux/sunrpc/cache.h>
+#include <uapi/linux/nfsd/export.h>
+
+struct knfsd_fh;
+struct svc_fh;
+struct svc_rqst;
+
+/*
+ * FS Locations
+ */
+
+#define MAX_FS_LOCATIONS 128
+
+struct nfsd4_fs_location {
+ char *hosts; /* colon separated list of hosts */
+ char *path; /* slash separated list of path components */
+};
+
+struct nfsd4_fs_locations {
+ uint32_t locations_count;
+ struct nfsd4_fs_location *locations;
+/* If we're not actually serving this data ourselves (only providing a
+ * list of replicas that do serve it) then we set "migrated": */
+ int migrated;
+};
+
+/*
+ * We keep an array of pseudoflavors with the export, in order from most
+ * to least preferred. For the foreseeable future, we don't expect more
+ * than the eight pseudoflavors null, unix, krb5, krb5i, krb5p, skpm3,
+ * spkm3i, and spkm3p (and using all 8 at once should be rare).
+ */
+#define MAX_SECINFO_LIST 8
+#define EX_UUID_LEN 16
+
+struct exp_flavor_info {
+ u32 pseudoflavor;
+ u32 flags;
+};
+
+struct svc_export {
+ struct cache_head h;
+ struct auth_domain * ex_client;
+ int ex_flags;
+ struct path ex_path;
+ kuid_t ex_anon_uid;
+ kgid_t ex_anon_gid;
+ int ex_fsid;
+ unsigned char * ex_uuid; /* 16 byte fsid */
+ struct nfsd4_fs_locations ex_fslocs;
+ uint32_t ex_nflavors;
+ struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST];
+ struct cache_detail *cd;
+};
+
+/* an "export key" (expkey) maps a filehandlefragement to an
+ * svc_export for a given client. There can be several per export,
+ * for the different fsid types.
+ */
+struct svc_expkey {
+ struct cache_head h;
+
+ struct auth_domain * ek_client;
+ int ek_fsidtype;
+ u32 ek_fsid[6];
+
+ struct path ek_path;
+};
+
+#define EX_ISSYNC(exp) (!((exp)->ex_flags & NFSEXP_ASYNC))
+#define EX_NOHIDE(exp) ((exp)->ex_flags & NFSEXP_NOHIDE)
+#define EX_WGATHER(exp) ((exp)->ex_flags & NFSEXP_GATHERED_WRITES)
+
+int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp);
+__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp);
+
+/*
+ * Function declarations
+ */
+int nfsd_export_init(struct net *);
+void nfsd_export_shutdown(struct net *);
+void nfsd_export_flush(struct net *);
+struct svc_export * rqst_exp_get_by_name(struct svc_rqst *,
+ struct path *);
+struct svc_export * rqst_exp_parent(struct svc_rqst *,
+ struct path *);
+struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *);
+int exp_rootfh(struct net *, struct auth_domain *,
+ char *path, struct knfsd_fh *, int maxsize);
+__be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *);
+__be32 nfserrno(int errno);
+
+static inline void exp_put(struct svc_export *exp)
+{
+ cache_put(&exp->h, exp->cd);
+}
+
+static inline void exp_get(struct svc_export *exp)
+{
+ cache_get(&exp->h);
+}
+struct svc_export * rqst_exp_find(struct svc_rqst *, int, u32 *);
+
+#endif /* NFSD_EXPORT_H */
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
index d620e7f8142..2ed05c3cd43 100644
--- a/fs/nfsd/fault_inject.c
+++ b/fs/nfsd/fault_inject.c
@@ -97,25 +97,14 @@ static ssize_t fault_inject_read(struct file *file, char __user *buf,
{
static u64 val;
char read_buf[25];
- size_t size, ret;
+ size_t size;
loff_t pos = *ppos;
if (!pos)
nfsd_inject_get(file_inode(file)->i_private, &val);
size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val);
- if (pos < 0)
- return -EINVAL;
- if (pos >= size || !len)
- return 0;
- if (len > size - pos)
- len = size - pos;
- ret = copy_to_user(buf, read_buf + pos, len);
- if (ret == len)
- return -EFAULT;
- len -= ret;
- *ppos = pos + len;
- return len;
+ return simple_read_from_buffer(buf, len, ppos, read_buf, size);
}
static ssize_t fault_inject_write(struct file *file, const char __user *buf,
diff --git a/fs/nfsd/idmap.h b/fs/nfsd/idmap.h
index bf95f6b817a..a3f34900091 100644
--- a/fs/nfsd/idmap.h
+++ b/fs/nfsd/idmap.h
@@ -56,7 +56,7 @@ static inline void nfsd_idmap_shutdown(struct net *net)
__be32 nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, kuid_t *);
__be32 nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, kgid_t *);
-int nfsd_map_uid_to_name(struct svc_rqst *, kuid_t, char *);
-int nfsd_map_gid_to_name(struct svc_rqst *, kgid_t, char *);
+__be32 nfsd4_encode_user(struct xdr_stream *, struct svc_rqst *, kuid_t);
+__be32 nfsd4_encode_group(struct xdr_stream *, struct svc_rqst *, kgid_t);
#endif /* LINUX_NFSD_IDMAP_H */
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 849a7c3ced2..d32b3aa6600 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -95,6 +95,7 @@ struct nfsd_net {
time_t nfsd4_grace;
bool nfsd_net_up;
+ bool lockd_up;
/*
* Time of server startup
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 95d76dc6c5d..12b023a7ab7 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -30,8 +30,9 @@ nfsacld_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp)
{
- svc_fh *fh;
struct posix_acl *acl;
+ struct inode *inode;
+ svc_fh *fh;
__be32 nfserr = 0;
dprintk("nfsd: GETACL(2acl) %s\n", SVCFH_fmt(&argp->fh));
@@ -41,6 +42,8 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
if (nfserr)
RETURN_STATUS(nfserr);
+ inode = fh->fh_dentry->d_inode;
+
if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
RETURN_STATUS(nfserr_inval);
resp->mask = argp->mask;
@@ -50,21 +53,13 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
goto fail;
if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
- acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS);
+ acl = get_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR(acl)) {
- int err = PTR_ERR(acl);
-
- if (err == -ENODATA || err == -EOPNOTSUPP)
- acl = NULL;
- else {
- nfserr = nfserrno(err);
- goto fail;
- }
+ nfserr = nfserrno(PTR_ERR(acl));
+ goto fail;
}
if (acl == NULL) {
/* Solaris returns the inode's minimum ACL. */
-
- struct inode *inode = fh->fh_dentry->d_inode;
acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
}
resp->acl_access = acl;
@@ -72,17 +67,10 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
/* Check how Solaris handles requests for the Default ACL
of a non-directory! */
-
- acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT);
+ acl = get_acl(inode, ACL_TYPE_DEFAULT);
if (IS_ERR(acl)) {
- int err = PTR_ERR(acl);
-
- if (err == -ENODATA || err == -EOPNOTSUPP)
- acl = NULL;
- else {
- nfserr = nfserrno(err);
- goto fail;
- }
+ nfserr = nfserrno(PTR_ERR(acl));
+ goto fail;
}
resp->acl_default = acl;
}
@@ -103,31 +91,51 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp,
struct nfsd3_setaclargs *argp,
struct nfsd_attrstat *resp)
{
+ struct inode *inode;
svc_fh *fh;
__be32 nfserr = 0;
+ int error;
dprintk("nfsd: SETACL(2acl) %s\n", SVCFH_fmt(&argp->fh));
fh = fh_copy(&resp->fh, &argp->fh);
nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR);
+ if (nfserr)
+ goto out;
- if (!nfserr) {
- nfserr = nfserrno( nfsd_set_posix_acl(
- fh, ACL_TYPE_ACCESS, argp->acl_access) );
- }
- if (!nfserr) {
- nfserr = nfserrno( nfsd_set_posix_acl(
- fh, ACL_TYPE_DEFAULT, argp->acl_default) );
- }
- if (!nfserr) {
- nfserr = fh_getattr(fh, &resp->stat);
+ inode = fh->fh_dentry->d_inode;
+ if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) {
+ error = -EOPNOTSUPP;
+ goto out_errno;
}
+ error = fh_want_write(fh);
+ if (error)
+ goto out_errno;
+
+ error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS);
+ if (error)
+ goto out_drop_write;
+ error = inode->i_op->set_acl(inode, argp->acl_default,
+ ACL_TYPE_DEFAULT);
+ if (error)
+ goto out_drop_write;
+
+ fh_drop_write(fh);
+
+ nfserr = fh_getattr(fh, &resp->stat);
+
+out:
/* argp->acl_{access,default} may have been allocated in
nfssvc_decode_setaclargs. */
posix_acl_release(argp->acl_access);
posix_acl_release(argp->acl_default);
return nfserr;
+out_drop_write:
+ fh_drop_write(fh);
+out_errno:
+ nfserr = nfserrno(error);
+ goto out;
}
/*
@@ -174,7 +182,8 @@ static __be32 nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessarg
static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_getaclargs *argp)
{
- if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
+ p = nfs2svc_decode_fh(p, &argp->fh);
+ if (!p)
return 0;
argp->mask = ntohl(*p); p++;
@@ -189,7 +198,8 @@ static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p,
unsigned int base;
int n;
- if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
+ p = nfs2svc_decode_fh(p, &argp->fh);
+ if (!p)
return 0;
argp->mask = ntohl(*p++);
if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) ||
@@ -210,7 +220,8 @@ static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p,
static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_fhandle *argp)
{
- if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
+ p = nfs2svc_decode_fh(p, &argp->fh);
+ if (!p)
return 0;
return xdr_argsize_check(rqstp, p);
}
@@ -218,7 +229,8 @@ static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p,
static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_accessargs *argp)
{
- if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
+ p = nfs2svc_decode_fh(p, &argp->fh);
+ if (!p)
return 0;
argp->access = ntohl(*p++);
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 9cbc1a841f8..2a514e21dc7 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -29,8 +29,9 @@ nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp)
{
- svc_fh *fh;
struct posix_acl *acl;
+ struct inode *inode;
+ svc_fh *fh;
__be32 nfserr = 0;
fh = fh_copy(&resp->fh, &argp->fh);
@@ -38,26 +39,20 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
if (nfserr)
RETURN_STATUS(nfserr);
+ inode = fh->fh_dentry->d_inode;
+
if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
RETURN_STATUS(nfserr_inval);
resp->mask = argp->mask;
if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
- acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS);
+ acl = get_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR(acl)) {
- int err = PTR_ERR(acl);
-
- if (err == -ENODATA || err == -EOPNOTSUPP)
- acl = NULL;
- else {
- nfserr = nfserrno(err);
- goto fail;
- }
+ nfserr = nfserrno(PTR_ERR(acl));
+ goto fail;
}
if (acl == NULL) {
/* Solaris returns the inode's minimum ACL. */
-
- struct inode *inode = fh->fh_dentry->d_inode;
acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
}
resp->acl_access = acl;
@@ -65,17 +60,10 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
/* Check how Solaris handles requests for the Default ACL
of a non-directory! */
-
- acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT);
+ acl = get_acl(inode, ACL_TYPE_DEFAULT);
if (IS_ERR(acl)) {
- int err = PTR_ERR(acl);
-
- if (err == -ENODATA || err == -EOPNOTSUPP)
- acl = NULL;
- else {
- nfserr = nfserrno(err);
- goto fail;
- }
+ nfserr = nfserrno(PTR_ERR(acl));
+ goto fail;
}
resp->acl_default = acl;
}
@@ -96,21 +84,37 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp,
struct nfsd3_setaclargs *argp,
struct nfsd3_attrstat *resp)
{
+ struct inode *inode;
svc_fh *fh;
__be32 nfserr = 0;
+ int error;
fh = fh_copy(&resp->fh, &argp->fh);
nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_SATTR);
+ if (nfserr)
+ goto out;
- if (!nfserr) {
- nfserr = nfserrno( nfsd_set_posix_acl(
- fh, ACL_TYPE_ACCESS, argp->acl_access) );
- }
- if (!nfserr) {
- nfserr = nfserrno( nfsd_set_posix_acl(
- fh, ACL_TYPE_DEFAULT, argp->acl_default) );
+ inode = fh->fh_dentry->d_inode;
+ if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) {
+ error = -EOPNOTSUPP;
+ goto out_errno;
}
+ error = fh_want_write(fh);
+ if (error)
+ goto out_errno;
+
+ error = inode->i_op->set_acl(inode, argp->acl_access, ACL_TYPE_ACCESS);
+ if (error)
+ goto out_drop_write;
+ error = inode->i_op->set_acl(inode, argp->acl_default,
+ ACL_TYPE_DEFAULT);
+
+out_drop_write:
+ fh_drop_write(fh);
+out_errno:
+ nfserr = nfserrno(error);
+out:
/* argp->acl_{access,default} may have been allocated in
nfs3svc_decode_setaclargs. */
posix_acl_release(argp->acl_access);
@@ -124,7 +128,8 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp,
static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_getaclargs *args)
{
- if (!(p = nfs3svc_decode_fh(p, &args->fh)))
+ p = nfs3svc_decode_fh(p, &args->fh);
+ if (!p)
return 0;
args->mask = ntohl(*p); p++;
@@ -139,7 +144,8 @@ static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p,
unsigned int base;
int n;
- if (!(p = nfs3svc_decode_fh(p, &args->fh)))
+ p = nfs3svc_decode_fh(p, &args->fh);
+ if (!p)
return 0;
args->mask = ntohl(*p++);
if (args->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) ||
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 14d9ecb96cf..e6c01e80325 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -168,7 +168,7 @@ encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
struct kstat *stat)
{
*p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]);
- *p++ = htonl((u32) stat->mode);
+ *p++ = htonl((u32) (stat->mode & S_IALLUGO));
*p++ = htonl((u32) stat->nlink);
*p++ = htonl((u32) from_kuid(&init_user_ns, stat->uid));
*p++ = htonl((u32) from_kgid(&init_user_ns, stat->gid));
@@ -278,7 +278,8 @@ void fill_post_wcc(struct svc_fh *fhp)
int
nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args)
{
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
return xdr_argsize_check(rqstp, p);
}
@@ -287,7 +288,8 @@ int
nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_sattrargs *args)
{
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
p = decode_sattr3(p, &args->attrs);
@@ -315,7 +317,8 @@ int
nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_accessargs *args)
{
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
args->access = ntohl(*p++);
@@ -330,7 +333,8 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
int v;
u32 max_blocksize = svc_max_payload(rqstp);
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
p = xdr_decode_hyper(p, &args->offset);
@@ -360,7 +364,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
unsigned int len, v, hdr, dlen;
u32 max_blocksize = svc_max_payload(rqstp);
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
p = xdr_decode_hyper(p, &args->offset);
@@ -535,7 +540,8 @@ int
nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_readlinkargs *args)
{
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
args->buffer = page_address(*(rqstp->rq_next_page++));
@@ -558,7 +564,8 @@ int
nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_readdirargs *args)
{
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
p = xdr_decode_hyper(p, &args->cookie);
args->verf = p; p += 2;
@@ -580,7 +587,8 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
int len;
u32 max_blocksize = svc_max_payload(rqstp);
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
p = xdr_decode_hyper(p, &args->cookie);
args->verf = p; p += 2;
@@ -605,7 +613,8 @@ int
nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd3_commitargs *args)
{
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
p = xdr_decode_hyper(p, &args->offset);
args->count = ntohl(*p++);
@@ -842,21 +851,21 @@ out:
static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
{
- struct svc_fh fh;
+ struct svc_fh *fh = &cd->scratch;
__be32 err;
- fh_init(&fh, NFS3_FHSIZE);
- err = compose_entry_fh(cd, &fh, name, namlen);
+ fh_init(fh, NFS3_FHSIZE);
+ err = compose_entry_fh(cd, fh, name, namlen);
if (err) {
*p++ = 0;
*p++ = 0;
goto out;
}
- p = encode_post_op_attr(cd->rqstp, p, &fh);
+ p = encode_post_op_attr(cd->rqstp, p, fh);
*p++ = xdr_one; /* yes, a file handle follows */
- p = encode_fh(p, &fh);
+ p = encode_fh(p, fh);
out:
- fh_put(&fh);
+ fh_put(fh);
return p;
}
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 8a50b3c1809..d714156a19f 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -36,9 +36,14 @@
#include <linux/slab.h>
#include <linux/nfs_fs.h>
-#include <linux/export.h>
+#include "nfsfh.h"
+#include "nfsd.h"
#include "acl.h"
+#include "vfs.h"
+#define NFS4_ACL_TYPE_DEFAULT 0x01
+#define NFS4_ACL_DIR 0x02
+#define NFS4_ACL_OWNER 0x04
/* mode bit translations: */
#define NFS4_READ_MODE (NFS4_ACE_READ_DATA)
@@ -130,36 +135,47 @@ static short ace2type(struct nfs4_ace *);
static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *,
unsigned int);
-struct nfs4_acl *
-nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl,
- unsigned int flags)
+int
+nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
+ struct nfs4_acl **acl)
{
- struct nfs4_acl *acl;
+ struct inode *inode = dentry->d_inode;
+ int error = 0;
+ struct posix_acl *pacl = NULL, *dpacl = NULL;
+ unsigned int flags = 0;
int size = 0;
- if (pacl) {
- if (posix_acl_valid(pacl) < 0)
- return ERR_PTR(-EINVAL);
- size += 2*pacl->a_count;
+ pacl = get_acl(inode, ACL_TYPE_ACCESS);
+ if (!pacl) {
+ pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
+ if (IS_ERR(pacl))
+ return PTR_ERR(pacl);
}
- if (dpacl) {
- if (posix_acl_valid(dpacl) < 0)
- return ERR_PTR(-EINVAL);
- size += 2*dpacl->a_count;
+ /* allocate for worst case: one (deny, allow) pair each: */
+ size += 2 * pacl->a_count;
+
+ if (S_ISDIR(inode->i_mode)) {
+ flags = NFS4_ACL_DIR;
+ dpacl = get_acl(inode, ACL_TYPE_DEFAULT);
+ if (dpacl)
+ size += 2 * dpacl->a_count;
}
- /* Allocate for worst case: one (deny, allow) pair each: */
- acl = nfs4_acl_new(size);
- if (acl == NULL)
- return ERR_PTR(-ENOMEM);
+ *acl = nfs4_acl_new(size);
+ if (*acl == NULL) {
+ error = -ENOMEM;
+ goto out;
+ }
- if (pacl)
- _posix_to_nfsv4_one(pacl, acl, flags & ~NFS4_ACL_TYPE_DEFAULT);
+ _posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT);
if (dpacl)
- _posix_to_nfsv4_one(dpacl, acl, flags | NFS4_ACL_TYPE_DEFAULT);
+ _posix_to_nfsv4_one(dpacl, *acl, flags | NFS4_ACL_TYPE_DEFAULT);
- return acl;
+ out:
+ posix_acl_release(pacl);
+ posix_acl_release(dpacl);
+ return error;
}
struct posix_acl_summary {
@@ -385,8 +401,10 @@ sort_pacl(struct posix_acl *pacl)
* by uid/gid. */
int i, j;
- if (pacl->a_count <= 4)
- return; /* no users or groups */
+ /* no users or groups */
+ if (!pacl || pacl->a_count <= 4)
+ return;
+
i = 1;
while (pacl->a_entries[i].e_tag == ACL_USER)
i++;
@@ -513,19 +531,21 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
/*
* ACLs with no ACEs are treated differently in the inheritable
- * and effective cases: when there are no inheritable ACEs, we
- * set a zero-length default posix acl:
+ * and effective cases: when there are no inheritable ACEs,
+ * calls ->set_acl with a NULL ACL structure.
*/
- if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) {
- pacl = posix_acl_alloc(0, GFP_KERNEL);
- return pacl ? pacl : ERR_PTR(-ENOMEM);
- }
+ if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT))
+ return NULL;
+
/*
* When there are no effective ACEs, the following will end
* up setting a 3-element effective posix ACL with all
* permissions zero.
*/
- nace = 4 + state->users->n + state->groups->n;
+ if (!state->users->n && !state->groups->n)
+ nace = 3;
+ else /* Note we also include a MASK ACE in this case: */
+ nace = 4 + state->users->n + state->groups->n;
pacl = posix_acl_alloc(nace, GFP_KERNEL);
if (!pacl)
return ERR_PTR(-ENOMEM);
@@ -569,9 +589,11 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
add_to_mask(state, &state->groups->aces[i].perms);
}
- pace++;
- pace->e_tag = ACL_MASK;
- low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags);
+ if (state->users->n || state->groups->n) {
+ pace++;
+ pace->e_tag = ACL_MASK;
+ low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags);
+ }
pace++;
pace->e_tag = ACL_OTHER;
@@ -719,8 +741,9 @@ static void process_one_v4_ace(struct posix_acl_state *state,
}
}
-int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl, struct posix_acl **pacl,
- struct posix_acl **dpacl, unsigned int flags)
+static int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl,
+ struct posix_acl **pacl, struct posix_acl **dpacl,
+ unsigned int flags)
{
struct posix_acl_state effective_acl_state, default_acl_state;
struct nfs4_ace *ace;
@@ -780,6 +803,57 @@ out_estate:
return ret;
}
+__be32
+nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfs4_acl *acl)
+{
+ __be32 error;
+ int host_error;
+ struct dentry *dentry;
+ struct inode *inode;
+ struct posix_acl *pacl = NULL, *dpacl = NULL;
+ unsigned int flags = 0;
+
+ /* Get inode */
+ error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR);
+ if (error)
+ return error;
+
+ dentry = fhp->fh_dentry;
+ inode = dentry->d_inode;
+
+ if (!inode->i_op->set_acl || !IS_POSIXACL(inode))
+ return nfserr_attrnotsupp;
+
+ if (S_ISDIR(inode->i_mode))
+ flags = NFS4_ACL_DIR;
+
+ host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
+ if (host_error == -EINVAL)
+ return nfserr_attrnotsupp;
+ if (host_error < 0)
+ goto out_nfserr;
+
+ host_error = inode->i_op->set_acl(inode, pacl, ACL_TYPE_ACCESS);
+ if (host_error < 0)
+ goto out_release;
+
+ if (S_ISDIR(inode->i_mode)) {
+ host_error = inode->i_op->set_acl(inode, dpacl,
+ ACL_TYPE_DEFAULT);
+ }
+
+out_release:
+ posix_acl_release(pacl);
+ posix_acl_release(dpacl);
+out_nfserr:
+ if (host_error == -EOPNOTSUPP)
+ return nfserr_attrnotsupp;
+ else
+ return nfserrno(host_error);
+}
+
+
static short
ace2type(struct nfs4_ace *ace)
{
@@ -798,9 +872,6 @@ ace2type(struct nfs4_ace *ace)
return -1;
}
-EXPORT_SYMBOL(nfs4_acl_posix_to_nfsv4);
-EXPORT_SYMBOL(nfs4_acl_nfsv4_to_posix);
-
struct nfs4_acl *
nfs4_acl_new(int n)
{
@@ -848,21 +919,21 @@ nfs4_acl_get_whotype(char *p, u32 len)
return NFS4_ACL_WHO_NAMED;
}
-int
-nfs4_acl_write_who(int who, char *p)
+__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who)
{
+ __be32 *p;
int i;
for (i = 0; i < ARRAY_SIZE(s2t_map); i++) {
- if (s2t_map[i].type == who) {
- memcpy(p, s2t_map[i].string, s2t_map[i].stringlen);
- return s2t_map[i].stringlen;
- }
+ if (s2t_map[i].type != who)
+ continue;
+ p = xdr_reserve_space(xdr, s2t_map[i].stringlen + 4);
+ if (!p)
+ return nfserr_resource;
+ p = xdr_encode_opaque(p, s2t_map[i].string,
+ s2t_map[i].stringlen);
+ return 0;
}
- BUG();
+ WARN_ON_ONCE(1);
return -1;
}
-
-EXPORT_SYMBOL(nfs4_acl_new);
-EXPORT_SYMBOL(nfs4_acl_get_whotype);
-EXPORT_SYMBOL(nfs4_acl_write_who);
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7f05cd140de..2c73cae9899 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -32,6 +32,7 @@
*/
#include <linux/sunrpc/clnt.h>
+#include <linux/sunrpc/xprt.h>
#include <linux/sunrpc/svc_xprt.h>
#include <linux/slab.h>
#include "nfsd.h"
@@ -635,11 +636,29 @@ static struct rpc_cred *get_backchannel_cred(struct nfs4_client *clp, struct rpc
}
}
+static struct rpc_clnt *create_backchannel_client(struct rpc_create_args *args)
+{
+ struct rpc_xprt *xprt;
+
+ if (args->protocol != XPRT_TRANSPORT_BC_TCP)
+ return rpc_create(args);
+
+ xprt = args->bc_xprt->xpt_bc_xprt;
+ if (xprt) {
+ xprt_get(xprt);
+ return rpc_create_xprt(args, xprt);
+ }
+
+ return rpc_create(args);
+}
+
static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *conn, struct nfsd4_session *ses)
{
+ int maxtime = max_cb_time(clp->net);
struct rpc_timeout timeparms = {
- .to_initval = max_cb_time(clp->net),
+ .to_initval = maxtime,
.to_retries = 0,
+ .to_maxval = maxtime,
};
struct rpc_create_args args = {
.net = clp->net,
@@ -674,7 +693,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
args.authflavor = ses->se_cb_sec.flavor;
}
/* Create RPC client */
- client = rpc_create(&args);
+ client = create_backchannel_client(&args);
if (IS_ERR(client)) {
dprintk("NFSD: couldn't create callback client: %ld\n",
PTR_ERR(client));
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 4832fd819f8..a0ab0a847d6 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -551,27 +551,45 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
return 0;
}
-static int
-idmap_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name)
+static __be32 encode_ascii_id(struct xdr_stream *xdr, u32 id)
+{
+ char buf[11];
+ int len;
+ __be32 *p;
+
+ len = sprintf(buf, "%u", id);
+ p = xdr_reserve_space(xdr, len + 4);
+ if (!p)
+ return nfserr_resource;
+ p = xdr_encode_opaque(p, buf, len);
+ return 0;
+}
+
+static __be32 idmap_id_to_name(struct xdr_stream *xdr,
+ struct svc_rqst *rqstp, int type, u32 id)
{
struct ent *item, key = {
.id = id,
.type = type,
};
+ __be32 *p;
int ret;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item);
if (ret == -ENOENT)
- return sprintf(name, "%u", id);
+ return encode_ascii_id(xdr, id);
if (ret)
- return ret;
+ return nfserrno(ret);
ret = strlen(item->name);
- BUG_ON(ret > IDMAP_NAMESZ);
- memcpy(name, item->name, ret);
+ WARN_ON_ONCE(ret > IDMAP_NAMESZ);
+ p = xdr_reserve_space(xdr, ret + 4);
+ if (!p)
+ return nfserr_resource;
+ p = xdr_encode_opaque(p, item->name, ret);
cache_put(&item->h, nn->idtoname_cache);
- return ret;
+ return 0;
}
static bool
@@ -603,12 +621,12 @@ do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u
return idmap_name_to_id(rqstp, type, name, namelen, id);
}
-static int
-do_id_to_name(struct svc_rqst *rqstp, int type, u32 id, char *name)
+static __be32 encode_name_from_id(struct xdr_stream *xdr,
+ struct svc_rqst *rqstp, int type, u32 id)
{
if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS)
- return sprintf(name, "%u", id);
- return idmap_id_to_name(rqstp, type, id, name);
+ return encode_ascii_id(xdr, id);
+ return idmap_id_to_name(xdr, rqstp, type, id);
}
__be32
@@ -637,16 +655,16 @@ nfsd_map_name_to_gid(struct svc_rqst *rqstp, const char *name, size_t namelen,
return status;
}
-int
-nfsd_map_uid_to_name(struct svc_rqst *rqstp, kuid_t uid, char *name)
+__be32 nfsd4_encode_user(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+ kuid_t uid)
{
u32 id = from_kuid(&init_user_ns, uid);
- return do_id_to_name(rqstp, IDMAP_TYPE_USER, id, name);
+ return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_USER, id);
}
-int
-nfsd_map_gid_to_name(struct svc_rqst *rqstp, kgid_t gid, char *name)
+__be32 nfsd4_encode_group(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+ kgid_t gid)
{
u32 id = from_kgid(&init_user_ns, gid);
- return do_id_to_name(rqstp, IDMAP_TYPE_GROUP, id, name);
+ return encode_name_from_id(xdr, rqstp, IDMAP_TYPE_GROUP, id);
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 419572f33b7..8f029db5d27 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -41,6 +41,7 @@
#include "vfs.h"
#include "current_stateid.h"
#include "netns.h"
+#include "acl.h"
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
#include <linux/security.h>
@@ -230,17 +231,16 @@ static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate
}
static __be32
-do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
+do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh)
{
struct svc_fh *current_fh = &cstate->current_fh;
- struct svc_fh *resfh;
int accmode;
__be32 status;
- resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
- if (!resfh)
+ *resfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
+ if (!*resfh)
return nfserr_jukebox;
- fh_init(resfh, NFS4_FHSIZE);
+ fh_init(*resfh, NFS4_FHSIZE);
open->op_truncate = 0;
if (open->op_create) {
@@ -265,12 +265,12 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
*/
status = do_nfsd_create(rqstp, current_fh, open->op_fname.data,
open->op_fname.len, &open->op_iattr,
- resfh, open->op_createmode,
+ *resfh, open->op_createmode,
(u32 *)open->op_verf.data,
&open->op_truncate, &open->op_created);
if (!status && open->op_label.len)
- nfsd4_security_inode_setsecctx(resfh, &open->op_label, open->op_bmval);
+ nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval);
/*
* Following rfc 3530 14.2.16, use the returned bitmask
@@ -280,31 +280,32 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0)
open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS |
FATTR4_WORD1_TIME_MODIFY);
- } else {
+ } else
+ /*
+ * Note this may exit with the parent still locked.
+ * We will hold the lock until nfsd4_open's final
+ * lookup, to prevent renames or unlinks until we've had
+ * a chance to an acquire a delegation if appropriate.
+ */
status = nfsd_lookup(rqstp, current_fh,
- open->op_fname.data, open->op_fname.len, resfh);
- fh_unlock(current_fh);
- }
+ open->op_fname.data, open->op_fname.len, *resfh);
if (status)
goto out;
- status = nfsd_check_obj_isreg(resfh);
+ status = nfsd_check_obj_isreg(*resfh);
if (status)
goto out;
if (is_create_with_attrs(open) && open->op_acl != NULL)
- do_set_nfs4_acl(rqstp, resfh, open->op_acl, open->op_bmval);
+ do_set_nfs4_acl(rqstp, *resfh, open->op_acl, open->op_bmval);
- nfsd4_set_open_owner_reply_cache(cstate, open, resfh);
+ nfsd4_set_open_owner_reply_cache(cstate, open, *resfh);
accmode = NFSD_MAY_NOP;
if (open->op_created ||
open->op_claim_type == NFS4_OPEN_CLAIM_DELEGATE_CUR)
accmode |= NFSD_MAY_OWNER_OVERRIDE;
- status = do_open_permission(rqstp, resfh, open, accmode);
+ status = do_open_permission(rqstp, *resfh, open, accmode);
set_change_info(&open->op_cinfo, current_fh);
- fh_dup2(current_fh, resfh);
out:
- fh_put(resfh);
- kfree(resfh);
return status;
}
@@ -357,6 +358,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_open *open)
{
__be32 status;
+ struct svc_fh *resfh = NULL;
struct nfsd4_compoundres *resp;
struct net *net = SVC_NET(rqstp);
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -423,26 +425,26 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
switch (open->op_claim_type) {
case NFS4_OPEN_CLAIM_DELEGATE_CUR:
case NFS4_OPEN_CLAIM_NULL:
- status = do_open_lookup(rqstp, cstate, open);
+ status = do_open_lookup(rqstp, cstate, open, &resfh);
if (status)
goto out;
break;
case NFS4_OPEN_CLAIM_PREVIOUS:
- open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
status = nfs4_check_open_reclaim(&open->op_clientid,
cstate->minorversion,
nn);
if (status)
goto out;
+ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
case NFS4_OPEN_CLAIM_FH:
case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
status = do_open_fhandle(rqstp, cstate, open);
if (status)
goto out;
+ resfh = &cstate->current_fh;
break;
case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
case NFS4_OPEN_CLAIM_DELEGATE_PREV:
- open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
dprintk("NFSD: unsupported OPEN claim type %d\n",
open->op_claim_type);
status = nfserr_notsupp;
@@ -458,9 +460,14 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* successful, it (1) truncates the file if open->op_truncate was
* set, (2) sets open->op_stateid, (3) sets open->op_delegation.
*/
- status = nfsd4_process_open2(rqstp, &cstate->current_fh, open);
+ status = nfsd4_process_open2(rqstp, resfh, open);
WARN_ON(status && open->op_created);
out:
+ if (resfh && resfh != &cstate->current_fh) {
+ fh_dup2(&cstate->current_fh, resfh);
+ fh_put(resfh);
+ kfree(resfh);
+ }
nfsd4_cleanup_open_state(open, status);
if (open->op_openowner && !nfsd4_has_session(cstate))
cstate->replay_owner = &open->op_openowner->oo_owner;
@@ -610,15 +617,6 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
switch (create->cr_type) {
case NF4LNK:
- /* ugh! we have to null-terminate the linktext, or
- * vfs_symlink() will choke. it is always safe to
- * null-terminate by brute force, since at worst we
- * will overwrite the first byte of the create namelen
- * in the XDR buffer, which has already been extracted
- * during XDR decode.
- */
- create->cr_linkname[create->cr_linklen] = 0;
-
status = nfsd_symlink(rqstp, &cstate->current_fh,
create->cr_name, create->cr_namelen,
create->cr_linkname, create->cr_linklen,
@@ -778,7 +776,6 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (!nfsd4_last_compound_op(rqstp))
rqstp->rq_splice_ok = false;
- nfs4_lock_state();
/* check stateid */
if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp),
cstate, &read->rd_stateid,
@@ -786,11 +783,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
goto out;
}
- if (read->rd_filp)
- get_file(read->rd_filp);
status = nfs_ok;
out:
- nfs4_unlock_state();
read->rd_rqstp = rqstp;
read->rd_fhp = &cstate->current_fh;
return status;
@@ -929,10 +923,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
int err;
if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
- nfs4_lock_state();
status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
&setattr->sa_stateid, WR_STATE, NULL);
- nfs4_unlock_state();
if (status) {
dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
return status;
@@ -998,17 +990,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (write->wr_offset >= OFFSET_MAX)
return nfserr_inval;
- nfs4_lock_state();
status = nfs4_preprocess_stateid_op(SVC_NET(rqstp),
cstate, stateid, WR_STATE, &filp);
if (status) {
- nfs4_unlock_state();
dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
return status;
}
- if (filp)
- get_file(filp);
- nfs4_unlock_state();
cnt = write->wr_buflen;
write->wr_how_written = write->wr_stable_how;
@@ -1064,13 +1051,15 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return nfserr_jukebox;
p = buf;
- status = nfsd4_encode_fattr(&cstate->current_fh,
+ status = nfsd4_encode_fattr_to_buf(&p, count, &cstate->current_fh,
cstate->current_fh.fh_export,
- cstate->current_fh.fh_dentry, &p,
- count, verify->ve_bmval,
+ cstate->current_fh.fh_dentry,
+ verify->ve_bmval,
rqstp, 0);
-
- /* this means that nfsd4_encode_fattr() ran out of space */
+ /*
+ * If nfsd4_encode_fattr() ran out of space, assume that's because
+ * the attributes are longer (hence different) than those given:
+ */
if (status == nfserr_resource)
status = nfserr_not_same;
if (status)
@@ -1172,9 +1161,7 @@ struct nfsd4_operation {
static struct nfsd4_operation nfsd4_ops[];
-#ifdef NFSD_DEBUG
static const char *nfsd4_op_name(unsigned opnum);
-#endif
/*
* Enforce NFSv4.1 COMPOUND ordering rules:
@@ -1216,6 +1203,8 @@ static inline struct nfsd4_operation *OPDESC(struct nfsd4_op *op)
bool nfsd4_cache_this_op(struct nfsd4_op *op)
{
+ if (op->opnum == OP_ILLEGAL)
+ return false;
return OPDESC(op)->op_flags & OP_CACHEME;
}
@@ -1252,6 +1241,25 @@ static bool need_wrongsec_check(struct svc_rqst *rqstp)
return !(nextd->op_flags & OP_HANDLES_WRONGSEC);
}
+static void svcxdr_init_encode(struct svc_rqst *rqstp,
+ struct nfsd4_compoundres *resp)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_buf *buf = &rqstp->rq_res;
+ struct kvec *head = buf->head;
+
+ xdr->buf = buf;
+ xdr->iov = head;
+ xdr->p = head->iov_base + head->iov_len;
+ xdr->end = head->iov_base + PAGE_SIZE - rqstp->rq_auth_slack;
+ /* Tail and page_len should be zero at this point: */
+ buf->len = buf->head[0].iov_len;
+ xdr->scratch.iov_len = 0;
+ xdr->page_ptr = buf->pages - 1;
+ buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages)
+ - rqstp->rq_auth_slack;
+}
+
/*
* COMPOUND call.
*/
@@ -1263,26 +1271,20 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
struct nfsd4_op *op;
struct nfsd4_operation *opdesc;
struct nfsd4_compound_state *cstate = &resp->cstate;
- int slack_bytes;
- u32 plen = 0;
+ struct svc_fh *current_fh = &cstate->current_fh;
+ struct svc_fh *save_fh = &cstate->save_fh;
__be32 status;
- resp->xbuf = &rqstp->rq_res;
- resp->p = rqstp->rq_res.head[0].iov_base +
- rqstp->rq_res.head[0].iov_len;
- resp->tagp = resp->p;
+ svcxdr_init_encode(rqstp, resp);
+ resp->tagp = resp->xdr.p;
/* reserve space for: taglen, tag, and opcnt */
- resp->p += 2 + XDR_QUADLEN(args->taglen);
- resp->end = rqstp->rq_res.head[0].iov_base + PAGE_SIZE;
+ xdr_reserve_space(&resp->xdr, 8 + args->taglen);
resp->taglen = args->taglen;
resp->tag = args->tag;
- resp->opcnt = 0;
resp->rqstp = rqstp;
- resp->cstate.minorversion = args->minorversion;
- resp->cstate.replay_owner = NULL;
- resp->cstate.session = NULL;
- fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
- fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
+ cstate->minorversion = args->minorversion;
+ fh_init(current_fh, NFS4_FHSIZE);
+ fh_init(save_fh, NFS4_FHSIZE);
/*
* Don't use the deferral mechanism for NFSv4; compounds make it
* too hard to avoid non-idempotency problems.
@@ -1320,35 +1322,34 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
goto encode_op;
}
- /* We must be able to encode a successful response to
- * this operation, with enough room left over to encode a
- * failed response to the next operation. If we don't
- * have enough room, fail with ERR_RESOURCE.
- */
- slack_bytes = (char *)resp->end - (char *)resp->p;
- if (slack_bytes < COMPOUND_SLACK_SPACE
- + COMPOUND_ERR_SLACK_SPACE) {
- BUG_ON(slack_bytes < COMPOUND_ERR_SLACK_SPACE);
- op->status = nfserr_resource;
- goto encode_op;
- }
-
opdesc = OPDESC(op);
- if (!cstate->current_fh.fh_dentry) {
+ if (!current_fh->fh_dentry) {
if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) {
op->status = nfserr_nofilehandle;
goto encode_op;
}
- } else if (cstate->current_fh.fh_export->ex_fslocs.migrated &&
+ } else if (current_fh->fh_export->ex_fslocs.migrated &&
!(opdesc->op_flags & ALLOWED_ON_ABSENT_FS)) {
op->status = nfserr_moved;
goto encode_op;
}
+ fh_clear_wcc(current_fh);
+
/* If op is non-idempotent */
if (opdesc->op_flags & OP_MODIFIES_SOMETHING) {
- plen = opdesc->op_rsize_bop(rqstp, op);
+ /*
+ * Don't execute this op if we couldn't encode a
+ * succesful reply:
+ */
+ u32 plen = opdesc->op_rsize_bop(rqstp, op);
+ /*
+ * Plus if there's another operation, make sure
+ * we'll have space to at least encode an error:
+ */
+ if (resp->opcnt < args->opcnt)
+ plen += COMPOUND_ERR_SLACK_SPACE;
op->status = nfsd4_check_resp_size(resp, plen);
}
@@ -1367,19 +1368,19 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
clear_current_stateid(cstate);
if (need_wrongsec_check(rqstp))
- op->status = check_nfsd_access(cstate->current_fh.fh_export, rqstp);
+ op->status = check_nfsd_access(current_fh->fh_export, rqstp);
}
encode_op:
/* Only from SEQUENCE */
- if (resp->cstate.status == nfserr_replay_cache) {
+ if (cstate->status == nfserr_replay_cache) {
dprintk("%s NFS4.1 replay from cache\n", __func__);
status = op->status;
goto out;
}
if (op->status == nfserr_replay_me) {
op->replay = &cstate->replay_owner->so_replay;
- nfsd4_encode_replay(resp, op);
+ nfsd4_encode_replay(&resp->xdr, op);
status = op->status = op->replay->rp_status;
} else {
nfsd4_encode_operation(resp, op);
@@ -1401,10 +1402,10 @@ encode_op:
nfsd4_increment_op_stats(op->opnum);
}
- resp->cstate.status = status;
- fh_put(&resp->cstate.current_fh);
- fh_put(&resp->cstate.save_fh);
- BUG_ON(resp->cstate.replay_owner);
+ cstate->status = status;
+ fh_put(current_fh);
+ fh_put(save_fh);
+ BUG_ON(cstate->replay_owner);
out:
/* Reset deferral mechanism for RPC deferrals */
rqstp->rq_usedeferral = 1;
@@ -1418,7 +1419,8 @@ out:
#define op_encode_change_info_maxsz (5)
#define nfs4_fattr_bitmap_maxsz (4)
-#define op_encode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
+/* We'll fall back on returning no lockowner if run out of space: */
+#define op_encode_lockowner_maxsz (0)
#define op_encode_lock_denied_maxsz (8 + op_encode_lockowner_maxsz)
#define nfs4_owner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ))
@@ -1450,6 +1452,49 @@ static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op
+ nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
}
+/*
+ * Note since this is an idempotent operation we won't insist on failing
+ * the op prematurely if the estimate is too large. We may turn off splice
+ * reads unnecessarily.
+ */
+static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp,
+ struct nfsd4_op *op)
+{
+ u32 *bmap = op->u.getattr.ga_bmval;
+ u32 bmap0 = bmap[0], bmap1 = bmap[1], bmap2 = bmap[2];
+ u32 ret = 0;
+
+ if (bmap0 & FATTR4_WORD0_ACL)
+ return svc_max_payload(rqstp);
+ if (bmap0 & FATTR4_WORD0_FS_LOCATIONS)
+ return svc_max_payload(rqstp);
+
+ if (bmap1 & FATTR4_WORD1_OWNER) {
+ ret += IDMAP_NAMESZ + 4;
+ bmap1 &= ~FATTR4_WORD1_OWNER;
+ }
+ if (bmap1 & FATTR4_WORD1_OWNER_GROUP) {
+ ret += IDMAP_NAMESZ + 4;
+ bmap1 &= ~FATTR4_WORD1_OWNER_GROUP;
+ }
+ if (bmap0 & FATTR4_WORD0_FILEHANDLE) {
+ ret += NFS4_FHSIZE + 4;
+ bmap0 &= ~FATTR4_WORD0_FILEHANDLE;
+ }
+ if (bmap2 & FATTR4_WORD2_SECURITY_LABEL) {
+ ret += NFSD4_MAX_SEC_LABEL_LEN + 12;
+ bmap2 &= ~FATTR4_WORD2_SECURITY_LABEL;
+ }
+ /*
+ * Largest of remaining attributes are 16 bytes (e.g.,
+ * supported_attributes)
+ */
+ ret += 16 * (hweight32(bmap0) + hweight32(bmap1) + hweight32(bmap2));
+ /* bitmask, length */
+ ret += 20;
+ return ret;
+}
+
static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
return (op_encode_hdr_size + op_encode_change_info_maxsz)
@@ -1480,18 +1525,19 @@ static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
if (rlen > maxcount)
rlen = maxcount;
- return (op_encode_hdr_size + 2) * sizeof(__be32) + rlen;
+ return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
}
static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
+ u32 maxcount = svc_max_payload(rqstp);
u32 rlen = op->u.readdir.rd_maxcount;
- if (rlen > PAGE_SIZE)
- rlen = PAGE_SIZE;
+ if (rlen > maxcount)
+ rlen = maxcount;
- return (op_encode_hdr_size + op_encode_verifier_maxsz)
- * sizeof(__be32) + rlen;
+ return (op_encode_hdr_size + op_encode_verifier_maxsz +
+ XDR_QUADLEN(rlen)) * sizeof(__be32);
}
static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
@@ -1506,6 +1552,12 @@ static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op
+ op_encode_change_info_maxsz) * sizeof(__be32);
}
+static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp,
+ struct nfsd4_op *op)
+{
+ return NFS4_MAX_SESSIONID_LEN + 20;
+}
+
static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
@@ -1513,18 +1565,20 @@ static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *o
static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
- return (op_encode_hdr_size + 2 + 1024) * sizeof(__be32);
+ return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) *
+ sizeof(__be32);
}
static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
- return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
+ return (op_encode_hdr_size + 2 + op_encode_verifier_maxsz) * sizeof(__be32);
}
static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\
- 1 + 1 + 2 + /* eir_flags, spr_how, spo_must_enforce & _allow */\
+ 1 + 1 + /* eir_flags, spr_how */\
+ 4 + /* spo_must_enforce & _allow with bitmap */\
2 + /*eir_server_owner.so_minor_id */\
/* eir_server_owner.so_major_id<> */\
XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + 1 +\
@@ -1585,6 +1639,7 @@ static struct nfsd4_operation nfsd4_ops[] = {
[OP_GETATTR] = {
.op_func = (nfsd4op_func)nfsd4_getattr,
.op_flags = ALLOWED_ON_ABSENT_FS,
+ .op_rsize_bop = nfsd4_getattr_rsize,
.op_name = "OP_GETATTR",
},
[OP_GETFH] = {
@@ -1654,37 +1709,32 @@ static struct nfsd4_operation nfsd4_ops[] = {
[OP_PUTFH] = {
.op_func = (nfsd4op_func)nfsd4_putfh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
- | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING
- | OP_CLEAR_STATEID,
+ | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID,
.op_name = "OP_PUTFH",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_PUTPUBFH] = {
.op_func = (nfsd4op_func)nfsd4_putrootfh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
- | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING
- | OP_CLEAR_STATEID,
+ | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID,
.op_name = "OP_PUTPUBFH",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_PUTROOTFH] = {
.op_func = (nfsd4op_func)nfsd4_putrootfh,
.op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS
- | OP_IS_PUTFH_LIKE | OP_MODIFIES_SOMETHING
- | OP_CLEAR_STATEID,
+ | OP_IS_PUTFH_LIKE | OP_CLEAR_STATEID,
.op_name = "OP_PUTROOTFH",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
[OP_READ] = {
.op_func = (nfsd4op_func)nfsd4_read,
- .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_READ",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_read_rsize,
.op_get_currentstateid = (stateid_getter)nfsd4_get_readstateid,
},
[OP_READDIR] = {
.op_func = (nfsd4op_func)nfsd4_readdir,
- .op_flags = OP_MODIFIES_SOMETHING,
.op_name = "OP_READDIR",
.op_rsize_bop = (nfsd4op_rsize)nfsd4_readdir_rsize,
},
@@ -1842,14 +1892,33 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
};
-#ifdef NFSD_DEBUG
+int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ struct nfsd4_operation *opdesc;
+ nfsd4op_rsize estimator;
+
+ if (op->opnum == OP_ILLEGAL)
+ return op_encode_hdr_size * sizeof(__be32);
+ opdesc = OPDESC(op);
+ estimator = opdesc->op_rsize_bop;
+ return estimator ? estimator(rqstp, op) : PAGE_SIZE;
+}
+
+void warn_on_nonidempotent_op(struct nfsd4_op *op)
+{
+ if (OPDESC(op)->op_flags & OP_MODIFIES_SOMETHING) {
+ pr_err("unable to encode reply to nonidempotent op %d (%s)\n",
+ op->opnum, nfsd4_op_name(op->opnum));
+ WARN_ON_ONCE(1);
+ }
+}
+
static const char *nfsd4_op_name(unsigned opnum)
{
if (opnum < ARRAY_SIZE(nfsd4_ops))
return nfsd4_ops[opnum].op_name;
return "unknown_operation";
}
-#endif
#define nfsd4_voidres nfsd4_voidargs
struct nfsd4_voidargs { int dummy; };
@@ -1881,6 +1950,7 @@ struct svc_version nfsd_version4 = {
.vs_proc = nfsd_procedures4,
.vs_dispatch = nfsd_dispatch,
.vs_xdrsize = NFS4_SVC_XDRSIZE,
+ .vs_rpcb_optnl = 1,
};
/*
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index a601fd49f99..2204e1fe572 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -41,6 +41,7 @@
#include <linux/ratelimit.h>
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/sunrpc/addr.h>
+#include <linux/hash.h>
#include "xdr4.h"
#include "xdr4cb.h"
#include "vfs.h"
@@ -81,13 +82,13 @@ static DEFINE_MUTEX(client_mutex);
* effort to decrease the scope of the client_mutex, this spinlock may
* eventually cover more:
*/
-static DEFINE_SPINLOCK(recall_lock);
+static DEFINE_SPINLOCK(state_lock);
-static struct kmem_cache *openowner_slab = NULL;
-static struct kmem_cache *lockowner_slab = NULL;
-static struct kmem_cache *file_slab = NULL;
-static struct kmem_cache *stateid_slab = NULL;
-static struct kmem_cache *deleg_slab = NULL;
+static struct kmem_cache *openowner_slab;
+static struct kmem_cache *lockowner_slab;
+static struct kmem_cache *file_slab;
+static struct kmem_cache *stateid_slab;
+static struct kmem_cache *deleg_slab;
void
nfs4_lock_state(void)
@@ -235,9 +236,9 @@ static void nfsd4_free_file(struct nfs4_file *f)
static inline void
put_nfs4_file(struct nfs4_file *fi)
{
- if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
+ if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) {
hlist_del(&fi->fi_hash);
- spin_unlock(&recall_lock);
+ spin_unlock(&state_lock);
iput(fi->fi_inode);
nfsd4_free_file(fi);
}
@@ -364,6 +365,79 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
return openlockstateid(nfs4_alloc_stid(clp, stateid_slab));
}
+/*
+ * When we recall a delegation, we should be careful not to hand it
+ * out again straight away.
+ * To ensure this we keep a pair of bloom filters ('new' and 'old')
+ * in which the filehandles of recalled delegations are "stored".
+ * If a filehandle appear in either filter, a delegation is blocked.
+ * When a delegation is recalled, the filehandle is stored in the "new"
+ * filter.
+ * Every 30 seconds we swap the filters and clear the "new" one,
+ * unless both are empty of course.
+ *
+ * Each filter is 256 bits. We hash the filehandle to 32bit and use the
+ * low 3 bytes as hash-table indices.
+ *
+ * 'state_lock', which is always held when block_delegations() is called,
+ * is used to manage concurrent access. Testing does not need the lock
+ * except when swapping the two filters.
+ */
+static struct bloom_pair {
+ int entries, old_entries;
+ time_t swap_time;
+ int new; /* index into 'set' */
+ DECLARE_BITMAP(set[2], 256);
+} blocked_delegations;
+
+static int delegation_blocked(struct knfsd_fh *fh)
+{
+ u32 hash;
+ struct bloom_pair *bd = &blocked_delegations;
+
+ if (bd->entries == 0)
+ return 0;
+ if (seconds_since_boot() - bd->swap_time > 30) {
+ spin_lock(&state_lock);
+ if (seconds_since_boot() - bd->swap_time > 30) {
+ bd->entries -= bd->old_entries;
+ bd->old_entries = bd->entries;
+ memset(bd->set[bd->new], 0,
+ sizeof(bd->set[0]));
+ bd->new = 1-bd->new;
+ bd->swap_time = seconds_since_boot();
+ }
+ spin_unlock(&state_lock);
+ }
+ hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
+ if (test_bit(hash&255, bd->set[0]) &&
+ test_bit((hash>>8)&255, bd->set[0]) &&
+ test_bit((hash>>16)&255, bd->set[0]))
+ return 1;
+
+ if (test_bit(hash&255, bd->set[1]) &&
+ test_bit((hash>>8)&255, bd->set[1]) &&
+ test_bit((hash>>16)&255, bd->set[1]))
+ return 1;
+
+ return 0;
+}
+
+static void block_delegations(struct knfsd_fh *fh)
+{
+ u32 hash;
+ struct bloom_pair *bd = &blocked_delegations;
+
+ hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
+
+ __set_bit(hash&255, bd->set[bd->new]);
+ __set_bit((hash>>8)&255, bd->set[bd->new]);
+ __set_bit((hash>>16)&255, bd->set[bd->new]);
+ if (bd->entries == 0)
+ bd->swap_time = seconds_since_boot();
+ bd->entries += 1;
+}
+
static struct nfs4_delegation *
alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh)
{
@@ -372,10 +446,11 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct sv
dprintk("NFSD alloc_init_deleg\n");
if (num_delegations > max_delegations)
return NULL;
+ if (delegation_blocked(&current_fh->fh_handle))
+ return NULL;
dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
if (dp == NULL)
return dp;
- dp->dl_stid.sc_type = NFS4_DELEG_STID;
/*
* delegation seqid's are never incremented. The 4.1 special
* meaning of seqid 0 isn't meaningful, really, but let's avoid
@@ -402,17 +477,24 @@ static void remove_stid(struct nfs4_stid *s)
idr_remove(stateids, s->sc_stateid.si_opaque.so_id);
}
+static void nfs4_free_stid(struct kmem_cache *slab, struct nfs4_stid *s)
+{
+ kmem_cache_free(slab, s);
+}
+
void
nfs4_put_delegation(struct nfs4_delegation *dp)
{
if (atomic_dec_and_test(&dp->dl_count)) {
- kmem_cache_free(deleg_slab, dp);
+ nfs4_free_stid(deleg_slab, &dp->dl_stid);
num_delegations--;
}
}
static void nfs4_put_deleg_lease(struct nfs4_file *fp)
{
+ if (!fp->fi_lease)
+ return;
if (atomic_dec_and_test(&fp->fi_delegees)) {
vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease);
fp->fi_lease = NULL;
@@ -426,18 +508,30 @@ static void unhash_stid(struct nfs4_stid *s)
s->sc_type = 0;
}
+static void
+hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
+{
+ lockdep_assert_held(&state_lock);
+
+ dp->dl_stid.sc_type = NFS4_DELEG_STID;
+ list_add(&dp->dl_perfile, &fp->fi_delegations);
+ list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
+}
+
/* Called under the state lock. */
static void
unhash_delegation(struct nfs4_delegation *dp)
{
+ spin_lock(&state_lock);
list_del_init(&dp->dl_perclnt);
- spin_lock(&recall_lock);
list_del_init(&dp->dl_perfile);
list_del_init(&dp->dl_recall_lru);
- spin_unlock(&recall_lock);
- nfs4_put_deleg_lease(dp->dl_file);
- put_nfs4_file(dp->dl_file);
- dp->dl_file = NULL;
+ spin_unlock(&state_lock);
+ if (dp->dl_file) {
+ nfs4_put_deleg_lease(dp->dl_file);
+ put_nfs4_file(dp->dl_file);
+ dp->dl_file = NULL;
+ }
}
@@ -610,7 +704,7 @@ static void close_generic_stateid(struct nfs4_ol_stateid *stp)
static void free_generic_stateid(struct nfs4_ol_stateid *stp)
{
remove_stid(&stp->st_stid);
- kmem_cache_free(stateid_slab, stp);
+ nfs4_free_stid(stateid_slab, &stp->st_stid);
}
static void release_lock_stateid(struct nfs4_ol_stateid *stp)
@@ -640,6 +734,12 @@ static void unhash_lockowner(struct nfs4_lockowner *lo)
}
}
+static void nfs4_free_lockowner(struct nfs4_lockowner *lo)
+{
+ kfree(lo->lo_owner.so_owner.data);
+ kmem_cache_free(lockowner_slab, lo);
+}
+
static void release_lockowner(struct nfs4_lockowner *lo)
{
unhash_lockowner(lo);
@@ -668,7 +768,6 @@ static void unhash_open_stateid(struct nfs4_ol_stateid *stp)
static void release_open_stateid(struct nfs4_ol_stateid *stp)
{
unhash_open_stateid(stp);
- unhash_stid(&stp->st_stid);
free_generic_stateid(stp);
}
@@ -690,12 +789,17 @@ static void release_last_closed_stateid(struct nfs4_openowner *oo)
struct nfs4_ol_stateid *s = oo->oo_last_closed_stid;
if (s) {
- unhash_stid(&s->st_stid);
free_generic_stateid(s);
oo->oo_last_closed_stid = NULL;
}
}
+static void nfs4_free_openowner(struct nfs4_openowner *oo)
+{
+ kfree(oo->oo_owner.so_owner.data);
+ kmem_cache_free(openowner_slab, oo);
+}
+
static void release_openowner(struct nfs4_openowner *oo)
{
unhash_openowner(oo);
@@ -829,10 +933,11 @@ static void nfsd4_put_drc_mem(struct nfsd4_channel_attrs *ca)
spin_unlock(&nfsd_drc_lock);
}
-static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *attrs)
+static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
+ struct nfsd4_channel_attrs *battrs)
{
- int numslots = attrs->maxreqs;
- int slotsize = slot_bytes(attrs);
+ int numslots = fattrs->maxreqs;
+ int slotsize = slot_bytes(fattrs);
struct nfsd4_session *new;
int mem, i;
@@ -849,6 +954,10 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *attrs)
if (!new->se_slots[i])
goto out_free;
}
+
+ memcpy(&new->se_fchannel, fattrs, sizeof(struct nfsd4_channel_attrs));
+ memcpy(&new->se_bchannel, battrs, sizeof(struct nfsd4_channel_attrs));
+
return new;
out_free:
while (i--)
@@ -994,8 +1103,7 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
list_add(&new->se_perclnt, &clp->cl_sessions);
spin_unlock(&clp->cl_lock);
spin_unlock(&nn->client_lock);
- memcpy(&new->se_fchannel, &cses->fore_channel,
- sizeof(struct nfsd4_channel_attrs));
+
if (cses->flags & SESSION4_BACK_CHAN) {
struct sockaddr *sa = svc_addr(rqstp);
/*
@@ -1071,10 +1179,22 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
return NULL;
}
clp->cl_name.len = name.len;
+ INIT_LIST_HEAD(&clp->cl_sessions);
+ idr_init(&clp->cl_stateids);
+ atomic_set(&clp->cl_refcount, 0);
+ clp->cl_cb_state = NFSD4_CB_UNKNOWN;
+ INIT_LIST_HEAD(&clp->cl_idhash);
+ INIT_LIST_HEAD(&clp->cl_openowners);
+ INIT_LIST_HEAD(&clp->cl_delegations);
+ INIT_LIST_HEAD(&clp->cl_lru);
+ INIT_LIST_HEAD(&clp->cl_callbacks);
+ INIT_LIST_HEAD(&clp->cl_revoked);
+ spin_lock_init(&clp->cl_lock);
+ rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
return clp;
}
-static inline void
+static void
free_client(struct nfs4_client *clp)
{
struct nfsd_net __maybe_unused *nn = net_generic(clp->net, nfsd_net_id);
@@ -1088,6 +1208,7 @@ free_client(struct nfs4_client *clp)
WARN_ON_ONCE(atomic_read(&ses->se_ref));
free_session(ses);
}
+ rpc_destroy_wait_queue(&clp->cl_cb_waitq);
free_svc_cred(&clp->cl_cred);
kfree(clp->cl_name.data);
idr_destroy(&clp->cl_stateids);
@@ -1116,17 +1237,22 @@ destroy_client(struct nfs4_client *clp)
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
INIT_LIST_HEAD(&reaplist);
- spin_lock(&recall_lock);
+ spin_lock(&state_lock);
while (!list_empty(&clp->cl_delegations)) {
dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
list_del_init(&dp->dl_perclnt);
list_move(&dp->dl_recall_lru, &reaplist);
}
- spin_unlock(&recall_lock);
+ spin_unlock(&state_lock);
while (!list_empty(&reaplist)) {
dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
destroy_delegation(dp);
}
+ list_splice_init(&clp->cl_revoked, &reaplist);
+ while (!list_empty(&reaplist)) {
+ dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
+ destroy_revoked_delegation(dp);
+ }
while (!list_empty(&clp->cl_openowners)) {
oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient);
release_openowner(oo);
@@ -1335,7 +1461,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
if (clp == NULL)
return NULL;
- INIT_LIST_HEAD(&clp->cl_sessions);
ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred);
if (ret) {
spin_lock(&nn->client_lock);
@@ -1343,20 +1468,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
spin_unlock(&nn->client_lock);
return NULL;
}
- idr_init(&clp->cl_stateids);
- atomic_set(&clp->cl_refcount, 0);
- clp->cl_cb_state = NFSD4_CB_UNKNOWN;
- INIT_LIST_HEAD(&clp->cl_idhash);
- INIT_LIST_HEAD(&clp->cl_openowners);
- INIT_LIST_HEAD(&clp->cl_delegations);
- INIT_LIST_HEAD(&clp->cl_lru);
- INIT_LIST_HEAD(&clp->cl_callbacks);
- INIT_LIST_HEAD(&clp->cl_revoked);
- spin_lock_init(&clp->cl_lock);
nfsd4_init_callback(&clp->cl_cb_null);
clp->cl_time = get_seconds();
clear_bit(0, &clp->cl_cb_slot_busy);
- rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
copy_verf(clp, verf);
rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
gen_confirm(clp);
@@ -1526,11 +1640,12 @@ out_err:
}
/*
- * Cache a reply. nfsd4_check_drc_limit() has bounded the cache size.
+ * Cache a reply. nfsd4_check_resp_size() has bounded the cache size.
*/
void
nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
{
+ struct xdr_buf *buf = resp->xdr.buf;
struct nfsd4_slot *slot = resp->cstate.slot;
unsigned int base;
@@ -1544,11 +1659,9 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
slot->sl_datalen = 0;
return;
}
- slot->sl_datalen = (char *)resp->p - (char *)resp->cstate.datap;
- base = (char *)resp->cstate.datap -
- (char *)resp->xbuf->head[0].iov_base;
- if (read_bytes_from_xdr_buf(resp->xbuf, base, slot->sl_data,
- slot->sl_datalen))
+ base = resp->cstate.data_offset;
+ slot->sl_datalen = buf->len - base;
+ if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen))
WARN("%s: sessions DRC could not cache compound\n", __func__);
return;
}
@@ -1584,28 +1697,31 @@ nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args,
* The sequence operation is not cached because we can use the slot and
* session values.
*/
-__be32
+static __be32
nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
struct nfsd4_sequence *seq)
{
struct nfsd4_slot *slot = resp->cstate.slot;
+ struct xdr_stream *xdr = &resp->xdr;
+ __be32 *p;
__be32 status;
dprintk("--> %s slot %p\n", __func__, slot);
- /* Either returns 0 or nfserr_retry_uncached */
status = nfsd4_enc_sequence_replay(resp->rqstp->rq_argp, resp);
- if (status == nfserr_retry_uncached_rep)
+ if (status)
return status;
- /* The sequence operation has been encoded, cstate->datap set. */
- memcpy(resp->cstate.datap, slot->sl_data, slot->sl_datalen);
+ p = xdr_reserve_space(xdr, slot->sl_datalen);
+ if (!p) {
+ WARN_ON_ONCE(1);
+ return nfserr_serverfault;
+ }
+ xdr_encode_opaque_fixed(p, slot->sl_data, slot->sl_datalen);
+ xdr_commit_encode(xdr);
resp->opcnt = slot->sl_opcnt;
- resp->p = resp->cstate.datap + XDR_QUADLEN(slot->sl_datalen);
- status = slot->sl_status;
-
- return status;
+ return slot->sl_status;
}
/*
@@ -1843,6 +1959,11 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs
return nfs_ok;
}
+#define NFSD_CB_MAX_REQ_SZ ((NFS4_enc_cb_recall_sz + \
+ RPC_MAX_HEADER_WITH_AUTH) * sizeof(__be32))
+#define NFSD_CB_MAX_RESP_SZ ((NFS4_dec_cb_recall_sz + \
+ RPC_MAX_REPHEADER_WITH_AUTH) * sizeof(__be32))
+
static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
{
ca->headerpadsz = 0;
@@ -1853,9 +1974,9 @@ static __be32 check_backchannel_attrs(struct nfsd4_channel_attrs *ca)
* less than 1k. Tighten up this estimate in the unlikely event
* it turns out to be a problem for some client:
*/
- if (ca->maxreq_sz < NFS4_enc_cb_recall_sz + RPC_MAX_HEADER_WITH_AUTH)
+ if (ca->maxreq_sz < NFSD_CB_MAX_REQ_SZ)
return nfserr_toosmall;
- if (ca->maxresp_sz < NFS4_dec_cb_recall_sz + RPC_MAX_REPHEADER_WITH_AUTH)
+ if (ca->maxresp_sz < NFSD_CB_MAX_RESP_SZ)
return nfserr_toosmall;
ca->maxresp_cached = 0;
if (ca->maxops < 2)
@@ -1905,9 +2026,9 @@ nfsd4_create_session(struct svc_rqst *rqstp,
return status;
status = check_backchannel_attrs(&cr_ses->back_channel);
if (status)
- return status;
+ goto out_release_drc_mem;
status = nfserr_jukebox;
- new = alloc_session(&cr_ses->fore_channel);
+ new = alloc_session(&cr_ses->fore_channel, &cr_ses->back_channel);
if (!new)
goto out_release_drc_mem;
conn = alloc_conn_from_crses(rqstp, cr_ses);
@@ -2172,11 +2293,13 @@ nfsd4_sequence(struct svc_rqst *rqstp,
struct nfsd4_sequence *seq)
{
struct nfsd4_compoundres *resp = rqstp->rq_resp;
+ struct xdr_stream *xdr = &resp->xdr;
struct nfsd4_session *session;
struct nfs4_client *clp;
struct nfsd4_slot *slot;
struct nfsd4_conn *conn;
__be32 status;
+ int buflen;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
if (resp->opcnt != 1)
@@ -2245,6 +2368,16 @@ nfsd4_sequence(struct svc_rqst *rqstp,
if (status)
goto out_put_session;
+ buflen = (seq->cachethis) ?
+ session->se_fchannel.maxresp_cached :
+ session->se_fchannel.maxresp_sz;
+ status = (seq->cachethis) ? nfserr_rep_too_big_to_cache :
+ nfserr_rep_too_big;
+ if (xdr_restrict_buflen(xdr, buflen - rqstp->rq_auth_slack))
+ goto out_put_session;
+ svc_reserve(rqstp, buflen);
+
+ status = nfs_ok;
/* Success! bump slot seqid */
slot->sl_seqid = seq->seqid;
slot->sl_flags |= NFSD4_SLOT_INUSE;
@@ -2270,7 +2403,8 @@ out:
if (!list_empty(&clp->cl_revoked))
seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED;
out_no_session:
- kfree(conn);
+ if (conn)
+ free_conn(conn);
spin_unlock(&nn->client_lock);
return status;
out_put_session:
@@ -2481,28 +2615,19 @@ static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino)
fp->fi_lease = NULL;
memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
memset(fp->fi_access, 0, sizeof(fp->fi_access));
- spin_lock(&recall_lock);
+ spin_lock(&state_lock);
hlist_add_head(&fp->fi_hash, &file_hashtbl[hashval]);
- spin_unlock(&recall_lock);
-}
-
-static void
-nfsd4_free_slab(struct kmem_cache **slab)
-{
- if (*slab == NULL)
- return;
- kmem_cache_destroy(*slab);
- *slab = NULL;
+ spin_unlock(&state_lock);
}
void
nfsd4_free_slabs(void)
{
- nfsd4_free_slab(&openowner_slab);
- nfsd4_free_slab(&lockowner_slab);
- nfsd4_free_slab(&file_slab);
- nfsd4_free_slab(&stateid_slab);
- nfsd4_free_slab(&deleg_slab);
+ kmem_cache_destroy(openowner_slab);
+ kmem_cache_destroy(lockowner_slab);
+ kmem_cache_destroy(file_slab);
+ kmem_cache_destroy(stateid_slab);
+ kmem_cache_destroy(deleg_slab);
}
int
@@ -2511,42 +2636,38 @@ nfsd4_init_slabs(void)
openowner_slab = kmem_cache_create("nfsd4_openowners",
sizeof(struct nfs4_openowner), 0, 0, NULL);
if (openowner_slab == NULL)
- goto out_nomem;
+ goto out;
lockowner_slab = kmem_cache_create("nfsd4_lockowners",
sizeof(struct nfs4_lockowner), 0, 0, NULL);
if (lockowner_slab == NULL)
- goto out_nomem;
+ goto out_free_openowner_slab;
file_slab = kmem_cache_create("nfsd4_files",
sizeof(struct nfs4_file), 0, 0, NULL);
if (file_slab == NULL)
- goto out_nomem;
+ goto out_free_lockowner_slab;
stateid_slab = kmem_cache_create("nfsd4_stateids",
sizeof(struct nfs4_ol_stateid), 0, 0, NULL);
if (stateid_slab == NULL)
- goto out_nomem;
+ goto out_free_file_slab;
deleg_slab = kmem_cache_create("nfsd4_delegations",
sizeof(struct nfs4_delegation), 0, 0, NULL);
if (deleg_slab == NULL)
- goto out_nomem;
+ goto out_free_stateid_slab;
return 0;
-out_nomem:
- nfsd4_free_slabs();
+
+out_free_stateid_slab:
+ kmem_cache_destroy(stateid_slab);
+out_free_file_slab:
+ kmem_cache_destroy(file_slab);
+out_free_lockowner_slab:
+ kmem_cache_destroy(lockowner_slab);
+out_free_openowner_slab:
+ kmem_cache_destroy(openowner_slab);
+out:
dprintk("nfsd4: out of memory while initializing nfsv4\n");
return -ENOMEM;
}
-void nfs4_free_openowner(struct nfs4_openowner *oo)
-{
- kfree(oo->oo_owner.so_owner.data);
- kmem_cache_free(openowner_slab, oo);
-}
-
-void nfs4_free_lockowner(struct nfs4_lockowner *lo)
-{
- kfree(lo->lo_owner.so_owner.data);
- kmem_cache_free(lockowner_slab, lo);
-}
-
static void init_nfs4_replay(struct nfs4_replay *rp)
{
rp->rp_status = nfserr_serverfault;
@@ -2667,15 +2788,15 @@ find_file(struct inode *ino)
unsigned int hashval = file_hashval(ino);
struct nfs4_file *fp;
- spin_lock(&recall_lock);
+ spin_lock(&state_lock);
hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
if (fp->fi_inode == ino) {
get_nfs4_file(fp);
- spin_unlock(&recall_lock);
+ spin_unlock(&state_lock);
return fp;
}
}
- spin_unlock(&recall_lock);
+ spin_unlock(&state_lock);
return NULL;
}
@@ -2712,6 +2833,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
struct nfs4_client *clp = dp->dl_stid.sc_client;
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+ lockdep_assert_held(&state_lock);
/* We're assuming the state code never drops its reference
* without first removing the lease. Since we're in this lease
* callback (and since the lease code is serialized by the kernel
@@ -2724,6 +2846,8 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
/* Only place dl_time is set; protected by i_lock: */
dp->dl_time = get_seconds();
+ block_delegations(&dp->dl_fh);
+
nfsd4_cb_recall(dp);
}
@@ -2748,11 +2872,11 @@ static void nfsd_break_deleg_cb(struct file_lock *fl)
*/
fl->fl_break_time = 0;
- spin_lock(&recall_lock);
+ spin_lock(&state_lock);
fp->fi_had_conflict = true;
list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
nfsd_break_one_deleg(dp);
- spin_unlock(&recall_lock);
+ spin_unlock(&state_lock);
}
static
@@ -3008,7 +3132,7 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int f
return NULL;
locks_init_lock(fl);
fl->fl_lmops = &nfsd_lease_mng_ops;
- fl->fl_flags = FL_LEASE;
+ fl->fl_flags = FL_DELEG;
fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
fl->fl_end = OFFSET_MAX;
fl->fl_owner = (fl_owner_t)(dp->dl_file);
@@ -3026,49 +3150,38 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
if (!fl)
return -ENOMEM;
fl->fl_file = find_readable_file(fp);
- list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
status = vfs_setlease(fl->fl_file, fl->fl_type, &fl);
- if (status) {
- list_del_init(&dp->dl_perclnt);
- locks_free_lock(fl);
- return status;
- }
+ if (status)
+ goto out_free;
fp->fi_lease = fl;
fp->fi_deleg_file = get_file(fl->fl_file);
atomic_set(&fp->fi_delegees, 1);
- list_add(&dp->dl_perfile, &fp->fi_delegations);
+ spin_lock(&state_lock);
+ hash_delegation_locked(dp, fp);
+ spin_unlock(&state_lock);
return 0;
+out_free:
+ locks_free_lock(fl);
+ return status;
}
static int nfs4_set_delegation(struct nfs4_delegation *dp, struct nfs4_file *fp)
{
- int status;
-
if (fp->fi_had_conflict)
return -EAGAIN;
get_nfs4_file(fp);
dp->dl_file = fp;
- if (!fp->fi_lease) {
- status = nfs4_setlease(dp);
- if (status)
- goto out_free;
- return 0;
- }
- spin_lock(&recall_lock);
+ if (!fp->fi_lease)
+ return nfs4_setlease(dp);
+ spin_lock(&state_lock);
+ atomic_inc(&fp->fi_delegees);
if (fp->fi_had_conflict) {
- spin_unlock(&recall_lock);
- status = -EAGAIN;
- goto out_free;
+ spin_unlock(&state_lock);
+ return -EAGAIN;
}
- atomic_inc(&fp->fi_delegees);
- list_add(&dp->dl_perfile, &fp->fi_delegations);
- spin_unlock(&recall_lock);
- list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
+ hash_delegation_locked(dp, fp);
+ spin_unlock(&state_lock);
return 0;
-out_free:
- put_nfs4_file(fp);
- dp->dl_file = fp;
- return status;
}
static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
@@ -3117,6 +3230,7 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
goto out_no_deleg;
break;
case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_FH:
/*
* Let's not give out any delegations till everyone's
* had the chance to reclaim theirs....
@@ -3154,8 +3268,7 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
return;
out_free:
- unhash_stid(&dp->dl_stid);
- nfs4_put_delegation(dp);
+ destroy_delegation(dp);
out_no_deleg:
open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE;
if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
@@ -3372,8 +3485,7 @@ nfs4_laundromat(struct nfsd_net *nn)
struct nfs4_delegation *dp;
struct list_head *pos, *next, reaplist;
time_t cutoff = get_seconds() - nn->nfsd4_lease;
- time_t t, clientid_val = nn->nfsd4_lease;
- time_t u, test_val = nn->nfsd4_lease;
+ time_t t, new_timeo = nn->nfsd4_lease;
nfs4_lock_state();
@@ -3385,8 +3497,7 @@ nfs4_laundromat(struct nfsd_net *nn)
clp = list_entry(pos, struct nfs4_client, cl_lru);
if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) {
t = clp->cl_time - cutoff;
- if (clientid_val > t)
- clientid_val = t;
+ new_timeo = min(new_timeo, t);
break;
}
if (mark_client_expired_locked(clp)) {
@@ -3403,39 +3514,35 @@ nfs4_laundromat(struct nfsd_net *nn)
clp->cl_clientid.cl_id);
expire_client(clp);
}
- spin_lock(&recall_lock);
+ spin_lock(&state_lock);
list_for_each_safe(pos, next, &nn->del_recall_lru) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
if (net_generic(dp->dl_stid.sc_client->net, nfsd_net_id) != nn)
continue;
if (time_after((unsigned long)dp->dl_time, (unsigned long)cutoff)) {
- u = dp->dl_time - cutoff;
- if (test_val > u)
- test_val = u;
+ t = dp->dl_time - cutoff;
+ new_timeo = min(new_timeo, t);
break;
}
list_move(&dp->dl_recall_lru, &reaplist);
}
- spin_unlock(&recall_lock);
+ spin_unlock(&state_lock);
list_for_each_safe(pos, next, &reaplist) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
revoke_delegation(dp);
}
- test_val = nn->nfsd4_lease;
list_for_each_safe(pos, next, &nn->close_lru) {
oo = container_of(pos, struct nfs4_openowner, oo_close_lru);
if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) {
- u = oo->oo_time - cutoff;
- if (test_val > u)
- test_val = u;
+ t = oo->oo_time - cutoff;
+ new_timeo = min(new_timeo, t);
break;
}
release_openowner(oo);
}
- if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT)
- clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT;
+ new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT);
nfs4_unlock_state();
- return clientid_val;
+ return new_timeo;
}
static struct workqueue_struct *laundry_wq;
@@ -3609,8 +3716,11 @@ static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask,
return nfserr_bad_stateid;
status = lookup_clientid(&stateid->si_opaque.so_clid, sessions,
nn, &cl);
- if (status == nfserr_stale_clientid)
+ if (status == nfserr_stale_clientid) {
+ if (sessions)
+ return nfserr_bad_stateid;
return nfserr_stale_stateid;
+ }
if (status)
return status;
*s = find_stateid_by_type(cl, stateid, typemask);
@@ -3632,6 +3742,7 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
struct svc_fh *current_fh = &cstate->current_fh;
struct inode *ino = current_fh->fh_dentry->d_inode;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct file *file = NULL;
__be32 status;
if (filpp)
@@ -3643,10 +3754,12 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
return check_special_stateids(net, current_fh, stateid, flags);
+ nfs4_lock_state();
+
status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID,
&s, cstate->minorversion, nn);
if (status)
- return status;
+ goto out;
status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate));
if (status)
goto out;
@@ -3657,8 +3770,8 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
if (status)
goto out;
if (filpp) {
- *filpp = dp->dl_file->fi_deleg_file;
- if (!*filpp) {
+ file = dp->dl_file->fi_deleg_file;
+ if (!file) {
WARN_ON_ONCE(1);
status = nfserr_serverfault;
goto out;
@@ -3679,25 +3792,36 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
goto out;
if (filpp) {
if (flags & RD_STATE)
- *filpp = find_readable_file(stp->st_file);
+ file = find_readable_file(stp->st_file);
else
- *filpp = find_writeable_file(stp->st_file);
+ file = find_writeable_file(stp->st_file);
}
break;
default:
- return nfserr_bad_stateid;
+ status = nfserr_bad_stateid;
+ goto out;
}
status = nfs_ok;
+ if (file)
+ *filpp = get_file(file);
out:
+ nfs4_unlock_state();
return status;
}
static __be32
nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp)
{
- if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner)))
+ struct nfs4_lockowner *lo = lockowner(stp->st_stateowner);
+
+ if (check_for_locks(stp->st_file, lo))
return nfserr_locks_held;
- release_lock_stateid(stp);
+ /*
+ * Currently there's a 1-1 lock stateid<->lockowner
+ * correspondance, and we have to delete the lockowner when we
+ * delete the lock stateid:
+ */
+ release_lockowner(lo);
return nfs_ok;
}
@@ -3995,10 +4119,9 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfsd4_close_open_stateid(stp);
- if (cstate->minorversion) {
- unhash_stid(&stp->st_stid);
+ if (cstate->minorversion)
free_generic_stateid(stp);
- } else
+ else
oo->oo_last_closed_stid = stp;
if (list_empty(&oo->oo_owner.so_stateids)) {
@@ -4138,6 +4261,10 @@ static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, c
if (!same_owner_str(&lo->lo_owner, owner, clid))
return false;
+ if (list_empty(&lo->lo_owner.so_stateids)) {
+ WARN_ON_ONCE(1);
+ return false;
+ }
lst = list_first_entry(&lo->lo_owner.so_stateids,
struct nfs4_ol_stateid, st_perstateowner);
return lst->st_file->fi_inode == inode;
@@ -4864,6 +4991,7 @@ static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max,
struct nfs4_delegation *dp, *next;
u64 count = 0;
+ lockdep_assert_held(&state_lock);
list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) {
if (victims)
list_move(&dp->dl_recall_lru, victims);
@@ -4879,9 +5007,9 @@ u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max)
LIST_HEAD(victims);
u64 count;
- spin_lock(&recall_lock);
+ spin_lock(&state_lock);
count = nfsd_find_all_delegations(clp, max, &victims);
- spin_unlock(&recall_lock);
+ spin_unlock(&state_lock);
list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)
revoke_delegation(dp);
@@ -4895,11 +5023,11 @@ u64 nfsd_recall_client_delegations(struct nfs4_client *clp, u64 max)
LIST_HEAD(victims);
u64 count;
- spin_lock(&recall_lock);
+ spin_lock(&state_lock);
count = nfsd_find_all_delegations(clp, max, &victims);
list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)
nfsd_break_one_deleg(dp);
- spin_unlock(&recall_lock);
+ spin_unlock(&state_lock);
return count;
}
@@ -4908,9 +5036,9 @@ u64 nfsd_print_client_delegations(struct nfs4_client *clp, u64 max)
{
u64 count = 0;
- spin_lock(&recall_lock);
+ spin_lock(&state_lock);
count = nfsd_find_all_delegations(clp, max, NULL);
- spin_unlock(&recall_lock);
+ spin_unlock(&state_lock);
nfsd_print_count(clp, count, "delegations");
return count;
@@ -4951,13 +5079,6 @@ struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_
#endif /* CONFIG_NFSD_FAULT_INJECTION */
-/* initialization to perform at module load time: */
-
-void
-nfs4_state_init(void)
-{
-}
-
/*
* Since the lifetime of a delegation isn't limited to that of an open, a
* client may quite reasonably hang on to a delegation as long as it has
@@ -5045,7 +5166,6 @@ nfs4_state_destroy_net(struct net *net)
int i;
struct nfs4_client *clp = NULL;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- struct rb_node *node, *tmp;
for (i = 0; i < CLIENT_HASH_SIZE; i++) {
while (!list_empty(&nn->conf_id_hashtbl[i])) {
@@ -5054,13 +5174,11 @@ nfs4_state_destroy_net(struct net *net)
}
}
- node = rb_first(&nn->unconf_name_tree);
- while (node != NULL) {
- tmp = node;
- node = rb_next(tmp);
- clp = rb_entry(tmp, struct nfs4_client, cl_namenode);
- rb_erase(tmp, &nn->unconf_name_tree);
- destroy_client(clp);
+ for (i = 0; i < CLIENT_HASH_SIZE; i++) {
+ while (!list_empty(&nn->unconf_id_hashtbl[i])) {
+ clp = list_entry(nn->unconf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
+ destroy_client(clp);
+ }
}
kfree(nn->sessionid_hashtbl);
@@ -5119,7 +5237,6 @@ out_recovery:
return ret;
}
-/* should be called with the state lock held */
void
nfs4_state_shutdown_net(struct net *net)
{
@@ -5130,13 +5247,14 @@ nfs4_state_shutdown_net(struct net *net)
cancel_delayed_work_sync(&nn->laundromat_work);
locks_end_grace(&nn->nfsd4_manager);
+ nfs4_lock_state();
INIT_LIST_HEAD(&reaplist);
- spin_lock(&recall_lock);
+ spin_lock(&state_lock);
list_for_each_safe(pos, next, &nn->del_recall_lru) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
list_move(&dp->dl_recall_lru, &reaplist);
}
- spin_unlock(&recall_lock);
+ spin_unlock(&state_lock);
list_for_each_safe(pos, next, &reaplist) {
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
destroy_delegation(dp);
@@ -5144,6 +5262,7 @@ nfs4_state_shutdown_net(struct net *net)
nfsd4_client_tracking_exit(net);
nfs4_state_destroy_net(net);
+ nfs4_unlock_state();
}
void
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index d9454fe5653..944275c8f56 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -98,16 +98,6 @@ xdr_error: \
status = nfserr_bad_xdr; \
goto out
-#define READ32(x) (x) = ntohl(*p++)
-#define READ64(x) do { \
- (x) = (u64)ntohl(*p++) << 32; \
- (x) |= ntohl(*p++); \
-} while (0)
-#define READTIME(x) do { \
- p++; \
- (x) = ntohl(*p++); \
- p++; \
-} while (0)
#define READMEM(x,nbytes) do { \
x = (char *)p; \
p += XDR_QUADLEN(nbytes); \
@@ -141,8 +131,8 @@ xdr_error: \
static void next_decode_page(struct nfsd4_compoundargs *argp)
{
- argp->pagelist++;
argp->p = page_address(argp->pagelist[0]);
+ argp->pagelist++;
if (argp->pagelen < PAGE_SIZE) {
argp->end = argp->p + (argp->pagelen>>2);
argp->pagelen = 0;
@@ -190,6 +180,15 @@ static int zero_clientid(clientid_t *clid)
return (clid->cl_boot == 0) && (clid->cl_id == 0);
}
+/**
+ * defer_free - mark an allocation as deferred freed
+ * @argp: NFSv4 compound argument structure to be freed with
+ * @release: release callback to free @p, typically kfree()
+ * @p: pointer to be freed
+ *
+ * Marks @p to be freed when processing the compound operation
+ * described in @argp finishes.
+ */
static int
defer_free(struct nfsd4_compoundargs *argp,
void (*release)(const void *), void *p)
@@ -206,6 +205,16 @@ defer_free(struct nfsd4_compoundargs *argp,
return 0;
}
+/**
+ * savemem - duplicate a chunk of memory for later processing
+ * @argp: NFSv4 compound argument structure to be freed with
+ * @p: pointer to be duplicated
+ * @nbytes: length to be duplicated
+ *
+ * Returns a pointer to a copy of @nbytes bytes of memory at @p
+ * that are preserved until processing of the NFSv4 compound
+ * operation described by @argp finishes.
+ */
static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
{
if (p == argp->tmp) {
@@ -234,17 +243,17 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
bmval[2] = 0;
READ_BUF(4);
- READ32(bmlen);
+ bmlen = be32_to_cpup(p++);
if (bmlen > 1000)
goto xdr_error;
READ_BUF(bmlen << 2);
if (bmlen > 0)
- READ32(bmval[0]);
+ bmval[0] = be32_to_cpup(p++);
if (bmlen > 1)
- READ32(bmval[1]);
+ bmval[1] = be32_to_cpup(p++);
if (bmlen > 2)
- READ32(bmval[2]);
+ bmval[2] = be32_to_cpup(p++);
DECODE_TAIL;
}
@@ -256,8 +265,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
{
int expected_len, len = 0;
u32 dummy32;
+ u64 sec;
char *buf;
- int host_err;
DECODE_HEAD;
iattr->ia_valid = 0;
@@ -265,12 +274,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
return status;
READ_BUF(4);
- READ32(expected_len);
+ expected_len = be32_to_cpup(p++);
if (bmval[0] & FATTR4_WORD0_SIZE) {
READ_BUF(8);
len += 8;
- READ64(iattr->ia_size);
+ p = xdr_decode_hyper(p, &iattr->ia_size);
iattr->ia_valid |= ATTR_SIZE;
}
if (bmval[0] & FATTR4_WORD0_ACL) {
@@ -278,25 +287,24 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
struct nfs4_ace *ace;
READ_BUF(4); len += 4;
- READ32(nace);
+ nace = be32_to_cpup(p++);
if (nace > NFS4_ACL_MAX)
- return nfserr_resource;
+ return nfserr_fbig;
*acl = nfs4_acl_new(nace);
- if (*acl == NULL) {
- host_err = -ENOMEM;
- goto out_nfserr;
- }
+ if (*acl == NULL)
+ return nfserr_jukebox;
+
defer_free(argp, kfree, *acl);
(*acl)->naces = nace;
for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) {
READ_BUF(16); len += 16;
- READ32(ace->type);
- READ32(ace->flag);
- READ32(ace->access_mask);
- READ32(dummy32);
+ ace->type = be32_to_cpup(p++);
+ ace->flag = be32_to_cpup(p++);
+ ace->access_mask = be32_to_cpup(p++);
+ dummy32 = be32_to_cpup(p++);
READ_BUF(dummy32);
len += XDR_QUADLEN(dummy32) << 2;
READMEM(buf, dummy32);
@@ -318,14 +326,14 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
if (bmval[1] & FATTR4_WORD1_MODE) {
READ_BUF(4);
len += 4;
- READ32(iattr->ia_mode);
+ iattr->ia_mode = be32_to_cpup(p++);
iattr->ia_mode &= (S_IFMT | S_IALLUGO);
iattr->ia_valid |= ATTR_MODE;
}
if (bmval[1] & FATTR4_WORD1_OWNER) {
READ_BUF(4);
len += 4;
- READ32(dummy32);
+ dummy32 = be32_to_cpup(p++);
READ_BUF(dummy32);
len += (XDR_QUADLEN(dummy32) << 2);
READMEM(buf, dummy32);
@@ -336,7 +344,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) {
READ_BUF(4);
len += 4;
- READ32(dummy32);
+ dummy32 = be32_to_cpup(p++);
READ_BUF(dummy32);
len += (XDR_QUADLEN(dummy32) << 2);
READMEM(buf, dummy32);
@@ -347,15 +355,16 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
READ_BUF(4);
len += 4;
- READ32(dummy32);
+ dummy32 = be32_to_cpup(p++);
switch (dummy32) {
case NFS4_SET_TO_CLIENT_TIME:
/* We require the high 32 bits of 'seconds' to be 0, and we ignore
all 32 bits of 'nseconds'. */
READ_BUF(12);
len += 12;
- READ64(iattr->ia_atime.tv_sec);
- READ32(iattr->ia_atime.tv_nsec);
+ p = xdr_decode_hyper(p, &sec);
+ iattr->ia_atime.tv_sec = (time_t)sec;
+ iattr->ia_atime.tv_nsec = be32_to_cpup(p++);
if (iattr->ia_atime.tv_nsec >= (u32)1000000000)
return nfserr_inval;
iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET);
@@ -370,15 +379,16 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) {
READ_BUF(4);
len += 4;
- READ32(dummy32);
+ dummy32 = be32_to_cpup(p++);
switch (dummy32) {
case NFS4_SET_TO_CLIENT_TIME:
/* We require the high 32 bits of 'seconds' to be 0, and we ignore
all 32 bits of 'nseconds'. */
READ_BUF(12);
len += 12;
- READ64(iattr->ia_mtime.tv_sec);
- READ32(iattr->ia_mtime.tv_nsec);
+ p = xdr_decode_hyper(p, &sec);
+ iattr->ia_mtime.tv_sec = sec;
+ iattr->ia_mtime.tv_nsec = be32_to_cpup(p++);
if (iattr->ia_mtime.tv_nsec >= (u32)1000000000)
return nfserr_inval;
iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET);
@@ -396,13 +406,13 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
READ_BUF(4);
len += 4;
- READ32(dummy32); /* lfs: we don't use it */
+ dummy32 = be32_to_cpup(p++); /* lfs: we don't use it */
READ_BUF(4);
len += 4;
- READ32(dummy32); /* pi: we don't use it either */
+ dummy32 = be32_to_cpup(p++); /* pi: we don't use it either */
READ_BUF(4);
len += 4;
- READ32(dummy32);
+ dummy32 = be32_to_cpup(p++);
READ_BUF(dummy32);
if (dummy32 > NFSD4_MAX_SEC_LABEL_LEN)
return nfserr_badlabel;
@@ -411,6 +421,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
label->data = kzalloc(dummy32 + 1, GFP_KERNEL);
if (!label->data)
return nfserr_jukebox;
+ label->len = dummy32;
defer_free(argp, kfree, label->data);
memcpy(label->data, buf, dummy32);
}
@@ -424,10 +435,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
goto xdr_error;
DECODE_TAIL;
-
-out_nfserr:
- status = nfserrno(host_err);
- goto out;
}
static __be32
@@ -436,7 +443,7 @@ nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid)
DECODE_HEAD;
READ_BUF(sizeof(stateid_t));
- READ32(sid->si_generation);
+ sid->si_generation = be32_to_cpup(p++);
COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
DECODE_TAIL;
@@ -448,7 +455,7 @@ nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access
DECODE_HEAD;
READ_BUF(4);
- READ32(access->ac_req_access);
+ access->ac_req_access = be32_to_cpup(p++);
DECODE_TAIL;
}
@@ -463,7 +470,7 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_
/* callback_sec_params4 */
READ_BUF(4);
- READ32(nr_secflavs);
+ nr_secflavs = be32_to_cpup(p++);
if (nr_secflavs)
cbs->flavor = (u32)(-1);
else
@@ -471,7 +478,7 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_
cbs->flavor = 0;
for (i = 0; i < nr_secflavs; ++i) {
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
switch (dummy) {
case RPC_AUTH_NULL:
/* Nothing to read */
@@ -481,21 +488,21 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_
case RPC_AUTH_UNIX:
READ_BUF(8);
/* stamp */
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
/* machine name */
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy);
SAVEMEM(machine_name, dummy);
/* uid, gid */
READ_BUF(8);
- READ32(uid);
- READ32(gid);
+ uid = be32_to_cpup(p++);
+ gid = be32_to_cpup(p++);
/* more gids */
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy * 4);
if (cbs->flavor == (u32)(-1)) {
kuid_t kuid = make_kuid(&init_user_ns, uid);
@@ -515,14 +522,14 @@ static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_
"not supported!\n");
READ_BUF(8);
/* gcbp_service */
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
/* gcbp_handle_from_server */
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy);
p += XDR_QUADLEN(dummy);
/* gcbp_handle_from_client */
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy);
break;
default:
@@ -538,7 +545,7 @@ static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, stru
DECODE_HEAD;
READ_BUF(4);
- READ32(bc->bc_cb_program);
+ bc->bc_cb_program = be32_to_cpup(p++);
nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec);
DECODE_TAIL;
@@ -550,7 +557,7 @@ static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp,
READ_BUF(NFS4_MAX_SESSIONID_LEN + 8);
COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
- READ32(bcts->dir);
+ bcts->dir = be32_to_cpup(p++);
/* XXX: skipping ctsa_use_conn_in_rdma_mode. Perhaps Tom Tucker
* could help us figure out we should be using it. */
DECODE_TAIL;
@@ -562,7 +569,7 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
DECODE_HEAD;
READ_BUF(4);
- READ32(close->cl_seqid);
+ close->cl_seqid = be32_to_cpup(p++);
return nfsd4_decode_stateid(argp, &close->cl_stateid);
DECODE_TAIL;
@@ -575,8 +582,8 @@ nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit
DECODE_HEAD;
READ_BUF(12);
- READ64(commit->co_offset);
- READ32(commit->co_count);
+ p = xdr_decode_hyper(p, &commit->co_offset);
+ commit->co_count = be32_to_cpup(p++);
DECODE_TAIL;
}
@@ -587,19 +594,30 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
DECODE_HEAD;
READ_BUF(4);
- READ32(create->cr_type);
+ create->cr_type = be32_to_cpup(p++);
switch (create->cr_type) {
case NF4LNK:
READ_BUF(4);
- READ32(create->cr_linklen);
+ create->cr_linklen = be32_to_cpup(p++);
READ_BUF(create->cr_linklen);
- SAVEMEM(create->cr_linkname, create->cr_linklen);
+ /*
+ * The VFS will want a null-terminated string, and
+ * null-terminating in place isn't safe since this might
+ * end on a page boundary:
+ */
+ create->cr_linkname =
+ kmalloc(create->cr_linklen + 1, GFP_KERNEL);
+ if (!create->cr_linkname)
+ return nfserr_jukebox;
+ memcpy(create->cr_linkname, p, create->cr_linklen);
+ create->cr_linkname[create->cr_linklen] = '\0';
+ defer_free(argp, kfree, create->cr_linkname);
break;
case NF4BLK:
case NF4CHR:
READ_BUF(8);
- READ32(create->cr_specdata1);
- READ32(create->cr_specdata2);
+ create->cr_specdata1 = be32_to_cpup(p++);
+ create->cr_specdata2 = be32_to_cpup(p++);
break;
case NF4SOCK:
case NF4FIFO:
@@ -609,7 +627,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
}
READ_BUF(4);
- READ32(create->cr_namelen);
+ create->cr_namelen = be32_to_cpup(p++);
READ_BUF(create->cr_namelen);
SAVEMEM(create->cr_name, create->cr_namelen);
if ((status = check_filename(create->cr_name, create->cr_namelen)))
@@ -641,7 +659,7 @@ nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
DECODE_HEAD;
READ_BUF(4);
- READ32(link->li_namelen);
+ link->li_namelen = be32_to_cpup(p++);
READ_BUF(link->li_namelen);
SAVEMEM(link->li_name, link->li_namelen);
if ((status = check_filename(link->li_name, link->li_namelen)))
@@ -659,24 +677,24 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
* type, reclaim(boolean), offset, length, new_lock_owner(boolean)
*/
READ_BUF(28);
- READ32(lock->lk_type);
+ lock->lk_type = be32_to_cpup(p++);
if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT))
goto xdr_error;
- READ32(lock->lk_reclaim);
- READ64(lock->lk_offset);
- READ64(lock->lk_length);
- READ32(lock->lk_is_new);
+ lock->lk_reclaim = be32_to_cpup(p++);
+ p = xdr_decode_hyper(p, &lock->lk_offset);
+ p = xdr_decode_hyper(p, &lock->lk_length);
+ lock->lk_is_new = be32_to_cpup(p++);
if (lock->lk_is_new) {
READ_BUF(4);
- READ32(lock->lk_new_open_seqid);
+ lock->lk_new_open_seqid = be32_to_cpup(p++);
status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid);
if (status)
return status;
READ_BUF(8 + sizeof(clientid_t));
- READ32(lock->lk_new_lock_seqid);
+ lock->lk_new_lock_seqid = be32_to_cpup(p++);
COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t));
- READ32(lock->lk_new_owner.len);
+ lock->lk_new_owner.len = be32_to_cpup(p++);
READ_BUF(lock->lk_new_owner.len);
READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len);
} else {
@@ -684,7 +702,7 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
if (status)
return status;
READ_BUF(4);
- READ32(lock->lk_old_lock_seqid);
+ lock->lk_old_lock_seqid = be32_to_cpup(p++);
}
DECODE_TAIL;
@@ -696,13 +714,13 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
DECODE_HEAD;
READ_BUF(32);
- READ32(lockt->lt_type);
+ lockt->lt_type = be32_to_cpup(p++);
if((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT))
goto xdr_error;
- READ64(lockt->lt_offset);
- READ64(lockt->lt_length);
+ p = xdr_decode_hyper(p, &lockt->lt_offset);
+ p = xdr_decode_hyper(p, &lockt->lt_length);
COPYMEM(&lockt->lt_clientid, 8);
- READ32(lockt->lt_owner.len);
+ lockt->lt_owner.len = be32_to_cpup(p++);
READ_BUF(lockt->lt_owner.len);
READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
@@ -715,16 +733,16 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
DECODE_HEAD;
READ_BUF(8);
- READ32(locku->lu_type);
+ locku->lu_type = be32_to_cpup(p++);
if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT))
goto xdr_error;
- READ32(locku->lu_seqid);
+ locku->lu_seqid = be32_to_cpup(p++);
status = nfsd4_decode_stateid(argp, &locku->lu_stateid);
if (status)
return status;
READ_BUF(16);
- READ64(locku->lu_offset);
- READ64(locku->lu_length);
+ p = xdr_decode_hyper(p, &locku->lu_offset);
+ p = xdr_decode_hyper(p, &locku->lu_length);
DECODE_TAIL;
}
@@ -735,7 +753,7 @@ nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup
DECODE_HEAD;
READ_BUF(4);
- READ32(lookup->lo_len);
+ lookup->lo_len = be32_to_cpup(p++);
READ_BUF(lookup->lo_len);
SAVEMEM(lookup->lo_name, lookup->lo_len);
if ((status = check_filename(lookup->lo_name, lookup->lo_len)))
@@ -750,7 +768,7 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *sh
u32 w;
READ_BUF(4);
- READ32(w);
+ w = be32_to_cpup(p++);
*share_access = w & NFS4_SHARE_ACCESS_MASK;
*deleg_want = w & NFS4_SHARE_WANT_MASK;
if (deleg_when)
@@ -802,7 +820,7 @@ static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x)
__be32 *p;
READ_BUF(4);
- READ32(*x);
+ *x = be32_to_cpup(p++);
/* Note: unlinke access bits, deny bits may be zero. */
if (*x & ~NFS4_SHARE_DENY_BOTH)
return nfserr_bad_xdr;
@@ -816,7 +834,7 @@ static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_ne
__be32 *p;
READ_BUF(4);
- READ32(o->len);
+ o->len = be32_to_cpup(p++);
if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT)
return nfserr_bad_xdr;
@@ -841,7 +859,7 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
open->op_xdr_error = 0;
/* seqid, share_access, share_deny, clientid, ownerlen */
READ_BUF(4);
- READ32(open->op_seqid);
+ open->op_seqid = be32_to_cpup(p++);
/* decode, yet ignore deleg_when until supported */
status = nfsd4_decode_share_access(argp, &open->op_share_access,
&open->op_deleg_want, &dummy);
@@ -856,13 +874,13 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
if (status)
goto xdr_error;
READ_BUF(4);
- READ32(open->op_create);
+ open->op_create = be32_to_cpup(p++);
switch (open->op_create) {
case NFS4_OPEN_NOCREATE:
break;
case NFS4_OPEN_CREATE:
READ_BUF(4);
- READ32(open->op_createmode);
+ open->op_createmode = be32_to_cpup(p++);
switch (open->op_createmode) {
case NFS4_CREATE_UNCHECKED:
case NFS4_CREATE_GUARDED:
@@ -895,12 +913,12 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
/* open_claim */
READ_BUF(4);
- READ32(open->op_claim_type);
+ open->op_claim_type = be32_to_cpup(p++);
switch (open->op_claim_type) {
case NFS4_OPEN_CLAIM_NULL:
case NFS4_OPEN_CLAIM_DELEGATE_PREV:
READ_BUF(4);
- READ32(open->op_fname.len);
+ open->op_fname.len = be32_to_cpup(p++);
READ_BUF(open->op_fname.len);
SAVEMEM(open->op_fname.data, open->op_fname.len);
if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
@@ -908,14 +926,14 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
break;
case NFS4_OPEN_CLAIM_PREVIOUS:
READ_BUF(4);
- READ32(open->op_delegate_type);
+ open->op_delegate_type = be32_to_cpup(p++);
break;
case NFS4_OPEN_CLAIM_DELEGATE_CUR:
status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
if (status)
return status;
READ_BUF(4);
- READ32(open->op_fname.len);
+ open->op_fname.len = be32_to_cpup(p++);
READ_BUF(open->op_fname.len);
SAVEMEM(open->op_fname.data, open->op_fname.len);
if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
@@ -945,13 +963,16 @@ static __be32
nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf)
{
DECODE_HEAD;
-
+
+ if (argp->minorversion >= 1)
+ return nfserr_notsupp;
+
status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid);
if (status)
return status;
READ_BUF(4);
- READ32(open_conf->oc_seqid);
-
+ open_conf->oc_seqid = be32_to_cpup(p++);
+
DECODE_TAIL;
}
@@ -964,7 +985,7 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d
if (status)
return status;
READ_BUF(4);
- READ32(open_down->od_seqid);
+ open_down->od_seqid = be32_to_cpup(p++);
status = nfsd4_decode_share_access(argp, &open_down->od_share_access,
&open_down->od_deleg_want, NULL);
if (status)
@@ -981,7 +1002,7 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
DECODE_HEAD;
READ_BUF(4);
- READ32(putfh->pf_fhlen);
+ putfh->pf_fhlen = be32_to_cpup(p++);
if (putfh->pf_fhlen > NFS4_FHSIZE)
goto xdr_error;
READ_BUF(putfh->pf_fhlen);
@@ -991,6 +1012,14 @@ nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
}
static __be32
+nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p)
+{
+ if (argp->minorversion == 0)
+ return nfs_ok;
+ return nfserr_notsupp;
+}
+
+static __be32
nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
{
DECODE_HEAD;
@@ -999,8 +1028,8 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
if (status)
return status;
READ_BUF(12);
- READ64(read->rd_offset);
- READ32(read->rd_length);
+ p = xdr_decode_hyper(p, &read->rd_offset);
+ read->rd_length = be32_to_cpup(p++);
DECODE_TAIL;
}
@@ -1011,10 +1040,10 @@ nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *read
DECODE_HEAD;
READ_BUF(24);
- READ64(readdir->rd_cookie);
+ p = xdr_decode_hyper(p, &readdir->rd_cookie);
COPYMEM(readdir->rd_verf.data, sizeof(readdir->rd_verf.data));
- READ32(readdir->rd_dircount); /* just in case you needed a useless field... */
- READ32(readdir->rd_maxcount);
+ readdir->rd_dircount = be32_to_cpup(p++);
+ readdir->rd_maxcount = be32_to_cpup(p++);
if ((status = nfsd4_decode_bitmap(argp, readdir->rd_bmval)))
goto out;
@@ -1027,7 +1056,7 @@ nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove
DECODE_HEAD;
READ_BUF(4);
- READ32(remove->rm_namelen);
+ remove->rm_namelen = be32_to_cpup(p++);
READ_BUF(remove->rm_namelen);
SAVEMEM(remove->rm_name, remove->rm_namelen);
if ((status = check_filename(remove->rm_name, remove->rm_namelen)))
@@ -1042,10 +1071,10 @@ nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename
DECODE_HEAD;
READ_BUF(4);
- READ32(rename->rn_snamelen);
+ rename->rn_snamelen = be32_to_cpup(p++);
READ_BUF(rename->rn_snamelen + 4);
SAVEMEM(rename->rn_sname, rename->rn_snamelen);
- READ32(rename->rn_tnamelen);
+ rename->rn_tnamelen = be32_to_cpup(p++);
READ_BUF(rename->rn_tnamelen);
SAVEMEM(rename->rn_tname, rename->rn_tnamelen);
if ((status = check_filename(rename->rn_sname, rename->rn_snamelen)))
@@ -1061,6 +1090,9 @@ nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid)
{
DECODE_HEAD;
+ if (argp->minorversion >= 1)
+ return nfserr_notsupp;
+
READ_BUF(sizeof(clientid_t));
COPYMEM(clientid, sizeof(clientid_t));
@@ -1074,7 +1106,7 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
DECODE_HEAD;
READ_BUF(4);
- READ32(secinfo->si_namelen);
+ secinfo->si_namelen = be32_to_cpup(p++);
READ_BUF(secinfo->si_namelen);
SAVEMEM(secinfo->si_name, secinfo->si_namelen);
status = check_filename(secinfo->si_name, secinfo->si_namelen);
@@ -1090,7 +1122,7 @@ nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
DECODE_HEAD;
READ_BUF(4);
- READ32(sin->sin_style);
+ sin->sin_style = be32_to_cpup(p++);
DECODE_TAIL;
}
@@ -1111,6 +1143,9 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
{
DECODE_HEAD;
+ if (argp->minorversion >= 1)
+ return nfserr_notsupp;
+
READ_BUF(NFS4_VERIFIER_SIZE);
COPYMEM(setclientid->se_verf.data, NFS4_VERIFIER_SIZE);
@@ -1118,16 +1153,16 @@ nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclient
if (status)
return nfserr_bad_xdr;
READ_BUF(8);
- READ32(setclientid->se_callback_prog);
- READ32(setclientid->se_callback_netid_len);
+ setclientid->se_callback_prog = be32_to_cpup(p++);
+ setclientid->se_callback_netid_len = be32_to_cpup(p++);
READ_BUF(setclientid->se_callback_netid_len + 4);
SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len);
- READ32(setclientid->se_callback_addr_len);
+ setclientid->se_callback_addr_len = be32_to_cpup(p++);
READ_BUF(setclientid->se_callback_addr_len + 4);
SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len);
- READ32(setclientid->se_callback_ident);
+ setclientid->se_callback_ident = be32_to_cpup(p++);
DECODE_TAIL;
}
@@ -1137,6 +1172,9 @@ nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_s
{
DECODE_HEAD;
+ if (argp->minorversion >= 1)
+ return nfserr_notsupp;
+
READ_BUF(8 + NFS4_VERIFIER_SIZE);
COPYMEM(&scd_c->sc_clientid, 8);
COPYMEM(&scd_c->sc_confirm, NFS4_VERIFIER_SIZE);
@@ -1157,7 +1195,7 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify
* nfsd4_proc_verify */
READ_BUF(4);
- READ32(verify->ve_attrlen);
+ verify->ve_attrlen = be32_to_cpup(p++);
READ_BUF(verify->ve_attrlen);
SAVEMEM(verify->ve_attrval, verify->ve_attrlen);
@@ -1175,11 +1213,11 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
if (status)
return status;
READ_BUF(16);
- READ64(write->wr_offset);
- READ32(write->wr_stable_how);
+ p = xdr_decode_hyper(p, &write->wr_offset);
+ write->wr_stable_how = be32_to_cpup(p++);
if (write->wr_stable_how > 2)
goto xdr_error;
- READ32(write->wr_buflen);
+ write->wr_buflen = be32_to_cpup(p++);
/* Sorry .. no magic macros for this.. *
* READ_BUF(write->wr_buflen);
@@ -1193,7 +1231,6 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
}
write->wr_head.iov_base = p;
write->wr_head.iov_len = avail;
- WARN_ON(avail != (XDR_QUADLEN(avail) << 2));
write->wr_pagelist = argp->pagelist;
len = XDR_QUADLEN(write->wr_buflen) << 2;
@@ -1208,6 +1245,7 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
len -= pages * PAGE_SIZE;
argp->p = (__be32 *)page_address(argp->pagelist[0]);
+ argp->pagelist++;
argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE);
}
argp->p += XDR_QUADLEN(len);
@@ -1220,9 +1258,12 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
{
DECODE_HEAD;
+ if (argp->minorversion >= 1)
+ return nfserr_notsupp;
+
READ_BUF(12);
COPYMEM(&rlockowner->rl_clientid, sizeof(clientid_t));
- READ32(rlockowner->rl_owner.len);
+ rlockowner->rl_owner.len = be32_to_cpup(p++);
READ_BUF(rlockowner->rl_owner.len);
READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len);
@@ -1246,63 +1287,63 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
return nfserr_bad_xdr;
READ_BUF(4);
- READ32(exid->flags);
+ exid->flags = be32_to_cpup(p++);
/* Ignore state_protect4_a */
READ_BUF(4);
- READ32(exid->spa_how);
+ exid->spa_how = be32_to_cpup(p++);
switch (exid->spa_how) {
case SP4_NONE:
break;
case SP4_MACH_CRED:
/* spo_must_enforce */
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy * 4);
p += dummy;
/* spo_must_allow */
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy * 4);
p += dummy;
break;
case SP4_SSV:
/* ssp_ops */
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy * 4);
p += dummy;
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy * 4);
p += dummy;
/* ssp_hash_algs<> */
READ_BUF(4);
- READ32(tmp);
+ tmp = be32_to_cpup(p++);
while (tmp--) {
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy);
p += XDR_QUADLEN(dummy);
}
/* ssp_encr_algs<> */
READ_BUF(4);
- READ32(tmp);
+ tmp = be32_to_cpup(p++);
while (tmp--) {
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy);
p += XDR_QUADLEN(dummy);
}
/* ssp_window and ssp_num_gss_handles */
READ_BUF(8);
- READ32(dummy);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
+ dummy = be32_to_cpup(p++);
break;
default:
goto xdr_error;
@@ -1310,7 +1351,7 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
/* Ignore Implementation ID */
READ_BUF(4); /* nfs_impl_id4 array length */
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
if (dummy > 1)
goto xdr_error;
@@ -1318,13 +1359,13 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
if (dummy == 1) {
/* nii_domain */
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy);
p += XDR_QUADLEN(dummy);
/* nii_name */
READ_BUF(4);
- READ32(dummy);
+ dummy = be32_to_cpup(p++);
READ_BUF(dummy);
p += XDR_QUADLEN(dummy);
@@ -1344,21 +1385,21 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
READ_BUF(16);
COPYMEM(&sess->clientid, 8);
- READ32(sess->seqid);
- READ32(sess->flags);
+ sess->seqid = be32_to_cpup(p++);
+ sess->flags = be32_to_cpup(p++);
/* Fore channel attrs */
READ_BUF(28);
- READ32(dummy); /* headerpadsz is always 0 */
- READ32(sess->fore_channel.maxreq_sz);
- READ32(sess->fore_channel.maxresp_sz);
- READ32(sess->fore_channel.maxresp_cached);
- READ32(sess->fore_channel.maxops);
- READ32(sess->fore_channel.maxreqs);
- READ32(sess->fore_channel.nr_rdma_attrs);
+ dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */
+ sess->fore_channel.maxreq_sz = be32_to_cpup(p++);
+ sess->fore_channel.maxresp_sz = be32_to_cpup(p++);
+ sess->fore_channel.maxresp_cached = be32_to_cpup(p++);
+ sess->fore_channel.maxops = be32_to_cpup(p++);
+ sess->fore_channel.maxreqs = be32_to_cpup(p++);
+ sess->fore_channel.nr_rdma_attrs = be32_to_cpup(p++);
if (sess->fore_channel.nr_rdma_attrs == 1) {
READ_BUF(4);
- READ32(sess->fore_channel.rdma_attrs);
+ sess->fore_channel.rdma_attrs = be32_to_cpup(p++);
} else if (sess->fore_channel.nr_rdma_attrs > 1) {
dprintk("Too many fore channel attr bitmaps!\n");
goto xdr_error;
@@ -1366,23 +1407,23 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
/* Back channel attrs */
READ_BUF(28);
- READ32(dummy); /* headerpadsz is always 0 */
- READ32(sess->back_channel.maxreq_sz);
- READ32(sess->back_channel.maxresp_sz);
- READ32(sess->back_channel.maxresp_cached);
- READ32(sess->back_channel.maxops);
- READ32(sess->back_channel.maxreqs);
- READ32(sess->back_channel.nr_rdma_attrs);
+ dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */
+ sess->back_channel.maxreq_sz = be32_to_cpup(p++);
+ sess->back_channel.maxresp_sz = be32_to_cpup(p++);
+ sess->back_channel.maxresp_cached = be32_to_cpup(p++);
+ sess->back_channel.maxops = be32_to_cpup(p++);
+ sess->back_channel.maxreqs = be32_to_cpup(p++);
+ sess->back_channel.nr_rdma_attrs = be32_to_cpup(p++);
if (sess->back_channel.nr_rdma_attrs == 1) {
READ_BUF(4);
- READ32(sess->back_channel.rdma_attrs);
+ sess->back_channel.rdma_attrs = be32_to_cpup(p++);
} else if (sess->back_channel.nr_rdma_attrs > 1) {
dprintk("Too many back channel attr bitmaps!\n");
goto xdr_error;
}
READ_BUF(4);
- READ32(sess->callback_prog);
+ sess->callback_prog = be32_to_cpup(p++);
nfsd4_decode_cb_sec(argp, &sess->cb_sec);
DECODE_TAIL;
}
@@ -1405,7 +1446,7 @@ nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp,
DECODE_HEAD;
READ_BUF(sizeof(stateid_t));
- READ32(free_stateid->fr_stateid.si_generation);
+ free_stateid->fr_stateid.si_generation = be32_to_cpup(p++);
COPYMEM(&free_stateid->fr_stateid.si_opaque, sizeof(stateid_opaque_t));
DECODE_TAIL;
@@ -1419,10 +1460,10 @@ nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
- READ32(seq->seqid);
- READ32(seq->slotid);
- READ32(seq->maxslots);
- READ32(seq->cachethis);
+ seq->seqid = be32_to_cpup(p++);
+ seq->slotid = be32_to_cpup(p++);
+ seq->maxslots = be32_to_cpup(p++);
+ seq->cachethis = be32_to_cpup(p++);
DECODE_TAIL;
}
@@ -1479,7 +1520,7 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
DECODE_HEAD;
READ_BUF(4);
- READ32(rc->rca_one_fs);
+ rc->rca_one_fs = be32_to_cpup(p++);
DECODE_TAIL;
}
@@ -1519,7 +1560,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
[OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm,
[OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade,
[OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh,
- [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_noop,
+ [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_putpubfh,
[OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop,
[OP_READ] = (nfsd4_dec)nfsd4_decode_read,
[OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir,
@@ -1536,46 +1577,6 @@ static nfsd4_dec nfsd4_dec_ops[] = {
[OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify,
[OP_WRITE] = (nfsd4_dec)nfsd4_decode_write,
[OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner,
-};
-
-static nfsd4_dec nfsd41_dec_ops[] = {
- [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access,
- [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close,
- [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit,
- [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create,
- [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn,
- [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr,
- [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop,
- [OP_LINK] = (nfsd4_dec)nfsd4_decode_link,
- [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock,
- [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt,
- [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku,
- [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup,
- [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop,
- [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify,
- [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open,
- [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade,
- [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh,
- [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop,
- [OP_READ] = (nfsd4_dec)nfsd4_decode_read,
- [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir,
- [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop,
- [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove,
- [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename,
- [OP_RENEW] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop,
- [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop,
- [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo,
- [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr,
- [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_SETCLIENTID_CONFIRM]= (nfsd4_dec)nfsd4_decode_notsupp,
- [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify,
- [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write,
- [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_notsupp,
/* new operations for NFSv4.1 */
[OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_backchannel_ctl,
@@ -1599,32 +1600,39 @@ static nfsd4_dec nfsd41_dec_ops[] = {
[OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete,
};
-struct nfsd4_minorversion_ops {
- nfsd4_dec *decoders;
- int nops;
-};
-
-static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
- [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
- [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
- [2] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
-};
+static inline bool
+nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op)
+{
+ if (op->opnum < FIRST_NFS4_OP)
+ return false;
+ else if (argp->minorversion == 0 && op->opnum > LAST_NFS40_OP)
+ return false;
+ else if (argp->minorversion == 1 && op->opnum > LAST_NFS41_OP)
+ return false;
+ else if (argp->minorversion == 2 && op->opnum > LAST_NFS42_OP)
+ return false;
+ return true;
+}
static __be32
nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
{
DECODE_HEAD;
struct nfsd4_op *op;
- struct nfsd4_minorversion_ops *ops;
bool cachethis = false;
+ int auth_slack= argp->rqstp->rq_auth_slack;
+ int max_reply = auth_slack + 8; /* opcnt, status */
+ int readcount = 0;
+ int readbytes = 0;
int i;
READ_BUF(4);
- READ32(argp->taglen);
+ argp->taglen = be32_to_cpup(p++);
READ_BUF(argp->taglen + 8);
SAVEMEM(argp->tag, argp->taglen);
- READ32(argp->minorversion);
- READ32(argp->opcnt);
+ argp->minorversion = be32_to_cpup(p++);
+ argp->opcnt = be32_to_cpup(p++);
+ max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2);
if (argp->taglen > NFSD4_MAX_TAGLEN)
goto xdr_error;
@@ -1640,110 +1648,98 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
}
}
- if (argp->minorversion >= ARRAY_SIZE(nfsd4_minorversion))
+ if (argp->minorversion > NFSD_SUPPORTED_MINOR_VERSION)
argp->opcnt = 0;
- ops = &nfsd4_minorversion[argp->minorversion];
for (i = 0; i < argp->opcnt; i++) {
op = &argp->ops[i];
op->replay = NULL;
READ_BUF(4);
- READ32(op->opnum);
+ op->opnum = be32_to_cpup(p++);
- if (op->opnum >= FIRST_NFS4_OP && op->opnum <= LAST_NFS4_OP)
- op->status = ops->decoders[op->opnum](argp, &op->u);
+ if (nfsd4_opnum_in_range(argp, op))
+ op->status = nfsd4_dec_ops[op->opnum](argp, &op->u);
else {
op->opnum = OP_ILLEGAL;
op->status = nfserr_op_illegal;
}
-
- if (op->status) {
- argp->opcnt = i+1;
- break;
- }
/*
* We'll try to cache the result in the DRC if any one
* op in the compound wants to be cached:
*/
cachethis |= nfsd4_cache_this_op(op);
+
+ if (op->opnum == OP_READ) {
+ readcount++;
+ readbytes += nfsd4_max_reply(argp->rqstp, op);
+ } else
+ max_reply += nfsd4_max_reply(argp->rqstp, op);
+
+ if (op->status) {
+ argp->opcnt = i+1;
+ break;
+ }
}
/* Sessions make the DRC unnecessary: */
if (argp->minorversion)
cachethis = false;
+ svc_reserve(argp->rqstp, max_reply + readbytes);
argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE;
- DECODE_TAIL;
-}
-
-#define WRITE32(n) *p++ = htonl(n)
-#define WRITE64(n) do { \
- *p++ = htonl((u32)((n) >> 32)); \
- *p++ = htonl((u32)(n)); \
-} while (0)
-#define WRITEMEM(ptr,nbytes) do { if (nbytes > 0) { \
- *(p + XDR_QUADLEN(nbytes) -1) = 0; \
- memcpy(p, ptr, nbytes); \
- p += XDR_QUADLEN(nbytes); \
-}} while (0)
+ if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack)
+ argp->rqstp->rq_splice_ok = false;
-static void write32(__be32 **p, u32 n)
-{
- *(*p)++ = htonl(n);
-}
-
-static void write64(__be32 **p, u64 n)
-{
- write32(p, (n >> 32));
- write32(p, (u32)n);
+ DECODE_TAIL;
}
-static void write_change(__be32 **p, struct kstat *stat, struct inode *inode)
+static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode)
{
if (IS_I_VERSION(inode)) {
- write64(p, inode->i_version);
+ p = xdr_encode_hyper(p, inode->i_version);
} else {
- write32(p, stat->ctime.tv_sec);
- write32(p, stat->ctime.tv_nsec);
+ *p++ = cpu_to_be32(stat->ctime.tv_sec);
+ *p++ = cpu_to_be32(stat->ctime.tv_nsec);
}
+ return p;
}
-static void write_cinfo(__be32 **p, struct nfsd4_change_info *c)
+static __be32 *encode_cinfo(__be32 *p, struct nfsd4_change_info *c)
{
- write32(p, c->atomic);
+ *p++ = cpu_to_be32(c->atomic);
if (c->change_supported) {
- write64(p, c->before_change);
- write64(p, c->after_change);
+ p = xdr_encode_hyper(p, c->before_change);
+ p = xdr_encode_hyper(p, c->after_change);
} else {
- write32(p, c->before_ctime_sec);
- write32(p, c->before_ctime_nsec);
- write32(p, c->after_ctime_sec);
- write32(p, c->after_ctime_nsec);
+ *p++ = cpu_to_be32(c->before_ctime_sec);
+ *p++ = cpu_to_be32(c->before_ctime_nsec);
+ *p++ = cpu_to_be32(c->after_ctime_sec);
+ *p++ = cpu_to_be32(c->after_ctime_nsec);
}
+ return p;
}
-#define RESERVE_SPACE(nbytes) do { \
- p = resp->p; \
- BUG_ON(p + XDR_QUADLEN(nbytes) > resp->end); \
-} while (0)
-#define ADJUST_ARGS() resp->p = p
-
/* Encode as an array of strings the string given with components
* separated @sep, escaped with esc_enter and esc_exit.
*/
-static __be32 nfsd4_encode_components_esc(char sep, char *components,
- __be32 **pp, int *buflen,
- char esc_enter, char esc_exit)
+static __be32 nfsd4_encode_components_esc(struct xdr_stream *xdr, char sep,
+ char *components, char esc_enter,
+ char esc_exit)
{
- __be32 *p = *pp;
- __be32 *countp = p;
+ __be32 *p;
+ __be32 pathlen;
+ int pathlen_offset;
int strlen, count=0;
char *str, *end, *next;
dprintk("nfsd4_encode_components(%s)\n", components);
- if ((*buflen -= 4) < 0)
+
+ pathlen_offset = xdr->buf->len;
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
return nfserr_resource;
- WRITE32(0); /* We will fill this in with @count later */
+ p++; /* We will fill this in with @count later */
+
end = str = components;
while (*end) {
bool found_esc = false;
@@ -1765,59 +1761,57 @@ static __be32 nfsd4_encode_components_esc(char sep, char *components,
strlen = end - str;
if (strlen) {
- if ((*buflen -= ((XDR_QUADLEN(strlen) << 2) + 4)) < 0)
+ p = xdr_reserve_space(xdr, strlen + 4);
+ if (!p)
return nfserr_resource;
- WRITE32(strlen);
- WRITEMEM(str, strlen);
+ p = xdr_encode_opaque(p, str, strlen);
count++;
}
else
end++;
str = end;
}
- *pp = p;
- p = countp;
- WRITE32(count);
+ pathlen = htonl(xdr->buf->len - pathlen_offset);
+ write_bytes_to_xdr_buf(xdr->buf, pathlen_offset, &pathlen, 4);
return 0;
}
/* Encode as an array of strings the string given with components
* separated @sep.
*/
-static __be32 nfsd4_encode_components(char sep, char *components,
- __be32 **pp, int *buflen)
+static __be32 nfsd4_encode_components(struct xdr_stream *xdr, char sep,
+ char *components)
{
- return nfsd4_encode_components_esc(sep, components, pp, buflen, 0, 0);
+ return nfsd4_encode_components_esc(xdr, sep, components, 0, 0);
}
/*
* encode a location element of a fs_locations structure
*/
-static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
- __be32 **pp, int *buflen)
+static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr,
+ struct nfsd4_fs_location *location)
{
__be32 status;
- __be32 *p = *pp;
- status = nfsd4_encode_components_esc(':', location->hosts, &p, buflen,
+ status = nfsd4_encode_components_esc(xdr, ':', location->hosts,
'[', ']');
if (status)
return status;
- status = nfsd4_encode_components('/', location->path, &p, buflen);
+ status = nfsd4_encode_components(xdr, '/', location->path);
if (status)
return status;
- *pp = p;
return 0;
}
/*
* Encode a path in RFC3530 'pathname4' format
*/
-static __be32 nfsd4_encode_path(const struct path *root,
- const struct path *path, __be32 **pp, int *buflen)
+static __be32 nfsd4_encode_path(struct xdr_stream *xdr,
+ const struct path *root,
+ const struct path *path)
{
struct path cur = *path;
- __be32 *p = *pp;
+ __be32 *p;
struct dentry **components = NULL;
unsigned int ncomponents = 0;
__be32 err = nfserr_jukebox;
@@ -1848,11 +1842,11 @@ static __be32 nfsd4_encode_path(const struct path *root,
components[ncomponents++] = cur.dentry;
cur.dentry = dget_parent(cur.dentry);
}
-
- *buflen -= 4;
- if (*buflen < 0)
+ err = nfserr_resource;
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_free;
- WRITE32(ncomponents);
+ *p++ = cpu_to_be32(ncomponents);
while (ncomponents) {
struct dentry *dentry = components[ncomponents - 1];
@@ -1860,20 +1854,18 @@ static __be32 nfsd4_encode_path(const struct path *root,
spin_lock(&dentry->d_lock);
len = dentry->d_name.len;
- *buflen -= 4 + (XDR_QUADLEN(len) << 2);
- if (*buflen < 0) {
+ p = xdr_reserve_space(xdr, len + 4);
+ if (!p) {
spin_unlock(&dentry->d_lock);
goto out_free;
}
- WRITE32(len);
- WRITEMEM(dentry->d_name.name, len);
+ p = xdr_encode_opaque(p, dentry->d_name.name, len);
dprintk("/%s", dentry->d_name.name);
spin_unlock(&dentry->d_lock);
dput(dentry);
ncomponents--;
}
- *pp = p;
err = 0;
out_free:
dprintk(")\n");
@@ -1884,8 +1876,8 @@ out_free:
return err;
}
-static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp,
- const struct path *path, __be32 **pp, int *buflen)
+static __be32 nfsd4_encode_fsloc_fsroot(struct xdr_stream *xdr,
+ struct svc_rqst *rqstp, const struct path *path)
{
struct svc_export *exp_ps;
__be32 res;
@@ -1893,7 +1885,7 @@ static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp,
exp_ps = rqst_find_fsidzero_export(rqstp);
if (IS_ERR(exp_ps))
return nfserrno(PTR_ERR(exp_ps));
- res = nfsd4_encode_path(&exp_ps->ex_path, path, pp, buflen);
+ res = nfsd4_encode_path(xdr, &exp_ps->ex_path, path);
exp_put(exp_ps);
return res;
}
@@ -1901,28 +1893,26 @@ static __be32 nfsd4_encode_fsloc_fsroot(struct svc_rqst *rqstp,
/*
* encode a fs_locations structure
*/
-static __be32 nfsd4_encode_fs_locations(struct svc_rqst *rqstp,
- struct svc_export *exp,
- __be32 **pp, int *buflen)
+static __be32 nfsd4_encode_fs_locations(struct xdr_stream *xdr,
+ struct svc_rqst *rqstp, struct svc_export *exp)
{
__be32 status;
int i;
- __be32 *p = *pp;
+ __be32 *p;
struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs;
- status = nfsd4_encode_fsloc_fsroot(rqstp, &exp->ex_path, &p, buflen);
+ status = nfsd4_encode_fsloc_fsroot(xdr, rqstp, &exp->ex_path);
if (status)
return status;
- if ((*buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
return nfserr_resource;
- WRITE32(fslocs->locations_count);
+ *p++ = cpu_to_be32(fslocs->locations_count);
for (i=0; i<fslocs->locations_count; i++) {
- status = nfsd4_encode_fs_location4(&fslocs->locations[i],
- &p, buflen);
+ status = nfsd4_encode_fs_location4(xdr, &fslocs->locations[i]);
if (status)
return status;
}
- *pp = p;
return 0;
}
@@ -1940,56 +1930,16 @@ static u32 nfs4_file_type(umode_t mode)
};
}
-static __be32
-nfsd4_encode_name(struct svc_rqst *rqstp, int whotype, kuid_t uid, kgid_t gid,
- __be32 **p, int *buflen)
-{
- int status;
-
- if (*buflen < (XDR_QUADLEN(IDMAP_NAMESZ) << 2) + 4)
- return nfserr_resource;
- if (whotype != NFS4_ACL_WHO_NAMED)
- status = nfs4_acl_write_who(whotype, (u8 *)(*p + 1));
- else if (gid_valid(gid))
- status = nfsd_map_gid_to_name(rqstp, gid, (u8 *)(*p + 1));
- else
- status = nfsd_map_uid_to_name(rqstp, uid, (u8 *)(*p + 1));
- if (status < 0)
- return nfserrno(status);
- *p = xdr_encode_opaque(*p, NULL, status);
- *buflen -= (XDR_QUADLEN(status) << 2) + 4;
- BUG_ON(*buflen < 0);
- return 0;
-}
-
-static inline __be32
-nfsd4_encode_user(struct svc_rqst *rqstp, kuid_t user, __be32 **p, int *buflen)
-{
- return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, user, INVALID_GID,
- p, buflen);
-}
-
static inline __be32
-nfsd4_encode_group(struct svc_rqst *rqstp, kgid_t group, __be32 **p, int *buflen)
+nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+ struct nfs4_ace *ace)
{
- return nfsd4_encode_name(rqstp, NFS4_ACL_WHO_NAMED, INVALID_UID, group,
- p, buflen);
-}
-
-static inline __be32
-nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace,
- __be32 **p, int *buflen)
-{
- kuid_t uid = INVALID_UID;
- kgid_t gid = INVALID_GID;
-
- if (ace->whotype == NFS4_ACL_WHO_NAMED) {
- if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
- gid = ace->who_gid;
- else
- uid = ace->who_uid;
- }
- return nfsd4_encode_name(rqstp, ace->whotype, uid, gid, p, buflen);
+ if (ace->whotype != NFS4_ACL_WHO_NAMED)
+ return nfs4_acl_write_who(xdr, ace->whotype);
+ else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
+ return nfsd4_encode_group(xdr, rqstp, ace->who_gid);
+ else
+ return nfsd4_encode_user(xdr, rqstp, ace->who_uid);
}
#define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \
@@ -1998,31 +1948,28 @@ nfsd4_encode_aclname(struct svc_rqst *rqstp, struct nfs4_ace *ace,
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
static inline __be32
-nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen)
+nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+ void *context, int len)
{
- __be32 *p = *pp;
+ __be32 *p;
- if (*buflen < ((XDR_QUADLEN(len) << 2) + 4 + 4 + 4))
+ p = xdr_reserve_space(xdr, len + 4 + 4 + 4);
+ if (!p)
return nfserr_resource;
/*
* For now we use a 0 here to indicate the null translation; in
* the future we may place a call to translation code here.
*/
- if ((*buflen -= 8) < 0)
- return nfserr_resource;
-
- WRITE32(0); /* lfs */
- WRITE32(0); /* pi */
+ *p++ = cpu_to_be32(0); /* lfs */
+ *p++ = cpu_to_be32(0); /* pi */
p = xdr_encode_opaque(p, context, len);
- *buflen -= (XDR_QUADLEN(len) << 2) + 4;
-
- *pp = p;
return 0;
}
#else
static inline __be32
-nfsd4_encode_security_label(struct svc_rqst *rqstp, void *context, int len, __be32 **pp, int *buflen)
+nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp,
+ void *context, int len)
{ return 0; }
#endif
@@ -2061,26 +2008,26 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
/*
* Note: @fhp can be NULL; in this case, we might have to compose the filehandle
* ourselves.
- *
- * countp is the buffer size in _words_
*/
-__be32
-nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
- struct dentry *dentry, __be32 **buffer, int count, u32 *bmval,
+static __be32
+nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
+ struct svc_export *exp,
+ struct dentry *dentry, u32 *bmval,
struct svc_rqst *rqstp, int ignore_crossmnt)
{
u32 bmval0 = bmval[0];
u32 bmval1 = bmval[1];
u32 bmval2 = bmval[2];
struct kstat stat;
- struct svc_fh tempfh;
+ struct svc_fh *tempfh = NULL;
struct kstatfs statfs;
- int buflen = count << 2;
- __be32 *attrlenp;
+ __be32 *p;
+ int starting_len = xdr->buf->len;
+ int attrlen_offset;
+ __be32 attrlen;
u32 dummy;
u64 dummy64;
u32 rdattr_err = 0;
- __be32 *p = *buffer;
__be32 status;
int err;
int aclsupport = 0;
@@ -2111,8 +2058,8 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
err = vfs_getattr(&path, &stat);
if (err)
goto out_nfserr;
- if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL |
- FATTR4_WORD0_MAXNAME)) ||
+ if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
+ FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) ||
(bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
FATTR4_WORD1_SPACE_TOTAL))) {
err = vfs_statfs(&path, &statfs);
@@ -2120,11 +2067,15 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
goto out_nfserr;
}
if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) {
- fh_init(&tempfh, NFS4_FHSIZE);
- status = fh_compose(&tempfh, exp, dentry, NULL);
+ tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
+ status = nfserr_jukebox;
+ if (!tempfh)
+ goto out;
+ fh_init(tempfh, NFS4_FHSIZE);
+ status = fh_compose(tempfh, exp, dentry, NULL);
if (status)
goto out;
- fhp = &tempfh;
+ fhp = tempfh;
}
if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT
| FATTR4_WORD0_SUPPORTED_ATTRS)) {
@@ -2157,25 +2108,33 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
if (bmval2) {
- if ((buflen -= 16) < 0)
+ p = xdr_reserve_space(xdr, 16);
+ if (!p)
goto out_resource;
- WRITE32(3);
- WRITE32(bmval0);
- WRITE32(bmval1);
- WRITE32(bmval2);
+ *p++ = cpu_to_be32(3);
+ *p++ = cpu_to_be32(bmval0);
+ *p++ = cpu_to_be32(bmval1);
+ *p++ = cpu_to_be32(bmval2);
} else if (bmval1) {
- if ((buflen -= 12) < 0)
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
goto out_resource;
- WRITE32(2);
- WRITE32(bmval0);
- WRITE32(bmval1);
+ *p++ = cpu_to_be32(2);
+ *p++ = cpu_to_be32(bmval0);
+ *p++ = cpu_to_be32(bmval1);
} else {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- WRITE32(1);
- WRITE32(bmval0);
+ *p++ = cpu_to_be32(1);
+ *p++ = cpu_to_be32(bmval0);
}
- attrlenp = p++; /* to be backfilled later */
+
+ attrlen_offset = xdr->buf->len;
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ p++; /* to be backfilled later */
if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
u32 word0 = nfsd_suppattrs0(minorversion);
@@ -2187,302 +2146,343 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
if (!contextsupport)
word2 &= ~FATTR4_WORD2_SECURITY_LABEL;
if (!word2) {
- if ((buflen -= 12) < 0)
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
goto out_resource;
- WRITE32(2);
- WRITE32(word0);
- WRITE32(word1);
+ *p++ = cpu_to_be32(2);
+ *p++ = cpu_to_be32(word0);
+ *p++ = cpu_to_be32(word1);
} else {
- if ((buflen -= 16) < 0)
+ p = xdr_reserve_space(xdr, 16);
+ if (!p)
goto out_resource;
- WRITE32(3);
- WRITE32(word0);
- WRITE32(word1);
- WRITE32(word2);
+ *p++ = cpu_to_be32(3);
+ *p++ = cpu_to_be32(word0);
+ *p++ = cpu_to_be32(word1);
+ *p++ = cpu_to_be32(word2);
}
}
if (bmval0 & FATTR4_WORD0_TYPE) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
dummy = nfs4_file_type(stat.mode);
- if (dummy == NF4BAD)
- goto out_serverfault;
- WRITE32(dummy);
+ if (dummy == NF4BAD) {
+ status = nfserr_serverfault;
+ goto out;
+ }
+ *p++ = cpu_to_be32(dummy);
}
if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
if (exp->ex_flags & NFSEXP_NOSUBTREECHECK)
- WRITE32(NFS4_FH_PERSISTENT);
+ *p++ = cpu_to_be32(NFS4_FH_PERSISTENT);
else
- WRITE32(NFS4_FH_PERSISTENT|NFS4_FH_VOL_RENAME);
+ *p++ = cpu_to_be32(NFS4_FH_PERSISTENT|
+ NFS4_FH_VOL_RENAME);
}
if (bmval0 & FATTR4_WORD0_CHANGE) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- write_change(&p, &stat, dentry->d_inode);
+ p = encode_change(p, &stat, dentry->d_inode);
}
if (bmval0 & FATTR4_WORD0_SIZE) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- WRITE64(stat.size);
+ p = xdr_encode_hyper(p, stat.size);
}
if (bmval0 & FATTR4_WORD0_LINK_SUPPORT) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(1);
+ *p++ = cpu_to_be32(1);
}
if (bmval0 & FATTR4_WORD0_SYMLINK_SUPPORT) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(1);
+ *p++ = cpu_to_be32(1);
}
if (bmval0 & FATTR4_WORD0_NAMED_ATTR) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(0);
+ *p++ = cpu_to_be32(0);
}
if (bmval0 & FATTR4_WORD0_FSID) {
- if ((buflen -= 16) < 0)
+ p = xdr_reserve_space(xdr, 16);
+ if (!p)
goto out_resource;
if (exp->ex_fslocs.migrated) {
- WRITE64(NFS4_REFERRAL_FSID_MAJOR);
- WRITE64(NFS4_REFERRAL_FSID_MINOR);
+ p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR);
+ p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR);
} else switch(fsid_source(fhp)) {
case FSIDSOURCE_FSID:
- WRITE64((u64)exp->ex_fsid);
- WRITE64((u64)0);
+ p = xdr_encode_hyper(p, (u64)exp->ex_fsid);
+ p = xdr_encode_hyper(p, (u64)0);
break;
case FSIDSOURCE_DEV:
- WRITE32(0);
- WRITE32(MAJOR(stat.dev));
- WRITE32(0);
- WRITE32(MINOR(stat.dev));
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(MAJOR(stat.dev));
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(MINOR(stat.dev));
break;
case FSIDSOURCE_UUID:
- WRITEMEM(exp->ex_uuid, 16);
+ p = xdr_encode_opaque_fixed(p, exp->ex_uuid,
+ EX_UUID_LEN);
break;
}
}
if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(0);
+ *p++ = cpu_to_be32(0);
}
if (bmval0 & FATTR4_WORD0_LEASE_TIME) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(nn->nfsd4_lease);
+ *p++ = cpu_to_be32(nn->nfsd4_lease);
}
if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(rdattr_err);
+ *p++ = cpu_to_be32(rdattr_err);
}
if (bmval0 & FATTR4_WORD0_ACL) {
struct nfs4_ace *ace;
if (acl == NULL) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(0);
+ *p++ = cpu_to_be32(0);
goto out_acl;
}
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(acl->naces);
+ *p++ = cpu_to_be32(acl->naces);
for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) {
- if ((buflen -= 4*3) < 0)
- goto out_resource;
- WRITE32(ace->type);
- WRITE32(ace->flag);
- WRITE32(ace->access_mask & NFS4_ACE_MASK_ALL);
- status = nfsd4_encode_aclname(rqstp, ace, &p, &buflen);
- if (status == nfserr_resource)
+ p = xdr_reserve_space(xdr, 4*3);
+ if (!p)
goto out_resource;
+ *p++ = cpu_to_be32(ace->type);
+ *p++ = cpu_to_be32(ace->flag);
+ *p++ = cpu_to_be32(ace->access_mask &
+ NFS4_ACE_MASK_ALL);
+ status = nfsd4_encode_aclname(xdr, rqstp, ace);
if (status)
goto out;
}
}
out_acl:
if (bmval0 & FATTR4_WORD0_ACLSUPPORT) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(aclsupport ?
+ *p++ = cpu_to_be32(aclsupport ?
ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0);
}
if (bmval0 & FATTR4_WORD0_CANSETTIME) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(1);
+ *p++ = cpu_to_be32(1);
}
if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(0);
+ *p++ = cpu_to_be32(0);
}
if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(1);
+ *p++ = cpu_to_be32(1);
}
if (bmval0 & FATTR4_WORD0_CHOWN_RESTRICTED) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(1);
+ *p++ = cpu_to_be32(1);
}
if (bmval0 & FATTR4_WORD0_FILEHANDLE) {
- buflen -= (XDR_QUADLEN(fhp->fh_handle.fh_size) << 2) + 4;
- if (buflen < 0)
+ p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4);
+ if (!p)
goto out_resource;
- WRITE32(fhp->fh_handle.fh_size);
- WRITEMEM(&fhp->fh_handle.fh_base, fhp->fh_handle.fh_size);
+ p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base,
+ fhp->fh_handle.fh_size);
}
if (bmval0 & FATTR4_WORD0_FILEID) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- WRITE64(stat.ino);
+ p = xdr_encode_hyper(p, stat.ino);
}
if (bmval0 & FATTR4_WORD0_FILES_AVAIL) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- WRITE64((u64) statfs.f_ffree);
+ p = xdr_encode_hyper(p, (u64) statfs.f_ffree);
}
if (bmval0 & FATTR4_WORD0_FILES_FREE) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- WRITE64((u64) statfs.f_ffree);
+ p = xdr_encode_hyper(p, (u64) statfs.f_ffree);
}
if (bmval0 & FATTR4_WORD0_FILES_TOTAL) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- WRITE64((u64) statfs.f_files);
+ p = xdr_encode_hyper(p, (u64) statfs.f_files);
}
if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) {
- status = nfsd4_encode_fs_locations(rqstp, exp, &p, &buflen);
- if (status == nfserr_resource)
- goto out_resource;
+ status = nfsd4_encode_fs_locations(xdr, rqstp, exp);
if (status)
goto out;
}
if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(1);
+ *p++ = cpu_to_be32(1);
}
if (bmval0 & FATTR4_WORD0_MAXFILESIZE) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- WRITE64(~(u64)0);
+ p = xdr_encode_hyper(p, exp->ex_path.mnt->mnt_sb->s_maxbytes);
}
if (bmval0 & FATTR4_WORD0_MAXLINK) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(255);
+ *p++ = cpu_to_be32(255);
}
if (bmval0 & FATTR4_WORD0_MAXNAME) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(statfs.f_namelen);
+ *p++ = cpu_to_be32(statfs.f_namelen);
}
if (bmval0 & FATTR4_WORD0_MAXREAD) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- WRITE64((u64) svc_max_payload(rqstp));
+ p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp));
}
if (bmval0 & FATTR4_WORD0_MAXWRITE) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- WRITE64((u64) svc_max_payload(rqstp));
+ p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp));
}
if (bmval1 & FATTR4_WORD1_MODE) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(stat.mode & S_IALLUGO);
+ *p++ = cpu_to_be32(stat.mode & S_IALLUGO);
}
if (bmval1 & FATTR4_WORD1_NO_TRUNC) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(1);
+ *p++ = cpu_to_be32(1);
}
if (bmval1 & FATTR4_WORD1_NUMLINKS) {
- if ((buflen -= 4) < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto out_resource;
- WRITE32(stat.nlink);
+ *p++ = cpu_to_be32(stat.nlink);
}
if (bmval1 & FATTR4_WORD1_OWNER) {
- status = nfsd4_encode_user(rqstp, stat.uid, &p, &buflen);
- if (status == nfserr_resource)
- goto out_resource;
+ status = nfsd4_encode_user(xdr, rqstp, stat.uid);
if (status)
goto out;
}
if (bmval1 & FATTR4_WORD1_OWNER_GROUP) {
- status = nfsd4_encode_group(rqstp, stat.gid, &p, &buflen);
- if (status == nfserr_resource)
- goto out_resource;
+ status = nfsd4_encode_group(xdr, rqstp, stat.gid);
if (status)
goto out;
}
if (bmval1 & FATTR4_WORD1_RAWDEV) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
- WRITE32((u32) MAJOR(stat.rdev));
- WRITE32((u32) MINOR(stat.rdev));
+ *p++ = cpu_to_be32((u32) MAJOR(stat.rdev));
+ *p++ = cpu_to_be32((u32) MINOR(stat.rdev));
}
if (bmval1 & FATTR4_WORD1_SPACE_AVAIL) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
dummy64 = (u64)statfs.f_bavail * (u64)statfs.f_bsize;
- WRITE64(dummy64);
+ p = xdr_encode_hyper(p, dummy64);
}
if (bmval1 & FATTR4_WORD1_SPACE_FREE) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
dummy64 = (u64)statfs.f_bfree * (u64)statfs.f_bsize;
- WRITE64(dummy64);
+ p = xdr_encode_hyper(p, dummy64);
}
if (bmval1 & FATTR4_WORD1_SPACE_TOTAL) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
dummy64 = (u64)statfs.f_blocks * (u64)statfs.f_bsize;
- WRITE64(dummy64);
+ p = xdr_encode_hyper(p, dummy64);
}
if (bmval1 & FATTR4_WORD1_SPACE_USED) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
dummy64 = (u64)stat.blocks << 9;
- WRITE64(dummy64);
+ p = xdr_encode_hyper(p, dummy64);
}
if (bmval1 & FATTR4_WORD1_TIME_ACCESS) {
- if ((buflen -= 12) < 0)
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
goto out_resource;
- WRITE64((s64)stat.atime.tv_sec);
- WRITE32(stat.atime.tv_nsec);
+ p = xdr_encode_hyper(p, (s64)stat.atime.tv_sec);
+ *p++ = cpu_to_be32(stat.atime.tv_nsec);
}
if (bmval1 & FATTR4_WORD1_TIME_DELTA) {
- if ((buflen -= 12) < 0)
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
goto out_resource;
- WRITE32(0);
- WRITE32(1);
- WRITE32(0);
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(1);
+ *p++ = cpu_to_be32(0);
}
if (bmval1 & FATTR4_WORD1_TIME_METADATA) {
- if ((buflen -= 12) < 0)
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
goto out_resource;
- WRITE64((s64)stat.ctime.tv_sec);
- WRITE32(stat.ctime.tv_nsec);
+ p = xdr_encode_hyper(p, (s64)stat.ctime.tv_sec);
+ *p++ = cpu_to_be32(stat.ctime.tv_nsec);
}
if (bmval1 & FATTR4_WORD1_TIME_MODIFY) {
- if ((buflen -= 12) < 0)
+ p = xdr_reserve_space(xdr, 12);
+ if (!p)
goto out_resource;
- WRITE64((s64)stat.mtime.tv_sec);
- WRITE32(stat.mtime.tv_nsec);
+ p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec);
+ *p++ = cpu_to_be32(stat.mtime.tv_nsec);
}
if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
- if ((buflen -= 8) < 0)
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
goto out_resource;
/*
* Get parent's attributes if not ignoring crossmount
@@ -2491,23 +2491,26 @@ out_acl:
if (ignore_crossmnt == 0 &&
dentry == exp->ex_path.mnt->mnt_root)
get_parent_attributes(exp, &stat);
- WRITE64(stat.ino);
+ p = xdr_encode_hyper(p, stat.ino);
}
if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
- status = nfsd4_encode_security_label(rqstp, context,
- contextlen, &p, &buflen);
+ status = nfsd4_encode_security_label(xdr, rqstp, context,
+ contextlen);
if (status)
goto out;
}
if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
- WRITE32(3);
- WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
- WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
- WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2);
+ p = xdr_reserve_space(xdr, 16);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(3);
+ *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
+ *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
+ *p++ = cpu_to_be32(NFSD_SUPPATTR_EXCLCREAT_WORD2);
}
- *attrlenp = htonl((char *)p - (char *)attrlenp - 4);
- *buffer = p;
+ attrlen = htonl(xdr->buf->len - attrlen_offset - 4);
+ write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, 4);
status = nfs_ok;
out:
@@ -2516,8 +2519,12 @@ out:
security_release_secctx(context, contextlen);
#endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
kfree(acl);
- if (fhp == &tempfh)
- fh_put(&tempfh);
+ if (tempfh) {
+ fh_put(tempfh);
+ kfree(tempfh);
+ }
+ if (status)
+ xdr_truncate_encode(xdr, starting_len);
return status;
out_nfserr:
status = nfserrno(err);
@@ -2525,9 +2532,37 @@ out_nfserr:
out_resource:
status = nfserr_resource;
goto out;
-out_serverfault:
- status = nfserr_serverfault;
- goto out;
+}
+
+static void svcxdr_init_encode_from_buffer(struct xdr_stream *xdr,
+ struct xdr_buf *buf, __be32 *p, int bytes)
+{
+ xdr->scratch.iov_len = 0;
+ memset(buf, 0, sizeof(struct xdr_buf));
+ buf->head[0].iov_base = p;
+ buf->head[0].iov_len = 0;
+ buf->len = 0;
+ xdr->buf = buf;
+ xdr->iov = buf->head;
+ xdr->p = p;
+ xdr->end = (void *)p + bytes;
+ buf->buflen = bytes;
+}
+
+__be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words,
+ struct svc_fh *fhp, struct svc_export *exp,
+ struct dentry *dentry, u32 *bmval,
+ struct svc_rqst *rqstp, int ignore_crossmnt)
+{
+ struct xdr_buf dummy;
+ struct xdr_stream xdr;
+ __be32 ret;
+
+ svcxdr_init_encode_from_buffer(&xdr, &dummy, *p, words << 2);
+ ret = nfsd4_encode_fattr(&xdr, fhp, exp, dentry, bmval, rqstp,
+ ignore_crossmnt);
+ *p = xdr.p;
+ return ret;
}
static inline int attributes_need_mount(u32 *bmval)
@@ -2540,8 +2575,8 @@ static inline int attributes_need_mount(u32 *bmval)
}
static __be32
-nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
- const char *name, int namlen, __be32 **p, int buflen)
+nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
+ const char *name, int namlen)
{
struct svc_export *exp = cd->rd_fhp->fh_export;
struct dentry *dentry;
@@ -2593,7 +2628,7 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd,
}
out_encode:
- nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval,
+ nfserr = nfsd4_encode_fattr(xdr, NULL, exp, dentry, cd->rd_bmval,
cd->rd_rqstp, ignore_crossmnt);
out_put:
dput(dentry);
@@ -2602,19 +2637,19 @@ out_put:
}
static __be32 *
-nfsd4_encode_rdattr_error(__be32 *p, int buflen, __be32 nfserr)
+nfsd4_encode_rdattr_error(struct xdr_stream *xdr, __be32 nfserr)
{
- __be32 *attrlenp;
+ __be32 *p;
- if (buflen < 6)
+ p = xdr_reserve_space(xdr, 20);
+ if (!p)
return NULL;
*p++ = htonl(2);
*p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */
*p++ = htonl(0); /* bmval1 */
- attrlenp = p++;
+ *p++ = htonl(4); /* attribute length */
*p++ = nfserr; /* no htonl */
- *attrlenp = htonl((char *)p - (char *)attrlenp - 4);
return p;
}
@@ -2624,10 +2659,13 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
{
struct readdir_cd *ccd = ccdv;
struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common);
- int buflen;
- __be32 *p = cd->buffer;
- __be32 *cookiep;
+ struct xdr_stream *xdr = cd->xdr;
+ int start_offset = xdr->buf->len;
+ int cookie_offset;
+ int entry_bytes;
__be32 nfserr = nfserr_toosmall;
+ __be64 wire_offset;
+ __be32 *p;
/* In nfsv4, "." and ".." never make it onto the wire.. */
if (name && isdotent(name, namlen)) {
@@ -2635,19 +2673,24 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
return 0;
}
- if (cd->offset)
- xdr_encode_hyper(cd->offset, (u64) offset);
+ if (cd->cookie_offset) {
+ wire_offset = cpu_to_be64(offset);
+ write_bytes_to_xdr_buf(xdr->buf, cd->cookie_offset,
+ &wire_offset, 8);
+ }
- buflen = cd->buflen - 4 - XDR_QUADLEN(namlen);
- if (buflen < 0)
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
goto fail;
-
*p++ = xdr_one; /* mark entry present */
- cookiep = p;
+ cookie_offset = xdr->buf->len;
+ p = xdr_reserve_space(xdr, 3*4 + namlen);
+ if (!p)
+ goto fail;
p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */
p = xdr_encode_array(p, name, namlen); /* name length & name */
- nfserr = nfsd4_encode_dirent_fattr(cd, name, namlen, &p, buflen);
+ nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen);
switch (nfserr) {
case nfs_ok:
break;
@@ -2655,6 +2698,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
nfserr = nfserr_toosmall;
goto fail;
case nfserr_noent:
+ xdr_truncate_encode(xdr, start_offset);
goto skip_entry;
default:
/*
@@ -2666,59 +2710,74 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
*/
if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR))
goto fail;
- p = nfsd4_encode_rdattr_error(p, buflen, nfserr);
+ p = nfsd4_encode_rdattr_error(xdr, nfserr);
if (p == NULL) {
nfserr = nfserr_toosmall;
goto fail;
}
}
- cd->buflen -= (p - cd->buffer);
- cd->buffer = p;
- cd->offset = cookiep;
+ nfserr = nfserr_toosmall;
+ entry_bytes = xdr->buf->len - start_offset;
+ if (entry_bytes > cd->rd_maxcount)
+ goto fail;
+ cd->rd_maxcount -= entry_bytes;
+ if (!cd->rd_dircount)
+ goto fail;
+ cd->rd_dircount--;
+ cd->cookie_offset = cookie_offset;
skip_entry:
cd->common.err = nfs_ok;
return 0;
fail:
+ xdr_truncate_encode(xdr, start_offset);
cd->common.err = nfserr;
return -EINVAL;
}
-static void
-nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid)
+static __be32
+nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
{
__be32 *p;
- RESERVE_SPACE(sizeof(stateid_t));
- WRITE32(sid->si_generation);
- WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, sizeof(stateid_t));
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(sid->si_generation);
+ p = xdr_encode_opaque_fixed(p, &sid->si_opaque,
+ sizeof(stateid_opaque_t));
+ return 0;
}
static __be32
nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (!nfserr) {
- RESERVE_SPACE(8);
- WRITE32(access->ac_supported);
- WRITE32(access->ac_resp_access);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(access->ac_supported);
+ *p++ = cpu_to_be32(access->ac_resp_access);
}
return nfserr;
}
static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (!nfserr) {
- RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 8);
- WRITEMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
- WRITE32(bcts->dir);
+ p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8);
+ if (!p)
+ return nfserr_resource;
+ p = xdr_encode_opaque_fixed(p, bcts->sessionid.data,
+ NFS4_MAX_SESSIONID_LEN);
+ *p++ = cpu_to_be32(bcts->dir);
/* Sorry, we do not yet support RDMA over 4.1: */
- WRITE32(0);
- ADJUST_ARGS();
+ *p++ = cpu_to_be32(0);
}
return nfserr;
}
@@ -2726,8 +2785,10 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
static __be32
nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
{
+ struct xdr_stream *xdr = &resp->xdr;
+
if (!nfserr)
- nfsd4_encode_stateid(resp, &close->cl_stateid);
+ nfserr = nfsd4_encode_stateid(xdr, &close->cl_stateid);
return nfserr;
}
@@ -2736,12 +2797,15 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c
static __be32
nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (!nfserr) {
- RESERVE_SPACE(NFS4_VERIFIER_SIZE);
- WRITEMEM(commit->co_verf.data, NFS4_VERIFIER_SIZE);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
+ if (!p)
+ return nfserr_resource;
+ p = xdr_encode_opaque_fixed(p, commit->co_verf.data,
+ NFS4_VERIFIER_SIZE);
}
return nfserr;
}
@@ -2749,15 +2813,17 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
static __be32
nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (!nfserr) {
- RESERVE_SPACE(32);
- write_cinfo(&p, &create->cr_cinfo);
- WRITE32(2);
- WRITE32(create->cr_bmval[0]);
- WRITE32(create->cr_bmval[1]);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 32);
+ if (!p)
+ return nfserr_resource;
+ p = encode_cinfo(p, &create->cr_cinfo);
+ *p++ = cpu_to_be32(2);
+ *p++ = cpu_to_be32(create->cr_bmval[0]);
+ *p++ = cpu_to_be32(create->cr_bmval[1]);
}
return nfserr;
}
@@ -2766,14 +2832,13 @@ static __be32
nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr)
{
struct svc_fh *fhp = getattr->ga_fhp;
- int buflen;
+ struct xdr_stream *xdr = &resp->xdr;
if (nfserr)
return nfserr;
- buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2);
- nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry,
- &resp->p, buflen, getattr->ga_bmval,
+ nfserr = nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry,
+ getattr->ga_bmval,
resp->rqstp, 0);
return nfserr;
}
@@ -2781,16 +2846,17 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
static __be32
nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp)
{
+ struct xdr_stream *xdr = &resp->xdr;
struct svc_fh *fhp = *fhpp;
unsigned int len;
__be32 *p;
if (!nfserr) {
len = fhp->fh_handle.fh_size;
- RESERVE_SPACE(len + 4);
- WRITE32(len);
- WRITEMEM(&fhp->fh_handle.fh_base, len);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, len + 4);
+ if (!p)
+ return nfserr_resource;
+ p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, len);
}
return nfserr;
}
@@ -2799,35 +2865,50 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
* Including all fields other than the name, a LOCK4denied structure requires
* 8(clientid) + 4(namelen) + 8(offset) + 8(length) + 4(type) = 32 bytes.
*/
-static void
-nfsd4_encode_lock_denied(struct nfsd4_compoundres *resp, struct nfsd4_lock_denied *ld)
+static __be32
+nfsd4_encode_lock_denied(struct xdr_stream *xdr, struct nfsd4_lock_denied *ld)
{
struct xdr_netobj *conf = &ld->ld_owner;
__be32 *p;
- RESERVE_SPACE(32 + XDR_LEN(conf->len));
- WRITE64(ld->ld_start);
- WRITE64(ld->ld_length);
- WRITE32(ld->ld_type);
+again:
+ p = xdr_reserve_space(xdr, 32 + XDR_LEN(conf->len));
+ if (!p) {
+ /*
+ * Don't fail to return the result just because we can't
+ * return the conflicting open:
+ */
+ if (conf->len) {
+ kfree(conf->data);
+ conf->len = 0;
+ conf->data = NULL;
+ goto again;
+ }
+ return nfserr_resource;
+ }
+ p = xdr_encode_hyper(p, ld->ld_start);
+ p = xdr_encode_hyper(p, ld->ld_length);
+ *p++ = cpu_to_be32(ld->ld_type);
if (conf->len) {
- WRITEMEM(&ld->ld_clientid, 8);
- WRITE32(conf->len);
- WRITEMEM(conf->data, conf->len);
+ p = xdr_encode_opaque_fixed(p, &ld->ld_clientid, 8);
+ p = xdr_encode_opaque(p, conf->data, conf->len);
kfree(conf->data);
} else { /* non - nfsv4 lock in conflict, no clientid nor owner */
- WRITE64((u64)0); /* clientid */
- WRITE32(0); /* length of owner name */
+ p = xdr_encode_hyper(p, (u64)0); /* clientid */
+ *p++ = cpu_to_be32(0); /* length of owner name */
}
- ADJUST_ARGS();
+ return nfserr_denied;
}
static __be32
nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock)
{
+ struct xdr_stream *xdr = &resp->xdr;
+
if (!nfserr)
- nfsd4_encode_stateid(resp, &lock->lk_resp_stateid);
+ nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid);
else if (nfserr == nfserr_denied)
- nfsd4_encode_lock_denied(resp, &lock->lk_denied);
+ nfserr = nfsd4_encode_lock_denied(xdr, &lock->lk_denied);
return nfserr;
}
@@ -2835,16 +2916,20 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo
static __be32
nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt)
{
+ struct xdr_stream *xdr = &resp->xdr;
+
if (nfserr == nfserr_denied)
- nfsd4_encode_lock_denied(resp, &lockt->lt_denied);
+ nfsd4_encode_lock_denied(xdr, &lockt->lt_denied);
return nfserr;
}
static __be32
nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku)
{
+ struct xdr_stream *xdr = &resp->xdr;
+
if (!nfserr)
- nfsd4_encode_stateid(resp, &locku->lu_stateid);
+ nfserr = nfsd4_encode_stateid(xdr, &locku->lu_stateid);
return nfserr;
}
@@ -2853,12 +2938,14 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
static __be32
nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (!nfserr) {
- RESERVE_SPACE(20);
- write_cinfo(&p, &link->li_cinfo);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 20);
+ if (!p)
+ return nfserr_resource;
+ p = encode_cinfo(p, &link->li_cinfo);
}
return nfserr;
}
@@ -2867,72 +2954,86 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
static __be32
nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (nfserr)
goto out;
- nfsd4_encode_stateid(resp, &open->op_stateid);
- RESERVE_SPACE(40);
- write_cinfo(&p, &open->op_cinfo);
- WRITE32(open->op_rflags);
- WRITE32(2);
- WRITE32(open->op_bmval[0]);
- WRITE32(open->op_bmval[1]);
- WRITE32(open->op_delegate_type);
- ADJUST_ARGS();
+ nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid);
+ if (nfserr)
+ goto out;
+ p = xdr_reserve_space(xdr, 40);
+ if (!p)
+ return nfserr_resource;
+ p = encode_cinfo(p, &open->op_cinfo);
+ *p++ = cpu_to_be32(open->op_rflags);
+ *p++ = cpu_to_be32(2);
+ *p++ = cpu_to_be32(open->op_bmval[0]);
+ *p++ = cpu_to_be32(open->op_bmval[1]);
+ *p++ = cpu_to_be32(open->op_delegate_type);
switch (open->op_delegate_type) {
case NFS4_OPEN_DELEGATE_NONE:
break;
case NFS4_OPEN_DELEGATE_READ:
- nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
- RESERVE_SPACE(20);
- WRITE32(open->op_recall);
+ nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid);
+ if (nfserr)
+ return nfserr;
+ p = xdr_reserve_space(xdr, 20);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(open->op_recall);
/*
* TODO: ACE's in delegations
*/
- WRITE32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
- WRITE32(0);
- WRITE32(0);
- WRITE32(0); /* XXX: is NULL principal ok? */
- ADJUST_ARGS();
+ *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */
break;
case NFS4_OPEN_DELEGATE_WRITE:
- nfsd4_encode_stateid(resp, &open->op_delegate_stateid);
- RESERVE_SPACE(32);
- WRITE32(0);
+ nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid);
+ if (nfserr)
+ return nfserr;
+ p = xdr_reserve_space(xdr, 32);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(0);
/*
* TODO: space_limit's in delegations
*/
- WRITE32(NFS4_LIMIT_SIZE);
- WRITE32(~(u32)0);
- WRITE32(~(u32)0);
+ *p++ = cpu_to_be32(NFS4_LIMIT_SIZE);
+ *p++ = cpu_to_be32(~(u32)0);
+ *p++ = cpu_to_be32(~(u32)0);
/*
* TODO: ACE's in delegations
*/
- WRITE32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
- WRITE32(0);
- WRITE32(0);
- WRITE32(0); /* XXX: is NULL principal ok? */
- ADJUST_ARGS();
+ *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE);
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */
break;
case NFS4_OPEN_DELEGATE_NONE_EXT: /* 4.1 */
switch (open->op_why_no_deleg) {
case WND4_CONTENTION:
case WND4_RESOURCE:
- RESERVE_SPACE(8);
- WRITE32(open->op_why_no_deleg);
- WRITE32(0); /* deleg signaling not supported yet */
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(open->op_why_no_deleg);
+ /* deleg signaling not supported yet: */
+ *p++ = cpu_to_be32(0);
break;
default:
- RESERVE_SPACE(4);
- WRITE32(open->op_why_no_deleg);
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(open->op_why_no_deleg);
}
- ADJUST_ARGS();
break;
default:
BUG();
@@ -2945,8 +3046,10 @@ out:
static __be32
nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
{
+ struct xdr_stream *xdr = &resp->xdr;
+
if (!nfserr)
- nfsd4_encode_stateid(resp, &oc->oc_resp_stateid);
+ nfserr = nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid);
return nfserr;
}
@@ -2954,127 +3057,233 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct
static __be32
nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
{
+ struct xdr_stream *xdr = &resp->xdr;
+
if (!nfserr)
- nfsd4_encode_stateid(resp, &od->od_stateid);
+ nfserr = nfsd4_encode_stateid(xdr, &od->od_stateid);
return nfserr;
}
-static __be32
-nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_read *read)
+static __be32 nfsd4_encode_splice_read(
+ struct nfsd4_compoundres *resp,
+ struct nfsd4_read *read,
+ struct file *file, unsigned long maxcount)
{
+ struct xdr_stream *xdr = &resp->xdr;
+ struct xdr_buf *buf = xdr->buf;
u32 eof;
- int v;
- struct page *page;
- unsigned long maxcount;
- long len;
- __be32 *p;
+ int space_left;
+ __be32 nfserr;
+ __be32 *p = xdr->p - 2;
- if (nfserr)
- return nfserr;
- if (resp->xbuf->page_len)
+ /*
+ * Don't inline pages unless we know there's room for eof,
+ * count, and possible padding:
+ */
+ if (xdr->end - xdr->p < 3)
return nfserr_resource;
- RESERVE_SPACE(8); /* eof flag and byte count */
+ nfserr = nfsd_splice_read(read->rd_rqstp, file,
+ read->rd_offset, &maxcount);
+ if (nfserr) {
+ /*
+ * nfsd_splice_actor may have already messed with the
+ * page length; reset it so as not to confuse
+ * xdr_truncate_encode:
+ */
+ buf->page_len = 0;
+ return nfserr;
+ }
+
+ eof = (read->rd_offset + maxcount >=
+ read->rd_fhp->fh_dentry->d_inode->i_size);
- maxcount = svc_max_payload(resp->rqstp);
- if (maxcount > read->rd_length)
- maxcount = read->rd_length;
+ *(p++) = htonl(eof);
+ *(p++) = htonl(maxcount);
+
+ buf->page_len = maxcount;
+ buf->len += maxcount;
+ xdr->page_ptr += (maxcount + PAGE_SIZE - 1) / PAGE_SIZE;
+
+ /* Use rest of head for padding and remaining ops: */
+ buf->tail[0].iov_base = xdr->p;
+ buf->tail[0].iov_len = 0;
+ xdr->iov = buf->tail;
+ if (maxcount&3) {
+ int pad = 4 - (maxcount&3);
+
+ *(xdr->p++) = 0;
+
+ buf->tail[0].iov_base += maxcount&3;
+ buf->tail[0].iov_len = pad;
+ buf->len += pad;
+ }
+
+ space_left = min_t(int, (void *)xdr->end - (void *)xdr->p,
+ buf->buflen - buf->len);
+ buf->buflen = buf->len + space_left;
+ xdr->end = (__be32 *)((void *)xdr->end + space_left);
+
+ return 0;
+}
+
+static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
+ struct nfsd4_read *read,
+ struct file *file, unsigned long maxcount)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ u32 eof;
+ int v;
+ int starting_len = xdr->buf->len - 8;
+ long len;
+ int thislen;
+ __be32 nfserr;
+ __be32 tmp;
+ __be32 *p;
+ u32 zzz = 0;
+ int pad;
len = maxcount;
v = 0;
- while (len > 0) {
- page = *(resp->rqstp->rq_next_page);
- if (!page) { /* ran out of pages */
- maxcount -= len;
- break;
- }
- resp->rqstp->rq_vec[v].iov_base = page_address(page);
- resp->rqstp->rq_vec[v].iov_len =
- len < PAGE_SIZE ? len : PAGE_SIZE;
- resp->rqstp->rq_next_page++;
+
+ thislen = (void *)xdr->end - (void *)xdr->p;
+ if (len < thislen)
+ thislen = len;
+ p = xdr_reserve_space(xdr, (thislen+3)&~3);
+ WARN_ON_ONCE(!p);
+ resp->rqstp->rq_vec[v].iov_base = p;
+ resp->rqstp->rq_vec[v].iov_len = thislen;
+ v++;
+ len -= thislen;
+
+ while (len) {
+ thislen = min_t(long, len, PAGE_SIZE);
+ p = xdr_reserve_space(xdr, (thislen+3)&~3);
+ WARN_ON_ONCE(!p);
+ resp->rqstp->rq_vec[v].iov_base = p;
+ resp->rqstp->rq_vec[v].iov_len = thislen;
v++;
- len -= PAGE_SIZE;
+ len -= thislen;
}
read->rd_vlen = v;
- nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp,
- read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen,
- &maxcount);
-
+ nfserr = nfsd_readv(file, read->rd_offset, resp->rqstp->rq_vec,
+ read->rd_vlen, &maxcount);
if (nfserr)
return nfserr;
+ xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3));
+
eof = (read->rd_offset + maxcount >=
read->rd_fhp->fh_dentry->d_inode->i_size);
- WRITE32(eof);
- WRITE32(maxcount);
- ADJUST_ARGS();
- resp->xbuf->head[0].iov_len = (char*)p
- - (char*)resp->xbuf->head[0].iov_base;
- resp->xbuf->page_len = maxcount;
+ tmp = htonl(eof);
+ write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4);
+ tmp = htonl(maxcount);
+ write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
- /* Use rest of head for padding and remaining ops: */
- resp->xbuf->tail[0].iov_base = p;
- resp->xbuf->tail[0].iov_len = 0;
- if (maxcount&3) {
- RESERVE_SPACE(4);
- WRITE32(0);
- resp->xbuf->tail[0].iov_base += maxcount&3;
- resp->xbuf->tail[0].iov_len = 4 - (maxcount&3);
- ADJUST_ARGS();
- }
+ pad = (maxcount&3) ? 4 - (maxcount&3) : 0;
+ write_bytes_to_xdr_buf(xdr->buf, starting_len + 8 + maxcount,
+ &zzz, pad);
return 0;
+
}
static __be32
-nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink)
+nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_read *read)
{
- int maxcount;
- char *page;
+ unsigned long maxcount;
+ struct xdr_stream *xdr = &resp->xdr;
+ struct file *file = read->rd_filp;
+ int starting_len = xdr->buf->len;
+ struct raparms *ra;
__be32 *p;
+ __be32 err;
if (nfserr)
return nfserr;
- if (resp->xbuf->page_len)
+
+ p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */
+ if (!p) {
+ WARN_ON_ONCE(resp->rqstp->rq_splice_ok);
return nfserr_resource;
- if (!*resp->rqstp->rq_next_page)
+ }
+ if (resp->xdr.buf->page_len && resp->rqstp->rq_splice_ok) {
+ WARN_ON_ONCE(1);
return nfserr_resource;
+ }
+ xdr_commit_encode(xdr);
+
+ maxcount = svc_max_payload(resp->rqstp);
+ if (maxcount > xdr->buf->buflen - xdr->buf->len)
+ maxcount = xdr->buf->buflen - xdr->buf->len;
+ if (maxcount > read->rd_length)
+ maxcount = read->rd_length;
+
+ if (!read->rd_filp) {
+ err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp,
+ &file, &ra);
+ if (err)
+ goto err_truncate;
+ }
+
+ if (file->f_op->splice_read && resp->rqstp->rq_splice_ok)
+ err = nfsd4_encode_splice_read(resp, read, file, maxcount);
+ else
+ err = nfsd4_encode_readv(resp, read, file, maxcount);
+
+ if (!read->rd_filp)
+ nfsd_put_tmp_read_open(file, ra);
+
+err_truncate:
+ if (err)
+ xdr_truncate_encode(xdr, starting_len);
+ return err;
+}
- page = page_address(*(resp->rqstp->rq_next_page++));
+static __be32
+nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink)
+{
+ int maxcount;
+ __be32 wire_count;
+ int zero = 0;
+ struct xdr_stream *xdr = &resp->xdr;
+ int length_offset = xdr->buf->len;
+ __be32 *p;
+
+ if (nfserr)
+ return nfserr;
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
maxcount = PAGE_SIZE;
- RESERVE_SPACE(4);
+ p = xdr_reserve_space(xdr, maxcount);
+ if (!p)
+ return nfserr_resource;
/*
* XXX: By default, the ->readlink() VFS op will truncate symlinks
* if they would overflow the buffer. Is this kosher in NFSv4? If
* not, one easy fix is: if ->readlink() precisely fills the buffer,
* assume that truncation occurred, and return NFS4ERR_RESOURCE.
*/
- nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp, page, &maxcount);
+ nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp,
+ (char *)p, &maxcount);
if (nfserr == nfserr_isdir)
- return nfserr_inval;
- if (nfserr)
+ nfserr = nfserr_inval;
+ if (nfserr) {
+ xdr_truncate_encode(xdr, length_offset);
return nfserr;
-
- WRITE32(maxcount);
- ADJUST_ARGS();
- resp->xbuf->head[0].iov_len = (char*)p
- - (char*)resp->xbuf->head[0].iov_base;
- resp->xbuf->page_len = maxcount;
-
- /* Use rest of head for padding and remaining ops: */
- resp->xbuf->tail[0].iov_base = p;
- resp->xbuf->tail[0].iov_len = 0;
- if (maxcount&3) {
- RESERVE_SPACE(4);
- WRITE32(0);
- resp->xbuf->tail[0].iov_base += maxcount&3;
- resp->xbuf->tail[0].iov_len = 4 - (maxcount&3);
- ADJUST_ARGS();
}
+
+ wire_count = htonl(maxcount);
+ write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, 4);
+ xdr_truncate_encode(xdr, length_offset + 4 + ALIGN(maxcount, 4));
+ if (maxcount & 3)
+ write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount,
+ &zero, 4 - (maxcount&3));
return 0;
}
@@ -3082,47 +3291,52 @@ static __be32
nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir)
{
int maxcount;
+ int bytes_left;
loff_t offset;
- __be32 *page, *savep, *tailbase;
+ __be64 wire_offset;
+ struct xdr_stream *xdr = &resp->xdr;
+ int starting_len = xdr->buf->len;
__be32 *p;
if (nfserr)
return nfserr;
- if (resp->xbuf->page_len)
- return nfserr_resource;
- if (!*resp->rqstp->rq_next_page)
- return nfserr_resource;
- RESERVE_SPACE(NFS4_VERIFIER_SIZE);
- savep = p;
+ p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
+ if (!p)
+ return nfserr_resource;
/* XXX: Following NFSv3, we ignore the READDIR verifier for now. */
- WRITE32(0);
- WRITE32(0);
- ADJUST_ARGS();
- resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base;
- tailbase = p;
-
- maxcount = PAGE_SIZE;
- if (maxcount > readdir->rd_maxcount)
- maxcount = readdir->rd_maxcount;
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(0);
+ resp->xdr.buf->head[0].iov_len = ((char *)resp->xdr.p)
+ - (char *)resp->xdr.buf->head[0].iov_base;
/*
- * Convert from bytes to words, account for the two words already
- * written, make sure to leave two words at the end for the next
- * pointer and eof field.
+ * Number of bytes left for directory entries allowing for the
+ * final 8 bytes of the readdir and a following failed op:
*/
- maxcount = (maxcount >> 2) - 4;
- if (maxcount < 0) {
- nfserr = nfserr_toosmall;
+ bytes_left = xdr->buf->buflen - xdr->buf->len
+ - COMPOUND_ERR_SLACK_SPACE - 8;
+ if (bytes_left < 0) {
+ nfserr = nfserr_resource;
+ goto err_no_verf;
+ }
+ maxcount = min_t(u32, readdir->rd_maxcount, INT_MAX);
+ /*
+ * Note the rfc defines rd_maxcount as the size of the
+ * READDIR4resok structure, which includes the verifier above
+ * and the 8 bytes encoded at the end of this function:
+ */
+ if (maxcount < 16) {
+ nfserr = nfserr_toosmall;
goto err_no_verf;
}
+ maxcount = min_t(int, maxcount-16, bytes_left);
- page = page_address(*(resp->rqstp->rq_next_page++));
+ readdir->xdr = xdr;
+ readdir->rd_maxcount = maxcount;
readdir->common.err = 0;
- readdir->buflen = maxcount;
- readdir->buffer = page;
- readdir->offset = NULL;
+ readdir->cookie_offset = 0;
offset = readdir->rd_cookie;
nfserr = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp,
@@ -3130,42 +3344,49 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
&readdir->common, nfsd4_encode_dirent);
if (nfserr == nfs_ok &&
readdir->common.err == nfserr_toosmall &&
- readdir->buffer == page)
- nfserr = nfserr_toosmall;
+ xdr->buf->len == starting_len + 8) {
+ /* nothing encoded; which limit did we hit?: */
+ if (maxcount - 16 < bytes_left)
+ /* It was the fault of rd_maxcount: */
+ nfserr = nfserr_toosmall;
+ else
+ /* We ran out of buffer space: */
+ nfserr = nfserr_resource;
+ }
if (nfserr)
goto err_no_verf;
- if (readdir->offset)
- xdr_encode_hyper(readdir->offset, offset);
+ if (readdir->cookie_offset) {
+ wire_offset = cpu_to_be64(offset);
+ write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset,
+ &wire_offset, 8);
+ }
- p = readdir->buffer;
+ p = xdr_reserve_space(xdr, 8);
+ if (!p) {
+ WARN_ON_ONCE(1);
+ goto err_no_verf;
+ }
*p++ = 0; /* no more entries */
*p++ = htonl(readdir->common.err == nfserr_eof);
- resp->xbuf->page_len = ((char*)p) -
- (char*)page_address(*(resp->rqstp->rq_next_page-1));
-
- /* Use rest of head for padding and remaining ops: */
- resp->xbuf->tail[0].iov_base = tailbase;
- resp->xbuf->tail[0].iov_len = 0;
- resp->p = resp->xbuf->tail[0].iov_base;
- resp->end = resp->p + (PAGE_SIZE - resp->xbuf->head[0].iov_len)/4;
return 0;
err_no_verf:
- p = savep;
- ADJUST_ARGS();
+ xdr_truncate_encode(xdr, starting_len);
return nfserr;
}
static __be32
nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (!nfserr) {
- RESERVE_SPACE(20);
- write_cinfo(&p, &remove->rm_cinfo);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 20);
+ if (!p)
+ return nfserr_resource;
+ p = encode_cinfo(p, &remove->rm_cinfo);
}
return nfserr;
}
@@ -3173,19 +3394,21 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
static __be32
nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (!nfserr) {
- RESERVE_SPACE(40);
- write_cinfo(&p, &rename->rn_sinfo);
- write_cinfo(&p, &rename->rn_tinfo);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 40);
+ if (!p)
+ return nfserr_resource;
+ p = encode_cinfo(p, &rename->rn_sinfo);
+ p = encode_cinfo(p, &rename->rn_tinfo);
}
return nfserr;
}
static __be32
-nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
+nfsd4_do_encode_secinfo(struct xdr_stream *xdr,
__be32 nfserr, struct svc_export *exp)
{
u32 i, nflavs, supported;
@@ -3196,6 +3419,7 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
if (nfserr)
goto out;
+ nfserr = nfserr_resource;
if (exp->ex_nflavors) {
flavs = exp->ex_flavors;
nflavs = exp->ex_nflavors;
@@ -3217,9 +3441,10 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
}
supported = 0;
- RESERVE_SPACE(4);
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out;
flavorsp = p++; /* to be backfilled later */
- ADJUST_ARGS();
for (i = 0; i < nflavs; i++) {
rpc_authflavor_t pf = flavs[i].pseudoflavor;
@@ -3227,18 +3452,20 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
if (rpcauth_get_gssinfo(pf, &info) == 0) {
supported++;
- RESERVE_SPACE(4 + 4 + info.oid.len + 4 + 4);
- WRITE32(RPC_AUTH_GSS);
- WRITE32(info.oid.len);
- WRITEMEM(info.oid.data, info.oid.len);
- WRITE32(info.qop);
- WRITE32(info.service);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 4 + 4 +
+ XDR_LEN(info.oid.len) + 4 + 4);
+ if (!p)
+ goto out;
+ *p++ = cpu_to_be32(RPC_AUTH_GSS);
+ p = xdr_encode_opaque(p, info.oid.data, info.oid.len);
+ *p++ = cpu_to_be32(info.qop);
+ *p++ = cpu_to_be32(info.service);
} else if (pf < RPC_AUTH_MAXFLAVOR) {
supported++;
- RESERVE_SPACE(4);
- WRITE32(pf);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out;
+ *p++ = cpu_to_be32(pf);
} else {
if (report)
pr_warn("NFS: SECINFO: security flavor %u "
@@ -3249,7 +3476,7 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp,
if (nflavs != supported)
report = false;
*flavorsp = htonl(supported);
-
+ nfserr = 0;
out:
if (exp)
exp_put(exp);
@@ -3260,14 +3487,18 @@ static __be32
nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_secinfo *secinfo)
{
- return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->si_exp);
+ struct xdr_stream *xdr = &resp->xdr;
+
+ return nfsd4_do_encode_secinfo(xdr, nfserr, secinfo->si_exp);
}
static __be32
nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_secinfo_no_name *secinfo)
{
- return nfsd4_do_encode_secinfo(resp, nfserr, secinfo->sin_exp);
+ struct xdr_stream *xdr = &resp->xdr;
+
+ return nfsd4_do_encode_secinfo(xdr, nfserr, secinfo->sin_exp);
}
/*
@@ -3277,41 +3508,47 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
static __be32
nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
- RESERVE_SPACE(16);
+ p = xdr_reserve_space(xdr, 16);
+ if (!p)
+ return nfserr_resource;
if (nfserr) {
- WRITE32(3);
- WRITE32(0);
- WRITE32(0);
- WRITE32(0);
+ *p++ = cpu_to_be32(3);
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(0);
}
else {
- WRITE32(3);
- WRITE32(setattr->sa_bmval[0]);
- WRITE32(setattr->sa_bmval[1]);
- WRITE32(setattr->sa_bmval[2]);
+ *p++ = cpu_to_be32(3);
+ *p++ = cpu_to_be32(setattr->sa_bmval[0]);
+ *p++ = cpu_to_be32(setattr->sa_bmval[1]);
+ *p++ = cpu_to_be32(setattr->sa_bmval[2]);
}
- ADJUST_ARGS();
return nfserr;
}
static __be32
nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (!nfserr) {
- RESERVE_SPACE(8 + NFS4_VERIFIER_SIZE);
- WRITEMEM(&scd->se_clientid, 8);
- WRITEMEM(&scd->se_confirm, NFS4_VERIFIER_SIZE);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 8 + NFS4_VERIFIER_SIZE);
+ if (!p)
+ return nfserr_resource;
+ p = xdr_encode_opaque_fixed(p, &scd->se_clientid, 8);
+ p = xdr_encode_opaque_fixed(p, &scd->se_confirm,
+ NFS4_VERIFIER_SIZE);
}
else if (nfserr == nfserr_clid_inuse) {
- RESERVE_SPACE(8);
- WRITE32(0);
- WRITE32(0);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(0);
}
return nfserr;
}
@@ -3319,14 +3556,17 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n
static __be32
nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (!nfserr) {
- RESERVE_SPACE(16);
- WRITE32(write->wr_bytes_written);
- WRITE32(write->wr_how_written);
- WRITEMEM(write->wr_verifier.data, NFS4_VERIFIER_SIZE);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 16);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(write->wr_bytes_written);
+ *p++ = cpu_to_be32(write->wr_how_written);
+ p = xdr_encode_opaque_fixed(p, write->wr_verifier.data,
+ NFS4_VERIFIER_SIZE);
}
return nfserr;
}
@@ -3343,6 +3583,7 @@ static __be32
nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_exchange_id *exid)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
char *major_id;
char *server_scope;
@@ -3358,52 +3599,61 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
server_scope = utsname()->nodename;
server_scope_sz = strlen(server_scope);
- RESERVE_SPACE(
+ p = xdr_reserve_space(xdr,
8 /* eir_clientid */ +
4 /* eir_sequenceid */ +
4 /* eir_flags */ +
- 4 /* spr_how */ +
- 8 /* spo_must_enforce, spo_must_allow */ +
- 8 /* so_minor_id */ +
- 4 /* so_major_id.len */ +
- (XDR_QUADLEN(major_id_sz) * 4) +
- 4 /* eir_server_scope.len */ +
- (XDR_QUADLEN(server_scope_sz) * 4) +
- 4 /* eir_server_impl_id.count (0) */);
+ 4 /* spr_how */);
+ if (!p)
+ return nfserr_resource;
+
+ p = xdr_encode_opaque_fixed(p, &exid->clientid, 8);
+ *p++ = cpu_to_be32(exid->seqid);
+ *p++ = cpu_to_be32(exid->flags);
- WRITEMEM(&exid->clientid, 8);
- WRITE32(exid->seqid);
- WRITE32(exid->flags);
+ *p++ = cpu_to_be32(exid->spa_how);
- WRITE32(exid->spa_how);
switch (exid->spa_how) {
case SP4_NONE:
break;
case SP4_MACH_CRED:
+ /* spo_must_enforce, spo_must_allow */
+ p = xdr_reserve_space(xdr, 16);
+ if (!p)
+ return nfserr_resource;
+
/* spo_must_enforce bitmap: */
- WRITE32(2);
- WRITE32(nfs4_minimal_spo_must_enforce[0]);
- WRITE32(nfs4_minimal_spo_must_enforce[1]);
+ *p++ = cpu_to_be32(2);
+ *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[0]);
+ *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[1]);
/* empty spo_must_allow bitmap: */
- WRITE32(0);
+ *p++ = cpu_to_be32(0);
+
break;
default:
WARN_ON_ONCE(1);
}
+ p = xdr_reserve_space(xdr,
+ 8 /* so_minor_id */ +
+ 4 /* so_major_id.len */ +
+ (XDR_QUADLEN(major_id_sz) * 4) +
+ 4 /* eir_server_scope.len */ +
+ (XDR_QUADLEN(server_scope_sz) * 4) +
+ 4 /* eir_server_impl_id.count (0) */);
+ if (!p)
+ return nfserr_resource;
+
/* The server_owner struct */
- WRITE64(minor_id); /* Minor id */
+ p = xdr_encode_hyper(p, minor_id); /* Minor id */
/* major id */
- WRITE32(major_id_sz);
- WRITEMEM(major_id, major_id_sz);
+ p = xdr_encode_opaque(p, major_id, major_id_sz);
/* Server scope */
- WRITE32(server_scope_sz);
- WRITEMEM(server_scope, server_scope_sz);
+ p = xdr_encode_opaque(p, server_scope, server_scope_sz);
/* Implementation id */
- WRITE32(0); /* zero length nfs_impl_id4 array */
- ADJUST_ARGS();
+ *p++ = cpu_to_be32(0); /* zero length nfs_impl_id4 array */
return 0;
}
@@ -3411,93 +3661,81 @@ static __be32
nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_create_session *sess)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (nfserr)
return nfserr;
- RESERVE_SPACE(24);
- WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN);
- WRITE32(sess->seqid);
- WRITE32(sess->flags);
- ADJUST_ARGS();
-
- RESERVE_SPACE(28);
- WRITE32(0); /* headerpadsz */
- WRITE32(sess->fore_channel.maxreq_sz);
- WRITE32(sess->fore_channel.maxresp_sz);
- WRITE32(sess->fore_channel.maxresp_cached);
- WRITE32(sess->fore_channel.maxops);
- WRITE32(sess->fore_channel.maxreqs);
- WRITE32(sess->fore_channel.nr_rdma_attrs);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 24);
+ if (!p)
+ return nfserr_resource;
+ p = xdr_encode_opaque_fixed(p, sess->sessionid.data,
+ NFS4_MAX_SESSIONID_LEN);
+ *p++ = cpu_to_be32(sess->seqid);
+ *p++ = cpu_to_be32(sess->flags);
+
+ p = xdr_reserve_space(xdr, 28);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(0); /* headerpadsz */
+ *p++ = cpu_to_be32(sess->fore_channel.maxreq_sz);
+ *p++ = cpu_to_be32(sess->fore_channel.maxresp_sz);
+ *p++ = cpu_to_be32(sess->fore_channel.maxresp_cached);
+ *p++ = cpu_to_be32(sess->fore_channel.maxops);
+ *p++ = cpu_to_be32(sess->fore_channel.maxreqs);
+ *p++ = cpu_to_be32(sess->fore_channel.nr_rdma_attrs);
if (sess->fore_channel.nr_rdma_attrs) {
- RESERVE_SPACE(4);
- WRITE32(sess->fore_channel.rdma_attrs);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(sess->fore_channel.rdma_attrs);
}
- RESERVE_SPACE(28);
- WRITE32(0); /* headerpadsz */
- WRITE32(sess->back_channel.maxreq_sz);
- WRITE32(sess->back_channel.maxresp_sz);
- WRITE32(sess->back_channel.maxresp_cached);
- WRITE32(sess->back_channel.maxops);
- WRITE32(sess->back_channel.maxreqs);
- WRITE32(sess->back_channel.nr_rdma_attrs);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 28);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(0); /* headerpadsz */
+ *p++ = cpu_to_be32(sess->back_channel.maxreq_sz);
+ *p++ = cpu_to_be32(sess->back_channel.maxresp_sz);
+ *p++ = cpu_to_be32(sess->back_channel.maxresp_cached);
+ *p++ = cpu_to_be32(sess->back_channel.maxops);
+ *p++ = cpu_to_be32(sess->back_channel.maxreqs);
+ *p++ = cpu_to_be32(sess->back_channel.nr_rdma_attrs);
if (sess->back_channel.nr_rdma_attrs) {
- RESERVE_SPACE(4);
- WRITE32(sess->back_channel.rdma_attrs);
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(sess->back_channel.rdma_attrs);
}
return 0;
}
static __be32
-nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_destroy_session *destroy_session)
-{
- return nfserr;
-}
-
-static __be32
-nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
- struct nfsd4_free_stateid *free_stateid)
-{
- __be32 *p;
-
- if (nfserr)
- return nfserr;
-
- RESERVE_SPACE(4);
- *p++ = nfserr;
- ADJUST_ARGS();
- return nfserr;
-}
-
-static __be32
nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_sequence *seq)
{
+ struct xdr_stream *xdr = &resp->xdr;
__be32 *p;
if (nfserr)
return nfserr;
- RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20);
- WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
- WRITE32(seq->seqid);
- WRITE32(seq->slotid);
+ p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20);
+ if (!p)
+ return nfserr_resource;
+ p = xdr_encode_opaque_fixed(p, seq->sessionid.data,
+ NFS4_MAX_SESSIONID_LEN);
+ *p++ = cpu_to_be32(seq->seqid);
+ *p++ = cpu_to_be32(seq->slotid);
/* Note slotid's are numbered from zero: */
- WRITE32(seq->maxslots - 1); /* sr_highest_slotid */
- WRITE32(seq->maxslots - 1); /* sr_target_highest_slotid */
- WRITE32(seq->status_flags);
+ *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_highest_slotid */
+ *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_target_highest_slotid */
+ *p++ = cpu_to_be32(seq->status_flags);
- ADJUST_ARGS();
- resp->cstate.datap = p; /* DRC cache data pointer */
+ resp->cstate.data_offset = xdr->buf->len; /* DRC cache data pointer */
return 0;
}
@@ -3505,17 +3743,22 @@ static __be32
nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_test_stateid *test_stateid)
{
+ struct xdr_stream *xdr = &resp->xdr;
struct nfsd4_test_stateid_id *stateid, *next;
__be32 *p;
- RESERVE_SPACE(4 + (4 * test_stateid->ts_num_ids));
+ if (nfserr)
+ return nfserr;
+
+ p = xdr_reserve_space(xdr, 4 + (4 * test_stateid->ts_num_ids));
+ if (!p)
+ return nfserr_resource;
*p++ = htonl(test_stateid->ts_num_ids);
list_for_each_entry_safe(stateid, next, &test_stateid->ts_stateid_list, ts_id_list) {
*p++ = stateid->ts_id_status;
}
- ADJUST_ARGS();
return nfserr;
}
@@ -3576,8 +3819,8 @@ static nfsd4_enc nfsd4_enc_ops[] = {
[OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session,
[OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
[OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
- [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session,
- [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_free_stateid,
+ [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
[OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
[OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
[OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
@@ -3594,83 +3837,99 @@ static nfsd4_enc nfsd4_enc_ops[] = {
};
/*
- * Calculate the total amount of memory that the compound response has taken
- * after encoding the current operation with pad.
+ * Calculate whether we still have space to encode repsize bytes.
+ * There are two considerations:
+ * - For NFS versions >=4.1, the size of the reply must stay within
+ * session limits
+ * - For all NFS versions, we must stay within limited preallocated
+ * buffer space.
*
- * pad: if operation is non-idempotent, pad was calculate by op_rsize_bop()
- * which was specified at nfsd4_operation, else pad is zero.
- *
- * Compare this length to the session se_fmaxresp_sz and se_fmaxresp_cached.
- *
- * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
- * will be at least a page and will therefore hold the xdr_buf head.
+ * This is called before the operation is processed, so can only provide
+ * an upper estimate. For some nonidempotent operations (such as
+ * getattr), it's not necessarily a problem if that estimate is wrong,
+ * as we can fail it after processing without significant side effects.
*/
-__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad)
+__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 respsize)
{
- struct xdr_buf *xb = &resp->rqstp->rq_res;
- struct nfsd4_session *session = NULL;
+ struct xdr_buf *buf = &resp->rqstp->rq_res;
struct nfsd4_slot *slot = resp->cstate.slot;
- u32 length, tlen = 0;
+ if (buf->len + respsize <= buf->buflen)
+ return nfs_ok;
if (!nfsd4_has_session(&resp->cstate))
- return 0;
-
- session = resp->cstate.session;
- if (session == NULL)
- return 0;
-
- if (xb->page_len == 0) {
- length = (char *)resp->p - (char *)xb->head[0].iov_base + pad;
- } else {
- if (xb->tail[0].iov_base && xb->tail[0].iov_len > 0)
- tlen = (char *)resp->p - (char *)xb->tail[0].iov_base;
-
- length = xb->head[0].iov_len + xb->page_len + tlen + pad;
- }
- dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
- length, xb->page_len, tlen, pad);
-
- if (length > session->se_fchannel.maxresp_sz)
- return nfserr_rep_too_big;
-
- if ((slot->sl_flags & NFSD4_SLOT_CACHETHIS) &&
- length > session->se_fchannel.maxresp_cached)
+ return nfserr_resource;
+ if (slot->sl_flags & NFSD4_SLOT_CACHETHIS) {
+ WARN_ON_ONCE(1);
return nfserr_rep_too_big_to_cache;
-
- return 0;
+ }
+ return nfserr_rep_too_big;
}
void
nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
{
+ struct xdr_stream *xdr = &resp->xdr;
struct nfs4_stateowner *so = resp->cstate.replay_owner;
- __be32 *statp;
+ struct svc_rqst *rqstp = resp->rqstp;
+ int post_err_offset;
+ nfsd4_enc encoder;
__be32 *p;
- RESERVE_SPACE(8);
- WRITE32(op->opnum);
- statp = p++; /* to be backfilled at the end */
- ADJUST_ARGS();
+ p = xdr_reserve_space(xdr, 8);
+ if (!p) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ *p++ = cpu_to_be32(op->opnum);
+ post_err_offset = xdr->buf->len;
if (op->opnum == OP_ILLEGAL)
goto status;
BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
!nfsd4_enc_ops[op->opnum]);
- op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
- /* nfsd4_check_drc_limit guarantees enough room for error status */
- if (!op->status)
- op->status = nfsd4_check_resp_size(resp, 0);
+ encoder = nfsd4_enc_ops[op->opnum];
+ op->status = encoder(resp, op->status, &op->u);
+ xdr_commit_encode(xdr);
+
+ /* nfsd4_check_resp_size guarantees enough room for error status */
+ if (!op->status) {
+ int space_needed = 0;
+ if (!nfsd4_last_compound_op(rqstp))
+ space_needed = COMPOUND_ERR_SLACK_SPACE;
+ op->status = nfsd4_check_resp_size(resp, space_needed);
+ }
+ if (op->status == nfserr_resource && nfsd4_has_session(&resp->cstate)) {
+ struct nfsd4_slot *slot = resp->cstate.slot;
+
+ if (slot->sl_flags & NFSD4_SLOT_CACHETHIS)
+ op->status = nfserr_rep_too_big_to_cache;
+ else
+ op->status = nfserr_rep_too_big;
+ }
+ if (op->status == nfserr_resource ||
+ op->status == nfserr_rep_too_big ||
+ op->status == nfserr_rep_too_big_to_cache) {
+ /*
+ * The operation may have already been encoded or
+ * partially encoded. No op returns anything additional
+ * in the case of one of these three errors, so we can
+ * just truncate back to after the status. But it's a
+ * bug if we had to do this on a non-idempotent op:
+ */
+ warn_on_nonidempotent_op(op);
+ xdr_truncate_encode(xdr, post_err_offset);
+ }
if (so) {
+ int len = xdr->buf->len - post_err_offset;
+
so->so_replay.rp_status = op->status;
- so->so_replay.rp_buflen = (char *)resp->p - (char *)(statp+1);
- memcpy(so->so_replay.rp_buf, statp+1, so->so_replay.rp_buflen);
+ so->so_replay.rp_buflen = len;
+ read_bytes_from_xdr_buf(xdr->buf, post_err_offset,
+ so->so_replay.rp_buf, len);
}
status:
- /*
- * Note: We write the status directly, instead of using WRITE32(),
- * since it is already in network byte order.
- */
- *statp = op->status;
+ /* Note that op->status is already in network byte order: */
+ write_bytes_to_xdr_buf(xdr->buf, post_err_offset - 4, &op->status, 4);
}
/*
@@ -3682,21 +3941,22 @@ status:
* called with nfs4_lock_state() held
*/
void
-nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
+nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op)
{
__be32 *p;
struct nfs4_replay *rp = op->replay;
BUG_ON(!rp);
- RESERVE_SPACE(8);
- WRITE32(op->opnum);
+ p = xdr_reserve_space(xdr, 8 + rp->rp_buflen);
+ if (!p) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+ *p++ = cpu_to_be32(op->opnum);
*p++ = rp->rp_status; /* already xdr'ed */
- ADJUST_ARGS();
- RESERVE_SPACE(rp->rp_buflen);
- WRITEMEM(rp->rp_buf, rp->rp_buflen);
- ADJUST_ARGS();
+ p = xdr_encode_opaque_fixed(p, rp->rp_buf, rp->rp_buflen);
}
int
@@ -3728,6 +3988,12 @@ int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp)
int
nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compoundargs *args)
{
+ if (rqstp->rq_arg.head[0].iov_len % 4) {
+ /* client is nuts */
+ dprintk("%s: compound not properly padded! (peeraddr=%pISc xid=0x%x)",
+ __func__, svc_addr(rqstp), be32_to_cpu(rqstp->rq_xid));
+ return 0;
+ }
args->p = p;
args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len;
args->pagelist = rqstp->rq_arg.pages;
@@ -3747,19 +4013,19 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
* All that remains is to write the tag and operation count...
*/
struct nfsd4_compound_state *cs = &resp->cstate;
- struct kvec *iov;
+ struct xdr_buf *buf = resp->xdr.buf;
+
+ WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len +
+ buf->tail[0].iov_len);
+
+ rqstp->rq_next_page = resp->xdr.page_ptr + 1;
+
p = resp->tagp;
*p++ = htonl(resp->taglen);
memcpy(p, resp->tag, resp->taglen);
p += XDR_QUADLEN(resp->taglen);
*p++ = htonl(resp->opcnt);
- if (rqstp->rq_res.page_len)
- iov = &rqstp->rq_res.tail[0];
- else
- iov = &rqstp->rq_res.head[0];
- iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
- BUG_ON(iov->iov_len > PAGE_SIZE);
if (nfsd4_has_session(cs)) {
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct nfs4_client *clp = cs->session->se_client;
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 9186c7ce0b1..6040da8830f 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -224,13 +224,6 @@ hash_refile(struct svc_cacherep *rp)
hlist_add_head(&rp->c_hash, cache_hash + hash_32(rp->c_xid, maskbits));
}
-static inline bool
-nfsd_cache_entry_expired(struct svc_cacherep *rp)
-{
- return rp->c_state != RC_INPROG &&
- time_after(jiffies, rp->c_timestamp + RC_EXPIRE);
-}
-
/*
* Walk the LRU list and prune off entries that are older than RC_EXPIRE.
* Also prune the oldest ones when the total exceeds the max number of entries.
@@ -242,8 +235,14 @@ prune_cache_entries(void)
long freed = 0;
list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) {
- if (!nfsd_cache_entry_expired(rp) &&
- num_drc_entries <= max_drc_entries)
+ /*
+ * Don't free entries attached to calls that are still
+ * in-progress, but do keep scanning the list.
+ */
+ if (rp->c_state == RC_INPROG)
+ continue;
+ if (num_drc_entries <= max_drc_entries &&
+ time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
break;
nfsd_reply_cache_free_locked(rp);
freed++;
@@ -409,22 +408,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
/*
* Since the common case is a cache miss followed by an insert,
- * preallocate an entry. First, try to reuse the first entry on the LRU
- * if it works, then go ahead and prune the LRU list.
+ * preallocate an entry.
*/
- spin_lock(&cache_lock);
- if (!list_empty(&lru_head)) {
- rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru);
- if (nfsd_cache_entry_expired(rp) ||
- num_drc_entries >= max_drc_entries) {
- lru_put_end(rp);
- prune_cache_entries();
- goto search_cache;
- }
- }
-
- /* No expired ones available, allocate a new one. */
- spin_unlock(&cache_lock);
rp = nfsd_reply_cache_alloc();
spin_lock(&cache_lock);
if (likely(rp)) {
@@ -432,7 +417,9 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
drc_mem_usage += sizeof(*rp);
}
-search_cache:
+ /* go ahead and prune the cache */
+ prune_cache_entries();
+
found = nfsd_cache_search(rqstp, csum);
if (found) {
if (likely(rp))
@@ -446,15 +433,6 @@ search_cache:
goto out;
}
- /*
- * We're keeping the one we just allocated. Are we now over the
- * limit? Prune one off the tip of the LRU in trade for the one we
- * just allocated if so.
- */
- if (num_drc_entries >= max_drc_entries)
- nfsd_reply_cache_free_locked(list_first_entry(&lru_head,
- struct svc_cacherep, c_lru));
-
nfsdstats.rcmisses++;
rqstp->rq_cacherep = rp;
rp->c_state = RC_INPROG;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 7f555179bf8..51844048937 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -699,6 +699,11 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net)
if (err != 0 || fd < 0)
return -EINVAL;
+ if (svc_alien_sock(net, fd)) {
+ printk(KERN_ERR "%s: socket net is different to NFSd's one\n", __func__);
+ return -EINVAL;
+ }
+
err = nfsd_create_serv(net);
if (err != 0)
return err;
@@ -1174,7 +1179,6 @@ static int __init init_nfsd(void)
retval = nfsd4_init_slabs();
if (retval)
goto out_unregister_pernet;
- nfs4_state_init();
retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
if (retval)
goto out_free_slabs;
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 30f34ab0213..847daf37e56 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -15,11 +15,20 @@
#include <linux/nfs2.h>
#include <linux/nfs3.h>
#include <linux/nfs4.h>
+#include <linux/sunrpc/svc.h>
#include <linux/sunrpc/msg_prot.h>
-#include <linux/nfsd/debug.h>
-#include <linux/nfsd/export.h>
-#include <linux/nfsd/stats.h>
+#include <uapi/linux/nfsd/debug.h>
+
+#include "stats.h"
+#include "export.h"
+
+#undef ifdebug
+#ifdef NFSD_DEBUG
+# define ifdebug(flag) if (nfsd_debug & NFSDDBG_##flag)
+#else
+# define ifdebug(flag) if (0)
+#endif
/*
* nfsd version
@@ -106,7 +115,6 @@ static inline int nfsd_v4client(struct svc_rqst *rq)
*/
#ifdef CONFIG_NFSD_V4
extern unsigned long max_delegations;
-void nfs4_state_init(void);
int nfsd4_init_slabs(void);
void nfsd4_free_slabs(void);
int nfs4_state_start(void);
@@ -117,7 +125,6 @@ void nfs4_reset_lease(time_t leasetime);
int nfs4_reset_recoverydir(char *recdir);
char * nfs4_recoverydir(void);
#else
-static inline void nfs4_state_init(void) { }
static inline int nfsd4_init_slabs(void) { return 0; }
static inline void nfsd4_free_slabs(void) { }
static inline int nfs4_state_start(void) { return 0; }
@@ -282,7 +289,7 @@ void nfsd_lockd_shutdown(void);
* reason.
*/
#define COMPOUND_SLACK_SPACE 140 /* OP_GETFH */
-#define COMPOUND_ERR_SLACK_SPACE 12 /* OP_SETATTR */
+#define COMPOUND_ERR_SLACK_SPACE 16 /* OP_SETATTR */
#define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 3d0e15ae6f7..ec839341815 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -88,9 +88,8 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp,
/* Check if the request originated from a secure port. */
if (!rqstp->rq_secure && !(flags & NFSEXP_INSECURE_PORT)) {
RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
- dprintk(KERN_WARNING
- "nfsd: request from insecure port %s!\n",
- svc_print_addr(rqstp, buf, sizeof(buf)));
+ dprintk("nfsd: request from insecure port %s!\n",
+ svc_print_addr(rqstp, buf, sizeof(buf)));
return nfserr_perm;
}
@@ -169,8 +168,8 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
data_left -= len;
if (data_left < 0)
return error;
- exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth);
- fid = (struct fid *)(fh->fh_auth + len);
+ exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid);
+ fid = (struct fid *)(fh->fh_fsid + len);
} else {
__u32 tfh[2];
dev_t xdev;
@@ -385,7 +384,7 @@ static void _fh_update(struct svc_fh *fhp, struct svc_export *exp,
{
if (dentry != exp->ex_path.dentry) {
struct fid *fid = (struct fid *)
- (fhp->fh_handle.fh_auth + fhp->fh_handle.fh_size/4 - 1);
+ (fhp->fh_handle.fh_fsid + fhp->fh_handle.fh_size/4 - 1);
int maxsize = (fhp->fh_maxsize - fhp->fh_handle.fh_size)/4;
int subtreecheck = !(exp->ex_flags & NFSEXP_NOSUBTREECHECK);
@@ -513,7 +512,6 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
*/
struct inode * inode = dentry->d_inode;
- __u32 *datap;
dev_t ex_dev = exp_sb(exp)->s_dev;
dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %pd2, ino=%ld)\n",
@@ -557,17 +555,16 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
if (inode)
_fh_update_old(dentry, exp, &fhp->fh_handle);
} else {
- int len;
+ fhp->fh_handle.fh_size =
+ key_len(fhp->fh_handle.fh_fsid_type) + 4;
fhp->fh_handle.fh_auth_type = 0;
- datap = fhp->fh_handle.fh_auth+0;
- mk_fsid(fhp->fh_handle.fh_fsid_type, datap, ex_dev,
+
+ mk_fsid(fhp->fh_handle.fh_fsid_type,
+ fhp->fh_handle.fh_fsid,
+ ex_dev,
exp->ex_path.dentry->d_inode->i_ino,
exp->ex_fsid, exp->ex_uuid);
- len = key_len(fhp->fh_handle.fh_fsid_type);
- datap += len/4;
- fhp->fh_handle.fh_size = 4 + len;
-
if (inode)
_fh_update(fhp, exp, dentry);
if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
@@ -598,22 +595,20 @@ fh_update(struct svc_fh *fhp)
_fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle);
} else {
if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT)
- goto out;
+ return 0;
_fh_update(fhp, fhp->fh_export, dentry);
if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
return nfserr_opnotsupp;
}
-out:
return 0;
-
out_bad:
printk(KERN_ERR "fh_update: fh not verified!\n");
- goto out;
+ return nfserr_serverfault;
out_negative:
printk(KERN_ERR "fh_update: %pd2 still negative!\n",
dentry);
- goto out;
+ return nfserr_serverfault;
}
/*
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 4775bc4896c..2e89e70ac15 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -1,9 +1,58 @@
-/* Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */
+/*
+ * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
+ *
+ * This file describes the layout of the file handles as passed
+ * over the wire.
+ */
+#ifndef _LINUX_NFSD_NFSFH_H
+#define _LINUX_NFSD_NFSFH_H
+
+#include <linux/sunrpc/svc.h>
+#include <uapi/linux/nfsd/nfsfh.h>
+
+static inline __u32 ino_t_to_u32(ino_t ino)
+{
+ return (__u32) ino;
+}
-#ifndef _LINUX_NFSD_FH_INT_H
-#define _LINUX_NFSD_FH_INT_H
+static inline ino_t u32_to_ino_t(__u32 uino)
+{
+ return (ino_t) uino;
+}
+
+/*
+ * This is the internal representation of an NFS handle used in knfsd.
+ * pre_mtime/post_version will be used to support wcc_attr's in NFSv3.
+ */
+typedef struct svc_fh {
+ struct knfsd_fh fh_handle; /* FH data */
+ struct dentry * fh_dentry; /* validated dentry */
+ struct svc_export * fh_export; /* export pointer */
+ int fh_maxsize; /* max size for fh_handle */
-#include <linux/nfsd/nfsfh.h>
+ unsigned char fh_locked; /* inode locked by us */
+ unsigned char fh_want_write; /* remount protection taken */
+
+#ifdef CONFIG_NFSD_V3
+ unsigned char fh_post_saved; /* post-op attrs saved */
+ unsigned char fh_pre_saved; /* pre-op attrs saved */
+
+ /* Pre-op attributes saved during fh_lock */
+ __u64 fh_pre_size; /* size before operation */
+ struct timespec fh_pre_mtime; /* mtime before oper */
+ struct timespec fh_pre_ctime; /* ctime before oper */
+ /*
+ * pre-op nfsv4 change attr: note must check IS_I_VERSION(inode)
+ * to find out if it is valid.
+ */
+ u64 fh_pre_change;
+
+ /* Post-op attributes saved in fh_unlock */
+ struct kstat fh_post_attr; /* full attrs after operation */
+ u64 fh_post_change; /* nfsv4 change; see above */
+#endif /* CONFIG_NFSD_V3 */
+
+} svc_fh;
enum nfsd_fsid {
FSID_DEV = 0,
@@ -133,6 +182,17 @@ fh_init(struct svc_fh *fhp, int maxsize)
#ifdef CONFIG_NFSD_V3
/*
+ * The wcc data stored in current_fh should be cleared
+ * between compound ops.
+ */
+static inline void
+fh_clear_wcc(struct svc_fh *fhp)
+{
+ fhp->fh_post_saved = 0;
+ fhp->fh_pre_saved = 0;
+}
+
+/*
* Fill in the pre_op attr for the wcc data
*/
static inline void
@@ -152,7 +212,8 @@ fill_pre_wcc(struct svc_fh *fhp)
extern void fill_post_wcc(struct svc_fh *);
#else
-#define fill_pre_wcc(ignored)
+#define fh_clear_wcc(ignored)
+#define fill_pre_wcc(ignored)
#define fill_post_wcc(notused)
#endif /* CONFIG_NFSD_V3 */
@@ -203,4 +264,4 @@ fh_unlock(struct svc_fh *fhp)
}
}
-#endif /* _LINUX_NFSD_FH_INT_H */
+#endif /* _LINUX_NFSD_NFSFH_H */
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 760c85a6f53..1879e43f286 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -241,6 +241,15 @@ static void nfsd_shutdown_generic(void)
nfsd_racache_shutdown();
}
+static bool nfsd_needs_lockd(void)
+{
+#if defined(CONFIG_NFSD_V3)
+ return (nfsd_versions[2] != NULL) || (nfsd_versions[3] != NULL);
+#else
+ return (nfsd_versions[2] != NULL);
+#endif
+}
+
static int nfsd_startup_net(int nrservs, struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -255,9 +264,14 @@ static int nfsd_startup_net(int nrservs, struct net *net)
ret = nfsd_init_socks(net);
if (ret)
goto out_socks;
- ret = lockd_up(net);
- if (ret)
- goto out_socks;
+
+ if (nfsd_needs_lockd() && !nn->lockd_up) {
+ ret = lockd_up(net);
+ if (ret)
+ goto out_socks;
+ nn->lockd_up = 1;
+ }
+
ret = nfs4_state_start_net(net);
if (ret)
goto out_lockd;
@@ -266,7 +280,10 @@ static int nfsd_startup_net(int nrservs, struct net *net)
return 0;
out_lockd:
- lockd_down(net);
+ if (nn->lockd_up) {
+ lockd_down(net);
+ nn->lockd_up = 0;
+ }
out_socks:
nfsd_shutdown_generic();
return ret;
@@ -277,7 +294,10 @@ static void nfsd_shutdown_net(struct net *net)
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
nfs4_state_shutdown_net(net);
- lockd_down(net);
+ if (nn->lockd_up) {
+ lockd_down(net);
+ nn->lockd_up = 0;
+ }
nn->nfsd_net_up = false;
nfsd_shutdown_generic();
}
@@ -571,12 +591,6 @@ nfsd(void *vrqstp)
nfsdstats.th_cnt++;
mutex_unlock(&nfsd_mutex);
- /*
- * We want less throttling in balance_dirty_pages() so that nfs to
- * localhost doesn't cause nfsd to lock up due to all the client's
- * dirty pages.
- */
- current->flags |= PF_LESS_THROTTLE;
set_freezable();
/*
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 9c769a47ac5..1ac306b769d 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -214,7 +214,8 @@ nfssvc_decode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
int
nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p, struct nfsd_fhandle *args)
{
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
return xdr_argsize_check(rqstp, p);
}
@@ -248,7 +249,8 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
{
unsigned int len;
int v;
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
args->offset = ntohl(*p++);
@@ -281,7 +283,8 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
unsigned int len, hdr, dlen;
int v;
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
p++; /* beginoffset */
@@ -355,7 +358,8 @@ nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p,
int
nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readlinkargs *args)
{
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
args->buffer = page_address(*(rqstp->rq_next_page++));
@@ -391,7 +395,8 @@ int
nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
struct nfsd_readdirargs *args)
{
- if (!(p = decode_fh(p, &args->fh)))
+ p = decode_fh(p, &args->fh);
+ if (!p)
return 0;
args->cookie = ntohl(*p++);
args->count = ntohl(*p++);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 424d8f5f231..374c66283ac 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -37,7 +37,6 @@
#include <linux/idr.h>
#include <linux/sunrpc/svc_xprt.h>
-#include <linux/nfsd/nfsfh.h>
#include "nfsfh.h"
typedef struct {
@@ -123,7 +122,7 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
/* Maximum number of operations per session compound */
#define NFSD_MAX_OPS_PER_COMPOUND 16
/* Maximum session per slot cache size */
-#define NFSD_SLOT_CACHE_SIZE 1024
+#define NFSD_SLOT_CACHE_SIZE 2048
/* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
#define NFSD_CACHE_SIZE_SLOTS_PER_SESSION 32
#define NFSD_MAX_MEM_PER_SESSION \
@@ -464,8 +463,6 @@ extern void nfs4_release_reclaim(struct nfsd_net *);
extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
struct nfsd_net *nn);
extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn);
-extern void nfs4_free_openowner(struct nfs4_openowner *);
-extern void nfs4_free_lockowner(struct nfs4_lockowner *);
extern int set_callback_cred(void);
extern void nfsd4_init_callback(struct nfsd4_callback *);
extern void nfsd4_probe_callback(struct nfs4_client *clp);
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 6d4521feb6e..cd90878a76a 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -24,7 +24,6 @@
#include <linux/seq_file.h>
#include <linux/module.h>
#include <linux/sunrpc/stats.h>
-#include <linux/nfsd/stats.h>
#include <net/net_namespace.h>
#include "nfsd.h"
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
new file mode 100644
index 00000000000..a5c944b771c
--- /dev/null
+++ b/fs/nfsd/stats.h
@@ -0,0 +1,43 @@
+/*
+ * Statistics for NFS server.
+ *
+ * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
+ */
+#ifndef _NFSD_STATS_H
+#define _NFSD_STATS_H
+
+#include <uapi/linux/nfsd/stats.h>
+
+
+struct nfsd_stats {
+ unsigned int rchits; /* repcache hits */
+ unsigned int rcmisses; /* repcache hits */
+ unsigned int rcnocache; /* uncached reqs */
+ unsigned int fh_stale; /* FH stale error */
+ unsigned int fh_lookup; /* dentry cached */
+ unsigned int fh_anon; /* anon file dentry returned */
+ unsigned int fh_nocache_dir; /* filehandle not found in dcache */
+ unsigned int fh_nocache_nondir; /* filehandle not found in dcache */
+ unsigned int io_read; /* bytes returned to read requests */
+ unsigned int io_write; /* bytes passed in write requests */
+ unsigned int th_cnt; /* number of available threads */
+ unsigned int th_usage[10]; /* number of ticks during which n perdeciles
+ * of available threads were in use */
+ unsigned int th_fullcnt; /* number of times last free thread was used */
+ unsigned int ra_size; /* size of ra cache */
+ unsigned int ra_depth[11]; /* number of times ra entry was found that deep
+ * in the cache (10percentiles). [10] = not found */
+#ifdef CONFIG_NFSD_V4
+ unsigned int nfs4_opcount[LAST_NFS4_OP + 1]; /* count of individual nfsv4 operations */
+#endif
+
+};
+
+
+extern struct nfsd_stats nfsdstats;
+extern struct svc_stat nfsd_svcstats;
+
+void nfsd_stat_init(void);
+void nfsd_stat_shutdown(void);
+
+#endif /* _NFSD_STATS_H */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 13886f7f40d..140c496f612 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -207,7 +207,12 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out_nfserr;
}
} else {
- fh_lock(fhp);
+ /*
+ * In the nfsd4_open() case, this may be held across
+ * subsequent open and delegation acquisition which may
+ * need to take the child's i_mutex:
+ */
+ fh_lock_nested(fhp, I_MUTEX_PARENT);
dentry = lookup_one_len(name, dparent, len);
host_err = PTR_ERR(dentry);
if (IS_ERR(dentry))
@@ -273,13 +278,6 @@ out:
return err;
}
-static int nfsd_break_lease(struct inode *inode)
-{
- if (!S_ISREG(inode->i_mode))
- return 0;
- return break_lease(inode, O_WRONLY | O_NONBLOCK);
-}
-
/*
* Commit metadata changes to stable storage.
*/
@@ -298,41 +296,12 @@ commit_metadata(struct svc_fh *fhp)
}
/*
- * Set various file attributes.
- * N.B. After this call fhp needs an fh_put
+ * Go over the attributes and take care of the small differences between
+ * NFS semantics and what Linux expects.
*/
-__be32
-nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
- int check_guard, time_t guardtime)
+static void
+nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap)
{
- struct dentry *dentry;
- struct inode *inode;
- int accmode = NFSD_MAY_SATTR;
- umode_t ftype = 0;
- __be32 err;
- int host_err;
- int size_change = 0;
-
- if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
- accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
- if (iap->ia_valid & ATTR_SIZE)
- ftype = S_IFREG;
-
- /* Get inode */
- err = fh_verify(rqstp, fhp, ftype, accmode);
- if (err)
- goto out;
-
- dentry = fhp->fh_dentry;
- inode = dentry->d_inode;
-
- /* Ignore any mode updates on symlinks */
- if (S_ISLNK(inode->i_mode))
- iap->ia_valid &= ~ATTR_MODE;
-
- if (!iap->ia_valid)
- goto out;
-
/*
* NFSv2 does not differentiate between "set-[ac]time-to-now"
* which only requires access, and "set-[ac]time-to-X" which
@@ -342,8 +311,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
* convert to "set to now" instead of "set to explicit time"
*
* We only call inode_change_ok as the last test as technically
- * it is not an interface that we should be using. It is only
- * valid if the filesystem does not define it's own i_op->setattr.
+ * it is not an interface that we should be using.
*/
#define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET)
#define MAX_TOUCH_TIME_ERROR (30*60)
@@ -369,30 +337,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
iap->ia_valid &= ~BOTH_TIME_SET;
}
}
-
- /*
- * The size case is special.
- * It changes the file as well as the attributes.
- */
- if (iap->ia_valid & ATTR_SIZE) {
- if (iap->ia_size < inode->i_size) {
- err = nfsd_permission(rqstp, fhp->fh_export, dentry,
- NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE);
- if (err)
- goto out;
- }
-
- host_err = get_write_access(inode);
- if (host_err)
- goto out_nfserr;
-
- size_change = 1;
- host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
- if (host_err) {
- put_write_access(inode);
- goto out_nfserr;
- }
- }
/* sanitize the mode change */
if (iap->ia_valid & ATTR_MODE) {
@@ -402,8 +346,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
/* Revoke setuid/setgid on chown */
if (!S_ISDIR(inode->i_mode) &&
- (((iap->ia_valid & ATTR_UID) && !uid_eq(iap->ia_uid, inode->i_uid)) ||
- ((iap->ia_valid & ATTR_GID) && !gid_eq(iap->ia_gid, inode->i_gid)))) {
+ ((iap->ia_valid & ATTR_UID) || (iap->ia_valid & ATTR_GID))) {
iap->ia_valid |= ATTR_KILL_PRIV;
if (iap->ia_valid & ATTR_MODE) {
/* we're setting mode too, just clear the s*id bits */
@@ -415,186 +358,118 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID);
}
}
-
- /* Change the attributes. */
-
- iap->ia_valid |= ATTR_CTIME;
-
- err = nfserr_notsync;
- if (!check_guard || guardtime == inode->i_ctime.tv_sec) {
- host_err = nfsd_break_lease(inode);
- if (host_err)
- goto out_nfserr;
- fh_lock(fhp);
-
- host_err = notify_change(dentry, iap);
- err = nfserrno(host_err);
- fh_unlock(fhp);
- }
- if (size_change)
- put_write_access(inode);
- if (!err)
- commit_metadata(fhp);
-out:
- return err;
-
-out_nfserr:
- err = nfserrno(host_err);
- goto out;
}
-#if defined(CONFIG_NFSD_V2_ACL) || \
- defined(CONFIG_NFSD_V3_ACL) || \
- defined(CONFIG_NFSD_V4)
-static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf)
+static __be32
+nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct iattr *iap)
{
- ssize_t buflen;
- ssize_t ret;
-
- buflen = vfs_getxattr(dentry, key, NULL, 0);
- if (buflen <= 0)
- return buflen;
+ struct inode *inode = fhp->fh_dentry->d_inode;
+ int host_err;
- *buf = kmalloc(buflen, GFP_KERNEL);
- if (!*buf)
- return -ENOMEM;
+ if (iap->ia_size < inode->i_size) {
+ __be32 err;
- ret = vfs_getxattr(dentry, key, *buf, buflen);
- if (ret < 0)
- kfree(*buf);
- return ret;
-}
-#endif
+ err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
+ NFSD_MAY_TRUNC | NFSD_MAY_OWNER_OVERRIDE);
+ if (err)
+ return err;
+ }
-#if defined(CONFIG_NFSD_V4)
-static int
-set_nfsv4_acl_one(struct dentry *dentry, struct posix_acl *pacl, char *key)
-{
- int len;
- size_t buflen;
- char *buf = NULL;
- int error = 0;
-
- buflen = posix_acl_xattr_size(pacl->a_count);
- buf = kmalloc(buflen, GFP_KERNEL);
- error = -ENOMEM;
- if (buf == NULL)
- goto out;
+ host_err = get_write_access(inode);
+ if (host_err)
+ goto out_nfserrno;
- len = posix_acl_to_xattr(&init_user_ns, pacl, buf, buflen);
- if (len < 0) {
- error = len;
- goto out;
- }
+ host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
+ if (host_err)
+ goto out_put_write_access;
+ return 0;
- error = vfs_setxattr(dentry, key, buf, len, 0);
-out:
- kfree(buf);
- return error;
+out_put_write_access:
+ put_write_access(inode);
+out_nfserrno:
+ return nfserrno(host_err);
}
+/*
+ * Set various file attributes. After this call fhp needs an fh_put.
+ */
__be32
-nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
- struct nfs4_acl *acl)
+nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
+ int check_guard, time_t guardtime)
{
- __be32 error;
- int host_error;
- struct dentry *dentry;
- struct inode *inode;
- struct posix_acl *pacl = NULL, *dpacl = NULL;
- unsigned int flags = 0;
+ struct dentry *dentry;
+ struct inode *inode;
+ int accmode = NFSD_MAY_SATTR;
+ umode_t ftype = 0;
+ __be32 err;
+ int host_err;
+ bool get_write_count;
+ int size_change = 0;
+
+ if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
+ accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
+ if (iap->ia_valid & ATTR_SIZE)
+ ftype = S_IFREG;
+
+ /* Callers that do fh_verify should do the fh_want_write: */
+ get_write_count = !fhp->fh_dentry;
/* Get inode */
- error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR);
- if (error)
- return error;
+ err = fh_verify(rqstp, fhp, ftype, accmode);
+ if (err)
+ goto out;
+ if (get_write_count) {
+ host_err = fh_want_write(fhp);
+ if (host_err)
+ return nfserrno(host_err);
+ }
dentry = fhp->fh_dentry;
inode = dentry->d_inode;
- if (S_ISDIR(inode->i_mode))
- flags = NFS4_ACL_DIR;
- host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
- if (host_error == -EINVAL) {
- return nfserr_attrnotsupp;
- } else if (host_error < 0)
- goto out_nfserr;
+ /* Ignore any mode updates on symlinks */
+ if (S_ISLNK(inode->i_mode))
+ iap->ia_valid &= ~ATTR_MODE;
- host_error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS);
- if (host_error < 0)
- goto out_release;
+ if (!iap->ia_valid)
+ goto out;
- if (S_ISDIR(inode->i_mode))
- host_error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT);
+ nfsd_sanitize_attrs(inode, iap);
-out_release:
- posix_acl_release(pacl);
- posix_acl_release(dpacl);
-out_nfserr:
- if (host_error == -EOPNOTSUPP)
- return nfserr_attrnotsupp;
- else
- return nfserrno(host_error);
-}
+ /*
+ * The size case is special, it changes the file in addition to the
+ * attributes.
+ */
+ if (iap->ia_valid & ATTR_SIZE) {
+ err = nfsd_get_write_access(rqstp, fhp, iap);
+ if (err)
+ goto out;
+ size_change = 1;
+ }
-static struct posix_acl *
-_get_posix_acl(struct dentry *dentry, char *key)
-{
- void *buf = NULL;
- struct posix_acl *pacl = NULL;
- int buflen;
-
- buflen = nfsd_getxattr(dentry, key, &buf);
- if (!buflen)
- buflen = -ENODATA;
- if (buflen <= 0)
- return ERR_PTR(buflen);
-
- pacl = posix_acl_from_xattr(&init_user_ns, buf, buflen);
- kfree(buf);
- return pacl;
-}
+ iap->ia_valid |= ATTR_CTIME;
-int
-nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl)
-{
- struct inode *inode = dentry->d_inode;
- int error = 0;
- struct posix_acl *pacl = NULL, *dpacl = NULL;
- unsigned int flags = 0;
-
- pacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_ACCESS);
- if (IS_ERR(pacl) && PTR_ERR(pacl) == -ENODATA)
- pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
- if (IS_ERR(pacl)) {
- error = PTR_ERR(pacl);
- pacl = NULL;
- goto out;
+ if (check_guard && guardtime != inode->i_ctime.tv_sec) {
+ err = nfserr_notsync;
+ goto out_put_write_access;
}
- if (S_ISDIR(inode->i_mode)) {
- dpacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_DEFAULT);
- if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA)
- dpacl = NULL;
- else if (IS_ERR(dpacl)) {
- error = PTR_ERR(dpacl);
- dpacl = NULL;
- goto out;
- }
- flags = NFS4_ACL_DIR;
- }
+ fh_lock(fhp);
+ host_err = notify_change(dentry, iap, NULL);
+ fh_unlock(fhp);
+ err = nfserrno(host_err);
- *acl = nfs4_acl_posix_to_nfsv4(pacl, dpacl, flags);
- if (IS_ERR(*acl)) {
- error = PTR_ERR(*acl);
- *acl = NULL;
- }
- out:
- posix_acl_release(pacl);
- posix_acl_release(dpacl);
- return error;
+out_put_write_access:
+ if (size_change)
+ put_write_access(inode);
+ if (!err)
+ commit_metadata(fhp);
+out:
+ return err;
}
+#if defined(CONFIG_NFSD_V4)
/*
* NFS junction information is stored in an extended attribute.
*/
@@ -945,51 +820,54 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
}
-static __be32
-nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
- loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
+__be32 nfsd_finish_read(struct file *file, unsigned long *count, int host_err)
{
- mm_segment_t oldfs;
- __be32 err;
- int host_err;
-
- err = nfserr_perm;
-
- if (file->f_op->splice_read && rqstp->rq_splice_ok) {
- struct splice_desc sd = {
- .len = 0,
- .total_len = *count,
- .pos = offset,
- .u.data = rqstp,
- };
-
- rqstp->rq_next_page = rqstp->rq_respages + 1;
- host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
- } else {
- oldfs = get_fs();
- set_fs(KERNEL_DS);
- host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);
- set_fs(oldfs);
- }
-
if (host_err >= 0) {
nfsdstats.io_read += host_err;
*count = host_err;
- err = 0;
fsnotify_access(file);
+ return 0;
} else
- err = nfserrno(host_err);
- return err;
+ return nfserrno(host_err);
}
-static void kill_suid(struct dentry *dentry)
+int nfsd_splice_read(struct svc_rqst *rqstp,
+ struct file *file, loff_t offset, unsigned long *count)
{
- struct iattr ia;
- ia.ia_valid = ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
+ struct splice_desc sd = {
+ .len = 0,
+ .total_len = *count,
+ .pos = offset,
+ .u.data = rqstp,
+ };
+ int host_err;
- mutex_lock(&dentry->d_inode->i_mutex);
- notify_change(dentry, &ia);
- mutex_unlock(&dentry->d_inode->i_mutex);
+ rqstp->rq_next_page = rqstp->rq_respages + 1;
+ host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
+ return nfsd_finish_read(file, count, host_err);
+}
+
+int nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen,
+ unsigned long *count)
+{
+ mm_segment_t oldfs;
+ int host_err;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ host_err = vfs_readv(file, (struct iovec __user *)vec, vlen, &offset);
+ set_fs(oldfs);
+ return nfsd_finish_read(file, count, host_err);
+}
+
+static __be32
+nfsd_vfs_read(struct svc_rqst *rqstp, struct file *file,
+ loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
+{
+ if (file->f_op->splice_read && rqstp->rq_splice_ok)
+ return nfsd_splice_read(rqstp, file, offset, count);
+ else
+ return nfsd_readv(file, offset, vec, vlen, count);
}
/*
@@ -1043,6 +921,16 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
int stable = *stablep;
int use_wgather;
loff_t pos = offset;
+ unsigned int pflags = current->flags;
+
+ if (rqstp->rq_local)
+ /*
+ * We want less throttling in balance_dirty_pages()
+ * and shrink_inactive_list() so that nfs to
+ * localhost doesn't cause nfsd to lock up due to all
+ * the client's dirty pages or its congested queue.
+ */
+ current->flags |= PF_LESS_THROTTLE;
dentry = file->f_path.dentry;
inode = dentry->d_inode;
@@ -1063,10 +951,6 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
nfsdstats.io_write += host_err;
fsnotify_modify(file);
- /* clear setuid/setgid flag after write */
- if (inode->i_mode & (S_ISUID | S_ISGID))
- kill_suid(dentry);
-
if (stable) {
if (use_wgather)
host_err = wait_for_concurrent_writes(file);
@@ -1080,36 +964,33 @@ out_nfserr:
err = 0;
else
err = nfserrno(host_err);
+ if (rqstp->rq_local)
+ tsk_restore_flags(current, pflags, PF_LESS_THROTTLE);
return err;
}
-/*
- * Read data from a file. count must contain the requested read count
- * on entry. On return, *count contains the number of bytes actually read.
- * N.B. After this call fhp needs an fh_put
- */
-__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
- loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
+__be32 nfsd_get_tmp_read_open(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct file **file, struct raparms **ra)
{
- struct file *file;
struct inode *inode;
- struct raparms *ra;
__be32 err;
- err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
+ err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, file);
if (err)
return err;
- inode = file_inode(file);
+ inode = file_inode(*file);
/* Get readahead parameters */
- ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
-
- if (ra && ra->p_set)
- file->f_ra = ra->p_ra;
+ *ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino);
- err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
+ if (*ra && (*ra)->p_set)
+ (*file)->f_ra = (*ra)->p_ra;
+ return nfs_ok;
+}
+void nfsd_put_tmp_read_open(struct file *file, struct raparms *ra)
+{
/* Write back readahead params */
if (ra) {
struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex];
@@ -1119,28 +1000,29 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
ra->p_count--;
spin_unlock(&rab->pb_lock);
}
-
nfsd_close(file);
- return err;
}
-/* As above, but use the provided file descriptor. */
-__be32
-nfsd_read_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
- loff_t offset, struct kvec *vec, int vlen,
- unsigned long *count)
+/*
+ * Read data from a file. count must contain the requested read count
+ * on entry. On return, *count contains the number of bytes actually read.
+ * N.B. After this call fhp needs an fh_put
+ */
+__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ loff_t offset, struct kvec *vec, int vlen, unsigned long *count)
{
- __be32 err;
+ struct file *file;
+ struct raparms *ra;
+ __be32 err;
+
+ err = nfsd_get_tmp_read_open(rqstp, fhp, &file, &ra);
+ if (err)
+ return err;
+
+ err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count);
+
+ nfsd_put_tmp_read_open(file, ra);
- if (file) {
- err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
- NFSD_MAY_READ|NFSD_MAY_OWNER_OVERRIDE);
- if (err)
- goto out;
- err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count);
- } else /* Note file may still be NULL in NFSv4 special stateid case: */
- err = nfsd_read(rqstp, fhp, offset, vec, vlen, count);
-out:
return err;
}
@@ -1731,12 +1613,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
err = nfserr_noent;
if (!dold->d_inode)
goto out_dput;
- host_err = nfsd_break_lease(dold->d_inode);
- if (host_err) {
- err = nfserrno(host_err);
- goto out_dput;
- }
- host_err = vfs_link(dold, dirp, dnew);
+ host_err = vfs_link(dold, dirp, dnew, NULL);
if (!host_err) {
err = nfserrno(commit_metadata(ffhp));
if (!err)
@@ -1829,15 +1706,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
goto out_dput_new;
- host_err = nfsd_break_lease(odentry->d_inode);
- if (host_err)
- goto out_dput_new;
- if (ndentry->d_inode) {
- host_err = nfsd_break_lease(ndentry->d_inode);
- if (host_err)
- goto out_dput_new;
- }
- host_err = vfs_rename(fdir, odentry, tdir, ndentry);
+ host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
if (!host_err) {
host_err = commit_metadata(tfhp);
if (!host_err)
@@ -1849,10 +1718,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
dput(odentry);
out_nfserr:
err = nfserrno(host_err);
-
- /* we cannot reply on fh_unlock on the two filehandles,
+ /*
+ * We cannot rely on fh_unlock on the two filehandles,
* as that would do the wrong thing if the two directories
- * were the same, so again we do it by hand
+ * were the same, so again we do it by hand.
*/
fill_post_wcc(ffhp);
fill_post_wcc(tfhp);
@@ -1906,16 +1775,12 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
if (!type)
type = rdentry->d_inode->i_mode & S_IFMT;
- host_err = nfsd_break_lease(rdentry->d_inode);
- if (host_err)
- goto out_put;
if (type != S_IFDIR)
- host_err = vfs_unlink(dirp, rdentry);
+ host_err = vfs_unlink(dirp, rdentry, NULL);
else
host_err = vfs_rmdir(dirp, rdentry);
if (!host_err)
host_err = commit_metadata(fhp);
-out_put:
dput(rdentry);
out_nfserr:
@@ -2255,93 +2120,3 @@ out_nomem:
nfsd_racache_shutdown();
return -ENOMEM;
}
-
-#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
-struct posix_acl *
-nfsd_get_posix_acl(struct svc_fh *fhp, int type)
-{
- struct inode *inode = fhp->fh_dentry->d_inode;
- char *name;
- void *value = NULL;
- ssize_t size;
- struct posix_acl *acl;
-
- if (!IS_POSIXACL(inode))
- return ERR_PTR(-EOPNOTSUPP);
-
- switch (type) {
- case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
- break;
- case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
- break;
- default:
- return ERR_PTR(-EOPNOTSUPP);
- }
-
- size = nfsd_getxattr(fhp->fh_dentry, name, &value);
- if (size < 0)
- return ERR_PTR(size);
-
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- kfree(value);
- return acl;
-}
-
-int
-nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
-{
- struct inode *inode = fhp->fh_dentry->d_inode;
- char *name;
- void *value = NULL;
- size_t size;
- int error;
-
- if (!IS_POSIXACL(inode) ||
- !inode->i_op->setxattr || !inode->i_op->removexattr)
- return -EOPNOTSUPP;
- switch(type) {
- case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
- break;
- case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
- break;
- default:
- return -EOPNOTSUPP;
- }
-
- if (acl && acl->a_count) {
- size = posix_acl_xattr_size(acl->a_count);
- value = kmalloc(size, GFP_KERNEL);
- if (!value)
- return -ENOMEM;
- error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
- if (error < 0)
- goto getout;
- size = error;
- } else
- size = 0;
-
- error = fh_want_write(fhp);
- if (error)
- goto getout;
- if (size)
- error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0);
- else {
- if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT)
- error = 0;
- else {
- error = vfs_removexattr(fhp->fh_dentry, name);
- if (error == -ENODATA)
- error = 0;
- }
- }
- fh_drop_write(fhp);
-
-getout:
- kfree(value);
- return error;
-}
-#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index a4be2e38967..91b6ae3f658 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -52,9 +52,6 @@ __be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *,
struct iattr *, int, time_t);
int nfsd_mountpoint(struct dentry *, struct svc_export *);
#ifdef CONFIG_NFSD_V4
-__be32 nfsd4_set_nfs4_acl(struct svc_rqst *, struct svc_fh *,
- struct nfs4_acl *);
-int nfsd4_get_nfs4_acl(struct svc_rqst *, struct dentry *, struct nfs4_acl **);
__be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
struct xdr_netobj *);
#endif /* CONFIG_NFSD_V4 */
@@ -73,10 +70,16 @@ __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *,
__be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
int, struct file **);
void nfsd_close(struct file *);
+struct raparms;
+__be32 nfsd_get_tmp_read_open(struct svc_rqst *, struct svc_fh *,
+ struct file **, struct raparms **);
+void nfsd_put_tmp_read_open(struct file *, struct raparms *);
+int nfsd_splice_read(struct svc_rqst *,
+ struct file *, loff_t, unsigned long *);
+int nfsd_readv(struct file *, loff_t, struct kvec *, int,
+ unsigned long *);
__be32 nfsd_read(struct svc_rqst *, struct svc_fh *,
loff_t, struct kvec *, int, unsigned long *);
-__be32 nfsd_read_file(struct svc_rqst *, struct svc_fh *, struct file *,
- loff_t, struct kvec *, int, unsigned long *);
__be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
loff_t, struct kvec *,int, unsigned long *, int *);
__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *,
@@ -89,8 +92,6 @@ __be32 nfsd_link(struct svc_rqst *, struct svc_fh *,
__be32 nfsd_rename(struct svc_rqst *,
struct svc_fh *, char *, int,
struct svc_fh *, char *, int);
-__be32 nfsd_remove(struct svc_rqst *,
- struct svc_fh *, char *, int);
__be32 nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type,
char *name, int len);
__be32 nfsd_readdir(struct svc_rqst *, struct svc_fh *,
@@ -101,11 +102,6 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *,
__be32 nfsd_permission(struct svc_rqst *, struct svc_export *,
struct dentry *, int);
-#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
-struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int);
-int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *);
-#endif
-
static inline int fh_want_write(struct svc_fh *fh)
{
int ret = mnt_want_write(fh->fh_export->ex_path.mnt);
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index b6d5542a4ac..335e04aaf7d 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -174,6 +174,9 @@ struct nfsd3_linkres {
struct nfsd3_readdirres {
__be32 status;
struct svc_fh fh;
+ /* Just to save kmalloc on every readdirplus entry (svc_fh is a
+ * little large for the stack): */
+ struct svc_fh scratch;
int count;
__be32 verf[2];
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index b3ed6446ed8..18cbb6d9c8a 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -58,7 +58,7 @@ struct nfsd4_compound_state {
/* For sessions DRC */
struct nfsd4_session *session;
struct nfsd4_slot *slot;
- __be32 *datap;
+ int data_offset;
size_t iovlen;
u32 minorversion;
__be32 status;
@@ -228,7 +228,7 @@ struct nfsd4_open {
u32 op_create; /* request */
u32 op_createmode; /* request */
u32 op_bmval[3]; /* request */
- struct iattr iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */
+ struct iattr op_iattr; /* UNCHECKED4, GUARDED4, EXCLUSIVE4_1 */
nfs4_verifier op_verf __attribute__((aligned(32)));
/* EXCLUSIVE4 */
clientid_t op_clientid; /* request */
@@ -250,7 +250,6 @@ struct nfsd4_open {
struct nfs4_acl *op_acl;
struct xdr_netobj op_label;
};
-#define op_iattr iattr
struct nfsd4_open_confirm {
stateid_t oc_req_stateid /* request */;
@@ -288,9 +287,8 @@ struct nfsd4_readdir {
struct svc_fh * rd_fhp; /* response */
struct readdir_cd common;
- __be32 * buffer;
- int buflen;
- __be32 * offset;
+ struct xdr_stream *xdr;
+ int cookie_offset;
};
struct nfsd4_release_lockowner {
@@ -374,7 +372,6 @@ struct nfsd4_test_stateid {
struct nfsd4_free_stateid {
stateid_t fr_stateid; /* request */
- __be32 fr_status; /* response */
};
/* also used for NVERIFY */
@@ -508,9 +505,7 @@ struct nfsd4_compoundargs {
struct nfsd4_compoundres {
/* scratch variables for XDR encode */
- __be32 * p;
- __be32 * end;
- struct xdr_buf * xbuf;
+ struct xdr_stream xdr;
struct svc_rqst * rqstp;
u32 taglen;
@@ -540,6 +535,9 @@ static inline bool nfsd4_last_compound_op(struct svc_rqst *rqstp)
return argp->opcnt == resp->opcnt;
}
+int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op);
+void warn_on_nonidempotent_op(struct nfsd4_op *op);
+
#define NFS4_SVC_XDRSIZE sizeof(struct nfsd4_compoundargs)
static inline void
@@ -565,10 +563,11 @@ int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *,
struct nfsd4_compoundres *);
__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
-void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op);
-__be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
- struct dentry *dentry, __be32 **buffer, int countp,
- u32 *bmval, struct svc_rqst *, int ignore_crossmnt);
+void nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op);
+__be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words,
+ struct svc_fh *fhp, struct svc_export *exp,
+ struct dentry *dentry,
+ u32 *bmval, struct svc_rqst *, int ignore_crossmnt);
extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp,
struct nfsd4_compound_state *,
struct nfsd4_setclientid *setclid);
@@ -576,8 +575,6 @@ extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
struct nfsd4_compound_state *,
struct nfsd4_setclientid_confirm *setclientid_confirm);
extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp);
-extern __be32 nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
- struct nfsd4_sequence *seq);
extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *);
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index deaa3d33a0a..0d58075f34e 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -942,6 +942,18 @@ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
struct inode *cpfile;
int err;
+ if (cpsize > sb->s_blocksize) {
+ printk(KERN_ERR
+ "NILFS: too large checkpoint size: %zu bytes.\n",
+ cpsize);
+ return -EINVAL;
+ } else if (cpsize < NILFS_MIN_CHECKPOINT_SIZE) {
+ printk(KERN_ERR
+ "NILFS: too small checkpoint size: %zu bytes.\n",
+ cpsize);
+ return -EINVAL;
+ }
+
cpfile = nilfs_iget_locked(sb, NULL, NILFS_CPFILE_INO);
if (unlikely(!cpfile))
return -ENOMEM;
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index fa0f80308c2..0d5fada9119 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -484,6 +484,18 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size,
struct nilfs_dat_info *di;
int err;
+ if (entry_size > sb->s_blocksize) {
+ printk(KERN_ERR
+ "NILFS: too large DAT entry size: %zu bytes.\n",
+ entry_size);
+ return -EINVAL;
+ } else if (entry_size < NILFS_MIN_DAT_ENTRY_SIZE) {
+ printk(KERN_ERR
+ "NILFS: too small DAT entry size: %zu bytes.\n",
+ entry_size);
+ return -EINVAL;
+ }
+
dat = nilfs_iget_locked(sb, NULL, NILFS_DAT_INO);
if (unlikely(!dat))
return -ENOMEM;
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 08fdb77852a..24978153c0c 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -134,6 +134,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
static const struct vm_operations_struct nilfs_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = nilfs_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
@@ -151,10 +152,10 @@ static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
*/
const struct file_operations nilfs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.unlocked_ioctl = nilfs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = nilfs_compat_ioctl,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 7e350c562e0..6252b173a46 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -298,19 +298,20 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
}
static ssize_t
-nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+nilfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = file->f_mapping->host;
+ size_t count = iov_iter_count(iter);
ssize_t size;
if (rw == WRITE)
return 0;
/* Needs synchronization with the cleaner */
- size = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
+ size = blockdev_direct_IO(rw, iocb, inode, iter, offset,
nilfs_get_block);
/*
@@ -319,7 +320,7 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
*/
if (unlikely((rw & WRITE) && size < 0)) {
loff_t isize = i_size_read(inode);
- loff_t end = offset + iov_length(iov, nr_segs);
+ loff_t end = offset + count;
if (end > isize)
nilfs_write_failed(mapping, end);
@@ -783,16 +784,14 @@ void nilfs_evict_inode(struct inode *inode)
int ret;
if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
- if (inode->i_data.nrpages)
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
nilfs_clear_inode(inode);
return;
}
nilfs_transaction_begin(sb, &ti, 0); /* never fails */
- if (inode->i_data.nrpages)
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
/* TODO: some of the following operations may fail. */
nilfs_truncate_bmap(ii, 0);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index b44bdb291b8..422fb54b737 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -37,7 +37,26 @@
#include "sufile.h"
#include "dat.h"
-
+/**
+ * nilfs_ioctl_wrap_copy - wrapping function of get/set metadata info
+ * @nilfs: nilfs object
+ * @argv: vector of arguments from userspace
+ * @dir: set of direction flags
+ * @dofunc: concrete function of get/set metadata info
+ *
+ * Description: nilfs_ioctl_wrap_copy() gets/sets metadata info by means of
+ * calling dofunc() function on the basis of @argv argument.
+ *
+ * Return Value: On success, 0 is returned and requested metadata info
+ * is copied into userspace. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EINVAL - Invalid arguments from userspace.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EFAULT - Failure during execution of requested operation.
+ */
static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
struct nilfs_argv *argv, int dir,
ssize_t (*dofunc)(struct the_nilfs *,
@@ -57,6 +76,14 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
if (argv->v_size > PAGE_SIZE)
return -EINVAL;
+ /*
+ * Reject pairs of a start item position (argv->v_index) and a
+ * total count (argv->v_nmembs) which leads position 'pos' to
+ * overflow by the increment at the end of the loop.
+ */
+ if (argv->v_index > ~(__u64)0 - argv->v_nmembs)
+ return -EINVAL;
+
buf = (void *)__get_free_pages(GFP_NOFS, 0);
if (unlikely(!buf))
return -ENOMEM;
@@ -99,6 +126,9 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
return ret;
}
+/**
+ * nilfs_ioctl_getflags - ioctl to support lsattr
+ */
static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp)
{
unsigned int flags = NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE;
@@ -106,6 +136,9 @@ static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp)
return put_user(flags, (int __user *)argp);
}
+/**
+ * nilfs_ioctl_setflags - ioctl to support chattr
+ */
static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
void __user *argp)
{
@@ -158,11 +191,33 @@ out:
return ret;
}
+/**
+ * nilfs_ioctl_getversion - get info about a file's version (generation number)
+ */
static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp)
{
return put_user(inode->i_generation, (int __user *)argp);
}
+/**
+ * nilfs_ioctl_change_cpmode - change checkpoint mode (checkpoint/snapshot)
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: nilfs_ioctl_change_cpmode() function changes mode of
+ * given checkpoint between checkpoint and snapshot state. This ioctl
+ * is used in chcp and mkcp utilities.
+ *
+ * Return Value: On success, 0 is returned and mode of a checkpoint is
+ * changed. On error, one of the following negative error codes
+ * is returned.
+ *
+ * %-EPERM - Operation not permitted.
+ *
+ * %-EFAULT - Failure during checkpoint mode changing.
+ */
static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
unsigned int cmd, void __user *argp)
{
@@ -198,6 +253,25 @@ out:
return ret;
}
+/**
+ * nilfs_ioctl_delete_checkpoint - remove checkpoint
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: nilfs_ioctl_delete_checkpoint() function removes
+ * checkpoint from NILFS2 file system. This ioctl is used in rmcp
+ * utility.
+ *
+ * Return Value: On success, 0 is returned and a checkpoint is
+ * removed. On error, one of the following negative error codes
+ * is returned.
+ *
+ * %-EPERM - Operation not permitted.
+ *
+ * %-EFAULT - Failure during checkpoint removing.
+ */
static int
nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
unsigned int cmd, void __user *argp)
@@ -229,6 +303,21 @@ out:
return ret;
}
+/**
+ * nilfs_ioctl_do_get_cpinfo - callback method getting info about checkpoints
+ * @nilfs: nilfs object
+ * @posp: pointer on array of checkpoint's numbers
+ * @flags: checkpoint mode (checkpoint or snapshot)
+ * @buf: buffer for storing checkponts' info
+ * @size: size in bytes of one checkpoint info item in array
+ * @nmembs: number of checkpoints in array (numbers and infos)
+ *
+ * Description: nilfs_ioctl_do_get_cpinfo() function returns info about
+ * requested checkpoints. The NILFS_IOCTL_GET_CPINFO ioctl is used in
+ * lscp utility and by nilfs_cleanerd daemon.
+ *
+ * Return value: count of nilfs_cpinfo structures in output buffer.
+ */
static ssize_t
nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
void *buf, size_t size, size_t nmembs)
@@ -242,6 +331,27 @@ nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
return ret;
}
+/**
+ * nilfs_ioctl_get_cpstat - get checkpoints statistics
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: nilfs_ioctl_get_cpstat() returns information about checkpoints.
+ * The NILFS_IOCTL_GET_CPSTAT ioctl is used by lscp, rmcp utilities
+ * and by nilfs_cleanerd daemon.
+ *
+ * Return Value: On success, 0 is returned, and checkpoints information is
+ * copied into userspace pointer @argp. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EFAULT - Failure during getting checkpoints statistics.
+ */
static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp,
unsigned int cmd, void __user *argp)
{
@@ -260,6 +370,21 @@ static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp,
return ret;
}
+/**
+ * nilfs_ioctl_do_get_suinfo - callback method getting segment usage info
+ * @nilfs: nilfs object
+ * @posp: pointer on array of segment numbers
+ * @flags: *not used*
+ * @buf: buffer for storing suinfo array
+ * @size: size in bytes of one suinfo item in array
+ * @nmembs: count of segment numbers and suinfos in array
+ *
+ * Description: nilfs_ioctl_do_get_suinfo() function returns segment usage
+ * info about requested segments. The NILFS_IOCTL_GET_SUINFO ioctl is used
+ * in lssu, nilfs_resize utilities and by nilfs_cleanerd daemon.
+ *
+ * Return value: count of nilfs_suinfo structures in output buffer.
+ */
static ssize_t
nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
void *buf, size_t size, size_t nmembs)
@@ -273,6 +398,27 @@ nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
return ret;
}
+/**
+ * nilfs_ioctl_get_sustat - get segment usage statistics
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: nilfs_ioctl_get_sustat() returns segment usage statistics.
+ * The NILFS_IOCTL_GET_SUSTAT ioctl is used in lssu, nilfs_resize utilities
+ * and by nilfs_cleanerd daemon.
+ *
+ * Return Value: On success, 0 is returned, and segment usage information is
+ * copied into userspace pointer @argp. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EFAULT - Failure during getting segment usage statistics.
+ */
static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp,
unsigned int cmd, void __user *argp)
{
@@ -291,6 +437,21 @@ static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp,
return ret;
}
+/**
+ * nilfs_ioctl_do_get_vinfo - callback method getting virtual blocks info
+ * @nilfs: nilfs object
+ * @posp: *not used*
+ * @flags: *not used*
+ * @buf: buffer for storing array of nilfs_vinfo structures
+ * @size: size in bytes of one vinfo item in array
+ * @nmembs: count of vinfos in array
+ *
+ * Description: nilfs_ioctl_do_get_vinfo() function returns information
+ * on virtual block addresses. The NILFS_IOCTL_GET_VINFO ioctl is used
+ * by nilfs_cleanerd daemon.
+ *
+ * Return value: count of nilfs_vinfo structures in output buffer.
+ */
static ssize_t
nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
void *buf, size_t size, size_t nmembs)
@@ -303,6 +464,21 @@ nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
return ret;
}
+/**
+ * nilfs_ioctl_do_get_bdescs - callback method getting disk block descriptors
+ * @nilfs: nilfs object
+ * @posp: *not used*
+ * @flags: *not used*
+ * @buf: buffer for storing array of nilfs_bdesc structures
+ * @size: size in bytes of one bdesc item in array
+ * @nmembs: count of bdescs in array
+ *
+ * Description: nilfs_ioctl_do_get_bdescs() function returns information
+ * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl
+ * is used by nilfs_cleanerd daemon.
+ *
+ * Return value: count of nilfs_bdescs structures in output buffer.
+ */
static ssize_t
nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
void *buf, size_t size, size_t nmembs)
@@ -329,6 +505,29 @@ nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
return nmembs;
}
+/**
+ * nilfs_ioctl_get_bdescs - get disk block descriptors
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: nilfs_ioctl_do_get_bdescs() function returns information
+ * about descriptors of disk block numbers. The NILFS_IOCTL_GET_BDESCS ioctl
+ * is used by nilfs_cleanerd daemon.
+ *
+ * Return Value: On success, 0 is returned, and disk block descriptors are
+ * copied into userspace pointer @argp. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EINVAL - Invalid arguments from userspace.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EFAULT - Failure during getting disk block descriptors.
+ */
static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp,
unsigned int cmd, void __user *argp)
{
@@ -352,6 +551,26 @@ static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp,
return ret;
}
+/**
+ * nilfs_ioctl_move_inode_block - prepare data/node block for moving by GC
+ * @inode: inode object
+ * @vdesc: descriptor of virtual block number
+ * @buffers: list of moving buffers
+ *
+ * Description: nilfs_ioctl_move_inode_block() function registers data/node
+ * buffer in the GC pagecache and submit read request.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - Requested block doesn't exist.
+ *
+ * %-EEXIST - Blocks conflict is detected.
+ */
static int nilfs_ioctl_move_inode_block(struct inode *inode,
struct nilfs_vdesc *vdesc,
struct list_head *buffers)
@@ -397,6 +616,19 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
return 0;
}
+/**
+ * nilfs_ioctl_move_blocks - move valid inode's blocks during garbage collection
+ * @sb: superblock object
+ * @argv: vector of arguments from userspace
+ * @buf: array of nilfs_vdesc structures
+ *
+ * Description: nilfs_ioctl_move_blocks() function reads valid data/node
+ * blocks that garbage collector specified with the array of nilfs_vdesc
+ * structures and stores them into page caches of GC inodes.
+ *
+ * Return Value: Number of processed nilfs_vdesc structures or
+ * error code, otherwise.
+ */
static int nilfs_ioctl_move_blocks(struct super_block *sb,
struct nilfs_argv *argv, void *buf)
{
@@ -462,6 +694,25 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
return ret;
}
+/**
+ * nilfs_ioctl_delete_checkpoints - delete checkpoints
+ * @nilfs: nilfs object
+ * @argv: vector of arguments from userspace
+ * @buf: array of periods of checkpoints numbers
+ *
+ * Description: nilfs_ioctl_delete_checkpoints() function deletes checkpoints
+ * in the period from p_start to p_end, excluding p_end itself. The checkpoints
+ * which have been already deleted are ignored.
+ *
+ * Return Value: Number of processed nilfs_period structures or
+ * error code, otherwise.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EINVAL - invalid checkpoints.
+ */
static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs,
struct nilfs_argv *argv, void *buf)
{
@@ -479,6 +730,24 @@ static int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs,
return nmembs;
}
+/**
+ * nilfs_ioctl_free_vblocknrs - free virtual block numbers
+ * @nilfs: nilfs object
+ * @argv: vector of arguments from userspace
+ * @buf: array of virtual block numbers
+ *
+ * Description: nilfs_ioctl_free_vblocknrs() function frees
+ * the virtual block numbers specified by @buf and @argv->v_nmembs.
+ *
+ * Return Value: Number of processed virtual block numbers or
+ * error code, otherwise.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-ENOENT - The virtual block number have not been allocated.
+ */
static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
struct nilfs_argv *argv, void *buf)
{
@@ -490,6 +759,24 @@ static int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
return (ret < 0) ? ret : nmembs;
}
+/**
+ * nilfs_ioctl_mark_blocks_dirty - mark blocks dirty
+ * @nilfs: nilfs object
+ * @argv: vector of arguments from userspace
+ * @buf: array of block descriptors
+ *
+ * Description: nilfs_ioctl_mark_blocks_dirty() function marks
+ * metadata file or data blocks as dirty.
+ *
+ * Return Value: Number of processed block descriptors or
+ * error code, otherwise.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOENT - the specified block does not exist (hole block)
+ */
static int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
struct nilfs_argv *argv, void *buf)
{
@@ -571,6 +858,20 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
return ret;
}
+/**
+ * nilfs_ioctl_clean_segments - clean segments
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: nilfs_ioctl_clean_segments() function makes garbage
+ * collection operation in the environment of requested parameters
+ * from userspace. The NILFS_IOCTL_CLEAN_SEGMENTS ioctl is used by
+ * nilfs_cleanerd daemon.
+ *
+ * Return Value: On success, 0 is returned or error code, otherwise.
+ */
static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
unsigned int cmd, void __user *argp)
{
@@ -682,6 +983,33 @@ out:
return ret;
}
+/**
+ * nilfs_ioctl_sync - make a checkpoint
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: nilfs_ioctl_sync() function constructs a logical segment
+ * for checkpointing. This function guarantees that all modified data
+ * and metadata are written out to the device when it successfully
+ * returned.
+ *
+ * Return Value: On success, 0 is retured. On errors, one of the following
+ * negative error code is returned.
+ *
+ * %-EROFS - Read only filesystem.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOSPC - No space left on device (only in a panic state).
+ *
+ * %-ERESTARTSYS - Interrupted.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EFAULT - Failure during execution of requested operation.
+ */
static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
unsigned int cmd, void __user *argp)
{
@@ -710,6 +1038,14 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
return 0;
}
+/**
+ * nilfs_ioctl_resize - resize NILFS2 volume
+ * @inode: inode object
+ * @filp: file object
+ * @argp: pointer on argument from userspace
+ *
+ * Return Value: On success, 0 is returned or error code, otherwise.
+ */
static int nilfs_ioctl_resize(struct inode *inode, struct file *filp,
void __user *argp)
{
@@ -735,6 +1071,59 @@ out:
return ret;
}
+/**
+ * nilfs_ioctl_trim_fs() - trim ioctl handle function
+ * @inode: inode object
+ * @argp: pointer on argument from userspace
+ *
+ * Decription: nilfs_ioctl_trim_fs is the FITRIM ioctl handle function. It
+ * checks the arguments from userspace and calls nilfs_sufile_trim_fs, which
+ * performs the actual trim operation.
+ *
+ * Return Value: On success, 0 is returned or negative error code, otherwise.
+ */
+static int nilfs_ioctl_trim_fs(struct inode *inode, void __user *argp)
+{
+ struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+ struct request_queue *q = bdev_get_queue(nilfs->ns_bdev);
+ struct fstrim_range range;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ if (copy_from_user(&range, argp, sizeof(range)))
+ return -EFAULT;
+
+ range.minlen = max_t(u64, range.minlen, q->limits.discard_granularity);
+
+ down_read(&nilfs->ns_segctor_sem);
+ ret = nilfs_sufile_trim_fs(nilfs->ns_sufile, &range);
+ up_read(&nilfs->ns_segctor_sem);
+
+ if (ret < 0)
+ return ret;
+
+ if (copy_to_user(argp, &range, sizeof(range)))
+ return -EFAULT;
+
+ return 0;
+}
+
+/**
+ * nilfs_ioctl_set_alloc_range - limit range of segments to be allocated
+ * @inode: inode object
+ * @argp: pointer on argument from userspace
+ *
+ * Decription: nilfs_ioctl_set_alloc_range() function defines lower limit
+ * of segments in bytes and upper limit of segments in bytes.
+ * The NILFS_IOCTL_SET_ALLOC_RANGE is used by nilfs_resize utility.
+ *
+ * Return Value: On success, 0 is returned or error code, otherwise.
+ */
static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp)
{
struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
@@ -767,6 +1156,28 @@ out:
return ret;
}
+/**
+ * nilfs_ioctl_get_info - wrapping function of get metadata info
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ * @membsz: size of an item in bytes
+ * @dofunc: concrete function of getting metadata info
+ *
+ * Description: nilfs_ioctl_get_info() gets metadata info by means of
+ * calling dofunc() function.
+ *
+ * Return Value: On success, 0 is returned and requested metadata info
+ * is copied into userspace. On error, one of the following
+ * negative error codes is returned.
+ *
+ * %-EINVAL - Invalid arguments from userspace.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EFAULT - Failure during execution of requested operation.
+ */
static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
unsigned int cmd, void __user *argp,
size_t membsz,
@@ -794,6 +1205,95 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
return ret;
}
+/**
+ * nilfs_ioctl_set_suinfo - set segment usage info
+ * @inode: inode object
+ * @filp: file object
+ * @cmd: ioctl's request code
+ * @argp: pointer on argument from userspace
+ *
+ * Description: Expects an array of nilfs_suinfo_update structures
+ * encapsulated in nilfs_argv and updates the segment usage info
+ * according to the flags in nilfs_suinfo_update.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the
+ * following negative error codes is returned.
+ *
+ * %-EPERM - Not enough permissions
+ *
+ * %-EFAULT - Error copying input data
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EINVAL - Invalid values in input (segment number, flags or nblocks)
+ */
+static int nilfs_ioctl_set_suinfo(struct inode *inode, struct file *filp,
+ unsigned int cmd, void __user *argp)
+{
+ struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+ struct nilfs_transaction_info ti;
+ struct nilfs_argv argv;
+ size_t len;
+ void __user *base;
+ void *kbuf;
+ int ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ ret = -EFAULT;
+ if (copy_from_user(&argv, argp, sizeof(argv)))
+ goto out;
+
+ ret = -EINVAL;
+ if (argv.v_size < sizeof(struct nilfs_suinfo_update))
+ goto out;
+
+ if (argv.v_nmembs > nilfs->ns_nsegments)
+ goto out;
+
+ if (argv.v_nmembs >= UINT_MAX / argv.v_size)
+ goto out;
+
+ len = argv.v_size * argv.v_nmembs;
+ if (!len) {
+ ret = 0;
+ goto out;
+ }
+
+ base = (void __user *)(unsigned long)argv.v_base;
+ kbuf = vmalloc(len);
+ if (!kbuf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (copy_from_user(kbuf, base, len)) {
+ ret = -EFAULT;
+ goto out_free;
+ }
+
+ nilfs_transaction_begin(inode->i_sb, &ti, 0);
+ ret = nilfs_sufile_set_suinfo(nilfs->ns_sufile, kbuf, argv.v_size,
+ argv.v_nmembs);
+ if (unlikely(ret < 0))
+ nilfs_transaction_abort(inode->i_sb);
+ else
+ nilfs_transaction_commit(inode->i_sb); /* never fails */
+
+out_free:
+ vfree(kbuf);
+out:
+ mnt_drop_write_file(filp);
+ return ret;
+}
+
long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -820,6 +1320,8 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return nilfs_ioctl_get_info(inode, filp, cmd, argp,
sizeof(struct nilfs_suinfo),
nilfs_ioctl_do_get_suinfo);
+ case NILFS_IOCTL_SET_SUINFO:
+ return nilfs_ioctl_set_suinfo(inode, filp, cmd, argp);
case NILFS_IOCTL_GET_SUSTAT:
return nilfs_ioctl_get_sustat(inode, filp, cmd, argp);
case NILFS_IOCTL_GET_VINFO:
@@ -836,6 +1338,8 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return nilfs_ioctl_resize(inode, filp, argp);
case NILFS_IOCTL_SET_ALLOC_RANGE:
return nilfs_ioctl_set_alloc_range(inode, argp);
+ case FITRIM:
+ return nilfs_ioctl_trim_fs(inode, argp);
default:
return -ENOTTY;
}
@@ -859,6 +1363,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case NILFS_IOCTL_GET_CPINFO:
case NILFS_IOCTL_GET_CPSTAT:
case NILFS_IOCTL_GET_SUINFO:
+ case NILFS_IOCTL_SET_SUINFO:
case NILFS_IOCTL_GET_SUSTAT:
case NILFS_IOCTL_GET_VINFO:
case NILFS_IOCTL_GET_BDESCS:
@@ -866,6 +1371,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case NILFS_IOCTL_SYNC:
case NILFS_IOCTL_RESIZE:
case NILFS_IOCTL_SET_ALLOC_RANGE:
+ case FITRIM:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 2d8be51f90d..dc3a9efdaab 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -416,7 +416,8 @@ static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start,
}
if (likely(bio)) {
bio->bi_bdev = nilfs->ns_bdev;
- bio->bi_sector = start << (nilfs->ns_blocksize_bits - 9);
+ bio->bi_iter.bi_sector =
+ start << (nilfs->ns_blocksize_bits - 9);
}
return bio;
}
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 9f6b486b6c0..a1a191634ab 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1440,17 +1440,19 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
nilfs_clear_logs(&sci->sc_segbufs);
- err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
- if (unlikely(err))
- return err;
-
if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
sci->sc_freesegs,
sci->sc_nfreesegs,
NULL);
WARN_ON(err); /* do not happen */
+ sci->sc_stage.flags &= ~NILFS_CF_SUFREED;
}
+
+ err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
+ if (unlikely(err))
+ return err;
+
nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
sci->sc_stage = prev_stage;
}
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 3127e9f438a..2a869c35c36 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -870,6 +870,289 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
}
/**
+ * nilfs_sufile_set_suinfo - sets segment usage info
+ * @sufile: inode of segment usage file
+ * @buf: array of suinfo_update
+ * @supsz: byte size of suinfo_update
+ * @nsup: size of suinfo_update array
+ *
+ * Description: Takes an array of nilfs_suinfo_update structs and updates
+ * segment usage accordingly. Only the fields indicated by the sup_flags
+ * are updated.
+ *
+ * Return Value: On success, 0 is returned. On error, one of the
+ * following negative error codes is returned.
+ *
+ * %-EIO - I/O error.
+ *
+ * %-ENOMEM - Insufficient amount of memory available.
+ *
+ * %-EINVAL - Invalid values in input (segment number, flags or nblocks)
+ */
+ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf,
+ unsigned supsz, size_t nsup)
+{
+ struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+ struct buffer_head *header_bh, *bh;
+ struct nilfs_suinfo_update *sup, *supend = buf + supsz * nsup;
+ struct nilfs_segment_usage *su;
+ void *kaddr;
+ unsigned long blkoff, prev_blkoff;
+ int cleansi, cleansu, dirtysi, dirtysu;
+ long ncleaned = 0, ndirtied = 0;
+ int ret = 0;
+
+ if (unlikely(nsup == 0))
+ return ret;
+
+ for (sup = buf; sup < supend; sup = (void *)sup + supsz) {
+ if (sup->sup_segnum >= nilfs->ns_nsegments
+ || (sup->sup_flags &
+ (~0UL << __NR_NILFS_SUINFO_UPDATE_FIELDS))
+ || (nilfs_suinfo_update_nblocks(sup) &&
+ sup->sup_sui.sui_nblocks >
+ nilfs->ns_blocks_per_segment))
+ return -EINVAL;
+ }
+
+ down_write(&NILFS_MDT(sufile)->mi_sem);
+
+ ret = nilfs_sufile_get_header_block(sufile, &header_bh);
+ if (ret < 0)
+ goto out_sem;
+
+ sup = buf;
+ blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum);
+ ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh);
+ if (ret < 0)
+ goto out_header;
+
+ for (;;) {
+ kaddr = kmap_atomic(bh->b_page);
+ su = nilfs_sufile_block_get_segment_usage(
+ sufile, sup->sup_segnum, bh, kaddr);
+
+ if (nilfs_suinfo_update_lastmod(sup))
+ su->su_lastmod = cpu_to_le64(sup->sup_sui.sui_lastmod);
+
+ if (nilfs_suinfo_update_nblocks(sup))
+ su->su_nblocks = cpu_to_le32(sup->sup_sui.sui_nblocks);
+
+ if (nilfs_suinfo_update_flags(sup)) {
+ /*
+ * Active flag is a virtual flag projected by running
+ * nilfs kernel code - drop it not to write it to
+ * disk.
+ */
+ sup->sup_sui.sui_flags &=
+ ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
+
+ cleansi = nilfs_suinfo_clean(&sup->sup_sui);
+ cleansu = nilfs_segment_usage_clean(su);
+ dirtysi = nilfs_suinfo_dirty(&sup->sup_sui);
+ dirtysu = nilfs_segment_usage_dirty(su);
+
+ if (cleansi && !cleansu)
+ ++ncleaned;
+ else if (!cleansi && cleansu)
+ --ncleaned;
+
+ if (dirtysi && !dirtysu)
+ ++ndirtied;
+ else if (!dirtysi && dirtysu)
+ --ndirtied;
+
+ su->su_flags = cpu_to_le32(sup->sup_sui.sui_flags);
+ }
+
+ kunmap_atomic(kaddr);
+
+ sup = (void *)sup + supsz;
+ if (sup >= supend)
+ break;
+
+ prev_blkoff = blkoff;
+ blkoff = nilfs_sufile_get_blkoff(sufile, sup->sup_segnum);
+ if (blkoff == prev_blkoff)
+ continue;
+
+ /* get different block */
+ mark_buffer_dirty(bh);
+ put_bh(bh);
+ ret = nilfs_mdt_get_block(sufile, blkoff, 1, NULL, &bh);
+ if (unlikely(ret < 0))
+ goto out_mark;
+ }
+ mark_buffer_dirty(bh);
+ put_bh(bh);
+
+ out_mark:
+ if (ncleaned || ndirtied) {
+ nilfs_sufile_mod_counter(header_bh, (u64)ncleaned,
+ (u64)ndirtied);
+ NILFS_SUI(sufile)->ncleansegs += ncleaned;
+ }
+ nilfs_mdt_mark_dirty(sufile);
+ out_header:
+ put_bh(header_bh);
+ out_sem:
+ up_write(&NILFS_MDT(sufile)->mi_sem);
+ return ret;
+}
+
+/**
+ * nilfs_sufile_trim_fs() - trim ioctl handle function
+ * @sufile: inode of segment usage file
+ * @range: fstrim_range structure
+ *
+ * start: First Byte to trim
+ * len: number of Bytes to trim from start
+ * minlen: minimum extent length in Bytes
+ *
+ * Decription: nilfs_sufile_trim_fs goes through all segments containing bytes
+ * from start to start+len. start is rounded up to the next block boundary
+ * and start+len is rounded down. For each clean segment blkdev_issue_discard
+ * function is invoked.
+ *
+ * Return Value: On success, 0 is returned or negative error code, otherwise.
+ */
+int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
+{
+ struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+ struct buffer_head *su_bh;
+ struct nilfs_segment_usage *su;
+ void *kaddr;
+ size_t n, i, susz = NILFS_MDT(sufile)->mi_entry_size;
+ sector_t seg_start, seg_end, start_block, end_block;
+ sector_t start = 0, nblocks = 0;
+ u64 segnum, segnum_end, minlen, len, max_blocks, ndiscarded = 0;
+ int ret = 0;
+ unsigned int sects_per_block;
+
+ sects_per_block = (1 << nilfs->ns_blocksize_bits) /
+ bdev_logical_block_size(nilfs->ns_bdev);
+ len = range->len >> nilfs->ns_blocksize_bits;
+ minlen = range->minlen >> nilfs->ns_blocksize_bits;
+ max_blocks = ((u64)nilfs->ns_nsegments * nilfs->ns_blocks_per_segment);
+
+ if (!len || range->start >= max_blocks << nilfs->ns_blocksize_bits)
+ return -EINVAL;
+
+ start_block = (range->start + nilfs->ns_blocksize - 1) >>
+ nilfs->ns_blocksize_bits;
+
+ /*
+ * range->len can be very large (actually, it is set to
+ * ULLONG_MAX by default) - truncate upper end of the range
+ * carefully so as not to overflow.
+ */
+ if (max_blocks - start_block < len)
+ end_block = max_blocks - 1;
+ else
+ end_block = start_block + len - 1;
+
+ segnum = nilfs_get_segnum_of_block(nilfs, start_block);
+ segnum_end = nilfs_get_segnum_of_block(nilfs, end_block);
+
+ down_read(&NILFS_MDT(sufile)->mi_sem);
+
+ while (segnum <= segnum_end) {
+ n = nilfs_sufile_segment_usages_in_block(sufile, segnum,
+ segnum_end);
+
+ ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
+ &su_bh);
+ if (ret < 0) {
+ if (ret != -ENOENT)
+ goto out_sem;
+ /* hole */
+ segnum += n;
+ continue;
+ }
+
+ kaddr = kmap_atomic(su_bh->b_page);
+ su = nilfs_sufile_block_get_segment_usage(sufile, segnum,
+ su_bh, kaddr);
+ for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) {
+ if (!nilfs_segment_usage_clean(su))
+ continue;
+
+ nilfs_get_segment_range(nilfs, segnum, &seg_start,
+ &seg_end);
+
+ if (!nblocks) {
+ /* start new extent */
+ start = seg_start;
+ nblocks = seg_end - seg_start + 1;
+ continue;
+ }
+
+ if (start + nblocks == seg_start) {
+ /* add to previous extent */
+ nblocks += seg_end - seg_start + 1;
+ continue;
+ }
+
+ /* discard previous extent */
+ if (start < start_block) {
+ nblocks -= start_block - start;
+ start = start_block;
+ }
+
+ if (nblocks >= minlen) {
+ kunmap_atomic(kaddr);
+
+ ret = blkdev_issue_discard(nilfs->ns_bdev,
+ start * sects_per_block,
+ nblocks * sects_per_block,
+ GFP_NOFS, 0);
+ if (ret < 0) {
+ put_bh(su_bh);
+ goto out_sem;
+ }
+
+ ndiscarded += nblocks;
+ kaddr = kmap_atomic(su_bh->b_page);
+ su = nilfs_sufile_block_get_segment_usage(
+ sufile, segnum, su_bh, kaddr);
+ }
+
+ /* start new extent */
+ start = seg_start;
+ nblocks = seg_end - seg_start + 1;
+ }
+ kunmap_atomic(kaddr);
+ put_bh(su_bh);
+ }
+
+
+ if (nblocks) {
+ /* discard last extent */
+ if (start < start_block) {
+ nblocks -= start_block - start;
+ start = start_block;
+ }
+ if (start + nblocks > end_block + 1)
+ nblocks = end_block - start + 1;
+
+ if (nblocks >= minlen) {
+ ret = blkdev_issue_discard(nilfs->ns_bdev,
+ start * sects_per_block,
+ nblocks * sects_per_block,
+ GFP_NOFS, 0);
+ if (!ret)
+ ndiscarded += nblocks;
+ }
+ }
+
+out_sem:
+ up_read(&NILFS_MDT(sufile)->mi_sem);
+
+ range->len = ndiscarded << nilfs->ns_blocksize_bits;
+ return ret;
+}
+
+/**
* nilfs_sufile_read - read or get sufile inode
* @sb: super block instance
* @susize: size of a segment usage entry
@@ -886,6 +1169,18 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
void *kaddr;
int err;
+ if (susize > sb->s_blocksize) {
+ printk(KERN_ERR
+ "NILFS: too large segment usage size: %zu bytes.\n",
+ susize);
+ return -EINVAL;
+ } else if (susize < NILFS_MIN_SEGMENT_USAGE_SIZE) {
+ printk(KERN_ERR
+ "NILFS: too small segment usage size: %zu bytes.\n",
+ susize);
+ return -EINVAL;
+ }
+
sufile = nilfs_iget_locked(sb, NULL, NILFS_SUFILE_INO);
if (unlikely(!sufile))
return -ENOMEM;
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index e84bc5b51fc..b8afd72f237 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -44,6 +44,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned,
size_t);
+ssize_t nilfs_sufile_set_suinfo(struct inode *, void *, unsigned , size_t);
int nilfs_sufile_updatev(struct inode *, __u64 *, size_t, int, size_t *,
void (*dofunc)(struct inode *, __u64,
@@ -65,6 +66,7 @@ void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *,
int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs);
int nilfs_sufile_read(struct super_block *sb, size_t susize,
struct nilfs_inode *raw_inode, struct inode **inodep);
+int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range);
/**
* nilfs_sufile_scrap - make a segment garbage
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 7ac2a122ca1..8c532b2ca3a 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1129,6 +1129,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
unsigned long old_mount_opt;
int err;
+ sync_filesystem(sb);
old_sb_flags = sb->s_flags;
old_mount_opt = nilfs->ns_mount_opt;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 94c451ce6d2..8ba8229ba07 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -399,6 +399,16 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
return -EINVAL;
nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size);
+ if (nilfs->ns_inode_size > nilfs->ns_blocksize) {
+ printk(KERN_ERR "NILFS: too large inode size: %d bytes.\n",
+ nilfs->ns_inode_size);
+ return -EINVAL;
+ } else if (nilfs->ns_inode_size < NILFS_MIN_INODE_SIZE) {
+ printk(KERN_ERR "NILFS: too small inode size: %d bytes.\n",
+ nilfs->ns_inode_size);
+ return -EINVAL;
+ }
+
nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino);
nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
diff --git a/fs/nls/mac-celtic.c b/fs/nls/mac-celtic.c
index 634a8b717b0..266c2d7d50b 100644
--- a/fs/nls/mac-celtic.c
+++ b/fs/nls/mac-celtic.c
@@ -583,7 +583,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_macceltic(void)
diff --git a/fs/nls/mac-centeuro.c b/fs/nls/mac-centeuro.c
index 979e6265ac5..9789c605755 100644
--- a/fs/nls/mac-centeuro.c
+++ b/fs/nls/mac-centeuro.c
@@ -513,7 +513,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_maccenteuro(void)
diff --git a/fs/nls/mac-croatian.c b/fs/nls/mac-croatian.c
index dd3f675911e..bb19e7a07d4 100644
--- a/fs/nls/mac-croatian.c
+++ b/fs/nls/mac-croatian.c
@@ -583,7 +583,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_maccroatian(void)
diff --git a/fs/nls/mac-cyrillic.c b/fs/nls/mac-cyrillic.c
index 1112c84dd8b..2a7dea36acb 100644
--- a/fs/nls/mac-cyrillic.c
+++ b/fs/nls/mac-cyrillic.c
@@ -478,7 +478,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_maccyrillic(void)
diff --git a/fs/nls/mac-gaelic.c b/fs/nls/mac-gaelic.c
index 2de9158409c..77b00165358 100644
--- a/fs/nls/mac-gaelic.c
+++ b/fs/nls/mac-gaelic.c
@@ -548,7 +548,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_macgaelic(void)
diff --git a/fs/nls/mac-greek.c b/fs/nls/mac-greek.c
index a8631008280..1eccf499e2e 100644
--- a/fs/nls/mac-greek.c
+++ b/fs/nls/mac-greek.c
@@ -478,7 +478,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_macgreek(void)
diff --git a/fs/nls/mac-iceland.c b/fs/nls/mac-iceland.c
index babe2998d5c..cbd0875c6d6 100644
--- a/fs/nls/mac-iceland.c
+++ b/fs/nls/mac-iceland.c
@@ -583,7 +583,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_maciceland(void)
diff --git a/fs/nls/mac-inuit.c b/fs/nls/mac-inuit.c
index 312364f010d..fba8357aaf0 100644
--- a/fs/nls/mac-inuit.c
+++ b/fs/nls/mac-inuit.c
@@ -513,7 +513,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_macinuit(void)
diff --git a/fs/nls/mac-roman.c b/fs/nls/mac-roman.c
index 53ce0809cbd..b6a98a5208c 100644
--- a/fs/nls/mac-roman.c
+++ b/fs/nls/mac-roman.c
@@ -618,7 +618,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_macroman(void)
diff --git a/fs/nls/mac-romanian.c b/fs/nls/mac-romanian.c
index add6f7a0c66..25547f02363 100644
--- a/fs/nls/mac-romanian.c
+++ b/fs/nls/mac-romanian.c
@@ -583,7 +583,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_macromanian(void)
diff --git a/fs/nls/mac-turkish.c b/fs/nls/mac-turkish.c
index dffa96d5de0..b5454bc7b7f 100644
--- a/fs/nls/mac-turkish.c
+++ b/fs/nls/mac-turkish.c
@@ -583,7 +583,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_macturkish(void)
diff --git a/fs/nls/nls_ascii.c b/fs/nls/nls_ascii.c
index 7020e940f74..a2620650d5e 100644
--- a/fs/nls/nls_ascii.c
+++ b/fs/nls/nls_ascii.c
@@ -148,7 +148,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_ascii(void)
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index fea6bd5831d..52ccd34b1e7 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -232,13 +232,14 @@ int utf16s_to_utf8s(const wchar_t *pwcs, int inlen, enum utf16_endian endian,
}
EXPORT_SYMBOL(utf16s_to_utf8s);
-int register_nls(struct nls_table * nls)
+int __register_nls(struct nls_table *nls, struct module *owner)
{
struct nls_table ** tmp = &tables;
if (nls->next)
return -EBUSY;
+ nls->owner = owner;
spin_lock(&nls_lock);
while (*tmp) {
if (nls == *tmp) {
@@ -252,6 +253,7 @@ int register_nls(struct nls_table * nls)
spin_unlock(&nls_lock);
return 0;
}
+EXPORT_SYMBOL(__register_nls);
int unregister_nls(struct nls_table * nls)
{
@@ -538,7 +540,6 @@ struct nls_table *load_nls_default(void)
return &default_table;
}
-EXPORT_SYMBOL(register_nls);
EXPORT_SYMBOL(unregister_nls);
EXPORT_SYMBOL(unload_nls);
EXPORT_SYMBOL(load_nls);
diff --git a/fs/nls/nls_cp1250.c b/fs/nls/nls_cp1250.c
index c8471fe78e4..ace3e19d340 100644
--- a/fs/nls/nls_cp1250.c
+++ b/fs/nls/nls_cp1250.c
@@ -329,7 +329,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp1250(void)
diff --git a/fs/nls/nls_cp1251.c b/fs/nls/nls_cp1251.c
index 1939b46e772..9273ddfd08a 100644
--- a/fs/nls/nls_cp1251.c
+++ b/fs/nls/nls_cp1251.c
@@ -283,7 +283,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp1251(void)
diff --git a/fs/nls/nls_cp1255.c b/fs/nls/nls_cp1255.c
index 8120ae2e091..1caf5dfed85 100644
--- a/fs/nls/nls_cp1255.c
+++ b/fs/nls/nls_cp1255.c
@@ -365,7 +365,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp1255(void)
diff --git a/fs/nls/nls_cp437.c b/fs/nls/nls_cp437.c
index ff37a4628ce..7ddb830da3f 100644
--- a/fs/nls/nls_cp437.c
+++ b/fs/nls/nls_cp437.c
@@ -369,7 +369,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp437(void)
diff --git a/fs/nls/nls_cp737.c b/fs/nls/nls_cp737.c
index f5576b8be1b..c593f683a0c 100644
--- a/fs/nls/nls_cp737.c
+++ b/fs/nls/nls_cp737.c
@@ -332,7 +332,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp737(void)
diff --git a/fs/nls/nls_cp775.c b/fs/nls/nls_cp775.c
index 4905635d1c0..554c863745f 100644
--- a/fs/nls/nls_cp775.c
+++ b/fs/nls/nls_cp775.c
@@ -301,7 +301,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp775(void)
diff --git a/fs/nls/nls_cp850.c b/fs/nls/nls_cp850.c
index fe5bdad50e2..56cccd14b40 100644
--- a/fs/nls/nls_cp850.c
+++ b/fs/nls/nls_cp850.c
@@ -297,7 +297,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp850(void)
diff --git a/fs/nls/nls_cp852.c b/fs/nls/nls_cp852.c
index ceb1c0166dd..7cdc05ac1d4 100644
--- a/fs/nls/nls_cp852.c
+++ b/fs/nls/nls_cp852.c
@@ -319,7 +319,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp852(void)
diff --git a/fs/nls/nls_cp855.c b/fs/nls/nls_cp855.c
index cc7f5fb2e0c..7426eea0566 100644
--- a/fs/nls/nls_cp855.c
+++ b/fs/nls/nls_cp855.c
@@ -281,7 +281,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp855(void)
diff --git a/fs/nls/nls_cp857.c b/fs/nls/nls_cp857.c
index e418e198e8d..098309733eb 100644
--- a/fs/nls/nls_cp857.c
+++ b/fs/nls/nls_cp857.c
@@ -283,7 +283,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp857(void)
diff --git a/fs/nls/nls_cp860.c b/fs/nls/nls_cp860.c
index a86c97d1aa3..84224478e73 100644
--- a/fs/nls/nls_cp860.c
+++ b/fs/nls/nls_cp860.c
@@ -346,7 +346,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp860(void)
diff --git a/fs/nls/nls_cp861.c b/fs/nls/nls_cp861.c
index bd920227acd..dc873e4be09 100644
--- a/fs/nls/nls_cp861.c
+++ b/fs/nls/nls_cp861.c
@@ -369,7 +369,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp861(void)
diff --git a/fs/nls/nls_cp862.c b/fs/nls/nls_cp862.c
index e9b68eb3daf..d5263e3c556 100644
--- a/fs/nls/nls_cp862.c
+++ b/fs/nls/nls_cp862.c
@@ -403,7 +403,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp862(void)
diff --git a/fs/nls/nls_cp863.c b/fs/nls/nls_cp863.c
index f8a9b07ab4e..051c9832e36 100644
--- a/fs/nls/nls_cp863.c
+++ b/fs/nls/nls_cp863.c
@@ -363,7 +363,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp863(void)
diff --git a/fs/nls/nls_cp864.c b/fs/nls/nls_cp864.c
index 8d31f435fc6..97eb1273b2f 100644
--- a/fs/nls/nls_cp864.c
+++ b/fs/nls/nls_cp864.c
@@ -389,7 +389,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp864(void)
diff --git a/fs/nls/nls_cp865.c b/fs/nls/nls_cp865.c
index 4bd902fe3ec..11121422852 100644
--- a/fs/nls/nls_cp865.c
+++ b/fs/nls/nls_cp865.c
@@ -369,7 +369,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp865(void)
diff --git a/fs/nls/nls_cp866.c b/fs/nls/nls_cp866.c
index bdc7cb39139..ffdcbc3fc38 100644
--- a/fs/nls/nls_cp866.c
+++ b/fs/nls/nls_cp866.c
@@ -287,7 +287,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp866(void)
diff --git a/fs/nls/nls_cp869.c b/fs/nls/nls_cp869.c
index 9f283a2b151..3b5a3458935 100644
--- a/fs/nls/nls_cp869.c
+++ b/fs/nls/nls_cp869.c
@@ -297,7 +297,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp869(void)
diff --git a/fs/nls/nls_cp874.c b/fs/nls/nls_cp874.c
index 0b3c4886f8c..8dfaa10710f 100644
--- a/fs/nls/nls_cp874.c
+++ b/fs/nls/nls_cp874.c
@@ -256,7 +256,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp874(void)
diff --git a/fs/nls/nls_cp932.c b/fs/nls/nls_cp932.c
index 0ffed6f1ceb..67b7398e848 100644
--- a/fs/nls/nls_cp932.c
+++ b/fs/nls/nls_cp932.c
@@ -7914,7 +7914,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp932(void)
diff --git a/fs/nls/nls_cp936.c b/fs/nls/nls_cp936.c
index 82770301bc3..c96546cfec9 100644
--- a/fs/nls/nls_cp936.c
+++ b/fs/nls/nls_cp936.c
@@ -11092,7 +11092,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp936(void)
diff --git a/fs/nls/nls_cp949.c b/fs/nls/nls_cp949.c
index 8a7a2fe85c6..199171e97aa 100644
--- a/fs/nls/nls_cp949.c
+++ b/fs/nls/nls_cp949.c
@@ -13927,7 +13927,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp949(void)
diff --git a/fs/nls/nls_cp950.c b/fs/nls/nls_cp950.c
index ef2536829aa..8e141870820 100644
--- a/fs/nls/nls_cp950.c
+++ b/fs/nls/nls_cp950.c
@@ -9463,7 +9463,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_cp950(void)
diff --git a/fs/nls/nls_euc-jp.c b/fs/nls/nls_euc-jp.c
index 7424929a278..162b3f16035 100644
--- a/fs/nls/nls_euc-jp.c
+++ b/fs/nls/nls_euc-jp.c
@@ -553,7 +553,6 @@ static struct nls_table table = {
.charset = "euc-jp",
.uni2char = uni2char,
.char2uni = char2uni,
- .owner = THIS_MODULE,
};
static int __init init_nls_euc_jp(void)
diff --git a/fs/nls/nls_iso8859-1.c b/fs/nls/nls_iso8859-1.c
index 7b951bb5849..69ac020d43b 100644
--- a/fs/nls/nls_iso8859-1.c
+++ b/fs/nls/nls_iso8859-1.c
@@ -239,7 +239,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_iso8859_1(void)
diff --git a/fs/nls/nls_iso8859-13.c b/fs/nls/nls_iso8859-13.c
index c4d52ea9f09..afb3f8f275f 100644
--- a/fs/nls/nls_iso8859-13.c
+++ b/fs/nls/nls_iso8859-13.c
@@ -267,7 +267,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_iso8859_13(void)
diff --git a/fs/nls/nls_iso8859-14.c b/fs/nls/nls_iso8859-14.c
index dc02600c7fe..046370f0b6f 100644
--- a/fs/nls/nls_iso8859-14.c
+++ b/fs/nls/nls_iso8859-14.c
@@ -323,7 +323,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_iso8859_14(void)
diff --git a/fs/nls/nls_iso8859-15.c b/fs/nls/nls_iso8859-15.c
index 3c7dfc832ef..7e34a841a05 100644
--- a/fs/nls/nls_iso8859-15.c
+++ b/fs/nls/nls_iso8859-15.c
@@ -289,7 +289,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_iso8859_15(void)
diff --git a/fs/nls/nls_iso8859-2.c b/fs/nls/nls_iso8859-2.c
index a2d2197e4c7..7dd57118174 100644
--- a/fs/nls/nls_iso8859-2.c
+++ b/fs/nls/nls_iso8859-2.c
@@ -290,7 +290,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_iso8859_2(void)
diff --git a/fs/nls/nls_iso8859-3.c b/fs/nls/nls_iso8859-3.c
index a61e0daa3a8..740b75ec449 100644
--- a/fs/nls/nls_iso8859-3.c
+++ b/fs/nls/nls_iso8859-3.c
@@ -290,7 +290,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_iso8859_3(void)
diff --git a/fs/nls/nls_iso8859-4.c b/fs/nls/nls_iso8859-4.c
index e8ff555483b..8826021e32f 100644
--- a/fs/nls/nls_iso8859-4.c
+++ b/fs/nls/nls_iso8859-4.c
@@ -290,7 +290,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_iso8859_4(void)
diff --git a/fs/nls/nls_iso8859-5.c b/fs/nls/nls_iso8859-5.c
index 4721e893012..7c04057a1ad 100644
--- a/fs/nls/nls_iso8859-5.c
+++ b/fs/nls/nls_iso8859-5.c
@@ -254,7 +254,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_iso8859_5(void)
diff --git a/fs/nls/nls_iso8859-6.c b/fs/nls/nls_iso8859-6.c
index 01a517d6d30..d4a881400d7 100644
--- a/fs/nls/nls_iso8859-6.c
+++ b/fs/nls/nls_iso8859-6.c
@@ -245,7 +245,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_iso8859_6(void)
diff --git a/fs/nls/nls_iso8859-7.c b/fs/nls/nls_iso8859-7.c
index 2d27b93ef19..37b75d825a7 100644
--- a/fs/nls/nls_iso8859-7.c
+++ b/fs/nls/nls_iso8859-7.c
@@ -299,7 +299,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_iso8859_7(void)
diff --git a/fs/nls/nls_iso8859-9.c b/fs/nls/nls_iso8859-9.c
index 694bf070c72..557b98250d3 100644
--- a/fs/nls/nls_iso8859-9.c
+++ b/fs/nls/nls_iso8859-9.c
@@ -254,7 +254,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_iso8859_9(void)
diff --git a/fs/nls/nls_koi8-r.c b/fs/nls/nls_koi8-r.c
index 43875310540..811f232fccf 100644
--- a/fs/nls/nls_koi8-r.c
+++ b/fs/nls/nls_koi8-r.c
@@ -305,7 +305,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_koi8_r(void)
diff --git a/fs/nls/nls_koi8-ru.c b/fs/nls/nls_koi8-ru.c
index e7bc1d75c78..a80a741a867 100644
--- a/fs/nls/nls_koi8-ru.c
+++ b/fs/nls/nls_koi8-ru.c
@@ -55,7 +55,6 @@ static struct nls_table table = {
.charset = "koi8-ru",
.uni2char = uni2char,
.char2uni = char2uni,
- .owner = THIS_MODULE,
};
static int __init init_nls_koi8_ru(void)
diff --git a/fs/nls/nls_koi8-u.c b/fs/nls/nls_koi8-u.c
index 8c9f0292b5a..7e029e4c188 100644
--- a/fs/nls/nls_koi8-u.c
+++ b/fs/nls/nls_koi8-u.c
@@ -312,7 +312,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = charset2lower,
.charset2upper = charset2upper,
- .owner = THIS_MODULE,
};
static int __init init_nls_koi8_u(void)
diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c
index 0d60a44acac..afcfbc4a14d 100644
--- a/fs/nls/nls_utf8.c
+++ b/fs/nls/nls_utf8.c
@@ -46,7 +46,6 @@ static struct nls_table table = {
.char2uni = char2uni,
.charset2lower = identity, /* no conversion */
.charset2upper = identity,
- .owner = THIS_MODULE,
};
static int __init init_nls_utf8(void)
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 1fedd5f7ccc..abc8cbcfe90 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -82,20 +82,23 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
* events.
*/
static int dnotify_handle_event(struct fsnotify_group *group,
+ struct inode *inode,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
- struct fsnotify_event *event)
+ u32 mask, void *data, int data_type,
+ const unsigned char *file_name, u32 cookie)
{
struct dnotify_mark *dn_mark;
- struct inode *to_tell;
struct dnotify_struct *dn;
struct dnotify_struct **prev;
struct fown_struct *fown;
- __u32 test_mask = event->mask & ~FS_EVENT_ON_CHILD;
+ __u32 test_mask = mask & ~FS_EVENT_ON_CHILD;
- BUG_ON(vfsmount_mark);
+ /* not a dir, dnotify doesn't care */
+ if (!S_ISDIR(inode->i_mode))
+ return 0;
- to_tell = event->to_tell;
+ BUG_ON(vfsmount_mark);
dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark);
@@ -122,23 +125,6 @@ static int dnotify_handle_event(struct fsnotify_group *group,
return 0;
}
-/*
- * Given an inode and mask determine if dnotify would be interested in sending
- * userspace notification for that pair.
- */
-static bool dnotify_should_send_event(struct fsnotify_group *group,
- struct inode *inode,
- struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *vfsmount_mark,
- __u32 mask, void *data, int data_type)
-{
- /* not a dir, dnotify doesn't care */
- if (!S_ISDIR(inode->i_mode))
- return false;
-
- return true;
-}
-
static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
{
struct dnotify_mark *dn_mark = container_of(fsn_mark,
@@ -152,10 +138,6 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
static struct fsnotify_ops dnotify_fsnotify_ops = {
.handle_event = dnotify_handle_event,
- .should_send_event = dnotify_should_send_event,
- .free_group_priv = NULL,
- .freeing_mark = NULL,
- .free_event_priv = NULL,
};
/*
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 0c2f9122b26..ee9cb3795c2 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -9,91 +9,59 @@
#include <linux/types.h>
#include <linux/wait.h>
-static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
+#include "fanotify.h"
+
+static bool should_merge(struct fsnotify_event *old_fsn,
+ struct fsnotify_event *new_fsn)
{
- pr_debug("%s: old=%p new=%p\n", __func__, old, new);
+ struct fanotify_event_info *old, *new;
- if (old->to_tell == new->to_tell &&
- old->data_type == new->data_type &&
- old->tgid == new->tgid) {
- switch (old->data_type) {
- case (FSNOTIFY_EVENT_PATH):
-#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
- /* dont merge two permission events */
- if ((old->mask & FAN_ALL_PERM_EVENTS) &&
- (new->mask & FAN_ALL_PERM_EVENTS))
- return false;
-#endif
- if ((old->path.mnt == new->path.mnt) &&
- (old->path.dentry == new->path.dentry))
- return true;
- break;
- case (FSNOTIFY_EVENT_NONE):
- return true;
- default:
- BUG();
- };
- }
+ pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn);
+ old = FANOTIFY_E(old_fsn);
+ new = FANOTIFY_E(new_fsn);
+
+ if (old_fsn->inode == new_fsn->inode && old->tgid == new->tgid &&
+ old->path.mnt == new->path.mnt &&
+ old->path.dentry == new->path.dentry)
+ return true;
return false;
}
/* and the list better be locked by something too! */
-static struct fsnotify_event *fanotify_merge(struct list_head *list,
- struct fsnotify_event *event)
+static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
{
- struct fsnotify_event_holder *test_holder;
- struct fsnotify_event *test_event = NULL;
- struct fsnotify_event *new_event;
+ struct fsnotify_event *test_event;
+ bool do_merge = false;
pr_debug("%s: list=%p event=%p\n", __func__, list, event);
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+ /*
+ * Don't merge a permission event with any other event so that we know
+ * the event structure we have created in fanotify_handle_event() is the
+ * one we should check for permission response.
+ */
+ if (event->mask & FAN_ALL_PERM_EVENTS)
+ return 0;
+#endif
- list_for_each_entry_reverse(test_holder, list, event_list) {
- if (should_merge(test_holder->event, event)) {
- test_event = test_holder->event;
+ list_for_each_entry_reverse(test_event, list, list) {
+ if (should_merge(test_event, event)) {
+ do_merge = true;
break;
}
}
- if (!test_event)
- return NULL;
-
- fsnotify_get_event(test_event);
-
- /* if they are exactly the same we are done */
- if (test_event->mask == event->mask)
- return test_event;
-
- /*
- * if the refcnt == 2 this is the only queue
- * for this event and so we can update the mask
- * in place.
- */
- if (atomic_read(&test_event->refcnt) == 2) {
- test_event->mask |= event->mask;
- return test_event;
- }
-
- new_event = fsnotify_clone_event(test_event);
-
- /* done with test_event */
- fsnotify_put_event(test_event);
-
- /* couldn't allocate memory, merge was not possible */
- if (unlikely(!new_event))
- return ERR_PTR(-ENOMEM);
-
- /* build new event and replace it on the list */
- new_event->mask = (test_event->mask | event->mask);
- fsnotify_replace_event(test_holder, new_event);
+ if (!do_merge)
+ return 0;
- /* we hold a reference on new_event from clone_event */
- return new_event;
+ test_event->mask |= event->mask;
+ return 1;
}
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
-static int fanotify_get_response_from_access(struct fsnotify_group *group,
- struct fsnotify_event *event)
+static int fanotify_get_response(struct fsnotify_group *group,
+ struct fanotify_perm_event_info *event)
{
int ret;
@@ -106,7 +74,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
return 0;
/* userspace responded, convert to something usable */
- spin_lock(&event->lock);
switch (event->response) {
case FAN_ALLOW:
ret = 0;
@@ -116,7 +83,6 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
ret = -EPERM;
}
event->response = 0;
- spin_unlock(&event->lock);
pr_debug("%s: group=%p event=%p about to return ret=%d\n", __func__,
group, event, ret);
@@ -125,58 +91,17 @@ static int fanotify_get_response_from_access(struct fsnotify_group *group,
}
#endif
-static int fanotify_handle_event(struct fsnotify_group *group,
- struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *fanotify_mark,
- struct fsnotify_event *event)
-{
- int ret = 0;
- struct fsnotify_event *notify_event = NULL;
-
- BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
- BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
- BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
- BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
- BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
- BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
- BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
- BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
- BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
- BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
-
- pr_debug("%s: group=%p event=%p\n", __func__, group, event);
-
- notify_event = fsnotify_add_notify_event(group, event, NULL, fanotify_merge);
- if (IS_ERR(notify_event))
- return PTR_ERR(notify_event);
-
-#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
- if (event->mask & FAN_ALL_PERM_EVENTS) {
- /* if we merged we need to wait on the new event */
- if (notify_event)
- event = notify_event;
- ret = fanotify_get_response_from_access(group, event);
- }
-#endif
-
- if (notify_event)
- fsnotify_put_event(notify_event);
-
- return ret;
-}
-
-static bool fanotify_should_send_event(struct fsnotify_group *group,
- struct inode *to_tell,
- struct fsnotify_mark *inode_mark,
+static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmnt_mark,
- __u32 event_mask, void *data, int data_type)
+ u32 event_mask,
+ void *data, int data_type)
{
__u32 marks_mask, marks_ignored_mask;
struct path *path = data;
- pr_debug("%s: group=%p to_tell=%p inode_mark=%p vfsmnt_mark=%p "
- "mask=%x data=%p data_type=%d\n", __func__, group, to_tell,
- inode_mark, vfsmnt_mark, event_mask, data, data_type);
+ pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p"
+ " data_type=%d\n", __func__, inode_mark, vfsmnt_mark,
+ event_mask, data, data_type);
/* if we don't have enough info to send an event to userspace say no */
if (data_type != FSNOTIFY_EVENT_PATH)
@@ -217,6 +142,93 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
return false;
}
+struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask,
+ struct path *path)
+{
+ struct fanotify_event_info *event;
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+ if (mask & FAN_ALL_PERM_EVENTS) {
+ struct fanotify_perm_event_info *pevent;
+
+ pevent = kmem_cache_alloc(fanotify_perm_event_cachep,
+ GFP_KERNEL);
+ if (!pevent)
+ return NULL;
+ event = &pevent->fae;
+ pevent->response = 0;
+ goto init;
+ }
+#endif
+ event = kmem_cache_alloc(fanotify_event_cachep, GFP_KERNEL);
+ if (!event)
+ return NULL;
+init: __maybe_unused
+ fsnotify_init_event(&event->fse, inode, mask);
+ event->tgid = get_pid(task_tgid(current));
+ if (path) {
+ event->path = *path;
+ path_get(&event->path);
+ } else {
+ event->path.mnt = NULL;
+ event->path.dentry = NULL;
+ }
+ return event;
+}
+
+static int fanotify_handle_event(struct fsnotify_group *group,
+ struct inode *inode,
+ struct fsnotify_mark *inode_mark,
+ struct fsnotify_mark *fanotify_mark,
+ u32 mask, void *data, int data_type,
+ const unsigned char *file_name, u32 cookie)
+{
+ int ret = 0;
+ struct fanotify_event_info *event;
+ struct fsnotify_event *fsn_event;
+
+ BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
+ BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
+ BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
+ BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
+ BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
+ BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD);
+ BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW);
+ BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM);
+ BUILD_BUG_ON(FAN_ACCESS_PERM != FS_ACCESS_PERM);
+ BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
+
+ if (!fanotify_should_send_event(inode_mark, fanotify_mark, mask, data,
+ data_type))
+ return 0;
+
+ pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
+ mask);
+
+ event = fanotify_alloc_event(inode, mask, data);
+ if (unlikely(!event))
+ return -ENOMEM;
+
+ fsn_event = &event->fse;
+ ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge);
+ if (ret) {
+ /* Permission events shouldn't be merged */
+ BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS);
+ /* Our event wasn't used in the end. Free it. */
+ fsnotify_destroy_event(group, fsn_event);
+
+ return 0;
+ }
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+ if (mask & FAN_ALL_PERM_EVENTS) {
+ ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event));
+ fsnotify_destroy_event(group, fsn_event);
+ }
+#endif
+ return ret;
+}
+
static void fanotify_free_group_priv(struct fsnotify_group *group)
{
struct user_struct *user;
@@ -226,10 +238,25 @@ static void fanotify_free_group_priv(struct fsnotify_group *group)
free_uid(user);
}
+static void fanotify_free_event(struct fsnotify_event *fsn_event)
+{
+ struct fanotify_event_info *event;
+
+ event = FANOTIFY_E(fsn_event);
+ path_put(&event->path);
+ put_pid(event->tgid);
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+ if (fsn_event->mask & FAN_ALL_PERM_EVENTS) {
+ kmem_cache_free(fanotify_perm_event_cachep,
+ FANOTIFY_PE(fsn_event));
+ return;
+ }
+#endif
+ kmem_cache_free(fanotify_event_cachep, event);
+}
+
const struct fsnotify_ops fanotify_fsnotify_ops = {
.handle_event = fanotify_handle_event,
- .should_send_event = fanotify_should_send_event,
.free_group_priv = fanotify_free_group_priv,
- .free_event_priv = NULL,
- .freeing_mark = NULL,
+ .free_event = fanotify_free_event,
};
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
new file mode 100644
index 00000000000..2a5fb14115d
--- /dev/null
+++ b/fs/notify/fanotify/fanotify.h
@@ -0,0 +1,50 @@
+#include <linux/fsnotify_backend.h>
+#include <linux/path.h>
+#include <linux/slab.h>
+
+extern struct kmem_cache *fanotify_event_cachep;
+extern struct kmem_cache *fanotify_perm_event_cachep;
+
+/*
+ * Structure for normal fanotify events. It gets allocated in
+ * fanotify_handle_event() and freed when the information is retrieved by
+ * userspace
+ */
+struct fanotify_event_info {
+ struct fsnotify_event fse;
+ /*
+ * We hold ref to this path so it may be dereferenced at any point
+ * during this object's lifetime
+ */
+ struct path path;
+ struct pid *tgid;
+};
+
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+/*
+ * Structure for permission fanotify events. It gets allocated and freed in
+ * fanotify_handle_event() since we wait there for user response. When the
+ * information is retrieved by userspace the structure is moved from
+ * group->notification_list to group->fanotify_data.access_list to wait for
+ * user response.
+ */
+struct fanotify_perm_event_info {
+ struct fanotify_event_info fae;
+ int response; /* userspace answer to question */
+ int fd; /* fd we passed to userspace for this event */
+};
+
+static inline struct fanotify_perm_event_info *
+FANOTIFY_PE(struct fsnotify_event *fse)
+{
+ return container_of(fse, struct fanotify_perm_event_info, fae.fse);
+}
+#endif
+
+static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse)
+{
+ return container_of(fse, struct fanotify_event_info, fse);
+}
+
+struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask,
+ struct path *path);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index e44cb6427df..3fdc8a3e113 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -19,21 +19,30 @@
#include "../../mount.h"
#include "../fdinfo.h"
+#include "fanotify.h"
#define FANOTIFY_DEFAULT_MAX_EVENTS 16384
#define FANOTIFY_DEFAULT_MAX_MARKS 8192
#define FANOTIFY_DEFAULT_MAX_LISTENERS 128
+/*
+ * All flags that may be specified in parameter event_f_flags of fanotify_init.
+ *
+ * Internal and external open flags are stored together in field f_flags of
+ * struct file. Only external open flags shall be allowed in event_f_flags.
+ * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be
+ * excluded.
+ */
+#define FANOTIFY_INIT_ALL_EVENT_F_BITS ( \
+ O_ACCMODE | O_APPEND | O_NONBLOCK | \
+ __O_SYNC | O_DSYNC | O_CLOEXEC | \
+ O_LARGEFILE | O_NOATIME )
+
extern const struct fsnotify_ops fanotify_fsnotify_ops;
static struct kmem_cache *fanotify_mark_cache __read_mostly;
-static struct kmem_cache *fanotify_response_event_cache __read_mostly;
-
-struct fanotify_response_event {
- struct list_head list;
- __s32 fd;
- struct fsnotify_event *event;
-};
+struct kmem_cache *fanotify_event_cachep __read_mostly;
+struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
/*
* Get an fsnotify notification event if one exists and is small
@@ -61,8 +70,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
}
static int create_fd(struct fsnotify_group *group,
- struct fsnotify_event *event,
- struct file **file)
+ struct fanotify_event_info *event,
+ struct file **file)
{
int client_fd;
struct file *new_file;
@@ -73,12 +82,6 @@ static int create_fd(struct fsnotify_group *group,
if (client_fd < 0)
return client_fd;
- if (event->data_type != FSNOTIFY_EVENT_PATH) {
- WARN_ON(1);
- put_unused_fd(client_fd);
- return -EINVAL;
- }
-
/*
* we need a new file handle for the userspace program so it can read even if it was
* originally opened O_WRONLY.
@@ -109,23 +112,25 @@ static int create_fd(struct fsnotify_group *group,
}
static int fill_event_metadata(struct fsnotify_group *group,
- struct fanotify_event_metadata *metadata,
- struct fsnotify_event *event,
- struct file **file)
+ struct fanotify_event_metadata *metadata,
+ struct fsnotify_event *fsn_event,
+ struct file **file)
{
int ret = 0;
+ struct fanotify_event_info *event;
pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
- group, metadata, event);
+ group, metadata, fsn_event);
*file = NULL;
+ event = container_of(fsn_event, struct fanotify_event_info, fse);
metadata->event_len = FAN_EVENT_METADATA_LEN;
metadata->metadata_len = FAN_EVENT_METADATA_LEN;
metadata->vers = FANOTIFY_METADATA_VERSION;
metadata->reserved = 0;
- metadata->mask = event->mask & FAN_ALL_OUTGOING_EVENTS;
+ metadata->mask = fsn_event->mask & FAN_ALL_OUTGOING_EVENTS;
metadata->pid = pid_vnr(event->tgid);
- if (unlikely(event->mask & FAN_Q_OVERFLOW))
+ if (unlikely(fsn_event->mask & FAN_Q_OVERFLOW))
metadata->fd = FAN_NOFD;
else {
metadata->fd = create_fd(group, event, file);
@@ -137,33 +142,34 @@ static int fill_event_metadata(struct fsnotify_group *group,
}
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
-static struct fanotify_response_event *dequeue_re(struct fsnotify_group *group,
- __s32 fd)
+static struct fanotify_perm_event_info *dequeue_event(
+ struct fsnotify_group *group, int fd)
{
- struct fanotify_response_event *re, *return_re = NULL;
+ struct fanotify_perm_event_info *event, *return_e = NULL;
- mutex_lock(&group->fanotify_data.access_mutex);
- list_for_each_entry(re, &group->fanotify_data.access_list, list) {
- if (re->fd != fd)
+ spin_lock(&group->fanotify_data.access_lock);
+ list_for_each_entry(event, &group->fanotify_data.access_list,
+ fae.fse.list) {
+ if (event->fd != fd)
continue;
- list_del_init(&re->list);
- return_re = re;
+ list_del_init(&event->fae.fse.list);
+ return_e = event;
break;
}
- mutex_unlock(&group->fanotify_data.access_mutex);
+ spin_unlock(&group->fanotify_data.access_lock);
- pr_debug("%s: found return_re=%p\n", __func__, return_re);
+ pr_debug("%s: found return_re=%p\n", __func__, return_e);
- return return_re;
+ return return_e;
}
static int process_access_response(struct fsnotify_group *group,
struct fanotify_response *response_struct)
{
- struct fanotify_response_event *re;
- __s32 fd = response_struct->fd;
- __u32 response = response_struct->response;
+ struct fanotify_perm_event_info *event;
+ int fd = response_struct->fd;
+ int response = response_struct->response;
pr_debug("%s: group=%p fd=%d response=%d\n", __func__, group,
fd, response);
@@ -183,58 +189,15 @@ static int process_access_response(struct fsnotify_group *group,
if (fd < 0)
return -EINVAL;
- re = dequeue_re(group, fd);
- if (!re)
+ event = dequeue_event(group, fd);
+ if (!event)
return -ENOENT;
- re->event->response = response;
-
+ event->response = response;
wake_up(&group->fanotify_data.access_waitq);
- kmem_cache_free(fanotify_response_event_cache, re);
-
return 0;
}
-
-static int prepare_for_access_response(struct fsnotify_group *group,
- struct fsnotify_event *event,
- __s32 fd)
-{
- struct fanotify_response_event *re;
-
- if (!(event->mask & FAN_ALL_PERM_EVENTS))
- return 0;
-
- re = kmem_cache_alloc(fanotify_response_event_cache, GFP_KERNEL);
- if (!re)
- return -ENOMEM;
-
- re->event = event;
- re->fd = fd;
-
- mutex_lock(&group->fanotify_data.access_mutex);
-
- if (atomic_read(&group->fanotify_data.bypass_perm)) {
- mutex_unlock(&group->fanotify_data.access_mutex);
- kmem_cache_free(fanotify_response_event_cache, re);
- event->response = FAN_ALLOW;
- return 0;
- }
-
- list_add_tail(&re->list, &group->fanotify_data.access_list);
- mutex_unlock(&group->fanotify_data.access_mutex);
-
- return 0;
-}
-
-#else
-static int prepare_for_access_response(struct fsnotify_group *group,
- struct fsnotify_event *event,
- __s32 fd)
-{
- return 0;
-}
-
#endif
static ssize_t copy_event_to_user(struct fsnotify_group *group,
@@ -249,7 +212,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f);
if (ret < 0)
- goto out;
+ return ret;
fd = fanotify_event_metadata.fd;
ret = -EFAULT;
@@ -257,9 +220,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
fanotify_event_metadata.event_len))
goto out_close_fd;
- ret = prepare_for_access_response(group, event, fd);
- if (ret)
- goto out_close_fd;
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+ if (event->mask & FAN_ALL_PERM_EVENTS)
+ FANOTIFY_PE(event)->fd = fd;
+#endif
if (fd != FAN_NOFD)
fd_install(fd, f);
@@ -270,13 +234,6 @@ out_close_fd:
put_unused_fd(fd);
fput(f);
}
-out:
-#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
- if (event->mask & FAN_ALL_PERM_EVENTS) {
- event->response = FAN_DENY;
- wake_up(&group->fanotify_data.access_waitq);
- }
-#endif
return ret;
}
@@ -316,30 +273,50 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
kevent = get_one_event(group, count);
mutex_unlock(&group->notification_mutex);
- if (kevent) {
+ if (IS_ERR(kevent)) {
ret = PTR_ERR(kevent);
- if (IS_ERR(kevent))
+ break;
+ }
+
+ if (!kevent) {
+ ret = -EAGAIN;
+ if (file->f_flags & O_NONBLOCK)
break;
- ret = copy_event_to_user(group, kevent, buf);
- fsnotify_put_event(kevent);
- if (ret < 0)
+
+ ret = -ERESTARTSYS;
+ if (signal_pending(current))
+ break;
+
+ if (start != buf)
break;
- buf += ret;
- count -= ret;
+ schedule();
continue;
}
- ret = -EAGAIN;
- if (file->f_flags & O_NONBLOCK)
- break;
- ret = -ERESTARTSYS;
- if (signal_pending(current))
- break;
-
- if (start != buf)
- break;
-
- schedule();
+ ret = copy_event_to_user(group, kevent, buf);
+ /*
+ * Permission events get queued to wait for response. Other
+ * events can be destroyed now.
+ */
+ if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) {
+ fsnotify_destroy_event(group, kevent);
+ if (ret < 0)
+ break;
+ } else {
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+ if (ret < 0) {
+ FANOTIFY_PE(kevent)->response = FAN_DENY;
+ wake_up(&group->fanotify_data.access_waitq);
+ break;
+ }
+ spin_lock(&group->fanotify_data.access_lock);
+ list_add_tail(&kevent->list,
+ &group->fanotify_data.access_list);
+ spin_unlock(&group->fanotify_data.access_lock);
+#endif
+ }
+ buf += ret;
+ count -= ret;
}
finish_wait(&group->notification_waitq, &wait);
@@ -380,22 +357,21 @@ static int fanotify_release(struct inode *ignored, struct file *file)
struct fsnotify_group *group = file->private_data;
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
- struct fanotify_response_event *re, *lre;
+ struct fanotify_perm_event_info *event, *next;
- mutex_lock(&group->fanotify_data.access_mutex);
+ spin_lock(&group->fanotify_data.access_lock);
atomic_inc(&group->fanotify_data.bypass_perm);
- list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) {
- pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group,
- re, re->event);
-
- list_del_init(&re->list);
- re->event->response = FAN_ALLOW;
+ list_for_each_entry_safe(event, next, &group->fanotify_data.access_list,
+ fae.fse.list) {
+ pr_debug("%s: found group=%p event=%p\n", __func__, group,
+ event);
- kmem_cache_free(fanotify_response_event_cache, re);
+ list_del_init(&event->fae.fse.list);
+ event->response = FAN_ALLOW;
}
- mutex_unlock(&group->fanotify_data.access_mutex);
+ spin_unlock(&group->fanotify_data.access_lock);
wake_up(&group->fanotify_data.access_waitq);
#endif
@@ -409,7 +385,7 @@ static int fanotify_release(struct inode *ignored, struct file *file)
static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct fsnotify_group *group;
- struct fsnotify_event_holder *holder;
+ struct fsnotify_event *fsn_event;
void __user *p;
int ret = -ENOTTY;
size_t send_len = 0;
@@ -421,7 +397,7 @@ static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long ar
switch (cmd) {
case FIONREAD:
mutex_lock(&group->notification_mutex);
- list_for_each_entry(holder, &group->notification_list, event_list)
+ list_for_each_entry(fsn_event, &group->notification_list, list)
send_len += FAN_EVENT_METADATA_LEN;
mutex_unlock(&group->notification_mutex);
ret = put_user(send_len, (int __user *) p);
@@ -695,6 +671,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
struct fsnotify_group *group;
int f_flags, fd;
struct user_struct *user;
+ struct fanotify_event_info *oevent;
pr_debug("%s: flags=%d event_f_flags=%d\n",
__func__, flags, event_f_flags);
@@ -705,6 +682,18 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
if (flags & ~FAN_ALL_INIT_FLAGS)
return -EINVAL;
+ if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
+ return -EINVAL;
+
+ switch (event_f_flags & O_ACCMODE) {
+ case O_RDONLY:
+ case O_RDWR:
+ case O_WRONLY:
+ break;
+ default:
+ return -EINVAL;
+ }
+
user = get_current_user();
if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
free_uid(user);
@@ -727,9 +716,18 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
group->fanotify_data.user = user;
atomic_inc(&user->fanotify_listeners);
+ oevent = fanotify_alloc_event(NULL, FS_Q_OVERFLOW, NULL);
+ if (unlikely(!oevent)) {
+ fd = -ENOMEM;
+ goto out_destroy_group;
+ }
+ group->overflow_event = &oevent->fse;
+
+ if (force_o_largefile())
+ event_f_flags |= O_LARGEFILE;
group->fanotify_data.f_flags = event_f_flags;
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
- mutex_init(&group->fanotify_data.access_mutex);
+ spin_lock_init(&group->fanotify_data.access_lock);
init_waitqueue_head(&group->fanotify_data.access_waitq);
INIT_LIST_HEAD(&group->fanotify_data.access_list);
atomic_set(&group->fanotify_data.bypass_perm, 0);
@@ -803,7 +801,10 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
case FAN_MARK_REMOVE:
if (!mask)
return -EINVAL;
+ break;
case FAN_MARK_FLUSH:
+ if (flags & ~(FAN_MARK_MOUNT | FAN_MARK_FLUSH))
+ return -EINVAL;
break;
default:
return -EINVAL;
@@ -840,6 +841,15 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
group->priority == FS_PRIO_0)
goto fput_and_out;
+ if (flags & FAN_MARK_FLUSH) {
+ ret = 0;
+ if (flags & FAN_MARK_MOUNT)
+ fsnotify_clear_vfsmount_marks_by_group(group);
+ else
+ fsnotify_clear_inode_marks_by_group(group);
+ goto fput_and_out;
+ }
+
ret = fanotify_find_path(dfd, pathname, &path, flags);
if (ret)
goto fput_and_out;
@@ -851,7 +861,7 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
mnt = path.mnt;
/* create/update an inode mark */
- switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
+ switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
case FAN_MARK_ADD:
if (flags & FAN_MARK_MOUNT)
ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags);
@@ -864,12 +874,6 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
else
ret = fanotify_remove_inode_mark(group, inode, mask, flags);
break;
- case FAN_MARK_FLUSH:
- if (flags & FAN_MARK_MOUNT)
- fsnotify_clear_vfsmount_marks_by_group(group);
- else
- fsnotify_clear_inode_marks_by_group(group);
- break;
default:
ret = -EINVAL;
}
@@ -888,9 +892,9 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark,
{
return sys_fanotify_mark(fanotify_fd, flags,
#ifdef __BIG_ENDIAN
- ((__u64)mask1 << 32) | mask0,
-#else
((__u64)mask0 << 32) | mask1,
+#else
+ ((__u64)mask1 << 32) | mask0,
#endif
dfd, pathname);
}
@@ -904,8 +908,11 @@ COMPAT_SYSCALL_DEFINE6(fanotify_mark,
static int __init fanotify_user_setup(void)
{
fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, SLAB_PANIC);
- fanotify_response_event_cache = KMEM_CACHE(fanotify_response_event,
- SLAB_PANIC);
+ fanotify_event_cachep = KMEM_CACHE(fanotify_event_info, SLAB_PANIC);
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+ fanotify_perm_event_cachep = KMEM_CACHE(fanotify_perm_event_info,
+ SLAB_PANIC);
+#endif
return 0;
}
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 4bb21d67d9b..9d3e9c50066 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -128,8 +128,7 @@ static int send_to_group(struct inode *to_tell,
struct fsnotify_mark *vfsmount_mark,
__u32 mask, void *data,
int data_is, u32 cookie,
- const unsigned char *file_name,
- struct fsnotify_event **event)
+ const unsigned char *file_name)
{
struct fsnotify_group *group = NULL;
__u32 inode_test_mask = 0;
@@ -170,27 +169,17 @@ static int send_to_group(struct inode *to_tell,
pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p"
" inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
- " data=%p data_is=%d cookie=%d event=%p\n",
+ " data=%p data_is=%d cookie=%d\n",
__func__, group, to_tell, mask, inode_mark,
inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
- data_is, cookie, *event);
+ data_is, cookie);
if (!inode_test_mask && !vfsmount_test_mask)
return 0;
- if (group->ops->should_send_event(group, to_tell, inode_mark,
- vfsmount_mark, mask, data,
- data_is) == false)
- return 0;
-
- if (!*event) {
- *event = fsnotify_create_event(to_tell, mask, data,
- data_is, file_name,
- cookie, GFP_KERNEL);
- if (!*event)
- return -ENOMEM;
- }
- return group->ops->handle_event(group, inode_mark, vfsmount_mark, *event);
+ return group->ops->handle_event(group, to_tell, inode_mark,
+ vfsmount_mark, mask, data, data_is,
+ file_name, cookie);
}
/*
@@ -205,7 +194,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
struct fsnotify_group *inode_group, *vfsmount_group;
- struct fsnotify_event *event = NULL;
struct mount *mnt;
int idx, ret = 0;
/* global tests shouldn't care about events on child only the specific event */
@@ -258,18 +246,18 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
if (inode_group > vfsmount_group) {
/* handle inode */
- ret = send_to_group(to_tell, inode_mark, NULL, mask, data,
- data_is, cookie, file_name, &event);
+ ret = send_to_group(to_tell, inode_mark, NULL, mask,
+ data, data_is, cookie, file_name);
/* we didn't use the vfsmount_mark */
vfsmount_group = NULL;
} else if (vfsmount_group > inode_group) {
- ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, data,
- data_is, cookie, file_name, &event);
+ ret = send_to_group(to_tell, NULL, vfsmount_mark, mask,
+ data, data_is, cookie, file_name);
inode_group = NULL;
} else {
ret = send_to_group(to_tell, inode_mark, vfsmount_mark,
- mask, data, data_is, cookie, file_name,
- &event);
+ mask, data, data_is, cookie,
+ file_name);
}
if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
@@ -285,12 +273,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
ret = 0;
out:
srcu_read_unlock(&fsnotify_mark_srcu, idx);
- /*
- * fsnotify_create_event() took a reference so the event can't be cleaned
- * up while we are still trying to add it to lists, drop that one.
- */
- if (event)
- fsnotify_put_event(event);
return ret;
}
diff --git a/fs/notify/group.c b/fs/notify/group.c
index bd2625bd88b..ad199598045 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -55,6 +55,13 @@ void fsnotify_destroy_group(struct fsnotify_group *group)
/* clear the notification queue of all events */
fsnotify_flush_notify(group);
+ /*
+ * Destroy overflow event (we cannot use fsnotify_destroy_event() as
+ * that deliberately ignores overflow events.
+ */
+ if (group->overflow_event)
+ group->ops->free_event(group->overflow_event);
+
fsnotify_put_group(group);
}
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index b6642e4de4b..ed855ef6f07 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -2,11 +2,12 @@
#include <linux/inotify.h>
#include <linux/slab.h> /* struct kmem_cache */
-extern struct kmem_cache *event_priv_cachep;
-
-struct inotify_event_private_data {
- struct fsnotify_event_private_data fsnotify_event_priv_data;
+struct inotify_event_info {
+ struct fsnotify_event fse;
int wd;
+ u32 sync_cookie;
+ int name_len;
+ char name[];
};
struct inotify_inode_mark {
@@ -14,8 +15,18 @@ struct inotify_inode_mark {
int wd;
};
+static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse)
+{
+ return container_of(fse, struct inotify_event_info, fse);
+}
+
extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
struct fsnotify_group *group);
-extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv);
+extern int inotify_handle_event(struct fsnotify_group *group,
+ struct inode *inode,
+ struct fsnotify_mark *inode_mark,
+ struct fsnotify_mark *vfsmount_mark,
+ u32 mask, void *data, int data_type,
+ const unsigned char *file_name, u32 cookie);
extern const struct fsnotify_ops inotify_fsnotify_ops;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 4216308b81b..43ab1e1a07a 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -34,107 +34,90 @@
#include "inotify.h"
/*
- * Check if 2 events contain the same information. We do not compare private data
- * but at this moment that isn't a problem for any know fsnotify listeners.
+ * Check if 2 events contain the same information.
*/
-static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
+static bool event_compare(struct fsnotify_event *old_fsn,
+ struct fsnotify_event *new_fsn)
{
- if ((old->mask == new->mask) &&
- (old->to_tell == new->to_tell) &&
- (old->data_type == new->data_type) &&
- (old->name_len == new->name_len)) {
- switch (old->data_type) {
- case (FSNOTIFY_EVENT_INODE):
- /* remember, after old was put on the wait_q we aren't
- * allowed to look at the inode any more, only thing
- * left to check was if the file_name is the same */
- if (!old->name_len ||
- !strcmp(old->file_name, new->file_name))
- return true;
- break;
- case (FSNOTIFY_EVENT_PATH):
- if ((old->path.mnt == new->path.mnt) &&
- (old->path.dentry == new->path.dentry))
- return true;
- break;
- case (FSNOTIFY_EVENT_NONE):
- if (old->mask & FS_Q_OVERFLOW)
- return true;
- else if (old->mask & FS_IN_IGNORED)
- return false;
- return true;
- };
- }
+ struct inotify_event_info *old, *new;
+
+ if (old_fsn->mask & FS_IN_IGNORED)
+ return false;
+ old = INOTIFY_E(old_fsn);
+ new = INOTIFY_E(new_fsn);
+ if ((old_fsn->mask == new_fsn->mask) &&
+ (old_fsn->inode == new_fsn->inode) &&
+ (old->name_len == new->name_len) &&
+ (!old->name_len || !strcmp(old->name, new->name)))
+ return true;
return false;
}
-static struct fsnotify_event *inotify_merge(struct list_head *list,
- struct fsnotify_event *event)
+static int inotify_merge(struct list_head *list,
+ struct fsnotify_event *event)
{
- struct fsnotify_event_holder *last_holder;
struct fsnotify_event *last_event;
- /* and the list better be locked by something too */
- spin_lock(&event->lock);
-
- last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
- last_event = last_holder->event;
- if (event_compare(last_event, event))
- fsnotify_get_event(last_event);
- else
- last_event = NULL;
-
- spin_unlock(&event->lock);
-
- return last_event;
+ last_event = list_entry(list->prev, struct fsnotify_event, list);
+ return event_compare(last_event, event);
}
-static int inotify_handle_event(struct fsnotify_group *group,
- struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *vfsmount_mark,
- struct fsnotify_event *event)
+int inotify_handle_event(struct fsnotify_group *group,
+ struct inode *inode,
+ struct fsnotify_mark *inode_mark,
+ struct fsnotify_mark *vfsmount_mark,
+ u32 mask, void *data, int data_type,
+ const unsigned char *file_name, u32 cookie)
{
struct inotify_inode_mark *i_mark;
- struct inode *to_tell;
- struct inotify_event_private_data *event_priv;
- struct fsnotify_event_private_data *fsn_event_priv;
- struct fsnotify_event *added_event;
- int wd, ret = 0;
+ struct inotify_event_info *event;
+ struct fsnotify_event *fsn_event;
+ int ret;
+ int len = 0;
+ int alloc_len = sizeof(struct inotify_event_info);
BUG_ON(vfsmount_mark);
- pr_debug("%s: group=%p event=%p to_tell=%p mask=%x\n", __func__, group,
- event, event->to_tell, event->mask);
+ if ((inode_mark->mask & FS_EXCL_UNLINK) &&
+ (data_type == FSNOTIFY_EVENT_PATH)) {
+ struct path *path = data;
+
+ if (d_unlinked(path->dentry))
+ return 0;
+ }
+ if (file_name) {
+ len = strlen(file_name);
+ alloc_len += len + 1;
+ }
- to_tell = event->to_tell;
+ pr_debug("%s: group=%p inode=%p mask=%x\n", __func__, group, inode,
+ mask);
i_mark = container_of(inode_mark, struct inotify_inode_mark,
fsn_mark);
- wd = i_mark->wd;
- event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
- if (unlikely(!event_priv))
+ event = kmalloc(alloc_len, GFP_KERNEL);
+ if (unlikely(!event))
return -ENOMEM;
- fsn_event_priv = &event_priv->fsnotify_event_priv_data;
-
- fsnotify_get_group(group);
- fsn_event_priv->group = group;
- event_priv->wd = wd;
-
- added_event = fsnotify_add_notify_event(group, event, fsn_event_priv, inotify_merge);
- if (added_event) {
- inotify_free_event_priv(fsn_event_priv);
- if (!IS_ERR(added_event))
- fsnotify_put_event(added_event);
- else
- ret = PTR_ERR(added_event);
+ fsn_event = &event->fse;
+ fsnotify_init_event(fsn_event, inode, mask);
+ event->wd = i_mark->wd;
+ event->sync_cookie = cookie;
+ event->name_len = len;
+ if (len)
+ strcpy(event->name, file_name);
+
+ ret = fsnotify_add_notify_event(group, fsn_event, inotify_merge);
+ if (ret) {
+ /* Our event wasn't used in the end. Free it. */
+ fsnotify_destroy_event(group, fsn_event);
}
if (inode_mark->mask & IN_ONESHOT)
fsnotify_destroy_mark(inode_mark, group);
- return ret;
+ return 0;
}
static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group)
@@ -142,22 +125,6 @@ static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify
inotify_ignored_and_remove_idr(fsn_mark, group);
}
-static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode,
- struct fsnotify_mark *inode_mark,
- struct fsnotify_mark *vfsmount_mark,
- __u32 mask, void *data, int data_type)
-{
- if ((inode_mark->mask & FS_EXCL_UNLINK) &&
- (data_type == FSNOTIFY_EVENT_PATH)) {
- struct path *path = data;
-
- if (d_unlinked(path->dentry))
- return false;
- }
-
- return true;
-}
-
/*
* This is NEVER supposed to be called. Inotify marks should either have been
* removed from the idr when the watch was removed or in the
@@ -202,22 +169,14 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
free_uid(group->inotify_data.user);
}
-void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
+static void inotify_free_event(struct fsnotify_event *fsn_event)
{
- struct inotify_event_private_data *event_priv;
-
-
- event_priv = container_of(fsn_event_priv, struct inotify_event_private_data,
- fsnotify_event_priv_data);
-
- fsnotify_put_group(fsn_event_priv->group);
- kmem_cache_free(event_priv_cachep, event_priv);
+ kfree(INOTIFY_E(fsn_event));
}
const struct fsnotify_ops inotify_fsnotify_ops = {
.handle_event = inotify_handle_event,
- .should_send_event = inotify_should_send_event,
.free_group_priv = inotify_free_group_priv,
- .free_event_priv = inotify_free_event_priv,
+ .free_event = inotify_free_event,
.freeing_mark = inotify_freeing_mark,
};
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 60f954a891a..cc423a30a0c 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -50,7 +50,6 @@ static int inotify_max_queued_events __read_mostly;
static int inotify_max_user_watches __read_mostly;
static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
-struct kmem_cache *event_priv_cachep __read_mostly;
#ifdef CONFIG_SYSCTL
@@ -58,7 +57,7 @@ struct kmem_cache *event_priv_cachep __read_mostly;
static int zero;
-ctl_table inotify_table[] = {
+struct ctl_table inotify_table[] = {
{
.procname = "max_user_instances",
.data = &inotify_max_user_instances,
@@ -124,6 +123,16 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
return ret;
}
+static int round_event_name_len(struct fsnotify_event *fsn_event)
+{
+ struct inotify_event_info *event;
+
+ event = INOTIFY_E(fsn_event);
+ if (!event->name_len)
+ return 0;
+ return roundup(event->name_len + 1, sizeof(struct inotify_event));
+}
+
/*
* Get an inotify_kernel_event if one exists and is small
* enough to fit in "count". Return an error pointer if
@@ -144,9 +153,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
pr_debug("%s: group=%p event=%p\n", __func__, group, event);
- if (event->name_len)
- event_size += roundup(event->name_len + 1, event_size);
-
+ event_size += round_event_name_len(event);
if (event_size > count)
return ERR_PTR(-EINVAL);
@@ -164,40 +171,27 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
* buffer we had in "get_one_event()" above.
*/
static ssize_t copy_event_to_user(struct fsnotify_group *group,
- struct fsnotify_event *event,
+ struct fsnotify_event *fsn_event,
char __user *buf)
{
struct inotify_event inotify_event;
- struct fsnotify_event_private_data *fsn_priv;
- struct inotify_event_private_data *priv;
+ struct inotify_event_info *event;
size_t event_size = sizeof(struct inotify_event);
- size_t name_len = 0;
+ size_t name_len;
+ size_t pad_name_len;
- pr_debug("%s: group=%p event=%p\n", __func__, group, event);
-
- /* we get the inotify watch descriptor from the event private data */
- spin_lock(&event->lock);
- fsn_priv = fsnotify_remove_priv_from_event(group, event);
- spin_unlock(&event->lock);
-
- if (!fsn_priv)
- inotify_event.wd = -1;
- else {
- priv = container_of(fsn_priv, struct inotify_event_private_data,
- fsnotify_event_priv_data);
- inotify_event.wd = priv->wd;
- inotify_free_event_priv(fsn_priv);
- }
+ pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event);
+ event = INOTIFY_E(fsn_event);
+ name_len = event->name_len;
/*
- * round up event->name_len so it is a multiple of event_size
+ * round up name length so it is a multiple of event_size
* plus an extra byte for the terminating '\0'.
*/
- if (event->name_len)
- name_len = roundup(event->name_len + 1, event_size);
- inotify_event.len = name_len;
-
- inotify_event.mask = inotify_mask_to_arg(event->mask);
+ pad_name_len = round_event_name_len(fsn_event);
+ inotify_event.len = pad_name_len;
+ inotify_event.mask = inotify_mask_to_arg(fsn_event->mask);
+ inotify_event.wd = event->wd;
inotify_event.cookie = event->sync_cookie;
/* send the main event */
@@ -209,20 +203,18 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
/*
* fsnotify only stores the pathname, so here we have to send the pathname
* and then pad that pathname out to a multiple of sizeof(inotify_event)
- * with zeros. I get my zeros from the nul_inotify_event.
+ * with zeros.
*/
- if (name_len) {
- unsigned int len_to_zero = name_len - event->name_len;
+ if (pad_name_len) {
/* copy the path name */
- if (copy_to_user(buf, event->file_name, event->name_len))
+ if (copy_to_user(buf, event->name, name_len))
return -EFAULT;
- buf += event->name_len;
+ buf += name_len;
/* fill userspace with 0's */
- if (clear_user(buf, len_to_zero))
+ if (clear_user(buf, pad_name_len - name_len))
return -EFAULT;
- buf += len_to_zero;
- event_size += name_len;
+ event_size += pad_name_len;
}
return event_size;
@@ -254,7 +246,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
if (IS_ERR(kevent))
break;
ret = copy_event_to_user(group, kevent, buf);
- fsnotify_put_event(kevent);
+ fsnotify_destroy_event(group, kevent);
if (ret < 0)
break;
buf += ret;
@@ -297,8 +289,7 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
struct fsnotify_group *group;
- struct fsnotify_event_holder *holder;
- struct fsnotify_event *event;
+ struct fsnotify_event *fsn_event;
void __user *p;
int ret = -ENOTTY;
size_t send_len = 0;
@@ -311,12 +302,10 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
switch (cmd) {
case FIONREAD:
mutex_lock(&group->notification_mutex);
- list_for_each_entry(holder, &group->notification_list, event_list) {
- event = holder->event;
+ list_for_each_entry(fsn_event, &group->notification_list,
+ list) {
send_len += sizeof(struct inotify_event);
- if (event->name_len)
- send_len += roundup(event->name_len + 1,
- sizeof(struct inotify_event));
+ send_len += round_event_name_len(fsn_event);
}
mutex_unlock(&group->notification_mutex);
ret = put_user(send_len, (int __user *) p);
@@ -503,43 +492,12 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
struct fsnotify_group *group)
{
struct inotify_inode_mark *i_mark;
- struct fsnotify_event *ignored_event, *notify_event;
- struct inotify_event_private_data *event_priv;
- struct fsnotify_event_private_data *fsn_event_priv;
- int ret;
-
- i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
-
- ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
- FSNOTIFY_EVENT_NONE, NULL, 0,
- GFP_NOFS);
- if (!ignored_event)
- goto skip_send_ignore;
-
- event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
- if (unlikely(!event_priv))
- goto skip_send_ignore;
-
- fsn_event_priv = &event_priv->fsnotify_event_priv_data;
-
- fsnotify_get_group(group);
- fsn_event_priv->group = group;
- event_priv->wd = i_mark->wd;
-
- notify_event = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv, NULL);
- if (notify_event) {
- if (IS_ERR(notify_event))
- ret = PTR_ERR(notify_event);
- else
- fsnotify_put_event(notify_event);
- inotify_free_event_priv(fsn_event_priv);
- }
-skip_send_ignore:
- /* matches the reference taken when the event was created */
- if (ignored_event)
- fsnotify_put_event(ignored_event);
+ /* Queue ignore event for the watch */
+ inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED,
+ NULL, FSNOTIFY_EVENT_NONE, NULL, 0);
+ i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
/* remove this mark from the idr */
inotify_remove_from_idr(group, i_mark);
@@ -675,11 +633,23 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
static struct fsnotify_group *inotify_new_group(unsigned int max_events)
{
struct fsnotify_group *group;
+ struct inotify_event_info *oevent;
group = fsnotify_alloc_group(&inotify_fsnotify_ops);
if (IS_ERR(group))
return group;
+ oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL);
+ if (unlikely(!oevent)) {
+ fsnotify_destroy_group(group);
+ return ERR_PTR(-ENOMEM);
+ }
+ group->overflow_event = &oevent->fse;
+ fsnotify_init_event(group->overflow_event, NULL, FS_Q_OVERFLOW);
+ oevent->wd = -1;
+ oevent->sync_cookie = 0;
+ oevent->name_len = 0;
+
group->max_events = max_events;
spin_lock_init(&group->inotify_data.idr_lock);
@@ -836,7 +806,6 @@ static int __init inotify_user_setup(void)
BUG_ON(hweight32(ALL_INOTIFY_BITS) != 21);
inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
- event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
inotify_max_queued_events = 16384;
inotify_max_user_instances = 128;
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 923fe4a5f50..d90deaa08e7 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -340,7 +340,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
static int fsnotify_mark_destroy(void *ignored)
{
struct fsnotify_mark *mark, *next;
- LIST_HEAD(private_destroy_list);
+ struct list_head private_destroy_list;
for (;;) {
spin_lock(&destroy_lock);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 7b51b05f160..1e58402171a 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -48,15 +48,6 @@
#include <linux/fsnotify_backend.h>
#include "fsnotify.h"
-static struct kmem_cache *fsnotify_event_cachep;
-static struct kmem_cache *fsnotify_event_holder_cachep;
-/*
- * This is a magic event we send when the q is too full. Since it doesn't
- * hold real event information we just keep one system wide and use it any time
- * it is needed. It's refcnt is set 1 at kernel init time and will never
- * get set to 0 so it will never get 'freed'
- */
-static struct fsnotify_event *q_overflow_event;
static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);
/**
@@ -76,186 +67,82 @@ bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
return list_empty(&group->notification_list) ? true : false;
}
-void fsnotify_get_event(struct fsnotify_event *event)
+void fsnotify_destroy_event(struct fsnotify_group *group,
+ struct fsnotify_event *event)
{
- atomic_inc(&event->refcnt);
-}
-
-void fsnotify_put_event(struct fsnotify_event *event)
-{
- if (!event)
+ /* Overflow events are per-group and we don't want to free them */
+ if (!event || event->mask == FS_Q_OVERFLOW)
return;
- if (atomic_dec_and_test(&event->refcnt)) {
- pr_debug("%s: event=%p\n", __func__, event);
-
- if (event->data_type == FSNOTIFY_EVENT_PATH)
- path_put(&event->path);
-
- BUG_ON(!list_empty(&event->private_data_list));
-
- kfree(event->file_name);
- put_pid(event->tgid);
- kmem_cache_free(fsnotify_event_cachep, event);
- }
-}
-
-struct fsnotify_event_holder *fsnotify_alloc_event_holder(void)
-{
- return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL);
-}
-
-void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
-{
- if (holder)
- kmem_cache_free(fsnotify_event_holder_cachep, holder);
-}
-
-/*
- * Find the private data that the group previously attached to this event when
- * the group added the event to the notification queue (fsnotify_add_notify_event)
- */
-struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event)
-{
- struct fsnotify_event_private_data *lpriv;
- struct fsnotify_event_private_data *priv = NULL;
-
- assert_spin_locked(&event->lock);
-
- list_for_each_entry(lpriv, &event->private_data_list, event_list) {
- if (lpriv->group == group) {
- priv = lpriv;
- list_del(&priv->event_list);
- break;
- }
- }
- return priv;
+ group->ops->free_event(event);
}
/*
* Add an event to the group notification queue. The group can later pull this
- * event off the queue to deal with. If the event is successfully added to the
- * group's notification queue, a reference is taken on event.
+ * event off the queue to deal with. The function returns 0 if the event was
+ * added to the queue, 1 if the event was merged with some other queued event,
+ * 2 if the queue of events has overflown.
*/
-struct fsnotify_event *fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
- struct fsnotify_event_private_data *priv,
- struct fsnotify_event *(*merge)(struct list_head *,
- struct fsnotify_event *))
+int fsnotify_add_notify_event(struct fsnotify_group *group,
+ struct fsnotify_event *event,
+ int (*merge)(struct list_head *,
+ struct fsnotify_event *))
{
- struct fsnotify_event *return_event = NULL;
- struct fsnotify_event_holder *holder = NULL;
+ int ret = 0;
struct list_head *list = &group->notification_list;
- pr_debug("%s: group=%p event=%p priv=%p\n", __func__, group, event, priv);
-
- /*
- * There is one fsnotify_event_holder embedded inside each fsnotify_event.
- * Check if we expect to be able to use that holder. If not alloc a new
- * holder.
- * For the overflow event it's possible that something will use the in
- * event holder before we get the lock so we may need to jump back and
- * alloc a new holder, this can't happen for most events...
- */
- if (!list_empty(&event->holder.event_list)) {
-alloc_holder:
- holder = fsnotify_alloc_event_holder();
- if (!holder)
- return ERR_PTR(-ENOMEM);
- }
+ pr_debug("%s: group=%p event=%p\n", __func__, group, event);
mutex_lock(&group->notification_mutex);
if (group->q_len >= group->max_events) {
- event = q_overflow_event;
-
- /*
- * we need to return the overflow event
- * which means we need a ref
- */
- fsnotify_get_event(event);
- return_event = event;
-
- /* sorry, no private data on the overflow event */
- priv = NULL;
- }
-
- if (!list_empty(list) && merge) {
- struct fsnotify_event *tmp;
-
- tmp = merge(list, event);
- if (tmp) {
+ ret = 2;
+ /* Queue overflow event only if it isn't already queued */
+ if (!list_empty(&group->overflow_event->list)) {
mutex_unlock(&group->notification_mutex);
-
- if (return_event)
- fsnotify_put_event(return_event);
- if (holder != &event->holder)
- fsnotify_destroy_event_holder(holder);
- return tmp;
+ return ret;
}
+ event = group->overflow_event;
+ goto queue;
}
- spin_lock(&event->lock);
-
- if (list_empty(&event->holder.event_list)) {
- if (unlikely(holder))
- fsnotify_destroy_event_holder(holder);
- holder = &event->holder;
- } else if (unlikely(!holder)) {
- /* between the time we checked above and got the lock the in
- * event holder was used, go back and get a new one */
- spin_unlock(&event->lock);
- mutex_unlock(&group->notification_mutex);
-
- if (return_event) {
- fsnotify_put_event(return_event);
- return_event = NULL;
+ if (!list_empty(list) && merge) {
+ ret = merge(list, event);
+ if (ret) {
+ mutex_unlock(&group->notification_mutex);
+ return ret;
}
-
- goto alloc_holder;
}
+queue:
group->q_len++;
- holder->event = event;
-
- fsnotify_get_event(event);
- list_add_tail(&holder->event_list, list);
- if (priv)
- list_add_tail(&priv->event_list, &event->private_data_list);
- spin_unlock(&event->lock);
+ list_add_tail(&event->list, list);
mutex_unlock(&group->notification_mutex);
wake_up(&group->notification_waitq);
kill_fasync(&group->fsn_fa, SIGIO, POLL_IN);
- return return_event;
+ return ret;
}
/*
- * Remove and return the first event from the notification list. There is a
- * reference held on this event since it was on the list. It is the responsibility
- * of the caller to drop this reference.
+ * Remove and return the first event from the notification list. It is the
+ * responsibility of the caller to destroy the obtained event
*/
struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group)
{
struct fsnotify_event *event;
- struct fsnotify_event_holder *holder;
BUG_ON(!mutex_is_locked(&group->notification_mutex));
pr_debug("%s: group=%p\n", __func__, group);
- holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
-
- event = holder->event;
-
- spin_lock(&event->lock);
- holder->event = NULL;
- list_del_init(&holder->event_list);
- spin_unlock(&event->lock);
-
- /* event == holder means we are referenced through the in event holder */
- if (holder != &event->holder)
- fsnotify_destroy_event_holder(holder);
-
+ event = list_first_entry(&group->notification_list,
+ struct fsnotify_event, list);
+ /*
+ * We need to init list head for the case of overflow event so that
+ * check in fsnotify_add_notify_events() works
+ */
+ list_del_init(&event->list);
group->q_len--;
return event;
@@ -266,15 +153,10 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group
*/
struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
{
- struct fsnotify_event *event;
- struct fsnotify_event_holder *holder;
-
BUG_ON(!mutex_is_locked(&group->notification_mutex));
- holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
- event = holder->event;
-
- return event;
+ return list_first_entry(&group->notification_list,
+ struct fsnotify_event, list);
}
/*
@@ -284,181 +166,31 @@ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
void fsnotify_flush_notify(struct fsnotify_group *group)
{
struct fsnotify_event *event;
- struct fsnotify_event_private_data *priv;
mutex_lock(&group->notification_mutex);
while (!fsnotify_notify_queue_is_empty(group)) {
event = fsnotify_remove_notify_event(group);
- /* if they don't implement free_event_priv they better not have attached any */
- if (group->ops->free_event_priv) {
- spin_lock(&event->lock);
- priv = fsnotify_remove_priv_from_event(group, event);
- spin_unlock(&event->lock);
- if (priv)
- group->ops->free_event_priv(priv);
- }
- fsnotify_put_event(event); /* matches fsnotify_add_notify_event */
+ fsnotify_destroy_event(group, event);
}
mutex_unlock(&group->notification_mutex);
}
-static void initialize_event(struct fsnotify_event *event)
-{
- INIT_LIST_HEAD(&event->holder.event_list);
- atomic_set(&event->refcnt, 1);
-
- spin_lock_init(&event->lock);
-
- INIT_LIST_HEAD(&event->private_data_list);
-}
-
-/*
- * Caller damn well better be holding whatever mutex is protecting the
- * old_holder->event_list and the new_event must be a clean event which
- * cannot be found anywhere else in the kernel.
- */
-int fsnotify_replace_event(struct fsnotify_event_holder *old_holder,
- struct fsnotify_event *new_event)
-{
- struct fsnotify_event *old_event = old_holder->event;
- struct fsnotify_event_holder *new_holder = &new_event->holder;
-
- enum event_spinlock_class {
- SPINLOCK_OLD,
- SPINLOCK_NEW,
- };
-
- pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, new_event);
-
- /*
- * if the new_event's embedded holder is in use someone
- * screwed up and didn't give us a clean new event.
- */
- BUG_ON(!list_empty(&new_holder->event_list));
-
- spin_lock_nested(&old_event->lock, SPINLOCK_OLD);
- spin_lock_nested(&new_event->lock, SPINLOCK_NEW);
-
- new_holder->event = new_event;
- list_replace_init(&old_holder->event_list, &new_holder->event_list);
-
- spin_unlock(&new_event->lock);
- spin_unlock(&old_event->lock);
-
- /* event == holder means we are referenced through the in event holder */
- if (old_holder != &old_event->holder)
- fsnotify_destroy_event_holder(old_holder);
-
- fsnotify_get_event(new_event); /* on the list take reference */
- fsnotify_put_event(old_event); /* off the list, drop reference */
-
- return 0;
-}
-
-struct fsnotify_event *fsnotify_clone_event(struct fsnotify_event *old_event)
-{
- struct fsnotify_event *event;
-
- event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
- if (!event)
- return NULL;
-
- pr_debug("%s: old_event=%p new_event=%p\n", __func__, old_event, event);
-
- memcpy(event, old_event, sizeof(*event));
- initialize_event(event);
-
- if (event->name_len) {
- event->file_name = kstrdup(old_event->file_name, GFP_KERNEL);
- if (!event->file_name) {
- kmem_cache_free(fsnotify_event_cachep, event);
- return NULL;
- }
- }
- event->tgid = get_pid(old_event->tgid);
- if (event->data_type == FSNOTIFY_EVENT_PATH)
- path_get(&event->path);
-
- return event;
-}
-
/*
* fsnotify_create_event - Allocate a new event which will be sent to each
* group's handle_event function if the group was interested in this
* particular event.
*
- * @to_tell the inode which is supposed to receive the event (sometimes a
+ * @inode the inode which is supposed to receive the event (sometimes a
* parent of the inode to which the event happened.
* @mask what actually happened.
* @data pointer to the object which was actually affected
* @data_type flag indication if the data is a file, path, inode, nothing...
* @name the filename, if available
*/
-struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
- int data_type, const unsigned char *name,
- u32 cookie, gfp_t gfp)
+void fsnotify_init_event(struct fsnotify_event *event, struct inode *inode,
+ u32 mask)
{
- struct fsnotify_event *event;
-
- event = kmem_cache_zalloc(fsnotify_event_cachep, gfp);
- if (!event)
- return NULL;
-
- pr_debug("%s: event=%p to_tell=%p mask=%x data=%p data_type=%d\n",
- __func__, event, to_tell, mask, data, data_type);
-
- initialize_event(event);
-
- if (name) {
- event->file_name = kstrdup(name, gfp);
- if (!event->file_name) {
- kmem_cache_free(fsnotify_event_cachep, event);
- return NULL;
- }
- event->name_len = strlen(event->file_name);
- }
-
- event->tgid = get_pid(task_tgid(current));
- event->sync_cookie = cookie;
- event->to_tell = to_tell;
- event->data_type = data_type;
-
- switch (data_type) {
- case FSNOTIFY_EVENT_PATH: {
- struct path *path = data;
- event->path.dentry = path->dentry;
- event->path.mnt = path->mnt;
- path_get(&event->path);
- break;
- }
- case FSNOTIFY_EVENT_INODE:
- event->inode = data;
- break;
- case FSNOTIFY_EVENT_NONE:
- event->inode = NULL;
- event->path.dentry = NULL;
- event->path.mnt = NULL;
- break;
- default:
- BUG();
- }
-
+ INIT_LIST_HEAD(&event->list);
+ event->inode = inode;
event->mask = mask;
-
- return event;
-}
-
-static __init int fsnotify_notification_init(void)
-{
- fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
- fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
-
- q_overflow_event = fsnotify_create_event(NULL, FS_Q_OVERFLOW, NULL,
- FSNOTIFY_EVENT_NONE, NULL, 0,
- GFP_KERNEL);
- if (!q_overflow_event)
- panic("unable to allocate fsnotify q_overflow_event\n");
-
- return 0;
}
-subsys_initcall(fsnotify_notification_init);
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index a27e3fecefa..250ed5b20c8 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -1748,7 +1748,6 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
if (page) {
set_page_dirty(page);
unlock_page(page);
- mark_page_accessed(page);
page_cache_release(page);
}
ntfs_debug("Done.");
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index ee4144ce5d7..f82498c35e7 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -58,7 +58,7 @@ typedef enum {
/**
* ntfs_compression_buffer - one buffer for the decompression engine
*/
-static u8 *ntfs_compression_buffer = NULL;
+static u8 *ntfs_compression_buffer;
/**
* ntfs_cb_lock - spinlock which protects ntfs_compression_buffer
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c
index 807150e2c2b..dd6103cc93c 100644
--- a/fs/ntfs/debug.c
+++ b/fs/ntfs/debug.c
@@ -18,16 +18,9 @@
* distribution in the file COPYING); if not, write to the Free Software
* Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "debug.h"
-/*
- * A static buffer to hold the error string being displayed and a spinlock
- * to protect concurrent accesses to it.
- */
-static char err_buf[1024];
-static DEFINE_SPINLOCK(err_buf_lock);
-
/**
* __ntfs_warning - output a warning to the syslog
* @function: name of function outputting the warning
@@ -50,6 +43,7 @@ static DEFINE_SPINLOCK(err_buf_lock);
void __ntfs_warning(const char *function, const struct super_block *sb,
const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
int flen = 0;
@@ -59,17 +53,15 @@ void __ntfs_warning(const char *function, const struct super_block *sb,
#endif
if (function)
flen = strlen(function);
- spin_lock(&err_buf_lock);
va_start(args, fmt);
- vsnprintf(err_buf, sizeof(err_buf), fmt, args);
- va_end(args);
+ vaf.fmt = fmt;
+ vaf.va = &args;
if (sb)
- printk(KERN_ERR "NTFS-fs warning (device %s): %s(): %s\n",
- sb->s_id, flen ? function : "", err_buf);
+ pr_warn("(device %s): %s(): %pV\n",
+ sb->s_id, flen ? function : "", &vaf);
else
- printk(KERN_ERR "NTFS-fs warning: %s(): %s\n",
- flen ? function : "", err_buf);
- spin_unlock(&err_buf_lock);
+ pr_warn("%s(): %pV\n", flen ? function : "", &vaf);
+ va_end(args);
}
/**
@@ -94,6 +86,7 @@ void __ntfs_warning(const char *function, const struct super_block *sb,
void __ntfs_error(const char *function, const struct super_block *sb,
const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
int flen = 0;
@@ -103,17 +96,15 @@ void __ntfs_error(const char *function, const struct super_block *sb,
#endif
if (function)
flen = strlen(function);
- spin_lock(&err_buf_lock);
va_start(args, fmt);
- vsnprintf(err_buf, sizeof(err_buf), fmt, args);
- va_end(args);
+ vaf.fmt = fmt;
+ vaf.va = &args;
if (sb)
- printk(KERN_ERR "NTFS-fs error (device %s): %s(): %s\n",
- sb->s_id, flen ? function : "", err_buf);
+ pr_err("(device %s): %s(): %pV\n",
+ sb->s_id, flen ? function : "", &vaf);
else
- printk(KERN_ERR "NTFS-fs error: %s(): %s\n",
- flen ? function : "", err_buf);
- spin_unlock(&err_buf_lock);
+ pr_err("%s(): %pV\n", flen ? function : "", &vaf);
+ va_end(args);
}
#ifdef DEBUG
@@ -124,6 +115,7 @@ int debug_msgs = 0;
void __ntfs_debug (const char *file, int line, const char *function,
const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
int flen = 0;
@@ -131,13 +123,11 @@ void __ntfs_debug (const char *file, int line, const char *function,
return;
if (function)
flen = strlen(function);
- spin_lock(&err_buf_lock);
va_start(args, fmt);
- vsnprintf(err_buf, sizeof(err_buf), fmt, args);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf);
va_end(args);
- printk(KERN_DEBUG "NTFS-fs DEBUG (%s, %d): %s(): %s\n", file, line,
- flen ? function : "", err_buf);
- spin_unlock(&err_buf_lock);
}
/* Dump a runlist. Caller has to provide synchronisation for @rl. */
@@ -149,12 +139,12 @@ void ntfs_debug_dump_runlist(const runlist_element *rl)
if (!debug_msgs)
return;
- printk(KERN_DEBUG "NTFS-fs DEBUG: Dumping runlist (values in hex):\n");
+ pr_debug("Dumping runlist (values in hex):\n");
if (!rl) {
- printk(KERN_DEBUG "Run list not present.\n");
+ pr_debug("Run list not present.\n");
return;
}
- printk(KERN_DEBUG "VCN LCN Run length\n");
+ pr_debug("VCN LCN Run length\n");
for (i = 0; ; i++) {
LCN lcn = (rl + i)->lcn;
@@ -163,13 +153,13 @@ void ntfs_debug_dump_runlist(const runlist_element *rl)
if (index > -LCN_ENOENT - 1)
index = 3;
- printk(KERN_DEBUG "%-16Lx %s %-16Lx%s\n",
+ pr_debug("%-16Lx %s %-16Lx%s\n",
(long long)(rl + i)->vcn, lcn_str[index],
(long long)(rl + i)->length,
(rl + i)->length ? "" :
" (runlist end)");
} else
- printk(KERN_DEBUG "%-16Lx %-16Lx %-16Lx%s\n",
+ pr_debug("%-16Lx %-16Lx %-16Lx%s\n",
(long long)(rl + i)->vcn,
(long long)(rl + i)->lcn,
(long long)(rl + i)->length,
diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h
index 53c27eaf230..61bf091e32a 100644
--- a/fs/ntfs/debug.h
+++ b/fs/ntfs/debug.h
@@ -48,7 +48,12 @@ extern void ntfs_debug_dump_runlist(const runlist_element *rl);
#else /* !DEBUG */
-#define ntfs_debug(f, a...) do {} while (0)
+#define ntfs_debug(fmt, ...) \
+do { \
+ if (0) \
+ no_printk(fmt, ##__VA_ARGS__); \
+} while (0)
+
#define ntfs_debug_dump_runlist(rl) do {} while (0)
#endif /* !DEBUG */
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index ea4ba9daeb4..5c9e2c81cb1 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2060,7 +2060,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
}
do {
unlock_page(pages[--do_pages]);
- mark_page_accessed(pages[do_pages]);
page_cache_release(pages[do_pages]);
} while (do_pages);
if (unlikely(status))
@@ -2091,10 +2090,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
size_t count; /* after file limit checks */
ssize_t written, err;
- count = 0;
- err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
- if (err)
- return err;
+ count = iov_length(iov, nr_segs);
pos = *ppos;
/* We can write back this queue in page reclaim. */
current->backing_dev_info = mapping->backing_dev_info;
@@ -2134,7 +2130,7 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
mutex_unlock(&inode->i_mutex);
if (ret > 0) {
- int err = generic_write_sync(file, pos, ret);
+ int err = generic_write_sync(file, iocb->ki_pos - ret, ret);
if (err < 0)
ret = err;
}
@@ -2203,8 +2199,8 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
const struct file_operations ntfs_file_ops = {
.llseek = generic_file_llseek, /* Seek inside file. */
- .read = do_sync_read, /* Read from file. */
- .aio_read = generic_file_aio_read, /* Async read from file. */
+ .read = new_sync_read, /* Read from file. */
+ .read_iter = generic_file_read_iter, /* Async read from file. */
#ifdef NTFS_RW
.write = do_sync_write, /* Write to file. */
.aio_write = ntfs_file_aio_write, /* Async write to file. */
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 2778b0255dc..f47af5e6e23 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -55,7 +55,7 @@
*
* Return 1 if the attributes match and 0 if not.
*
- * NOTE: This function runs with the inode->i_lock spin lock held so it is not
+ * NOTE: This function runs with the inode_hash_lock spin lock held so it is not
* allowed to sleep.
*/
int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
@@ -1704,8 +1704,6 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
iput(bvi);
skip_large_index_stuff:
/* Setup the operations for this index inode. */
- vi->i_op = NULL;
- vi->i_fop = NULL;
vi->i_mapping->a_ops = &ntfs_mst_aops;
vi->i_blocks = ni->allocated_size >> 9;
/*
@@ -2259,7 +2257,7 @@ void ntfs_evict_big_inode(struct inode *vi)
{
ntfs_inode *ni = NTFS_I(vi);
- truncate_inode_pages(&vi->i_data, 0);
+ truncate_inode_pages_final(&vi->i_data);
clear_inode(vi);
#ifdef NTFS_RW
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 82650d52d91..6c3296e546c 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -19,6 +19,7 @@
* distribution in the file COPYING); if not, write to the Free Software
* Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/stddef.h>
#include <linux/init.h>
@@ -49,8 +50,8 @@
static unsigned long ntfs_nr_compression_users;
/* A global default upcase table and a corresponding reference count. */
-static ntfschar *default_upcase = NULL;
-static unsigned long ntfs_nr_upcase_users = 0;
+static ntfschar *default_upcase;
+static unsigned long ntfs_nr_upcase_users;
/* Error constants/strings used in inode.c::ntfs_show_options(). */
typedef enum {
@@ -468,6 +469,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
ntfs_debug("Entering with remount options string: %s", opt);
+ sync_filesystem(sb);
+
#ifndef NTFS_RW
/* For read-only compiled driver, enforce read-only flag. */
*flags |= MS_RDONLY;
@@ -1894,7 +1897,7 @@ get_ctx_vol_failed:
vol->minor_ver = vi->minor_ver;
ntfs_attr_put_search_ctx(ctx);
unmap_mft_record(NTFS_I(vol->vol_ino));
- printk(KERN_INFO "NTFS volume version %i.%i.\n", vol->major_ver,
+ pr_info("volume version %i.%i.\n", vol->major_ver,
vol->minor_ver);
if (vol->major_ver < 3 && NVolSparseEnabled(vol)) {
ntfs_warning(vol->sb, "Disabling sparse support due to NTFS "
@@ -3093,7 +3096,7 @@ static int __init init_ntfs_fs(void)
int err = 0;
/* This may be ugly but it results in pretty output so who cares. (-8 */
- printk(KERN_INFO "NTFS driver " NTFS_VERSION " [Flags: R/"
+ pr_info("driver " NTFS_VERSION " [Flags: R/"
#ifdef NTFS_RW
"W"
#else
@@ -3113,16 +3116,15 @@ static int __init init_ntfs_fs(void)
sizeof(ntfs_index_context), 0 /* offset */,
SLAB_HWCACHE_ALIGN, NULL /* ctor */);
if (!ntfs_index_ctx_cache) {
- printk(KERN_CRIT "NTFS: Failed to create %s!\n",
- ntfs_index_ctx_cache_name);
+ pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name);
goto ictx_err_out;
}
ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name,
sizeof(ntfs_attr_search_ctx), 0 /* offset */,
SLAB_HWCACHE_ALIGN, NULL /* ctor */);
if (!ntfs_attr_ctx_cache) {
- printk(KERN_CRIT "NTFS: Failed to create %s!\n",
- ntfs_attr_ctx_cache_name);
+ pr_crit("NTFS: Failed to create %s!\n",
+ ntfs_attr_ctx_cache_name);
goto actx_err_out;
}
@@ -3130,8 +3132,7 @@ static int __init init_ntfs_fs(void)
(NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0,
SLAB_HWCACHE_ALIGN, NULL);
if (!ntfs_name_cache) {
- printk(KERN_CRIT "NTFS: Failed to create %s!\n",
- ntfs_name_cache_name);
+ pr_crit("Failed to create %s!\n", ntfs_name_cache_name);
goto name_err_out;
}
@@ -3139,8 +3140,7 @@ static int __init init_ntfs_fs(void)
sizeof(ntfs_inode), 0,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
if (!ntfs_inode_cache) {
- printk(KERN_CRIT "NTFS: Failed to create %s!\n",
- ntfs_inode_cache_name);
+ pr_crit("Failed to create %s!\n", ntfs_inode_cache_name);
goto inode_err_out;
}
@@ -3149,15 +3149,14 @@ static int __init init_ntfs_fs(void)
SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
ntfs_big_inode_init_once);
if (!ntfs_big_inode_cache) {
- printk(KERN_CRIT "NTFS: Failed to create %s!\n",
- ntfs_big_inode_cache_name);
+ pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name);
goto big_inode_err_out;
}
/* Register the ntfs sysctls. */
err = ntfs_sysctl(1);
if (err) {
- printk(KERN_CRIT "NTFS: Failed to register NTFS sysctls!\n");
+ pr_crit("Failed to register NTFS sysctls!\n");
goto sysctl_err_out;
}
@@ -3166,7 +3165,7 @@ static int __init init_ntfs_fs(void)
ntfs_debug("NTFS driver registered successfully.");
return 0; /* Success! */
}
- printk(KERN_CRIT "NTFS: Failed to register NTFS filesystem driver!\n");
+ pr_crit("Failed to register NTFS filesystem driver!\n");
/* Unregister the ntfs sysctls. */
ntfs_sysctl(0);
@@ -3182,8 +3181,7 @@ actx_err_out:
kmem_cache_destroy(ntfs_index_ctx_cache);
ictx_err_out:
if (!err) {
- printk(KERN_CRIT "NTFS: Aborting NTFS filesystem driver "
- "registration...\n");
+ pr_crit("Aborting NTFS filesystem driver registration...\n");
err = -ENOMEM;
}
return err;
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
index 79a89184cb5..a503156ec15 100644
--- a/fs/ntfs/sysctl.c
+++ b/fs/ntfs/sysctl.c
@@ -34,7 +34,7 @@
#include "debug.h"
/* Definition of the ntfs sysctl. */
-static ctl_table ntfs_sysctls[] = {
+static struct ctl_table ntfs_sysctls[] = {
{
.procname = "ntfs-debug",
.data = &debug_msgs, /* Data pointer and size. */
@@ -46,7 +46,7 @@ static ctl_table ntfs_sysctls[] = {
};
/* Define the parent directory /proc/sys/fs. */
-static ctl_table sysctls_root[] = {
+static struct ctl_table sysctls_root[] = {
{
.procname = "fs",
.mode = 0555,
@@ -56,7 +56,7 @@ static ctl_table sysctls_root[] = {
};
/* Storage for the sysctls header. */
-static struct ctl_table_header *sysctls_root_table = NULL;
+static struct ctl_table_header *sysctls_root_table;
/**
* ntfs_sysctl - add or remove the debug sysctl
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index f17e58b3298..ce210d4951a 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -38,7 +38,6 @@ ocfs2-objs := \
symlink.o \
sysfile.o \
uptodate.o \
- ver.o \
quota_local.o \
quota_global.o \
xattr.o \
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index b4f788e0ca3..7e8282dcea2 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -160,36 +160,6 @@ static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
return acl;
}
-
-/*
- * Get posix acl.
- */
-static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
-{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct buffer_head *di_bh = NULL;
- struct posix_acl *acl;
- int ret;
-
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return NULL;
-
- ret = ocfs2_inode_lock(inode, &di_bh, 0);
- if (ret < 0) {
- mlog_errno(ret);
- acl = ERR_PTR(ret);
- return acl;
- }
-
- acl = ocfs2_get_acl_nolock(inode, type, di_bh);
-
- ocfs2_inode_unlock(inode, 0);
-
- brelse(di_bh);
-
- return acl;
-}
-
/*
* Helper function to set i_mode in memory and disk. Some call paths
* will not have di_bh or a journal handle to pass, in which case it
@@ -235,6 +205,7 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh,
di->i_mode = cpu_to_le16(inode->i_mode);
di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, di_bh);
@@ -250,7 +221,7 @@ out:
/*
* Set the access or default ACL of an inode.
*/
-static int ocfs2_set_acl(handle_t *handle,
+int ocfs2_set_acl(handle_t *handle,
struct inode *inode,
struct buffer_head *di_bh,
int type,
@@ -313,6 +284,11 @@ static int ocfs2_set_acl(handle_t *handle,
return ret;
}
+int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+ return ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
+}
+
struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
{
struct ocfs2_super *osb;
@@ -334,200 +310,3 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
return acl;
}
-
-int ocfs2_acl_chmod(struct inode *inode)
-{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct posix_acl *acl;
- int ret;
-
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return 0;
-
- acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
- ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
- if (ret)
- return ret;
- ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
- acl, NULL, NULL);
- posix_acl_release(acl);
- return ret;
-}
-
-/*
- * Initialize the ACLs of a new inode. If parent directory has default ACL,
- * then clone to new inode. Called from ocfs2_mknod.
- */
-int ocfs2_init_acl(handle_t *handle,
- struct inode *inode,
- struct inode *dir,
- struct buffer_head *di_bh,
- struct buffer_head *dir_bh,
- struct ocfs2_alloc_context *meta_ac,
- struct ocfs2_alloc_context *data_ac)
-{
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct posix_acl *acl = NULL;
- int ret = 0, ret2;
- umode_t mode;
-
- if (!S_ISLNK(inode->i_mode)) {
- if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
- acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
- dir_bh);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- }
- if (!acl) {
- mode = inode->i_mode & ~current_umask();
- ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
- if (ret) {
- mlog_errno(ret);
- goto cleanup;
- }
- }
- }
- if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
- if (S_ISDIR(inode->i_mode)) {
- ret = ocfs2_set_acl(handle, inode, di_bh,
- ACL_TYPE_DEFAULT, acl,
- meta_ac, data_ac);
- if (ret)
- goto cleanup;
- }
- mode = inode->i_mode;
- ret = posix_acl_create(&acl, GFP_NOFS, &mode);
- if (ret < 0)
- return ret;
-
- ret2 = ocfs2_acl_set_mode(inode, di_bh, handle, mode);
- if (ret2) {
- mlog_errno(ret2);
- ret = ret2;
- goto cleanup;
- }
- if (ret > 0) {
- ret = ocfs2_set_acl(handle, inode,
- di_bh, ACL_TYPE_ACCESS,
- acl, meta_ac, data_ac);
- }
- }
-cleanup:
- posix_acl_release(acl);
- return ret;
-}
-
-static size_t ocfs2_xattr_list_acl_access(struct dentry *dentry,
- char *list,
- size_t list_len,
- const char *name,
- size_t name_len,
- int type)
-{
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
-
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return 0;
-
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
- return size;
-}
-
-static size_t ocfs2_xattr_list_acl_default(struct dentry *dentry,
- char *list,
- size_t list_len,
- const char *name,
- size_t name_len,
- int type)
-{
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
-
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return 0;
-
- if (list && size <= list_len)
- memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
- return size;
-}
-
-static int ocfs2_xattr_get_acl(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
- struct posix_acl *acl;
- int ret;
-
- if (strcmp(name, "") != 0)
- return -EINVAL;
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return -EOPNOTSUPP;
-
- acl = ocfs2_get_acl(dentry->d_inode, type);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
- ret = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- posix_acl_release(acl);
-
- return ret;
-}
-
-static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
-{
- struct inode *inode = dentry->d_inode;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct posix_acl *acl;
- int ret = 0;
-
- if (strcmp(name, "") != 0)
- return -EINVAL;
- if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
- return -EOPNOTSUPP;
-
- if (!inode_owner_or_capable(inode))
- return -EPERM;
-
- if (value) {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- else if (acl) {
- ret = posix_acl_valid(acl);
- if (ret)
- goto cleanup;
- }
- } else
- acl = NULL;
-
- ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL);
-
-cleanup:
- posix_acl_release(acl);
- return ret;
-}
-
-const struct xattr_handler ocfs2_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .list = ocfs2_xattr_list_acl_access,
- .get = ocfs2_xattr_get_acl,
- .set = ocfs2_xattr_set_acl,
-};
-
-const struct xattr_handler ocfs2_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .list = ocfs2_xattr_list_acl_default,
- .get = ocfs2_xattr_get_acl,
- .set = ocfs2_xattr_set_acl,
-};
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 071fbd380f2..3fce68d0862 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -27,10 +27,13 @@ struct ocfs2_acl_entry {
};
struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type);
-extern int ocfs2_acl_chmod(struct inode *);
-extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
- struct buffer_head *, struct buffer_head *,
- struct ocfs2_alloc_context *,
- struct ocfs2_alloc_context *);
+int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+int ocfs2_set_acl(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *di_bh,
+ int type,
+ struct posix_acl *acl,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac);
#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 17e6bdde96c..9d8fcf2f3b9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1025,7 +1025,7 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
for(i = count; i < (num_got + count); i++) {
bhs[i] = sb_getblk(osb->sb, first_blkno);
if (bhs[i] == NULL) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto bail;
}
@@ -4742,6 +4742,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
enum ocfs2_alloc_restarted *reason_ret)
{
int status = 0, err = 0;
+ int need_free = 0;
int free_extents;
enum ocfs2_alloc_restarted reason = RESTART_NONE;
u32 bit_off, num_bits;
@@ -4796,7 +4797,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ need_free = 1;
+ goto bail;
}
block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
@@ -4807,7 +4809,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
num_bits, flags, meta_ac);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ need_free = 1;
+ goto bail;
}
ocfs2_journal_dirty(handle, et->et_root_bh);
@@ -4821,6 +4824,19 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
reason = RESTART_TRANS;
}
+bail:
+ if (need_free) {
+ if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+ ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+ bit_off, num_bits);
+ else
+ ocfs2_free_clusters(handle,
+ data_ac->ac_inode,
+ data_ac->ac_bh,
+ ocfs2_clusters_to_blocks(osb->sb, bit_off),
+ num_bits);
+ }
+
leave:
if (reason_ret)
*reason_ret = reason;
@@ -5712,6 +5728,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
}
ocfs2_et_update_clusters(et, -len);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, et->et_root_bh);
@@ -6029,7 +6046,8 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
int cancel)
{
- if (osb->osb_tl_inode) {
+ if (osb->osb_tl_inode &&
+ atomic_read(&osb->osb_tl_disable) == 0) {
/* We want to push off log flushes while truncates are
* still running. */
if (cancel)
@@ -6206,6 +6224,8 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
int status;
struct inode *tl_inode = osb->osb_tl_inode;
+ atomic_set(&osb->osb_tl_disable, 1);
+
if (tl_inode) {
cancel_delayed_work(&osb->osb_truncate_log_wq);
flush_workqueue(ocfs2_wq);
@@ -6237,6 +6257,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
* until we're sure all is well. */
INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
ocfs2_truncate_log_worker);
+ atomic_set(&osb->osb_tl_disable, 0);
osb->osb_tl_bh = tl_bh;
osb->osb_tl_inode = tl_inode;
@@ -6805,6 +6826,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
struct buffer_head *di_bh)
{
int ret, i, has_data, num_pages = 0;
+ int need_free = 0;
+ u32 bit_off, num;
handle_t *handle;
u64 uninitialized_var(block);
struct ocfs2_inode_info *oi = OCFS2_I(inode);
@@ -6850,7 +6873,6 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
}
if (has_data) {
- u32 bit_off, num;
unsigned int page_end;
u64 phys;
@@ -6886,6 +6908,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
if (ret) {
mlog_errno(ret);
+ need_free = 1;
goto out_commit;
}
@@ -6896,6 +6919,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
if (ret) {
mlog_errno(ret);
+ need_free = 1;
goto out_commit;
}
@@ -6913,6 +6937,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
spin_unlock(&oi->ip_lock);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_dinode_new_extent_list(inode, di);
ocfs2_journal_dirty(handle, di_bh);
@@ -6927,6 +6952,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
if (ret) {
mlog_errno(ret);
+ need_free = 1;
goto out_commit;
}
@@ -6938,6 +6964,18 @@ out_commit:
dquot_free_space_nodirty(inode,
ocfs2_clusters_to_bytes(osb->sb, 1));
+ if (need_free) {
+ if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+ ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+ bit_off, num);
+ else
+ ocfs2_free_clusters(handle,
+ data_ac->ac_inode,
+ data_ac->ac_bh,
+ ocfs2_clusters_to_blocks(osb->sb, bit_off),
+ num);
+ }
+
ocfs2_commit_trans(osb, handle);
out_unlock:
@@ -7126,7 +7164,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
if (end > i_size_read(inode))
end = i_size_read(inode);
- BUG_ON(start >= end);
+ BUG_ON(start > end);
if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
!(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
@@ -7176,6 +7214,7 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, di_bh);
out_commit:
@@ -7260,14 +7299,8 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
start = range->start >> osb->s_clustersize_bits;
len = range->len >> osb->s_clustersize_bits;
minlen = range->minlen >> osb->s_clustersize_bits;
- trimmed = 0;
- if (!len) {
- range->len = 0;
- return 0;
- }
-
- if (minlen >= osb->bitmap_cpg)
+ if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize)
return -EINVAL;
main_bm_inode = ocfs2_get_system_file_inode(osb,
@@ -7293,6 +7326,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
goto out_unlock;
}
+ len = range->len >> osb->s_clustersize_bits;
if (start + len > le32_to_cpu(main_bm->i_clusters))
len = le32_to_cpu(main_bm->i_clusters) - start;
@@ -7307,6 +7341,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1);
last_bit = osb->bitmap_cpg;
+ trimmed = 0;
for (group = first_group; group <= last_group;) {
if (first_bit + len >= osb->bitmap_cpg)
last_bit = osb->bitmap_cpg;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f37d3c0e205..4a231a166cf 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -80,6 +80,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
le32_to_cpu(fe->i_clusters))) {
+ err = -ENOMEM;
mlog(ML_ERROR, "block offset is outside the allocated size: "
"%llu\n", (unsigned long long)iblock);
goto bail;
@@ -92,6 +93,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
iblock;
buffer_cache_bh = sb_getblk(osb->sb, blkno);
if (!buffer_cache_bh) {
+ err = -ENOMEM;
mlog(ML_ERROR, "couldn't getblock for symlink!\n");
goto bail;
}
@@ -569,7 +571,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
{
struct inode *inode = file_inode(iocb->ki_filp);
int level;
- wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
/* this io's submitter should not have unlocked this before we could */
BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
@@ -580,10 +581,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
if (ocfs2_iocb_is_unaligned_aio(iocb)) {
ocfs2_iocb_clear_unaligned_aio(iocb);
- if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) &&
- waitqueue_active(wq)) {
- wake_up_all(wq);
- }
+ mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
}
ocfs2_iocb_clear_rw_locked(iocb);
@@ -592,33 +590,17 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
ocfs2_rw_unlock(inode, level);
}
-/*
- * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen
- * from ext3. PageChecked() bits have been removed as OCFS2 does not
- * do journalled data.
- */
-static void ocfs2_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
-{
- journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
-
- jbd2_journal_invalidatepage(journal, page, offset, length);
-}
-
static int ocfs2_releasepage(struct page *page, gfp_t wait)
{
- journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal;
-
if (!page_has_buffers(page))
return 0;
- return jbd2_journal_try_to_free_buffers(journal, page, wait);
+ return try_to_free_buffers(page);
}
static ssize_t ocfs2_direct_IO(int rw,
struct kiocb *iocb,
- const struct iovec *iov,
- loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file)->i_mapping->host;
@@ -635,7 +617,7 @@ static ssize_t ocfs2_direct_IO(int rw,
return 0;
return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
- iov, offset, nr_segs,
+ iter, offset,
ocfs2_direct_IO_get_blocks,
ocfs2_dio_end_io, NULL, 0);
}
@@ -1802,8 +1784,7 @@ try_again:
data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
credits = ocfs2_calc_extend_credits(inode->i_sb,
- &di->id2.i_list,
- clusters_to_alloc);
+ &di->id2.i_list);
}
@@ -1897,10 +1878,14 @@ out_commit:
out:
ocfs2_free_write_ctxt(wc);
- if (data_ac)
+ if (data_ac) {
ocfs2_free_alloc_context(data_ac);
- if (meta_ac)
+ data_ac = NULL;
+ }
+ if (meta_ac) {
ocfs2_free_alloc_context(meta_ac);
+ meta_ac = NULL;
+ }
if (ret == -ENOSPC && try_free) {
/*
@@ -2053,6 +2038,7 @@ out_write_size:
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, wc->w_di_bh);
ocfs2_commit_trans(osb, handle);
@@ -2087,7 +2073,7 @@ const struct address_space_operations ocfs2_aops = {
.write_end = ocfs2_write_end,
.bmap = ocfs2_bmap,
.direct_IO = ocfs2_direct_IO,
- .invalidatepage = ocfs2_invalidatepage,
+ .invalidatepage = block_invalidatepage,
.releasepage = ocfs2_releasepage,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index f671e49beb3..6cae155d54d 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -102,9 +102,4 @@ enum ocfs2_iocb_lock_bits {
#define ocfs2_iocb_is_unaligned_aio(iocb) \
test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
-#define OCFS2_IOEND_WQ_HASH_SZ 37
-#define ocfs2_ioend_wq(v) (&ocfs2__ioend_wq[((unsigned long)(v)) %\
- OCFS2_IOEND_WQ_HASH_SZ])
-extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
-
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 5d18ad10c27..1edcb141f63 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -90,7 +90,6 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
* information for this bh as it's not marked locally
* uptodate. */
ret = -EIO;
- put_bh(bh);
mlog_errno(ret);
}
@@ -115,7 +114,7 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
if (bhs[i] == NULL) {
bhs[i] = sb_getblk(osb->sb, block++);
if (bhs[i] == NULL) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto bail;
}
@@ -214,7 +213,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
bhs[i] = sb_getblk(sb, block++);
if (bhs[i] == NULL) {
ocfs2_metadata_cache_io_unlock(ci);
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto bail;
}
@@ -420,7 +419,6 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
if (!buffer_uptodate(bh)) {
ret = -EIO;
- put_bh(bh);
mlog_errno(ret);
}
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
index bc8c5e7d860..1aefc0350ec 100644
--- a/fs/ocfs2/cluster/Makefile
+++ b/fs/ocfs2/cluster/Makefile
@@ -1,4 +1,4 @@
obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
- quorum.o tcp.o netdebug.o ver.o
+ quorum.o tcp.o netdebug.o
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 363f0dcc924..73039295d0d 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -35,6 +35,7 @@
#include <linux/time.h>
#include <linux/debugfs.h>
#include <linux/slab.h>
+#include <linux/bitmap.h>
#include "heartbeat.h"
#include "tcp.h"
@@ -282,15 +283,6 @@ struct o2hb_bio_wait_ctxt {
int wc_error;
};
-static int o2hb_pop_count(void *map, int count)
-{
- int i = -1, pop = 0;
-
- while ((i = find_next_bit(map, count, i + 1)) < count)
- pop++;
- return pop;
-}
-
static void o2hb_write_timeout(struct work_struct *work)
{
int failed, quorum;
@@ -307,9 +299,9 @@ static void o2hb_write_timeout(struct work_struct *work)
spin_lock_irqsave(&o2hb_live_lock, flags);
if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
- failed = o2hb_pop_count(&o2hb_failed_region_bitmap,
+ failed = bitmap_weight(o2hb_failed_region_bitmap,
O2NM_MAX_REGIONS);
- quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap,
+ quorum = bitmap_weight(o2hb_quorum_region_bitmap,
O2NM_MAX_REGIONS);
spin_unlock_irqrestore(&o2hb_live_lock, flags);
@@ -421,7 +413,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
}
/* Must put everything in 512 byte sectors for the bio... */
- bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9);
+ bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9);
bio->bi_bdev = reg->hr_bdev;
bio->bi_private = wc;
bio->bi_end_io = o2hb_bio_end_io;
@@ -765,7 +757,7 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg)
* If global heartbeat active, unpin all regions if the
* region count > CUT_OFF
*/
- if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+ if (bitmap_weight(o2hb_quorum_region_bitmap,
O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
o2hb_region_unpin(NULL);
unlock:
@@ -954,23 +946,9 @@ out:
return changed;
}
-/* This could be faster if we just implmented a find_last_bit, but I
- * don't think the circumstances warrant it. */
-static int o2hb_highest_node(unsigned long *nodes,
- int numbits)
+static int o2hb_highest_node(unsigned long *nodes, int numbits)
{
- int highest, node;
-
- highest = numbits;
- node = -1;
- while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) {
- if (node >= numbits)
- break;
-
- highest = node;
- }
-
- return highest;
+ return find_last_bit(nodes, numbits);
}
static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
@@ -1129,7 +1107,7 @@ static int o2hb_thread(void *data)
mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
- set_user_nice(current, -20);
+ set_user_nice(current, MIN_NICE);
/* Pin node */
o2nm_depend_this_node();
@@ -1829,7 +1807,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
live_threshold = O2HB_LIVE_THRESHOLD;
if (o2hb_global_heartbeat_active()) {
spin_lock(&o2hb_live_lock);
- if (o2hb_pop_count(&o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1)
+ if (bitmap_weight(o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1)
live_threshold <<= 1;
spin_unlock(&o2hb_live_lock);
}
@@ -2180,7 +2158,7 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
if (!o2hb_dependent_users)
goto unlock;
- if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+ if (bitmap_weight(o2hb_quorum_region_bitmap,
O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
o2hb_region_pin(NULL);
@@ -2480,7 +2458,7 @@ static int o2hb_region_inc_user(const char *region_uuid)
if (o2hb_dependent_users > 1)
goto unlock;
- if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
+ if (bitmap_weight(o2hb_quorum_region_bitmap,
O2NM_MAX_REGIONS) <= O2HB_PIN_CUT_OFF)
ret = o2hb_region_pin(NULL);
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index baa2b9ef7ee..2260fb9e650 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -199,7 +199,8 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
#define mlog_errno(st) do { \
int _st = (st); \
if (_st != -ERESTARTSYS && _st != -EINTR && \
- _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC) \
+ _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC && \
+ _st != -EDQUOT) \
mlog(ML_ERROR, "status = %lld\n", (long long)_st); \
} while (0)
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index bb240647ca5..441c84e169e 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -29,7 +29,6 @@
#include "heartbeat.h"
#include "masklog.h"
#include "sys.h"
-#include "ver.h"
/* for now we operate under the assertion that there can be only one
* cluster active at a time. Changing this will require trickling
@@ -945,8 +944,6 @@ static int __init init_o2nm(void)
{
int ret = -1;
- cluster_print_version();
-
ret = o2hb_init();
if (ret)
goto out;
@@ -984,6 +981,7 @@ out:
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 cluster management");
module_init(init_o2nm)
module_exit(exit_o2nm)
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index a4b07730b2e..b7f57271d49 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -41,7 +41,7 @@ static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr,
return snprintf(buf, PAGE_SIZE, "%u\n", O2NM_API_VERSION);
}
static struct kobj_attribute attr_version =
- __ATTR(interface_revision, S_IFREG | S_IRUGO, version_show, NULL);
+ __ATTR(interface_revision, S_IRUGO, version_show, NULL);
static struct attribute *o2cb_attrs[] = {
&attr_version.attr,
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 2cd2406b414..681691bc233 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -108,7 +108,7 @@ static struct rb_root o2net_handler_tree = RB_ROOT;
static struct o2net_node o2net_nodes[O2NM_MAX_NODES];
/* XXX someday we'll need better accounting */
-static struct socket *o2net_listen_sock = NULL;
+static struct socket *o2net_listen_sock;
/*
* listen work is only queued by the listening socket callbacks on the
@@ -137,7 +137,7 @@ static int o2net_sys_err_translations[O2NET_ERR_MAX] =
static void o2net_sc_connect_completed(struct work_struct *work);
static void o2net_rx_until_empty(struct work_struct *work);
static void o2net_shutdown_sc(struct work_struct *work);
-static void o2net_listen_data_ready(struct sock *sk, int bytes);
+static void o2net_listen_data_ready(struct sock *sk);
static void o2net_sc_send_keep_req(struct work_struct *work);
static void o2net_idle_timer(unsigned long data);
static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
@@ -262,17 +262,17 @@ static void o2net_update_recv_stats(struct o2net_sock_container *sc)
#endif /* CONFIG_OCFS2_FS_STATS */
-static inline int o2net_reconnect_delay(void)
+static inline unsigned int o2net_reconnect_delay(void)
{
return o2nm_single_cluster->cl_reconnect_delay_ms;
}
-static inline int o2net_keepalive_delay(void)
+static inline unsigned int o2net_keepalive_delay(void)
{
return o2nm_single_cluster->cl_keepalive_delay_ms;
}
-static inline int o2net_idle_timeout(void)
+static inline unsigned int o2net_idle_timeout(void)
{
return o2nm_single_cluster->cl_idle_timeout_ms;
}
@@ -597,9 +597,9 @@ static void o2net_set_nn_state(struct o2net_node *nn,
}
/* see o2net_register_callbacks() */
-static void o2net_data_ready(struct sock *sk, int bytes)
+static void o2net_data_ready(struct sock *sk)
{
- void (*ready)(struct sock *sk, int bytes);
+ void (*ready)(struct sock *sk);
read_lock(&sk->sk_callback_lock);
if (sk->sk_user_data) {
@@ -613,7 +613,7 @@ static void o2net_data_ready(struct sock *sk, int bytes)
}
read_unlock(&sk->sk_callback_lock);
- ready(sk, bytes);
+ ready(sk);
}
/* see o2net_register_callbacks() */
@@ -916,57 +916,30 @@ static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key)
static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len)
{
- int ret;
- mm_segment_t oldfs;
- struct kvec vec = {
- .iov_len = len,
- .iov_base = data,
- };
- struct msghdr msg = {
- .msg_iovlen = 1,
- .msg_iov = (struct iovec *)&vec,
- .msg_flags = MSG_DONTWAIT,
- };
-
- oldfs = get_fs();
- set_fs(get_ds());
- ret = sock_recvmsg(sock, &msg, len, msg.msg_flags);
- set_fs(oldfs);
-
- return ret;
+ struct kvec vec = { .iov_len = len, .iov_base = data, };
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT, };
+ return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags);
}
static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec,
size_t veclen, size_t total)
{
int ret;
- mm_segment_t oldfs;
- struct msghdr msg = {
- .msg_iov = (struct iovec *)vec,
- .msg_iovlen = veclen,
- };
+ struct msghdr msg;
if (sock == NULL) {
ret = -EINVAL;
goto out;
}
- oldfs = get_fs();
- set_fs(get_ds());
- ret = sock_sendmsg(sock, &msg, total);
- set_fs(oldfs);
- if (ret != total) {
- mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret,
- total);
- if (ret >= 0)
- ret = -EPIPE; /* should be smarter, I bet */
- goto out;
- }
-
- ret = 0;
+ ret = kernel_sendmsg(sock, &msg, vec, veclen, total);
+ if (likely(ret == total))
+ return 0;
+ mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total);
+ if (ret >= 0)
+ ret = -EPIPE; /* should be smarter, I bet */
out:
- if (ret < 0)
- mlog(0, "returning error: %d\n", ret);
+ mlog(0, "returning error: %d\n", ret);
return ret;
}
@@ -1826,7 +1799,7 @@ int o2net_register_hb_callbacks(void)
/* ------------------------------------------------------------ */
-static int o2net_accept_one(struct socket *sock)
+static int o2net_accept_one(struct socket *sock, int *more)
{
int ret, slen;
struct sockaddr_in sin;
@@ -1837,6 +1810,7 @@ static int o2net_accept_one(struct socket *sock)
struct o2net_node *nn;
BUG_ON(sock == NULL);
+ *more = 0;
ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type,
sock->sk->sk_protocol, &new_sock);
if (ret)
@@ -1848,6 +1822,7 @@ static int o2net_accept_one(struct socket *sock)
if (ret < 0)
goto out;
+ *more = 1;
new_sock->sk->sk_allocation = GFP_ATOMIC;
ret = o2net_set_nodelay(new_sock);
@@ -1946,16 +1921,41 @@ out:
return ret;
}
+/*
+ * This function is invoked in response to one or more
+ * pending accepts at softIRQ level. We must drain the
+ * entire que before returning.
+ */
+
static void o2net_accept_many(struct work_struct *work)
{
struct socket *sock = o2net_listen_sock;
- while (o2net_accept_one(sock) == 0)
+ int more;
+ int err;
+
+ /*
+ * It is critical to note that due to interrupt moderation
+ * at the network driver level, we can't assume to get a
+ * softIRQ for every single conn since tcp SYN packets
+ * can arrive back-to-back, and therefore many pending
+ * accepts may result in just 1 softIRQ. If we terminate
+ * the o2net_accept_one() loop upon seeing an err, what happens
+ * to the rest of the conns in the queue? If no new SYN
+ * arrives for hours, no softIRQ will be delivered,
+ * and the connections will just sit in the queue.
+ */
+
+ for (;;) {
+ err = o2net_accept_one(sock, &more);
+ if (!more)
+ break;
cond_resched();
+ }
}
-static void o2net_listen_data_ready(struct sock *sk, int bytes)
+static void o2net_listen_data_ready(struct sock *sk)
{
- void (*ready)(struct sock *sk, int bytes);
+ void (*ready)(struct sock *sk);
read_lock(&sk->sk_callback_lock);
ready = sk->sk_user_data;
@@ -1964,18 +1964,29 @@ static void o2net_listen_data_ready(struct sock *sk, int bytes)
goto out;
}
- /* ->sk_data_ready is also called for a newly established child socket
- * before it has been accepted and the acceptor has set up their
- * data_ready.. we only want to queue listen work for our listening
- * socket */
+ /* This callback may called twice when a new connection
+ * is being established as a child socket inherits everything
+ * from a parent LISTEN socket, including the data_ready cb of
+ * the parent. This leads to a hazard. In o2net_accept_one()
+ * we are still initializing the child socket but have not
+ * changed the inherited data_ready callback yet when
+ * data starts arriving.
+ * We avoid this hazard by checking the state.
+ * For the listening socket, the state will be TCP_LISTEN; for the new
+ * socket, will be TCP_ESTABLISHED. Also, in this case,
+ * sk->sk_user_data is not a valid function pointer.
+ */
+
if (sk->sk_state == TCP_LISTEN) {
- mlog(ML_TCP, "bytes: %d\n", bytes);
queue_work(o2net_wq, &o2net_listen_work);
+ } else {
+ ready = NULL;
}
out:
read_unlock(&sk->sk_callback_lock);
- ready(sk, bytes);
+ if (ready != NULL)
+ ready(sk);
}
static int o2net_open_listening_sock(__be32 addr, __be16 port)
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 4cbcb65784a..dc024367110 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -165,7 +165,7 @@ struct o2net_sock_container {
/* original handlers for the sockets */
void (*sc_state_change)(struct sock *sk);
- void (*sc_data_ready)(struct sock *sk, int bytes);
+ void (*sc_data_ready)(struct sock *sk);
u32 sc_msg_key;
u16 sc_msg_type;
diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c
deleted file mode 100644
index a56eee6abad..00000000000
--- a/fs/ocfs2/cluster/ver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "ver.h"
-
-#define CLUSTER_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION
-
-void cluster_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(CLUSTER_BUILD_VERSION);
diff --git a/fs/ocfs2/cluster/ver.h b/fs/ocfs2/cluster/ver.h
deleted file mode 100644
index 32554c3382c..00000000000
--- a/fs/ocfs2/cluster/ver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef O2CLUSTER_VER_H
-#define O2CLUSTER_VER_H
-
-void cluster_print_version(void);
-
-#endif /* O2CLUSTER_VER_H */
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 0d3a97d2d5f..e2e05a106be 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -37,7 +37,6 @@
#include "dlmglue.h"
#include "file.h"
#include "inode.h"
-#include "super.h"
#include "ocfs2_trace.h"
void ocfs2_dentry_attach_gen(struct dentry *dentry)
@@ -346,52 +345,6 @@ out_attach:
return ret;
}
-DEFINE_SPINLOCK(dentry_list_lock);
-
-/* We limit the number of dentry locks to drop in one go. We have
- * this limit so that we don't starve other users of ocfs2_wq. */
-#define DL_INODE_DROP_COUNT 64
-
-/* Drop inode references from dentry locks */
-static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count)
-{
- struct ocfs2_dentry_lock *dl;
-
- spin_lock(&dentry_list_lock);
- while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) {
- dl = osb->dentry_lock_list;
- osb->dentry_lock_list = dl->dl_next;
- spin_unlock(&dentry_list_lock);
- iput(dl->dl_inode);
- kfree(dl);
- spin_lock(&dentry_list_lock);
- }
- spin_unlock(&dentry_list_lock);
-}
-
-void ocfs2_drop_dl_inodes(struct work_struct *work)
-{
- struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
- dentry_lock_work);
-
- __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT);
- /*
- * Don't queue dropping if umount is in progress. We flush the
- * list in ocfs2_dismount_volume
- */
- spin_lock(&dentry_list_lock);
- if (osb->dentry_lock_list &&
- !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
- queue_work(ocfs2_wq, &osb->dentry_lock_work);
- spin_unlock(&dentry_list_lock);
-}
-
-/* Flush the whole work queue */
-void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
-{
- __ocfs2_drop_dl_inodes(osb, -1);
-}
-
/*
* ocfs2_dentry_iput() and friends.
*
@@ -416,24 +369,16 @@ void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
struct ocfs2_dentry_lock *dl)
{
+ iput(dl->dl_inode);
ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
ocfs2_lock_res_free(&dl->dl_lockres);
-
- /* We leave dropping of inode reference to ocfs2_wq as that can
- * possibly lead to inode deletion which gets tricky */
- spin_lock(&dentry_list_lock);
- if (!osb->dentry_lock_list &&
- !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
- queue_work(ocfs2_wq, &osb->dentry_lock_work);
- dl->dl_next = osb->dentry_lock_list;
- osb->dentry_lock_list = dl;
- spin_unlock(&dentry_list_lock);
+ kfree(dl);
}
void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
struct ocfs2_dentry_lock *dl)
{
- int unlock;
+ int unlock = 0;
BUG_ON(dl->dl_count == 0);
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index b79eff70995..55f58892b15 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -29,13 +29,8 @@
extern const struct dentry_operations ocfs2_dentry_ops;
struct ocfs2_dentry_lock {
- /* Use count of dentry lock */
unsigned int dl_count;
- union {
- /* Linked list of dentry locks to release */
- struct ocfs2_dentry_lock *dl_next;
- u64 dl_parent_blkno;
- };
+ u64 dl_parent_blkno;
/*
* The ocfs2_dentry_lock keeps an inode reference until
@@ -49,14 +44,9 @@ struct ocfs2_dentry_lock {
int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
u64 parent_blkno);
-extern spinlock_t dentry_list_lock;
-
void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
struct ocfs2_dentry_lock *dl);
-void ocfs2_drop_dl_inodes(struct work_struct *work);
-void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb);
-
struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
int skip_unhashed);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 30544ce8e9f..0717662b4ae 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -2349,7 +2349,7 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
dx_root_bh = sb_getblk(osb->sb, dr_blkno);
if (dx_root_bh == NULL) {
- ret = -EIO;
+ ret = -ENOMEM;
goto out;
}
ocfs2_set_new_buffer_uptodate(INODE_CACHE(dir), dx_root_bh);
@@ -2422,7 +2422,7 @@ static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
for (i = 0; i < num_dx_leaves; i++) {
bh = sb_getblk(osb->sb, start_blk + i);
if (bh == NULL) {
- ret = -EIO;
+ ret = -ENOMEM;
goto out;
}
dx_leaves[i] = bh;
@@ -2929,7 +2929,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off);
dirdata_bh = sb_getblk(sb, blkno);
if (!dirdata_bh) {
- ret = -EIO;
+ ret = -ENOMEM;
mlog_errno(ret);
goto out_commit;
}
@@ -2957,6 +2957,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
ocfs2_init_dir_trailer(dir, dirdata_bh, i);
}
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
ocfs2_journal_dirty(handle, dirdata_bh);
if (ocfs2_supports_indexed_dirs(osb) && !dx_inline) {
@@ -3005,6 +3006,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
di->i_size = cpu_to_le64(sb->s_blocksize);
di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec);
di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
/*
* This should never fail as our extent list is empty and all
@@ -3159,7 +3161,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
*new_bh = sb_getblk(sb, p_blkno);
if (!*new_bh) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto bail;
}
@@ -3284,7 +3286,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
if (ocfs2_dir_resv_allowed(osb))
data_ac->ac_resv = &OCFS2_I(dir)->ip_la_data_resv;
- credits = ocfs2_calc_extend_credits(sb, el, 1);
+ credits = ocfs2_calc_extend_credits(sb, el);
} else {
spin_unlock(&OCFS2_I(dir)->ip_lock);
credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS;
@@ -3338,6 +3340,7 @@ do_extend:
} else {
de->rec_len = cpu_to_le16(sb->s_blocksize);
}
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
ocfs2_journal_dirty(handle, new_bh);
dir_i_size += dir->i_sb->s_blocksize;
@@ -3716,7 +3719,7 @@ static int ocfs2_dx_dir_rebalance_credits(struct ocfs2_super *osb,
{
int credits = ocfs2_clusters_to_blocks(osb->sb, 2);
- credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list, 1);
+ credits += ocfs2_calc_extend_credits(osb->sb, &dx_root->dr_list);
credits += ocfs2_quota_trans_credits(osb->sb);
return credits;
}
@@ -3896,6 +3899,7 @@ out_commit:
dquot_free_space_nodirty(dir,
ocfs2_clusters_to_bytes(dir->i_sb, 1));
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
ocfs2_commit_trans(osb, handle);
out:
@@ -4134,6 +4138,7 @@ static int ocfs2_expand_inline_dx_root(struct inode *dir,
mlog_errno(ret);
did_quota = 0;
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
ocfs2_journal_dirty(handle, dx_root_bh);
out_commit:
@@ -4401,6 +4406,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
di->i_dyn_features = cpu_to_le16(OCFS2_I(dir)->ip_dyn_features);
spin_unlock(&OCFS2_I(dir)->ip_lock);
di->i_dx_root = cpu_to_le64(0ULL);
+ ocfs2_update_inode_fsync_trans(handle, dir, 1);
ocfs2_journal_dirty(handle, di_bh);
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index c8a044efbb1..bd1aab1f49a 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -3,5 +3,5 @@ ccflags-y := -Ifs/ocfs2
obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
- dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
+ dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index e0517762fcc..fae17c640df 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -108,7 +108,6 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len)
struct dlm_recovery_ctxt
{
struct list_head resources;
- struct list_head received;
struct list_head node_data;
u8 new_master;
u8 dead_node;
@@ -332,6 +331,7 @@ struct dlm_lock_resource
u16 state;
char lvb[DLM_LVB_LEN];
unsigned int inflight_locks;
+ unsigned int inflight_assert_workers;
unsigned long refmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
};
@@ -911,6 +911,9 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res);
+void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
+
void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index e33cd7a3c58..18f13c2e4a1 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -338,7 +338,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle)
#ifdef CONFIG_DEBUG_FS
-static struct dentry *dlm_debugfs_root = NULL;
+static struct dentry *dlm_debugfs_root;
#define DLM_DEBUGFS_DIR "o2dlm"
#define DLM_DEBUGFS_DLM_STATE "dlm_state"
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 8b3382abf84..39efc5057a3 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -43,8 +43,6 @@
#include "dlmdomain.h"
#include "dlmdebug.h"
-#include "dlmver.h"
-
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
#include "cluster/masklog.h"
@@ -961,6 +959,14 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
* domain. Set him in the map and clean up our
* leftover join state. */
BUG_ON(dlm->joining_node != assert->node_idx);
+
+ if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
+ mlog(0, "dlm recovery is ongoing, disallow join\n");
+ spin_unlock(&dlm->spinlock);
+ spin_unlock(&dlm_domain_lock);
+ return -EAGAIN;
+ }
+
set_bit(assert->node_idx, dlm->domain_map);
clear_bit(assert->node_idx, dlm->exit_domain_map);
__dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
@@ -1125,7 +1131,6 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
struct dlm_ctxt *dlm = NULL;
char *local = NULL;
int status = 0;
- int locked = 0;
qr = (struct dlm_query_region *) msg->buf;
@@ -1134,10 +1139,8 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
/* buffer used in dlm_mast_regions() */
local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
- if (!local) {
- status = -ENOMEM;
- goto bail;
- }
+ if (!local)
+ return -ENOMEM;
status = -EINVAL;
@@ -1146,16 +1149,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
if (!dlm) {
mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
"before join domain\n", qr->qr_node, qr->qr_domain);
- goto bail;
+ goto out_domain_lock;
}
spin_lock(&dlm->spinlock);
- locked = 1;
if (dlm->joining_node != qr->qr_node) {
mlog(ML_ERROR, "Node %d queried hb regions on domain %s "
"but joining node is %d\n", qr->qr_node, qr->qr_domain,
dlm->joining_node);
- goto bail;
+ goto out_dlm_lock;
}
/* Support for global heartbeat was added in 1.1 */
@@ -1165,14 +1167,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
"but active dlm protocol is %d.%d\n", qr->qr_node,
qr->qr_domain, dlm->dlm_locking_proto.pv_major,
dlm->dlm_locking_proto.pv_minor);
- goto bail;
+ goto out_dlm_lock;
}
status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions));
-bail:
- if (locked)
- spin_unlock(&dlm->spinlock);
+out_dlm_lock:
+ spin_unlock(&dlm->spinlock);
+
+out_domain_lock:
spin_unlock(&dlm_domain_lock);
kfree(local);
@@ -1522,6 +1525,7 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
unsigned int node)
{
int status;
+ int ret;
struct dlm_assert_joined assert_msg;
mlog(0, "Sending join assert to node %u\n", node);
@@ -1533,11 +1537,13 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm,
status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
&assert_msg, sizeof(assert_msg), node,
- NULL);
+ &ret);
if (status < 0)
mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
"node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY,
node);
+ else
+ status = ret;
return status;
}
@@ -1879,19 +1885,19 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
goto bail;
}
- status = dlm_debug_init(dlm);
+ status = dlm_launch_thread(dlm);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = dlm_launch_thread(dlm);
+ status = dlm_launch_recovery_thread(dlm);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = dlm_launch_recovery_thread(dlm);
+ status = dlm_debug_init(dlm);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -2028,7 +2034,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
INIT_LIST_HEAD(&dlm->list);
INIT_LIST_HEAD(&dlm->dirty_list);
INIT_LIST_HEAD(&dlm->reco.resources);
- INIT_LIST_HEAD(&dlm->reco.received);
INIT_LIST_HEAD(&dlm->reco.node_data);
INIT_LIST_HEAD(&dlm->purge_list);
INIT_LIST_HEAD(&dlm->dlm_domain_handlers);
@@ -2328,8 +2333,6 @@ static int __init dlm_init(void)
{
int status;
- dlm_print_version();
-
status = dlm_init_mle_cache();
if (status) {
mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n");
@@ -2379,6 +2382,7 @@ static void __exit dlm_exit (void)
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 Distributed Lock Management");
module_init(dlm_init);
module_exit(dlm_exit);
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 5d32f7511f7..66c2a491f68 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -52,7 +52,7 @@
#define MLOG_MASK_PREFIX ML_DLM
#include "cluster/masklog.h"
-static struct kmem_cache *dlm_lock_cache = NULL;
+static struct kmem_cache *dlm_lock_cache;
static DEFINE_SPINLOCK(dlm_cookie_lock);
static u64 dlm_next_cookie = 1;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index cf0f103963b..82abf0cc9a1 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -82,9 +82,9 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
return 1;
}
-static struct kmem_cache *dlm_lockres_cache = NULL;
-static struct kmem_cache *dlm_lockname_cache = NULL;
-static struct kmem_cache *dlm_mle_cache = NULL;
+static struct kmem_cache *dlm_lockres_cache;
+static struct kmem_cache *dlm_lockname_cache;
+static struct kmem_cache *dlm_mle_cache;
static void dlm_mle_release(struct kref *kref);
static void dlm_init_mle(struct dlm_master_list_entry *mle,
@@ -472,11 +472,15 @@ bail:
void dlm_destroy_master_caches(void)
{
- if (dlm_lockname_cache)
+ if (dlm_lockname_cache) {
kmem_cache_destroy(dlm_lockname_cache);
+ dlm_lockname_cache = NULL;
+ }
- if (dlm_lockres_cache)
+ if (dlm_lockres_cache) {
kmem_cache_destroy(dlm_lockres_cache);
+ dlm_lockres_cache = NULL;
+ }
}
static void dlm_lockres_release(struct kref *kref)
@@ -577,6 +581,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
atomic_set(&res->asts_reserved, 0);
res->migration_pending = 0;
res->inflight_locks = 0;
+ res->inflight_assert_workers = 0;
res->dlm = dlm;
@@ -679,6 +684,43 @@ void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
wake_up(&res->wq);
}
+void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
+ res->inflight_assert_workers++;
+ mlog(0, "%s:%.*s: inflight assert worker++: now %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->inflight_assert_workers);
+}
+
+static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ spin_lock(&res->spinlock);
+ __dlm_lockres_grab_inflight_worker(dlm, res);
+ spin_unlock(&res->spinlock);
+}
+
+static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&res->spinlock);
+ BUG_ON(res->inflight_assert_workers == 0);
+ res->inflight_assert_workers--;
+ mlog(0, "%s:%.*s: inflight assert worker--: now %u\n",
+ dlm->name, res->lockname.len, res->lockname.name,
+ res->inflight_assert_workers);
+}
+
+static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ spin_lock(&res->spinlock);
+ __dlm_lockres_drop_inflight_worker(dlm, res);
+ spin_unlock(&res->spinlock);
+}
+
/*
* lookup a lock resource by name.
* may already exist in the hashtable.
@@ -1599,7 +1641,8 @@ send_response:
mlog(ML_ERROR, "failed to dispatch assert master work\n");
response = DLM_MASTER_RESP_ERROR;
dlm_lockres_put(res);
- }
+ } else
+ dlm_lockres_grab_inflight_worker(dlm, res);
} else {
if (res)
dlm_lockres_put(res);
@@ -1885,8 +1928,10 @@ ok:
* up nodes that this node contacted */
while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
nn+1)) < O2NM_MAX_NODES) {
- if (nn != dlm->node_num && nn != assert->node_idx)
+ if (nn != dlm->node_num && nn != assert->node_idx) {
master_request = 1;
+ break;
+ }
}
}
mle->master = assert->node_idx;
@@ -2112,6 +2157,8 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
dlm_lockres_release_ast(dlm, res);
put:
+ dlm_lockres_drop_inflight_worker(dlm, res);
+
dlm_lockres_put(res);
mlog(0, "finished with dlm_assert_master_worker\n");
@@ -2354,6 +2401,10 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
assert_spin_locked(&res->spinlock);
+ /* delay migration when the lockres is in MIGRATING state */
+ if (res->state & DLM_LOCK_RES_MIGRATING)
+ return 0;
+
if (res->owner != dlm->node_num)
return 0;
@@ -3078,11 +3129,15 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
/* remove it so that only one mle will be found */
__dlm_unlink_mle(dlm, tmp);
__dlm_mle_detach_hb_events(dlm, tmp);
- ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
- mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
- "telling master to get ref for cleared out mle "
- "during migration\n", dlm->name, namelen, name,
- master, new_master);
+ if (tmp->type == DLM_MLE_MASTER) {
+ ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
+ mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
+ "telling master to get ref "
+ "for cleared out mle during "
+ "migration\n", dlm->name,
+ namelen, name, master,
+ new_master);
+ }
}
spin_unlock(&tmp->spinlock);
}
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 0b5adca1b17..45067faf569 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -537,7 +537,10 @@ master_here:
/* success! see if any other nodes need recovery */
mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
dlm->name, dlm->reco.dead_node, dlm->node_num);
- dlm_reset_recovery(dlm);
+ spin_lock(&dlm->spinlock);
+ __dlm_reset_recovery(dlm);
+ dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+ spin_unlock(&dlm->spinlock);
}
dlm_end_recovery(dlm);
@@ -695,6 +698,14 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
if (all_nodes_done) {
int ret;
+ /* Set this flag on recovery master to avoid
+ * a new recovery for another dead node start
+ * before the recovery is not done. That may
+ * cause recovery hung.*/
+ spin_lock(&dlm->spinlock);
+ dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
+ spin_unlock(&dlm->spinlock);
+
/* all nodes are now in DLM_RECO_NODE_DATA_DONE state
* just send a finalize message to everyone and
* clean up */
@@ -1697,7 +1708,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
mlog_errno(-ENOMEM);
/* retry!? */
BUG();
- }
+ } else
+ __dlm_lockres_grab_inflight_worker(dlm, res);
} else /* put.. incase we are not the master */
dlm_lockres_put(res);
spin_unlock(&res->spinlock);
@@ -1750,13 +1762,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
struct dlm_migratable_lockres *mres)
{
struct dlm_migratable_lock *ml;
- struct list_head *queue;
+ struct list_head *queue, *iter;
struct list_head *tmpq = NULL;
struct dlm_lock *newlock = NULL;
struct dlm_lockstatus *lksb = NULL;
int ret = 0;
int i, j, bad;
- struct dlm_lock *lock = NULL;
+ struct dlm_lock *lock;
u8 from = O2NM_MAX_NODES;
unsigned int added = 0;
__be64 c;
@@ -1791,14 +1803,16 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
/* MIGRATION ONLY! */
BUG_ON(!(mres->flags & DLM_MRES_MIGRATION));
+ lock = NULL;
spin_lock(&res->spinlock);
for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
tmpq = dlm_list_idx_to_ptr(res, j);
- list_for_each_entry(lock, tmpq, list) {
- if (lock->ml.cookie != ml->cookie)
- lock = NULL;
- else
+ list_for_each(iter, tmpq) {
+ lock = list_entry(iter,
+ struct dlm_lock, list);
+ if (lock->ml.cookie == ml->cookie)
break;
+ lock = NULL;
}
if (lock)
break;
@@ -1886,6 +1900,13 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
if (ml->type == LKM_NLMODE)
goto skip_lvb;
+ /*
+ * If the lock is in the blocked list it can't have a valid lvb,
+ * so skip it
+ */
+ if (ml->list == DLM_BLOCKED_LIST)
+ goto skip_lvb;
+
if (!dlm_lvb_is_empty(mres->lvb)) {
if (lksb->flags & DLM_LKSB_PUT_LVB) {
/* other node was trying to update
@@ -1966,7 +1987,15 @@ skip_lvb:
}
if (!bad) {
dlm_lock_get(newlock);
- list_add_tail(&newlock->list, queue);
+ if (mres->flags & DLM_MRES_RECOVERY &&
+ ml->list == DLM_CONVERTING_LIST &&
+ newlock->ml.type >
+ newlock->ml.convert_type) {
+ /* newlock is doing downconvert, add it to the
+ * head of converting list */
+ list_add(&newlock->list, queue);
+ } else
+ list_add_tail(&newlock->list, queue);
mlog(0, "%s:%.*s: added lock for node %u, "
"setting refmap bit\n", dlm->name,
res->lockname.len, res->lockname.name, ml->node);
@@ -2875,8 +2904,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
BUG();
}
dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+ __dlm_reset_recovery(dlm);
spin_unlock(&dlm->spinlock);
- dlm_reset_recovery(dlm);
dlm_kick_recovery_thread(dlm);
break;
default:
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 9db869de829..69aac6f088a 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -259,12 +259,15 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
* refs on it. */
unused = __dlm_lockres_unused(lockres);
if (!unused ||
- (lockres->state & DLM_LOCK_RES_MIGRATING)) {
+ (lockres->state & DLM_LOCK_RES_MIGRATING) ||
+ (lockres->inflight_assert_workers != 0)) {
mlog(0, "%s: res %.*s is in use or being remastered, "
- "used %d, state %d\n", dlm->name,
- lockres->lockname.len, lockres->lockname.name,
- !unused, lockres->state);
- list_move_tail(&dlm->purge_list, &lockres->purge);
+ "used %d, state %d, assert master workers %u\n",
+ dlm->name, lockres->lockname.len,
+ lockres->lockname.name,
+ !unused, lockres->state,
+ lockres->inflight_assert_workers);
+ list_move_tail(&lockres->purge, &dlm->purge_list);
spin_unlock(&lockres->spinlock);
continue;
}
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 5698b52cf5c..2e3c9dbab68 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -191,7 +191,9 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
DLM_UNLOCK_CLEAR_CONVERT_TYPE);
} else if (status == DLM_RECOVERING ||
status == DLM_MIGRATING ||
- status == DLM_FORWARD) {
+ status == DLM_FORWARD ||
+ status == DLM_NOLOCKMGR
+ ) {
/* must clear the actions because this unlock
* is about to be retried. cannot free or do
* any list manipulation. */
@@ -200,7 +202,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
res->lockname.name,
status==DLM_RECOVERING?"recovering":
(status==DLM_MIGRATING?"migrating":
- "forward"));
+ (status == DLM_FORWARD ? "forward" :
+ "nolockmanager")));
actions = 0;
}
if (flags & LKM_CANCEL)
@@ -364,7 +367,10 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
* updated state to the recovery master. this thread
* just needs to finish out the operation and call
* the unlockast. */
- ret = DLM_NORMAL;
+ if (dlm_is_node_dead(dlm, owner))
+ ret = DLM_NORMAL;
+ else
+ ret = DLM_NOLOCKMGR;
} else {
/* something bad. this will BUG in ocfs2 */
ret = dlm_err_to_dlm_status(tmpret);
@@ -638,7 +644,9 @@ retry:
if (status == DLM_RECOVERING ||
status == DLM_MIGRATING ||
- status == DLM_FORWARD) {
+ status == DLM_FORWARD ||
+ status == DLM_NOLOCKMGR) {
+
/* We want to go away for a tiny bit to allow recovery
* / migration to complete on this resource. I don't
* know of any wait queue we could sleep on as this
@@ -650,7 +658,7 @@ retry:
msleep(50);
mlog(0, "retrying unlock due to pending recovery/"
- "migration/in-progress\n");
+ "migration/in-progress/reconnect\n");
goto retry;
}
diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c
deleted file mode 100644
index dfc0da4d158..00000000000
--- a/fs/ocfs2/dlm/dlmver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "dlmver.h"
-
-#define DLM_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION
-
-void dlm_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlm/dlmver.h b/fs/ocfs2/dlm/dlmver.h
deleted file mode 100644
index f674aee77a1..00000000000
--- a/fs/ocfs2/dlm/dlmver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmfsver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef DLM_VER_H
-#define DLM_VER_H
-
-void dlm_print_version(void);
-
-#endif /* DLM_VER_H */
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
index f14be89a670..eed3db8c5b4 100644
--- a/fs/ocfs2/dlmfs/Makefile
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -2,4 +2,4 @@ ccflags-y := -Ifs/ocfs2
obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
-ocfs2_dlmfs-objs := userdlm.o dlmfs.o dlmfsver.o
+ocfs2_dlmfs-objs := userdlm.o dlmfs.o
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index efa2b3d339e..09b7d9dac71 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -49,7 +49,6 @@
#include "stackglue.h"
#include "userdlm.h"
-#include "dlmfsver.h"
#define MLOG_MASK_PREFIX ML_DLMFS
#include "cluster/masklog.h"
@@ -644,8 +643,6 @@ static int __init init_dlmfs_fs(void)
int status;
int cleanup_inode = 0, cleanup_worker = 0;
- dlmfs_print_version();
-
status = bdi_init(&dlmfs_backing_dev_info);
if (status)
return status;
@@ -701,6 +698,7 @@ static void __exit exit_dlmfs_fs(void)
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 DLM-Filesystem");
module_init(init_dlmfs_fs)
module_exit(exit_dlmfs_fs)
diff --git a/fs/ocfs2/dlmfs/dlmfsver.c b/fs/ocfs2/dlmfs/dlmfsver.c
deleted file mode 100644
index a733b3321f8..00000000000
--- a/fs/ocfs2/dlmfs/dlmfsver.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmfsver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include "dlmfsver.h"
-
-#define DLM_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION
-
-void dlmfs_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(DLM_BUILD_VERSION);
diff --git a/fs/ocfs2/dlmfs/dlmfsver.h b/fs/ocfs2/dlmfs/dlmfsver.h
deleted file mode 100644
index f35eadbed25..00000000000
--- a/fs/ocfs2/dlmfs/dlmfsver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef DLMFS_VER_H
-#define DLMFS_VER_H
-
-void dlmfs_print_version(void);
-
-#endif /* DLMFS_VER_H */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 3a44a648dae..52cfe99ae05 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1304,7 +1304,7 @@ static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
{
wait_for_completion(&mw->mw_complete);
/* Re-arm the completion in case we want to wait on it again */
- INIT_COMPLETION(mw->mw_complete);
+ reinit_completion(&mw->mw_complete);
return mw->mw_status;
}
@@ -1355,7 +1355,7 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
else
ret = mw->mw_status;
/* Re-arm the completion in case we want to wait on it again */
- INIT_COMPLETION(mw->mw_complete);
+ reinit_completion(&mw->mw_complete);
return ret;
}
@@ -2544,11 +2544,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
* refreshed, so we do it here. Of course, making sense of
* everything is up to the caller :) */
status = ocfs2_should_refresh_lock_res(lockres);
- if (status < 0) {
- ocfs2_cluster_unlock(osb, lockres, level);
- mlog_errno(status);
- goto bail;
- }
if (status) {
status = ocfs2_refresh_slot_info(osb);
@@ -2996,6 +2991,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
/* for now, uuid == domain */
status = ocfs2_cluster_connect(osb->osb_cluster_stack,
+ osb->osb_cluster_name,
+ strlen(osb->osb_cluster_name),
osb->uuid_str,
strlen(osb->uuid_str),
&lproto, ocfs2_do_node_down, osb,
@@ -3005,7 +3002,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
goto bail;
}
- status = ocfs2_cluster_this_node(&osb->node_num);
+ status = ocfs2_cluster_this_node(conn, &osb->node_num);
if (status < 0) {
mlog_errno(status);
mlog(ML_ERROR,
@@ -3142,22 +3139,60 @@ out:
return 0;
}
+static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres);
+
/* Mark the lockres as being dropped. It will no longer be
* queued if blocking, but we still may have to wait on it
* being dequeued from the downconvert thread before we can consider
* it safe to drop.
*
* You can *not* attempt to call cluster_lock on this lockres anymore. */
-void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
+void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres)
{
int status;
struct ocfs2_mask_waiter mw;
- unsigned long flags;
+ unsigned long flags, flags2;
ocfs2_init_mask_waiter(&mw);
spin_lock_irqsave(&lockres->l_lock, flags);
lockres->l_flags |= OCFS2_LOCK_FREEING;
+ if (lockres->l_flags & OCFS2_LOCK_QUEUED && current == osb->dc_task) {
+ /*
+ * We know the downconvert is queued but not in progress
+ * because we are the downconvert thread and processing
+ * different lock. So we can just remove the lock from the
+ * queue. This is not only an optimization but also a way
+ * to avoid the following deadlock:
+ * ocfs2_dentry_post_unlock()
+ * ocfs2_dentry_lock_put()
+ * ocfs2_drop_dentry_lock()
+ * iput()
+ * ocfs2_evict_inode()
+ * ocfs2_clear_inode()
+ * ocfs2_mark_lockres_freeing()
+ * ... blocks waiting for OCFS2_LOCK_QUEUED
+ * since we are the downconvert thread which
+ * should clear the flag.
+ */
+ spin_unlock_irqrestore(&lockres->l_lock, flags);
+ spin_lock_irqsave(&osb->dc_task_lock, flags2);
+ list_del_init(&lockres->l_blocked_list);
+ osb->blocked_lock_count--;
+ spin_unlock_irqrestore(&osb->dc_task_lock, flags2);
+ /*
+ * Warn if we recurse into another post_unlock call. Strictly
+ * speaking it isn't a problem but we need to be careful if
+ * that happens (stack overflow, deadlocks, ...) so warn if
+ * ocfs2 grows a path for which this can happen.
+ */
+ WARN_ON_ONCE(lockres->l_ops->post_unlock);
+ /* Since the lock is freeing we don't do much in the fn below */
+ ocfs2_process_blocked_lock(osb, lockres);
+ return;
+ }
while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -3178,7 +3213,7 @@ void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
{
int ret;
- ocfs2_mark_lockres_freeing(lockres);
+ ocfs2_mark_lockres_freeing(osb, lockres);
ret = ocfs2_drop_lock(osb, lockres);
if (ret)
mlog_errno(ret);
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 1d596d8c4a4..d293a22c32c 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -157,7 +157,8 @@ int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex);
void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex);
-void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
+void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
+ struct ocfs2_lock_res *lockres);
void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index d71903c6068..2930e231f3f 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -175,9 +175,13 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
int datasync)
{
int err = 0;
- journal_t *journal;
struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ journal_t *journal = osb->journal->j_journal;
+ int ret;
+ tid_t commit_tid;
+ bool needs_barrier = false;
trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
OCFS2_I(inode)->ip_blkno,
@@ -185,33 +189,26 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
file->f_path.dentry->d_name.name,
(unsigned long long)datasync);
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
+
err = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (err)
return err;
- /*
- * Probably don't need the i_mutex at all in here, just putting it here
- * to be consistent with how fsync used to be called, someone more
- * familiar with the fs could possibly remove it.
- */
- mutex_lock(&inode->i_mutex);
- if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
- /*
- * We still have to flush drive's caches to get data to the
- * platter
- */
- if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER)
- blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
- goto bail;
+ commit_tid = datasync ? oi->i_datasync_tid : oi->i_sync_tid;
+ if (journal->j_flags & JBD2_BARRIER &&
+ !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+ needs_barrier = true;
+ err = jbd2_complete_transaction(journal, commit_tid);
+ if (needs_barrier) {
+ ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ if (!err)
+ err = ret;
}
- journal = osb->journal->j_journal;
- err = jbd2_journal_force_commit(journal);
-
-bail:
if (err)
mlog_errno(err);
- mutex_unlock(&inode->i_mutex);
return (err < 0) ? -EIO : 0;
}
@@ -289,6 +286,7 @@ int ocfs2_update_inode_atime(struct inode *inode,
inode->i_atime = CURRENT_TIME;
di->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, bh);
out_commit:
@@ -338,6 +336,7 @@ int ocfs2_simple_size_update(struct inode *inode,
if (ret < 0)
mlog_errno(ret);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_commit_trans(osb, handle);
out:
return ret;
@@ -432,6 +431,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
di->i_size = cpu_to_le64(new_i_size);
di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, fe_bh);
@@ -474,11 +474,6 @@ static int ocfs2_truncate_file(struct inode *inode,
goto bail;
}
- /* lets handle the simple truncate cases before doing any more
- * cluster locking. */
- if (new_i_size == le64_to_cpu(fe->i_size))
- goto bail;
-
down_write(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_resv_discard(&osb->osb_la_resmap,
@@ -580,7 +575,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
int did_quota = 0;
/*
- * This function only exists for file systems which don't
+ * Unwritten extent only exists for file systems which
* support holes.
*/
BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
@@ -603,8 +598,7 @@ restart_all:
goto leave;
}
- credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
- clusters_to_add);
+ credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list);
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
@@ -653,7 +647,7 @@ restarted_transaction:
mlog_errno(status);
goto leave;
}
-
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, bh);
spin_lock(&OCFS2_I(inode)->ip_lock);
@@ -719,7 +713,8 @@ leave:
* While a write will already be ordering the data, a truncate will not.
* Thus, we need to explicitly order the zeroed pages.
*/
-static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
+static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
+ struct buffer_head *di_bh)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
handle_t *handle = NULL;
@@ -736,8 +731,16 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
}
ret = ocfs2_jbd2_file_inode(handle, inode);
- if (ret < 0)
+ if (ret < 0) {
mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret)
+ mlog_errno(ret);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
out:
if (ret) {
@@ -752,7 +755,7 @@ out:
* to be too fragile to do exactly what we need without us having to
* worry about recursive locking in ->write_begin() and ->write_end(). */
static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
- u64 abs_to)
+ u64 abs_to, struct buffer_head *di_bh)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
@@ -760,6 +763,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
handle_t *handle = NULL;
int ret = 0;
unsigned zero_from, zero_to, block_start, block_end;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
BUG_ON(abs_from >= abs_to);
BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
@@ -802,7 +806,8 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
}
if (!handle) {
- handle = ocfs2_zero_start_ordered_transaction(inode);
+ handle = ocfs2_zero_start_ordered_transaction(inode,
+ di_bh);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
handle = NULL;
@@ -819,8 +824,23 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
ret = 0;
}
- if (handle)
+ if (handle) {
+ /*
+ * fs-writeback will release the dirty pages without page lock
+ * whose offset are over inode size, the release happens at
+ * block_write_full_page().
+ */
+ i_size_write(inode, abs_to);
+ inode->i_blocks = ocfs2_inode_sector_count(inode);
+ di->i_size = cpu_to_le64((u64)i_size_read(inode));
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ di->i_mtime_nsec = di->i_ctime_nsec;
+ ocfs2_journal_dirty(handle, di_bh);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+ }
out_unlock:
unlock_page(page);
@@ -916,7 +936,7 @@ out:
* has made sure that the entire range needs zeroing.
*/
static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
- u64 range_end)
+ u64 range_end, struct buffer_head *di_bh)
{
int rc = 0;
u64 next_pos;
@@ -932,7 +952,7 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
if (next_pos > range_end)
next_pos = range_end;
- rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
+ rc = ocfs2_write_zero_page(inode, zero_pos, next_pos, di_bh);
if (rc < 0) {
mlog_errno(rc);
break;
@@ -978,7 +998,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
range_end = zero_to_size;
ret = ocfs2_zero_extend_range(inode, range_start,
- range_end);
+ range_end, di_bh);
if (ret) {
mlog_errno(ret);
break;
@@ -1146,14 +1166,14 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
goto bail_unlock_rw;
}
- if (size_change && attr->ia_size != i_size_read(inode)) {
+ if (size_change) {
status = inode_newsize_ok(inode, attr->ia_size);
if (status)
goto bail_unlock;
inode_dio_wait(inode);
- if (i_size_read(inode) > attr->ia_size) {
+ if (i_size_read(inode) >= attr->ia_size) {
if (ocfs2_should_order_data(inode)) {
status = ocfs2_begin_ordered_truncate(inode,
attr->ia_size);
@@ -1237,7 +1257,7 @@ bail:
dqput(transfer_to[qtype]);
if (!status && attr->ia_valid & ATTR_MODE) {
- status = ocfs2_acl_chmod(inode);
+ status = posix_acl_chmod(inode, inode->i_mode);
if (status < 0)
mlog_errno(status);
}
@@ -1323,6 +1343,7 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
di = (struct ocfs2_dinode *) bh->b_data;
di->i_mode = cpu_to_le16(inode->i_mode);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, bh);
@@ -1555,6 +1576,7 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
if (ret)
mlog_errno(ret);
}
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_commit_trans(osb, handle);
out:
@@ -1870,7 +1892,8 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
}
size = sr->l_start + sr->l_len;
- if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
+ if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64 ||
+ cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) {
if (sr->l_len <= 0) {
ret = -EINVAL;
goto out_inode_unlock;
@@ -2039,13 +2062,6 @@ out:
return ret;
}
-static void ocfs2_aiodio_wait(struct inode *inode)
-{
- wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
-
- wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
-}
-
static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
{
int blockmask = inode->i_sb->s_blocksize - 1;
@@ -2217,16 +2233,13 @@ out:
return ret;
}
-static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos)
+static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
{
int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
int can_do_direct, has_refcount = 0;
ssize_t written = 0;
- size_t ocount; /* original count */
- size_t count; /* after file limit checks */
+ size_t count = iov_iter_count(from);
loff_t old_size, *ppos = &iocb->ki_pos;
u32 old_clusters;
struct file *file = iocb->ki_filp;
@@ -2240,7 +2253,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
file->f_path.dentry->d_name.len,
file->f_path.dentry->d_name.name,
- (unsigned int)nr_segs);
+ (unsigned int)from->nr_segs); /* GRRRRR */
if (iocb->ki_nbytes == 0)
return 0;
@@ -2323,10 +2336,8 @@ relock:
* Wait on previous unaligned aio to complete before
* proceeding.
*/
- ocfs2_aiodio_wait(inode);
-
- /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
- atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
+ mutex_lock(&OCFS2_I(inode)->ip_unaligned_aio);
+ /* Mark the iocb as needing an unlock in ocfs2_dio_end_io */
ocfs2_iocb_set_unaligned_aio(iocb);
}
@@ -2340,28 +2351,23 @@ relock:
/* communicate with ocfs2_dio_end_io */
ocfs2_iocb_set_rw_locked(iocb, rw_level);
- ret = generic_segment_checks(iov, &nr_segs, &ocount,
- VERIFY_READ);
- if (ret)
- goto out_dio;
-
- count = ocount;
ret = generic_write_checks(file, ppos, &count,
S_ISBLK(inode->i_mode));
if (ret)
goto out_dio;
+ iov_iter_truncate(from, count);
if (direct_io) {
- written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
- ppos, count, ocount);
+ written = generic_file_direct_write(iocb, from, *ppos);
if (written < 0) {
ret = written;
goto out_dio;
}
} else {
current->backing_dev_info = file->f_mapping->backing_dev_info;
- written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos,
- ppos, count, 0);
+ written = generic_perform_write(file, from, *ppos);
+ if (likely(written >= 0))
+ iocb->ki_pos = *ppos + written;
current->backing_dev_info = NULL;
}
@@ -2371,8 +2377,8 @@ out_dio:
if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) ||
((file->f_flags & O_DIRECT) && !direct_io)) {
- ret = filemap_fdatawrite_range(file->f_mapping, pos,
- pos + count - 1);
+ ret = filemap_fdatawrite_range(file->f_mapping, *ppos,
+ *ppos + count - 1);
if (ret < 0)
written = ret;
@@ -2385,8 +2391,8 @@ out_dio:
}
if (!ret)
- ret = filemap_fdatawait_range(file->f_mapping, pos,
- pos + count - 1);
+ ret = filemap_fdatawait_range(file->f_mapping, *ppos,
+ *ppos + count - 1);
}
/*
@@ -2406,7 +2412,7 @@ out_dio:
if (unaligned_dio) {
ocfs2_iocb_clear_unaligned_aio(iocb);
- atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
+ mutex_unlock(&OCFS2_I(inode)->ip_unaligned_aio);
}
out:
@@ -2424,84 +2430,6 @@ out_sems:
return ret;
}
-static int ocfs2_splice_to_file(struct pipe_inode_info *pipe,
- struct file *out,
- struct splice_desc *sd)
-{
- int ret;
-
- ret = ocfs2_prepare_inode_for_write(out, &sd->pos,
- sd->total_len, 0, NULL, NULL);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
-
- return splice_from_pipe_feed(pipe, sd, pipe_to_file);
-}
-
-static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
- struct file *out,
- loff_t *ppos,
- size_t len,
- unsigned int flags)
-{
- int ret;
- struct address_space *mapping = out->f_mapping;
- struct inode *inode = mapping->host;
- struct splice_desc sd = {
- .total_len = len,
- .flags = flags,
- .pos = *ppos,
- .u.file = out,
- };
-
-
- trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry,
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- out->f_path.dentry->d_name.len,
- out->f_path.dentry->d_name.name, len);
-
- pipe_lock(pipe);
-
- splice_from_pipe_begin(&sd);
- do {
- ret = splice_from_pipe_next(pipe, &sd);
- if (ret <= 0)
- break;
-
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
- ret = ocfs2_rw_lock(inode, 1);
- if (ret < 0)
- mlog_errno(ret);
- else {
- ret = ocfs2_splice_to_file(pipe, out, &sd);
- ocfs2_rw_unlock(inode, 1);
- }
- mutex_unlock(&inode->i_mutex);
- } while (ret > 0);
- splice_from_pipe_end(pipe, &sd);
-
- pipe_unlock(pipe);
-
- if (sd.num_spliced)
- ret = sd.num_spliced;
-
- if (ret > 0) {
- int err;
-
- err = generic_write_sync(out, *ppos, ret);
- if (err)
- ret = err;
- else
- *ppos += ret;
-
- balance_dirty_pages_ratelimited(mapping);
- }
-
- return ret;
-}
-
static ssize_t ocfs2_file_splice_read(struct file *in,
loff_t *ppos,
struct pipe_inode_info *pipe,
@@ -2517,7 +2445,7 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
in->f_path.dentry->d_name.name, len);
/*
- * See the comment in ocfs2_file_aio_read()
+ * See the comment in ocfs2_file_read_iter()
*/
ret = ocfs2_inode_lock_atime(inode, in->f_path.mnt, &lock_level);
if (ret < 0) {
@@ -2532,10 +2460,8 @@ bail:
return ret;
}
-static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos)
+static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
+ struct iov_iter *to)
{
int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
struct file *filp = iocb->ki_filp;
@@ -2544,7 +2470,8 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
filp->f_path.dentry->d_name.len,
- filp->f_path.dentry->d_name.name, nr_segs);
+ filp->f_path.dentry->d_name.name,
+ to->nr_segs); /* GRRRRR */
if (!inode) {
@@ -2589,13 +2516,13 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
}
ocfs2_inode_unlock(inode, lock_level);
- ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
+ ret = generic_file_read_iter(iocb, to);
trace_generic_file_aio_read_ret(ret);
/* buffered aio wouldn't have proper lock coverage today */
BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
- /* see ocfs2_file_aio_write */
+ /* see ocfs2_file_write_iter */
if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
rw_level = -1;
have_alloc_sem = 0;
@@ -2623,7 +2550,16 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
case SEEK_SET:
break;
case SEEK_END:
- offset += inode->i_size;
+ /* SEEK_END requires the OCFS2 inode lock for the file
+ * because it references the file's size.
+ */
+ ret = ocfs2_inode_lock(inode, NULL, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ offset += i_size_read(inode);
+ ocfs2_inode_unlock(inode, 0);
break;
case SEEK_CUR:
if (offset == 0) {
@@ -2662,6 +2598,7 @@ const struct inode_operations ocfs2_file_iops = {
.removexattr = generic_removexattr,
.fiemap = ocfs2_fiemap,
.get_acl = ocfs2_iop_get_acl,
+ .set_acl = ocfs2_iop_set_acl,
};
const struct inode_operations ocfs2_special_file_iops = {
@@ -2669,6 +2606,7 @@ const struct inode_operations ocfs2_special_file_iops = {
.getattr = ocfs2_getattr,
.permission = ocfs2_permission,
.get_acl = ocfs2_iop_get_acl,
+ .set_acl = ocfs2_iop_set_acl,
};
/*
@@ -2677,14 +2615,14 @@ const struct inode_operations ocfs2_special_file_iops = {
*/
const struct file_operations ocfs2_fops = {
.llseek = ocfs2_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
.mmap = ocfs2_mmap,
.fsync = ocfs2_sync_file,
.release = ocfs2_file_release,
.open = ocfs2_file_open,
- .aio_read = ocfs2_file_aio_read,
- .aio_write = ocfs2_file_aio_write,
+ .read_iter = ocfs2_file_read_iter,
+ .write_iter = ocfs2_file_write_iter,
.unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
@@ -2692,7 +2630,7 @@ const struct file_operations ocfs2_fops = {
.lock = ocfs2_lock,
.flock = ocfs2_flock,
.splice_read = ocfs2_file_splice_read,
- .splice_write = ocfs2_file_splice_write,
+ .splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
};
@@ -2725,21 +2663,21 @@ const struct file_operations ocfs2_dops = {
*/
const struct file_operations ocfs2_fops_no_plocks = {
.llseek = ocfs2_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
.mmap = ocfs2_mmap,
.fsync = ocfs2_sync_file,
.release = ocfs2_file_release,
.open = ocfs2_file_open,
- .aio_read = ocfs2_file_aio_read,
- .aio_write = ocfs2_file_aio_write,
+ .read_iter = ocfs2_file_read_iter,
+ .write_iter = ocfs2_file_write_iter,
.unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
#endif
.flock = ocfs2_flock,
.splice_read = ocfs2_file_splice_read,
- .splice_write = ocfs2_file_splice_write,
+ .splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
};
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index f87f9bd1edf..437de7f768c 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -130,6 +130,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
struct inode *inode = NULL;
struct super_block *sb = osb->sb;
struct ocfs2_find_inode_args args;
+ journal_t *journal = OCFS2_SB(sb)->journal->j_journal;
trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
sysfile_type);
@@ -169,6 +170,32 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
goto bail;
}
+ /*
+ * Set transaction id's of transactions that have to be committed
+ * to finish f[data]sync. We set them to currently running transaction
+ * as we cannot be sure that the inode or some of its metadata isn't
+ * part of the transaction - the inode could have been reclaimed and
+ * now it is reread from disk.
+ */
+ if (journal) {
+ transaction_t *transaction;
+ tid_t tid;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+ read_lock(&journal->j_state_lock);
+ if (journal->j_running_transaction)
+ transaction = journal->j_running_transaction;
+ else
+ transaction = journal->j_committing_transaction;
+ if (transaction)
+ tid = transaction->t_tid;
+ else
+ tid = journal->j_commit_sequence;
+ read_unlock(&journal->j_state_lock);
+ oi->i_sync_tid = tid;
+ oi->i_datasync_tid = tid;
+ }
+
bail:
if (!IS_ERR(inode)) {
trace_ocfs2_iget_end(inode,
@@ -386,19 +413,9 @@ static int ocfs2_read_locked_inode(struct inode *inode,
u32 generation = 0;
status = -EINVAL;
- if (inode == NULL || inode->i_sb == NULL) {
- mlog(ML_ERROR, "bad inode\n");
- return status;
- }
sb = inode->i_sb;
osb = OCFS2_SB(sb);
- if (!args) {
- mlog(ML_ERROR, "bad inode args\n");
- make_bad_inode(inode);
- return status;
- }
-
/*
* To improve performance of cold-cache inode stats, we take
* the cluster lock here if possible.
@@ -814,11 +831,13 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
goto bail;
}
- /* If we're coming from downconvert_thread we can't go into our own
- * voting [hello, deadlock city!], so unforuntately we just
- * have to skip deleting this guy. That's OK though because
- * the node who's doing the actual deleting should handle it
- * anyway. */
+ /*
+ * If we're coming from downconvert_thread we can't go into our own
+ * voting [hello, deadlock city!] so we cannot delete the inode. But
+ * since we dropped last inode ref when downconverting dentry lock,
+ * we cannot have the file open and thus the node doing unlink will
+ * take care of deleting the inode.
+ */
if (current == osb->dc_task)
goto bail;
@@ -832,12 +851,6 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
goto bail_unlock;
}
- /* If we have allowd wipe of this inode for another node, it
- * will be marked here so we can safely skip it. Recovery will
- * cleanup any inodes we might inadvertently skip here. */
- if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE)
- goto bail_unlock;
-
ret = 1;
bail_unlock:
spin_unlock(&oi->ip_lock);
@@ -951,7 +964,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
(unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
if (sync_data)
filemap_write_and_wait(inode->i_mapping);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
}
static void ocfs2_delete_inode(struct inode *inode)
@@ -970,8 +983,6 @@ static void ocfs2_delete_inode(struct inode *inode)
if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno)
goto bail;
- dquot_initialize(inode);
-
if (!ocfs2_inode_is_valid_to_delete(inode)) {
/* It's probably not necessary to truncate_inode_pages
* here but we do it for safety anyway (it will most
@@ -980,6 +991,8 @@ static void ocfs2_delete_inode(struct inode *inode)
goto bail;
}
+ dquot_initialize(inode);
+
/* We want to block signals in delete_inode as the lock and
* messaging paths may return us -ERESTARTSYS. Which would
* cause us to exit early, resulting in inodes being orphaned
@@ -1067,6 +1080,7 @@ static void ocfs2_clear_inode(struct inode *inode)
{
int status;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
clear_inode(inode);
trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno,
@@ -1083,9 +1097,9 @@ static void ocfs2_clear_inode(struct inode *inode)
/* Do these before all the other work so that we don't bounce
* the downconvert thread while waiting to destroy the locks. */
- ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
- ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
- ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
+ ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres);
+ ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres);
+ ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres);
ocfs2_resv_discard(&OCFS2_SB(inode->i_sb)->osb_la_resmap,
&oi->ip_la_data_resv);
@@ -1167,7 +1181,7 @@ void ocfs2_evict_inode(struct inode *inode)
(OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) {
ocfs2_delete_inode(inode);
} else {
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
}
ocfs2_clear_inode(inode);
}
@@ -1270,6 +1284,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
ocfs2_journal_dirty(handle, bh);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
leave:
return status;
}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 621fc73bf23..a6c991c0fc9 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -44,7 +44,7 @@ struct ocfs2_inode_info
struct rw_semaphore ip_xattr_sem;
/* Number of outstanding AIO's which are not page aligned */
- atomic_t ip_unaligned_aio;
+ struct mutex ip_unaligned_aio;
/* These fields are protected by ip_lock */
spinlock_t ip_lock;
@@ -73,6 +73,13 @@ struct ocfs2_inode_info
u32 ip_dir_lock_gen;
struct ocfs2_alloc_reservation ip_la_data_resv;
+
+ /*
+ * Transactions that contain inode's metadata needed to complete
+ * fsync and fdatasync, respectively.
+ */
+ tid_t i_sync_tid;
+ tid_t i_datasync_tid;
};
/*
@@ -84,8 +91,6 @@ struct ocfs2_inode_info
#define OCFS2_INODE_BITMAP 0x00000004
/* This inode has been wiped from disk */
#define OCFS2_INODE_DELETED 0x00000008
-/* Another node is deleting, so our delete is a nop */
-#define OCFS2_INODE_SKIP_DELETE 0x00000010
/* Has the inode been orphaned on another node?
*
* This hints to ocfs2_drop_inode that it should clear i_nlink before
@@ -100,11 +105,11 @@ struct ocfs2_inode_info
* rely on ocfs2_delete_inode to sort things out under the proper
* cluster locks.
*/
-#define OCFS2_INODE_MAYBE_ORPHANED 0x00000020
+#define OCFS2_INODE_MAYBE_ORPHANED 0x00000010
/* Does someone have the file open O_DIRECT */
-#define OCFS2_INODE_OPEN_DIRECT 0x00000040
+#define OCFS2_INODE_OPEN_DIRECT 0x00000020
/* Tell the inode wipe code it's not in orphan dir */
-#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000080
+#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040
static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
{
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index fa32ce9b455..6f66b3751ac 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,6 +7,7 @@
#include <linux/fs.h>
#include <linux/mount.h>
+#include <linux/blkdev.h>
#include <linux/compat.h>
#include <cluster/masklog.h>
@@ -142,8 +143,8 @@ bail:
return status;
}
-int ocfs2_info_handle_blocksize(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_blocksize(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
int status = -EFAULT;
struct ocfs2_info_blocksize oib;
@@ -166,8 +167,8 @@ bail:
return status;
}
-int ocfs2_info_handle_clustersize(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_clustersize(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
int status = -EFAULT;
struct ocfs2_info_clustersize oic;
@@ -191,8 +192,8 @@ bail:
return status;
}
-int ocfs2_info_handle_maxslots(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_maxslots(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
int status = -EFAULT;
struct ocfs2_info_maxslots oim;
@@ -216,8 +217,8 @@ bail:
return status;
}
-int ocfs2_info_handle_label(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_label(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
int status = -EFAULT;
struct ocfs2_info_label oil;
@@ -241,8 +242,8 @@ bail:
return status;
}
-int ocfs2_info_handle_uuid(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_uuid(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
int status = -EFAULT;
struct ocfs2_info_uuid oiu;
@@ -266,8 +267,8 @@ bail:
return status;
}
-int ocfs2_info_handle_fs_features(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_fs_features(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
int status = -EFAULT;
struct ocfs2_info_fs_features oif;
@@ -293,8 +294,8 @@ bail:
return status;
}
-int ocfs2_info_handle_journal_size(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_journal_size(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
int status = -EFAULT;
struct ocfs2_info_journal_size oij;
@@ -318,9 +319,10 @@ bail:
return status;
}
-int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
- struct inode *inode_alloc, u64 blkno,
- struct ocfs2_info_freeinode *fi, u32 slot)
+static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
+ struct inode *inode_alloc, u64 blkno,
+ struct ocfs2_info_freeinode *fi,
+ u32 slot)
{
int status = 0, unlock = 0;
@@ -365,8 +367,8 @@ bail:
return status;
}
-int ocfs2_info_handle_freeinode(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_freeinode(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
u32 i;
u64 blkno = -1;
@@ -412,11 +414,12 @@ int ocfs2_info_handle_freeinode(struct inode *inode,
}
status = ocfs2_info_scan_inode_alloc(osb, inode_alloc, blkno, oifi, i);
- if (status < 0)
- goto bail;
iput(inode_alloc);
inode_alloc = NULL;
+
+ if (status < 0)
+ goto bail;
}
o2info_set_request_filled(&oifi->ifi_req);
@@ -460,19 +463,19 @@ static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats,
stats->ffs_free_chunks_real++;
}
-void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg,
- unsigned int chunksize)
+static void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg,
+ unsigned int chunksize)
{
o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize);
o2ffg_update_stats(&(ffg->iff_ffs), chunksize);
}
-int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb,
- struct inode *gb_inode,
- struct ocfs2_dinode *gb_dinode,
- struct ocfs2_chain_rec *rec,
- struct ocfs2_info_freefrag *ffg,
- u32 chunks_in_group)
+static int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb,
+ struct inode *gb_inode,
+ struct ocfs2_dinode *gb_dinode,
+ struct ocfs2_chain_rec *rec,
+ struct ocfs2_info_freefrag *ffg,
+ u32 chunks_in_group)
{
int status = 0, used;
u64 blkno;
@@ -570,9 +573,9 @@ bail:
return status;
}
-int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
- struct inode *gb_inode, u64 blkno,
- struct ocfs2_info_freefrag *ffg)
+static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
+ struct inode *gb_inode, u64 blkno,
+ struct ocfs2_info_freefrag *ffg)
{
u32 chunks_in_group;
int status = 0, unlock = 0, i;
@@ -650,8 +653,8 @@ bail:
return status;
}
-int ocfs2_info_handle_freefrag(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_freefrag(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
u64 blkno = -1;
char namebuf[40];
@@ -721,8 +724,8 @@ out_err:
return status;
}
-int ocfs2_info_handle_unknown(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_unknown(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
int status = -EFAULT;
struct ocfs2_info_request oir;
@@ -750,8 +753,8 @@ bail:
* - distinguish different requests.
* - validate size of different requests.
*/
-int ocfs2_info_handle_request(struct inode *inode,
- struct ocfs2_info_request __user *req)
+static int ocfs2_info_handle_request(struct inode *inode,
+ struct ocfs2_info_request __user *req)
{
int status = -EFAULT;
struct ocfs2_info_request oir;
@@ -809,8 +812,8 @@ bail:
return status;
}
-int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
- u64 *req_addr, int compat_flag)
+static int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx,
+ u64 *req_addr, int compat_flag)
{
int status = -EFAULT;
u64 __user *bp = NULL;
@@ -847,8 +850,8 @@ bail:
* a better backward&forward compatibility, since a small piece of
* request will be less likely to be broken if disk layout get changed.
*/
-int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
- int compat_flag)
+static int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
+ int compat_flag)
{
int i, status = 0;
u64 req_addr;
@@ -966,15 +969,21 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
case FITRIM:
{
struct super_block *sb = inode->i_sb;
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
struct fstrim_range range;
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
if (copy_from_user(&range, argp, sizeof(range)))
return -EFAULT;
+ range.minlen = max_t(u64, q->limits.discard_granularity,
+ range.minlen);
ret = ocfs2_trim_fs(sb, &range);
if (ret < 0)
return ret;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 44fc3e530c3..4b0c68849b3 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -30,6 +30,7 @@
#include <linux/kthread.h>
#include <linux/time.h>
#include <linux/random.h>
+#include <linux/delay.h>
#include <cluster/masklog.h>
@@ -2132,12 +2133,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
iter = oi->ip_next_orphan;
spin_lock(&oi->ip_lock);
- /* The remote delete code may have set these on the
- * assumption that the other node would wipe them
- * successfully. If they are still in the node's
- * orphan dir, we need to reset that state. */
- oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
-
/* Set the proper information to get us going into
* ocfs2_delete_inode. */
oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
@@ -2191,8 +2186,20 @@ static int ocfs2_commit_thread(void *arg)
|| kthread_should_stop());
status = ocfs2_commit_cache(osb);
- if (status < 0)
- mlog_errno(status);
+ if (status < 0) {
+ static unsigned long abort_warn_time;
+
+ /* Warn about this once per minute */
+ if (printk_timed_ratelimit(&abort_warn_time, 60*HZ))
+ mlog(ML_ERROR, "status = %d, journal is "
+ "already aborted.\n", status);
+ /*
+ * After ocfs2_commit_cache() fails, j_num_trans has a
+ * non-zero value. Sleep here to avoid a busy-wait
+ * loop.
+ */
+ msleep_interruptible(1000);
+ }
if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){
mlog(ML_KTHREAD,
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 0b479bab367..7f8cde94abf 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -524,8 +524,7 @@ static inline int ocfs2_calc_dxi_expand_credits(struct super_block *sb)
* the result may be wrong.
*/
static inline int ocfs2_calc_extend_credits(struct super_block *sb,
- struct ocfs2_extent_list *root_el,
- u32 bits_wanted)
+ struct ocfs2_extent_list *root_el)
{
int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks;
@@ -627,4 +626,15 @@ static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
new_size);
}
+static inline void ocfs2_update_inode_fsync_trans(handle_t *handle,
+ struct inode *inode,
+ int datasync)
+{
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+ oi->i_sync_tid = handle->h_transaction->t_tid;
+ if (datasync)
+ oi->i_datasync_tid = handle->h_transaction->t_tid;
+}
+
#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index cd5496b7a0a..04401345562 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -781,6 +781,48 @@ bail:
return status;
}
+int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct ocfs2_alloc_context *ac,
+ u32 bit_off,
+ u32 num_bits)
+{
+ int status, start;
+ u32 clear_bits;
+ struct inode *local_alloc_inode;
+ void *bitmap;
+ struct ocfs2_dinode *alloc;
+ struct ocfs2_local_alloc *la;
+
+ BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
+
+ local_alloc_inode = ac->ac_inode;
+ alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+ la = OCFS2_LOCAL_ALLOC(alloc);
+
+ bitmap = la->la_bitmap;
+ start = bit_off - le32_to_cpu(la->la_bm_off);
+ clear_bits = num_bits;
+
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(local_alloc_inode),
+ osb->local_alloc_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ while (clear_bits--)
+ ocfs2_clear_bit(start++, bitmap);
+
+ le32_add_cpu(&alloc->id1.bitmap1.i_used, -num_bits);
+ ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+
+bail:
+ return status;
+}
+
static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
{
u32 count;
diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h
index 1be9b586446..44a7d1fb2de 100644
--- a/fs/ocfs2/localalloc.h
+++ b/fs/ocfs2/localalloc.h
@@ -55,6 +55,12 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
u32 *bit_off,
u32 *num_bits);
+int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb,
+ handle_t *handle,
+ struct ocfs2_alloc_context *ac,
+ u32 bit_off,
+ u32 num_bits);
+
void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
unsigned int num_clusters);
void ocfs2_la_enable_worker(struct work_struct *work);
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index e57c804069e..6b6d092b099 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -82,6 +82,8 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
}
ret = flock_lock_file_wait(file, fl);
+ if (ret)
+ ocfs2_file_unlock(file);
out:
mutex_unlock(&fp->fp_mutex);
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 3d3f3c83065..599eb4c4c8b 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -151,6 +151,7 @@ static int __ocfs2_move_extent(handle_t *handle,
old_blkno, len);
}
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
out:
ocfs2_free_path(path);
return ret;
@@ -201,8 +202,7 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode,
}
}
- *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el,
- clusters_to_move + 2);
+ *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el);
mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
extra_blocks, clusters_to_move, *credits);
@@ -562,83 +562,6 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
}
-static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
- handle_t *handle,
- struct buffer_head *di_bh,
- u32 num_bits,
- u16 chain)
-{
- int ret;
- u32 tmp_used;
- struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
- struct ocfs2_chain_list *cl =
- (struct ocfs2_chain_list *) &di->id2.i_chain;
-
- ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
-
- tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
- di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
- le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
- ocfs2_journal_dirty(handle, di_bh);
-
-out:
- return ret;
-}
-
-static inline int ocfs2_block_group_set_bits(handle_t *handle,
- struct inode *alloc_inode,
- struct ocfs2_group_desc *bg,
- struct buffer_head *group_bh,
- unsigned int bit_off,
- unsigned int num_bits)
-{
- int status;
- void *bitmap = bg->bg_bitmap;
- int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
-
- /* All callers get the descriptor via
- * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
- BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
- BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
-
- mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
- num_bits);
-
- if (ocfs2_is_cluster_bitmap(alloc_inode))
- journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
-
- status = ocfs2_journal_access_gd(handle,
- INODE_CACHE(alloc_inode),
- group_bh,
- journal_type);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
-
- le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
- if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
- ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
- " count %u but claims %u are freed. num_bits %d",
- (unsigned long long)le64_to_cpu(bg->bg_blkno),
- le16_to_cpu(bg->bg_bits),
- le16_to_cpu(bg->bg_free_bits_count), num_bits);
- return -EROFS;
- }
- while (num_bits--)
- ocfs2_set_bit(bit_off++, bitmap);
-
- ocfs2_journal_dirty(handle, group_bh);
-
-bail:
- return status;
-}
-
static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
u32 len, int ext_flags)
@@ -768,8 +691,11 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
goal_bit, len);
- if (ret)
+ if (ret) {
+ ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len,
+ le16_to_cpu(gd->bg_chain));
mlog_errno(ret);
+ }
/*
* Here we should write the new page out first if we are
@@ -1035,6 +961,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
inode->i_ctime = CURRENT_TIME;
di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, di_bh);
@@ -1067,8 +994,10 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
if (status)
return status;
- if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
+ if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) {
+ status = -EPERM;
goto out_drop;
+ }
if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
status = -EPERM;
@@ -1090,8 +1019,10 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
goto out_free;
}
- if (range.me_start > i_size_read(inode))
+ if (range.me_start > i_size_read(inode)) {
+ status = -EINVAL;
goto out_free;
+ }
if (range.me_start + range.me_len > i_size_read(inode))
range.me_len = i_size_read(inode) - range.me_start;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index be3f8676a43..8add6f1030d 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -205,6 +205,21 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
return inode;
}
+static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb,
+ struct dentry *dentry, struct inode *inode)
+{
+ struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
+
+ ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
+ ocfs2_lock_res_free(&dl->dl_lockres);
+ BUG_ON(dl->dl_count != 1);
+ spin_lock(&dentry_attach_lock);
+ dentry->d_fsdata = NULL;
+ spin_unlock(&dentry_attach_lock);
+ kfree(dl);
+ iput(inode);
+}
+
static int ocfs2_mknod(struct inode *dir,
struct dentry *dentry,
umode_t mode,
@@ -230,6 +245,8 @@ static int ocfs2_mknod(struct inode *dir,
struct ocfs2_dir_lookup_result lookup = { NULL, };
sigset_t oldset;
int did_block_signals = 0;
+ struct posix_acl *default_acl = NULL, *acl = NULL;
+ struct ocfs2_dentry_lock *dl = NULL;
trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,
(unsigned long long)OCFS2_I(dir)->ip_blkno,
@@ -331,6 +348,12 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
+ status = posix_acl_create(dir, &mode, &default_acl, &acl);
+ if (status) {
+ mlog_errno(status);
+ goto leave;
+ }
+
handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
S_ISDIR(mode),
xattr_credits));
@@ -379,8 +402,17 @@ static int ocfs2_mknod(struct inode *dir,
inc_nlink(dir);
}
- status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
- meta_ac, data_ac);
+ if (default_acl) {
+ status = ocfs2_set_acl(handle, inode, new_fe_bh,
+ ACL_TYPE_DEFAULT, default_acl,
+ meta_ac, data_ac);
+ }
+ if (!status && acl) {
+ status = ocfs2_set_acl(handle, inode, new_fe_bh,
+ ACL_TYPE_ACCESS, acl,
+ meta_ac, data_ac);
+ }
+
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -407,6 +439,8 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
+ dl = dentry->d_fsdata;
+
status = ocfs2_add_entry(handle, dentry, inode,
OCFS2_I(inode)->ip_blkno, parent_fe_bh,
&lookup);
@@ -419,6 +453,10 @@ static int ocfs2_mknod(struct inode *dir,
d_instantiate(dentry, inode);
status = 0;
leave:
+ if (default_acl)
+ posix_acl_release(default_acl);
+ if (acl)
+ posix_acl_release(acl);
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
if (handle)
@@ -430,7 +468,6 @@ leave:
brelse(new_fe_bh);
brelse(parent_fe_bh);
- kfree(si.name);
kfree(si.value);
ocfs2_free_dir_lookup_result(&lookup);
@@ -450,6 +487,9 @@ leave:
* ocfs2_delete_inode will mutex_lock again.
*/
if ((status < 0) && inode) {
+ if (dl)
+ ocfs2_cleanup_add_entry_failure(osb, dentry, inode);
+
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
clear_nlink(inode);
iput(inode);
@@ -475,6 +515,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
struct ocfs2_dinode *fe = NULL;
struct ocfs2_extent_list *fel;
u16 feat;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
*new_fe_bh = NULL;
@@ -489,7 +530,7 @@ static int __ocfs2_mknod_locked(struct inode *dir,
*new_fe_bh = sb_getblk(osb->sb, fe_blkno);
if (!*new_fe_bh) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto leave;
}
@@ -556,8 +597,8 @@ static int __ocfs2_mknod_locked(struct inode *dir,
mlog_errno(status);
}
- status = 0; /* error in ocfs2_create_new_inode_locks is not
- * critical */
+ oi->i_sync_tid = handle->h_transaction->t_tid;
+ oi->i_datasync_tid = handle->h_transaction->t_tid;
leave:
if (status < 0) {
@@ -644,6 +685,7 @@ static int ocfs2_link(struct dentry *old_dentry,
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct ocfs2_dir_lookup_result lookup = { NULL, };
sigset_t oldset;
+ u64 old_de_ino;
trace_ocfs2_link((unsigned long long)OCFS2_I(inode)->ip_blkno,
old_dentry->d_name.len, old_dentry->d_name.name,
@@ -666,6 +708,22 @@ static int ocfs2_link(struct dentry *old_dentry,
goto out;
}
+ err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name,
+ old_dentry->d_name.len, &old_de_ino);
+ if (err) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ /*
+ * Check whether another node removed the source inode while we
+ * were in the vfs.
+ */
+ if (old_de_ino != OCFS2_I(inode)->ip_blkno) {
+ err = -ENOENT;
+ goto out;
+ }
+
err = ocfs2_check_dir_for_entry(dir, dentry->d_name.name,
dentry->d_name.len);
if (err)
@@ -948,12 +1006,71 @@ leave:
ocfs2_free_dir_lookup_result(&orphan_insert);
ocfs2_free_dir_lookup_result(&lookup);
- if (status && (status != -ENOTEMPTY))
+ if (status && (status != -ENOTEMPTY) && (status != -ENOENT))
mlog_errno(status);
return status;
}
+static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
+ u64 src_inode_no, u64 dest_inode_no)
+{
+ int ret = 0, i = 0;
+ u64 parent_inode_no = 0;
+ u64 child_inode_no = src_inode_no;
+ struct inode *child_inode;
+
+#define MAX_LOOKUP_TIMES 32
+ while (1) {
+ child_inode = ocfs2_iget(osb, child_inode_no, 0, 0);
+ if (IS_ERR(child_inode)) {
+ ret = PTR_ERR(child_inode);
+ break;
+ }
+
+ ret = ocfs2_inode_lock(child_inode, NULL, 0);
+ if (ret < 0) {
+ iput(child_inode);
+ if (ret != -ENOENT)
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2,
+ &parent_inode_no);
+ ocfs2_inode_unlock(child_inode, 0);
+ iput(child_inode);
+ if (ret < 0) {
+ ret = -ENOENT;
+ break;
+ }
+
+ if (parent_inode_no == dest_inode_no) {
+ ret = 1;
+ break;
+ }
+
+ if (parent_inode_no == osb->root_inode->i_ino) {
+ ret = 0;
+ break;
+ }
+
+ child_inode_no = parent_inode_no;
+
+ if (++i >= MAX_LOOKUP_TIMES) {
+ mlog(ML_NOTICE, "max lookup times reached, filesystem "
+ "may have nested directories, "
+ "src inode: %llu, dest inode: %llu.\n",
+ (unsigned long long)src_inode_no,
+ (unsigned long long)dest_inode_no);
+ ret = 0;
+ break;
+ }
+ }
+
+ return ret;
+}
+
/*
* The only place this should be used is rename!
* if they have the same id, then the 1st one is the only one locked.
@@ -965,6 +1082,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
struct inode *inode2)
{
int status;
+ int inode1_is_ancestor, inode2_is_ancestor;
struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
struct buffer_head **tmpbh;
@@ -978,9 +1096,26 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
if (*bh2)
*bh2 = NULL;
- /* we always want to lock the one with the lower lockid first. */
+ /* we always want to lock the one with the lower lockid first.
+ * and if they are nested, we lock ancestor first */
if (oi1->ip_blkno != oi2->ip_blkno) {
- if (oi1->ip_blkno < oi2->ip_blkno) {
+ inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno,
+ oi1->ip_blkno);
+ if (inode1_is_ancestor < 0) {
+ status = inode1_is_ancestor;
+ goto bail;
+ }
+
+ inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno,
+ oi2->ip_blkno);
+ if (inode2_is_ancestor < 0) {
+ status = inode2_is_ancestor;
+ goto bail;
+ }
+
+ if ((inode1_is_ancestor == 1) ||
+ (oi1->ip_blkno < oi2->ip_blkno &&
+ inode2_is_ancestor == 0)) {
/* switch id1 and id2 around */
tmpbh = bh2;
bh2 = bh1;
@@ -1061,6 +1196,7 @@ static int ocfs2_rename(struct inode *old_dir,
struct ocfs2_dir_lookup_result old_entry_lookup = { NULL, };
struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
struct ocfs2_dir_lookup_result target_insert = { NULL, };
+ bool should_add_orphan = false;
/* At some point it might be nice to break this function up a
* bit. */
@@ -1097,6 +1233,21 @@ static int ocfs2_rename(struct inode *old_dir,
goto bail;
}
rename_lock = 1;
+
+ /* here we cannot guarantee the inodes haven't just been
+ * changed, so check if they are nested again */
+ status = ocfs2_check_if_ancestor(osb, new_dir->i_ino,
+ old_inode->i_ino);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ } else if (status == 1) {
+ status = -EPERM;
+ trace_ocfs2_rename_not_permitted(
+ (unsigned long long)old_inode->i_ino,
+ (unsigned long long)new_dir->i_ino);
+ goto bail;
+ }
}
/* if old and new are the same, this'll just do one lock. */
@@ -1267,6 +1418,7 @@ static int ocfs2_rename(struct inode *old_dir,
mlog_errno(status);
goto bail;
}
+ should_add_orphan = true;
}
} else {
BUG_ON(new_dentry->d_parent->d_inode != new_dir);
@@ -1311,17 +1463,6 @@ static int ocfs2_rename(struct inode *old_dir,
goto bail;
}
- if (S_ISDIR(new_inode->i_mode) ||
- (ocfs2_read_links_count(newfe) == 1)) {
- status = ocfs2_orphan_add(osb, handle, new_inode,
- newfe_bh, orphan_name,
- &orphan_insert, orphan_dir);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- }
-
/* change the dirent to point to the correct inode */
status = ocfs2_update_entry(new_dir, handle, &target_lookup_res,
old_inode);
@@ -1336,6 +1477,15 @@ static int ocfs2_rename(struct inode *old_dir,
else
ocfs2_add_links_count(newfe, -1);
ocfs2_journal_dirty(handle, newfe_bh);
+ if (should_add_orphan) {
+ status = ocfs2_orphan_add(osb, handle, new_inode,
+ newfe_bh, orphan_name,
+ &orphan_insert, orphan_dir);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
} else {
/* if the name was not found in new_dir, add it now */
status = ocfs2_add_entry(handle, new_dentry, old_inode,
@@ -1605,6 +1755,7 @@ static int ocfs2_symlink(struct inode *dir,
struct ocfs2_dir_lookup_result lookup = { NULL, };
sigset_t oldset;
int did_block_signals = 0;
+ struct ocfs2_dentry_lock *dl = NULL;
trace_ocfs2_symlink_begin(dir, dentry, symname,
dentry->d_name.len, dentry->d_name.name);
@@ -1793,6 +1944,8 @@ static int ocfs2_symlink(struct inode *dir,
goto bail;
}
+ dl = dentry->d_fsdata;
+
status = ocfs2_add_entry(handle, dentry, inode,
le64_to_cpu(fe->i_blkno), parent_fe_bh,
&lookup);
@@ -1818,7 +1971,6 @@ bail:
brelse(new_fe_bh);
brelse(parent_fe_bh);
- kfree(si.name);
kfree(si.value);
ocfs2_free_dir_lookup_result(&lookup);
if (inode_ac)
@@ -1828,6 +1980,9 @@ bail:
if (xattr_ac)
ocfs2_free_alloc_context(xattr_ac);
if ((status < 0) && inode) {
+ if (dl)
+ ocfs2_cleanup_add_entry_failure(osb, dentry, inode);
+
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR;
clear_nlink(inode);
iput(inode);
@@ -2444,6 +2599,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
di->i_orphaned_slot = 0;
set_nlink(inode, 1);
ocfs2_set_links_count(di, inode->i_nlink);
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
ocfs2_journal_dirty(handle, di_bh);
status = ocfs2_add_entry(handle, dentry, inode,
@@ -2504,4 +2660,5 @@ const struct inode_operations ocfs2_dir_iops = {
.removexattr = generic_removexattr,
.fiemap = ocfs2_fiemap,
.get_acl = ocfs2_iop_get_acl,
+ .set_acl = ocfs2_iop_set_acl,
};
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 3a903470c79..bbec539230f 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -30,6 +30,7 @@
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/list.h>
+#include <linux/llist.h>
#include <linux/rbtree.h>
#include <linux/workqueue.h>
#include <linux/kref.h>
@@ -274,19 +275,16 @@ enum ocfs2_mount_options
OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
};
-#define OCFS2_OSB_SOFT_RO 0x0001
-#define OCFS2_OSB_HARD_RO 0x0002
-#define OCFS2_OSB_ERROR_FS 0x0004
-#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008
-
-#define OCFS2_DEFAULT_ATIME_QUANTUM 60
+#define OCFS2_OSB_SOFT_RO 0x0001
+#define OCFS2_OSB_HARD_RO 0x0002
+#define OCFS2_OSB_ERROR_FS 0x0004
+#define OCFS2_DEFAULT_ATIME_QUANTUM 60
struct ocfs2_journal;
struct ocfs2_slot_info;
struct ocfs2_recovery_map;
struct ocfs2_replay_map;
struct ocfs2_quota_recovery;
-struct ocfs2_dentry_lock;
struct ocfs2_super
{
struct task_struct *commit_task;
@@ -387,6 +385,7 @@ struct ocfs2_super
u8 osb_stackflags;
char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
+ char osb_cluster_name[OCFS2_CLUSTER_NAME_LEN + 1];
struct ocfs2_cluster_connection *cconn;
struct ocfs2_lock_res osb_super_lockres;
struct ocfs2_lock_res osb_rename_lockres;
@@ -413,10 +412,9 @@ struct ocfs2_super
struct list_head blocked_lock_list;
unsigned long blocked_lock_count;
- /* List of dentry locks to release. Anyone can add locks to
- * the list, ocfs2_wq processes the list */
- struct ocfs2_dentry_lock *dentry_lock_list;
- struct work_struct dentry_lock_work;
+ /* List of dquot structures to drop last reference to */
+ struct llist_head dquot_drop_list;
+ struct work_struct dquot_drop_work;
wait_queue_head_t osb_mount_event;
@@ -424,6 +422,7 @@ struct ocfs2_super
struct inode *osb_tl_inode;
struct buffer_head *osb_tl_bh;
struct delayed_work osb_truncate_log_wq;
+ atomic_t osb_tl_disable;
/*
* How many clusters in our truncate log.
* It must be protected by osb_tl_inode->i_mutex.
@@ -448,6 +447,8 @@ struct ocfs2_super
/* rb tree root for refcount lock. */
struct rb_root osb_rf_lock_tree;
struct ocfs2_refcount_tree *osb_ref_tree_lru;
+
+ struct mutex system_file_mutex;
};
#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
@@ -578,18 +579,6 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb,
spin_unlock(&osb->osb_lock);
}
-
-static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb,
- unsigned long flag)
-{
- unsigned long ret;
-
- spin_lock(&osb->osb_lock);
- ret = osb->osb_flags & flag;
- spin_unlock(&osb->osb_lock);
- return ret;
-}
-
static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb,
int hard)
{
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 1b60c62aa9d..6cb019b7c6a 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -2292,6 +2292,8 @@ TRACE_EVENT(ocfs2_rename,
__entry->new_len, __get_str(new_name))
);
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted);
+
TRACE_EVENT(ocfs2_rename_target_exists,
TP_PROTO(int new_len, const char *new_name),
TP_ARGS(new_len, new_name),
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index d5ab56cbe5c..f266d67df3c 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -28,6 +28,7 @@ struct ocfs2_dquot {
unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */
s64 dq_origspace; /* Last globally synced space usage */
s64 dq_originodes; /* Last globally synced inode usage */
+ struct llist_node list; /* Member of list of dquots to drop */
};
/* Description of one chunk to recover in memory */
@@ -110,6 +111,7 @@ int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
int ocfs2_create_local_dquot(struct dquot *dquot);
int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);
int ocfs2_local_write_dquot(struct dquot *dquot);
+void ocfs2_drop_dquot_refs(struct work_struct *work);
extern const struct dquot_operations ocfs2_quota_operations;
extern struct quota_format_type ocfs2_quota_format;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index aaa50611ec6..b990a62cff5 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -10,6 +10,7 @@
#include <linux/jiffies.h>
#include <linux/writeback.h>
#include <linux/workqueue.h>
+#include <linux/llist.h>
#include <cluster/masklog.h>
@@ -679,6 +680,27 @@ static int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
OCFS2_INODE_UPDATE_CREDITS;
}
+void ocfs2_drop_dquot_refs(struct work_struct *work)
+{
+ struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
+ dquot_drop_work);
+ struct llist_node *list;
+ struct ocfs2_dquot *odquot, *next_odquot;
+
+ list = llist_del_all(&osb->dquot_drop_list);
+ llist_for_each_entry_safe(odquot, next_odquot, list, list) {
+ /* Drop the reference we acquired in ocfs2_dquot_release() */
+ dqput(&odquot->dq_dquot);
+ }
+}
+
+/*
+ * Called when the last reference to dquot is dropped. If we are called from
+ * downconvert thread, we cannot do all the handling here because grabbing
+ * quota lock could deadlock (the node holding the quota lock could need some
+ * other cluster lock to proceed but with blocked downconvert thread we cannot
+ * release any lock).
+ */
static int ocfs2_release_dquot(struct dquot *dquot)
{
handle_t *handle;
@@ -694,6 +716,19 @@ static int ocfs2_release_dquot(struct dquot *dquot)
/* Check whether we are not racing with some other dqget() */
if (atomic_read(&dquot->dq_count) > 1)
goto out;
+ /* Running from downconvert thread? Postpone quota processing to wq */
+ if (current == osb->dc_task) {
+ /*
+ * Grab our own reference to dquot and queue it for delayed
+ * dropping. Quota code rechecks after calling
+ * ->release_dquot() and won't free dquot structure.
+ */
+ dqgrab(dquot);
+ /* First entry on list -> queue work */
+ if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
+ queue_work(ocfs2_wq, &osb->dquot_drop_work);
+ goto out;
+ }
status = ocfs2_lock_global_qf(oinfo, 1);
if (status < 0)
goto out;
@@ -717,6 +752,12 @@ static int ocfs2_release_dquot(struct dquot *dquot)
*/
if (status < 0)
mlog_errno(status);
+ /*
+ * Clear dq_off so that we search for the structure in quota file next
+ * time we acquire it. The structure might be deleted and reallocated
+ * elsewhere by another node while our dquot structure is on freelist.
+ */
+ dquot->dq_off = 0;
clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_trans:
ocfs2_commit_trans(osb, handle);
@@ -756,16 +797,17 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
status = ocfs2_lock_global_qf(info, 1);
if (status < 0)
goto out;
- if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
- status = ocfs2_qinfo_lock(info, 0);
- if (status < 0)
- goto out_dq;
- status = qtree_read_dquot(&info->dqi_gi, dquot);
- ocfs2_qinfo_unlock(info, 0);
- if (status < 0)
- goto out_dq;
- }
- set_bit(DQ_READ_B, &dquot->dq_flags);
+ status = ocfs2_qinfo_lock(info, 0);
+ if (status < 0)
+ goto out_dq;
+ /*
+ * We always want to read dquot structure from disk because we don't
+ * know what happened with it while it was on freelist.
+ */
+ status = qtree_read_dquot(&info->dqi_gi, dquot);
+ ocfs2_qinfo_unlock(info, 0);
+ if (status < 0)
+ goto out_dq;
OCFS2_DQUOT(dquot)->dq_use_count++;
OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 2e4344be3b9..2001862bf2b 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -1303,10 +1303,6 @@ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
out:
- /* Clear the read bit so that next time someone uses this
- * dquot he reads fresh info from disk and allocates local
- * dquot structure */
- clear_bit(DQ_READ_B, &dquot->dq_flags);
return status;
}
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index bf4dfc14bb2..636aab69ead 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -46,6 +46,7 @@
#include <linux/quotaops.h>
#include <linux/namei.h>
#include <linux/mount.h>
+#include <linux/posix_acl.h>
struct ocfs2_cow_context {
struct inode *inode;
@@ -612,6 +613,11 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
}
new_bh = sb_getblk(inode->i_sb, first_blkno);
+ if (!new_bh) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out_commit;
+ }
ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
@@ -1310,7 +1316,7 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
new_bh = sb_getblk(sb, blkno);
if (new_bh == NULL) {
- ret = -EIO;
+ ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
@@ -1402,10 +1408,9 @@ static void swap_refcount_rec(void *a, void *b, int size)
{
struct ocfs2_refcount_rec *l = a, *r = b, tmp;
- tmp = *(struct ocfs2_refcount_rec *)l;
- *(struct ocfs2_refcount_rec *)l =
- *(struct ocfs2_refcount_rec *)r;
- *(struct ocfs2_refcount_rec *)r = tmp;
+ tmp = *l;
+ *l = *r;
+ *r = tmp;
}
/*
@@ -1561,7 +1566,7 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
new_bh = sb_getblk(sb, blkno);
if (new_bh == NULL) {
- ret = -EIO;
+ ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
@@ -2502,8 +2507,7 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
*meta_add += ocfs2_extend_meta_needed(et.et_root_el);
*credits += ocfs2_calc_extend_credits(sb,
- et.et_root_el,
- ref_blocks);
+ et.et_root_el);
} else {
*credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
*meta_add += 1;
@@ -2874,8 +2878,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb,
meta_add =
ocfs2_extend_meta_needed(et->et_root_el);
- *credits += ocfs2_calc_extend_credits(sb, et->et_root_el,
- num_clusters + 2);
+ *credits += ocfs2_calc_extend_credits(sb, et->et_root_el);
ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
p_cluster, num_clusters,
@@ -3031,7 +3034,7 @@ int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
for (i = 0; i < blocks; i++, old_block++, new_block++) {
new_bh = sb_getblk(osb->sb, new_block);
if (new_bh == NULL) {
- ret = -EIO;
+ ret = -ENOMEM;
mlog_errno(ret);
break;
}
@@ -3625,8 +3628,7 @@ int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
*credits += ocfs2_calc_extend_credits(inode->i_sb,
- et.et_root_el,
- ref_blocks);
+ et.et_root_el);
}
out:
@@ -4266,20 +4268,36 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
struct inode *inode = old_dentry->d_inode;
struct buffer_head *old_bh = NULL;
struct inode *new_orphan_inode = NULL;
+ struct posix_acl *default_acl, *acl;
+ umode_t mode;
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
return -EOPNOTSUPP;
- error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
+ mode = inode->i_mode;
+ error = posix_acl_create(dir, &mode, &default_acl, &acl);
+ if (error) {
+ mlog_errno(error);
+ goto out;
+ }
+
+ error = ocfs2_create_inode_in_orphan(dir, mode,
&new_orphan_inode);
if (error) {
mlog_errno(error);
goto out;
}
+ error = ocfs2_rw_lock(inode, 1);
+ if (error) {
+ mlog_errno(error);
+ goto out;
+ }
+
error = ocfs2_inode_lock(inode, &old_bh, 1);
if (error) {
mlog_errno(error);
+ ocfs2_rw_unlock(inode, 1);
goto out;
}
@@ -4291,6 +4309,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
up_write(&OCFS2_I(inode)->ip_xattr_sem);
ocfs2_inode_unlock(inode, 1);
+ ocfs2_rw_unlock(inode, 1);
brelse(old_bh);
if (error) {
@@ -4301,11 +4320,16 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
/* If the security isn't preserved, we need to re-initialize them. */
if (!preserve) {
error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
- &new_dentry->d_name);
+ &new_dentry->d_name,
+ default_acl, acl);
if (error)
mlog_errno(error);
}
out:
+ if (default_acl)
+ posix_acl_release(default_acl);
+ if (acl)
+ posix_acl_release(acl);
if (!error) {
error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
new_dentry);
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index ec55add7604..d5da6f62414 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -53,8 +53,6 @@
*/
static u16 ocfs2_calc_new_backup_super(struct inode *inode,
struct ocfs2_group_desc *gd,
- int new_clusters,
- u32 first_new_cluster,
u16 cl_cpg,
int set)
{
@@ -127,8 +125,6 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
OCFS2_FEATURE_COMPAT_BACKUP_SB)) {
backups = ocfs2_calc_new_backup_super(bm_inode,
group,
- new_clusters,
- first_new_cluster,
cl_cpg, 1);
le16_add_cpu(&group->bg_free_bits_count, -1 * backups);
}
@@ -157,7 +153,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
spin_lock(&OCFS2_I(bm_inode)->ip_lock);
OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
- le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits);
+ le64_add_cpu(&fe->i_size, (u64)new_clusters << osb->s_clustersize_bits);
spin_unlock(&OCFS2_I(bm_inode)->ip_lock);
i_size_write(bm_inode, le64_to_cpu(fe->i_size));
@@ -167,8 +163,6 @@ out_rollback:
if (ret < 0) {
ocfs2_calc_new_backup_super(bm_inode,
group,
- new_clusters,
- first_new_cluster,
cl_cpg, 0);
le16_add_cpu(&group->bg_free_bits_count, backups);
le16_add_cpu(&group->bg_bits, -1 * num_bits);
@@ -469,6 +463,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
struct ocfs2_chain_list *cl;
struct ocfs2_chain_rec *cr;
u16 cl_bpc;
+ u64 bg_ptr;
if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
return -EROFS;
@@ -513,7 +508,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
ret = ocfs2_verify_group_and_input(main_bm_inode, fe, input, group_bh);
if (ret) {
mlog_errno(ret);
- goto out_unlock;
+ goto out_free_group_bh;
}
trace_ocfs2_group_add((unsigned long long)input->group,
@@ -523,7 +518,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
if (IS_ERR(handle)) {
mlog_errno(PTR_ERR(handle));
ret = -EINVAL;
- goto out_unlock;
+ goto out_free_group_bh;
}
cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
@@ -538,12 +533,14 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
}
group = (struct ocfs2_group_desc *)group_bh->b_data;
+ bg_ptr = le64_to_cpu(group->bg_next_group);
group->bg_next_group = cr->c_blkno;
ocfs2_journal_dirty(handle, group_bh);
ret = ocfs2_journal_access_di(handle, INODE_CACHE(main_bm_inode),
main_bm_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
+ group->bg_next_group = cpu_to_le64(bg_ptr);
mlog_errno(ret);
goto out_commit;
}
@@ -566,7 +563,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
spin_lock(&OCFS2_I(main_bm_inode)->ip_lock);
OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
- le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits);
+ le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits);
spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock);
i_size_write(main_bm_inode, le64_to_cpu(fe->i_size));
@@ -574,8 +571,11 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
out_commit:
ocfs2_commit_trans(osb, handle);
-out_unlock:
+
+out_free_group_bh:
brelse(group_bh);
+
+out_unlock:
brelse(main_bm_bh);
ocfs2_inode_unlock(main_bm_inode, 1);
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index bf1f8930456..1724d43d3da 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -398,7 +398,8 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn)
return 0;
}
-static int o2cb_cluster_this_node(unsigned int *node)
+static int o2cb_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *node)
{
int node_num;
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 286edf1e231..13a8537d8e8 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -23,6 +23,7 @@
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/reboot.h>
+#include <linux/sched.h>
#include <asm/uaccess.h>
#include "stackglue.h"
@@ -102,6 +103,12 @@
#define OCFS2_TEXT_UUID_LEN 32
#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2
#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8
+#define VERSION_LOCK "version_lock"
+
+enum ocfs2_connection_type {
+ WITH_CONTROLD,
+ NO_CONTROLD
+};
/*
* ocfs2_live_connection is refcounted because the filesystem and
@@ -110,6 +117,13 @@
struct ocfs2_live_connection {
struct list_head oc_list;
struct ocfs2_cluster_connection *oc_conn;
+ enum ocfs2_connection_type oc_type;
+ atomic_t oc_this_node;
+ int oc_our_slot;
+ struct dlm_lksb oc_version_lksb;
+ char oc_lvb[DLM_LVB_LEN];
+ struct completion oc_sync_wait;
+ wait_queue_head_t oc_wait;
};
struct ocfs2_control_private {
@@ -198,20 +212,15 @@ static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
* mount path. Since the VFS prevents multiple calls to
* fill_super(), we can't get dupes here.
*/
-static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
- struct ocfs2_live_connection **c_ret)
+static int ocfs2_live_connection_attach(struct ocfs2_cluster_connection *conn,
+ struct ocfs2_live_connection *c)
{
int rc = 0;
- struct ocfs2_live_connection *c;
-
- c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
- if (!c)
- return -ENOMEM;
mutex_lock(&ocfs2_control_lock);
c->oc_conn = conn;
- if (atomic_read(&ocfs2_control_opened))
+ if ((c->oc_type == NO_CONTROLD) || atomic_read(&ocfs2_control_opened))
list_add(&c->oc_list, &ocfs2_live_connection_list);
else {
printk(KERN_ERR
@@ -220,12 +229,6 @@ static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
}
mutex_unlock(&ocfs2_control_lock);
-
- if (!rc)
- *c_ret = c;
- else
- kfree(c);
-
return rc;
}
@@ -799,18 +802,251 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
return 0;
}
+static void lvb_to_version(char *lvb, struct ocfs2_protocol_version *ver)
+{
+ struct ocfs2_protocol_version *pv =
+ (struct ocfs2_protocol_version *)lvb;
+ /*
+ * ocfs2_protocol_version has two u8 variables, so we don't
+ * need any endian conversion.
+ */
+ ver->pv_major = pv->pv_major;
+ ver->pv_minor = pv->pv_minor;
+}
+
+static void version_to_lvb(struct ocfs2_protocol_version *ver, char *lvb)
+{
+ struct ocfs2_protocol_version *pv =
+ (struct ocfs2_protocol_version *)lvb;
+ /*
+ * ocfs2_protocol_version has two u8 variables, so we don't
+ * need any endian conversion.
+ */
+ pv->pv_major = ver->pv_major;
+ pv->pv_minor = ver->pv_minor;
+}
+
+static void sync_wait_cb(void *arg)
+{
+ struct ocfs2_cluster_connection *conn = arg;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ complete(&lc->oc_sync_wait);
+}
+
+static int sync_unlock(struct ocfs2_cluster_connection *conn,
+ struct dlm_lksb *lksb, char *name)
+{
+ int error;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+
+ error = dlm_unlock(conn->cc_lockspace, lksb->sb_lkid, 0, lksb, conn);
+ if (error) {
+ printk(KERN_ERR "%s lkid %x error %d\n",
+ name, lksb->sb_lkid, error);
+ return error;
+ }
+
+ wait_for_completion(&lc->oc_sync_wait);
+
+ if (lksb->sb_status != -DLM_EUNLOCK) {
+ printk(KERN_ERR "%s lkid %x status %d\n",
+ name, lksb->sb_lkid, lksb->sb_status);
+ return -1;
+ }
+ return 0;
+}
+
+static int sync_lock(struct ocfs2_cluster_connection *conn,
+ int mode, uint32_t flags,
+ struct dlm_lksb *lksb, char *name)
+{
+ int error, status;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+
+ error = dlm_lock(conn->cc_lockspace, mode, lksb, flags,
+ name, strlen(name),
+ 0, sync_wait_cb, conn, NULL);
+ if (error) {
+ printk(KERN_ERR "%s lkid %x flags %x mode %d error %d\n",
+ name, lksb->sb_lkid, flags, mode, error);
+ return error;
+ }
+
+ wait_for_completion(&lc->oc_sync_wait);
+
+ status = lksb->sb_status;
+
+ if (status && status != -EAGAIN) {
+ printk(KERN_ERR "%s lkid %x flags %x mode %d status %d\n",
+ name, lksb->sb_lkid, flags, mode, status);
+ }
+
+ return status;
+}
+
+
+static int version_lock(struct ocfs2_cluster_connection *conn, int mode,
+ int flags)
+{
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ return sync_lock(conn, mode, flags,
+ &lc->oc_version_lksb, VERSION_LOCK);
+}
+
+static int version_unlock(struct ocfs2_cluster_connection *conn)
+{
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ return sync_unlock(conn, &lc->oc_version_lksb, VERSION_LOCK);
+}
+
+/* get_protocol_version()
+ *
+ * To exchange ocfs2 versioning, we use the LVB of the version dlm lock.
+ * The algorithm is:
+ * 1. Attempt to take the lock in EX mode (non-blocking).
+ * 2. If successful (which means it is the first mount), write the
+ * version number and downconvert to PR lock.
+ * 3. If unsuccessful (returns -EAGAIN), read the version from the LVB after
+ * taking the PR lock.
+ */
+
+static int get_protocol_version(struct ocfs2_cluster_connection *conn)
+{
+ int ret;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ struct ocfs2_protocol_version pv;
+
+ running_proto.pv_major =
+ ocfs2_user_plugin.sp_max_proto.pv_major;
+ running_proto.pv_minor =
+ ocfs2_user_plugin.sp_max_proto.pv_minor;
+
+ lc->oc_version_lksb.sb_lvbptr = lc->oc_lvb;
+ ret = version_lock(conn, DLM_LOCK_EX,
+ DLM_LKF_VALBLK|DLM_LKF_NOQUEUE);
+ if (!ret) {
+ conn->cc_version.pv_major = running_proto.pv_major;
+ conn->cc_version.pv_minor = running_proto.pv_minor;
+ version_to_lvb(&running_proto, lc->oc_lvb);
+ version_lock(conn, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_VALBLK);
+ } else if (ret == -EAGAIN) {
+ ret = version_lock(conn, DLM_LOCK_PR, DLM_LKF_VALBLK);
+ if (ret)
+ goto out;
+ lvb_to_version(lc->oc_lvb, &pv);
+
+ if ((pv.pv_major != running_proto.pv_major) ||
+ (pv.pv_minor > running_proto.pv_minor)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ conn->cc_version.pv_major = pv.pv_major;
+ conn->cc_version.pv_minor = pv.pv_minor;
+ }
+out:
+ return ret;
+}
+
+static void user_recover_prep(void *arg)
+{
+}
+
+static void user_recover_slot(void *arg, struct dlm_slot *slot)
+{
+ struct ocfs2_cluster_connection *conn = arg;
+ printk(KERN_INFO "ocfs2: Node %d/%d down. Initiating recovery.\n",
+ slot->nodeid, slot->slot);
+ conn->cc_recovery_handler(slot->nodeid, conn->cc_recovery_data);
+
+}
+
+static void user_recover_done(void *arg, struct dlm_slot *slots,
+ int num_slots, int our_slot,
+ uint32_t generation)
+{
+ struct ocfs2_cluster_connection *conn = arg;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+ int i;
+
+ for (i = 0; i < num_slots; i++)
+ if (slots[i].slot == our_slot) {
+ atomic_set(&lc->oc_this_node, slots[i].nodeid);
+ break;
+ }
+
+ lc->oc_our_slot = our_slot;
+ wake_up(&lc->oc_wait);
+}
+
+static const struct dlm_lockspace_ops ocfs2_ls_ops = {
+ .recover_prep = user_recover_prep,
+ .recover_slot = user_recover_slot,
+ .recover_done = user_recover_done,
+};
+
+static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
+{
+ version_unlock(conn);
+ dlm_release_lockspace(conn->cc_lockspace, 2);
+ conn->cc_lockspace = NULL;
+ ocfs2_live_connection_drop(conn->cc_private);
+ conn->cc_private = NULL;
+ return 0;
+}
+
static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
{
dlm_lockspace_t *fsdlm;
- struct ocfs2_live_connection *uninitialized_var(control);
- int rc = 0;
+ struct ocfs2_live_connection *lc;
+ int rc, ops_rv;
BUG_ON(conn == NULL);
- rc = ocfs2_live_connection_new(conn, &control);
+ lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
+ if (!lc) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ init_waitqueue_head(&lc->oc_wait);
+ init_completion(&lc->oc_sync_wait);
+ atomic_set(&lc->oc_this_node, 0);
+ conn->cc_private = lc;
+ lc->oc_type = NO_CONTROLD;
+
+ rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name,
+ DLM_LSFL_FS, DLM_LVB_LEN,
+ &ocfs2_ls_ops, conn, &ops_rv, &fsdlm);
+ if (rc)
+ goto out;
+
+ if (ops_rv == -EOPNOTSUPP) {
+ lc->oc_type = WITH_CONTROLD;
+ printk(KERN_NOTICE "ocfs2: You seem to be using an older "
+ "version of dlm_controld and/or ocfs2-tools."
+ " Please consider upgrading.\n");
+ } else if (ops_rv) {
+ rc = ops_rv;
+ goto out;
+ }
+ conn->cc_lockspace = fsdlm;
+
+ rc = ocfs2_live_connection_attach(conn, lc);
if (rc)
goto out;
+ if (lc->oc_type == NO_CONTROLD) {
+ rc = get_protocol_version(conn);
+ if (rc) {
+ printk(KERN_ERR "ocfs2: Could not determine"
+ " locking version\n");
+ user_cluster_disconnect(conn);
+ goto out;
+ }
+ wait_event(lc->oc_wait, (atomic_read(&lc->oc_this_node) > 0));
+ }
+
/*
* running_proto must have been set before we allowed any mounts
* to proceed.
@@ -818,42 +1054,34 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
printk(KERN_ERR
"Unable to mount with fs locking protocol version "
- "%u.%u because the userspace control daemon has "
- "negotiated %u.%u\n",
+ "%u.%u because negotiated protocol is %u.%u\n",
conn->cc_version.pv_major, conn->cc_version.pv_minor,
running_proto.pv_major, running_proto.pv_minor);
rc = -EPROTO;
- ocfs2_live_connection_drop(control);
- goto out;
- }
-
- rc = dlm_new_lockspace(conn->cc_name, NULL, DLM_LSFL_FS, DLM_LVB_LEN,
- NULL, NULL, NULL, &fsdlm);
- if (rc) {
- ocfs2_live_connection_drop(control);
- goto out;
+ ocfs2_live_connection_drop(lc);
+ lc = NULL;
}
- conn->cc_private = control;
- conn->cc_lockspace = fsdlm;
out:
+ if (rc && lc)
+ kfree(lc);
return rc;
}
-static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
-{
- dlm_release_lockspace(conn->cc_lockspace, 2);
- conn->cc_lockspace = NULL;
- ocfs2_live_connection_drop(conn->cc_private);
- conn->cc_private = NULL;
- return 0;
-}
-static int user_cluster_this_node(unsigned int *this_node)
+static int user_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *this_node)
{
int rc;
+ struct ocfs2_live_connection *lc = conn->cc_private;
+
+ if (lc->oc_type == WITH_CONTROLD)
+ rc = ocfs2_control_get_this_node();
+ else if (lc->oc_type == NO_CONTROLD)
+ rc = atomic_read(&lc->oc_this_node);
+ else
+ rc = -EINVAL;
- rc = ocfs2_control_get_this_node();
if (rc < 0)
return rc;
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 39abf89697e..5d965e83bd4 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -309,6 +309,8 @@ int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino,
EXPORT_SYMBOL_GPL(ocfs2_plock);
int ocfs2_cluster_connect(const char *stack_name,
+ const char *cluster_name,
+ int cluster_name_len,
const char *group,
int grouplen,
struct ocfs2_locking_protocol *lproto,
@@ -342,8 +344,12 @@ int ocfs2_cluster_connect(const char *stack_name,
goto out;
}
- memcpy(new_conn->cc_name, group, grouplen);
+ strlcpy(new_conn->cc_name, group, GROUP_NAME_MAX + 1);
new_conn->cc_namelen = grouplen;
+ if (cluster_name_len)
+ strlcpy(new_conn->cc_cluster_name, cluster_name,
+ CLUSTER_NAME_MAX + 1);
+ new_conn->cc_cluster_name_len = cluster_name_len;
new_conn->cc_recovery_handler = recovery_handler;
new_conn->cc_recovery_data = recovery_data;
@@ -386,8 +392,9 @@ int ocfs2_cluster_connect_agnostic(const char *group,
if (cluster_stack_name[0])
stack_name = cluster_stack_name;
- return ocfs2_cluster_connect(stack_name, group, grouplen, lproto,
- recovery_handler, recovery_data, conn);
+ return ocfs2_cluster_connect(stack_name, NULL, 0, group, grouplen,
+ lproto, recovery_handler, recovery_data,
+ conn);
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_connect_agnostic);
@@ -460,9 +467,10 @@ void ocfs2_cluster_hangup(const char *group, int grouplen)
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup);
-int ocfs2_cluster_this_node(unsigned int *node)
+int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *node)
{
- return active_stack->sp_ops->this_node(node);
+ return active_stack->sp_ops->this_node(conn, node);
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
@@ -488,7 +496,7 @@ static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
}
static struct kobj_attribute ocfs2_attr_max_locking_protocol =
- __ATTR(max_locking_protocol, S_IFREG | S_IRUGO,
+ __ATTR(max_locking_protocol, S_IRUGO,
ocfs2_max_locking_protocol_show, NULL);
static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
@@ -520,7 +528,7 @@ static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
}
static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins =
- __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO,
+ __ATTR(loaded_cluster_plugins, S_IRUGO,
ocfs2_loaded_cluster_plugins_show, NULL);
static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
@@ -542,7 +550,7 @@ static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
}
static struct kobj_attribute ocfs2_attr_active_cluster_plugin =
- __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO,
+ __ATTR(active_cluster_plugin, S_IRUGO,
ocfs2_active_cluster_plugin_show, NULL);
static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj,
@@ -591,15 +599,29 @@ static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj,
static struct kobj_attribute ocfs2_attr_cluster_stack =
- __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR,
+ __ATTR(cluster_stack, S_IRUGO | S_IWUSR,
ocfs2_cluster_stack_show,
ocfs2_cluster_stack_store);
+
+
+static ssize_t ocfs2_dlm_recover_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "1\n");
+}
+
+static struct kobj_attribute ocfs2_attr_dlm_recover_support =
+ __ATTR(dlm_recover_callback_support, S_IRUGO,
+ ocfs2_dlm_recover_show, NULL);
+
static struct attribute *ocfs2_attrs[] = {
&ocfs2_attr_max_locking_protocol.attr,
&ocfs2_attr_loaded_cluster_plugins.attr,
&ocfs2_attr_active_cluster_plugin.attr,
&ocfs2_attr_cluster_stack.attr,
+ &ocfs2_attr_dlm_recover_support.attr,
NULL,
};
@@ -643,7 +665,7 @@ error:
#define FS_OCFS2_NM 1
-static ctl_table ocfs2_nm_table[] = {
+static struct ctl_table ocfs2_nm_table[] = {
{
.procname = "hb_ctl_path",
.data = ocfs2_hb_ctl_path,
@@ -654,7 +676,7 @@ static ctl_table ocfs2_nm_table[] = {
{ }
};
-static ctl_table ocfs2_mod_table[] = {
+static struct ctl_table ocfs2_mod_table[] = {
{
.procname = "nm",
.data = NULL,
@@ -665,7 +687,7 @@ static ctl_table ocfs2_mod_table[] = {
{ }
};
-static ctl_table ocfs2_kern_table[] = {
+static struct ctl_table ocfs2_kern_table[] = {
{
.procname = "ocfs2",
.data = NULL,
@@ -676,7 +698,7 @@ static ctl_table ocfs2_kern_table[] = {
{ }
};
-static ctl_table ocfs2_root_table[] = {
+static struct ctl_table ocfs2_root_table[] = {
{
.procname = "fs",
.data = NULL,
@@ -687,7 +709,7 @@ static ctl_table ocfs2_root_table[] = {
{ }
};
-static struct ctl_table_header *ocfs2_table_header = NULL;
+static struct ctl_table_header *ocfs2_table_header;
/*
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 1ec56fdb8d0..66334a30cea 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -45,6 +45,9 @@ struct file_lock;
*/
#define GROUP_NAME_MAX 64
+/* This shadows OCFS2_CLUSTER_NAME_LEN */
+#define CLUSTER_NAME_MAX 16
+
/*
* ocfs2_protocol_version changes when ocfs2 does something different in
@@ -97,8 +100,10 @@ struct ocfs2_locking_protocol {
* locking compatibility.
*/
struct ocfs2_cluster_connection {
- char cc_name[GROUP_NAME_MAX];
+ char cc_name[GROUP_NAME_MAX + 1];
int cc_namelen;
+ char cc_cluster_name[CLUSTER_NAME_MAX + 1];
+ int cc_cluster_name_len;
struct ocfs2_protocol_version cc_version;
struct ocfs2_locking_protocol *cc_proto;
void (*cc_recovery_handler)(int node_num, void *recovery_data);
@@ -152,7 +157,8 @@ struct ocfs2_stack_operations {
* ->this_node() returns the cluster's unique identifier for the
* local node.
*/
- int (*this_node)(unsigned int *node);
+ int (*this_node)(struct ocfs2_cluster_connection *conn,
+ unsigned int *node);
/*
* Call the underlying dlm lock function. The ->dlm_lock()
@@ -239,6 +245,8 @@ struct ocfs2_stack_plugin {
/* Used by the filesystem */
int ocfs2_cluster_connect(const char *stack_name,
+ const char *cluster_name,
+ int cluster_name_len,
const char *group,
int grouplen,
struct ocfs2_locking_protocol *lproto,
@@ -260,7 +268,8 @@ int ocfs2_cluster_connect_agnostic(const char *group,
int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
int hangup_pending);
void ocfs2_cluster_hangup(const char *group, int grouplen);
-int ocfs2_cluster_this_node(unsigned int *node);
+int ocfs2_cluster_this_node(struct ocfs2_cluster_connection *conn,
+ unsigned int *node);
struct ocfs2_lock_res;
int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 5397c07ce60..0cb889a17ae 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -113,12 +113,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
struct ocfs2_suballoc_result *res);
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
int nr);
-static inline int ocfs2_block_group_set_bits(handle_t *handle,
- struct inode *alloc_inode,
- struct ocfs2_group_desc *bg,
- struct buffer_head *group_bh,
- unsigned int bit_off,
- unsigned int num_bits);
static int ocfs2_relink_block_group(handle_t *handle,
struct inode *alloc_inode,
struct buffer_head *fe_bh,
@@ -481,7 +475,7 @@ ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
bg_bh = sb_getblk(osb->sb, bg_blkno);
if (!bg_bh) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto bail;
}
@@ -661,7 +655,7 @@ ocfs2_block_group_alloc_discontig(handle_t *handle,
bg_bh = sb_getblk(osb->sb, bg_blkno);
if (!bg_bh) {
- status = -EIO;
+ status = -ENOMEM;
mlog_errno(status);
goto bail;
}
@@ -777,6 +771,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
+ ocfs2_update_inode_fsync_trans(handle, alloc_inode, 0);
status = 0;
@@ -1343,7 +1338,7 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
return status;
}
-static inline int ocfs2_block_group_set_bits(handle_t *handle,
+int ocfs2_block_group_set_bits(handle_t *handle,
struct inode *alloc_inode,
struct ocfs2_group_desc *bg,
struct buffer_head *group_bh,
@@ -1388,8 +1383,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
ocfs2_journal_dirty(handle, group_bh);
bail:
- if (status)
- mlog_errno(status);
return status;
}
@@ -1588,7 +1581,7 @@ static int ocfs2_block_group_search(struct inode *inode,
return ret;
}
-static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
+int ocfs2_alloc_dinode_update_counts(struct inode *inode,
handle_t *handle,
struct buffer_head *di_bh,
u32 num_bits,
@@ -1615,6 +1608,21 @@ out:
return ret;
}
+void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
+ struct buffer_head *di_bh,
+ u32 num_bits,
+ u16 chain)
+{
+ u32 tmp_used;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
+ struct ocfs2_chain_list *cl;
+
+ cl = (struct ocfs2_chain_list *)&di->id2.i_chain;
+ tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
+ di->id1.bitmap1.i_used = cpu_to_le32(tmp_used - num_bits);
+ le32_add_cpu(&cl->cl_recs[chain].c_free, num_bits);
+}
+
static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
struct ocfs2_extent_rec *rec,
struct ocfs2_chain_list *cl)
@@ -1715,8 +1723,12 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
res->sr_bit_offset, res->sr_bits);
- if (ret < 0)
+ if (ret < 0) {
+ ocfs2_rollback_alloc_dinode_counts(alloc_inode, ac->ac_bh,
+ res->sr_bits,
+ le16_to_cpu(gd->bg_chain));
mlog_errno(ret);
+ }
out_loc_only:
*bits_left = le16_to_cpu(gd->bg_free_bits_count);
@@ -1846,6 +1858,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
res->sr_bit_offset,
res->sr_bits);
if (status < 0) {
+ ocfs2_rollback_alloc_dinode_counts(alloc_inode,
+ ac->ac_bh, res->sr_bits, chain);
mlog_errno(status);
goto bail;
}
@@ -2099,7 +2113,7 @@ int ocfs2_find_new_inode_loc(struct inode *dir,
ac->ac_find_loc_priv = res;
*fe_blkno = res->sr_blkno;
-
+ ocfs2_update_inode_fsync_trans(handle, dir, 0);
out:
if (handle)
ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle);
@@ -2157,6 +2171,8 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle,
res->sr_bit_offset,
res->sr_bits);
if (ret < 0) {
+ ocfs2_rollback_alloc_dinode_counts(ac->ac_inode,
+ ac->ac_bh, res->sr_bits, chain);
mlog_errno(ret);
goto out;
}
@@ -2878,6 +2894,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
if (status < 0) {
mutex_unlock(&inode_alloc_inode->i_mutex);
+ iput(inode_alloc_inode);
mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
(u32)suballoc_slot, status);
goto bail;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index a36d0aa5091..2d2501767c0 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -86,6 +86,22 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb,
u32 bits_wanted,
struct ocfs2_alloc_context **ac);
+int ocfs2_alloc_dinode_update_counts(struct inode *inode,
+ handle_t *handle,
+ struct buffer_head *di_bh,
+ u32 num_bits,
+ u16 chain);
+void ocfs2_rollback_alloc_dinode_counts(struct inode *inode,
+ struct buffer_head *di_bh,
+ u32 num_bits,
+ u16 chain);
+int ocfs2_block_group_set_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct ocfs2_group_desc *bg,
+ struct buffer_head *group_bh,
+ unsigned int bit_off,
+ unsigned int num_bits);
+
int ocfs2_claim_metadata(handle_t *handle,
struct ocfs2_alloc_context *ac,
u32 bits_wanted,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index d4e81e4a9b0..ddb662b3244 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -68,7 +68,6 @@
#include "super.h"
#include "sysfile.h"
#include "uptodate.h"
-#include "ver.h"
#include "xattr.h"
#include "quota.h"
#include "refcounttree.h"
@@ -76,7 +75,7 @@
#include "buffer_head_io.h"
-static struct kmem_cache *ocfs2_inode_cachep = NULL;
+static struct kmem_cache *ocfs2_inode_cachep;
struct kmem_cache *ocfs2_dquot_cachep;
struct kmem_cache *ocfs2_qf_chunk_cachep;
@@ -86,10 +85,11 @@ struct kmem_cache *ocfs2_qf_chunk_cachep;
* workqueue and schedule on our own. */
struct workqueue_struct *ocfs2_wq = NULL;
-static struct dentry *ocfs2_debugfs_root = NULL;
+static struct dentry *ocfs2_debugfs_root;
MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("OCFS2 cluster file system");
struct mount_options
{
@@ -561,6 +561,9 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb)
if (!oi)
return NULL;
+ oi->i_sync_tid = 0;
+ oi->i_datasync_tid = 0;
+
jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode);
return &oi->vfs_inode;
}
@@ -631,6 +634,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
struct ocfs2_super *osb = OCFS2_SB(sb);
u32 tmp;
+ sync_filesystem(sb);
+
if (!ocfs2_parse_options(sb, data, &parsed_options, 1) ||
!ocfs2_check_set_options(sb, &parsed_options)) {
ret = -EINVAL;
@@ -1238,30 +1243,11 @@ static struct dentry *ocfs2_mount(struct file_system_type *fs_type,
return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super);
}
-static void ocfs2_kill_sb(struct super_block *sb)
-{
- struct ocfs2_super *osb = OCFS2_SB(sb);
-
- /* Failed mount? */
- if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED)
- goto out;
-
- /* Prevent further queueing of inode drop events */
- spin_lock(&dentry_list_lock);
- ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED);
- spin_unlock(&dentry_list_lock);
- /* Wait for work to finish and/or remove it */
- cancel_work_sync(&osb->dentry_lock_work);
-out:
- kill_block_super(sb);
-}
-
static struct file_system_type ocfs2_fs_type = {
.owner = THIS_MODULE,
.name = "ocfs2",
.mount = ocfs2_mount,
- .kill_sb = ocfs2_kill_sb,
-
+ .kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE,
.next = NULL
};
@@ -1612,16 +1598,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
return 0;
}
-wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
-
static int __init ocfs2_init(void)
{
- int status, i;
-
- ocfs2_print_version();
-
- for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
- init_waitqueue_head(&ocfs2__ioend_wq[i]);
+ int status;
status = init_ocfs2_uptodate_cache();
if (status < 0)
@@ -1763,7 +1742,7 @@ static void ocfs2_inode_init_once(void *data)
ocfs2_extent_map_init(&oi->vfs_inode);
INIT_LIST_HEAD(&oi->ip_io_markers);
oi->ip_dir_start_lookup = 0;
- atomic_set(&oi->ip_unaligned_aio, 0);
+ mutex_init(&oi->ip_unaligned_aio);
init_rwsem(&oi->ip_alloc_sem);
init_rwsem(&oi->ip_xattr_sem);
mutex_init(&oi->ip_io_mutex);
@@ -1848,8 +1827,8 @@ static int ocfs2_get_sector(struct super_block *sb,
*bh = sb_getblk(sb, block);
if (!*bh) {
- mlog_errno(-EIO);
- return -EIO;
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
}
lock_buffer(*bh);
if (!buffer_dirty(*bh))
@@ -1934,17 +1913,16 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
debugfs_remove(osb->osb_ctxt);
- /*
- * Flush inode dropping work queue so that deletes are
- * performed while the filesystem is still working
- */
- ocfs2_drop_all_dl_inodes(osb);
-
/* Orphan scan should be stopped as early as possible */
ocfs2_orphan_scan_stop(osb);
ocfs2_disable_quotas(osb);
+ /* All dquots should be freed by now */
+ WARN_ON(!llist_empty(&osb->dquot_drop_list));
+ /* Wait for worker to be done with the work structure in osb */
+ cancel_work_sync(&osb->dquot_drop_work);
+
ocfs2_shutdown_local_alloc(osb);
ocfs2_truncate_log_shutdown(osb);
@@ -2075,7 +2053,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
struct inode *inode = NULL;
struct ocfs2_journal *journal;
- __le32 uuid_net_key;
struct ocfs2_super *osb;
u64 total_blocks;
@@ -2121,6 +2098,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
spin_lock_init(&osb->osb_xattr_lock);
ocfs2_init_steal_slots(osb);
+ mutex_init(&osb->system_file_mutex);
+
atomic_set(&osb->alloc_stats.moves, 0);
atomic_set(&osb->alloc_stats.local_data, 0);
atomic_set(&osb->alloc_stats.bitmap_data, 0);
@@ -2225,10 +2204,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
if (ocfs2_clusterinfo_valid(osb)) {
osb->osb_stackflags =
OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags;
- memcpy(osb->osb_cluster_stack,
+ strlcpy(osb->osb_cluster_stack,
OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
- OCFS2_STACK_LABEL_LEN);
- osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+ OCFS2_STACK_LABEL_LEN + 1);
if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
mlog(ML_ERROR,
"couldn't mount because of an invalid "
@@ -2237,6 +2215,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
status = -EINVAL;
goto bail;
}
+ strlcpy(osb->osb_cluster_name,
+ OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster,
+ OCFS2_CLUSTER_NAME_LEN + 1);
} else {
/* The empty string is identical with classic tools that
* don't know about s_cluster_info. */
@@ -2272,8 +2253,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
journal->j_state = OCFS2_JOURNAL_FREE;
- INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes);
- osb->dentry_lock_list = NULL;
+ INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs);
+ init_llist_head(&osb->dquot_drop_list);
/* get some pseudo constants for clustersize bits */
osb->s_clustersize_bits =
@@ -2307,10 +2288,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
goto bail;
}
- memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));
-
- strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
- osb->vol_label[63] = '\0';
+ strlcpy(osb->vol_label, di->id2.i_super.s_label,
+ OCFS2_MAX_VOL_LABEL_LEN);
osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno);
osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno);
osb->first_cluster_group_blkno =
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index f053688d22a..af155c18312 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -113,9 +113,11 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
} else
arr = get_local_system_inode(osb, type, slot);
+ mutex_lock(&osb->system_file_mutex);
if (arr && ((inode = *arr) != NULL)) {
/* get a ref in addition to the array ref */
inode = igrab(inode);
+ mutex_unlock(&osb->system_file_mutex);
BUG_ON(!inode);
return inode;
@@ -129,6 +131,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
*arr = igrab(inode);
BUG_ON(!*arr);
}
+ mutex_unlock(&osb->system_file_mutex);
return inode;
}
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 52eaf33d346..82e17b076ce 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -67,7 +67,7 @@ struct ocfs2_meta_cache_item {
sector_t c_block;
};
-static struct kmem_cache *ocfs2_uptodate_cachep = NULL;
+static struct kmem_cache *ocfs2_uptodate_cachep;
u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci)
{
diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c
deleted file mode 100644
index e2488f4128a..00000000000
--- a/fs/ocfs2/ver.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.c
- *
- * version string
- *
- * Copyright (C) 2002, 2005 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/kernel.h>
-
-#include "ver.h"
-
-#define OCFS2_BUILD_VERSION "1.5.0"
-
-#define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION
-
-void ocfs2_print_version(void)
-{
- printk(KERN_INFO "%s\n", VERSION_STR);
-}
-
-MODULE_DESCRIPTION(VERSION_STR);
-
-MODULE_VERSION(OCFS2_BUILD_VERSION);
diff --git a/fs/ocfs2/ver.h b/fs/ocfs2/ver.h
deleted file mode 100644
index d7395cb91d2..00000000000
--- a/fs/ocfs2/ver.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ver.h
- *
- * Function prototypes
- *
- * Copyright (C) 2002, 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef OCFS2_VER_H
-#define OCFS2_VER_H
-
-void ocfs2_print_version(void);
-
-#endif /* OCFS2_VER_H */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 6ce0686eab7..016f01df382 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -99,8 +99,8 @@ static struct ocfs2_xattr_def_value_root def_xv = {
const struct xattr_handler *ocfs2_xattr_handlers[] = {
&ocfs2_xattr_user_handler,
- &ocfs2_xattr_acl_access_handler,
- &ocfs2_xattr_acl_default_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
&ocfs2_xattr_trusted_handler,
&ocfs2_xattr_security_handler,
NULL
@@ -109,9 +109,9 @@ const struct xattr_handler *ocfs2_xattr_handlers[] = {
static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
[OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
[OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
- = &ocfs2_xattr_acl_access_handler,
+ = &posix_acl_access_xattr_handler,
[OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
- = &ocfs2_xattr_acl_default_handler,
+ = &posix_acl_default_xattr_handler,
[OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
[OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler,
};
@@ -369,7 +369,7 @@ static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket)
* them fully.
*/
static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
- u64 xb_blkno)
+ u64 xb_blkno, int new)
{
int i, rc = 0;
@@ -377,15 +377,22 @@ static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb,
xb_blkno + i);
if (!bucket->bu_bhs[i]) {
- rc = -EIO;
+ rc = -ENOMEM;
mlog_errno(rc);
break;
}
if (!ocfs2_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
- bucket->bu_bhs[i]))
- ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
- bucket->bu_bhs[i]);
+ bucket->bu_bhs[i])) {
+ if (new)
+ ocfs2_set_new_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
+ bucket->bu_bhs[i]);
+ else {
+ set_buffer_uptodate(bucket->bu_bhs[i]);
+ ocfs2_set_buffer_uptodate(INODE_CACHE(bucket->bu_inode),
+ bucket->bu_bhs[i]);
+ }
+ }
}
if (rc)
@@ -754,8 +761,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
BUG_ON(why == RESTART_META);
credits = ocfs2_calc_extend_credits(inode->i_sb,
- &vb->vb_xv->xr_list,
- clusters_to_add);
+ &vb->vb_xv->xr_list);
status = ocfs2_extend_trans(handle, credits);
if (status < 0) {
status = -ENOMEM;
@@ -2603,6 +2609,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL);
di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
spin_unlock(&oi->ip_lock);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
ocfs2_journal_dirty(handle, di_bh);
out_commit:
@@ -2865,6 +2872,12 @@ static int ocfs2_create_xattr_block(struct inode *inode,
}
new_bh = sb_getblk(inode->i_sb, first_blkno);
+ if (!new_bh) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto end;
+ }
+
ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode), new_bh);
ret = ocfs2_journal_access_xb(ctxt->handle, INODE_CACHE(inode),
@@ -3040,8 +3053,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
clusters_add += new_clusters;
credits += ocfs2_calc_extend_credits(inode->i_sb,
- &def_xv.xv.xr_list,
- new_clusters);
+ &def_xv.xv.xr_list);
}
goto meta_guess;
@@ -3106,8 +3118,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
if (!ocfs2_xattr_is_local(xe))
credits += ocfs2_calc_extend_credits(
inode->i_sb,
- &def_xv.xv.xr_list,
- new_clusters);
+ &def_xv.xv.xr_list);
goto out;
}
}
@@ -3132,9 +3143,7 @@ static int ocfs2_calc_xattr_set_need(struct inode *inode,
meta_add += ocfs2_extend_meta_needed(&xv->xr_list);
clusters_add += new_clusters - old_clusters;
credits += ocfs2_calc_extend_credits(inode->i_sb,
- &xv->xr_list,
- new_clusters -
- old_clusters);
+ &xv->xr_list);
if (value_size >= OCFS2_XATTR_ROOT_SIZE)
goto out;
}
@@ -3180,7 +3189,7 @@ meta_guess:
&xb->xb_attrs.xb_root.xt_list;
meta_add += ocfs2_extend_meta_needed(el);
credits += ocfs2_calc_extend_credits(inode->i_sb,
- el, 1);
+ el);
} else
credits += OCFS2_SUBALLOC_ALLOC + 1;
@@ -3199,8 +3208,15 @@ meta_guess:
clusters_add += 1;
}
} else {
- meta_add += 1;
credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+ if (xi->xi_value_len > OCFS2_XATTR_INLINE_SIZE) {
+ struct ocfs2_extent_list *el = &def_xv.xv.xr_list;
+ meta_add += ocfs2_extend_meta_needed(el);
+ credits += ocfs2_calc_extend_credits(inode->i_sb,
+ el);
+ } else {
+ meta_add += 1;
+ }
}
out:
if (clusters_need)
@@ -3613,6 +3629,7 @@ int ocfs2_xattr_set(struct inode *inode,
}
ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
+ ocfs2_update_inode_fsync_trans(ctxt.handle, inode, 0);
ocfs2_commit_trans(osb, ctxt.handle);
@@ -4293,7 +4310,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
trace_ocfs2_xattr_create_index_block((unsigned long long)blkno);
- ret = ocfs2_init_xattr_bucket(xs->bucket, blkno);
+ ret = ocfs2_init_xattr_bucket(xs->bucket, blkno, 1);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4637,7 +4654,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
* Even if !new_bucket_head, we're overwriting t_bucket. Thus,
* there's no need to read it.
*/
- ret = ocfs2_init_xattr_bucket(t_bucket, new_blk);
+ ret = ocfs2_init_xattr_bucket(t_bucket, new_blk, new_bucket_head);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4803,7 +4820,7 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
* Even if !t_is_new, we're overwriting t_bucket. Thus,
* there's no need to read it.
*/
- ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno);
+ ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno, t_is_new);
if (ret)
goto out;
@@ -5475,6 +5492,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
ret = ocfs2_truncate_log_append(osb, handle, blkno, len);
if (ret)
mlog_errno(ret);
+ ocfs2_update_inode_fsync_trans(handle, inode, 0);
out_commit:
ocfs2_commit_trans(osb, handle);
@@ -6216,8 +6234,7 @@ static int ocfs2_value_metas_in_xattr_header(struct super_block *sb,
le16_to_cpu(xv->xr_list.l_next_free_rec);
*credits += ocfs2_calc_extend_credits(sb,
- &def_xv.xv.xr_list,
- le32_to_cpu(xv->xr_clusters));
+ &def_xv.xv.xr_list);
/*
* If the value is a tree with depth > 1, We don't go deep
@@ -6782,7 +6799,7 @@ static int ocfs2_lock_reflink_xattr_rec_allocators(
metas.num_metas += ocfs2_extend_meta_needed(xt_et->et_root_el);
*credits += ocfs2_calc_extend_credits(osb->sb,
- xt_et->et_root_el, len);
+ xt_et->et_root_el);
if (metas.num_metas) {
ret = ocfs2_reserve_new_metadata_blocks(osb, metas.num_metas,
@@ -6830,7 +6847,7 @@ static int ocfs2_reflink_xattr_bucket(handle_t *handle,
break;
}
- ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno);
+ ret = ocfs2_init_xattr_bucket(args->new_bucket, new_blkno, 1);
if (ret) {
mlog_errno(ret);
break;
@@ -7190,10 +7207,12 @@ out:
*/
int ocfs2_init_security_and_acl(struct inode *dir,
struct inode *inode,
- const struct qstr *qstr)
+ const struct qstr *qstr,
+ struct posix_acl *default_acl,
+ struct posix_acl *acl)
{
- int ret = 0;
struct buffer_head *dir_bh = NULL;
+ int ret = 0;
ret = ocfs2_init_security_get(inode, dir, qstr, NULL);
if (ret) {
@@ -7207,9 +7226,10 @@ int ocfs2_init_security_and_acl(struct inode *dir,
goto leave;
}
- ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL);
- if (ret)
- mlog_errno(ret);
+ if (!ret && default_acl)
+ ret = ocfs2_iop_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ if (!ret && acl)
+ ret = ocfs2_iop_set_acl(inode, acl, ACL_TYPE_ACCESS);
ocfs2_inode_unlock(dir, 0);
brelse(dir_bh);
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 19f134e896a..f10d5b93c36 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -40,8 +40,6 @@ struct ocfs2_security_xattr_info {
extern const struct xattr_handler ocfs2_xattr_user_handler;
extern const struct xattr_handler ocfs2_xattr_trusted_handler;
extern const struct xattr_handler ocfs2_xattr_security_handler;
-extern const struct xattr_handler ocfs2_xattr_acl_access_handler;
-extern const struct xattr_handler ocfs2_xattr_acl_default_handler;
extern const struct xattr_handler *ocfs2_xattr_handlers[];
ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
@@ -96,5 +94,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
bool preserve_security);
int ocfs2_init_security_and_acl(struct inode *dir,
struct inode *inode,
- const struct qstr *qstr);
+ const struct qstr *qstr,
+ struct posix_acl *default_acl,
+ struct posix_acl *acl);
#endif /* OCFS2_XATTR_H */
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 54d57d6ba68..902e88527fc 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -337,10 +337,10 @@ static sector_t omfs_bmap(struct address_space *mapping, sector_t block)
const struct file_operations omfs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.fsync = generic_file_fsync,
.splice_read = generic_file_splice_read,
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index d8b0afde217..ec58c765918 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -183,7 +183,7 @@ int omfs_sync_inode(struct inode *inode)
*/
static void omfs_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (inode->i_nlink)
diff --git a/fs/open.c b/fs/open.c
index d420331ca32..d6fd3acde13 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -57,7 +57,8 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
newattrs.ia_valid |= ret | ATTR_FORCE;
mutex_lock(&dentry->d_inode->i_mutex);
- ret = notify_change(dentry, &newattrs);
+ /* Note any delegations or leases have already been broken: */
+ ret = notify_change(dentry, &newattrs, NULL);
mutex_unlock(&dentry->d_inode->i_mutex);
return ret;
}
@@ -230,7 +231,13 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
return -EINVAL;
/* Return error if mode is not supported */
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
+ return -EOPNOTSUPP;
+
+ /* Punch hole and zero range are mutually exclusive */
+ if ((mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) ==
+ (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE))
return -EOPNOTSUPP;
/* Punch hole must have keep size set */
@@ -238,17 +245,30 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
!(mode & FALLOC_FL_KEEP_SIZE))
return -EOPNOTSUPP;
+ /* Collapse range should only be used exclusively. */
+ if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
+ (mode & ~FALLOC_FL_COLLAPSE_RANGE))
+ return -EINVAL;
+
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
- /* It's not possible punch hole on append only file */
- if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode))
+ /*
+ * We can only allow pure fallocate on append only files
+ */
+ if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode))
return -EPERM;
if (IS_IMMUTABLE(inode))
return -EPERM;
/*
+ * We cannot allow any fallocate operation on an active swapfile
+ */
+ if (IS_SWAPFILE(inode))
+ return -ETXTBSY;
+
+ /*
* Revalidate the write permissions, in case security policy has
* changed since the files were opened.
*/
@@ -464,21 +484,28 @@ out:
static int chmod_common(struct path *path, umode_t mode)
{
struct inode *inode = path->dentry->d_inode;
+ struct inode *delegated_inode = NULL;
struct iattr newattrs;
int error;
error = mnt_want_write(path->mnt);
if (error)
return error;
+retry_deleg:
mutex_lock(&inode->i_mutex);
error = security_path_chmod(path, mode);
if (error)
goto out_unlock;
newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
- error = notify_change(path->dentry, &newattrs);
+ error = notify_change(path->dentry, &newattrs, &delegated_inode);
out_unlock:
mutex_unlock(&inode->i_mutex);
+ if (delegated_inode) {
+ error = break_deleg_wait(&delegated_inode);
+ if (!error)
+ goto retry_deleg;
+ }
mnt_drop_write(path->mnt);
return error;
}
@@ -522,6 +549,7 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
static int chown_common(struct path *path, uid_t user, gid_t group)
{
struct inode *inode = path->dentry->d_inode;
+ struct inode *delegated_inode = NULL;
int error;
struct iattr newattrs;
kuid_t uid;
@@ -546,12 +574,17 @@ static int chown_common(struct path *path, uid_t user, gid_t group)
if (!S_ISDIR(inode->i_mode))
newattrs.ia_valid |=
ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
+retry_deleg:
mutex_lock(&inode->i_mutex);
error = security_path_chown(path, uid, gid);
if (!error)
- error = notify_change(path->dentry, &newattrs);
+ error = notify_change(path->dentry, &newattrs, &delegated_inode);
mutex_unlock(&inode->i_mutex);
-
+ if (delegated_inode) {
+ error = break_deleg_wait(&delegated_inode);
+ if (!error)
+ goto retry_deleg;
+ }
return error;
}
@@ -618,35 +651,6 @@ out:
return error;
}
-/*
- * You have to be very careful that these write
- * counts get cleaned up in error cases and
- * upon __fput(). This should probably never
- * be called outside of __dentry_open().
- */
-static inline int __get_file_write_access(struct inode *inode,
- struct vfsmount *mnt)
-{
- int error;
- error = get_write_access(inode);
- if (error)
- return error;
- /*
- * Do not take mount writer counts on
- * special files since no writes to
- * the mount itself will occur.
- */
- if (!special_file(inode->i_mode)) {
- /*
- * Balanced in __fput()
- */
- error = __mnt_want_write(mnt);
- if (error)
- put_write_access(inode);
- }
- return error;
-}
-
int open_check_o_direct(struct file *f)
{
/* NB: we're sure to have correct a_ops only after f_op->open */
@@ -671,28 +675,37 @@ static int do_dentry_open(struct file *f,
f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
FMODE_PREAD | FMODE_PWRITE;
- if (unlikely(f->f_flags & O_PATH))
- f->f_mode = FMODE_PATH;
-
path_get(&f->f_path);
inode = f->f_inode = f->f_path.dentry->d_inode;
- if (f->f_mode & FMODE_WRITE) {
- error = __get_file_write_access(inode, f->f_path.mnt);
- if (error)
- goto cleanup_file;
- if (!special_file(inode->i_mode))
- file_take_write(f);
- }
-
f->f_mapping = inode->i_mapping;
- file_sb_list_add(f, inode->i_sb);
- if (unlikely(f->f_mode & FMODE_PATH)) {
+ if (unlikely(f->f_flags & O_PATH)) {
+ f->f_mode = FMODE_PATH;
f->f_op = &empty_fops;
return 0;
}
+ if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
+ error = get_write_access(inode);
+ if (unlikely(error))
+ goto cleanup_file;
+ error = __mnt_want_write(f->f_path.mnt);
+ if (unlikely(error)) {
+ put_write_access(inode);
+ goto cleanup_file;
+ }
+ f->f_mode |= FMODE_WRITER;
+ }
+
+ /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
+ if (S_ISREG(inode->i_mode))
+ f->f_mode |= FMODE_ATOMIC_POS;
+
f->f_op = fops_get(inode->i_fop);
+ if (unlikely(WARN_ON(!f->f_op))) {
+ error = -ENODEV;
+ goto cleanup_all;
+ }
error = security_file_open(f, cred);
if (error)
@@ -702,7 +715,7 @@ static int do_dentry_open(struct file *f,
if (error)
goto cleanup_all;
- if (!open && f->f_op)
+ if (!open)
open = f->f_op->open;
if (open) {
error = open(inode, f);
@@ -711,6 +724,12 @@ static int do_dentry_open(struct file *f,
}
if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_inc(inode);
+ if ((f->f_mode & FMODE_READ) &&
+ likely(f->f_op->read || f->f_op->aio_read || f->f_op->read_iter))
+ f->f_mode |= FMODE_CAN_READ;
+ if ((f->f_mode & FMODE_WRITE) &&
+ likely(f->f_op->write || f->f_op->aio_write || f->f_op->write_iter))
+ f->f_mode |= FMODE_CAN_WRITE;
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
@@ -720,19 +739,9 @@ static int do_dentry_open(struct file *f,
cleanup_all:
fops_put(f->f_op);
- file_sb_list_del(f);
- if (f->f_mode & FMODE_WRITE) {
+ if (f->f_mode & FMODE_WRITER) {
put_write_access(inode);
- if (!special_file(inode->i_mode)) {
- /*
- * We don't consider this a real
- * mnt_want/drop_write() pair
- * because it all happenend right
- * here, so just reset the state.
- */
- file_reset_write(f);
- __mnt_drop_write(f->f_path.mnt);
- }
+ __mnt_drop_write(f->f_path.mnt);
}
cleanup_file:
path_put(&f->f_path);
@@ -1023,7 +1032,7 @@ int filp_close(struct file *filp, fl_owner_t id)
return 0;
}
- if (filp->f_op && filp->f_op->flush)
+ if (filp->f_op->flush)
retval = filp->f_op->flush(filp, id);
if (likely(!(filp->f_mode & FMODE_PATH))) {
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 8c0ceb8dd1f..15e4500cda3 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -368,6 +368,7 @@ static struct inode *openprom_iget(struct super_block *sb, ino_t ino)
static int openprom_remount(struct super_block *sb, int *flags, char *data)
{
+ sync_filesystem(sb);
*flags |= MS_NOATIME;
return 0;
}
diff --git a/fs/pipe.c b/fs/pipe.c
index d2c45e14e6d..21981e58e2a 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -116,99 +116,6 @@ void pipe_wait(struct pipe_inode_info *pipe)
pipe_lock(pipe);
}
-static int
-pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
- int atomic)
-{
- unsigned long copy;
-
- while (len > 0) {
- while (!iov->iov_len)
- iov++;
- copy = min_t(unsigned long, len, iov->iov_len);
-
- if (atomic) {
- if (__copy_from_user_inatomic(to, iov->iov_base, copy))
- return -EFAULT;
- } else {
- if (copy_from_user(to, iov->iov_base, copy))
- return -EFAULT;
- }
- to += copy;
- len -= copy;
- iov->iov_base += copy;
- iov->iov_len -= copy;
- }
- return 0;
-}
-
-static int
-pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len,
- int atomic)
-{
- unsigned long copy;
-
- while (len > 0) {
- while (!iov->iov_len)
- iov++;
- copy = min_t(unsigned long, len, iov->iov_len);
-
- if (atomic) {
- if (__copy_to_user_inatomic(iov->iov_base, from, copy))
- return -EFAULT;
- } else {
- if (copy_to_user(iov->iov_base, from, copy))
- return -EFAULT;
- }
- from += copy;
- len -= copy;
- iov->iov_base += copy;
- iov->iov_len -= copy;
- }
- return 0;
-}
-
-/*
- * Attempt to pre-fault in the user memory, so we can use atomic copies.
- * Returns the number of bytes not faulted in.
- */
-static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
-{
- while (!iov->iov_len)
- iov++;
-
- while (len > 0) {
- unsigned long this_len;
-
- this_len = min_t(unsigned long, len, iov->iov_len);
- if (fault_in_pages_writeable(iov->iov_base, this_len))
- break;
-
- len -= this_len;
- iov++;
- }
-
- return len;
-}
-
-/*
- * Pre-fault in the user memory, so we can use atomic copies.
- */
-static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
-{
- while (!iov->iov_len)
- iov++;
-
- while (len > 0) {
- unsigned long this_len;
-
- this_len = min_t(unsigned long, len, iov->iov_len);
- fault_in_pages_readable(iov->iov_base, this_len);
- len -= this_len;
- iov++;
- }
-}
-
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
@@ -226,52 +133,6 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
}
/**
- * generic_pipe_buf_map - virtually map a pipe buffer
- * @pipe: the pipe that the buffer belongs to
- * @buf: the buffer that should be mapped
- * @atomic: whether to use an atomic map
- *
- * Description:
- * This function returns a kernel virtual address mapping for the
- * pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided
- * and the caller has to be careful not to fault before calling
- * the unmap function.
- *
- * Note that this function calls kmap_atomic() if @atomic != 0.
- */
-void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf, int atomic)
-{
- if (atomic) {
- buf->flags |= PIPE_BUF_FLAG_ATOMIC;
- return kmap_atomic(buf->page);
- }
-
- return kmap(buf->page);
-}
-EXPORT_SYMBOL(generic_pipe_buf_map);
-
-/**
- * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
- * @pipe: the pipe that the buffer belongs to
- * @buf: the buffer that should be unmapped
- * @map_data: the data that the mapping function returned
- *
- * Description:
- * This function undoes the mapping that ->map() provided.
- */
-void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf, void *map_data)
-{
- if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
- buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
- kunmap_atomic(map_data);
- } else
- kunmap(buf->page);
-}
-EXPORT_SYMBOL(generic_pipe_buf_unmap);
-
-/**
* generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
* @pipe: the pipe that the buffer belongs to
* @buf: the buffer to attempt to steal
@@ -351,8 +212,6 @@ EXPORT_SYMBOL(generic_pipe_buf_release);
static const struct pipe_buf_operations anon_pipe_buf_ops = {
.can_merge = 1,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
.release = anon_pipe_buf_release,
.steal = generic_pipe_buf_steal,
@@ -361,8 +220,6 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
static const struct pipe_buf_operations packet_pipe_buf_ops = {
.can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
.release = anon_pipe_buf_release,
.steal = generic_pipe_buf_steal,
@@ -370,17 +227,14 @@ static const struct pipe_buf_operations packet_pipe_buf_ops = {
};
static ssize_t
-pipe_read(struct kiocb *iocb, const struct iovec *_iov,
- unsigned long nr_segs, loff_t pos)
+pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
+ size_t total_len = iov_iter_count(to);
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
int do_wakeup;
ssize_t ret;
- struct iovec *iov = (struct iovec *)_iov;
- size_t total_len;
- total_len = iov_length(iov, nr_segs);
/* Null read succeeds. */
if (unlikely(total_len == 0))
return 0;
@@ -394,9 +248,9 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
int curbuf = pipe->curbuf;
struct pipe_buffer *buf = pipe->bufs + curbuf;
const struct pipe_buf_operations *ops = buf->ops;
- void *addr;
size_t chars = buf->len;
- int error, atomic;
+ size_t written;
+ int error;
if (chars > total_len)
chars = total_len;
@@ -408,21 +262,10 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov,
break;
}
- atomic = !iov_fault_in_pages_write(iov, chars);
-redo:
- addr = ops->map(pipe, buf, atomic);
- error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
- ops->unmap(pipe, buf, addr);
- if (unlikely(error)) {
- /*
- * Just retry with the slow path if we failed.
- */
- if (atomic) {
- atomic = 0;
- goto redo;
- }
+ written = copy_page_to_iter(buf->page, buf->offset, chars, to);
+ if (unlikely(written < chars)) {
if (!ret)
- ret = error;
+ ret = -EFAULT;
break;
}
ret += chars;
@@ -493,24 +336,19 @@ static inline int is_packetized(struct file *file)
}
static ssize_t
-pipe_write(struct kiocb *iocb, const struct iovec *_iov,
- unsigned long nr_segs, loff_t ppos)
+pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
struct file *filp = iocb->ki_filp;
struct pipe_inode_info *pipe = filp->private_data;
- ssize_t ret;
- int do_wakeup;
- struct iovec *iov = (struct iovec *)_iov;
- size_t total_len;
+ ssize_t ret = 0;
+ int do_wakeup = 0;
+ size_t total_len = iov_iter_count(from);
ssize_t chars;
- total_len = iov_length(iov, nr_segs);
/* Null write succeeds. */
if (unlikely(total_len == 0))
return 0;
- do_wakeup = 0;
- ret = 0;
__pipe_lock(pipe);
if (!pipe->readers) {
@@ -529,32 +367,19 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
int offset = buf->offset + buf->len;
if (ops->can_merge && offset + chars <= PAGE_SIZE) {
- int error, atomic = 1;
- void *addr;
-
- error = ops->confirm(pipe, buf);
+ int error = ops->confirm(pipe, buf);
if (error)
goto out;
- iov_fault_in_pages_read(iov, chars);
-redo1:
- addr = ops->map(pipe, buf, atomic);
- error = pipe_iov_copy_from_user(offset + addr, iov,
- chars, atomic);
- ops->unmap(pipe, buf, addr);
- ret = error;
- do_wakeup = 1;
- if (error) {
- if (atomic) {
- atomic = 0;
- goto redo1;
- }
+ ret = copy_page_from_iter(buf->page, offset, chars, from);
+ if (unlikely(ret < chars)) {
+ error = -EFAULT;
goto out;
}
+ do_wakeup = 1;
buf->len += chars;
- total_len -= chars;
ret = chars;
- if (!total_len)
+ if (!iov_iter_count(from))
goto out;
}
}
@@ -573,8 +398,7 @@ redo1:
int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
struct pipe_buffer *buf = pipe->bufs + newbuf;
struct page *page = pipe->tmp_page;
- char *src;
- int error, atomic = 1;
+ int copied;
if (!page) {
page = alloc_page(GFP_HIGHUSER);
@@ -590,40 +414,19 @@ redo1:
* FIXME! Is this really true?
*/
do_wakeup = 1;
- chars = PAGE_SIZE;
- if (chars > total_len)
- chars = total_len;
-
- iov_fault_in_pages_read(iov, chars);
-redo2:
- if (atomic)
- src = kmap_atomic(page);
- else
- src = kmap(page);
-
- error = pipe_iov_copy_from_user(src, iov, chars,
- atomic);
- if (atomic)
- kunmap_atomic(src);
- else
- kunmap(page);
-
- if (unlikely(error)) {
- if (atomic) {
- atomic = 0;
- goto redo2;
- }
+ copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
+ if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
if (!ret)
- ret = error;
+ ret = -EFAULT;
break;
}
- ret += chars;
+ ret += copied;
/* Insert it into the buffer array */
buf->page = page;
buf->ops = &anon_pipe_buf_ops;
buf->offset = 0;
- buf->len = chars;
+ buf->len = copied;
buf->flags = 0;
if (is_packetized(filp)) {
buf->ops = &packet_pipe_buf_ops;
@@ -632,8 +435,7 @@ redo2:
pipe->nrbufs = ++bufs;
pipe->tmp_page = NULL;
- total_len -= chars;
- if (!total_len)
+ if (!iov_iter_count(from))
break;
}
if (bufs < pipe->buffers)
@@ -663,10 +465,11 @@ out:
wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
- if (ret > 0) {
+ if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
int err = file_update_time(filp);
if (err)
ret = err;
+ sb_end_write(file_inode(filp)->i_sb);
}
return ret;
}
@@ -726,11 +529,25 @@ pipe_poll(struct file *filp, poll_table *wait)
return mask;
}
+static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
+{
+ int kill = 0;
+
+ spin_lock(&inode->i_lock);
+ if (!--pipe->files) {
+ inode->i_pipe = NULL;
+ kill = 1;
+ }
+ spin_unlock(&inode->i_lock);
+
+ if (kill)
+ free_pipe_info(pipe);
+}
+
static int
pipe_release(struct inode *inode, struct file *file)
{
- struct pipe_inode_info *pipe = inode->i_pipe;
- int kill = 0;
+ struct pipe_inode_info *pipe = file->private_data;
__pipe_lock(pipe);
if (file->f_mode & FMODE_READ)
@@ -743,17 +560,9 @@ pipe_release(struct inode *inode, struct file *file)
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
- spin_lock(&inode->i_lock);
- if (!--pipe->files) {
- inode->i_pipe = NULL;
- kill = 1;
- }
- spin_unlock(&inode->i_lock);
__pipe_unlock(pipe);
- if (kill)
- free_pipe_info(pipe);
-
+ put_pipe_info(inode, pipe);
return 0;
}
@@ -1014,7 +823,6 @@ static int fifo_open(struct inode *inode, struct file *filp)
{
struct pipe_inode_info *pipe;
bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
- int kill = 0;
int ret;
filp->f_version = 0;
@@ -1130,25 +938,19 @@ err_wr:
goto err;
err:
- spin_lock(&inode->i_lock);
- if (!--pipe->files) {
- inode->i_pipe = NULL;
- kill = 1;
- }
- spin_unlock(&inode->i_lock);
__pipe_unlock(pipe);
- if (kill)
- free_pipe_info(pipe);
+
+ put_pipe_info(inode, pipe);
return ret;
}
const struct file_operations pipefifo_fops = {
.open = fifo_open,
.llseek = no_llseek,
- .read = do_sync_read,
- .aio_read = pipe_read,
- .write = do_sync_write,
- .aio_write = pipe_write,
+ .read = new_sync_read,
+ .read_iter = pipe_read,
+ .write = new_sync_write,
+ .write_iter = pipe_write,
.poll = pipe_poll,
.unlocked_ioctl = pipe_ioctl,
.release = pipe_release,
diff --git a/fs/pnode.c b/fs/pnode.c
index 9af0df15256..302bf22c4a3 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -164,46 +164,94 @@ static struct mount *propagation_next(struct mount *m,
}
}
-/*
- * return the source mount to be used for cloning
- *
- * @dest the current destination mount
- * @last_dest the last seen destination mount
- * @last_src the last seen source mount
- * @type return CL_SLAVE if the new mount has to be
- * cloned as a slave.
- */
-static struct mount *get_source(struct mount *dest,
- struct mount *last_dest,
- struct mount *last_src,
- int *type)
+static struct mount *next_group(struct mount *m, struct mount *origin)
{
- struct mount *p_last_src = NULL;
- struct mount *p_last_dest = NULL;
-
- while (last_dest != dest->mnt_master) {
- p_last_dest = last_dest;
- p_last_src = last_src;
- last_dest = last_dest->mnt_master;
- last_src = last_src->mnt_master;
+ while (1) {
+ while (1) {
+ struct mount *next;
+ if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
+ return first_slave(m);
+ next = next_peer(m);
+ if (m->mnt_group_id == origin->mnt_group_id) {
+ if (next == origin)
+ return NULL;
+ } else if (m->mnt_slave.next != &next->mnt_slave)
+ break;
+ m = next;
+ }
+ /* m is the last peer */
+ while (1) {
+ struct mount *master = m->mnt_master;
+ if (m->mnt_slave.next != &master->mnt_slave_list)
+ return next_slave(m);
+ m = next_peer(master);
+ if (master->mnt_group_id == origin->mnt_group_id)
+ break;
+ if (master->mnt_slave.next == &m->mnt_slave)
+ break;
+ m = master;
+ }
+ if (m == origin)
+ return NULL;
}
+}
- if (p_last_dest) {
- do {
- p_last_dest = next_peer(p_last_dest);
- } while (IS_MNT_NEW(p_last_dest));
- /* is that a peer of the earlier? */
- if (dest == p_last_dest) {
- *type = CL_MAKE_SHARED;
- return p_last_src;
+/* all accesses are serialized by namespace_sem */
+static struct user_namespace *user_ns;
+static struct mount *last_dest, *last_source, *dest_master;
+static struct mountpoint *mp;
+static struct hlist_head *list;
+
+static int propagate_one(struct mount *m)
+{
+ struct mount *child;
+ int type;
+ /* skip ones added by this propagate_mnt() */
+ if (IS_MNT_NEW(m))
+ return 0;
+ /* skip if mountpoint isn't covered by it */
+ if (!is_subdir(mp->m_dentry, m->mnt.mnt_root))
+ return 0;
+ if (m->mnt_group_id == last_dest->mnt_group_id) {
+ type = CL_MAKE_SHARED;
+ } else {
+ struct mount *n, *p;
+ for (n = m; ; n = p) {
+ p = n->mnt_master;
+ if (p == dest_master || IS_MNT_MARKED(p)) {
+ while (last_dest->mnt_master != p) {
+ last_source = last_source->mnt_master;
+ last_dest = last_source->mnt_parent;
+ }
+ if (n->mnt_group_id != last_dest->mnt_group_id) {
+ last_source = last_source->mnt_master;
+ last_dest = last_source->mnt_parent;
+ }
+ break;
+ }
}
+ type = CL_SLAVE;
+ /* beginning of peer group among the slaves? */
+ if (IS_MNT_SHARED(m))
+ type |= CL_MAKE_SHARED;
+ }
+
+ /* Notice when we are propagating across user namespaces */
+ if (m->mnt_ns->user_ns != user_ns)
+ type |= CL_UNPRIVILEGED;
+ child = copy_tree(last_source, last_source->mnt.mnt_root, type);
+ if (IS_ERR(child))
+ return PTR_ERR(child);
+ mnt_set_mountpoint(m, mp, child);
+ last_dest = m;
+ last_source = child;
+ if (m->mnt_master != dest_master) {
+ read_seqlock_excl(&mount_lock);
+ SET_MNT_MARK(m->mnt_master);
+ read_sequnlock_excl(&mount_lock);
}
- /* slave of the earlier, then */
- *type = CL_SLAVE;
- /* beginning of peer group among the slaves? */
- if (IS_MNT_SHARED(dest))
- *type |= CL_MAKE_SHARED;
- return last_src;
+ hlist_add_head(&child->mnt_hash, list);
+ return 0;
}
/*
@@ -220,56 +268,50 @@ static struct mount *get_source(struct mount *dest,
* @tree_list : list of heads of trees to be attached.
*/
int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp,
- struct mount *source_mnt, struct list_head *tree_list)
+ struct mount *source_mnt, struct hlist_head *tree_list)
{
- struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
- struct mount *m, *child;
+ struct mount *m, *n;
int ret = 0;
- struct mount *prev_dest_mnt = dest_mnt;
- struct mount *prev_src_mnt = source_mnt;
- LIST_HEAD(tmp_list);
-
- for (m = propagation_next(dest_mnt, dest_mnt); m;
- m = propagation_next(m, dest_mnt)) {
- int type;
- struct mount *source;
-
- if (IS_MNT_NEW(m))
- continue;
- source = get_source(m, prev_dest_mnt, prev_src_mnt, &type);
-
- /* Notice when we are propagating across user namespaces */
- if (m->mnt_ns->user_ns != user_ns)
- type |= CL_UNPRIVILEGED;
-
- child = copy_tree(source, source->mnt.mnt_root, type);
- if (IS_ERR(child)) {
- ret = PTR_ERR(child);
- list_splice(tree_list, tmp_list.prev);
+ /*
+ * we don't want to bother passing tons of arguments to
+ * propagate_one(); everything is serialized by namespace_sem,
+ * so globals will do just fine.
+ */
+ user_ns = current->nsproxy->mnt_ns->user_ns;
+ last_dest = dest_mnt;
+ last_source = source_mnt;
+ mp = dest_mp;
+ list = tree_list;
+ dest_master = dest_mnt->mnt_master;
+
+ /* all peers of dest_mnt, except dest_mnt itself */
+ for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) {
+ ret = propagate_one(n);
+ if (ret)
goto out;
- }
+ }
- if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) {
- mnt_set_mountpoint(m, dest_mp, child);
- list_add_tail(&child->mnt_hash, tree_list);
- } else {
- /*
- * This can happen if the parent mount was bind mounted
- * on some subdirectory of a shared/slave mount.
- */
- list_add_tail(&child->mnt_hash, &tmp_list);
- }
- prev_dest_mnt = m;
- prev_src_mnt = child;
+ /* all slave groups */
+ for (m = next_group(dest_mnt, dest_mnt); m;
+ m = next_group(m, dest_mnt)) {
+ /* everything in that slave group */
+ n = m;
+ do {
+ ret = propagate_one(n);
+ if (ret)
+ goto out;
+ n = next_peer(n);
+ } while (n != m);
}
out:
- br_write_lock(&vfsmount_lock);
- while (!list_empty(&tmp_list)) {
- child = list_first_entry(&tmp_list, struct mount, mnt_hash);
- umount_tree(child, 0);
+ read_seqlock_excl(&mount_lock);
+ hlist_for_each_entry(n, tree_list, mnt_hash) {
+ m = n->mnt_parent;
+ if (m->mnt_master != dest_mnt->mnt_master)
+ CLEAR_MNT_MARK(m->mnt_master);
}
- br_write_unlock(&vfsmount_lock);
+ read_sequnlock_excl(&mount_lock);
return ret;
}
@@ -278,8 +320,7 @@ out:
*/
static inline int do_refcount_check(struct mount *mnt, int count)
{
- int mycount = mnt_get_count(mnt) - mnt->mnt_ghosts;
- return (mycount > count);
+ return mnt_get_count(mnt) > count;
}
/*
@@ -311,7 +352,7 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
for (m = propagation_next(parent, parent); m;
m = propagation_next(m, parent)) {
- child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint, 0);
+ child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
if (child && list_empty(&child->mnt_mounts) &&
(ret = do_refcount_check(child, 1)))
break;
@@ -333,14 +374,16 @@ static void __propagate_umount(struct mount *mnt)
for (m = propagation_next(parent, parent); m;
m = propagation_next(m, parent)) {
- struct mount *child = __lookup_mnt(&m->mnt,
- mnt->mnt_mountpoint, 0);
+ struct mount *child = __lookup_mnt_last(&m->mnt,
+ mnt->mnt_mountpoint);
/*
* umount the child only if the child has no
* other children
*/
- if (child && list_empty(&child->mnt_mounts))
- list_move_tail(&child->mnt_hash, &mnt->mnt_hash);
+ if (child && list_empty(&child->mnt_mounts)) {
+ hlist_del_init_rcu(&child->mnt_hash);
+ hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash);
+ }
}
}
@@ -351,11 +394,11 @@ static void __propagate_umount(struct mount *mnt)
*
* vfsmount lock must be held for write
*/
-int propagate_umount(struct list_head *list)
+int propagate_umount(struct hlist_head *list)
{
struct mount *mnt;
- list_for_each_entry(mnt, list, mnt_hash)
+ hlist_for_each_entry(mnt, list, mnt_hash)
__propagate_umount(mnt);
return 0;
}
diff --git a/fs/pnode.h b/fs/pnode.h
index 59e7eda1851..4a246358b03 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -16,6 +16,9 @@
#define IS_MNT_NEW(m) (!(m)->mnt_ns)
#define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
#define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
+#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
+#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
+#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
#define CL_EXPIRE 0x01
#define CL_SLAVE 0x02
@@ -36,8 +39,8 @@ static inline void set_mnt_shared(struct mount *mnt)
void change_mnt_propagation(struct mount *, int);
int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
- struct list_head *);
-int propagate_umount(struct list_head *);
+ struct hlist_head *);
+int propagate_umount(struct hlist_head *);
int propagate_mount_busy(struct mount *, int);
void mnt_release_group_id(struct mount *);
int get_dominating_id(struct mount *mnt, const struct path *root);
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 8bd2135b7f8..0855f772cd4 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -1,10 +1,8 @@
/*
- * linux/fs/posix_acl.c
+ * Copyright (C) 2002,2003 by Andreas Gruenbacher <a.gruenbacher@computer.org>
*
- * Copyright (C) 2002 by Andreas Gruenbacher <a.gruenbacher@computer.org>
- *
- * Fixes from William Schumacher incorporated on 15 March 2001.
- * (Reported by Charles Bertsch, <CBertsch@microtest.com>).
+ * Fixes from William Schumacher incorporated on 15 March 2001.
+ * (Reported by Charles Bertsch, <CBertsch@microtest.com>).
*/
/*
@@ -18,15 +16,112 @@
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
#include <linux/export.h>
+#include <linux/user_namespace.h>
-#include <linux/errno.h>
+struct posix_acl **acl_by_type(struct inode *inode, int type)
+{
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ return &inode->i_acl;
+ case ACL_TYPE_DEFAULT:
+ return &inode->i_default_acl;
+ default:
+ BUG();
+ }
+}
+EXPORT_SYMBOL(acl_by_type);
-EXPORT_SYMBOL(posix_acl_init);
-EXPORT_SYMBOL(posix_acl_alloc);
-EXPORT_SYMBOL(posix_acl_valid);
-EXPORT_SYMBOL(posix_acl_equiv_mode);
-EXPORT_SYMBOL(posix_acl_from_mode);
+struct posix_acl *get_cached_acl(struct inode *inode, int type)
+{
+ struct posix_acl **p = acl_by_type(inode, type);
+ struct posix_acl *acl = ACCESS_ONCE(*p);
+ if (acl) {
+ spin_lock(&inode->i_lock);
+ acl = *p;
+ if (acl != ACL_NOT_CACHED)
+ acl = posix_acl_dup(acl);
+ spin_unlock(&inode->i_lock);
+ }
+ return acl;
+}
+EXPORT_SYMBOL(get_cached_acl);
+
+struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type)
+{
+ return rcu_dereference(*acl_by_type(inode, type));
+}
+EXPORT_SYMBOL(get_cached_acl_rcu);
+
+void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl)
+{
+ struct posix_acl **p = acl_by_type(inode, type);
+ struct posix_acl *old;
+ spin_lock(&inode->i_lock);
+ old = *p;
+ rcu_assign_pointer(*p, posix_acl_dup(acl));
+ spin_unlock(&inode->i_lock);
+ if (old != ACL_NOT_CACHED)
+ posix_acl_release(old);
+}
+EXPORT_SYMBOL(set_cached_acl);
+
+void forget_cached_acl(struct inode *inode, int type)
+{
+ struct posix_acl **p = acl_by_type(inode, type);
+ struct posix_acl *old;
+ spin_lock(&inode->i_lock);
+ old = *p;
+ *p = ACL_NOT_CACHED;
+ spin_unlock(&inode->i_lock);
+ if (old != ACL_NOT_CACHED)
+ posix_acl_release(old);
+}
+EXPORT_SYMBOL(forget_cached_acl);
+
+void forget_all_cached_acls(struct inode *inode)
+{
+ struct posix_acl *old_access, *old_default;
+ spin_lock(&inode->i_lock);
+ old_access = inode->i_acl;
+ old_default = inode->i_default_acl;
+ inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
+ spin_unlock(&inode->i_lock);
+ if (old_access != ACL_NOT_CACHED)
+ posix_acl_release(old_access);
+ if (old_default != ACL_NOT_CACHED)
+ posix_acl_release(old_default);
+}
+EXPORT_SYMBOL(forget_all_cached_acls);
+
+struct posix_acl *get_acl(struct inode *inode, int type)
+{
+ struct posix_acl *acl;
+
+ acl = get_cached_acl(inode, type);
+ if (acl != ACL_NOT_CACHED)
+ return acl;
+
+ if (!IS_POSIXACL(inode))
+ return NULL;
+
+ /*
+ * A filesystem can force a ACL callback by just never filling the
+ * ACL cache. But normally you'd fill the cache either at inode
+ * instantiation time, or on the first ->get_acl call.
+ *
+ * If the filesystem doesn't have a get_acl() function at all, we'll
+ * just create the negative cache entry.
+ */
+ if (!inode->i_op->get_acl) {
+ set_cached_acl(inode, type, NULL);
+ return NULL;
+ }
+ return inode->i_op->get_acl(inode, type);
+}
+EXPORT_SYMBOL(get_acl);
/*
* Init a fresh posix_acl
@@ -37,6 +132,7 @@ posix_acl_init(struct posix_acl *acl, int count)
atomic_set(&acl->a_refcount, 1);
acl->a_count = count;
}
+EXPORT_SYMBOL(posix_acl_init);
/*
* Allocate a new ACL with the specified number of entries.
@@ -51,6 +147,7 @@ posix_acl_alloc(int count, gfp_t flags)
posix_acl_init(acl, count);
return acl;
}
+EXPORT_SYMBOL(posix_acl_alloc);
/*
* Clone an ACL.
@@ -78,8 +175,6 @@ posix_acl_valid(const struct posix_acl *acl)
{
const struct posix_acl_entry *pa, *pe;
int state = ACL_USER_OBJ;
- kuid_t prev_uid = INVALID_UID;
- kgid_t prev_gid = INVALID_GID;
int needs_mask = 0;
FOREACH_ACL_ENTRY(pa, acl, pe) {
@@ -98,10 +193,6 @@ posix_acl_valid(const struct posix_acl *acl)
return -EINVAL;
if (!uid_valid(pa->e_uid))
return -EINVAL;
- if (uid_valid(prev_uid) &&
- uid_lte(pa->e_uid, prev_uid))
- return -EINVAL;
- prev_uid = pa->e_uid;
needs_mask = 1;
break;
@@ -117,10 +208,6 @@ posix_acl_valid(const struct posix_acl *acl)
return -EINVAL;
if (!gid_valid(pa->e_gid))
return -EINVAL;
- if (gid_valid(prev_gid) &&
- gid_lte(pa->e_gid, prev_gid))
- return -EINVAL;
- prev_gid = pa->e_gid;
needs_mask = 1;
break;
@@ -146,6 +233,7 @@ posix_acl_valid(const struct posix_acl *acl)
return 0;
return -EINVAL;
}
+EXPORT_SYMBOL(posix_acl_valid);
/*
* Returns 0 if the acl can be exactly represented in the traditional
@@ -158,6 +246,12 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p)
umode_t mode = 0;
int not_equiv = 0;
+ /*
+ * A null ACL can always be presented as mode bits.
+ */
+ if (!acl)
+ return 0;
+
FOREACH_ACL_ENTRY(pa, acl, pe) {
switch (pa->e_tag) {
case ACL_USER_OBJ:
@@ -186,6 +280,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p)
*mode_p = (*mode_p & ~S_IRWXUGO) | mode;
return not_equiv;
}
+EXPORT_SYMBOL(posix_acl_equiv_mode);
/*
* Create an ACL representing the file mode permission bits of an inode.
@@ -207,6 +302,7 @@ posix_acl_from_mode(umode_t mode, gfp_t flags)
acl->a_entries[2].e_perm = (mode & S_IRWXO);
return acl;
}
+EXPORT_SYMBOL(posix_acl_from_mode);
/*
* Return 0 if current is granted want access to the inode
@@ -338,7 +434,7 @@ static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p)
/*
* Modify the ACL for the chmod syscall.
*/
-static int posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode)
+static int __posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode)
{
struct posix_acl_entry *group_obj = NULL, *mask_obj = NULL;
struct posix_acl_entry *pa, *pe;
@@ -384,7 +480,7 @@ static int posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode)
}
int
-posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p)
+__posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p)
{
struct posix_acl *clone = posix_acl_clone(*acl, gfp);
int err = -ENOMEM;
@@ -399,15 +495,15 @@ posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p)
*acl = clone;
return err;
}
-EXPORT_SYMBOL(posix_acl_create);
+EXPORT_SYMBOL(__posix_acl_create);
int
-posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode)
+__posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode)
{
struct posix_acl *clone = posix_acl_clone(*acl, gfp);
int err = -ENOMEM;
if (clone) {
- err = posix_acl_chmod_masq(clone, mode);
+ err = __posix_acl_chmod_masq(clone, mode);
if (err) {
posix_acl_release(clone);
clone = NULL;
@@ -417,4 +513,389 @@ posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode)
*acl = clone;
return err;
}
+EXPORT_SYMBOL(__posix_acl_chmod);
+
+int
+posix_acl_chmod(struct inode *inode, umode_t mode)
+{
+ struct posix_acl *acl;
+ int ret = 0;
+
+ if (!IS_POSIXACL(inode))
+ return 0;
+ if (!inode->i_op->set_acl)
+ return -EOPNOTSUPP;
+
+ acl = get_acl(inode, ACL_TYPE_ACCESS);
+ if (IS_ERR_OR_NULL(acl)) {
+ if (acl == ERR_PTR(-EOPNOTSUPP))
+ return 0;
+ return PTR_ERR(acl);
+ }
+
+ ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
+ if (ret)
+ return ret;
+ ret = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS);
+ posix_acl_release(acl);
+ return ret;
+}
EXPORT_SYMBOL(posix_acl_chmod);
+
+int
+posix_acl_create(struct inode *dir, umode_t *mode,
+ struct posix_acl **default_acl, struct posix_acl **acl)
+{
+ struct posix_acl *p;
+ int ret;
+
+ if (S_ISLNK(*mode) || !IS_POSIXACL(dir))
+ goto no_acl;
+
+ p = get_acl(dir, ACL_TYPE_DEFAULT);
+ if (IS_ERR(p)) {
+ if (p == ERR_PTR(-EOPNOTSUPP))
+ goto apply_umask;
+ return PTR_ERR(p);
+ }
+
+ if (!p)
+ goto apply_umask;
+
+ *acl = posix_acl_clone(p, GFP_NOFS);
+ if (!*acl)
+ return -ENOMEM;
+
+ ret = posix_acl_create_masq(*acl, mode);
+ if (ret < 0) {
+ posix_acl_release(*acl);
+ return -ENOMEM;
+ }
+
+ if (ret == 0) {
+ posix_acl_release(*acl);
+ *acl = NULL;
+ }
+
+ if (!S_ISDIR(*mode)) {
+ posix_acl_release(p);
+ *default_acl = NULL;
+ } else {
+ *default_acl = p;
+ }
+ return 0;
+
+apply_umask:
+ *mode &= ~current_umask();
+no_acl:
+ *default_acl = NULL;
+ *acl = NULL;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(posix_acl_create);
+
+/*
+ * Fix up the uids and gids in posix acl extended attributes in place.
+ */
+static void posix_acl_fix_xattr_userns(
+ struct user_namespace *to, struct user_namespace *from,
+ void *value, size_t size)
+{
+ posix_acl_xattr_header *header = (posix_acl_xattr_header *)value;
+ posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end;
+ int count;
+ kuid_t uid;
+ kgid_t gid;
+
+ if (!value)
+ return;
+ if (size < sizeof(posix_acl_xattr_header))
+ return;
+ if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
+ return;
+
+ count = posix_acl_xattr_count(size);
+ if (count < 0)
+ return;
+ if (count == 0)
+ return;
+
+ for (end = entry + count; entry != end; entry++) {
+ switch(le16_to_cpu(entry->e_tag)) {
+ case ACL_USER:
+ uid = make_kuid(from, le32_to_cpu(entry->e_id));
+ entry->e_id = cpu_to_le32(from_kuid(to, uid));
+ break;
+ case ACL_GROUP:
+ gid = make_kgid(from, le32_to_cpu(entry->e_id));
+ entry->e_id = cpu_to_le32(from_kgid(to, gid));
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+void posix_acl_fix_xattr_from_user(void *value, size_t size)
+{
+ struct user_namespace *user_ns = current_user_ns();
+ if (user_ns == &init_user_ns)
+ return;
+ posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size);
+}
+
+void posix_acl_fix_xattr_to_user(void *value, size_t size)
+{
+ struct user_namespace *user_ns = current_user_ns();
+ if (user_ns == &init_user_ns)
+ return;
+ posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size);
+}
+
+/*
+ * Convert from extended attribute to in-memory representation.
+ */
+struct posix_acl *
+posix_acl_from_xattr(struct user_namespace *user_ns,
+ const void *value, size_t size)
+{
+ posix_acl_xattr_header *header = (posix_acl_xattr_header *)value;
+ posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end;
+ int count;
+ struct posix_acl *acl;
+ struct posix_acl_entry *acl_e;
+
+ if (!value)
+ return NULL;
+ if (size < sizeof(posix_acl_xattr_header))
+ return ERR_PTR(-EINVAL);
+ if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ count = posix_acl_xattr_count(size);
+ if (count < 0)
+ return ERR_PTR(-EINVAL);
+ if (count == 0)
+ return NULL;
+
+ acl = posix_acl_alloc(count, GFP_NOFS);
+ if (!acl)
+ return ERR_PTR(-ENOMEM);
+ acl_e = acl->a_entries;
+
+ for (end = entry + count; entry != end; acl_e++, entry++) {
+ acl_e->e_tag = le16_to_cpu(entry->e_tag);
+ acl_e->e_perm = le16_to_cpu(entry->e_perm);
+
+ switch(acl_e->e_tag) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ break;
+
+ case ACL_USER:
+ acl_e->e_uid =
+ make_kuid(user_ns,
+ le32_to_cpu(entry->e_id));
+ if (!uid_valid(acl_e->e_uid))
+ goto fail;
+ break;
+ case ACL_GROUP:
+ acl_e->e_gid =
+ make_kgid(user_ns,
+ le32_to_cpu(entry->e_id));
+ if (!gid_valid(acl_e->e_gid))
+ goto fail;
+ break;
+
+ default:
+ goto fail;
+ }
+ }
+ return acl;
+
+fail:
+ posix_acl_release(acl);
+ return ERR_PTR(-EINVAL);
+}
+EXPORT_SYMBOL (posix_acl_from_xattr);
+
+/*
+ * Convert from in-memory to extended attribute representation.
+ */
+int
+posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
+ void *buffer, size_t size)
+{
+ posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer;
+ posix_acl_xattr_entry *ext_entry;
+ int real_size, n;
+
+ real_size = posix_acl_xattr_size(acl->a_count);
+ if (!buffer)
+ return real_size;
+ if (real_size > size)
+ return -ERANGE;
+
+ ext_entry = ext_acl->a_entries;
+ ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
+
+ for (n=0; n < acl->a_count; n++, ext_entry++) {
+ const struct posix_acl_entry *acl_e = &acl->a_entries[n];
+ ext_entry->e_tag = cpu_to_le16(acl_e->e_tag);
+ ext_entry->e_perm = cpu_to_le16(acl_e->e_perm);
+ switch(acl_e->e_tag) {
+ case ACL_USER:
+ ext_entry->e_id =
+ cpu_to_le32(from_kuid(user_ns, acl_e->e_uid));
+ break;
+ case ACL_GROUP:
+ ext_entry->e_id =
+ cpu_to_le32(from_kgid(user_ns, acl_e->e_gid));
+ break;
+ default:
+ ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
+ break;
+ }
+ }
+ return real_size;
+}
+EXPORT_SYMBOL (posix_acl_to_xattr);
+
+static int
+posix_acl_xattr_get(struct dentry *dentry, const char *name,
+ void *value, size_t size, int type)
+{
+ struct posix_acl *acl;
+ int error;
+
+ if (!IS_POSIXACL(dentry->d_inode))
+ return -EOPNOTSUPP;
+ if (S_ISLNK(dentry->d_inode->i_mode))
+ return -EOPNOTSUPP;
+
+ acl = get_acl(dentry->d_inode, type);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ if (acl == NULL)
+ return -ENODATA;
+
+ error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+ posix_acl_release(acl);
+
+ return error;
+}
+
+static int
+posix_acl_xattr_set(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags, int type)
+{
+ struct inode *inode = dentry->d_inode;
+ struct posix_acl *acl = NULL;
+ int ret;
+
+ if (!IS_POSIXACL(inode))
+ return -EOPNOTSUPP;
+ if (!inode->i_op->set_acl)
+ return -EOPNOTSUPP;
+
+ if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+ return value ? -EACCES : 0;
+ if (!inode_owner_or_capable(inode))
+ return -EPERM;
+
+ if (value) {
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+
+ if (acl) {
+ ret = posix_acl_valid(acl);
+ if (ret)
+ goto out;
+ }
+ }
+
+ ret = inode->i_op->set_acl(inode, acl, type);
+out:
+ posix_acl_release(acl);
+ return ret;
+}
+
+static size_t
+posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size,
+ const char *name, size_t name_len, int type)
+{
+ const char *xname;
+ size_t size;
+
+ if (!IS_POSIXACL(dentry->d_inode))
+ return -EOPNOTSUPP;
+ if (S_ISLNK(dentry->d_inode->i_mode))
+ return -EOPNOTSUPP;
+
+ if (type == ACL_TYPE_ACCESS)
+ xname = POSIX_ACL_XATTR_ACCESS;
+ else
+ xname = POSIX_ACL_XATTR_DEFAULT;
+
+ size = strlen(xname) + 1;
+ if (list && size <= list_size)
+ memcpy(list, xname, size);
+ return size;
+}
+
+const struct xattr_handler posix_acl_access_xattr_handler = {
+ .prefix = POSIX_ACL_XATTR_ACCESS,
+ .flags = ACL_TYPE_ACCESS,
+ .list = posix_acl_xattr_list,
+ .get = posix_acl_xattr_get,
+ .set = posix_acl_xattr_set,
+};
+EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler);
+
+const struct xattr_handler posix_acl_default_xattr_handler = {
+ .prefix = POSIX_ACL_XATTR_DEFAULT,
+ .flags = ACL_TYPE_DEFAULT,
+ .list = posix_acl_xattr_list,
+ .get = posix_acl_xattr_get,
+ .set = posix_acl_xattr_set,
+};
+EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler);
+
+int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+ int error;
+
+ if (type == ACL_TYPE_ACCESS) {
+ error = posix_acl_equiv_mode(acl, &inode->i_mode);
+ if (error < 0)
+ return 0;
+ if (error == 0)
+ acl = NULL;
+ }
+
+ inode->i_ctime = CURRENT_TIME;
+ set_cached_acl(inode, type, acl);
+ return 0;
+}
+
+int simple_acl_create(struct inode *dir, struct inode *inode)
+{
+ struct posix_acl *default_acl, *acl;
+ int error;
+
+ error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (error)
+ return error;
+
+ set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl);
+ set_cached_acl(inode, ACL_TYPE_ACCESS, acl);
+
+ if (default_acl)
+ posix_acl_release(default_acl);
+ if (acl)
+ posix_acl_release(acl);
+ return 0;
+}
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 15af6222f8a..2183fcf41d5 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -31,6 +31,10 @@ config PROC_FS
config PROC_KCORE
bool "/proc/kcore support" if !ARM
depends on PROC_FS && MMU
+ help
+ Provides a virtual ELF core file of the live kernel. This can
+ be read with gdb and other ELF tools. No modifications can be
+ made using this mechanism.
config PROC_VMCORE
bool "/proc/vmcore support"
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index ab30716584f..239493ec718 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -27,6 +27,5 @@ proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
proc-$(CONFIG_NET) += proc_net.o
proc-$(CONFIG_PROC_KCORE) += kcore.o
proc-$(CONFIG_PROC_VMCORE) += vmcore.o
-proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o
proc-$(CONFIG_PRINTK) += kmsg.o
proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index cbd0f1b324b..64db2bceac5 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -138,26 +138,17 @@ static const char * const task_state_array[] = {
"D (disk sleep)", /* 2 */
"T (stopped)", /* 4 */
"t (tracing stop)", /* 8 */
- "Z (zombie)", /* 16 */
- "X (dead)", /* 32 */
- "x (dead)", /* 64 */
- "K (wakekill)", /* 128 */
- "W (waking)", /* 256 */
- "P (parked)", /* 512 */
+ "X (dead)", /* 16 */
+ "Z (zombie)", /* 32 */
};
static inline const char *get_task_state(struct task_struct *tsk)
{
- unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state;
- const char * const *p = &task_state_array[0];
+ unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT;
- BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array));
+ BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1);
- while (state) {
- p++;
- state >>= 1;
- }
- return *p;
+ return task_state_array[fls(state)];
}
static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
@@ -183,6 +174,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
seq_printf(m,
"State:\t%s\n"
"Tgid:\t%d\n"
+ "Ngid:\t%d\n"
"Pid:\t%d\n"
"PPid:\t%d\n"
"TracerPid:\t%d\n"
@@ -190,6 +182,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
"Gid:\t%d\t%d\t%d\t%d\n",
get_task_state(p),
task_tgid_nr_ns(p, ns),
+ task_numa_group_id(p),
pid_nr_ns(pid, ns),
ppid, tpid,
from_kuid_munged(user_ns, cred->uid),
@@ -451,8 +444,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
min_flt += t->min_flt;
maj_flt += t->maj_flt;
gtime += task_gtime(t);
- t = next_thread(t);
- } while (t != task);
+ } while_each_thread(task, t);
min_flt += sig->min_flt;
maj_flt += sig->maj_flt;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1485e38daaa..2d696b0c93b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -200,41 +200,9 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
return result;
}
-static int proc_pid_cmdline(struct task_struct *task, char * buffer)
+static int proc_pid_cmdline(struct task_struct *task, char *buffer)
{
- int res = 0;
- unsigned int len;
- struct mm_struct *mm = get_task_mm(task);
- if (!mm)
- goto out;
- if (!mm->arg_end)
- goto out_mm; /* Shh! No looking before we're done */
-
- len = mm->arg_end - mm->arg_start;
-
- if (len > PAGE_SIZE)
- len = PAGE_SIZE;
-
- res = access_process_vm(task, mm->arg_start, buffer, len, 0);
-
- // If the nul at the end of args has been overwritten, then
- // assume application is using setproctitle(3).
- if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) {
- len = strnlen(buffer, res);
- if (len < res) {
- res = len;
- } else {
- len = mm->env_end - mm->env_start;
- if (len > PAGE_SIZE - res)
- len = PAGE_SIZE - res;
- res += access_process_vm(task, mm->env_start, buffer+res, len, 0);
- res = strnlen(buffer, res);
- }
- }
-out_mm:
- mmput(mm);
-out:
- return res;
+ return get_cmdline(task, buffer, PAGE_SIZE);
}
static int proc_pid_auxv(struct task_struct *task, char *buffer)
@@ -1151,10 +1119,16 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
goto out_free_page;
}
- kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
- if (!uid_valid(kloginuid)) {
- length = -EINVAL;
- goto out_free_page;
+
+ /* is userspace tring to explicitly UNSET the loginuid? */
+ if (loginuid == AUDIT_UID_UNSET) {
+ kloginuid = INVALID_UID;
+ } else {
+ kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
+ if (!uid_valid(kloginuid)) {
+ length = -EINVAL;
+ goto out_free_page;
+ }
}
length = audit_set_loginuid(kloginuid);
@@ -1230,6 +1204,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
if (*end)
return -EINVAL;
+ if (make_it_fail < 0 || make_it_fail > 1)
+ return -EINVAL;
+
task = get_proc_task(file_inode(file));
if (!task)
return -ESRCH;
@@ -1652,13 +1629,18 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
return 0;
}
+static inline bool proc_inode_is_dead(struct inode *inode)
+{
+ return !proc_pid(inode)->tasks[PIDTYPE_PID].first;
+}
+
int pid_delete_dentry(const struct dentry *dentry)
{
/* Is the task we represent dead?
* If so, then don't put the dentry on the lru list,
* kill it immediately.
*/
- return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
+ return proc_inode_is_dead(dentry->d_inode);
}
const struct dentry_operations pid_dentry_operations =
@@ -1813,6 +1795,7 @@ static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
if (rc)
goto out_mmput;
+ rc = -ENOENT;
down_read(&mm->mmap_sem);
vma = find_exact_vma(mm, vm_start, vm_end);
if (vma && vma->vm_file) {
@@ -2576,7 +2559,7 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("environ", S_IRUSR, proc_environ_operations),
INF("auxv", S_IRUSR, proc_pid_auxv),
ONE("status", S_IRUGO, proc_pid_status),
- ONE("personality", S_IRUGO, proc_pid_personality),
+ ONE("personality", S_IRUSR, proc_pid_personality),
INF("limits", S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
@@ -2586,7 +2569,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#endif
REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
- INF("syscall", S_IRUGO, proc_pid_syscall),
+ INF("syscall", S_IRUSR, proc_pid_syscall),
#endif
INF("cmdline", S_IRUGO, proc_pid_cmdline),
ONE("stat", S_IRUGO, proc_tgid_stat),
@@ -2605,7 +2588,7 @@ static const struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
REG("smaps", S_IRUGO, proc_pid_smaps_operations),
- REG("pagemap", S_IRUGO, proc_pagemap_operations),
+ REG("pagemap", S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -2614,7 +2597,7 @@ static const struct pid_entry tgid_base_stuff[] = {
INF("wchan", S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
- ONE("stack", S_IRUGO, proc_pid_stack),
+ ONE("stack", S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHEDSTATS
INF("schedstat", S_IRUGO, proc_pid_schedstat),
@@ -2915,14 +2898,14 @@ static const struct pid_entry tid_base_stuff[] = {
REG("environ", S_IRUSR, proc_environ_operations),
INF("auxv", S_IRUSR, proc_pid_auxv),
ONE("status", S_IRUGO, proc_pid_status),
- ONE("personality", S_IRUGO, proc_pid_personality),
+ ONE("personality", S_IRUSR, proc_pid_personality),
INF("limits", S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
- INF("syscall", S_IRUGO, proc_pid_syscall),
+ INF("syscall", S_IRUSR, proc_pid_syscall),
#endif
INF("cmdline", S_IRUGO, proc_pid_cmdline),
ONE("stat", S_IRUGO, proc_tid_stat),
@@ -2943,7 +2926,7 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_PROC_PAGE_MONITOR
REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
REG("smaps", S_IRUGO, proc_tid_smaps_operations),
- REG("pagemap", S_IRUGO, proc_pagemap_operations),
+ REG("pagemap", S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -2952,7 +2935,7 @@ static const struct pid_entry tid_base_stuff[] = {
INF("wchan", S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
- ONE("stack", S_IRUGO, proc_pid_stack),
+ ONE("stack", S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHEDSTATS
INF("schedstat", S_IRUGO, proc_pid_schedstat),
@@ -3086,34 +3069,42 @@ out_no_task:
* In the case of a seek we start with the leader and walk nr
* threads past it.
*/
-static struct task_struct *first_tid(struct task_struct *leader,
- int tid, int nr, struct pid_namespace *ns)
+static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
+ struct pid_namespace *ns)
{
- struct task_struct *pos;
+ struct task_struct *pos, *task;
+ unsigned long nr = f_pos;
+
+ if (nr != f_pos) /* 32bit overflow? */
+ return NULL;
rcu_read_lock();
- /* Attempt to start with the pid of a thread */
- if (tid && (nr > 0)) {
+ task = pid_task(pid, PIDTYPE_PID);
+ if (!task)
+ goto fail;
+
+ /* Attempt to start with the tid of a thread */
+ if (tid && nr) {
pos = find_task_by_pid_ns(tid, ns);
- if (pos && (pos->group_leader == leader))
+ if (pos && same_thread_group(pos, task))
goto found;
}
/* If nr exceeds the number of threads there is nothing todo */
- pos = NULL;
- if (nr && nr >= get_nr_threads(leader))
- goto out;
+ if (nr >= get_nr_threads(task))
+ goto fail;
/* If we haven't found our starting place yet start
* with the leader and walk nr threads forward.
*/
- for (pos = leader; nr > 0; --nr) {
- pos = next_thread(pos);
- if (pos == leader) {
- pos = NULL;
- goto out;
- }
- }
+ pos = task = task->group_leader;
+ do {
+ if (!nr--)
+ goto found;
+ } while_each_thread(task, pos);
+fail:
+ pos = NULL;
+ goto out;
found:
get_task_struct(pos);
out:
@@ -3146,25 +3137,16 @@ static struct task_struct *next_tid(struct task_struct *start)
/* for the /proc/TGID/task/ directories */
static int proc_task_readdir(struct file *file, struct dir_context *ctx)
{
- struct task_struct *leader = NULL;
- struct task_struct *task = get_proc_task(file_inode(file));
+ struct inode *inode = file_inode(file);
+ struct task_struct *task;
struct pid_namespace *ns;
int tid;
- if (!task)
- return -ENOENT;
- rcu_read_lock();
- if (pid_alive(task)) {
- leader = task->group_leader;
- get_task_struct(leader);
- }
- rcu_read_unlock();
- put_task_struct(task);
- if (!leader)
+ if (proc_inode_is_dead(inode))
return -ENOENT;
if (!dir_emit_dots(file, ctx))
- goto out;
+ return 0;
/* f_version caches the tgid value that the last readdir call couldn't
* return. lseek aka telldir automagically resets f_version to 0.
@@ -3172,7 +3154,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
ns = file->f_dentry->d_sb->s_fs_info;
tid = (int)file->f_version;
file->f_version = 0;
- for (task = first_tid(leader, tid, ctx->pos - 2, ns);
+ for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
task;
task = next_tid(task), ctx->pos++) {
char name[PROC_NUMBUF];
@@ -3188,8 +3170,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
break;
}
}
-out:
- put_task_struct(leader);
+
return 0;
}
diff --git a/fs/proc/cmdline.c b/fs/proc/cmdline.c
index 82676e3fcd1..cbd82dff7e8 100644
--- a/fs/proc/cmdline.c
+++ b/fs/proc/cmdline.c
@@ -26,4 +26,4 @@ static int __init proc_cmdline_init(void)
proc_create("cmdline", 0, NULL, &cmdline_proc_fops);
return 0;
}
-module_init(proc_cmdline_init);
+fs_initcall(proc_cmdline_init);
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index b701eaa482b..290ba85cb90 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -29,7 +29,6 @@ static int show_console_dev(struct seq_file *m, void *v)
char flags[ARRAY_SIZE(con_flags) + 1];
struct console *con = v;
unsigned int a;
- int len;
dev_t dev = 0;
if (con->device) {
@@ -47,11 +46,10 @@ static int show_console_dev(struct seq_file *m, void *v)
con_flags[a].name : ' ';
flags[a] = 0;
- seq_printf(m, "%s%d%n", con->name, con->index, &len);
- len = 21 - len;
- if (len < 1)
- len = 1;
- seq_printf(m, "%*c%c%c%c (%s)", len, ' ', con->read ? 'R' : '-',
+ seq_setwidth(m, 21 - 1);
+ seq_printf(m, "%s%d", con->name, con->index);
+ seq_pad(m, ' ');
+ seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-',
con->write ? 'W' : '-', con->unblank ? 'U' : '-',
flags);
if (dev)
@@ -111,4 +109,4 @@ static int __init proc_consoles_init(void)
proc_create("consoles", 0, NULL, &proc_consoles_operations);
return 0;
}
-module_init(proc_consoles_init);
+fs_initcall(proc_consoles_init);
diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c
index 5a1e539a234..06f4d31e039 100644
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -21,4 +21,4 @@ static int __init proc_cpuinfo_init(void)
proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
return 0;
}
-module_init(proc_cpuinfo_init);
+fs_initcall(proc_cpuinfo_init);
diff --git a/fs/proc/devices.c b/fs/proc/devices.c
index b14347167c3..50493edc30e 100644
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -67,4 +67,4 @@ static int __init proc_devices_init(void)
proc_create("devices", 0, NULL, &proc_devinfo_operations);
return 0;
}
-module_init(proc_devices_init);
+fs_initcall(proc_devices_init);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 985ea881b5b..0788d093f5d 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -11,6 +11,7 @@
#include <linux/proc_fs.h>
+#include "../mount.h"
#include "internal.h"
#include "fd.h"
@@ -48,8 +49,9 @@ static int seq_show(struct seq_file *m, void *v)
}
if (!ret) {
- seq_printf(m, "pos:\t%lli\nflags:\t0%o\n",
- (long long)file->f_pos, f_flags);
+ seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n",
+ (long long)file->f_pos, f_flags,
+ real_mount(file->f_path.mnt)->mnt_id);
if (file->f_op->show_fdinfo)
ret = file->f_op->show_fdinfo(m, file);
fput(file);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 737e15615b0..b7f268eb5f4 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -49,8 +49,7 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
setattr_copy(inode, iattr);
mark_inode_dirty(inode);
- de->uid = inode->i_uid;
- de->gid = inode->i_gid;
+ proc_set_user(de, inode->i_uid, inode->i_gid);
de->mode = inode->i_mode;
return 0;
}
@@ -175,22 +174,6 @@ static const struct inode_operations proc_link_inode_operations = {
};
/*
- * As some entries in /proc are volatile, we want to
- * get rid of unused dentries. This could be made
- * smarter: we could keep a "volatile" flag in the
- * inode to indicate which ones to keep.
- */
-static int proc_delete_dentry(const struct dentry * dentry)
-{
- return 1;
-}
-
-static const struct dentry_operations proc_dentry_operations =
-{
- .d_delete = proc_delete_dentry,
-};
-
-/*
* Don't create negative dentries here, return -ENOENT by hand
* instead.
*/
@@ -209,7 +192,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
inode = proc_get_inode(dir->i_sb, de);
if (!inode)
return ERR_PTR(-ENOMEM);
- d_set_d_op(dentry, &proc_dentry_operations);
+ d_set_d_op(dentry, &simple_dentry_operations);
d_add(dentry, inode);
return NULL;
}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 9f8ef9b7674..0adbc02d60e 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -35,7 +35,7 @@ static void proc_evict_inode(struct inode *inode)
const struct proc_ns_operations *ns_ops;
void *ns;
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
/* Stop tracking associated processes */
@@ -47,7 +47,7 @@ static void proc_evict_inode(struct inode *inode)
pde_put(de);
head = PROC_I(inode)->sysctl;
if (head) {
- rcu_assign_pointer(PROC_I(inode)->sysctl, NULL);
+ RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
sysctl_head_put(head);
}
/* Release any associated namespace */
@@ -285,15 +285,27 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma)
return rv;
}
-static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags)
+static unsigned long
+proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
{
struct proc_dir_entry *pde = PDE(file_inode(file));
- int rv = -EIO;
- unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+ unsigned long rv = -EIO;
+
if (use_pde(pde)) {
- get_unmapped_area = pde->proc_fops->get_unmapped_area;
- if (get_unmapped_area)
- rv = get_unmapped_area(file, orig_addr, len, pgoff, flags);
+ typeof(proc_reg_get_unmapped_area) *get_area;
+
+ get_area = pde->proc_fops->get_unmapped_area;
+#ifdef CONFIG_MMU
+ if (!get_area)
+ get_area = current->mm->get_unmapped_area;
+#endif
+
+ if (get_area)
+ rv = get_area(file, orig_addr, len, pgoff, flags);
+ else
+ rv = orig_addr;
unuse_pde(pde);
}
return rv;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 651d09a11dd..3ab6d14e71c 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -211,13 +211,6 @@ extern int proc_fill_super(struct super_block *);
extern void proc_entry_rundown(struct proc_dir_entry *);
/*
- * proc_devtree.c
- */
-#ifdef CONFIG_PROC_DEVICETREE
-extern void proc_device_tree_init(void);
-#endif
-
-/*
* proc_namespaces.c
*/
extern const struct inode_operations proc_ns_dir_inode_operations;
diff --git a/fs/proc/interrupts.c b/fs/proc/interrupts.c
index 05029c0e2f2..a352d5703b4 100644
--- a/fs/proc/interrupts.c
+++ b/fs/proc/interrupts.c
@@ -50,4 +50,4 @@ static int __init proc_interrupts_init(void)
proc_create("interrupts", 0, NULL, &proc_interrupts_operations);
return 0;
}
-module_init(proc_interrupts_init);
+fs_initcall(proc_interrupts_init);
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 06ea155e1a5..39e6ef32f0b 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -255,8 +255,7 @@ static int kcore_update_ram(void)
end_pfn = 0;
for_each_node_state(nid, N_MEMORY) {
unsigned long node_end;
- node_end = NODE_DATA(nid)->node_start_pfn +
- NODE_DATA(nid)->node_spanned_pages;
+ node_end = node_end_pfn(nid);
if (end_pfn < node_end)
end_pfn = node_end;
}
@@ -640,4 +639,4 @@ static int __init proc_kcore_init(void)
return 0;
}
-module_init(proc_kcore_init);
+fs_initcall(proc_kcore_init);
diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c
index bdfabdaefdc..05f8dcdb086 100644
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -61,4 +61,4 @@ static int __init proc_kmsg_init(void)
proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
return 0;
}
-module_init(proc_kmsg_init);
+fs_initcall(proc_kmsg_init);
diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 1afa4dd4cae..aec66e6c206 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -42,4 +42,4 @@ static int __init proc_loadavg_init(void)
proc_create("loadavg", 0, NULL, &loadavg_proc_fops);
return 0;
}
-module_init(proc_loadavg_init);
+fs_initcall(proc_loadavg_init);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 59d85d60889..7445af0b1aa 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -1,8 +1,8 @@
#include <linux/fs.h>
-#include <linux/hugetlb.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/mmzone.h>
#include <linux/proc_fs.h>
@@ -24,10 +24,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
{
struct sysinfo i;
unsigned long committed;
- unsigned long allowed;
struct vmalloc_info vmi;
long cached;
+ long available;
+ unsigned long pagecache;
+ unsigned long wmark_low = 0;
unsigned long pages[NR_LRU_LISTS];
+ struct zone *zone;
int lru;
/*
@@ -37,8 +40,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
si_meminfo(&i);
si_swapinfo(&i);
committed = percpu_counter_read_positive(&vm_committed_as);
- allowed = ((totalram_pages - hugetlb_total_pages())
- * sysctl_overcommit_ratio / 100) + total_swap_pages;
cached = global_page_state(NR_FILE_PAGES) -
total_swapcache_pages() - i.bufferram;
@@ -50,12 +51,44 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
pages[lru] = global_page_state(NR_LRU_BASE + lru);
+ for_each_zone(zone)
+ wmark_low += zone->watermark[WMARK_LOW];
+
+ /*
+ * Estimate the amount of memory available for userspace allocations,
+ * without causing swapping.
+ *
+ * Free memory cannot be taken below the low watermark, before the
+ * system starts swapping.
+ */
+ available = i.freeram - wmark_low;
+
+ /*
+ * Not all the page cache can be freed, otherwise the system will
+ * start swapping. Assume at least half of the page cache, or the
+ * low watermark worth of cache, needs to stay.
+ */
+ pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
+ pagecache -= min(pagecache / 2, wmark_low);
+ available += pagecache;
+
+ /*
+ * Part of the reclaimable slab consists of items that are in use,
+ * and cannot be freed. Cap this estimate at the low watermark.
+ */
+ available += global_page_state(NR_SLAB_RECLAIMABLE) -
+ min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
+
+ if (available < 0)
+ available = 0;
+
/*
* Tagged format, for easy grepping and expansion.
*/
seq_printf(m,
"MemTotal: %8lu kB\n"
"MemFree: %8lu kB\n"
+ "MemAvailable: %8lu kB\n"
"Buffers: %8lu kB\n"
"Cached: %8lu kB\n"
"SwapCached: %8lu kB\n"
@@ -108,6 +141,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
,
K(i.totalram),
K(i.freeram),
+ K(available),
K(i.bufferram),
K(cached),
K(total_swapcache_pages()),
@@ -147,7 +181,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
K(global_page_state(NR_UNSTABLE_NFS)),
K(global_page_state(NR_BOUNCE)),
K(global_page_state(NR_WRITEBACK_TEMP)),
- K(allowed),
+ K(vm_commit_limit()),
K(committed),
(unsigned long)VMALLOC_TOTAL >> 10,
vmi.used >> 10,
@@ -186,4 +220,4 @@ static int __init proc_meminfo_init(void)
proc_create("meminfo", 0, NULL, &meminfo_proc_fops);
return 0;
}
-module_init(proc_meminfo_init);
+fs_initcall(proc_meminfo_init);
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 49a7fff2e83..89026095f2b 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -42,12 +42,6 @@ static const struct inode_operations ns_inode_operations = {
.setattr = proc_setattr,
};
-static int ns_delete_dentry(const struct dentry *dentry)
-{
- /* Don't cache namespace inodes when not in use */
- return 1;
-}
-
static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
{
struct inode *inode = dentry->d_inode;
@@ -59,7 +53,7 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
const struct dentry_operations ns_dentry_operations =
{
- .d_delete = ns_delete_dentry,
+ .d_delete = always_delete_dentry,
.d_dname = ns_dname,
};
@@ -152,7 +146,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
struct task_struct *task;
void *ns;
char name[50];
- int len = -EACCES;
+ int res = -EACCES;
task = get_proc_task(inode);
if (!task)
@@ -161,24 +155,18 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto out_put_task;
- len = -ENOENT;
+ res = -ENOENT;
ns = ns_ops->get(task);
if (!ns)
goto out_put_task;
snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns));
- len = strlen(name);
-
- if (len > buflen)
- len = buflen;
- if (copy_to_user(buffer, name, len))
- len = -EFAULT;
-
+ res = readlink_copy(buffer, buflen, name);
ns_ops->put(ns);
out_put_task:
put_task_struct(task);
out:
- return len;
+ return res;
}
static const struct inode_operations proc_ns_link_inode_operations = {
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index ccfd99bd1c5..d4a35746cab 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -39,7 +39,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
unsigned long ino = 0;
struct file *file;
dev_t dev = 0;
- int flags, len;
+ int flags;
flags = region->vm_flags;
file = region->vm_file;
@@ -50,8 +50,9 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
ino = inode->i_ino;
}
+ seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
seq_printf(m,
- "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
+ "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
region->vm_start,
region->vm_end,
flags & VM_READ ? 'r' : '-',
@@ -59,13 +60,10 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
flags & VM_EXEC ? 'x' : '-',
flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
((loff_t)region->vm_pgoff) << PAGE_SHIFT,
- MAJOR(dev), MINOR(dev), ino, &len);
+ MAJOR(dev), MINOR(dev), ino);
if (file) {
- len = 25 + sizeof(void *) * 6 - len;
- if (len < 1)
- len = 1;
- seq_printf(m, "%*c", len, ' ');
+ seq_pad(m, ' ');
seq_path(m, &file->f_path, "");
}
@@ -133,4 +131,4 @@ static int __init proc_nommu_init(void)
return 0;
}
-module_init(proc_nommu_init);
+fs_initcall(proc_nommu_init);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index b8730d9ebae..e647c55275d 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -118,10 +118,11 @@ u64 stable_page_flags(struct page *page)
/*
* PageTransCompound can be true for non-huge compound pages (slab
* pages or pages allocated by drivers with __GFP_COMP) because it
- * just checks PG_head/PG_tail, so we need to check PageLRU to make
- * sure a given page is a thp, not a non-huge compound page.
+ * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
+ * to make sure a given page is a thp, not a non-huge compound page.
*/
- else if (PageTransCompound(page) && PageLRU(compound_trans_head(page)))
+ else if (PageTransCompound(page) && (PageLRU(compound_head(page)) ||
+ PageAnon(compound_head(page))))
u |= 1 << KPF_THP;
/*
@@ -217,4 +218,4 @@ static int __init proc_page_init(void)
proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
return 0;
}
-module_init(proc_page_init);
+fs_initcall(proc_page_init);
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
deleted file mode 100644
index 106a8357063..00000000000
--- a/fs/proc/proc_devtree.c
+++ /dev/null
@@ -1,243 +0,0 @@
-/*
- * proc_devtree.c - handles /proc/device-tree
- *
- * Copyright 1997 Paul Mackerras
- */
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/time.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/printk.h>
-#include <linux/stat.h>
-#include <linux/string.h>
-#include <linux/of.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-#include <asm/prom.h>
-#include <asm/uaccess.h>
-#include "internal.h"
-
-static inline void set_node_proc_entry(struct device_node *np,
- struct proc_dir_entry *de)
-{
-#ifdef HAVE_ARCH_DEVTREE_FIXUPS
- np->pde = de;
-#endif
-}
-
-static struct proc_dir_entry *proc_device_tree;
-
-/*
- * Supply data on a read from /proc/device-tree/node/property.
- */
-static int property_proc_show(struct seq_file *m, void *v)
-{
- struct property *pp = m->private;
-
- seq_write(m, pp->value, pp->length);
- return 0;
-}
-
-static int property_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, property_proc_show, __PDE_DATA(inode));
-}
-
-static const struct file_operations property_proc_fops = {
- .owner = THIS_MODULE,
- .open = property_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-/*
- * For a node with a name like "gc@10", we make symlinks called "gc"
- * and "@10" to it.
- */
-
-/*
- * Add a property to a node
- */
-static struct proc_dir_entry *
-__proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp,
- const char *name)
-{
- struct proc_dir_entry *ent;
-
- /*
- * Unfortunately proc_register puts each new entry
- * at the beginning of the list. So we rearrange them.
- */
- ent = proc_create_data(name,
- strncmp(name, "security-", 9) ? S_IRUGO : S_IRUSR,
- de, &property_proc_fops, pp);
- if (ent == NULL)
- return NULL;
-
- if (!strncmp(name, "security-", 9))
- ent->size = 0; /* don't leak number of password chars */
- else
- ent->size = pp->length;
-
- return ent;
-}
-
-
-void proc_device_tree_add_prop(struct proc_dir_entry *pde, struct property *prop)
-{
- __proc_device_tree_add_prop(pde, prop, prop->name);
-}
-
-void proc_device_tree_remove_prop(struct proc_dir_entry *pde,
- struct property *prop)
-{
- remove_proc_entry(prop->name, pde);
-}
-
-void proc_device_tree_update_prop(struct proc_dir_entry *pde,
- struct property *newprop,
- struct property *oldprop)
-{
- struct proc_dir_entry *ent;
-
- if (!oldprop) {
- proc_device_tree_add_prop(pde, newprop);
- return;
- }
-
- for (ent = pde->subdir; ent != NULL; ent = ent->next)
- if (ent->data == oldprop)
- break;
- if (ent == NULL) {
- pr_warn("device-tree: property \"%s\" does not exist\n",
- oldprop->name);
- } else {
- ent->data = newprop;
- ent->size = newprop->length;
- }
-}
-
-/*
- * Various dodgy firmware might give us nodes and/or properties with
- * conflicting names. That's generally ok, except for exporting via /proc,
- * so munge names here to ensure they're unique.
- */
-
-static int duplicate_name(struct proc_dir_entry *de, const char *name)
-{
- struct proc_dir_entry *ent;
- int found = 0;
-
- spin_lock(&proc_subdir_lock);
-
- for (ent = de->subdir; ent != NULL; ent = ent->next) {
- if (strcmp(ent->name, name) == 0) {
- found = 1;
- break;
- }
- }
-
- spin_unlock(&proc_subdir_lock);
-
- return found;
-}
-
-static const char *fixup_name(struct device_node *np, struct proc_dir_entry *de,
- const char *name)
-{
- char *fixed_name;
- int fixup_len = strlen(name) + 2 + 1; /* name + #x + \0 */
- int i = 1, size;
-
-realloc:
- fixed_name = kmalloc(fixup_len, GFP_KERNEL);
- if (fixed_name == NULL) {
- pr_err("device-tree: Out of memory trying to fixup "
- "name \"%s\"\n", name);
- return name;
- }
-
-retry:
- size = snprintf(fixed_name, fixup_len, "%s#%d", name, i);
- size++; /* account for NULL */
-
- if (size > fixup_len) {
- /* We ran out of space, free and reallocate. */
- kfree(fixed_name);
- fixup_len = size;
- goto realloc;
- }
-
- if (duplicate_name(de, fixed_name)) {
- /* Multiple duplicates. Retry with a different offset. */
- i++;
- goto retry;
- }
-
- pr_warn("device-tree: Duplicate name in %s, renamed to \"%s\"\n",
- np->full_name, fixed_name);
-
- return fixed_name;
-}
-
-/*
- * Process a node, adding entries for its children and its properties.
- */
-void proc_device_tree_add_node(struct device_node *np,
- struct proc_dir_entry *de)
-{
- struct property *pp;
- struct proc_dir_entry *ent;
- struct device_node *child;
- const char *p;
-
- set_node_proc_entry(np, de);
- for (child = NULL; (child = of_get_next_child(np, child));) {
- /* Use everything after the last slash, or the full name */
- p = kbasename(child->full_name);
-
- if (duplicate_name(de, p))
- p = fixup_name(np, de, p);
-
- ent = proc_mkdir(p, de);
- if (ent == NULL)
- break;
- proc_device_tree_add_node(child, ent);
- }
- of_node_put(child);
-
- for (pp = np->properties; pp != NULL; pp = pp->next) {
- p = pp->name;
-
- if (strchr(p, '/'))
- continue;
-
- if (duplicate_name(de, p))
- p = fixup_name(np, de, p);
-
- ent = __proc_device_tree_add_prop(de, pp, p);
- if (ent == NULL)
- break;
- }
-}
-
-/*
- * Called on initialization to set up the /proc/device-tree subtree
- */
-void __init proc_device_tree_init(void)
-{
- struct device_node *root;
-
- proc_device_tree = proc_mkdir("device-tree", NULL);
- if (proc_device_tree == NULL)
- return;
- root = of_find_node_by_path("/");
- if (root == NULL) {
- pr_debug("/proc/device-tree: can't find root\n");
- return;
- }
- proc_device_tree_add_node(root, proc_device_tree);
- of_node_put(root);
-}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 87dbcbef7fe..5dbadecb234 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -92,6 +92,8 @@ static int proc_parse_options(char *options, struct pid_namespace *pid)
int proc_remount(struct super_block *sb, int *flags, char *data)
{
struct pid_namespace *pid = sb->s_fs_info;
+
+ sync_filesystem(sb);
return !proc_parse_options(data, pid);
}
@@ -183,9 +185,6 @@ void __init proc_root_init(void)
proc_mkdir("openprom", NULL);
#endif
proc_tty_init();
-#ifdef CONFIG_PROC_DEVICETREE
- proc_device_tree_init();
-#endif
proc_mkdir("bus", NULL);
proc_sys_init();
}
diff --git a/fs/proc/self.c b/fs/proc/self.c
index ffeb202ec94..4348bb8907c 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -16,7 +16,7 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
if (!tgid)
return -ENOENT;
sprintf(tmp, "%d", tgid);
- return vfs_readlink(dentry,buffer,buflen,tmp);
+ return readlink_copy(buffer, buflen, tmp);
}
static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index 62604be9f58..ad8a77f94be 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -41,4 +41,4 @@ static int __init proc_softirqs_init(void)
proc_create("softirqs", 0, NULL, &proc_softirqs_operations);
return 0;
}
-module_init(proc_softirqs_init);
+fs_initcall(proc_softirqs_init);
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 1cf86c0e868..bf2d03f8fd3 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -9,7 +9,7 @@
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/irqnr.h>
-#include <asm/cputime.h>
+#include <linux/cputime.h>
#include <linux/tick.h>
#ifndef arch_irq_stat_cpu
@@ -184,29 +184,11 @@ static int show_stat(struct seq_file *p, void *v)
static int stat_open(struct inode *inode, struct file *file)
{
- size_t size = 1024 + 128 * num_possible_cpus();
- char *buf;
- struct seq_file *m;
- int res;
+ size_t size = 1024 + 128 * num_online_cpus();
/* minimum size to display an interrupt count : 2 bytes */
size += 2 * nr_irqs;
-
- /* don't ask for more than the kmalloc() max size */
- if (size > KMALLOC_MAX_SIZE)
- size = KMALLOC_MAX_SIZE;
- buf = kmalloc(size, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- res = single_open(file, show_stat, NULL);
- if (!res) {
- m = file->private_data;
- m->buf = buf;
- m->size = ksize(buf);
- } else
- kfree(buf);
- return res;
+ return single_open_size(file, show_stat, NULL, size);
}
static const struct file_operations proc_stat_operations = {
@@ -221,4 +203,4 @@ static int __init proc_stat_init(void)
proc_create("stat", 0, NULL, &proc_stat_operations);
return 0;
}
-module_init(proc_stat_init);
+fs_initcall(proc_stat_init);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7366e9d63ce..cfa63ee92c9 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,4 +1,5 @@
#include <linux/mm.h>
+#include <linux/vmacache.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/mount.h>
@@ -62,7 +63,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
total_rss << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
- (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10,
+ (PTRS_PER_PTE * sizeof(pte_t) *
+ atomic_long_read(&mm->nr_ptes)) >> 10,
swap << (PAGE_SHIFT-10));
}
@@ -83,14 +85,6 @@ unsigned long task_statm(struct mm_struct *mm,
return mm->total_vm;
}
-static void pad_len_spaces(struct seq_file *m, int len)
-{
- len = 25 + sizeof(void*) * 6 - len;
- if (len < 1)
- len = 1;
- seq_printf(m, "%*c", len, ' ');
-}
-
#ifdef CONFIG_NUMA
/*
* These functions are for numa_maps but called in generic **maps seq_file
@@ -159,7 +153,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
/*
* We remember last_addr rather than next_addr to hit with
- * mmap_cache most of the time. We have zero last_addr at
+ * vmacache most of the time. We have zero last_addr at
* the beginning and also after lseek. We will have -1 last_addr
* after the end of the vmas.
*/
@@ -268,7 +262,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
unsigned long long pgoff = 0;
unsigned long start, end;
dev_t dev = 0;
- int len;
const char *name = NULL;
if (file) {
@@ -286,7 +279,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
if (stack_guard_page_end(vma, end))
end -= PAGE_SIZE;
- seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
+ seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
+ seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
start,
end,
flags & VM_READ ? 'r' : '-',
@@ -294,18 +288,24 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
flags & VM_EXEC ? 'x' : '-',
flags & VM_MAYSHARE ? 's' : 'p',
pgoff,
- MAJOR(dev), MINOR(dev), ino, &len);
+ MAJOR(dev), MINOR(dev), ino);
/*
* Print the dentry name for named mappings, and a
* special [heap] marker for the heap:
*/
if (file) {
- pad_len_spaces(m, len);
+ seq_pad(m, ' ');
seq_path(m, &file->f_path, "\n");
goto done;
}
+ if (vma->vm_ops && vma->vm_ops->name) {
+ name = vma->vm_ops->name(vma);
+ if (name)
+ goto done;
+ }
+
name = arch_vma_name(vma);
if (!name) {
pid_t tid;
@@ -333,7 +333,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
name = "[stack]";
} else {
/* Thread stack in /proc/PID/maps */
- pad_len_spaces(m, len);
+ seq_pad(m, ' ');
seq_printf(m, "[stack:%d]", tid);
}
}
@@ -341,7 +341,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
done:
if (name) {
- pad_len_spaces(m, len);
+ seq_pad(m, ' ');
seq_puts(m, name);
}
seq_putc(m, '\n');
@@ -505,9 +505,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk);
- spin_unlock(&walk->mm->page_table_lock);
+ spin_unlock(ptl);
mss->anonymous_thp += HPAGE_PMD_SIZE;
return 0;
}
@@ -561,6 +561,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_NONLINEAR)] = "nl",
[ilog2(VM_ARCH_1)] = "ar",
[ilog2(VM_DONTDUMP)] = "dd",
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ [ilog2(VM_SOFTDIRTY)] = "sd",
+#endif
[ilog2(VM_MIXEDMAP)] = "mm",
[ilog2(VM_HUGEPAGE)] = "hg",
[ilog2(VM_NOHUGEPAGE)] = "nh",
@@ -740,9 +743,6 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
ptent = pte_file_clear_soft_dirty(ptent);
}
- if (vma->vm_flags & VM_SOFTDIRTY)
- vma->vm_flags &= ~VM_SOFTDIRTY;
-
set_pte_at(vma->vm_mm, addr, pte, ptent);
#endif
}
@@ -810,8 +810,9 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
if (type == CLEAR_REFS_SOFT_DIRTY) {
soft_dirty_cleared = true;
- pr_warn_once("The pagemap bits 55-60 has changed their meaning! "
- "See the linux/Documentation/vm/pagemap.txt for details.\n");
+ pr_warn_once("The pagemap bits 55-60 has changed their meaning!"
+ " See the linux/Documentation/vm/pagemap.txt for "
+ "details.\n");
}
task = get_proc_task(file_inode(file));
@@ -842,11 +843,17 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
*
* Writing 3 to /proc/pid/clear_refs only affects file
* mapped pages.
+ *
+ * Writing 4 to /proc/pid/clear_refs affects all pages.
*/
if (type == CLEAR_REFS_ANON && vma->vm_file)
continue;
if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
continue;
+ if (type == CLEAR_REFS_SOFT_DIRTY) {
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ vma->vm_flags &= ~VM_SOFTDIRTY;
+ }
walk_page_range(vma->vm_start, vma->vm_end,
&clear_refs_walk);
}
@@ -941,6 +948,8 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
frame = pte_pfn(pte);
flags = PM_PRESENT;
page = vm_normal_page(vma, addr, pte);
+ if (pte_soft_dirty(pte))
+ flags2 |= __PM_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
swp_entry_t entry;
if (pte_swp_soft_dirty(pte))
@@ -960,7 +969,7 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
if (page && !PageAnon(page))
flags |= PM_FILE;
- if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte))
+ if ((vma->vm_flags & VM_SOFTDIRTY))
flags2 |= __PM_SOFT_DIRTY;
*pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
@@ -993,13 +1002,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
{
struct vm_area_struct *vma;
struct pagemapread *pm = walk->private;
+ spinlock_t *ptl;
pte_t *pte;
int err = 0;
pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
/* find the first VMA at or above 'addr' */
vma = find_vma(walk->mm, addr);
- if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
+ if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
int pmd_flags2;
if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
@@ -1017,7 +1027,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (err)
break;
}
- spin_unlock(&walk->mm->page_table_lock);
+ spin_unlock(ptl);
return err;
}
@@ -1319,7 +1329,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
md = walk->private;
- if (pmd_trans_huge_lock(pmd, md->vma) == 1) {
+ if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
pte_t huge_pte = *(pte_t *)pmd;
struct page *page;
@@ -1327,7 +1337,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
if (page)
gather_stats(page, md, pte_dirty(huge_pte),
HPAGE_PMD_SIZE/PAGE_SIZE);
- spin_unlock(&walk->mm->page_table_lock);
+ spin_unlock(ptl);
return 0;
}
@@ -1351,7 +1361,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
struct numa_maps *md;
struct page *page;
- if (pte_none(*pte))
+ if (!pte_present(*pte))
return 0;
page = pte_page(*pte);
@@ -1385,8 +1395,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
struct mm_struct *mm = vma->vm_mm;
struct mm_walk walk = {};
struct mempolicy *pol;
- int n;
- char buffer[50];
+ char buffer[64];
+ int nid;
if (!mm)
return 0;
@@ -1402,18 +1412,16 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
walk.mm = mm;
pol = get_vma_policy(task, vma, vma->vm_start);
- n = mpol_to_str(buffer, sizeof(buffer), pol);
+ mpol_to_str(buffer, sizeof(buffer), pol);
mpol_cond_put(pol);
- if (n < 0)
- return n;
seq_printf(m, "%08lx %s", vma->vm_start, buffer);
if (file) {
- seq_printf(m, " file=");
+ seq_puts(m, " file=");
seq_path(m, &file->f_path, "\n\t= ");
} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
- seq_printf(m, " heap");
+ seq_puts(m, " heap");
} else {
pid_t tid = vm_is_stack(task, vma, is_pid);
if (tid != 0) {
@@ -1423,14 +1431,14 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
*/
if (!is_pid || (vma->vm_start <= mm->start_stack &&
vma->vm_end >= mm->start_stack))
- seq_printf(m, " stack");
+ seq_puts(m, " stack");
else
seq_printf(m, " stack:%d", tid);
}
}
if (is_vm_hugetlb_page(vma))
- seq_printf(m, " huge");
+ seq_puts(m, " huge");
walk_page_range(vma->vm_start, vma->vm_end, &walk);
@@ -1458,9 +1466,9 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
if (md->writeback)
seq_printf(m, " writeback=%lu", md->writeback);
- for_each_node_state(n, N_MEMORY)
- if (md->node[n])
- seq_printf(m, " N%d=%lu", n, md->node[n]);
+ for_each_node_state(nid, N_MEMORY)
+ if (md->node[nid])
+ seq_printf(m, " N%d=%lu", nid, md->node[nid]);
out:
seq_putc(m, '\n');
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 56123a6f462..678455d2d68 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -123,14 +123,6 @@ unsigned long task_statm(struct mm_struct *mm,
return size;
}
-static void pad_len_spaces(struct seq_file *m, int len)
-{
- len = 25 + sizeof(void*) * 6 - len;
- if (len < 1)
- len = 1;
- seq_printf(m, "%*c", len, ' ');
-}
-
/*
* display a single VMA to a sequenced file
*/
@@ -142,7 +134,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
unsigned long ino = 0;
struct file *file;
dev_t dev = 0;
- int flags, len;
+ int flags;
unsigned long long pgoff = 0;
flags = vma->vm_flags;
@@ -155,8 +147,9 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
}
+ seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
seq_printf(m,
- "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
+ "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
vma->vm_start,
vma->vm_end,
flags & VM_READ ? 'r' : '-',
@@ -164,16 +157,16 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
flags & VM_EXEC ? 'x' : '-',
flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
pgoff,
- MAJOR(dev), MINOR(dev), ino, &len);
+ MAJOR(dev), MINOR(dev), ino);
if (file) {
- pad_len_spaces(m, len);
+ seq_pad(m, ' ');
seq_path(m, &file->f_path, "");
} else if (mm) {
pid_t tid = vm_is_stack(priv->task, vma, is_pid);
if (tid != 0) {
- pad_len_spaces(m, len);
+ seq_pad(m, ' ');
/*
* Thread stack in /proc/PID/task/TID/maps or
* the main process stack.
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 06189462590..33de567c25a 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -5,7 +5,7 @@
#include <linux/seq_file.h>
#include <linux/time.h>
#include <linux/kernel_stat.h>
-#include <asm/cputime.h>
+#include <linux/cputime.h>
static int uptime_proc_show(struct seq_file *m, void *v)
{
@@ -49,4 +49,4 @@ static int __init proc_uptime_init(void)
proc_create("uptime", 0, NULL, &uptime_proc_fops);
return 0;
}
-module_init(proc_uptime_init);
+fs_initcall(proc_uptime_init);
diff --git a/fs/proc/version.c b/fs/proc/version.c
index 76817a60678..d2154eb6d78 100644
--- a/fs/proc/version.c
+++ b/fs/proc/version.c
@@ -31,4 +31,4 @@ static int __init proc_version_init(void)
proc_create("version", 0, NULL, &version_proc_fops);
return 0;
}
-module_init(proc_version_init);
+fs_initcall(proc_version_init);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 9100d695988..382aa890e22 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -42,7 +42,7 @@ static size_t elfnotes_sz;
/* Total size of vmcore file. */
static u64 vmcore_size;
-static struct proc_dir_entry *proc_vmcore = NULL;
+static struct proc_dir_entry *proc_vmcore;
/*
* Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
@@ -468,17 +468,23 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
return rc;
}
nhdr_ptr = notes_section;
- while (real_sz < max_sz) {
- if (nhdr_ptr->n_namesz == 0)
- break;
+ while (nhdr_ptr->n_namesz != 0) {
sz = sizeof(Elf64_Nhdr) +
((nhdr_ptr->n_namesz + 3) & ~3) +
((nhdr_ptr->n_descsz + 3) & ~3);
+ if ((real_sz + sz) > max_sz) {
+ pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
+ nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
+ break;
+ }
real_sz += sz;
nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz);
}
kfree(notes_section);
phdr_ptr->p_memsz = real_sz;
+ if (real_sz == 0) {
+ pr_warn("Warning: Zero PT_NOTE entries found\n");
+ }
}
return 0;
@@ -648,17 +654,23 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
return rc;
}
nhdr_ptr = notes_section;
- while (real_sz < max_sz) {
- if (nhdr_ptr->n_namesz == 0)
- break;
+ while (nhdr_ptr->n_namesz != 0) {
sz = sizeof(Elf32_Nhdr) +
((nhdr_ptr->n_namesz + 3) & ~3) +
((nhdr_ptr->n_descsz + 3) & ~3);
+ if ((real_sz + sz) > max_sz) {
+ pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
+ nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
+ break;
+ }
real_sz += sz;
nhdr_ptr = (Elf32_Nhdr*)((char*)nhdr_ptr + sz);
}
kfree(notes_section);
phdr_ptr->p_memsz = real_sz;
+ if (real_sz == 0) {
+ pr_warn("Warning: Zero PT_NOTE entries found\n");
+ }
}
return 0;
@@ -1082,7 +1094,7 @@ static int __init vmcore_init(void)
proc_vmcore->size = vmcore_size;
return 0;
}
-module_init(vmcore_init)
+fs_initcall(vmcore_init);
/* Cleanup function for vmcore module. */
void vmcore_cleanup(void)
@@ -1104,4 +1116,3 @@ void vmcore_cleanup(void)
}
free_elfcorebuf();
}
-EXPORT_SYMBOL_GPL(vmcore_cleanup);
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 5fe34c355e8..1a81373947f 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -20,15 +20,15 @@ static unsigned mounts_poll(struct file *file, poll_table *wait)
struct proc_mounts *p = proc_mounts(file->private_data);
struct mnt_namespace *ns = p->ns;
unsigned res = POLLIN | POLLRDNORM;
+ int event;
poll_wait(file, &p->ns->poll, wait);
- br_read_lock(&vfsmount_lock);
- if (p->m.poll_event != ns->event) {
- p->m.poll_event = ns->event;
+ event = ACCESS_ONCE(ns->event);
+ if (p->m.poll_event != event) {
+ p->m.poll_event = event;
res |= POLLERR | POLLPRI;
}
- br_read_unlock(&vfsmount_lock);
return res;
}
@@ -234,17 +234,12 @@ static int mounts_open_common(struct inode *inode, struct file *file,
rcu_read_lock();
nsp = task_nsproxy(task);
- if (!nsp) {
+ if (!nsp || !nsp->mnt_ns) {
rcu_read_unlock();
put_task_struct(task);
goto err;
}
ns = nsp->mnt_ns;
- if (!ns) {
- rcu_read_unlock();
- put_task_struct(task);
- goto err;
- }
get_mnt_ns(ns);
rcu_read_unlock();
task_lock(task);
@@ -272,6 +267,7 @@ static int mounts_open_common(struct inode *inode, struct file *file,
p->root = root;
p->m.poll_event = ns->event;
p->show = show;
+ p->cached_event = ~0ULL;
return 0;
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 12823845d32..192297b0090 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -249,6 +249,7 @@ static void parse_options(char *options)
static int pstore_remount(struct super_block *sb, int *flags, char *data)
{
+ sync_filesystem(sb);
parse_options(data);
return 0;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index b8e93a40a5d..0a9b72cdfec 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -18,6 +18,8 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#define pr_fmt(fmt) "pstore: " fmt
+
#include <linux/atomic.h>
#include <linux/types.h>
#include <linux/errno.h>
@@ -224,14 +226,12 @@ static void allocate_buf_for_compression(void)
zlib_inflate_workspacesize());
stream.workspace = kmalloc(size, GFP_KERNEL);
if (!stream.workspace) {
- pr_err("pstore: No memory for compression workspace; "
- "skipping compression\n");
+ pr_err("No memory for compression workspace; skipping compression\n");
kfree(big_oops_buf);
big_oops_buf = NULL;
}
} else {
- pr_err("No memory for uncompressed data; "
- "skipping compression\n");
+ pr_err("No memory for uncompressed data; skipping compression\n");
stream.workspace = NULL;
}
@@ -443,8 +443,11 @@ int pstore_register(struct pstore_info *psi)
pstore_get_records(0);
kmsg_dump_register(&pstore_dumper);
- pstore_register_console();
- pstore_register_ftrace();
+
+ if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
+ pstore_register_console();
+ pstore_register_ftrace();
+ }
if (pstore_update_ms >= 0) {
pstore_timer.expires = jiffies +
@@ -452,8 +455,7 @@ int pstore_register(struct pstore_info *psi)
add_timer(&pstore_timer);
}
- pr_info("pstore: Registered %s as persistent store backend\n",
- psi->name);
+ pr_info("Registered %s as persistent store backend\n", psi->name);
return 0;
}
@@ -494,12 +496,13 @@ void pstore_get_records(int quiet)
big_oops_buf_sz);
if (unzipped_len > 0) {
+ kfree(buf);
buf = big_oops_buf;
size = unzipped_len;
compressed = false;
} else {
- pr_err("pstore: decompression failed;"
- "returned %d\n", unzipped_len);
+ pr_err("decompression failed;returned %d\n",
+ unzipped_len);
compressed = true;
}
}
@@ -520,8 +523,8 @@ out:
mutex_unlock(&psi->read_mutex);
if (failed)
- printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n",
- failed, psi->name);
+ pr_warn("failed to load %d record(s) from '%s'\n",
+ failed, psi->name);
}
static void pstore_dowork(struct work_struct *work)
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index fa8cef2cca3..3b5744306ed 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -86,6 +86,7 @@ struct ramoops_context {
struct persistent_ram_ecc_info ecc_info;
unsigned int max_dump_cnt;
unsigned int dump_write_cnt;
+ /* _read_cnt need clear on ramoops_pstore_open */
unsigned int dump_read_cnt;
unsigned int console_read_cnt;
unsigned int ftrace_read_cnt;
@@ -101,6 +102,7 @@ static int ramoops_pstore_open(struct pstore_info *psi)
cxt->dump_read_cnt = 0;
cxt->console_read_cnt = 0;
+ cxt->ftrace_read_cnt = 0;
return 0;
}
@@ -117,13 +119,15 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max,
return NULL;
prz = przs[i];
+ if (!prz)
+ return NULL;
- if (update) {
- /* Update old/shadowed buffer. */
+ /* Update old/shadowed buffer. */
+ if (update)
persistent_ram_save_old(prz);
- if (!persistent_ram_old_size(prz))
- return NULL;
- }
+
+ if (!persistent_ram_old_size(prz))
+ return NULL;
*typep = type;
*id = i;
@@ -316,6 +320,7 @@ static void ramoops_free_przs(struct ramoops_context *cxt)
{
int i;
+ cxt->max_dump_cnt = 0;
if (!cxt->przs)
return;
@@ -346,7 +351,7 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt,
GFP_KERNEL);
if (!cxt->przs) {
dev_err(dev, "failed to initialize a prz array for dumps\n");
- return -ENOMEM;
+ goto fail_prz;
}
for (i = 0; i < cxt->max_dump_cnt; i++) {
@@ -428,7 +433,6 @@ static int ramoops_probe(struct platform_device *pdev)
if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size))
pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
- cxt->dump_read_cnt = 0;
cxt->size = pdata->mem_size;
cxt->phys_addr = pdata->mem_address;
cxt->record_size = pdata->record_size;
@@ -505,7 +509,6 @@ fail_buf:
kfree(cxt->pstore.buf);
fail_clear:
cxt->pstore.bufsize = 0;
- cxt->max_dump_cnt = 0;
fail_cnt:
kfree(cxt->fprz);
fail_init_fprz:
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index de272d42676..34a1e5aa848 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -12,6 +12,8 @@
*
*/
+#define pr_fmt(fmt) "persistent_ram: " fmt
+
#include <linux/device.h>
#include <linux/err.h>
#include <linux/errno.h>
@@ -54,7 +56,7 @@ static size_t buffer_start_add_atomic(struct persistent_ram_zone *prz, size_t a)
do {
old = atomic_read(&prz->buffer->start);
new = old + a;
- while (unlikely(new > prz->buffer_size))
+ while (unlikely(new >= prz->buffer_size))
new -= prz->buffer_size;
} while (atomic_cmpxchg(&prz->buffer->start, old, new) != old);
@@ -91,7 +93,7 @@ static size_t buffer_start_add_locked(struct persistent_ram_zone *prz, size_t a)
old = atomic_read(&prz->buffer->start);
new = old + a;
- while (unlikely(new > prz->buffer_size))
+ while (unlikely(new >= prz->buffer_size))
new -= prz->buffer_size;
atomic_set(&prz->buffer->start, new);
@@ -205,12 +207,10 @@ static void persistent_ram_ecc_old(struct persistent_ram_zone *prz)
size = buffer->data + prz->buffer_size - block;
numerr = persistent_ram_decode_rs8(prz, block, size, par);
if (numerr > 0) {
- pr_devel("persistent_ram: error in block %p, %d\n",
- block, numerr);
+ pr_devel("error in block %p, %d\n", block, numerr);
prz->corrected_bytes += numerr;
} else if (numerr < 0) {
- pr_devel("persistent_ram: uncorrectable error in block %p\n",
- block);
+ pr_devel("uncorrectable error in block %p\n", block);
prz->bad_blocks++;
}
block += prz->ecc_info.block_size;
@@ -257,7 +257,7 @@ static int persistent_ram_init_ecc(struct persistent_ram_zone *prz,
prz->rs_decoder = init_rs(prz->ecc_info.symsize, prz->ecc_info.poly,
0, 1, prz->ecc_info.ecc_size);
if (prz->rs_decoder == NULL) {
- pr_info("persistent_ram: init_rs failed\n");
+ pr_info("init_rs failed\n");
return -EINVAL;
}
@@ -267,10 +267,10 @@ static int persistent_ram_init_ecc(struct persistent_ram_zone *prz,
numerr = persistent_ram_decode_rs8(prz, buffer, sizeof(*buffer),
prz->par_header);
if (numerr > 0) {
- pr_info("persistent_ram: error in header, %d\n", numerr);
+ pr_info("error in header, %d\n", numerr);
prz->corrected_bytes += numerr;
} else if (numerr < 0) {
- pr_info("persistent_ram: uncorrectable error in header\n");
+ pr_info("uncorrectable error in header\n");
prz->bad_blocks++;
}
@@ -317,7 +317,7 @@ void persistent_ram_save_old(struct persistent_ram_zone *prz)
prz->old_log = kmalloc(size, GFP_KERNEL);
}
if (!prz->old_log) {
- pr_err("persistent_ram: failed to allocate buffer\n");
+ pr_err("failed to allocate buffer\n");
return;
}
@@ -396,8 +396,8 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size)
pages = kmalloc(sizeof(struct page *) * page_count, GFP_KERNEL);
if (!pages) {
- pr_err("%s: Failed to allocate array for %u pages\n", __func__,
- page_count);
+ pr_err("%s: Failed to allocate array for %u pages\n",
+ __func__, page_count);
return NULL;
}
@@ -462,19 +462,17 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
if (prz->buffer->sig == sig) {
if (buffer_size(prz) > prz->buffer_size ||
buffer_start(prz) > buffer_size(prz))
- pr_info("persistent_ram: found existing invalid buffer,"
- " size %zu, start %zu\n",
- buffer_size(prz), buffer_start(prz));
+ pr_info("found existing invalid buffer, size %zu, start %zu\n",
+ buffer_size(prz), buffer_start(prz));
else {
- pr_debug("persistent_ram: found existing buffer,"
- " size %zu, start %zu\n",
- buffer_size(prz), buffer_start(prz));
+ pr_debug("found existing buffer, size %zu, start %zu\n",
+ buffer_size(prz), buffer_start(prz));
persistent_ram_save_old(prz);
return 0;
}
} else {
- pr_debug("persistent_ram: no valid data in buffer"
- " (sig = 0x%08x)\n", prz->buffer->sig);
+ pr_debug("no valid data in buffer (sig = 0x%08x)\n",
+ prz->buffer->sig);
}
prz->buffer->sig = sig;
@@ -509,7 +507,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
prz = kzalloc(sizeof(struct persistent_ram_zone), GFP_KERNEL);
if (!prz) {
- pr_err("persistent_ram: failed to allocate persistent ram zone\n");
+ pr_err("failed to allocate persistent ram zone\n");
goto err;
}
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 2e8caa62da7..c4bcb778886 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -27,7 +27,6 @@
static const struct super_operations qnx4_sops;
-static void qnx4_put_super(struct super_block *sb);
static struct inode *qnx4_alloc_inode(struct super_block *sb);
static void qnx4_destroy_inode(struct inode *inode);
static int qnx4_remount(struct super_block *sb, int *flags, char *data);
@@ -37,7 +36,6 @@ static const struct super_operations qnx4_sops =
{
.alloc_inode = qnx4_alloc_inode,
.destroy_inode = qnx4_destroy_inode,
- .put_super = qnx4_put_super,
.statfs = qnx4_statfs,
.remount_fs = qnx4_remount,
};
@@ -46,6 +44,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data)
{
struct qnx4_sb_info *qs;
+ sync_filesystem(sb);
qs = qnx4_sb(sb);
qs->Version = QNX4_VERSION;
*flags |= MS_RDONLY;
@@ -148,18 +147,19 @@ static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf)
* it really _is_ a qnx4 filesystem, and to check the size
* of the directory entry.
*/
-static const char *qnx4_checkroot(struct super_block *sb)
+static const char *qnx4_checkroot(struct super_block *sb,
+ struct qnx4_super_block *s)
{
struct buffer_head *bh;
struct qnx4_inode_entry *rootdir;
int rd, rl;
int i, j;
- if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/')
+ if (s->RootDir.di_fname[0] != '/' || s->RootDir.di_fname[1] != '\0')
return "no qnx4 filesystem (no root dir).";
QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id));
- rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1;
- rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size);
+ rd = le32_to_cpu(s->RootDir.di_first_xtnt.xtnt_blk) - 1;
+ rl = le32_to_cpu(s->RootDir.di_first_xtnt.xtnt_size);
for (j = 0; j < rl; j++) {
bh = sb_bread(sb, rd + j); /* root dir, first block */
if (bh == NULL)
@@ -189,7 +189,6 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
struct inode *root;
const char *errmsg;
struct qnx4_sb_info *qs;
- int ret = -EINVAL;
qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL);
if (!qs)
@@ -198,67 +197,50 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
sb_set_blocksize(s, QNX4_BLOCK_SIZE);
+ s->s_op = &qnx4_sops;
+ s->s_magic = QNX4_SUPER_MAGIC;
+ s->s_flags |= MS_RDONLY; /* Yup, read-only yet */
+
/* Check the superblock signature. Since the qnx4 code is
dangerous, we should leave as quickly as possible
if we don't belong here... */
bh = sb_bread(s, 1);
if (!bh) {
printk(KERN_ERR "qnx4: unable to read the superblock\n");
- goto outnobh;
+ return -EINVAL;
}
- if ( le32_to_cpup((__le32*) bh->b_data) != QNX4_SUPER_MAGIC ) {
- if (!silent)
- printk(KERN_ERR "qnx4: wrong fsid in superblock.\n");
- goto out;
- }
- s->s_op = &qnx4_sops;
- s->s_magic = QNX4_SUPER_MAGIC;
- s->s_flags |= MS_RDONLY; /* Yup, read-only yet */
- qnx4_sb(s)->sb_buf = bh;
- qnx4_sb(s)->sb = (struct qnx4_super_block *) bh->b_data;
-
/* check before allocating dentries, inodes, .. */
- errmsg = qnx4_checkroot(s);
+ errmsg = qnx4_checkroot(s, (struct qnx4_super_block *) bh->b_data);
+ brelse(bh);
if (errmsg != NULL) {
if (!silent)
printk(KERN_ERR "qnx4: %s\n", errmsg);
- goto out;
+ return -EINVAL;
}
/* does root not have inode number QNX4_ROOT_INO ?? */
root = qnx4_iget(s, QNX4_ROOT_INO * QNX4_INODES_PER_BLOCK);
if (IS_ERR(root)) {
printk(KERN_ERR "qnx4: get inode failed\n");
- ret = PTR_ERR(root);
- goto outb;
+ return PTR_ERR(root);
}
- ret = -ENOMEM;
s->s_root = d_make_root(root);
if (s->s_root == NULL)
- goto outb;
+ return -ENOMEM;
- brelse(bh);
return 0;
-
- outb:
- kfree(qs->BitMap);
- out:
- brelse(bh);
- outnobh:
- kfree(qs);
- s->s_fs_info = NULL;
- return ret;
}
-static void qnx4_put_super(struct super_block *sb)
+static void qnx4_kill_sb(struct super_block *sb)
{
struct qnx4_sb_info *qs = qnx4_sb(sb);
- kfree( qs->BitMap );
- kfree( qs );
- sb->s_fs_info = NULL;
- return;
+ kill_block_super(sb);
+ if (qs) {
+ kfree(qs->BitMap);
+ kfree(qs);
+ }
}
static int qnx4_readpage(struct file *file, struct page *page)
@@ -409,7 +391,7 @@ static struct file_system_type qnx4_fs_type = {
.owner = THIS_MODULE,
.name = "qnx4",
.mount = qnx4_mount,
- .kill_sb = kill_block_super,
+ .kill_sb = qnx4_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("qnx4");
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index d024505ba00..e62c8183777 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -60,10 +60,6 @@ static struct buffer_head *qnx4_find_entry(int len, struct inode *dir,
struct buffer_head *bh;
*res_dir = NULL;
- if (!dir->i_sb) {
- printk(KERN_WARNING "qnx4: no superblock on dir.\n");
- return NULL;
- }
bh = NULL;
block = offset = blkofs = 0;
while (blkofs * QNX4_BLOCK_SIZE + offset < dir->i_size) {
diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h
index 34e2d329c97..c9b1be2c164 100644
--- a/fs/qnx4/qnx4.h
+++ b/fs/qnx4/qnx4.h
@@ -10,8 +10,6 @@
#endif
struct qnx4_sb_info {
- struct buffer_head *sb_buf; /* superblock buffer */
- struct qnx4_super_block *sb; /* our superblock */
unsigned int Version; /* may be useful */
struct qnx4_inode_entry *BitMap; /* useful */
};
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 8d941edfefa..65cdaab3ed4 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -55,6 +55,7 @@ static int qnx6_show_options(struct seq_file *seq, struct dentry *root)
static int qnx6_remount(struct super_block *sb, int *flags, char *data)
{
+ sync_filesystem(sb);
*flags |= MS_RDONLY;
return 0;
}
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index 880fd988436..c51df1dd237 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -8,9 +8,10 @@ config QUOTA
help
If you say Y here, you will be able to set per user limits for disk
usage (also called disk quotas). Currently, it works for the
- ext2, ext3, and reiserfs file system. ext3 also supports journalled
- quotas for which you don't need to run quotacheck(8) after an unclean
- shutdown.
+ ext2, ext3, ext4, jfs, ocfs2 and reiserfs file systems.
+ Note that gfs2 and xfs use their own quota system.
+ Ext3, ext4 and reiserfs also support journaled quotas for which
+ you don't need to run quotacheck(8) after an unclean shutdown.
For further details, read the Quota mini-HOWTO, available from
<http://www.tldp.org/docs.html#howto>, or the documentation provided
with the quota tools. Probably the quota support is only useful for
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 831d49a4111..7f30bdc57d1 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -528,7 +528,7 @@ restart:
if (atomic_read(&dquot->dq_count)) {
DEFINE_WAIT(wait);
- atomic_inc(&dquot->dq_count);
+ dqgrab(dquot);
prepare_to_wait(&dquot->dq_wait_unused, &wait,
TASK_UNINTERRUPTIBLE);
spin_unlock(&dq_list_lock);
@@ -581,9 +581,17 @@ int dquot_scan_active(struct super_block *sb,
dqstats_inc(DQST_LOOKUPS);
dqput(old_dquot);
old_dquot = dquot;
- ret = fn(dquot, priv);
- if (ret < 0)
- goto out;
+ /*
+ * ->release_dquot() can be racing with us. Our reference
+ * protects us from new calls to it so just wait for any
+ * outstanding call and recheck the DQ_ACTIVE_B after that.
+ */
+ wait_on_dquot(dquot);
+ if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
+ ret = fn(dquot, priv);
+ if (ret < 0)
+ goto out;
+ }
spin_lock(&dq_list_lock);
/* We are safe to continue now because our dquot could not
* be moved out of the inuse list while we hold the reference */
@@ -624,7 +632,7 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
/* Now we have active dquot from which someone is
* holding reference so we can safely just increase
* use count */
- atomic_inc(&dquot->dq_count);
+ dqgrab(dquot);
spin_unlock(&dq_list_lock);
dqstats_inc(DQST_LOOKUPS);
err = sb->dq_op->write_dquot(dquot);
@@ -694,6 +702,7 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
struct dquot *dquot;
unsigned long freed = 0;
+ spin_lock(&dq_list_lock);
head = free_dquots.prev;
while (head != &free_dquots && sc->nr_to_scan) {
dquot = list_entry(head, struct dquot, dq_free);
@@ -705,6 +714,7 @@ dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
freed++;
head = free_dquots.prev;
}
+ spin_unlock(&dq_list_lock);
return freed;
}
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index 16e8abb7709..72d29177998 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -9,13 +9,25 @@
#include <net/netlink.h>
#include <net/genetlink.h>
+static const struct genl_multicast_group quota_mcgrps[] = {
+ { .name = "events", },
+};
+
/* Netlink family structure for quota */
static struct genl_family quota_genl_family = {
- .id = GENL_ID_GENERATE,
+ /*
+ * Needed due to multicast group ID abuse - old code assumed
+ * the family ID was also a valid multicast group ID (which
+ * isn't true) and userspace might thus rely on it. Assign a
+ * static ID for this group to make dealing with that easier.
+ */
+ .id = GENL_ID_VFS_DQUOT,
.hdrsize = 0,
.name = "VFS_DQUOT",
.version = 1,
.maxattr = QUOTA_NL_A_MAX,
+ .mcgrps = quota_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(quota_mcgrps),
};
/**
@@ -78,7 +90,7 @@ void quota_send_warning(struct kqid qid, dev_t dev,
goto attr_err_out;
genlmsg_end(skb, msg_head);
- genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
+ genlmsg_multicast(&quota_genl_family, skb, 0, 0, GFP_NOFS);
return;
attr_err_out:
printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index dea86e8967e..ff3f0b3cfdb 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -117,6 +117,7 @@ static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src)
{
+ memset(dst, 0, sizeof(*dst));
dst->dqb_bhardlimit = src->d_blk_hardlimit;
dst->dqb_bsoftlimit = src->d_blk_softlimit;
dst->dqb_curspace = src->d_bcount;
@@ -277,6 +278,17 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
return ret;
}
+static int quota_rmxquota(struct super_block *sb, void __user *addr)
+{
+ __u32 flags;
+
+ if (copy_from_user(&flags, addr, sizeof(flags)))
+ return -EFAULT;
+ if (!sb->s_qcop->rm_xquota)
+ return -ENOSYS;
+ return sb->s_qcop->rm_xquota(sb, flags);
+}
+
/* Copy parameters and call proper function */
static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
void __user *addr, struct path *path)
@@ -315,8 +327,9 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
return sb->s_qcop->quota_sync(sb, type);
case Q_XQUOTAON:
case Q_XQUOTAOFF:
- case Q_XQUOTARM:
return quota_setxstate(sb, cmd, addr);
+ case Q_XQUOTARM:
+ return quota_rmxquota(sb, addr);
case Q_XGETQSTAT:
return quota_getxstate(sb, addr);
case Q_XGETQSTATV:
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 4884ac5ae9b..4f56de822d2 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -30,22 +30,15 @@
#include "internal.h"
-const struct address_space_operations ramfs_aops = {
- .readpage = simple_readpage,
- .write_begin = simple_write_begin,
- .write_end = simple_write_end,
- .set_page_dirty = __set_page_dirty_no_writeback,
-};
-
const struct file_operations ramfs_file_operations = {
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.fsync = noop_fsync,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
.llseek = generic_file_llseek,
};
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 8d5b438cc18..dda012ad420 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -27,24 +27,23 @@
#include "internal.h"
static int ramfs_nommu_setattr(struct dentry *, struct iattr *);
-
-const struct address_space_operations ramfs_aops = {
- .readpage = simple_readpage,
- .write_begin = simple_write_begin,
- .write_end = simple_write_end,
- .set_page_dirty = __set_page_dirty_no_writeback,
-};
+static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long pgoff,
+ unsigned long flags);
+static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
const struct file_operations ramfs_file_operations = {
.mmap = ramfs_nommu_mmap,
.get_unmapped_area = ramfs_nommu_get_unmapped_area,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.fsync = noop_fsync,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
.llseek = generic_file_llseek,
};
@@ -197,7 +196,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
* - the pages to be mapped must exist
* - the pages be physically contiguous in sequence
*/
-unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
+static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags)
{
@@ -256,7 +255,7 @@ out:
/*
* set up a mapping for shared memory segments
*/
-int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
+static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma)
{
if (!(vma->vm_flags & VM_SHARED))
return -ENOSYS;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 39d14659a8d..d365b1c4eb3 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -43,6 +43,13 @@
static const struct super_operations ramfs_ops;
static const struct inode_operations ramfs_dir_inode_operations;
+static const struct address_space_operations ramfs_aops = {
+ .readpage = simple_readpage,
+ .write_begin = simple_write_begin,
+ .write_end = simple_write_end,
+ .set_page_dirty = __set_page_dirty_no_writeback,
+};
+
static struct backing_dev_info ramfs_backing_dev_info = {
.name = "ramfs",
.ra_pages = 0, /* No readahead */
@@ -275,4 +282,4 @@ int __init init_ramfs_fs(void)
return err;
}
-module_init(init_ramfs_fs)
+fs_initcall(init_ramfs_fs);
diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h
index 6b330639b51..a9d8ae88fa1 100644
--- a/fs/ramfs/internal.h
+++ b/fs/ramfs/internal.h
@@ -10,5 +10,4 @@
*/
-extern const struct address_space_operations ramfs_aops;
extern const struct inode_operations ramfs_file_inode_operations;
diff --git a/fs/read_write.c b/fs/read_write.c
index e3cd280b158..009d8542a88 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -25,11 +25,12 @@
typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
unsigned long, loff_t);
+typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *);
const struct file_operations generic_ro_fops = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
.mmap = generic_file_readonly_mmap,
.splice_read = generic_file_splice_read,
};
@@ -257,17 +258,29 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
fn = no_llseek;
if (file->f_mode & FMODE_LSEEK) {
- if (file->f_op && file->f_op->llseek)
+ if (file->f_op->llseek)
fn = file->f_op->llseek;
}
return fn(file, offset, whence);
}
EXPORT_SYMBOL(vfs_llseek);
+static inline struct fd fdget_pos(int fd)
+{
+ return __to_fd(__fdget_pos(fd));
+}
+
+static inline void fdput_pos(struct fd f)
+{
+ if (f.flags & FDPUT_POS_UNLOCK)
+ mutex_unlock(&f.file->f_pos_lock);
+ fdput(f);
+}
+
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
off_t retval;
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
if (!f.file)
return -EBADF;
@@ -278,7 +291,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
if (res != (loff_t)retval)
retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
}
- fdput(f);
+ fdput_pos(f);
return retval;
}
@@ -295,7 +308,7 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
unsigned int, whence)
{
int retval;
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
loff_t offset;
if (!f.file)
@@ -315,7 +328,7 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
retval = 0;
}
out_putf:
- fdput(f);
+ fdput_pos(f);
return retval;
}
#endif
@@ -378,13 +391,34 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
EXPORT_SYMBOL(do_sync_read);
+ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
+{
+ struct iovec iov = { .iov_base = buf, .iov_len = len };
+ struct kiocb kiocb;
+ struct iov_iter iter;
+ ssize_t ret;
+
+ init_sync_kiocb(&kiocb, filp);
+ kiocb.ki_pos = *ppos;
+ kiocb.ki_nbytes = len;
+ iov_iter_init(&iter, READ, &iov, 1, len);
+
+ ret = filp->f_op->read_iter(&kiocb, &iter);
+ if (-EIOCBQUEUED == ret)
+ ret = wait_on_sync_kiocb(&kiocb);
+ *ppos = kiocb.ki_pos;
+ return ret;
+}
+
+EXPORT_SYMBOL(new_sync_read);
+
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
if (!(file->f_mode & FMODE_READ))
return -EBADF;
- if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
+ if (!(file->f_mode & FMODE_CAN_READ))
return -EINVAL;
if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
return -EFAULT;
@@ -394,8 +428,10 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
count = ret;
if (file->f_op->read)
ret = file->f_op->read(file, buf, count, pos);
- else
+ else if (file->f_op->aio_read)
ret = do_sync_read(file, buf, count, pos);
+ else
+ ret = new_sync_read(file, buf, count, pos);
if (ret > 0) {
fsnotify_access(file);
add_rchar(current, ret);
@@ -427,13 +463,34 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
EXPORT_SYMBOL(do_sync_write);
+ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
+{
+ struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
+ struct kiocb kiocb;
+ struct iov_iter iter;
+ ssize_t ret;
+
+ init_sync_kiocb(&kiocb, filp);
+ kiocb.ki_pos = *ppos;
+ kiocb.ki_nbytes = len;
+ iov_iter_init(&iter, WRITE, &iov, 1, len);
+
+ ret = filp->f_op->write_iter(&kiocb, &iter);
+ if (-EIOCBQUEUED == ret)
+ ret = wait_on_sync_kiocb(&kiocb);
+ *ppos = kiocb.ki_pos;
+ return ret;
+}
+
+EXPORT_SYMBOL(new_sync_write);
+
ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
{
mm_segment_t old_fs;
const char __user *p;
ssize_t ret;
- if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
+ if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
old_fs = get_fs();
@@ -443,8 +500,10 @@ ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t
count = MAX_RW_COUNT;
if (file->f_op->write)
ret = file->f_op->write(file, p, count, pos);
- else
+ else if (file->f_op->aio_write)
ret = do_sync_write(file, p, count, pos);
+ else
+ ret = new_sync_write(file, p, count, pos);
set_fs(old_fs);
if (ret > 0) {
fsnotify_modify(file);
@@ -460,7 +519,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
- if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
+ if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
if (unlikely(!access_ok(VERIFY_READ, buf, count)))
return -EFAULT;
@@ -471,8 +530,10 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
file_start_write(file);
if (file->f_op->write)
ret = file->f_op->write(file, buf, count, pos);
- else
+ else if (file->f_op->aio_write)
ret = do_sync_write(file, buf, count, pos);
+ else
+ ret = new_sync_write(file, buf, count, pos);
if (ret > 0) {
fsnotify_modify(file);
add_wchar(current, ret);
@@ -498,7 +559,7 @@ static inline void file_pos_write(struct file *file, loff_t pos)
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
@@ -506,7 +567,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
ret = vfs_read(f.file, buf, count, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
- fdput(f);
+ fdput_pos(f);
}
return ret;
}
@@ -514,7 +575,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
size_t, count)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
@@ -522,7 +583,7 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
ret = vfs_write(f.file, buf, count, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
- fdput(f);
+ fdput_pos(f);
}
return ret;
@@ -589,6 +650,25 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
}
EXPORT_SYMBOL(iov_shorten);
+static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iovec *iov,
+ unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn)
+{
+ struct kiocb kiocb;
+ struct iov_iter iter;
+ ssize_t ret;
+
+ init_sync_kiocb(&kiocb, filp);
+ kiocb.ki_pos = *ppos;
+ kiocb.ki_nbytes = len;
+
+ iov_iter_init(&iter, rw, iov, nr_segs, len);
+ ret = fn(&kiocb, &iter);
+ if (ret == -EIOCBQUEUED)
+ ret = wait_on_sync_kiocb(&kiocb);
+ *ppos = kiocb.ki_pos;
+ return ret;
+}
+
static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
{
@@ -726,11 +806,7 @@ static ssize_t do_readv_writev(int type, struct file *file,
ssize_t ret;
io_fn_t fn;
iov_fn_t fnv;
-
- if (!file->f_op) {
- ret = -EINVAL;
- goto out;
- }
+ iter_fn_t iter_fn;
ret = rw_copy_check_uvector(type, uvector, nr_segs,
ARRAY_SIZE(iovstack), iovstack, &iov);
@@ -746,13 +822,18 @@ static ssize_t do_readv_writev(int type, struct file *file,
if (type == READ) {
fn = file->f_op->read;
fnv = file->f_op->aio_read;
+ iter_fn = file->f_op->read_iter;
} else {
fn = (io_fn_t)file->f_op->write;
fnv = file->f_op->aio_write;
+ iter_fn = file->f_op->write_iter;
file_start_write(file);
}
- if (fnv)
+ if (iter_fn)
+ ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
+ pos, iter_fn);
+ else if (fnv)
ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
pos, fnv);
else
@@ -778,7 +859,7 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
{
if (!(file->f_mode & FMODE_READ))
return -EBADF;
- if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
+ if (!(file->f_mode & FMODE_CAN_READ))
return -EINVAL;
return do_readv_writev(READ, file, vec, vlen, pos);
@@ -791,7 +872,7 @@ ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
{
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
- if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
+ if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
return do_readv_writev(WRITE, file, vec, vlen, pos);
@@ -802,7 +883,7 @@ EXPORT_SYMBOL(vfs_writev);
SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
unsigned long, vlen)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
@@ -810,7 +891,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
ret = vfs_readv(f.file, vec, vlen, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
- fdput(f);
+ fdput_pos(f);
}
if (ret > 0)
@@ -822,7 +903,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
unsigned long, vlen)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
@@ -830,7 +911,7 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
ret = vfs_writev(f.file, vec, vlen, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
- fdput(f);
+ fdput_pos(f);
}
if (ret > 0)
@@ -905,14 +986,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
ssize_t ret;
io_fn_t fn;
iov_fn_t fnv;
-
- ret = -EINVAL;
- if (!file->f_op)
- goto out;
-
- ret = -EFAULT;
- if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
- goto out;
+ iter_fn_t iter_fn;
ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
UIO_FASTIOV, iovstack, &iov);
@@ -928,13 +1002,18 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
if (type == READ) {
fn = file->f_op->read;
fnv = file->f_op->aio_read;
+ iter_fn = file->f_op->read_iter;
} else {
fn = (io_fn_t)file->f_op->write;
fnv = file->f_op->aio_write;
+ iter_fn = file->f_op->write_iter;
file_start_write(file);
}
- if (fnv)
+ if (iter_fn)
+ ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len,
+ pos, iter_fn);
+ else if (fnv)
ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
pos, fnv);
else
@@ -965,7 +1044,7 @@ static size_t compat_readv(struct file *file,
goto out;
ret = -EINVAL;
- if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
+ if (!(file->f_mode & FMODE_CAN_READ))
goto out;
ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
@@ -977,11 +1056,11 @@ out:
return ret;
}
-COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
+COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
const struct compat_iovec __user *,vec,
- unsigned long, vlen)
+ compat_ulong_t, vlen)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret;
loff_t pos;
@@ -991,13 +1070,13 @@ COMPAT_SYSCALL_DEFINE3(readv, unsigned long, fd,
ret = compat_readv(f.file, vec, vlen, &pos);
if (ret >= 0)
f.file->f_pos = pos;
- fdput(f);
+ fdput_pos(f);
return ret;
}
-COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
- const struct compat_iovec __user *,vec,
- unsigned long, vlen, loff_t, pos)
+static long __compat_sys_preadv64(unsigned long fd,
+ const struct compat_iovec __user *vec,
+ unsigned long vlen, loff_t pos)
{
struct fd f;
ssize_t ret;
@@ -1014,12 +1093,22 @@ COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
return ret;
}
-COMPAT_SYSCALL_DEFINE5(preadv, unsigned long, fd,
+#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
+COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
const struct compat_iovec __user *,vec,
- unsigned long, vlen, u32, pos_low, u32, pos_high)
+ unsigned long, vlen, loff_t, pos)
+{
+ return __compat_sys_preadv64(fd, vec, vlen, pos);
+}
+#endif
+
+COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
+ const struct compat_iovec __user *,vec,
+ compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
- return compat_sys_preadv64(fd, vec, vlen, pos);
+
+ return __compat_sys_preadv64(fd, vec, vlen, pos);
}
static size_t compat_writev(struct file *file,
@@ -1032,7 +1121,7 @@ static size_t compat_writev(struct file *file,
goto out;
ret = -EINVAL;
- if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
+ if (!(file->f_mode & FMODE_CAN_WRITE))
goto out;
ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
@@ -1044,11 +1133,11 @@ out:
return ret;
}
-COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
+COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
const struct compat_iovec __user *, vec,
- unsigned long, vlen)
+ compat_ulong_t, vlen)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_pos(fd);
ssize_t ret;
loff_t pos;
@@ -1058,13 +1147,13 @@ COMPAT_SYSCALL_DEFINE3(writev, unsigned long, fd,
ret = compat_writev(f.file, vec, vlen, &pos);
if (ret >= 0)
f.file->f_pos = pos;
- fdput(f);
+ fdput_pos(f);
return ret;
}
-COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
- const struct compat_iovec __user *,vec,
- unsigned long, vlen, loff_t, pos)
+static long __compat_sys_pwritev64(unsigned long fd,
+ const struct compat_iovec __user *vec,
+ unsigned long vlen, loff_t pos)
{
struct fd f;
ssize_t ret;
@@ -1081,12 +1170,22 @@ COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
return ret;
}
-COMPAT_SYSCALL_DEFINE5(pwritev, unsigned long, fd,
+#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
+COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
+ const struct compat_iovec __user *,vec,
+ unsigned long, vlen, loff_t, pos)
+{
+ return __compat_sys_pwritev64(fd, vec, vlen, pos);
+}
+#endif
+
+COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
const struct compat_iovec __user *,vec,
- unsigned long, vlen, u32, pos_low, u32, pos_high)
+ compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
loff_t pos = ((loff_t)pos_high << 32) | pos_low;
- return compat_sys_pwritev64(fd, vec, vlen, pos);
+
+ return __compat_sys_pwritev64(fd, vec, vlen, pos);
}
#endif
diff --git a/fs/readdir.c b/fs/readdir.c
index 93d71e57431..33fd92208cb 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -13,6 +13,7 @@
#include <linux/stat.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/fsnotify.h>
#include <linux/dirent.h>
#include <linux/security.h>
#include <linux/syscalls.h>
@@ -24,7 +25,7 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
int res = -ENOTDIR;
- if (!file->f_op || !file->f_op->iterate)
+ if (!file->f_op->iterate)
goto out;
res = security_file_permission(file, MAY_READ);
@@ -40,6 +41,7 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
ctx->pos = file->f_pos;
res = file->f_op->iterate(file, ctx);
file->f_pos = ctx->pos;
+ fsnotify_access(file);
file_accessed(file);
}
mutex_unlock(&inode->i_mutex);
diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h
index f096b80e73d..4a211f5b34b 100644
--- a/fs/reiserfs/acl.h
+++ b/fs/reiserfs/acl.h
@@ -48,18 +48,18 @@ static inline int reiserfs_acl_count(size_t size)
#ifdef CONFIG_REISERFS_FS_POSIX_ACL
struct posix_acl *reiserfs_get_acl(struct inode *inode, int type);
+int reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
int reiserfs_acl_chmod(struct inode *inode);
int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
struct inode *dir, struct dentry *dentry,
struct inode *inode);
int reiserfs_cache_default_acl(struct inode *dir);
-extern const struct xattr_handler reiserfs_posix_acl_default_handler;
-extern const struct xattr_handler reiserfs_posix_acl_access_handler;
#else
#define reiserfs_cache_default_acl(inode) 0
#define reiserfs_get_acl NULL
+#define reiserfs_set_acl NULL
static inline int reiserfs_acl_chmod(struct inode *inode)
{
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index dc9a6829f7c..dc198bc64c6 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -50,8 +50,10 @@ static inline void get_bit_address(struct super_block *s,
unsigned int *bmap_nr,
unsigned int *offset)
{
- /* It is in the bitmap block number equal to the block
- * number divided by the number of bits in a block. */
+ /*
+ * It is in the bitmap block number equal to the block
+ * number divided by the number of bits in a block.
+ */
*bmap_nr = block >> (s->s_blocksize_bits + 3);
/* Within that bitmap block it is located at bit offset *offset. */
*offset = block & ((s->s_blocksize << 3) - 1);
@@ -71,10 +73,12 @@ int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value)
get_bit_address(s, block, &bmap, &offset);
- /* Old format filesystem? Unlikely, but the bitmaps are all up front so
- * we need to account for it. */
+ /*
+ * Old format filesystem? Unlikely, but the bitmaps are all
+ * up front so we need to account for it.
+ */
if (unlikely(test_bit(REISERFS_OLD_FORMAT,
- &(REISERFS_SB(s)->s_properties)))) {
+ &REISERFS_SB(s)->s_properties))) {
b_blocknr_t bmap1 = REISERFS_SB(s)->s_sbh->b_blocknr + 1;
if (block >= bmap1 &&
block <= bmap1 + bmap_count) {
@@ -108,8 +112,11 @@ int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value)
return 1;
}
-/* searches in journal structures for a given block number (bmap, off). If block
- is found in reiserfs journal it suggests next free block candidate to test. */
+/*
+ * Searches in journal structures for a given block number (bmap, off).
+ * If block is found in reiserfs journal it suggests next free block
+ * candidate to test.
+ */
static inline int is_block_in_journal(struct super_block *s, unsigned int bmap,
int off, int *next)
{
@@ -120,7 +127,7 @@ static inline int is_block_in_journal(struct super_block *s, unsigned int bmap,
*next = tmp;
PROC_INFO_INC(s, scan_bitmap.in_journal_hint);
} else {
- (*next) = off + 1; /* inc offset to avoid looping. */
+ (*next) = off + 1; /* inc offset to avoid looping. */
PROC_INFO_INC(s, scan_bitmap.in_journal_nohint);
}
PROC_INFO_INC(s, scan_bitmap.retry);
@@ -129,8 +136,10 @@ static inline int is_block_in_journal(struct super_block *s, unsigned int bmap,
return 0;
}
-/* it searches for a window of zero bits with given minimum and maximum lengths in one bitmap
- * block; */
+/*
+ * Searches for a window of zero bits with given minimum and maximum
+ * lengths in one bitmap block
+ */
static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
unsigned int bmap_n, int *beg, int boundary,
int min, int max, int unfm)
@@ -142,14 +151,9 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
int org = *beg;
BUG_ON(!th->t_trans_id);
-
RFALSE(bmap_n >= reiserfs_bmap_count(s), "Bitmap %u is out of "
"range (0..%u)", bmap_n, reiserfs_bmap_count(s) - 1);
PROC_INFO_INC(s, scan_bitmap.bmap);
-/* this is unclear and lacks comments, explain how journal bitmaps
- work here for the reader. Convey a sense of the design here. What
- is a window? */
-/* - I mean `a window of zero bits' as in description of this function - Zam. */
if (!bi) {
reiserfs_error(s, "jdm-4055", "NULL bitmap info pointer "
@@ -162,18 +166,21 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
return 0;
while (1) {
- cont:
+cont:
if (bi->free_count < min) {
brelse(bh);
- return 0; // No free blocks in this bitmap
+ return 0; /* No free blocks in this bitmap */
}
/* search for a first zero bit -- beginning of a window */
*beg = reiserfs_find_next_zero_le_bit
((unsigned long *)(bh->b_data), boundary, *beg);
- if (*beg + min > boundary) { /* search for a zero bit fails or the rest of bitmap block
- * cannot contain a zero window of minimum size */
+ /*
+ * search for a zero bit fails or the rest of bitmap block
+ * cannot contain a zero window of minimum size
+ */
+ if (*beg + min > boundary) {
brelse(bh);
return 0;
}
@@ -187,49 +194,75 @@ static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
next = end;
break;
}
- /* finding the other end of zero bit window requires looking into journal structures (in
- * case of searching for free blocks for unformatted nodes) */
+
+ /*
+ * finding the other end of zero bit window requires
+ * looking into journal structures (in case of
+ * searching for free blocks for unformatted nodes)
+ */
if (unfm && is_block_in_journal(s, bmap_n, end, &next))
break;
}
- /* now (*beg) points to beginning of zero bits window,
- * (end) points to one bit after the window end */
- if (end - *beg >= min) { /* it seems we have found window of proper size */
+ /*
+ * now (*beg) points to beginning of zero bits window,
+ * (end) points to one bit after the window end
+ */
+
+ /* found window of proper size */
+ if (end - *beg >= min) {
int i;
reiserfs_prepare_for_journal(s, bh, 1);
- /* try to set all blocks used checking are they still free */
+ /*
+ * try to set all blocks used checking are
+ * they still free
+ */
for (i = *beg; i < end; i++) {
- /* It seems that we should not check in journal again. */
+ /* Don't check in journal again. */
if (reiserfs_test_and_set_le_bit
(i, bh->b_data)) {
- /* bit was set by another process
- * while we slept in prepare_for_journal() */
+ /*
+ * bit was set by another process while
+ * we slept in prepare_for_journal()
+ */
PROC_INFO_INC(s, scan_bitmap.stolen);
- if (i >= *beg + min) { /* we can continue with smaller set of allocated blocks,
- * if length of this set is more or equal to `min' */
+
+ /*
+ * we can continue with smaller set
+ * of allocated blocks, if length of
+ * this set is more or equal to `min'
+ */
+ if (i >= *beg + min) {
end = i;
break;
}
- /* otherwise we clear all bit were set ... */
+
+ /*
+ * otherwise we clear all bit
+ * were set ...
+ */
while (--i >= *beg)
reiserfs_clear_le_bit
(i, bh->b_data);
reiserfs_restore_prepared_buffer(s, bh);
*beg = org;
- /* ... and search again in current block from beginning */
+
+ /*
+ * Search again in current block
+ * from beginning
+ */
goto cont;
}
}
bi->free_count -= (end - *beg);
- journal_mark_dirty(th, s, bh);
+ journal_mark_dirty(th, bh);
brelse(bh);
/* free block count calculation */
reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
1);
PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg));
- journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s));
+ journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
return end - (*beg);
} else {
@@ -268,11 +301,13 @@ static inline int block_group_used(struct super_block *s, u32 id)
int bm = bmap_hash_id(s, id);
struct reiserfs_bitmap_info *info = &SB_AP_BITMAP(s)[bm];
- /* If we don't have cached information on this bitmap block, we're
+ /*
+ * If we don't have cached information on this bitmap block, we're
* going to have to load it later anyway. Loading it here allows us
* to make a better decision. This favors long-term performance gain
* with a better on-disk layout vs. a short term gain of skipping the
- * read and potentially having a bad placement. */
+ * read and potentially having a bad placement.
+ */
if (info->free_count == UINT_MAX) {
struct buffer_head *bh = reiserfs_read_bitmap_block(s, bm);
brelse(bh);
@@ -305,26 +340,26 @@ __le32 reiserfs_choose_packing(struct inode * dir)
return packing;
}
-/* Tries to find contiguous zero bit window (given size) in given region of
- * bitmap and place new blocks there. Returns number of allocated blocks. */
+/*
+ * Tries to find contiguous zero bit window (given size) in given region of
+ * bitmap and place new blocks there. Returns number of allocated blocks.
+ */
static int scan_bitmap(struct reiserfs_transaction_handle *th,
b_blocknr_t * start, b_blocknr_t finish,
int min, int max, int unfm, sector_t file_block)
{
int nr_allocated = 0;
struct super_block *s = th->t_super;
- /* find every bm and bmap and bmap_nr in this file, and change them all to bitmap_blocknr
- * - Hans, it is not a block number - Zam. */
-
unsigned int bm, off;
unsigned int end_bm, end_off;
unsigned int off_max = s->s_blocksize << 3;
BUG_ON(!th->t_trans_id);
-
PROC_INFO_INC(s, scan_bitmap.call);
+
+ /* No point in looking for more free blocks */
if (SB_FREE_BLOCKS(s) <= 0)
- return 0; // No point in looking for more free blocks
+ return 0;
get_bit_address(s, *start, &bm, &off);
get_bit_address(s, finish, &end_bm, &end_off);
@@ -333,7 +368,8 @@ static int scan_bitmap(struct reiserfs_transaction_handle *th,
if (end_bm > reiserfs_bmap_count(s))
end_bm = reiserfs_bmap_count(s);
- /* When the bitmap is more than 10% free, anyone can allocate.
+ /*
+ * When the bitmap is more than 10% free, anyone can allocate.
* When it's less than 10% free, only files that already use the
* bitmap are allowed. Once we pass 80% full, this restriction
* is lifted.
@@ -371,7 +407,7 @@ static int scan_bitmap(struct reiserfs_transaction_handle *th,
nr_allocated =
scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm);
- ret:
+ret:
*start = bm * off_max + off;
return nr_allocated;
@@ -388,9 +424,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
unsigned int nr, offset;
BUG_ON(!th->t_trans_id);
-
PROC_INFO_INC(s, free_block);
-
rs = SB_DISK_SUPER_BLOCK(s);
sbh = SB_BUFFER_WITH_SB(s);
apbi = SB_AP_BITMAP(s);
@@ -415,14 +449,14 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
"block %lu: bit already cleared", block);
}
apbi[nr].free_count++;
- journal_mark_dirty(th, s, bmbh);
+ journal_mark_dirty(th, bmbh);
brelse(bmbh);
reiserfs_prepare_for_journal(s, sbh, 1);
/* update super block */
set_sb_free_blocks(rs, sb_free_blocks(rs) + 1);
- journal_mark_dirty(th, s, sbh);
+ journal_mark_dirty(th, sbh);
if (for_unformatted) {
int depth = reiserfs_write_unlock_nested(s);
dquot_free_block_nodirty(inode, 1);
@@ -435,8 +469,8 @@ void reiserfs_free_block(struct reiserfs_transaction_handle *th,
int for_unformatted)
{
struct super_block *s = th->t_super;
- BUG_ON(!th->t_trans_id);
+ BUG_ON(!th->t_trans_id);
RFALSE(!s, "vs-4061: trying to free block on nonexistent device");
if (!is_reusable(s, block, 1))
return;
@@ -471,6 +505,7 @@ static void __discard_prealloc(struct reiserfs_transaction_handle *th,
unsigned long save = ei->i_prealloc_block;
int dirty = 0;
struct inode *inode = &ei->vfs_inode;
+
BUG_ON(!th->t_trans_id);
#ifdef CONFIG_REISERFS_CHECK
if (ei->i_prealloc_count < 0)
@@ -486,7 +521,7 @@ static void __discard_prealloc(struct reiserfs_transaction_handle *th,
if (dirty)
reiserfs_update_sd(th, inode);
ei->i_prealloc_block = save;
- list_del_init(&(ei->i_prealloc_list));
+ list_del_init(&ei->i_prealloc_list);
}
/* FIXME: It should be inline function */
@@ -494,6 +529,7 @@ void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th,
struct inode *inode)
{
struct reiserfs_inode_info *ei = REISERFS_I(inode);
+
BUG_ON(!th->t_trans_id);
if (ei->i_prealloc_count)
__discard_prealloc(th, ei);
@@ -504,7 +540,6 @@ void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th)
struct list_head *plist = &SB_JOURNAL(th->t_super)->j_prealloc_list;
BUG_ON(!th->t_trans_id);
-
while (!list_empty(plist)) {
struct reiserfs_inode_info *ei;
ei = list_entry(plist->next, struct reiserfs_inode_info,
@@ -532,7 +567,8 @@ int reiserfs_parse_alloc_options(struct super_block *s, char *options)
{
char *this_char, *value;
- REISERFS_SB(s)->s_alloc_options.bits = 0; /* clear default settings */
+ /* clear default settings */
+ REISERFS_SB(s)->s_alloc_options.bits = 0;
while ((this_char = strsep(&options, ":")) != NULL) {
if ((value = strchr(this_char, '=')) != NULL)
@@ -562,7 +598,7 @@ int reiserfs_parse_alloc_options(struct super_block *s, char *options)
if (!strcmp(this_char, "displacing_new_packing_localities")) {
SET_OPTION(displacing_new_packing_localities);
continue;
- };
+ }
if (!strcmp(this_char, "old_hashed_relocation")) {
SET_OPTION(old_hashed_relocation);
@@ -729,11 +765,12 @@ void show_alloc_options(struct seq_file *seq, struct super_block *s)
static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint)
{
char *hash_in;
+
if (hint->formatted_node) {
hash_in = (char *)&hint->key.k_dir_id;
} else {
if (!hint->inode) {
- //hint->search_start = hint->beg;
+ /*hint->search_start = hint->beg;*/
hash_in = (char *)&hint->key.k_dir_id;
} else
if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
@@ -757,6 +794,7 @@ static void dirid_groups(reiserfs_blocknr_hint_t * hint)
__u32 dirid = 0;
int bm = 0;
struct super_block *sb = hint->th->t_super;
+
if (hint->inode)
dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
else if (hint->formatted_node)
@@ -786,7 +824,8 @@ static void oid_groups(reiserfs_blocknr_hint_t * hint)
dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
- /* keep the root dir and it's first set of subdirs close to
+ /*
+ * keep the root dir and it's first set of subdirs close to
* the start of the disk
*/
if (dirid <= 2)
@@ -800,7 +839,8 @@ static void oid_groups(reiserfs_blocknr_hint_t * hint)
}
}
-/* returns 1 if it finds an indirect item and gets valid hint info
+/*
+ * returns 1 if it finds an indirect item and gets valid hint info
* from it, otherwise 0
*/
static int get_left_neighbor(reiserfs_blocknr_hint_t * hint)
@@ -812,25 +852,29 @@ static int get_left_neighbor(reiserfs_blocknr_hint_t * hint)
__le32 *item;
int ret = 0;
- if (!hint->path) /* reiserfs code can call this function w/o pointer to path
- * structure supplied; then we rely on supplied search_start */
+ /*
+ * reiserfs code can call this function w/o pointer to path
+ * structure supplied; then we rely on supplied search_start
+ */
+ if (!hint->path)
return 0;
path = hint->path;
bh = get_last_bh(path);
RFALSE(!bh, "green-4002: Illegal path specified to get_left_neighbor");
- ih = get_ih(path);
+ ih = tp_item_head(path);
pos_in_item = path->pos_in_item;
- item = get_item(path);
+ item = tp_item_body(path);
hint->search_start = bh->b_blocknr;
+ /*
+ * for indirect item: go to left and look for the first non-hole entry
+ * in the indirect item
+ */
if (!hint->formatted_node && is_indirect_le_ih(ih)) {
- /* for indirect item: go to left and look for the first non-hole entry
- in the indirect item */
if (pos_in_item == I_UNFM_NUM(ih))
pos_in_item--;
-// pos_in_item = I_UNFM_NUM (ih) - 1;
while (pos_in_item >= 0) {
int t = get_block_num(item, pos_in_item);
if (t) {
@@ -846,10 +890,12 @@ static int get_left_neighbor(reiserfs_blocknr_hint_t * hint)
return ret;
}
-/* should be, if formatted node, then try to put on first part of the device
- specified as number of percent with mount option device, else try to put
- on last of device. This is not to say it is good code to do so,
- but the effect should be measured. */
+/*
+ * should be, if formatted node, then try to put on first part of the device
+ * specified as number of percent with mount option device, else try to put
+ * on last of device. This is not to say it is good code to do so,
+ * but the effect should be measured.
+ */
static inline void set_border_in_hint(struct super_block *s,
reiserfs_blocknr_hint_t * hint)
{
@@ -975,21 +1021,27 @@ static void determine_search_start(reiserfs_blocknr_hint_t * hint,
set_border_in_hint(s, hint);
#ifdef DISPLACE_NEW_PACKING_LOCALITIES
- /* whenever we create a new directory, we displace it. At first we will
- hash for location, later we might look for a moderately empty place for
- it */
+ /*
+ * whenever we create a new directory, we displace it. At first
+ * we will hash for location, later we might look for a moderately
+ * empty place for it
+ */
if (displacing_new_packing_localities(s)
&& hint->th->displace_new_blocks) {
displace_new_packing_locality(hint);
- /* we do not continue determine_search_start,
- * if new packing locality is being displaced */
+ /*
+ * we do not continue determine_search_start,
+ * if new packing locality is being displaced
+ */
return;
}
#endif
- /* all persons should feel encouraged to add more special cases here and
- * test them */
+ /*
+ * all persons should feel encouraged to add more special cases
+ * here and test them
+ */
if (displacing_large_files(s) && !hint->formatted_node
&& this_blocknr_allocation_would_make_it_a_large_file(hint)) {
@@ -997,8 +1049,10 @@ static void determine_search_start(reiserfs_blocknr_hint_t * hint,
return;
}
- /* if none of our special cases is relevant, use the left neighbor in the
- tree order of the new node we are allocating for */
+ /*
+ * if none of our special cases is relevant, use the left
+ * neighbor in the tree order of the new node we are allocating for
+ */
if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes, s)) {
hash_formatted_node(hint);
return;
@@ -1006,10 +1060,13 @@ static void determine_search_start(reiserfs_blocknr_hint_t * hint,
unfm_hint = get_left_neighbor(hint);
- /* Mimic old block allocator behaviour, that is if VFS allowed for preallocation,
- new blocks are displaced based on directory ID. Also, if suggested search_start
- is less than last preallocated block, we start searching from it, assuming that
- HDD dataflow is faster in forward direction */
+ /*
+ * Mimic old block allocator behaviour, that is if VFS allowed for
+ * preallocation, new blocks are displaced based on directory ID.
+ * Also, if suggested search_start is less than last preallocated
+ * block, we start searching from it, assuming that HDD dataflow
+ * is faster in forward direction
+ */
if (TEST_OPTION(old_way, s)) {
if (!hint->formatted_node) {
if (!reiserfs_hashed_relocation(s))
@@ -1038,11 +1095,13 @@ static void determine_search_start(reiserfs_blocknr_hint_t * hint,
TEST_OPTION(old_hashed_relocation, s)) {
old_hashed_relocation(hint);
}
+
/* new_hashed_relocation works with both formatted/unformatted nodes */
if ((!unfm_hint || hint->formatted_node) &&
TEST_OPTION(new_hashed_relocation, s)) {
new_hashed_relocation(hint);
}
+
/* dirid grouping works only on unformatted nodes */
if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups, s)) {
dirid_groups(hint);
@@ -1080,8 +1139,6 @@ static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint)
return CARRY_ON;
}
-/* XXX I know it could be merged with upper-level function;
- but may be result function would be too complex. */
static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint,
b_blocknr_t * new_blocknrs,
b_blocknr_t start,
@@ -1109,7 +1166,10 @@ static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint,
/* do we have something to fill prealloc. array also ? */
if (nr_allocated > 0) {
- /* it means prealloc_size was greater that 0 and we do preallocation */
+ /*
+ * it means prealloc_size was greater that 0 and
+ * we do preallocation
+ */
list_add(&REISERFS_I(hint->inode)->i_prealloc_list,
&SB_JOURNAL(hint->th->t_super)->
j_prealloc_list);
@@ -1177,7 +1237,8 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
start = 0;
finish = hint->beg;
break;
- default: /* We've tried searching everywhere, not enough space */
+ default:
+ /* We've tried searching everywhere, not enough space */
/* Free the blocks */
if (!hint->formatted_node) {
#ifdef REISERQUOTA_DEBUG
@@ -1262,8 +1323,11 @@ static int use_preallocated_list_if_available(reiserfs_blocknr_hint_t * hint,
return amount_needed;
}
-int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs, int amount_needed, int reserved_by_us /* Amount of blocks we have
- already reserved */ )
+int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *hint,
+ b_blocknr_t *new_blocknrs,
+ int amount_needed,
+ /* Amount of blocks we have already reserved */
+ int reserved_by_us)
{
int initial_amount_needed = amount_needed;
int ret;
@@ -1275,15 +1339,21 @@ int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t * hint, b_blocknr_t * new
return NO_DISK_SPACE;
/* should this be if !hint->inode && hint->preallocate? */
/* do you mean hint->formatted_node can be removed ? - Zam */
- /* hint->formatted_node cannot be removed because we try to access
- inode information here, and there is often no inode assotiated with
- metadata allocations - green */
+ /*
+ * hint->formatted_node cannot be removed because we try to access
+ * inode information here, and there is often no inode associated with
+ * metadata allocations - green
+ */
if (!hint->formatted_node && hint->preallocate) {
amount_needed = use_preallocated_list_if_available
(hint, new_blocknrs, amount_needed);
- if (amount_needed == 0) /* all blocknrs we need we got from
- prealloc. list */
+
+ /*
+ * We have all the block numbers we need from the
+ * prealloc list
+ */
+ if (amount_needed == 0)
return CARRY_ON;
new_blocknrs += (initial_amount_needed - amount_needed);
}
@@ -1297,10 +1367,12 @@ int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t * hint, b_blocknr_t * new
ret = blocknrs_and_prealloc_arrays_from_search_start
(hint, new_blocknrs, amount_needed);
- /* we used prealloc. list to fill (partially) new_blocknrs array. If final allocation fails we
- * need to return blocks back to prealloc. list or just free them. -- Zam (I chose second
- * variant) */
-
+ /*
+ * We used prealloc. list to fill (partially) new_blocknrs array.
+ * If final allocation fails we need to return blocks back to
+ * prealloc. list or just free them. -- Zam (I chose second
+ * variant)
+ */
if (ret != CARRY_ON) {
while (amount_needed++ < initial_amount_needed) {
reiserfs_free_block(hint->th, hint->inode,
@@ -1339,10 +1411,12 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
struct reiserfs_bitmap_info *info = SB_AP_BITMAP(sb) + bitmap;
struct buffer_head *bh;
- /* Way old format filesystems had the bitmaps packed up front.
- * I doubt there are any of these left, but just in case... */
+ /*
+ * Way old format filesystems had the bitmaps packed up front.
+ * I doubt there are any of these left, but just in case...
+ */
if (unlikely(test_bit(REISERFS_OLD_FORMAT,
- &(REISERFS_SB(sb)->s_properties))))
+ &REISERFS_SB(sb)->s_properties)))
block = REISERFS_SB(sb)->s_sbh->b_blocknr + 1 + bitmap;
else if (bitmap == 0)
block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1;
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 1fd2051109a..d9f5a60dd59 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -59,7 +59,10 @@ static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *d
int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
{
- struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */
+
+ /* key of current position in the directory (key of directory entry) */
+ struct cpu_key pos_key;
+
INITIALIZE_PATH(path_to_entry);
struct buffer_head *bh;
int item_num, entry_num;
@@ -77,21 +80,28 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
reiserfs_check_lock_depth(inode->i_sb, "readdir");
- /* form key for search the next directory entry using f_pos field of
- file structure */
+ /*
+ * form key for search the next directory entry using
+ * f_pos field of file structure
+ */
make_cpu_key(&pos_key, inode, ctx->pos ?: DOT_OFFSET, TYPE_DIRENTRY, 3);
next_pos = cpu_key_k_offset(&pos_key);
path_to_entry.reada = PATH_READA;
while (1) {
- research:
- /* search the directory item, containing entry with specified key */
+research:
+ /*
+ * search the directory item, containing entry with
+ * specified key
+ */
search_res =
search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry,
&de);
if (search_res == IO_ERROR) {
- // FIXME: we could just skip part of directory which could
- // not be read
+ /*
+ * FIXME: we could just skip part of directory
+ * which could not be read
+ */
ret = -EIO;
goto out;
}
@@ -102,40 +112,49 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
store_ih(&tmp_ih, ih);
/* we must have found item, that is item of this directory, */
- RFALSE(COMP_SHORT_KEYS(&(ih->ih_key), &pos_key),
+ RFALSE(COMP_SHORT_KEYS(&ih->ih_key, &pos_key),
"vs-9000: found item %h does not match to dir we readdir %K",
ih, &pos_key);
RFALSE(item_num > B_NR_ITEMS(bh) - 1,
"vs-9005 item_num == %d, item amount == %d",
item_num, B_NR_ITEMS(bh));
- /* and entry must be not more than number of entries in the item */
- RFALSE(I_ENTRY_COUNT(ih) < entry_num,
+ /*
+ * and entry must be not more than number of entries
+ * in the item
+ */
+ RFALSE(ih_entry_count(ih) < entry_num,
"vs-9010: entry number is too big %d (%d)",
- entry_num, I_ENTRY_COUNT(ih));
+ entry_num, ih_entry_count(ih));
+ /*
+ * go through all entries in the directory item beginning
+ * from the entry, that has been found
+ */
if (search_res == POSITION_FOUND
- || entry_num < I_ENTRY_COUNT(ih)) {
- /* go through all entries in the directory item beginning from the entry, that has been found */
+ || entry_num < ih_entry_count(ih)) {
struct reiserfs_de_head *deh =
B_I_DEH(bh, ih) + entry_num;
- for (; entry_num < I_ENTRY_COUNT(ih);
+ for (; entry_num < ih_entry_count(ih);
entry_num++, deh++) {
int d_reclen;
char *d_name;
ino_t d_ino;
+ loff_t cur_pos = deh_offset(deh);
+ /* it is hidden entry */
if (!de_visible(deh))
- /* it is hidden entry */
continue;
d_reclen = entry_length(bh, ih, entry_num);
d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh);
if (d_reclen <= 0 ||
d_name + d_reclen > bh->b_data + bh->b_size) {
- /* There is corrupted data in entry,
- * We'd better stop here */
+ /*
+ * There is corrupted data in entry,
+ * We'd better stop here
+ */
pathrelse(&path_to_entry);
ret = -EIO;
goto out;
@@ -144,10 +163,10 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
if (!d_name[d_reclen - 1])
d_reclen = strlen(d_name);
+ /* too big to send back to VFS */
if (d_reclen >
REISERFS_MAX_NAME(inode->i_sb->
s_blocksize)) {
- /* too big to send back to VFS */
continue;
}
@@ -172,10 +191,14 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
goto research;
}
}
- // Note, that we copy name to user space via temporary
- // buffer (local_buf) because filldir will block if
- // user space buffer is swapped out. At that time
- // entry can move to somewhere else
+
+ /*
+ * Note, that we copy name to user space via
+ * temporary buffer (local_buf) because
+ * filldir will block if user space buffer is
+ * swapped out. At that time entry can move to
+ * somewhere else
+ */
memcpy(local_buf, d_name, d_reclen);
/*
@@ -196,8 +219,9 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
if (local_buf != small_buf) {
kfree(local_buf);
}
- // next entry should be looked for with such offset
- next_pos = deh_offset(deh) + 1;
+
+ /* deh_offset(deh) may be invalid now. */
+ next_pos = cur_pos + 1;
if (item_moved(&tmp_ih, &path_to_entry)) {
set_cpu_key_k_offset(&pos_key,
@@ -207,22 +231,26 @@ int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx)
} /* for */
}
+ /* end of directory has been reached */
if (item_num != B_NR_ITEMS(bh) - 1)
- // end of directory has been reached
goto end;
- /* item we went through is last item of node. Using right
- delimiting key check is it directory end */
+ /*
+ * item we went through is last item of node. Using right
+ * delimiting key check is it directory end
+ */
rkey = get_rkey(&path_to_entry, inode->i_sb);
if (!comp_le_keys(rkey, &MIN_KEY)) {
- /* set pos_key to key, that is the smallest and greater
- that key of the last entry in the item */
+ /*
+ * set pos_key to key, that is the smallest and greater
+ * that key of the last entry in the item
+ */
set_cpu_key_k_offset(&pos_key, next_pos);
continue;
}
+ /* end of directory has been reached */
if (COMP_SHORT_KEYS(rkey, &pos_key)) {
- // end of directory has been reached
goto end;
}
@@ -246,71 +274,73 @@ static int reiserfs_readdir(struct file *file, struct dir_context *ctx)
return reiserfs_readdir_inode(file_inode(file), ctx);
}
-/* compose directory item containing "." and ".." entries (entries are
- not aligned to 4 byte boundary) */
-/* the last four params are LE */
+/*
+ * compose directory item containing "." and ".." entries (entries are
+ * not aligned to 4 byte boundary)
+ */
void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid,
__le32 par_dirid, __le32 par_objid)
{
- struct reiserfs_de_head *deh;
+ struct reiserfs_de_head *dot, *dotdot;
memset(body, 0, EMPTY_DIR_SIZE_V1);
- deh = (struct reiserfs_de_head *)body;
+ dot = (struct reiserfs_de_head *)body;
+ dotdot = dot + 1;
/* direntry header of "." */
- put_deh_offset(&(deh[0]), DOT_OFFSET);
+ put_deh_offset(dot, DOT_OFFSET);
/* these two are from make_le_item_head, and are are LE */
- deh[0].deh_dir_id = dirid;
- deh[0].deh_objectid = objid;
- deh[0].deh_state = 0; /* Endian safe if 0 */
- put_deh_location(&(deh[0]), EMPTY_DIR_SIZE_V1 - strlen("."));
- mark_de_visible(&(deh[0]));
+ dot->deh_dir_id = dirid;
+ dot->deh_objectid = objid;
+ dot->deh_state = 0; /* Endian safe if 0 */
+ put_deh_location(dot, EMPTY_DIR_SIZE_V1 - strlen("."));
+ mark_de_visible(dot);
/* direntry header of ".." */
- put_deh_offset(&(deh[1]), DOT_DOT_OFFSET);
+ put_deh_offset(dotdot, DOT_DOT_OFFSET);
/* key of ".." for the root directory */
/* these two are from the inode, and are are LE */
- deh[1].deh_dir_id = par_dirid;
- deh[1].deh_objectid = par_objid;
- deh[1].deh_state = 0; /* Endian safe if 0 */
- put_deh_location(&(deh[1]), deh_location(&(deh[0])) - strlen(".."));
- mark_de_visible(&(deh[1]));
+ dotdot->deh_dir_id = par_dirid;
+ dotdot->deh_objectid = par_objid;
+ dotdot->deh_state = 0; /* Endian safe if 0 */
+ put_deh_location(dotdot, deh_location(dot) - strlen(".."));
+ mark_de_visible(dotdot);
/* copy ".." and "." */
- memcpy(body + deh_location(&(deh[0])), ".", 1);
- memcpy(body + deh_location(&(deh[1])), "..", 2);
+ memcpy(body + deh_location(dot), ".", 1);
+ memcpy(body + deh_location(dotdot), "..", 2);
}
/* compose directory item containing "." and ".." entries */
void make_empty_dir_item(char *body, __le32 dirid, __le32 objid,
__le32 par_dirid, __le32 par_objid)
{
- struct reiserfs_de_head *deh;
+ struct reiserfs_de_head *dot, *dotdot;
memset(body, 0, EMPTY_DIR_SIZE);
- deh = (struct reiserfs_de_head *)body;
+ dot = (struct reiserfs_de_head *)body;
+ dotdot = dot + 1;
/* direntry header of "." */
- put_deh_offset(&(deh[0]), DOT_OFFSET);
+ put_deh_offset(dot, DOT_OFFSET);
/* these two are from make_le_item_head, and are are LE */
- deh[0].deh_dir_id = dirid;
- deh[0].deh_objectid = objid;
- deh[0].deh_state = 0; /* Endian safe if 0 */
- put_deh_location(&(deh[0]), EMPTY_DIR_SIZE - ROUND_UP(strlen(".")));
- mark_de_visible(&(deh[0]));
+ dot->deh_dir_id = dirid;
+ dot->deh_objectid = objid;
+ dot->deh_state = 0; /* Endian safe if 0 */
+ put_deh_location(dot, EMPTY_DIR_SIZE - ROUND_UP(strlen(".")));
+ mark_de_visible(dot);
/* direntry header of ".." */
- put_deh_offset(&(deh[1]), DOT_DOT_OFFSET);
+ put_deh_offset(dotdot, DOT_DOT_OFFSET);
/* key of ".." for the root directory */
/* these two are from the inode, and are are LE */
- deh[1].deh_dir_id = par_dirid;
- deh[1].deh_objectid = par_objid;
- deh[1].deh_state = 0; /* Endian safe if 0 */
- put_deh_location(&(deh[1]),
- deh_location(&(deh[0])) - ROUND_UP(strlen("..")));
- mark_de_visible(&(deh[1]));
+ dotdot->deh_dir_id = par_dirid;
+ dotdot->deh_objectid = par_objid;
+ dotdot->deh_state = 0; /* Endian safe if 0 */
+ put_deh_location(dotdot, deh_location(dot) - ROUND_UP(strlen("..")));
+ mark_de_visible(dotdot);
/* copy ".." and "." */
- memcpy(body + deh_location(&(deh[0])), ".", 1);
- memcpy(body + deh_location(&(deh[1])), "..", 2);
+ memcpy(body + deh_location(dot), ".", 1);
+ memcpy(body + deh_location(dotdot), "..", 2);
}
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 2b7882b508d..54fdf196bfb 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -2,18 +2,13 @@
* Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
*/
-/* Now we have all buffers that must be used in balancing of the tree */
-/* Further calculations can not cause schedule(), and thus the buffer */
-/* tree will be stable until the balancing will be finished */
-/* balance the tree according to the analysis made before, */
-/* and using buffers obtained after all above. */
-
-/**
- ** balance_leaf_when_delete
- ** balance_leaf
- ** do_balance
- **
- **/
+/*
+ * Now we have all buffers that must be used in balancing of the tree
+ * Further calculations can not cause schedule(), and thus the buffer
+ * tree will be stable until the balancing will be finished
+ * balance the tree according to the analysis made before,
+ * and using buffers obtained after all above.
+ */
#include <asm/uaccess.h>
#include <linux/time.h>
@@ -61,48 +56,190 @@ static inline void buffer_info_init_bh(struct tree_balance *tb,
inline void do_balance_mark_leaf_dirty(struct tree_balance *tb,
struct buffer_head *bh, int flag)
{
- journal_mark_dirty(tb->transaction_handle,
- tb->transaction_handle->t_super, bh);
+ journal_mark_dirty(tb->transaction_handle, bh);
}
#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty
-/* summary:
- if deleting something ( tb->insert_size[0] < 0 )
- return(balance_leaf_when_delete()); (flag d handled here)
- else
- if lnum is larger than 0 we put items into the left node
- if rnum is larger than 0 we put items into the right node
- if snum1 is larger than 0 we put items into the new node s1
- if snum2 is larger than 0 we put items into the new node s2
-Note that all *num* count new items being created.
-
-It would be easier to read balance_leaf() if each of these summary
-lines was a separate procedure rather than being inlined. I think
-that there are many passages here and in balance_leaf_when_delete() in
-which two calls to one procedure can replace two passages, and it
-might save cache space and improve software maintenance costs to do so.
-
-Vladimir made the perceptive comment that we should offload most of
-the decision making in this function into fix_nodes/check_balance, and
-then create some sort of structure in tb that says what actions should
-be performed by do_balance.
-
--Hans */
-
-/* Balance leaf node in case of delete or cut: insert_size[0] < 0
+/*
+ * summary:
+ * if deleting something ( tb->insert_size[0] < 0 )
+ * return(balance_leaf_when_delete()); (flag d handled here)
+ * else
+ * if lnum is larger than 0 we put items into the left node
+ * if rnum is larger than 0 we put items into the right node
+ * if snum1 is larger than 0 we put items into the new node s1
+ * if snum2 is larger than 0 we put items into the new node s2
+ * Note that all *num* count new items being created.
+ */
+
+static void balance_leaf_when_delete_del(struct tree_balance *tb)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int item_pos = PATH_LAST_POSITION(tb->tb_path);
+ struct buffer_info bi;
+#ifdef CONFIG_REISERFS_CHECK
+ struct item_head *ih = item_head(tbS0, item_pos);
+#endif
+
+ RFALSE(ih_item_len(ih) + IH_SIZE != -tb->insert_size[0],
+ "vs-12013: mode Delete, insert size %d, ih to be deleted %h",
+ -tb->insert_size[0], ih);
+
+ buffer_info_init_tbS0(tb, &bi);
+ leaf_delete_items(&bi, 0, item_pos, 1, -1);
+
+ if (!item_pos && tb->CFL[0]) {
+ if (B_NR_ITEMS(tbS0)) {
+ replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
+ } else {
+ if (!PATH_H_POSITION(tb->tb_path, 1))
+ replace_key(tb, tb->CFL[0], tb->lkey[0],
+ PATH_H_PPARENT(tb->tb_path, 0), 0);
+ }
+ }
+
+ RFALSE(!item_pos && !tb->CFL[0],
+ "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0],
+ tb->L[0]);
+}
+
+/* cut item in S[0] */
+static void balance_leaf_when_delete_cut(struct tree_balance *tb)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int item_pos = PATH_LAST_POSITION(tb->tb_path);
+ struct item_head *ih = item_head(tbS0, item_pos);
+ int pos_in_item = tb->tb_path->pos_in_item;
+ struct buffer_info bi;
+ buffer_info_init_tbS0(tb, &bi);
+
+ if (is_direntry_le_ih(ih)) {
+ /*
+ * UFS unlink semantics are such that you can only
+ * delete one directory entry at a time.
+ *
+ * when we cut a directory tb->insert_size[0] means
+ * number of entries to be cut (always 1)
+ */
+ tb->insert_size[0] = -1;
+ leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
+ -tb->insert_size[0]);
+
+ RFALSE(!item_pos && !pos_in_item && !tb->CFL[0],
+ "PAP-12030: can not change delimiting key. CFL[0]=%p",
+ tb->CFL[0]);
+
+ if (!item_pos && !pos_in_item && tb->CFL[0])
+ replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
+ } else {
+ leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
+ -tb->insert_size[0]);
+
+ RFALSE(!ih_item_len(ih),
+ "PAP-12035: cut must leave non-zero dynamic "
+ "length of item");
+ }
+}
+
+static int balance_leaf_when_delete_left(struct tree_balance *tb)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int n = B_NR_ITEMS(tbS0);
+
+ /* L[0] must be joined with S[0] */
+ if (tb->lnum[0] == -1) {
+ /* R[0] must be also joined with S[0] */
+ if (tb->rnum[0] == -1) {
+ if (tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0)) {
+ /*
+ * all contents of all the
+ * 3 buffers will be in L[0]
+ */
+ if (PATH_H_POSITION(tb->tb_path, 1) == 0 &&
+ 1 < B_NR_ITEMS(tb->FR[0]))
+ replace_key(tb, tb->CFL[0],
+ tb->lkey[0], tb->FR[0], 1);
+
+ leaf_move_items(LEAF_FROM_S_TO_L, tb, n, -1,
+ NULL);
+ leaf_move_items(LEAF_FROM_R_TO_L, tb,
+ B_NR_ITEMS(tb->R[0]), -1,
+ NULL);
+
+ reiserfs_invalidate_buffer(tb, tbS0);
+ reiserfs_invalidate_buffer(tb, tb->R[0]);
+
+ return 0;
+ }
+
+ /* all contents of all the 3 buffers will be in R[0] */
+ leaf_move_items(LEAF_FROM_S_TO_R, tb, n, -1, NULL);
+ leaf_move_items(LEAF_FROM_L_TO_R, tb,
+ B_NR_ITEMS(tb->L[0]), -1, NULL);
+
+ /* right_delimiting_key is correct in R[0] */
+ replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
+
+ reiserfs_invalidate_buffer(tb, tbS0);
+ reiserfs_invalidate_buffer(tb, tb->L[0]);
+
+ return -1;
+ }
+
+ RFALSE(tb->rnum[0] != 0,
+ "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]);
+ /* all contents of L[0] and S[0] will be in L[0] */
+ leaf_shift_left(tb, n, -1);
+
+ reiserfs_invalidate_buffer(tb, tbS0);
+
+ return 0;
+ }
+
+ /*
+ * a part of contents of S[0] will be in L[0] and
+ * the rest part of S[0] will be in R[0]
+ */
+
+ RFALSE((tb->lnum[0] + tb->rnum[0] < n) ||
+ (tb->lnum[0] + tb->rnum[0] > n + 1),
+ "PAP-12050: rnum(%d) and lnum(%d) and item "
+ "number(%d) in S[0] are not consistent",
+ tb->rnum[0], tb->lnum[0], n);
+ RFALSE((tb->lnum[0] + tb->rnum[0] == n) &&
+ (tb->lbytes != -1 || tb->rbytes != -1),
+ "PAP-12055: bad rbytes (%d)/lbytes (%d) "
+ "parameters when items are not split",
+ tb->rbytes, tb->lbytes);
+ RFALSE((tb->lnum[0] + tb->rnum[0] == n + 1) &&
+ (tb->lbytes < 1 || tb->rbytes != -1),
+ "PAP-12060: bad rbytes (%d)/lbytes (%d) "
+ "parameters when items are split",
+ tb->rbytes, tb->lbytes);
+
+ leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+ leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
+
+ reiserfs_invalidate_buffer(tb, tbS0);
+
+ return 0;
+}
+
+/*
+ * Balance leaf node in case of delete or cut: insert_size[0] < 0
*
* lnum, rnum can have values >= -1
* -1 means that the neighbor must be joined with S
* 0 means that nothing should be done with the neighbor
- * >0 means to shift entirely or partly the specified number of items to the neighbor
+ * >0 means to shift entirely or partly the specified number of items
+ * to the neighbor
*/
static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
{
struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
int item_pos = PATH_LAST_POSITION(tb->tb_path);
- int pos_in_item = tb->tb_path->pos_in_item;
struct buffer_info bi;
int n;
struct item_head *ih;
@@ -114,1527 +251,1202 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0),
"PAP-12010: tree can not be empty");
- ih = B_N_PITEM_HEAD(tbS0, item_pos);
+ ih = item_head(tbS0, item_pos);
buffer_info_init_tbS0(tb, &bi);
/* Delete or truncate the item */
- switch (flag) {
- case M_DELETE: /* delete item in S[0] */
+ BUG_ON(flag != M_DELETE && flag != M_CUT);
+ if (flag == M_DELETE)
+ balance_leaf_when_delete_del(tb);
+ else /* M_CUT */
+ balance_leaf_when_delete_cut(tb);
- RFALSE(ih_item_len(ih) + IH_SIZE != -tb->insert_size[0],
- "vs-12013: mode Delete, insert size %d, ih to be deleted %h",
- -tb->insert_size[0], ih);
- leaf_delete_items(&bi, 0, item_pos, 1, -1);
+ /*
+ * the rule is that no shifting occurs unless by shifting
+ * a node can be freed
+ */
+ n = B_NR_ITEMS(tbS0);
- if (!item_pos && tb->CFL[0]) {
- if (B_NR_ITEMS(tbS0)) {
- replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0,
- 0);
- } else {
- if (!PATH_H_POSITION(tb->tb_path, 1))
- replace_key(tb, tb->CFL[0], tb->lkey[0],
- PATH_H_PPARENT(tb->tb_path,
- 0), 0);
- }
- }
- RFALSE(!item_pos && !tb->CFL[0],
- "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0],
- tb->L[0]);
+ /* L[0] takes part in balancing */
+ if (tb->lnum[0])
+ return balance_leaf_when_delete_left(tb);
+
+ if (tb->rnum[0] == -1) {
+ /* all contents of R[0] and S[0] will be in R[0] */
+ leaf_shift_right(tb, n, -1);
+ reiserfs_invalidate_buffer(tb, tbS0);
+ return 0;
+ }
+
+ RFALSE(tb->rnum[0],
+ "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]);
+ return 0;
+}
- break;
+static void balance_leaf_insert_left(struct tree_balance *tb,
+ struct item_head *ih, const char *body)
+{
+ int ret;
+ struct buffer_info bi;
+ int n = B_NR_ITEMS(tb->L[0]);
+
+ if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
+ /* part of new item falls into L[0] */
+ int new_item_len, shift;
+ int version;
+
+ ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1);
+
+ /* Calculate item length to insert to S[0] */
+ new_item_len = ih_item_len(ih) - tb->lbytes;
+
+ /* Calculate and check item length to insert to L[0] */
+ put_ih_item_len(ih, ih_item_len(ih) - new_item_len);
+
+ RFALSE(ih_item_len(ih) <= 0,
+ "PAP-12080: there is nothing to insert into L[0]: "
+ "ih_item_len=%d", ih_item_len(ih));
+
+ /* Insert new item into L[0] */
+ buffer_info_init_left(tb, &bi);
+ leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body,
+ min_t(int, tb->zeroes_num, ih_item_len(ih)));
+
+ version = ih_version(ih);
+
+ /*
+ * Calculate key component, item length and body to
+ * insert into S[0]
+ */
+ shift = 0;
+ if (is_indirect_le_ih(ih))
+ shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
+
+ add_le_ih_k_offset(ih, tb->lbytes << shift);
+
+ put_ih_item_len(ih, new_item_len);
+ if (tb->lbytes > tb->zeroes_num) {
+ body += (tb->lbytes - tb->zeroes_num);
+ tb->zeroes_num = 0;
+ } else
+ tb->zeroes_num -= tb->lbytes;
+
+ RFALSE(ih_item_len(ih) <= 0,
+ "PAP-12085: there is nothing to insert into S[0]: "
+ "ih_item_len=%d", ih_item_len(ih));
+ } else {
+ /* new item in whole falls into L[0] */
+ /* Shift lnum[0]-1 items to L[0] */
+ ret = leaf_shift_left(tb, tb->lnum[0] - 1, tb->lbytes);
+
+ /* Insert new item into L[0] */
+ buffer_info_init_left(tb, &bi);
+ leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body,
+ tb->zeroes_num);
+ tb->insert_size[0] = 0;
+ tb->zeroes_num = 0;
+ }
+}
- case M_CUT:{ /* cut item in S[0] */
- if (is_direntry_le_ih(ih)) {
+static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body)
+{
+ int n = B_NR_ITEMS(tb->L[0]);
+ struct buffer_info bi;
- /* UFS unlink semantics are such that you can only delete one directory entry at a time. */
- /* when we cut a directory tb->insert_size[0] means number of entries to be cut (always 1) */
- tb->insert_size[0] = -1;
- leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
- -tb->insert_size[0]);
+ RFALSE(tb->zeroes_num,
+ "PAP-12090: invalid parameter in case of a directory");
+
+ /* directory item */
+ if (tb->lbytes > tb->pos_in_item) {
+ /* new directory entry falls into L[0] */
+ struct item_head *pasted;
+ int ret, l_pos_in_item = tb->pos_in_item;
+
+ /*
+ * Shift lnum[0] - 1 items in whole.
+ * Shift lbytes - 1 entries from given directory item
+ */
+ ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes - 1);
+ if (ret && !tb->item_pos) {
+ pasted = item_head(tb->L[0], B_NR_ITEMS(tb->L[0]) - 1);
+ l_pos_in_item += ih_entry_count(pasted) -
+ (tb->lbytes - 1);
+ }
- RFALSE(!item_pos && !pos_in_item && !tb->CFL[0],
- "PAP-12030: can not change delimiting key. CFL[0]=%p",
- tb->CFL[0]);
+ /* Append given directory entry to directory item */
+ buffer_info_init_left(tb, &bi);
+ leaf_paste_in_buffer(&bi, n + tb->item_pos - ret,
+ l_pos_in_item, tb->insert_size[0],
+ body, tb->zeroes_num);
+
+ /*
+ * previous string prepared space for pasting new entry,
+ * following string pastes this entry
+ */
+
+ /*
+ * when we have merge directory item, pos_in_item
+ * has been changed too
+ */
+
+ /* paste new directory entry. 1 is entry number */
+ leaf_paste_entries(&bi, n + tb->item_pos - ret,
+ l_pos_in_item, 1,
+ (struct reiserfs_de_head *) body,
+ body + DEH_SIZE, tb->insert_size[0]);
+ tb->insert_size[0] = 0;
+ } else {
+ /* new directory item doesn't fall into L[0] */
+ /*
+ * Shift lnum[0]-1 items in whole. Shift lbytes
+ * directory entries from directory item number lnum[0]
+ */
+ leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+ }
- if (!item_pos && !pos_in_item && tb->CFL[0]) {
- replace_key(tb, tb->CFL[0], tb->lkey[0],
- tbS0, 0);
- }
- } else {
- leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
- -tb->insert_size[0]);
+ /* Calculate new position to append in item body */
+ tb->pos_in_item -= tb->lbytes;
+}
- RFALSE(!ih_item_len(ih),
- "PAP-12035: cut must leave non-zero dynamic length of item");
- }
- break;
+static void balance_leaf_paste_left_shift(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int n = B_NR_ITEMS(tb->L[0]);
+ struct buffer_info bi;
+
+ if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) {
+ balance_leaf_paste_left_shift_dirent(tb, ih, body);
+ return;
+ }
+
+ RFALSE(tb->lbytes <= 0,
+ "PAP-12095: there is nothing to shift to L[0]. "
+ "lbytes=%d", tb->lbytes);
+ RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)),
+ "PAP-12100: incorrect position to paste: "
+ "item_len=%d, pos_in_item=%d",
+ ih_item_len(item_head(tbS0, tb->item_pos)), tb->pos_in_item);
+
+ /* appended item will be in L[0] in whole */
+ if (tb->lbytes >= tb->pos_in_item) {
+ struct item_head *tbS0_pos_ih, *tbL0_ih;
+ struct item_head *tbS0_0_ih;
+ struct reiserfs_key *left_delim_key;
+ int ret, l_n, version, temp_l;
+
+ tbS0_pos_ih = item_head(tbS0, tb->item_pos);
+ tbS0_0_ih = item_head(tbS0, 0);
+
+ /*
+ * this bytes number must be appended
+ * to the last item of L[h]
+ */
+ l_n = tb->lbytes - tb->pos_in_item;
+
+ /* Calculate new insert_size[0] */
+ tb->insert_size[0] -= l_n;
+
+ RFALSE(tb->insert_size[0] <= 0,
+ "PAP-12105: there is nothing to paste into "
+ "L[0]. insert_size=%d", tb->insert_size[0]);
+
+ ret = leaf_shift_left(tb, tb->lnum[0],
+ ih_item_len(tbS0_pos_ih));
+
+ tbL0_ih = item_head(tb->L[0], n + tb->item_pos - ret);
+
+ /* Append to body of item in L[0] */
+ buffer_info_init_left(tb, &bi);
+ leaf_paste_in_buffer(&bi, n + tb->item_pos - ret,
+ ih_item_len(tbL0_ih), l_n, body,
+ min_t(int, l_n, tb->zeroes_num));
+
+ /*
+ * 0-th item in S0 can be only of DIRECT type
+ * when l_n != 0
+ */
+ temp_l = l_n;
+
+ RFALSE(ih_item_len(tbS0_0_ih),
+ "PAP-12106: item length must be 0");
+ RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key,
+ leaf_key(tb->L[0], n + tb->item_pos - ret)),
+ "PAP-12107: items must be of the same file");
+
+ if (is_indirect_le_ih(tbL0_ih)) {
+ int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
+ temp_l = l_n << shift;
}
+ /* update key of first item in S0 */
+ version = ih_version(tbS0_0_ih);
+ add_le_key_k_offset(version, &tbS0_0_ih->ih_key, temp_l);
+
+ /* update left delimiting key */
+ left_delim_key = internal_key(tb->CFL[0], tb->lkey[0]);
+ add_le_key_k_offset(version, left_delim_key, temp_l);
+
+ /*
+ * Calculate new body, position in item and
+ * insert_size[0]
+ */
+ if (l_n > tb->zeroes_num) {
+ body += (l_n - tb->zeroes_num);
+ tb->zeroes_num = 0;
+ } else
+ tb->zeroes_num -= l_n;
+ tb->pos_in_item = 0;
+
+ RFALSE(comp_short_le_keys(&tbS0_0_ih->ih_key,
+ leaf_key(tb->L[0],
+ B_NR_ITEMS(tb->L[0]) - 1)) ||
+ !op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size) ||
+ !op_is_left_mergeable(left_delim_key, tbS0->b_size),
+ "PAP-12120: item must be merge-able with left "
+ "neighboring item");
+ } else {
+ /* only part of the appended item will be in L[0] */
+
+ /* Calculate position in item for append in S[0] */
+ tb->pos_in_item -= tb->lbytes;
+
+ RFALSE(tb->pos_in_item <= 0,
+ "PAP-12125: no place for paste. pos_in_item=%d",
+ tb->pos_in_item);
+
+ /*
+ * Shift lnum[0] - 1 items in whole.
+ * Shift lbytes - 1 byte from item number lnum[0]
+ */
+ leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+ }
+}
+
- default:
- print_cur_tb("12040");
- reiserfs_panic(tb->tb_sb, "PAP-12040",
- "unexpected mode: %s(%d)",
- (flag ==
- M_PASTE) ? "PASTE" : ((flag ==
- M_INSERT) ? "INSERT" :
- "UNKNOWN"), flag);
+/* appended item will be in L[0] in whole */
+static void balance_leaf_paste_left_whole(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int n = B_NR_ITEMS(tb->L[0]);
+ struct buffer_info bi;
+ struct item_head *pasted;
+ int ret;
+
+ /* if we paste into first item of S[0] and it is left mergable */
+ if (!tb->item_pos &&
+ op_is_left_mergeable(leaf_key(tbS0, 0), tbS0->b_size)) {
+ /*
+ * then increment pos_in_item by the size of the
+ * last item in L[0]
+ */
+ pasted = item_head(tb->L[0], n - 1);
+ if (is_direntry_le_ih(pasted))
+ tb->pos_in_item += ih_entry_count(pasted);
+ else
+ tb->pos_in_item += ih_item_len(pasted);
}
- /* the rule is that no shifting occurs unless by shifting a node can be freed */
- n = B_NR_ITEMS(tbS0);
- if (tb->lnum[0]) { /* L[0] takes part in balancing */
- if (tb->lnum[0] == -1) { /* L[0] must be joined with S[0] */
- if (tb->rnum[0] == -1) { /* R[0] must be also joined with S[0] */
- if (tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0)) {
- /* all contents of all the 3 buffers will be in L[0] */
- if (PATH_H_POSITION(tb->tb_path, 1) == 0
- && 1 < B_NR_ITEMS(tb->FR[0]))
- replace_key(tb, tb->CFL[0],
- tb->lkey[0],
- tb->FR[0], 1);
-
- leaf_move_items(LEAF_FROM_S_TO_L, tb, n,
- -1, NULL);
- leaf_move_items(LEAF_FROM_R_TO_L, tb,
- B_NR_ITEMS(tb->R[0]),
- -1, NULL);
-
- reiserfs_invalidate_buffer(tb, tbS0);
- reiserfs_invalidate_buffer(tb,
- tb->R[0]);
-
- return 0;
- }
- /* all contents of all the 3 buffers will be in R[0] */
- leaf_move_items(LEAF_FROM_S_TO_R, tb, n, -1,
- NULL);
- leaf_move_items(LEAF_FROM_L_TO_R, tb,
- B_NR_ITEMS(tb->L[0]), -1, NULL);
+ /*
+ * Shift lnum[0] - 1 items in whole.
+ * Shift lbytes - 1 byte from item number lnum[0]
+ */
+ ret = leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+
+ /* Append to body of item in L[0] */
+ buffer_info_init_left(tb, &bi);
+ leaf_paste_in_buffer(&bi, n + tb->item_pos - ret, tb->pos_in_item,
+ tb->insert_size[0], body, tb->zeroes_num);
+
+ /* if appended item is directory, paste entry */
+ pasted = item_head(tb->L[0], n + tb->item_pos - ret);
+ if (is_direntry_le_ih(pasted))
+ leaf_paste_entries(&bi, n + tb->item_pos - ret,
+ tb->pos_in_item, 1,
+ (struct reiserfs_de_head *)body,
+ body + DEH_SIZE, tb->insert_size[0]);
+
+ /*
+ * if appended item is indirect item, put unformatted node
+ * into un list
+ */
+ if (is_indirect_le_ih(pasted))
+ set_ih_free_space(pasted, 0);
- /* right_delimiting_key is correct in R[0] */
- replace_key(tb, tb->CFR[0], tb->rkey[0],
- tb->R[0], 0);
+ tb->insert_size[0] = 0;
+ tb->zeroes_num = 0;
+}
- reiserfs_invalidate_buffer(tb, tbS0);
- reiserfs_invalidate_buffer(tb, tb->L[0]);
+static void balance_leaf_paste_left(struct tree_balance *tb,
+ struct item_head *ih, const char *body)
+{
+ /* we must shift the part of the appended item */
+ if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1)
+ balance_leaf_paste_left_shift(tb, ih, body);
+ else
+ balance_leaf_paste_left_whole(tb, ih, body);
+}
- return -1;
- }
+/* Shift lnum[0] items from S[0] to the left neighbor L[0] */
+static void balance_leaf_left(struct tree_balance *tb, struct item_head *ih,
+ const char *body, int flag)
+{
+ if (tb->lnum[0] <= 0)
+ return;
- RFALSE(tb->rnum[0] != 0,
- "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]);
- /* all contents of L[0] and S[0] will be in L[0] */
- leaf_shift_left(tb, n, -1);
+ /* new item or it part falls to L[0], shift it too */
+ if (tb->item_pos < tb->lnum[0]) {
+ BUG_ON(flag != M_INSERT && flag != M_PASTE);
- reiserfs_invalidate_buffer(tb, tbS0);
+ if (flag == M_INSERT)
+ balance_leaf_insert_left(tb, ih, body);
+ else /* M_PASTE */
+ balance_leaf_paste_left(tb, ih, body);
+ } else
+ /* new item doesn't fall into L[0] */
+ leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+}
+
+
+static void balance_leaf_insert_right(struct tree_balance *tb,
+ struct item_head *ih, const char *body)
+{
+
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int n = B_NR_ITEMS(tbS0);
+ struct buffer_info bi;
+ int ret;
- return 0;
+ /* new item or part of it doesn't fall into R[0] */
+ if (n - tb->rnum[0] >= tb->item_pos) {
+ leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
+ return;
+ }
+
+ /* new item or its part falls to R[0] */
+
+ /* part of new item falls into R[0] */
+ if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) {
+ loff_t old_key_comp, old_len, r_zeroes_number;
+ const char *r_body;
+ int version, shift;
+ loff_t offset;
+
+ leaf_shift_right(tb, tb->rnum[0] - 1, -1);
+
+ version = ih_version(ih);
+
+ /* Remember key component and item length */
+ old_key_comp = le_ih_k_offset(ih);
+ old_len = ih_item_len(ih);
+
+ /*
+ * Calculate key component and item length to insert
+ * into R[0]
+ */
+ shift = 0;
+ if (is_indirect_le_ih(ih))
+ shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
+ offset = le_ih_k_offset(ih) + ((old_len - tb->rbytes) << shift);
+ set_le_ih_k_offset(ih, offset);
+ put_ih_item_len(ih, tb->rbytes);
+
+ /* Insert part of the item into R[0] */
+ buffer_info_init_right(tb, &bi);
+ if ((old_len - tb->rbytes) > tb->zeroes_num) {
+ r_zeroes_number = 0;
+ r_body = body + (old_len - tb->rbytes) - tb->zeroes_num;
+ } else {
+ r_body = body;
+ r_zeroes_number = tb->zeroes_num -
+ (old_len - tb->rbytes);
+ tb->zeroes_num -= r_zeroes_number;
}
- /* a part of contents of S[0] will be in L[0] and the rest part of S[0] will be in R[0] */
-
- RFALSE((tb->lnum[0] + tb->rnum[0] < n) ||
- (tb->lnum[0] + tb->rnum[0] > n + 1),
- "PAP-12050: rnum(%d) and lnum(%d) and item number(%d) in S[0] are not consistent",
- tb->rnum[0], tb->lnum[0], n);
- RFALSE((tb->lnum[0] + tb->rnum[0] == n) &&
- (tb->lbytes != -1 || tb->rbytes != -1),
- "PAP-12055: bad rbytes (%d)/lbytes (%d) parameters when items are not split",
- tb->rbytes, tb->lbytes);
- RFALSE((tb->lnum[0] + tb->rnum[0] == n + 1) &&
- (tb->lbytes < 1 || tb->rbytes != -1),
- "PAP-12060: bad rbytes (%d)/lbytes (%d) parameters when items are split",
- tb->rbytes, tb->lbytes);
- leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+ leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number);
+
+ /* Replace right delimiting key by first key in R[0] */
+ replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
+
+ /*
+ * Calculate key component and item length to
+ * insert into S[0]
+ */
+ set_le_ih_k_offset(ih, old_key_comp);
+ put_ih_item_len(ih, old_len - tb->rbytes);
+
+ tb->insert_size[0] -= tb->rbytes;
+
+ } else {
+ /* whole new item falls into R[0] */
+
+ /* Shift rnum[0]-1 items to R[0] */
+ ret = leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes);
+
+ /* Insert new item into R[0] */
+ buffer_info_init_right(tb, &bi);
+ leaf_insert_into_buf(&bi, tb->item_pos - n + tb->rnum[0] - 1,
+ ih, body, tb->zeroes_num);
+
+ if (tb->item_pos - n + tb->rnum[0] - 1 == 0)
+ replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
+
+ tb->zeroes_num = tb->insert_size[0] = 0;
+ }
+}
+
+
+static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb,
+ struct item_head *ih, const char *body)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ struct buffer_info bi;
+ int entry_count;
+
+ RFALSE(tb->zeroes_num,
+ "PAP-12145: invalid parameter in case of a directory");
+ entry_count = ih_entry_count(item_head(tbS0, tb->item_pos));
+
+ /* new directory entry falls into R[0] */
+ if (entry_count - tb->rbytes < tb->pos_in_item) {
+ int paste_entry_position;
+
+ RFALSE(tb->rbytes - 1 >= entry_count || !tb->insert_size[0],
+ "PAP-12150: no enough of entries to shift to R[0]: "
+ "rbytes=%d, entry_count=%d", tb->rbytes, entry_count);
+
+ /*
+ * Shift rnum[0]-1 items in whole.
+ * Shift rbytes-1 directory entries from directory
+ * item number rnum[0]
+ */
+ leaf_shift_right(tb, tb->rnum[0], tb->rbytes - 1);
+
+ /* Paste given directory entry to directory item */
+ paste_entry_position = tb->pos_in_item - entry_count +
+ tb->rbytes - 1;
+ buffer_info_init_right(tb, &bi);
+ leaf_paste_in_buffer(&bi, 0, paste_entry_position,
+ tb->insert_size[0], body, tb->zeroes_num);
+
+ /* paste entry */
+ leaf_paste_entries(&bi, 0, paste_entry_position, 1,
+ (struct reiserfs_de_head *) body,
+ body + DEH_SIZE, tb->insert_size[0]);
+
+ /* change delimiting keys */
+ if (paste_entry_position == 0)
+ replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
+
+ tb->insert_size[0] = 0;
+ tb->pos_in_item++;
+ } else {
+ /* new directory entry doesn't fall into R[0] */
leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
+ }
+}
- reiserfs_invalidate_buffer(tb, tbS0);
+static void balance_leaf_paste_right_shift(struct tree_balance *tb,
+ struct item_head *ih, const char *body)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int n_shift, n_rem, r_zeroes_number, version;
+ unsigned long temp_rem;
+ const char *r_body;
+ struct buffer_info bi;
- return 0;
+ /* we append to directory item */
+ if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) {
+ balance_leaf_paste_right_shift_dirent(tb, ih, body);
+ return;
}
- if (tb->rnum[0] == -1) {
- /* all contents of R[0] and S[0] will be in R[0] */
- leaf_shift_right(tb, n, -1);
- reiserfs_invalidate_buffer(tb, tbS0);
- return 0;
+ /* regular object */
+
+ /*
+ * Calculate number of bytes which must be shifted
+ * from appended item
+ */
+ n_shift = tb->rbytes - tb->insert_size[0];
+ if (n_shift < 0)
+ n_shift = 0;
+
+ RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)),
+ "PAP-12155: invalid position to paste. ih_item_len=%d, "
+ "pos_in_item=%d", tb->pos_in_item,
+ ih_item_len(item_head(tbS0, tb->item_pos)));
+
+ leaf_shift_right(tb, tb->rnum[0], n_shift);
+
+ /*
+ * Calculate number of bytes which must remain in body
+ * after appending to R[0]
+ */
+ n_rem = tb->insert_size[0] - tb->rbytes;
+ if (n_rem < 0)
+ n_rem = 0;
+
+ temp_rem = n_rem;
+
+ version = ih_version(item_head(tb->R[0], 0));
+
+ if (is_indirect_le_key(version, leaf_key(tb->R[0], 0))) {
+ int shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
+ temp_rem = n_rem << shift;
}
- RFALSE(tb->rnum[0],
- "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]);
- return 0;
+ add_le_key_k_offset(version, leaf_key(tb->R[0], 0), temp_rem);
+ add_le_key_k_offset(version, internal_key(tb->CFR[0], tb->rkey[0]),
+ temp_rem);
+
+ do_balance_mark_internal_dirty(tb, tb->CFR[0], 0);
+
+ /* Append part of body into R[0] */
+ buffer_info_init_right(tb, &bi);
+ if (n_rem > tb->zeroes_num) {
+ r_zeroes_number = 0;
+ r_body = body + n_rem - tb->zeroes_num;
+ } else {
+ r_body = body;
+ r_zeroes_number = tb->zeroes_num - n_rem;
+ tb->zeroes_num -= r_zeroes_number;
+ }
+
+ leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem,
+ r_body, r_zeroes_number);
+
+ if (is_indirect_le_ih(item_head(tb->R[0], 0)))
+ set_ih_free_space(item_head(tb->R[0], 0), 0);
+
+ tb->insert_size[0] = n_rem;
+ if (!n_rem)
+ tb->pos_in_item++;
}
-static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item header of inserted item (this is on little endian) */
- const char *body, /* body of inserted item or bytes to paste */
- int flag, /* i - insert, d - delete, c - cut, p - paste
- (see comment to do_balance) */
- struct item_head *insert_key, /* in our processing of one level we sometimes determine what
- must be inserted into the next higher level. This insertion
- consists of a key or two keys and their corresponding
- pointers */
- struct buffer_head **insert_ptr /* inserted node-ptrs for the next level */
- )
+static void balance_leaf_paste_right_whole(struct tree_balance *tb,
+ struct item_head *ih, const char *body)
{
struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
- int item_pos = PATH_LAST_POSITION(tb->tb_path); /* index into the array of item headers in S[0]
- of the affected item */
+ int n = B_NR_ITEMS(tbS0);
+ struct item_head *pasted;
struct buffer_info bi;
- struct buffer_head *S_new[2]; /* new nodes allocated to hold what could not fit into S */
- int snum[2]; /* number of items that will be placed
- into S_new (includes partially shifted
- items) */
- int sbytes[2]; /* if an item is partially shifted into S_new then
- if it is a directory item
- it is the number of entries from the item that are shifted into S_new
- else
- it is the number of bytes from the item that are shifted into S_new
- */
- int n, i;
- int ret_val;
- int pos_in_item;
- int zeros_num;
- PROC_INFO_INC(tb->tb_sb, balance_at[0]);
+ buffer_info_init_right(tb, &bi);
+ leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
+
+ /* append item in R[0] */
+ if (tb->pos_in_item >= 0) {
+ buffer_info_init_right(tb, &bi);
+ leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->rnum[0],
+ tb->pos_in_item, tb->insert_size[0], body,
+ tb->zeroes_num);
+ }
- /* Make balance in case insert_size[0] < 0 */
- if (tb->insert_size[0] < 0)
- return balance_leaf_when_delete(tb, flag);
+ /* paste new entry, if item is directory item */
+ pasted = item_head(tb->R[0], tb->item_pos - n + tb->rnum[0]);
+ if (is_direntry_le_ih(pasted) && tb->pos_in_item >= 0) {
+ leaf_paste_entries(&bi, tb->item_pos - n + tb->rnum[0],
+ tb->pos_in_item, 1,
+ (struct reiserfs_de_head *)body,
+ body + DEH_SIZE, tb->insert_size[0]);
- zeros_num = 0;
- if (flag == M_INSERT && !body)
- zeros_num = ih_item_len(ih);
+ if (!tb->pos_in_item) {
- pos_in_item = tb->tb_path->pos_in_item;
- /* for indirect item pos_in_item is measured in unformatted node
- pointers. Recalculate to bytes */
- if (flag != M_INSERT
- && is_indirect_le_ih(B_N_PITEM_HEAD(tbS0, item_pos)))
- pos_in_item *= UNFM_P_SIZE;
-
- if (tb->lnum[0] > 0) {
- /* Shift lnum[0] items from S[0] to the left neighbor L[0] */
- if (item_pos < tb->lnum[0]) {
- /* new item or it part falls to L[0], shift it too */
- n = B_NR_ITEMS(tb->L[0]);
-
- switch (flag) {
- case M_INSERT: /* insert item into L[0] */
-
- if (item_pos == tb->lnum[0] - 1
- && tb->lbytes != -1) {
- /* part of new item falls into L[0] */
- int new_item_len;
- int version;
-
- ret_val =
- leaf_shift_left(tb, tb->lnum[0] - 1,
- -1);
-
- /* Calculate item length to insert to S[0] */
- new_item_len =
- ih_item_len(ih) - tb->lbytes;
- /* Calculate and check item length to insert to L[0] */
- put_ih_item_len(ih,
- ih_item_len(ih) -
- new_item_len);
-
- RFALSE(ih_item_len(ih) <= 0,
- "PAP-12080: there is nothing to insert into L[0]: ih_item_len=%d",
- ih_item_len(ih));
-
- /* Insert new item into L[0] */
- buffer_info_init_left(tb, &bi);
- leaf_insert_into_buf(&bi,
- n + item_pos -
- ret_val, ih, body,
- zeros_num >
- ih_item_len(ih) ?
- ih_item_len(ih) :
- zeros_num);
-
- version = ih_version(ih);
-
- /* Calculate key component, item length and body to insert into S[0] */
- set_le_ih_k_offset(ih,
- le_ih_k_offset(ih) +
- (tb->
- lbytes <<
- (is_indirect_le_ih
- (ih) ? tb->tb_sb->
- s_blocksize_bits -
- UNFM_P_SHIFT :
- 0)));
-
- put_ih_item_len(ih, new_item_len);
- if (tb->lbytes > zeros_num) {
- body +=
- (tb->lbytes - zeros_num);
- zeros_num = 0;
- } else
- zeros_num -= tb->lbytes;
-
- RFALSE(ih_item_len(ih) <= 0,
- "PAP-12085: there is nothing to insert into S[0]: ih_item_len=%d",
- ih_item_len(ih));
- } else {
- /* new item in whole falls into L[0] */
- /* Shift lnum[0]-1 items to L[0] */
- ret_val =
- leaf_shift_left(tb, tb->lnum[0] - 1,
- tb->lbytes);
- /* Insert new item into L[0] */
- buffer_info_init_left(tb, &bi);
- leaf_insert_into_buf(&bi,
- n + item_pos -
- ret_val, ih, body,
- zeros_num);
- tb->insert_size[0] = 0;
- zeros_num = 0;
- }
- break;
-
- case M_PASTE: /* append item in L[0] */
-
- if (item_pos == tb->lnum[0] - 1
- && tb->lbytes != -1) {
- /* we must shift the part of the appended item */
- if (is_direntry_le_ih
- (B_N_PITEM_HEAD(tbS0, item_pos))) {
-
- RFALSE(zeros_num,
- "PAP-12090: invalid parameter in case of a directory");
- /* directory item */
- if (tb->lbytes > pos_in_item) {
- /* new directory entry falls into L[0] */
- struct item_head
- *pasted;
- int l_pos_in_item =
- pos_in_item;
-
- /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 entries from given directory item */
- ret_val =
- leaf_shift_left(tb,
- tb->
- lnum
- [0],
- tb->
- lbytes
- -
- 1);
- if (ret_val
- && !item_pos) {
- pasted =
- B_N_PITEM_HEAD
- (tb->L[0],
- B_NR_ITEMS
- (tb->
- L[0]) -
- 1);
- l_pos_in_item +=
- I_ENTRY_COUNT
- (pasted) -
- (tb->
- lbytes -
- 1);
- }
-
- /* Append given directory entry to directory item */
- buffer_info_init_left(tb, &bi);
- leaf_paste_in_buffer
- (&bi,
- n + item_pos -
- ret_val,
- l_pos_in_item,
- tb->insert_size[0],
- body, zeros_num);
-
- /* previous string prepared space for pasting new entry, following string pastes this entry */
-
- /* when we have merge directory item, pos_in_item has been changed too */
-
- /* paste new directory entry. 1 is entry number */
- leaf_paste_entries(&bi,
- n +
- item_pos
- -
- ret_val,
- l_pos_in_item,
- 1,
- (struct
- reiserfs_de_head
- *)
- body,
- body
- +
- DEH_SIZE,
- tb->
- insert_size
- [0]
- );
- tb->insert_size[0] = 0;
- } else {
- /* new directory item doesn't fall into L[0] */
- /* Shift lnum[0]-1 items in whole. Shift lbytes directory entries from directory item number lnum[0] */
- leaf_shift_left(tb,
- tb->
- lnum[0],
- tb->
- lbytes);
- }
- /* Calculate new position to append in item body */
- pos_in_item -= tb->lbytes;
- } else {
- /* regular object */
- RFALSE(tb->lbytes <= 0,
- "PAP-12095: there is nothing to shift to L[0]. lbytes=%d",
- tb->lbytes);
- RFALSE(pos_in_item !=
- ih_item_len
- (B_N_PITEM_HEAD
- (tbS0, item_pos)),
- "PAP-12100: incorrect position to paste: item_len=%d, pos_in_item=%d",
- ih_item_len
- (B_N_PITEM_HEAD
- (tbS0, item_pos)),
- pos_in_item);
-
- if (tb->lbytes >= pos_in_item) {
- /* appended item will be in L[0] in whole */
- int l_n;
-
- /* this bytes number must be appended to the last item of L[h] */
- l_n =
- tb->lbytes -
- pos_in_item;
-
- /* Calculate new insert_size[0] */
- tb->insert_size[0] -=
- l_n;
-
- RFALSE(tb->
- insert_size[0] <=
- 0,
- "PAP-12105: there is nothing to paste into L[0]. insert_size=%d",
- tb->
- insert_size[0]);
- ret_val =
- leaf_shift_left(tb,
- tb->
- lnum
- [0],
- ih_item_len
- (B_N_PITEM_HEAD
- (tbS0,
- item_pos)));
- /* Append to body of item in L[0] */
- buffer_info_init_left(tb, &bi);
- leaf_paste_in_buffer
- (&bi,
- n + item_pos -
- ret_val,
- ih_item_len
- (B_N_PITEM_HEAD
- (tb->L[0],
- n + item_pos -
- ret_val)), l_n,
- body,
- zeros_num >
- l_n ? l_n :
- zeros_num);
- /* 0-th item in S0 can be only of DIRECT type when l_n != 0 */
- {
- int version;
- int temp_l =
- l_n;
-
- RFALSE
- (ih_item_len
- (B_N_PITEM_HEAD
- (tbS0,
- 0)),
- "PAP-12106: item length must be 0");
- RFALSE
- (comp_short_le_keys
- (B_N_PKEY
- (tbS0, 0),
- B_N_PKEY
- (tb->L[0],
- n +
- item_pos
- -
- ret_val)),
- "PAP-12107: items must be of the same file");
- if (is_indirect_le_ih(B_N_PITEM_HEAD(tb->L[0], n + item_pos - ret_val))) {
- temp_l =
- l_n
- <<
- (tb->
- tb_sb->
- s_blocksize_bits
- -
- UNFM_P_SHIFT);
- }
- /* update key of first item in S0 */
- version =
- ih_version
- (B_N_PITEM_HEAD
- (tbS0, 0));
- set_le_key_k_offset
- (version,
- B_N_PKEY
- (tbS0, 0),
- le_key_k_offset
- (version,
- B_N_PKEY
- (tbS0,
- 0)) +
- temp_l);
- /* update left delimiting key */
- set_le_key_k_offset
- (version,
- B_N_PDELIM_KEY
- (tb->
- CFL[0],
- tb->
- lkey[0]),
- le_key_k_offset
- (version,
- B_N_PDELIM_KEY
- (tb->
- CFL[0],
- tb->
- lkey[0]))
- + temp_l);
- }
-
- /* Calculate new body, position in item and insert_size[0] */
- if (l_n > zeros_num) {
- body +=
- (l_n -
- zeros_num);
- zeros_num = 0;
- } else
- zeros_num -=
- l_n;
- pos_in_item = 0;
-
- RFALSE
- (comp_short_le_keys
- (B_N_PKEY(tbS0, 0),
- B_N_PKEY(tb->L[0],
- B_NR_ITEMS
- (tb->
- L[0]) -
- 1))
- ||
- !op_is_left_mergeable
- (B_N_PKEY(tbS0, 0),
- tbS0->b_size)
- ||
- !op_is_left_mergeable
- (B_N_PDELIM_KEY
- (tb->CFL[0],
- tb->lkey[0]),
- tbS0->b_size),
- "PAP-12120: item must be merge-able with left neighboring item");
- } else { /* only part of the appended item will be in L[0] */
-
- /* Calculate position in item for append in S[0] */
- pos_in_item -=
- tb->lbytes;
-
- RFALSE(pos_in_item <= 0,
- "PAP-12125: no place for paste. pos_in_item=%d",
- pos_in_item);
-
- /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */
- leaf_shift_left(tb,
- tb->
- lnum[0],
- tb->
- lbytes);
- }
- }
- } else { /* appended item will be in L[0] in whole */
-
- struct item_head *pasted;
-
- if (!item_pos && op_is_left_mergeable(B_N_PKEY(tbS0, 0), tbS0->b_size)) { /* if we paste into first item of S[0] and it is left mergable */
- /* then increment pos_in_item by the size of the last item in L[0] */
- pasted =
- B_N_PITEM_HEAD(tb->L[0],
- n - 1);
- if (is_direntry_le_ih(pasted))
- pos_in_item +=
- ih_entry_count
- (pasted);
- else
- pos_in_item +=
- ih_item_len(pasted);
- }
-
- /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */
- ret_val =
- leaf_shift_left(tb, tb->lnum[0],
- tb->lbytes);
- /* Append to body of item in L[0] */
- buffer_info_init_left(tb, &bi);
- leaf_paste_in_buffer(&bi,
- n + item_pos -
- ret_val,
- pos_in_item,
- tb->insert_size[0],
- body, zeros_num);
-
- /* if appended item is directory, paste entry */
- pasted =
- B_N_PITEM_HEAD(tb->L[0],
- n + item_pos -
- ret_val);
- if (is_direntry_le_ih(pasted))
- leaf_paste_entries(&bi,
- n +
- item_pos -
- ret_val,
- pos_in_item,
- 1,
- (struct
- reiserfs_de_head
- *)body,
- body +
- DEH_SIZE,
- tb->
- insert_size
- [0]
- );
- /* if appended item is indirect item, put unformatted node into un list */
- if (is_indirect_le_ih(pasted))
- set_ih_free_space(pasted, 0);
- tb->insert_size[0] = 0;
- zeros_num = 0;
- }
- break;
- default: /* cases d and t */
- reiserfs_panic(tb->tb_sb, "PAP-12130",
- "lnum > 0: unexpected mode: "
- " %s(%d)",
- (flag ==
- M_DELETE) ? "DELETE" : ((flag ==
- M_CUT)
- ? "CUT"
- :
- "UNKNOWN"),
- flag);
- }
- } else {
- /* new item doesn't fall into L[0] */
- leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+ RFALSE(tb->item_pos - n + tb->rnum[0],
+ "PAP-12165: directory item must be first "
+ "item of node when pasting is in 0th position");
+
+ /* update delimiting keys */
+ replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
}
}
- /* tb->lnum[0] > 0 */
- /* Calculate new item position */
- item_pos -= (tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0));
-
- if (tb->rnum[0] > 0) {
- /* shift rnum[0] items from S[0] to the right neighbor R[0] */
- n = B_NR_ITEMS(tbS0);
- switch (flag) {
-
- case M_INSERT: /* insert item */
- if (n - tb->rnum[0] < item_pos) { /* new item or its part falls to R[0] */
- if (item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) { /* part of new item falls into R[0] */
- loff_t old_key_comp, old_len,
- r_zeros_number;
- const char *r_body;
- int version;
- loff_t offset;
-
- leaf_shift_right(tb, tb->rnum[0] - 1,
- -1);
-
- version = ih_version(ih);
- /* Remember key component and item length */
- old_key_comp = le_ih_k_offset(ih);
- old_len = ih_item_len(ih);
-
- /* Calculate key component and item length to insert into R[0] */
- offset =
- le_ih_k_offset(ih) +
- ((old_len -
- tb->
- rbytes) << (is_indirect_le_ih(ih)
- ? tb->tb_sb->
- s_blocksize_bits -
- UNFM_P_SHIFT : 0));
- set_le_ih_k_offset(ih, offset);
- put_ih_item_len(ih, tb->rbytes);
- /* Insert part of the item into R[0] */
- buffer_info_init_right(tb, &bi);
- if ((old_len - tb->rbytes) > zeros_num) {
- r_zeros_number = 0;
- r_body =
- body + (old_len -
- tb->rbytes) -
- zeros_num;
- } else {
- r_body = body;
- r_zeros_number =
- zeros_num - (old_len -
- tb->rbytes);
- zeros_num -= r_zeros_number;
- }
-
- leaf_insert_into_buf(&bi, 0, ih, r_body,
- r_zeros_number);
-
- /* Replace right delimiting key by first key in R[0] */
- replace_key(tb, tb->CFR[0], tb->rkey[0],
- tb->R[0], 0);
-
- /* Calculate key component and item length to insert into S[0] */
- set_le_ih_k_offset(ih, old_key_comp);
- put_ih_item_len(ih,
- old_len - tb->rbytes);
-
- tb->insert_size[0] -= tb->rbytes;
-
- } else { /* whole new item falls into R[0] */
-
- /* Shift rnum[0]-1 items to R[0] */
- ret_val =
- leaf_shift_right(tb,
- tb->rnum[0] - 1,
- tb->rbytes);
- /* Insert new item into R[0] */
- buffer_info_init_right(tb, &bi);
- leaf_insert_into_buf(&bi,
- item_pos - n +
- tb->rnum[0] - 1,
- ih, body,
- zeros_num);
-
- if (item_pos - n + tb->rnum[0] - 1 == 0) {
- replace_key(tb, tb->CFR[0],
- tb->rkey[0],
- tb->R[0], 0);
-
- }
- zeros_num = tb->insert_size[0] = 0;
- }
- } else { /* new item or part of it doesn't fall into R[0] */
-
- leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
- }
- break;
+ if (is_indirect_le_ih(pasted))
+ set_ih_free_space(pasted, 0);
+ tb->zeroes_num = tb->insert_size[0] = 0;
+}
- case M_PASTE: /* append item */
-
- if (n - tb->rnum[0] <= item_pos) { /* pasted item or part of it falls to R[0] */
- if (item_pos == n - tb->rnum[0] && tb->rbytes != -1) { /* we must shift the part of the appended item */
- if (is_direntry_le_ih(B_N_PITEM_HEAD(tbS0, item_pos))) { /* we append to directory item */
- int entry_count;
-
- RFALSE(zeros_num,
- "PAP-12145: invalid parameter in case of a directory");
- entry_count =
- I_ENTRY_COUNT(B_N_PITEM_HEAD
- (tbS0,
- item_pos));
- if (entry_count - tb->rbytes <
- pos_in_item)
- /* new directory entry falls into R[0] */
- {
- int paste_entry_position;
-
- RFALSE(tb->rbytes - 1 >=
- entry_count
- || !tb->
- insert_size[0],
- "PAP-12150: no enough of entries to shift to R[0]: rbytes=%d, entry_count=%d",
- tb->rbytes,
- entry_count);
- /* Shift rnum[0]-1 items in whole. Shift rbytes-1 directory entries from directory item number rnum[0] */
- leaf_shift_right(tb,
- tb->
- rnum
- [0],
- tb->
- rbytes
- - 1);
- /* Paste given directory entry to directory item */
- paste_entry_position =
- pos_in_item -
- entry_count +
- tb->rbytes - 1;
- buffer_info_init_right(tb, &bi);
- leaf_paste_in_buffer
- (&bi, 0,
- paste_entry_position,
- tb->insert_size[0],
- body, zeros_num);
- /* paste entry */
- leaf_paste_entries(&bi,
- 0,
- paste_entry_position,
- 1,
- (struct
- reiserfs_de_head
- *)
- body,
- body
- +
- DEH_SIZE,
- tb->
- insert_size
- [0]
- );
-
- if (paste_entry_position
- == 0) {
- /* change delimiting keys */
- replace_key(tb,
- tb->
- CFR
- [0],
- tb->
- rkey
- [0],
- tb->
- R
- [0],
- 0);
- }
-
- tb->insert_size[0] = 0;
- pos_in_item++;
- } else { /* new directory entry doesn't fall into R[0] */
-
- leaf_shift_right(tb,
- tb->
- rnum
- [0],
- tb->
- rbytes);
- }
- } else { /* regular object */
-
- int n_shift, n_rem,
- r_zeros_number;
- const char *r_body;
-
- /* Calculate number of bytes which must be shifted from appended item */
- if ((n_shift =
- tb->rbytes -
- tb->insert_size[0]) < 0)
- n_shift = 0;
-
- RFALSE(pos_in_item !=
- ih_item_len
- (B_N_PITEM_HEAD
- (tbS0, item_pos)),
- "PAP-12155: invalid position to paste. ih_item_len=%d, pos_in_item=%d",
- pos_in_item,
- ih_item_len
- (B_N_PITEM_HEAD
- (tbS0, item_pos)));
-
- leaf_shift_right(tb,
- tb->rnum[0],
- n_shift);
- /* Calculate number of bytes which must remain in body after appending to R[0] */
- if ((n_rem =
- tb->insert_size[0] -
- tb->rbytes) < 0)
- n_rem = 0;
-
- {
- int version;
- unsigned long temp_rem =
- n_rem;
-
- version =
- ih_version
- (B_N_PITEM_HEAD
- (tb->R[0], 0));
- if (is_indirect_le_key
- (version,
- B_N_PKEY(tb->R[0],
- 0))) {
- temp_rem =
- n_rem <<
- (tb->tb_sb->
- s_blocksize_bits
- -
- UNFM_P_SHIFT);
- }
- set_le_key_k_offset
- (version,
- B_N_PKEY(tb->R[0],
- 0),
- le_key_k_offset
- (version,
- B_N_PKEY(tb->R[0],
- 0)) +
- temp_rem);
- set_le_key_k_offset
- (version,
- B_N_PDELIM_KEY(tb->
- CFR
- [0],
- tb->
- rkey
- [0]),
- le_key_k_offset
- (version,
- B_N_PDELIM_KEY
- (tb->CFR[0],
- tb->rkey[0])) +
- temp_rem);
- }
-/* k_offset (B_N_PKEY(tb->R[0],0)) += n_rem;
- k_offset (B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) += n_rem;*/
- do_balance_mark_internal_dirty
- (tb, tb->CFR[0], 0);
-
- /* Append part of body into R[0] */
- buffer_info_init_right(tb, &bi);
- if (n_rem > zeros_num) {
- r_zeros_number = 0;
- r_body =
- body + n_rem -
- zeros_num;
- } else {
- r_body = body;
- r_zeros_number =
- zeros_num - n_rem;
- zeros_num -=
- r_zeros_number;
- }
-
- leaf_paste_in_buffer(&bi, 0,
- n_shift,
- tb->
- insert_size
- [0] -
- n_rem,
- r_body,
- r_zeros_number);
-
- if (is_indirect_le_ih
- (B_N_PITEM_HEAD
- (tb->R[0], 0))) {
-#if 0
- RFALSE(n_rem,
- "PAP-12160: paste more than one unformatted node pointer");
-#endif
- set_ih_free_space
- (B_N_PITEM_HEAD
- (tb->R[0], 0), 0);
- }
- tb->insert_size[0] = n_rem;
- if (!n_rem)
- pos_in_item++;
- }
- } else { /* pasted item in whole falls into R[0] */
-
- struct item_head *pasted;
-
- ret_val =
- leaf_shift_right(tb, tb->rnum[0],
- tb->rbytes);
- /* append item in R[0] */
- if (pos_in_item >= 0) {
- buffer_info_init_right(tb, &bi);
- leaf_paste_in_buffer(&bi,
- item_pos -
- n +
- tb->
- rnum[0],
- pos_in_item,
- tb->
- insert_size
- [0], body,
- zeros_num);
- }
-
- /* paste new entry, if item is directory item */
- pasted =
- B_N_PITEM_HEAD(tb->R[0],
- item_pos - n +
- tb->rnum[0]);
- if (is_direntry_le_ih(pasted)
- && pos_in_item >= 0) {
- leaf_paste_entries(&bi,
- item_pos -
- n +
- tb->rnum[0],
- pos_in_item,
- 1,
- (struct
- reiserfs_de_head
- *)body,
- body +
- DEH_SIZE,
- tb->
- insert_size
- [0]
- );
- if (!pos_in_item) {
-
- RFALSE(item_pos - n +
- tb->rnum[0],
- "PAP-12165: directory item must be first item of node when pasting is in 0th position");
-
- /* update delimiting keys */
- replace_key(tb,
- tb->CFR[0],
- tb->rkey[0],
- tb->R[0],
- 0);
- }
- }
-
- if (is_indirect_le_ih(pasted))
- set_ih_free_space(pasted, 0);
- zeros_num = tb->insert_size[0] = 0;
- }
- } else { /* new item doesn't fall into R[0] */
-
- leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
- }
- break;
- default: /* cases d and t */
- reiserfs_panic(tb->tb_sb, "PAP-12175",
- "rnum > 0: unexpected mode: %s(%d)",
- (flag ==
- M_DELETE) ? "DELETE" : ((flag ==
- M_CUT) ? "CUT"
- : "UNKNOWN"),
- flag);
- }
+static void balance_leaf_paste_right(struct tree_balance *tb,
+ struct item_head *ih, const char *body)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int n = B_NR_ITEMS(tbS0);
+ /* new item doesn't fall into R[0] */
+ if (n - tb->rnum[0] > tb->item_pos) {
+ leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
+ return;
}
- /* tb->rnum[0] > 0 */
- RFALSE(tb->blknum[0] > 3,
- "PAP-12180: blknum can not be %d. It must be <= 3",
- tb->blknum[0]);
- RFALSE(tb->blknum[0] < 0,
- "PAP-12185: blknum can not be %d. It must be >= 0",
- tb->blknum[0]);
+ /* pasted item or part of it falls to R[0] */
- /* if while adding to a node we discover that it is possible to split
- it in two, and merge the left part into the left neighbor and the
- right part into the right neighbor, eliminating the node */
- if (tb->blknum[0] == 0) { /* node S[0] is empty now */
+ if (tb->item_pos == n - tb->rnum[0] && tb->rbytes != -1)
+ /* we must shift the part of the appended item */
+ balance_leaf_paste_right_shift(tb, ih, body);
+ else
+ /* pasted item in whole falls into R[0] */
+ balance_leaf_paste_right_whole(tb, ih, body);
+}
- RFALSE(!tb->lnum[0] || !tb->rnum[0],
- "PAP-12190: lnum and rnum must not be zero");
- /* if insertion was done before 0-th position in R[0], right
- delimiting key of the tb->L[0]'s and left delimiting key are
- not set correctly */
- if (tb->CFL[0]) {
- if (!tb->CFR[0])
- reiserfs_panic(tb->tb_sb, "vs-12195",
- "CFR not initialized");
- copy_key(B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]),
- B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]));
- do_balance_mark_internal_dirty(tb, tb->CFL[0], 0);
+/* shift rnum[0] items from S[0] to the right neighbor R[0] */
+static void balance_leaf_right(struct tree_balance *tb, struct item_head *ih,
+ const char *body, int flag)
+{
+ if (tb->rnum[0] <= 0)
+ return;
+
+ BUG_ON(flag != M_INSERT && flag != M_PASTE);
+
+ if (flag == M_INSERT)
+ balance_leaf_insert_right(tb, ih, body);
+ else /* M_PASTE */
+ balance_leaf_paste_right(tb, ih, body);
+}
+
+static void balance_leaf_new_nodes_insert(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body,
+ struct item_head *insert_key,
+ struct buffer_head **insert_ptr,
+ int i)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int n = B_NR_ITEMS(tbS0);
+ struct buffer_info bi;
+ int shift;
+
+ /* new item or it part don't falls into S_new[i] */
+ if (n - tb->snum[i] >= tb->item_pos) {
+ leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
+ tb->snum[i], tb->sbytes[i], tb->S_new[i]);
+ return;
+ }
+
+ /* new item or it's part falls to first new node S_new[i] */
+
+ /* part of new item falls into S_new[i] */
+ if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) {
+ int old_key_comp, old_len, r_zeroes_number;
+ const char *r_body;
+ int version;
+
+ /* Move snum[i]-1 items from S[0] to S_new[i] */
+ leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1,
+ tb->S_new[i]);
+
+ /* Remember key component and item length */
+ version = ih_version(ih);
+ old_key_comp = le_ih_k_offset(ih);
+ old_len = ih_item_len(ih);
+
+ /*
+ * Calculate key component and item length to insert
+ * into S_new[i]
+ */
+ shift = 0;
+ if (is_indirect_le_ih(ih))
+ shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
+ set_le_ih_k_offset(ih,
+ le_ih_k_offset(ih) +
+ ((old_len - tb->sbytes[i]) << shift));
+
+ put_ih_item_len(ih, tb->sbytes[i]);
+
+ /* Insert part of the item into S_new[i] before 0-th item */
+ buffer_info_init_bh(tb, &bi, tb->S_new[i]);
+
+ if ((old_len - tb->sbytes[i]) > tb->zeroes_num) {
+ r_zeroes_number = 0;
+ r_body = body + (old_len - tb->sbytes[i]) -
+ tb->zeroes_num;
+ } else {
+ r_body = body;
+ r_zeroes_number = tb->zeroes_num - (old_len -
+ tb->sbytes[i]);
+ tb->zeroes_num -= r_zeroes_number;
}
- reiserfs_invalidate_buffer(tb, tbS0);
- return 0;
+ leaf_insert_into_buf(&bi, 0, ih, r_body, r_zeroes_number);
+
+ /*
+ * Calculate key component and item length to
+ * insert into S[i]
+ */
+ set_le_ih_k_offset(ih, old_key_comp);
+ put_ih_item_len(ih, old_len - tb->sbytes[i]);
+ tb->insert_size[0] -= tb->sbytes[i];
+ } else {
+ /* whole new item falls into S_new[i] */
+
+ /*
+ * Shift snum[0] - 1 items to S_new[i]
+ * (sbytes[i] of split item)
+ */
+ leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
+ tb->snum[i] - 1, tb->sbytes[i], tb->S_new[i]);
+
+ /* Insert new item into S_new[i] */
+ buffer_info_init_bh(tb, &bi, tb->S_new[i]);
+ leaf_insert_into_buf(&bi, tb->item_pos - n + tb->snum[i] - 1,
+ ih, body, tb->zeroes_num);
+
+ tb->zeroes_num = tb->insert_size[0] = 0;
}
+}
- /* Fill new nodes that appear in place of S[0] */
+/* we append to directory item */
+static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body,
+ struct item_head *insert_key,
+ struct buffer_head **insert_ptr,
+ int i)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ struct item_head *aux_ih = item_head(tbS0, tb->item_pos);
+ int entry_count = ih_entry_count(aux_ih);
+ struct buffer_info bi;
+
+ if (entry_count - tb->sbytes[i] < tb->pos_in_item &&
+ tb->pos_in_item <= entry_count) {
+ /* new directory entry falls into S_new[i] */
+
+ RFALSE(!tb->insert_size[0],
+ "PAP-12215: insert_size is already 0");
+ RFALSE(tb->sbytes[i] - 1 >= entry_count,
+ "PAP-12220: there are no so much entries (%d), only %d",
+ tb->sbytes[i] - 1, entry_count);
+
+ /*
+ * Shift snum[i]-1 items in whole.
+ * Shift sbytes[i] directory entries
+ * from directory item number snum[i]
+ */
+ leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
+ tb->sbytes[i] - 1, tb->S_new[i]);
+
+ /*
+ * Paste given directory entry to
+ * directory item
+ */
+ buffer_info_init_bh(tb, &bi, tb->S_new[i]);
+ leaf_paste_in_buffer(&bi, 0, tb->pos_in_item - entry_count +
+ tb->sbytes[i] - 1, tb->insert_size[0],
+ body, tb->zeroes_num);
+
+ /* paste new directory entry */
+ leaf_paste_entries(&bi, 0, tb->pos_in_item - entry_count +
+ tb->sbytes[i] - 1, 1,
+ (struct reiserfs_de_head *) body,
+ body + DEH_SIZE, tb->insert_size[0]);
+
+ tb->insert_size[0] = 0;
+ tb->pos_in_item++;
+ } else {
+ /* new directory entry doesn't fall into S_new[i] */
+ leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
+ tb->sbytes[i], tb->S_new[i]);
+ }
+
+}
- /* I am told that this copying is because we need an array to enable
- the looping code. -Hans */
- snum[0] = tb->s1num, snum[1] = tb->s2num;
- sbytes[0] = tb->s1bytes;
- sbytes[1] = tb->s2bytes;
+static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body,
+ struct item_head *insert_key,
+ struct buffer_head **insert_ptr,
+ int i)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ struct item_head *aux_ih = item_head(tbS0, tb->item_pos);
+ int n_shift, n_rem, r_zeroes_number, shift;
+ const char *r_body;
+ struct item_head *tmp;
+ struct buffer_info bi;
+
+ RFALSE(ih, "PAP-12210: ih must be 0");
+
+ if (is_direntry_le_ih(aux_ih)) {
+ balance_leaf_new_nodes_paste_dirent(tb, ih, body, insert_key,
+ insert_ptr, i);
+ return;
+ }
+
+ /* regular object */
+
+
+ RFALSE(tb->pos_in_item != ih_item_len(item_head(tbS0, tb->item_pos)) ||
+ tb->insert_size[0] <= 0,
+ "PAP-12225: item too short or insert_size <= 0");
+
+ /*
+ * Calculate number of bytes which must be shifted from appended item
+ */
+ n_shift = tb->sbytes[i] - tb->insert_size[0];
+ if (n_shift < 0)
+ n_shift = 0;
+ leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i], n_shift,
+ tb->S_new[i]);
+
+ /*
+ * Calculate number of bytes which must remain in body after
+ * append to S_new[i]
+ */
+ n_rem = tb->insert_size[0] - tb->sbytes[i];
+ if (n_rem < 0)
+ n_rem = 0;
+
+ /* Append part of body into S_new[0] */
+ buffer_info_init_bh(tb, &bi, tb->S_new[i]);
+ if (n_rem > tb->zeroes_num) {
+ r_zeroes_number = 0;
+ r_body = body + n_rem - tb->zeroes_num;
+ } else {
+ r_body = body;
+ r_zeroes_number = tb->zeroes_num - n_rem;
+ tb->zeroes_num -= r_zeroes_number;
+ }
+
+ leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem,
+ r_body, r_zeroes_number);
+
+ tmp = item_head(tb->S_new[i], 0);
+ shift = 0;
+ if (is_indirect_le_ih(tmp)) {
+ set_ih_free_space(tmp, 0);
+ shift = tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT;
+ }
+ add_le_ih_k_offset(tmp, n_rem << shift);
+
+ tb->insert_size[0] = n_rem;
+ if (!n_rem)
+ tb->pos_in_item++;
+}
+
+static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body,
+ struct item_head *insert_key,
+ struct buffer_head **insert_ptr,
+ int i)
+
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int n = B_NR_ITEMS(tbS0);
+ int leaf_mi;
+ struct item_head *pasted;
+ struct buffer_info bi;
+
+#ifdef CONFIG_REISERFS_CHECK
+ struct item_head *ih_check = item_head(tbS0, tb->item_pos);
+
+ if (!is_direntry_le_ih(ih_check) &&
+ (tb->pos_in_item != ih_item_len(ih_check) ||
+ tb->insert_size[0] <= 0))
+ reiserfs_panic(tb->tb_sb,
+ "PAP-12235",
+ "pos_in_item must be equal to ih_item_len");
+#endif
+
+ leaf_mi = leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i],
+ tb->sbytes[i], tb->S_new[i]);
+
+ RFALSE(leaf_mi,
+ "PAP-12240: unexpected value returned by leaf_move_items (%d)",
+ leaf_mi);
+
+ /* paste into item */
+ buffer_info_init_bh(tb, &bi, tb->S_new[i]);
+ leaf_paste_in_buffer(&bi, tb->item_pos - n + tb->snum[i],
+ tb->pos_in_item, tb->insert_size[0],
+ body, tb->zeroes_num);
+
+ pasted = item_head(tb->S_new[i], tb->item_pos - n +
+ tb->snum[i]);
+ if (is_direntry_le_ih(pasted))
+ leaf_paste_entries(&bi, tb->item_pos - n + tb->snum[i],
+ tb->pos_in_item, 1,
+ (struct reiserfs_de_head *)body,
+ body + DEH_SIZE, tb->insert_size[0]);
+
+ /* if we paste to indirect item update ih_free_space */
+ if (is_indirect_le_ih(pasted))
+ set_ih_free_space(pasted, 0);
+
+ tb->zeroes_num = tb->insert_size[0] = 0;
+
+}
+static void balance_leaf_new_nodes_paste(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body,
+ struct item_head *insert_key,
+ struct buffer_head **insert_ptr,
+ int i)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ int n = B_NR_ITEMS(tbS0);
+
+ /* pasted item doesn't fall into S_new[i] */
+ if (n - tb->snum[i] > tb->item_pos) {
+ leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
+ tb->snum[i], tb->sbytes[i], tb->S_new[i]);
+ return;
+ }
+
+ /* pasted item or part if it falls to S_new[i] */
+
+ if (tb->item_pos == n - tb->snum[i] && tb->sbytes[i] != -1)
+ /* we must shift part of the appended item */
+ balance_leaf_new_nodes_paste_shift(tb, ih, body, insert_key,
+ insert_ptr, i);
+ else
+ /* item falls wholly into S_new[i] */
+ balance_leaf_new_nodes_paste_whole(tb, ih, body, insert_key,
+ insert_ptr, i);
+}
+
+/* Fill new nodes that appear in place of S[0] */
+static void balance_leaf_new_nodes(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body,
+ struct item_head *insert_key,
+ struct buffer_head **insert_ptr,
+ int flag)
+{
+ int i;
for (i = tb->blknum[0] - 2; i >= 0; i--) {
+ BUG_ON(flag != M_INSERT && flag != M_PASTE);
- RFALSE(!snum[i], "PAP-12200: snum[%d] == %d. Must be > 0", i,
- snum[i]);
+ RFALSE(!tb->snum[i],
+ "PAP-12200: snum[%d] == %d. Must be > 0", i,
+ tb->snum[i]);
/* here we shift from S to S_new nodes */
- S_new[i] = get_FEB(tb);
+ tb->S_new[i] = get_FEB(tb);
/* initialized block type and tree level */
- set_blkh_level(B_BLK_HEAD(S_new[i]), DISK_LEAF_NODE_LEVEL);
-
- n = B_NR_ITEMS(tbS0);
-
- switch (flag) {
- case M_INSERT: /* insert item */
-
- if (n - snum[i] < item_pos) { /* new item or it's part falls to first new node S_new[i] */
- if (item_pos == n - snum[i] + 1 && sbytes[i] != -1) { /* part of new item falls into S_new[i] */
- int old_key_comp, old_len,
- r_zeros_number;
- const char *r_body;
- int version;
-
- /* Move snum[i]-1 items from S[0] to S_new[i] */
- leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
- snum[i] - 1, -1,
- S_new[i]);
- /* Remember key component and item length */
- version = ih_version(ih);
- old_key_comp = le_ih_k_offset(ih);
- old_len = ih_item_len(ih);
-
- /* Calculate key component and item length to insert into S_new[i] */
- set_le_ih_k_offset(ih,
- le_ih_k_offset(ih) +
- ((old_len -
- sbytes[i]) <<
- (is_indirect_le_ih
- (ih) ? tb->tb_sb->
- s_blocksize_bits -
- UNFM_P_SHIFT :
- 0)));
-
- put_ih_item_len(ih, sbytes[i]);
-
- /* Insert part of the item into S_new[i] before 0-th item */
- buffer_info_init_bh(tb, &bi, S_new[i]);
-
- if ((old_len - sbytes[i]) > zeros_num) {
- r_zeros_number = 0;
- r_body =
- body + (old_len -
- sbytes[i]) -
- zeros_num;
- } else {
- r_body = body;
- r_zeros_number =
- zeros_num - (old_len -
- sbytes[i]);
- zeros_num -= r_zeros_number;
- }
-
- leaf_insert_into_buf(&bi, 0, ih, r_body,
- r_zeros_number);
-
- /* Calculate key component and item length to insert into S[i] */
- set_le_ih_k_offset(ih, old_key_comp);
- put_ih_item_len(ih,
- old_len - sbytes[i]);
- tb->insert_size[0] -= sbytes[i];
- } else { /* whole new item falls into S_new[i] */
-
- /* Shift snum[0] - 1 items to S_new[i] (sbytes[i] of split item) */
- leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
- snum[i] - 1, sbytes[i],
- S_new[i]);
-
- /* Insert new item into S_new[i] */
- buffer_info_init_bh(tb, &bi, S_new[i]);
- leaf_insert_into_buf(&bi,
- item_pos - n +
- snum[i] - 1, ih,
- body, zeros_num);
-
- zeros_num = tb->insert_size[0] = 0;
- }
- }
-
- else { /* new item or it part don't falls into S_new[i] */
+ set_blkh_level(B_BLK_HEAD(tb->S_new[i]), DISK_LEAF_NODE_LEVEL);
+
+ if (flag == M_INSERT)
+ balance_leaf_new_nodes_insert(tb, ih, body, insert_key,
+ insert_ptr, i);
+ else /* M_PASTE */
+ balance_leaf_new_nodes_paste(tb, ih, body, insert_key,
+ insert_ptr, i);
+
+ memcpy(insert_key + i, leaf_key(tb->S_new[i], 0), KEY_SIZE);
+ insert_ptr[i] = tb->S_new[i];
+
+ RFALSE(!buffer_journaled(tb->S_new[i])
+ || buffer_journal_dirty(tb->S_new[i])
+ || buffer_dirty(tb->S_new[i]),
+ "PAP-12247: S_new[%d] : (%b)",
+ i, tb->S_new[i]);
+ }
+}
- leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
- snum[i], sbytes[i], S_new[i]);
- }
- break;
+static void balance_leaf_finish_node_insert(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ struct buffer_info bi;
+ buffer_info_init_tbS0(tb, &bi);
+ leaf_insert_into_buf(&bi, tb->item_pos, ih, body, tb->zeroes_num);
- case M_PASTE: /* append item */
-
- if (n - snum[i] <= item_pos) { /* pasted item or part if it falls to S_new[i] */
- if (item_pos == n - snum[i] && sbytes[i] != -1) { /* we must shift part of the appended item */
- struct item_head *aux_ih;
-
- RFALSE(ih, "PAP-12210: ih must be 0");
-
- aux_ih = B_N_PITEM_HEAD(tbS0, item_pos);
- if (is_direntry_le_ih(aux_ih)) {
- /* we append to directory item */
-
- int entry_count;
-
- entry_count =
- ih_entry_count(aux_ih);
-
- if (entry_count - sbytes[i] <
- pos_in_item
- && pos_in_item <=
- entry_count) {
- /* new directory entry falls into S_new[i] */
-
- RFALSE(!tb->
- insert_size[0],
- "PAP-12215: insert_size is already 0");
- RFALSE(sbytes[i] - 1 >=
- entry_count,
- "PAP-12220: there are no so much entries (%d), only %d",
- sbytes[i] - 1,
- entry_count);
-
- /* Shift snum[i]-1 items in whole. Shift sbytes[i] directory entries from directory item number snum[i] */
- leaf_move_items
- (LEAF_FROM_S_TO_SNEW,
- tb, snum[i],
- sbytes[i] - 1,
- S_new[i]);
- /* Paste given directory entry to directory item */
- buffer_info_init_bh(tb, &bi, S_new[i]);
- leaf_paste_in_buffer
- (&bi, 0,
- pos_in_item -
- entry_count +
- sbytes[i] - 1,
- tb->insert_size[0],
- body, zeros_num);
- /* paste new directory entry */
- leaf_paste_entries(&bi,
- 0,
- pos_in_item
- -
- entry_count
- +
- sbytes
- [i] -
- 1, 1,
- (struct
- reiserfs_de_head
- *)
- body,
- body
- +
- DEH_SIZE,
- tb->
- insert_size
- [0]
- );
- tb->insert_size[0] = 0;
- pos_in_item++;
- } else { /* new directory entry doesn't fall into S_new[i] */
- leaf_move_items
- (LEAF_FROM_S_TO_SNEW,
- tb, snum[i],
- sbytes[i],
- S_new[i]);
- }
- } else { /* regular object */
-
- int n_shift, n_rem,
- r_zeros_number;
- const char *r_body;
-
- RFALSE(pos_in_item !=
- ih_item_len
- (B_N_PITEM_HEAD
- (tbS0, item_pos))
- || tb->insert_size[0] <=
- 0,
- "PAP-12225: item too short or insert_size <= 0");
-
- /* Calculate number of bytes which must be shifted from appended item */
- n_shift =
- sbytes[i] -
- tb->insert_size[0];
- if (n_shift < 0)
- n_shift = 0;
- leaf_move_items
- (LEAF_FROM_S_TO_SNEW, tb,
- snum[i], n_shift,
- S_new[i]);
-
- /* Calculate number of bytes which must remain in body after append to S_new[i] */
- n_rem =
- tb->insert_size[0] -
- sbytes[i];
- if (n_rem < 0)
- n_rem = 0;
- /* Append part of body into S_new[0] */
- buffer_info_init_bh(tb, &bi, S_new[i]);
- if (n_rem > zeros_num) {
- r_zeros_number = 0;
- r_body =
- body + n_rem -
- zeros_num;
- } else {
- r_body = body;
- r_zeros_number =
- zeros_num - n_rem;
- zeros_num -=
- r_zeros_number;
- }
-
- leaf_paste_in_buffer(&bi, 0,
- n_shift,
- tb->
- insert_size
- [0] -
- n_rem,
- r_body,
- r_zeros_number);
- {
- struct item_head *tmp;
-
- tmp =
- B_N_PITEM_HEAD(S_new
- [i],
- 0);
- if (is_indirect_le_ih
- (tmp)) {
- set_ih_free_space
- (tmp, 0);
- set_le_ih_k_offset
- (tmp,
- le_ih_k_offset
- (tmp) +
- (n_rem <<
- (tb->
- tb_sb->
- s_blocksize_bits
- -
- UNFM_P_SHIFT)));
- } else {
- set_le_ih_k_offset
- (tmp,
- le_ih_k_offset
- (tmp) +
- n_rem);
- }
- }
-
- tb->insert_size[0] = n_rem;
- if (!n_rem)
- pos_in_item++;
- }
- } else
- /* item falls wholly into S_new[i] */
- {
- int leaf_mi;
- struct item_head *pasted;
+ /* If we insert the first key change the delimiting key */
+ if (tb->item_pos == 0) {
+ if (tb->CFL[0]) /* can be 0 in reiserfsck */
+ replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0, 0);
-#ifdef CONFIG_REISERFS_CHECK
- struct item_head *ih_check =
- B_N_PITEM_HEAD(tbS0, item_pos);
-
- if (!is_direntry_le_ih(ih_check)
- && (pos_in_item != ih_item_len(ih_check)
- || tb->insert_size[0] <= 0))
- reiserfs_panic(tb->tb_sb,
- "PAP-12235",
- "pos_in_item "
- "must be equal "
- "to ih_item_len");
-#endif /* CONFIG_REISERFS_CHECK */
-
- leaf_mi =
- leaf_move_items(LEAF_FROM_S_TO_SNEW,
- tb, snum[i],
- sbytes[i],
- S_new[i]);
-
- RFALSE(leaf_mi,
- "PAP-12240: unexpected value returned by leaf_move_items (%d)",
- leaf_mi);
-
- /* paste into item */
- buffer_info_init_bh(tb, &bi, S_new[i]);
- leaf_paste_in_buffer(&bi,
- item_pos - n +
- snum[i],
- pos_in_item,
- tb->insert_size[0],
- body, zeros_num);
-
- pasted =
- B_N_PITEM_HEAD(S_new[i],
- item_pos - n +
- snum[i]);
- if (is_direntry_le_ih(pasted)) {
- leaf_paste_entries(&bi,
- item_pos -
- n + snum[i],
- pos_in_item,
- 1,
- (struct
- reiserfs_de_head
- *)body,
- body +
- DEH_SIZE,
- tb->
- insert_size
- [0]
- );
- }
-
- /* if we paste to indirect item update ih_free_space */
- if (is_indirect_le_ih(pasted))
- set_ih_free_space(pasted, 0);
- zeros_num = tb->insert_size[0] = 0;
- }
- }
+ }
+}
- else { /* pasted item doesn't fall into S_new[i] */
+static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ struct item_head *pasted = item_head(tbS0, tb->item_pos);
+ struct buffer_info bi;
- leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
- snum[i], sbytes[i], S_new[i]);
- }
- break;
- default: /* cases d and t */
- reiserfs_panic(tb->tb_sb, "PAP-12245",
- "blknum > 2: unexpected mode: %s(%d)",
- (flag ==
- M_DELETE) ? "DELETE" : ((flag ==
- M_CUT) ? "CUT"
- : "UNKNOWN"),
- flag);
+ if (tb->pos_in_item >= 0 && tb->pos_in_item <= ih_entry_count(pasted)) {
+ RFALSE(!tb->insert_size[0],
+ "PAP-12260: insert_size is 0 already");
+
+ /* prepare space */
+ buffer_info_init_tbS0(tb, &bi);
+ leaf_paste_in_buffer(&bi, tb->item_pos, tb->pos_in_item,
+ tb->insert_size[0], body, tb->zeroes_num);
+
+ /* paste entry */
+ leaf_paste_entries(&bi, tb->item_pos, tb->pos_in_item, 1,
+ (struct reiserfs_de_head *)body,
+ body + DEH_SIZE, tb->insert_size[0]);
+
+ if (!tb->item_pos && !tb->pos_in_item) {
+ RFALSE(!tb->CFL[0] || !tb->L[0],
+ "PAP-12270: CFL[0]/L[0] must be specified");
+ if (tb->CFL[0])
+ replace_key(tb, tb->CFL[0], tb->lkey[0],
+ tbS0, 0);
}
- memcpy(insert_key + i, B_N_PKEY(S_new[i], 0), KEY_SIZE);
- insert_ptr[i] = S_new[i];
-
- RFALSE(!buffer_journaled(S_new[i])
- || buffer_journal_dirty(S_new[i])
- || buffer_dirty(S_new[i]), "PAP-12247: S_new[%d] : (%b)",
- i, S_new[i]);
+ tb->insert_size[0] = 0;
}
+}
- /* if the affected item was not wholly shifted then we perform all necessary operations on that part or whole of the
- affected item which remains in S */
- if (0 <= item_pos && item_pos < tb->s0num) { /* if we must insert or append into buffer S[0] */
+static void balance_leaf_finish_node_paste(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+ struct buffer_info bi;
+ struct item_head *pasted = item_head(tbS0, tb->item_pos);
- switch (flag) {
- case M_INSERT: /* insert item into S[0] */
- buffer_info_init_tbS0(tb, &bi);
- leaf_insert_into_buf(&bi, item_pos, ih, body,
- zeros_num);
+ /* when directory, may be new entry already pasted */
+ if (is_direntry_le_ih(pasted)) {
+ balance_leaf_finish_node_paste_dirent(tb, ih, body);
+ return;
+ }
- /* If we insert the first key change the delimiting key */
- if (item_pos == 0) {
- if (tb->CFL[0]) /* can be 0 in reiserfsck */
- replace_key(tb, tb->CFL[0], tb->lkey[0],
- tbS0, 0);
+ /* regular object */
- }
- break;
+ if (tb->pos_in_item == ih_item_len(pasted)) {
+ RFALSE(tb->insert_size[0] <= 0,
+ "PAP-12275: insert size must not be %d",
+ tb->insert_size[0]);
+ buffer_info_init_tbS0(tb, &bi);
+ leaf_paste_in_buffer(&bi, tb->item_pos,
+ tb->pos_in_item, tb->insert_size[0], body,
+ tb->zeroes_num);
- case M_PASTE:{ /* append item in S[0] */
- struct item_head *pasted;
-
- pasted = B_N_PITEM_HEAD(tbS0, item_pos);
- /* when directory, may be new entry already pasted */
- if (is_direntry_le_ih(pasted)) {
- if (pos_in_item >= 0 &&
- pos_in_item <=
- ih_entry_count(pasted)) {
-
- RFALSE(!tb->insert_size[0],
- "PAP-12260: insert_size is 0 already");
-
- /* prepare space */
- buffer_info_init_tbS0(tb, &bi);
- leaf_paste_in_buffer(&bi,
- item_pos,
- pos_in_item,
- tb->
- insert_size
- [0], body,
- zeros_num);
-
- /* paste entry */
- leaf_paste_entries(&bi,
- item_pos,
- pos_in_item,
- 1,
- (struct
- reiserfs_de_head
- *)body,
- body +
- DEH_SIZE,
- tb->
- insert_size
- [0]
- );
- if (!item_pos && !pos_in_item) {
- RFALSE(!tb->CFL[0]
- || !tb->L[0],
- "PAP-12270: CFL[0]/L[0] must be specified");
- if (tb->CFL[0]) {
- replace_key(tb,
- tb->
- CFL
- [0],
- tb->
- lkey
- [0],
- tbS0,
- 0);
-
- }
- }
- tb->insert_size[0] = 0;
- }
- } else { /* regular object */
- if (pos_in_item == ih_item_len(pasted)) {
-
- RFALSE(tb->insert_size[0] <= 0,
- "PAP-12275: insert size must not be %d",
- tb->insert_size[0]);
- buffer_info_init_tbS0(tb, &bi);
- leaf_paste_in_buffer(&bi,
- item_pos,
- pos_in_item,
- tb->
- insert_size
- [0], body,
- zeros_num);
-
- if (is_indirect_le_ih(pasted)) {
-#if 0
- RFALSE(tb->
- insert_size[0] !=
- UNFM_P_SIZE,
- "PAP-12280: insert_size for indirect item must be %d, not %d",
- UNFM_P_SIZE,
- tb->
- insert_size[0]);
-#endif
- set_ih_free_space
- (pasted, 0);
- }
- tb->insert_size[0] = 0;
- }
+ if (is_indirect_le_ih(pasted))
+ set_ih_free_space(pasted, 0);
+
+ tb->insert_size[0] = 0;
+ }
#ifdef CONFIG_REISERFS_CHECK
- else {
- if (tb->insert_size[0]) {
- print_cur_tb("12285");
- reiserfs_panic(tb->
- tb_sb,
- "PAP-12285",
- "insert_size "
- "must be 0 "
- "(%d)",
- tb->insert_size[0]);
- }
- }
-#endif /* CONFIG_REISERFS_CHECK */
-
- }
- } /* case M_PASTE: */
+ else if (tb->insert_size[0]) {
+ print_cur_tb("12285");
+ reiserfs_panic(tb->tb_sb, "PAP-12285",
+ "insert_size must be 0 (%d)", tb->insert_size[0]);
+ }
+#endif
+}
+
+/*
+ * if the affected item was not wholly shifted then we
+ * perform all necessary operations on that part or whole
+ * of the affected item which remains in S
+ */
+static void balance_leaf_finish_node(struct tree_balance *tb,
+ struct item_head *ih,
+ const char *body, int flag)
+{
+ /* if we must insert or append into buffer S[0] */
+ if (0 <= tb->item_pos && tb->item_pos < tb->s0num) {
+ if (flag == M_INSERT)
+ balance_leaf_finish_node_insert(tb, ih, body);
+ else /* M_PASTE */
+ balance_leaf_finish_node_paste(tb, ih, body);
+ }
+}
+
+/**
+ * balance_leaf - reiserfs tree balancing algorithm
+ * @tb: tree balance state
+ * @ih: item header of inserted item (little endian)
+ * @body: body of inserted item or bytes to paste
+ * @flag: i - insert, d - delete, c - cut, p - paste (see do_balance)
+ * passed back:
+ * @insert_key: key to insert new nodes
+ * @insert_ptr: array of nodes to insert at the next level
+ *
+ * In our processing of one level we sometimes determine what must be
+ * inserted into the next higher level. This insertion consists of a
+ * key or two keys and their corresponding pointers.
+ */
+static int balance_leaf(struct tree_balance *tb, struct item_head *ih,
+ const char *body, int flag,
+ struct item_head *insert_key,
+ struct buffer_head **insert_ptr)
+{
+ struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+
+ PROC_INFO_INC(tb->tb_sb, balance_at[0]);
+
+ /* Make balance in case insert_size[0] < 0 */
+ if (tb->insert_size[0] < 0)
+ return balance_leaf_when_delete(tb, flag);
+
+ tb->item_pos = PATH_LAST_POSITION(tb->tb_path),
+ tb->pos_in_item = tb->tb_path->pos_in_item,
+ tb->zeroes_num = 0;
+ if (flag == M_INSERT && !body)
+ tb->zeroes_num = ih_item_len(ih);
+
+ /*
+ * for indirect item pos_in_item is measured in unformatted node
+ * pointers. Recalculate to bytes
+ */
+ if (flag != M_INSERT
+ && is_indirect_le_ih(item_head(tbS0, tb->item_pos)))
+ tb->pos_in_item *= UNFM_P_SIZE;
+
+ balance_leaf_left(tb, ih, body, flag);
+
+ /* tb->lnum[0] > 0 */
+ /* Calculate new item position */
+ tb->item_pos -= (tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0));
+
+ balance_leaf_right(tb, ih, body, flag);
+
+ /* tb->rnum[0] > 0 */
+ RFALSE(tb->blknum[0] > 3,
+ "PAP-12180: blknum can not be %d. It must be <= 3", tb->blknum[0]);
+ RFALSE(tb->blknum[0] < 0,
+ "PAP-12185: blknum can not be %d. It must be >= 0", tb->blknum[0]);
+
+ /*
+ * if while adding to a node we discover that it is possible to split
+ * it in two, and merge the left part into the left neighbor and the
+ * right part into the right neighbor, eliminating the node
+ */
+ if (tb->blknum[0] == 0) { /* node S[0] is empty now */
+
+ RFALSE(!tb->lnum[0] || !tb->rnum[0],
+ "PAP-12190: lnum and rnum must not be zero");
+ /*
+ * if insertion was done before 0-th position in R[0], right
+ * delimiting key of the tb->L[0]'s and left delimiting key are
+ * not set correctly
+ */
+ if (tb->CFL[0]) {
+ if (!tb->CFR[0])
+ reiserfs_panic(tb->tb_sb, "vs-12195",
+ "CFR not initialized");
+ copy_key(internal_key(tb->CFL[0], tb->lkey[0]),
+ internal_key(tb->CFR[0], tb->rkey[0]));
+ do_balance_mark_internal_dirty(tb, tb->CFL[0], 0);
}
+
+ reiserfs_invalidate_buffer(tb, tbS0);
+ return 0;
}
+
+ balance_leaf_new_nodes(tb, ih, body, insert_key, insert_ptr, flag);
+
+ balance_leaf_finish_node(tb, ih, body, flag);
+
#ifdef CONFIG_REISERFS_CHECK
if (flag == M_PASTE && tb->insert_size[0]) {
print_cur_tb("12290");
@@ -1642,9 +1454,11 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h
"PAP-12290", "insert_size is still not 0 (%d)",
tb->insert_size[0]);
}
-#endif /* CONFIG_REISERFS_CHECK */
+#endif
+
+ /* Leaf level of the tree is balanced (end of balance_leaf) */
return 0;
-} /* Leaf level of the tree is balanced (end of balance_leaf) */
+}
/* Make empty node */
void make_empty_node(struct buffer_info *bi)
@@ -1683,9 +1497,7 @@ struct buffer_head *get_FEB(struct tree_balance *tb)
return tb->used[i];
}
-/* This is now used because reiserfs_free_block has to be able to
-** schedule.
-*/
+/* This is now used because reiserfs_free_block has to be able to schedule. */
static void store_thrown(struct tree_balance *tb, struct buffer_head *bh)
{
int i;
@@ -1751,10 +1563,10 @@ void replace_key(struct tree_balance *tb, struct buffer_head *dest, int n_dest,
if (B_IS_ITEMS_LEVEL(src))
/* source buffer contains leaf node */
- memcpy(B_N_PDELIM_KEY(dest, n_dest), B_N_PITEM_HEAD(src, n_src),
+ memcpy(internal_key(dest, n_dest), item_head(src, n_src),
KEY_SIZE);
else
- memcpy(B_N_PDELIM_KEY(dest, n_dest), B_N_PDELIM_KEY(src, n_src),
+ memcpy(internal_key(dest, n_dest), internal_key(src, n_src),
KEY_SIZE);
do_balance_mark_internal_dirty(tb, dest, 0);
@@ -1840,8 +1652,10 @@ static int check_before_balancing(struct tree_balance *tb)
"mount point.");
}
- /* double check that buffers that we will modify are unlocked. (fix_nodes should already have
- prepped all of these for us). */
+ /*
+ * double check that buffers that we will modify are unlocked.
+ * (fix_nodes should already have prepped all of these for us).
+ */
if (tb->lnum[0]) {
retval |= locked_or_not_in_tree(tb, tb->L[0], "L[0]");
retval |= locked_or_not_in_tree(tb, tb->FL[0], "FL[0]");
@@ -1934,49 +1748,51 @@ static void check_internal_levels(struct tree_balance *tb)
#endif
-/* Now we have all of the buffers that must be used in balancing of
- the tree. We rely on the assumption that schedule() will not occur
- while do_balance works. ( Only interrupt handlers are acceptable.)
- We balance the tree according to the analysis made before this,
- using buffers already obtained. For SMP support it will someday be
- necessary to add ordered locking of tb. */
-
-/* Some interesting rules of balancing:
-
- we delete a maximum of two nodes per level per balancing: we never
- delete R, when we delete two of three nodes L, S, R then we move
- them into R.
-
- we only delete L if we are deleting two nodes, if we delete only
- one node we delete S
-
- if we shift leaves then we shift as much as we can: this is a
- deliberate policy of extremism in node packing which results in
- higher average utilization after repeated random balance operations
- at the cost of more memory copies and more balancing as a result of
- small insertions to full nodes.
-
- if we shift internal nodes we try to evenly balance the node
- utilization, with consequent less balancing at the cost of lower
- utilization.
-
- one could argue that the policy for directories in leaves should be
- that of internal nodes, but we will wait until another day to
- evaluate this.... It would be nice to someday measure and prove
- these assumptions as to what is optimal....
+/*
+ * Now we have all of the buffers that must be used in balancing of
+ * the tree. We rely on the assumption that schedule() will not occur
+ * while do_balance works. ( Only interrupt handlers are acceptable.)
+ * We balance the tree according to the analysis made before this,
+ * using buffers already obtained. For SMP support it will someday be
+ * necessary to add ordered locking of tb.
+ */
-*/
+/*
+ * Some interesting rules of balancing:
+ * we delete a maximum of two nodes per level per balancing: we never
+ * delete R, when we delete two of three nodes L, S, R then we move
+ * them into R.
+ *
+ * we only delete L if we are deleting two nodes, if we delete only
+ * one node we delete S
+ *
+ * if we shift leaves then we shift as much as we can: this is a
+ * deliberate policy of extremism in node packing which results in
+ * higher average utilization after repeated random balance operations
+ * at the cost of more memory copies and more balancing as a result of
+ * small insertions to full nodes.
+ *
+ * if we shift internal nodes we try to evenly balance the node
+ * utilization, with consequent less balancing at the cost of lower
+ * utilization.
+ *
+ * one could argue that the policy for directories in leaves should be
+ * that of internal nodes, but we will wait until another day to
+ * evaluate this.... It would be nice to someday measure and prove
+ * these assumptions as to what is optimal....
+ */
static inline void do_balance_starts(struct tree_balance *tb)
{
- /* use print_cur_tb() to see initial state of struct
- tree_balance */
+ /* use print_cur_tb() to see initial state of struct tree_balance */
/* store_print_tb (tb); */
/* do not delete, just comment it out */
-/* print_tb(flag, PATH_LAST_POSITION(tb->tb_path), tb->tb_path->pos_in_item, tb,
- "check");*/
+ /*
+ print_tb(flag, PATH_LAST_POSITION(tb->tb_path),
+ tb->tb_path->pos_in_item, tb, "check");
+ */
RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB");
#ifdef CONFIG_REISERFS_CHECK
REISERFS_SB(tb->tb_sb)->cur_tb = tb;
@@ -1992,9 +1808,10 @@ static inline void do_balance_completed(struct tree_balance *tb)
REISERFS_SB(tb->tb_sb)->cur_tb = NULL;
#endif
- /* reiserfs_free_block is no longer schedule safe. So, we need to
- ** put the buffers we want freed on the thrown list during do_balance,
- ** and then free them now
+ /*
+ * reiserfs_free_block is no longer schedule safe. So, we need to
+ * put the buffers we want freed on the thrown list during do_balance,
+ * and then free them now
*/
REISERFS_SB(tb->tb_sb)->s_do_balance++;
@@ -2005,36 +1822,40 @@ static inline void do_balance_completed(struct tree_balance *tb)
free_thrown(tb);
}
-void do_balance(struct tree_balance *tb, /* tree_balance structure */
- struct item_head *ih, /* item header of inserted item */
- const char *body, /* body of inserted item or bytes to paste */
- int flag)
-{ /* i - insert, d - delete
- c - cut, p - paste
-
- Cut means delete part of an item
- (includes removing an entry from a
- directory).
-
- Delete means delete whole item.
-
- Insert means add a new item into the
- tree.
-
- Paste means to append to the end of an
- existing file or to insert a directory
- entry. */
- int child_pos, /* position of a child node in its parent */
- h; /* level of the tree being processed */
- struct item_head insert_key[2]; /* in our processing of one level
- we sometimes determine what
- must be inserted into the next
- higher level. This insertion
- consists of a key or two keys
- and their corresponding
- pointers */
- struct buffer_head *insert_ptr[2]; /* inserted node-ptrs for the next
- level */
+/*
+ * do_balance - balance the tree
+ *
+ * @tb: tree_balance structure
+ * @ih: item header of inserted item
+ * @body: body of inserted item or bytes to paste
+ * @flag: 'i' - insert, 'd' - delete, 'c' - cut, 'p' paste
+ *
+ * Cut means delete part of an item (includes removing an entry from a
+ * directory).
+ *
+ * Delete means delete whole item.
+ *
+ * Insert means add a new item into the tree.
+ *
+ * Paste means to append to the end of an existing file or to
+ * insert a directory entry.
+ */
+void do_balance(struct tree_balance *tb, struct item_head *ih,
+ const char *body, int flag)
+{
+ int child_pos; /* position of a child node in its parent */
+ int h; /* level of the tree being processed */
+
+ /*
+ * in our processing of one level we sometimes determine what
+ * must be inserted into the next higher level. This insertion
+ * consists of a key or two keys and their corresponding
+ * pointers
+ */
+ struct item_head insert_key[2];
+
+ /* inserted node-ptrs for the next level */
+ struct buffer_head *insert_ptr[2];
tb->tb_mode = flag;
tb->need_balance_dirty = 0;
@@ -2051,12 +1872,14 @@ void do_balance(struct tree_balance *tb, /* tree_balance structure */
return;
}
- atomic_inc(&(fs_generation(tb->tb_sb)));
+ atomic_inc(&fs_generation(tb->tb_sb));
do_balance_starts(tb);
- /* balance leaf returns 0 except if combining L R and S into
- one node. see balance_internal() for explanation of this
- line of code. */
+ /*
+ * balance_leaf returns 0 except if combining L R and S into
+ * one node. see balance_internal() for explanation of this
+ * line of code.
+ */
child_pos = PATH_H_B_ITEM_ORDER(tb->tb_path, 0) +
balance_leaf(tb, ih, body, flag, insert_key, insert_ptr);
@@ -2066,9 +1889,8 @@ void do_balance(struct tree_balance *tb, /* tree_balance structure */
/* Balance internal level of the tree. */
for (h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++)
- child_pos =
- balance_internal(tb, h, child_pos, insert_key, insert_ptr);
+ child_pos = balance_internal(tb, h, child_pos, insert_key,
+ insert_ptr);
do_balance_completed(tb);
-
}
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index dcaafcfc23b..db9e80ba53a 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -15,20 +15,20 @@
#include <linux/quotaops.h>
/*
-** We pack the tails of files on file close, not at the time they are written.
-** This implies an unnecessary copy of the tail and an unnecessary indirect item
-** insertion/balancing, for files that are written in one write.
-** It avoids unnecessary tail packings (balances) for files that are written in
-** multiple writes and are small enough to have tails.
-**
-** file_release is called by the VFS layer when the file is closed. If
-** this is the last open file descriptor, and the file
-** small enough to have a tail, and the tail is currently in an
-** unformatted node, the tail is converted back into a direct item.
-**
-** We use reiserfs_truncate_file to pack the tail, since it already has
-** all the conditions coded.
-*/
+ * We pack the tails of files on file close, not at the time they are written.
+ * This implies an unnecessary copy of the tail and an unnecessary indirect item
+ * insertion/balancing, for files that are written in one write.
+ * It avoids unnecessary tail packings (balances) for files that are written in
+ * multiple writes and are small enough to have tails.
+ *
+ * file_release is called by the VFS layer when the file is closed. If
+ * this is the last open file descriptor, and the file
+ * small enough to have a tail, and the tail is currently in an
+ * unformatted node, the tail is converted back into a direct item.
+ *
+ * We use reiserfs_truncate_file to pack the tail, since it already has
+ * all the conditions coded.
+ */
static int reiserfs_file_release(struct inode *inode, struct file *filp)
{
@@ -41,10 +41,10 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
if (atomic_add_unless(&REISERFS_I(inode)->openers, -1, 1))
return 0;
- mutex_lock(&(REISERFS_I(inode)->tailpack));
+ mutex_lock(&REISERFS_I(inode)->tailpack);
if (!atomic_dec_and_test(&REISERFS_I(inode)->openers)) {
- mutex_unlock(&(REISERFS_I(inode)->tailpack));
+ mutex_unlock(&REISERFS_I(inode)->tailpack);
return 0;
}
@@ -52,31 +52,35 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
if ((!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
!tail_has_to_be_packed(inode)) &&
REISERFS_I(inode)->i_prealloc_count <= 0) {
- mutex_unlock(&(REISERFS_I(inode)->tailpack));
+ mutex_unlock(&REISERFS_I(inode)->tailpack);
return 0;
}
reiserfs_write_lock(inode->i_sb);
- /* freeing preallocation only involves relogging blocks that
+ /*
+ * freeing preallocation only involves relogging blocks that
* are already in the current transaction. preallocation gets
* freed at the end of each transaction, so it is impossible for
* us to log any additional blocks (including quota blocks)
*/
err = journal_begin(&th, inode->i_sb, 1);
if (err) {
- /* uh oh, we can't allow the inode to go away while there
+ /*
+ * uh oh, we can't allow the inode to go away while there
* is still preallocation blocks pending. Try to join the
* aborted transaction
*/
jbegin_failure = err;
- err = journal_join_abort(&th, inode->i_sb, 1);
+ err = journal_join_abort(&th, inode->i_sb);
if (err) {
- /* hmpf, our choices here aren't good. We can pin the inode
- * which will disallow unmount from every happening, we can
- * do nothing, which will corrupt random memory on unmount,
- * or we can forcibly remove the file from the preallocation
- * list, which will leak blocks on disk. Lets pin the inode
+ /*
+ * hmpf, our choices here aren't good. We can pin
+ * the inode which will disallow unmount from ever
+ * happening, we can do nothing, which will corrupt
+ * random memory on unmount, or we can forcibly
+ * remove the file from the preallocation list, which
+ * will leak blocks on disk. Lets pin the inode
* and let the admin know what is going on.
*/
igrab(inode);
@@ -92,7 +96,7 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
#ifdef REISERFS_PREALLOCATE
reiserfs_discard_prealloc(&th, inode);
#endif
- err = journal_end(&th, inode->i_sb, 1);
+ err = journal_end(&th);
/* copy back the error code from journal_begin */
if (!err)
@@ -102,35 +106,38 @@ static int reiserfs_file_release(struct inode *inode, struct file *filp)
(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
tail_has_to_be_packed(inode)) {
- /* if regular file is released by last holder and it has been
- appended (we append by unformatted node only) or its direct
- item(s) had to be converted, then it may have to be
- indirect2direct converted */
+ /*
+ * if regular file is released by last holder and it has been
+ * appended (we append by unformatted node only) or its direct
+ * item(s) had to be converted, then it may have to be
+ * indirect2direct converted
+ */
err = reiserfs_truncate_file(inode, 0);
}
- out:
+out:
reiserfs_write_unlock(inode->i_sb);
- mutex_unlock(&(REISERFS_I(inode)->tailpack));
+ mutex_unlock(&REISERFS_I(inode)->tailpack);
return err;
}
static int reiserfs_file_open(struct inode *inode, struct file *file)
{
int err = dquot_file_open(inode, file);
+
+ /* somebody might be tailpacking on final close; wait for it */
if (!atomic_inc_not_zero(&REISERFS_I(inode)->openers)) {
- /* somebody might be tailpacking on final close; wait for it */
- mutex_lock(&(REISERFS_I(inode)->tailpack));
+ mutex_lock(&REISERFS_I(inode)->tailpack);
atomic_inc(&REISERFS_I(inode)->openers);
- mutex_unlock(&(REISERFS_I(inode)->tailpack));
+ mutex_unlock(&REISERFS_I(inode)->tailpack);
}
return err;
}
void reiserfs_vfs_truncate_file(struct inode *inode)
{
- mutex_lock(&(REISERFS_I(inode)->tailpack));
+ mutex_lock(&REISERFS_I(inode)->tailpack);
reiserfs_truncate_file(inode, 1);
- mutex_unlock(&(REISERFS_I(inode)->tailpack));
+ mutex_unlock(&REISERFS_I(inode)->tailpack);
}
/* Sync a reiserfs file. */
@@ -205,10 +212,11 @@ int reiserfs_commit_page(struct inode *inode, struct page *page,
set_buffer_uptodate(bh);
if (logit) {
reiserfs_prepare_for_journal(s, bh, 1);
- journal_mark_dirty(&th, s, bh);
+ journal_mark_dirty(&th, bh);
} else if (!buffer_dirty(bh)) {
mark_buffer_dirty(bh);
- /* do data=ordered on any page past the end
+ /*
+ * do data=ordered on any page past the end
* of file and any buffer marked BH_New.
*/
if (reiserfs_data_ordered(inode->i_sb) &&
@@ -219,8 +227,8 @@ int reiserfs_commit_page(struct inode *inode, struct page *page,
}
}
if (logit) {
- ret = journal_end(&th, s, bh_per_page + 1);
- drop_write_lock:
+ ret = journal_end(&th);
+drop_write_lock:
reiserfs_write_unlock(s);
}
/*
@@ -235,8 +243,8 @@ int reiserfs_commit_page(struct inode *inode, struct page *page,
}
const struct file_operations reiserfs_file_operations = {
- .read = do_sync_read,
- .write = do_sync_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
.unlocked_ioctl = reiserfs_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = reiserfs_compat_ioctl,
@@ -245,10 +253,10 @@ const struct file_operations reiserfs_file_operations = {
.open = reiserfs_file_open,
.release = reiserfs_file_release,
.fsync = reiserfs_sync_file,
- .aio_read = generic_file_aio_read,
- .aio_write = generic_file_aio_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
.llseek = generic_file_llseek,
};
@@ -260,4 +268,5 @@ const struct inode_operations reiserfs_file_inode_operations = {
.removexattr = reiserfs_removexattr,
.permission = reiserfs_permission,
.get_acl = reiserfs_get_acl,
+ .set_acl = reiserfs_set_acl,
};
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index dc4d4153031..6b0ddb2a909 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -2,59 +2,32 @@
* Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
*/
-/**
- ** old_item_num
- ** old_entry_num
- ** set_entry_sizes
- ** create_virtual_node
- ** check_left
- ** check_right
- ** directory_part_size
- ** get_num_ver
- ** set_parameters
- ** is_leaf_removable
- ** are_leaves_removable
- ** get_empty_nodes
- ** get_lfree
- ** get_rfree
- ** is_left_neighbor_in_cache
- ** decrement_key
- ** get_far_parent
- ** get_parents
- ** can_node_be_removed
- ** ip_check_balance
- ** dc_check_balance_internal
- ** dc_check_balance_leaf
- ** dc_check_balance
- ** check_balance
- ** get_direct_parent
- ** get_neighbors
- ** fix_nodes
- **
- **
- **/
-
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/string.h>
#include "reiserfs.h"
#include <linux/buffer_head.h>
-/* To make any changes in the tree we find a node, that contains item
- to be changed/deleted or position in the node we insert a new item
- to. We call this node S. To do balancing we need to decide what we
- will shift to left/right neighbor, or to a new node, where new item
- will be etc. To make this analysis simpler we build virtual
- node. Virtual node is an array of items, that will replace items of
- node S. (For instance if we are going to delete an item, virtual
- node does not contain it). Virtual node keeps information about
- item sizes and types, mergeability of first and last items, sizes
- of all entries in directory item. We use this array of items when
- calculating what we can shift to neighbors and how many nodes we
- have to have if we do not any shiftings, if we shift to left/right
- neighbor or to both. */
-
-/* taking item number in virtual node, returns number of item, that it has in source buffer */
+/*
+ * To make any changes in the tree we find a node that contains item
+ * to be changed/deleted or position in the node we insert a new item
+ * to. We call this node S. To do balancing we need to decide what we
+ * will shift to left/right neighbor, or to a new node, where new item
+ * will be etc. To make this analysis simpler we build virtual
+ * node. Virtual node is an array of items, that will replace items of
+ * node S. (For instance if we are going to delete an item, virtual
+ * node does not contain it). Virtual node keeps information about
+ * item sizes and types, mergeability of first and last items, sizes
+ * of all entries in directory item. We use this array of items when
+ * calculating what we can shift to neighbors and how many nodes we
+ * have to have if we do not any shiftings, if we shift to left/right
+ * neighbor or to both.
+ */
+
+/*
+ * Takes item number in virtual node, returns number of item
+ * that it has in source buffer
+ */
static inline int old_item_num(int new_num, int affected_item_num, int mode)
{
if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
@@ -105,14 +78,17 @@ static void create_virtual_node(struct tree_balance *tb, int h)
vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item);
/* first item in the node */
- ih = B_N_PITEM_HEAD(Sh, 0);
+ ih = item_head(Sh, 0);
/* define the mergeability for 0-th item (if it is not being deleted) */
- if (op_is_left_mergeable(&(ih->ih_key), Sh->b_size)
+ if (op_is_left_mergeable(&ih->ih_key, Sh->b_size)
&& (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
- /* go through all items those remain in the virtual node (except for the new (inserted) one) */
+ /*
+ * go through all items that remain in the virtual
+ * node (except for the new (inserted) one)
+ */
for (new_num = 0; new_num < vn->vn_nr_item; new_num++) {
int j;
struct virtual_item *vi = vn->vn_vi + new_num;
@@ -128,11 +104,13 @@ static void create_virtual_node(struct tree_balance *tb, int h)
vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
vi->vi_ih = ih + j;
- vi->vi_item = B_I_PITEM(Sh, ih + j);
+ vi->vi_item = ih_item_body(Sh, ih + j);
vi->vi_uarea = vn->vn_free_ptr;
- // FIXME: there is no check, that item operation did not
- // consume too much memory
+ /*
+ * FIXME: there is no check that item operation did not
+ * consume too much memory
+ */
vn->vn_free_ptr +=
op_create_vi(vn, vi, is_affected, tb->insert_size[0]);
if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
@@ -145,7 +123,8 @@ static void create_virtual_node(struct tree_balance *tb, int h)
if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
- vi->vi_new_data = vn->vn_data; // pointer to data which is going to be pasted
+ /* pointer to data which is going to be pasted */
+ vi->vi_new_data = vn->vn_data;
}
}
@@ -164,11 +143,14 @@ static void create_virtual_node(struct tree_balance *tb, int h)
tb->insert_size[0]);
}
- /* set right merge flag we take right delimiting key and check whether it is a mergeable item */
+ /*
+ * set right merge flag we take right delimiting key and
+ * check whether it is a mergeable item
+ */
if (tb->CFR[0]) {
struct reiserfs_key *key;
- key = B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]);
+ key = internal_key(tb->CFR[0], tb->rkey[0]);
if (op_is_left_mergeable(key, Sh->b_size)
&& (vn->vn_mode != M_DELETE
|| vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1))
@@ -179,12 +161,19 @@ static void create_virtual_node(struct tree_balance *tb, int h)
if (op_is_left_mergeable(key, Sh->b_size) &&
!(vn->vn_mode != M_DELETE
|| vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) {
- /* we delete last item and it could be merged with right neighbor's first item */
+ /*
+ * we delete last item and it could be merged
+ * with right neighbor's first item
+ */
if (!
(B_NR_ITEMS(Sh) == 1
- && is_direntry_le_ih(B_N_PITEM_HEAD(Sh, 0))
- && I_ENTRY_COUNT(B_N_PITEM_HEAD(Sh, 0)) == 1)) {
- /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */
+ && is_direntry_le_ih(item_head(Sh, 0))
+ && ih_entry_count(item_head(Sh, 0)) == 1)) {
+ /*
+ * node contains more than 1 item, or item
+ * is not directory item, or this item
+ * contains more than 1 entry
+ */
print_block(Sh, 0, -1, -1);
reiserfs_panic(tb->tb_sb, "vs-8045",
"rdkey %k, affected item==%d "
@@ -198,8 +187,10 @@ static void create_virtual_node(struct tree_balance *tb, int h)
}
}
-/* using virtual node check, how many items can be shifted to left
- neighbor */
+/*
+ * Using virtual node check, how many items can be
+ * shifted to left neighbor
+ */
static void check_left(struct tree_balance *tb, int h, int cur_free)
{
int i;
@@ -259,9 +250,13 @@ static void check_left(struct tree_balance *tb, int h, int cur_free)
}
/* the item cannot be shifted entirely, try to split it */
- /* check whether L[0] can hold ih and at least one byte of the item body */
+ /*
+ * check whether L[0] can hold ih and at least one byte
+ * of the item body
+ */
+
+ /* cannot shift even a part of the current item */
if (cur_free <= ih_size) {
- /* cannot shift even a part of the current item */
tb->lbytes = -1;
return;
}
@@ -278,8 +273,10 @@ static void check_left(struct tree_balance *tb, int h, int cur_free)
return;
}
-/* using virtual node check, how many items can be shifted to right
- neighbor */
+/*
+ * Using virtual node check, how many items can be
+ * shifted to right neighbor
+ */
static void check_right(struct tree_balance *tb, int h, int cur_free)
{
int i;
@@ -338,13 +335,21 @@ static void check_right(struct tree_balance *tb, int h, int cur_free)
continue;
}
- /* check whether R[0] can hold ih and at least one byte of the item body */
- if (cur_free <= ih_size) { /* cannot shift even a part of the current item */
+ /*
+ * check whether R[0] can hold ih and at least one
+ * byte of the item body
+ */
+
+ /* cannot shift even a part of the current item */
+ if (cur_free <= ih_size) {
tb->rbytes = -1;
return;
}
- /* R[0] can hold the header of the item and at least one byte of its body */
+ /*
+ * R[0] can hold the header of the item and at least
+ * one byte of its body
+ */
cur_free -= ih_size; /* cur_free is still > 0 */
tb->rbytes = op_check_right(vi, cur_free);
@@ -361,45 +366,64 @@ static void check_right(struct tree_balance *tb, int h, int cur_free)
/*
* from - number of items, which are shifted to left neighbor entirely
* to - number of item, which are shifted to right neighbor entirely
- * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor
- * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor */
+ * from_bytes - number of bytes of boundary item (or directory entries)
+ * which are shifted to left neighbor
+ * to_bytes - number of bytes of boundary item (or directory entries)
+ * which are shifted to right neighbor
+ */
static int get_num_ver(int mode, struct tree_balance *tb, int h,
int from, int from_bytes,
int to, int to_bytes, short *snum012, int flow)
{
int i;
int cur_free;
- // int bytes;
int units;
struct virtual_node *vn = tb->tb_vn;
- // struct virtual_item * vi;
-
int total_node_size, max_node_size, current_item_size;
int needed_nodes;
- int start_item, /* position of item we start filling node from */
- end_item, /* position of item we finish filling node by */
- start_bytes, /* number of first bytes (entries for directory) of start_item-th item
- we do not include into node that is being filled */
- end_bytes; /* number of last bytes (entries for directory) of end_item-th item
- we do node include into node that is being filled */
- int split_item_positions[2]; /* these are positions in virtual item of
- items, that are split between S[0] and
- S1new and S1new and S2new */
+
+ /* position of item we start filling node from */
+ int start_item;
+
+ /* position of item we finish filling node by */
+ int end_item;
+
+ /*
+ * number of first bytes (entries for directory) of start_item-th item
+ * we do not include into node that is being filled
+ */
+ int start_bytes;
+
+ /*
+ * number of last bytes (entries for directory) of end_item-th item
+ * we do node include into node that is being filled
+ */
+ int end_bytes;
+
+ /*
+ * these are positions in virtual item of items, that are split
+ * between S[0] and S1new and S1new and S2new
+ */
+ int split_item_positions[2];
split_item_positions[0] = -1;
split_item_positions[1] = -1;
- /* We only create additional nodes if we are in insert or paste mode
- or we are in replace mode at the internal level. If h is 0 and
- the mode is M_REPLACE then in fix_nodes we change the mode to
- paste or insert before we get here in the code. */
+ /*
+ * We only create additional nodes if we are in insert or paste mode
+ * or we are in replace mode at the internal level. If h is 0 and
+ * the mode is M_REPLACE then in fix_nodes we change the mode to
+ * paste or insert before we get here in the code.
+ */
RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
"vs-8100: insert_size < 0 in overflow");
max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h));
- /* snum012 [0-2] - number of items, that lay
- to S[0], first new node and second new node */
+ /*
+ * snum012 [0-2] - number of items, that lay
+ * to S[0], first new node and second new node
+ */
snum012[3] = -1; /* s1bytes */
snum012[4] = -1; /* s2bytes */
@@ -416,20 +440,22 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
total_node_size = 0;
cur_free = max_node_size;
- // start from 'from'-th item
+ /* start from 'from'-th item */
start_item = from;
- // skip its first 'start_bytes' units
+ /* skip its first 'start_bytes' units */
start_bytes = ((from_bytes != -1) ? from_bytes : 0);
- // last included item is the 'end_item'-th one
+ /* last included item is the 'end_item'-th one */
end_item = vn->vn_nr_item - to - 1;
- // do not count last 'end_bytes' units of 'end_item'-th item
+ /* do not count last 'end_bytes' units of 'end_item'-th item */
end_bytes = (to_bytes != -1) ? to_bytes : 0;
- /* go through all item beginning from the start_item-th item and ending by
- the end_item-th item. Do not count first 'start_bytes' units of
- 'start_item'-th item and last 'end_bytes' of 'end_item'-th item */
-
+ /*
+ * go through all item beginning from the start_item-th item
+ * and ending by the end_item-th item. Do not count first
+ * 'start_bytes' units of 'start_item'-th item and last
+ * 'end_bytes' of 'end_item'-th item
+ */
for (i = start_item; i <= end_item; i++) {
struct virtual_item *vi = vn->vn_vi + i;
int skip_from_end = ((i == end_item) ? end_bytes : 0);
@@ -439,7 +465,10 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
/* get size of current item */
current_item_size = vi->vi_item_len;
- /* do not take in calculation head part (from_bytes) of from-th item */
+ /*
+ * do not take in calculation head part (from_bytes)
+ * of from-th item
+ */
current_item_size -=
op_part_size(vi, 0 /*from start */ , start_bytes);
@@ -455,9 +484,11 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
continue;
}
+ /*
+ * virtual item length is longer, than max size of item in
+ * a node. It is impossible for direct item
+ */
if (current_item_size > max_node_size) {
- /* virtual item length is longer, than max size of item in
- a node. It is impossible for direct item */
RFALSE(is_direct_le_ih(vi->vi_ih),
"vs-8110: "
"direct item length is %d. It can not be longer than %d",
@@ -466,15 +497,18 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
flow = 1;
}
+ /* as we do not split items, take new node and continue */
if (!flow) {
- /* as we do not split items, take new node and continue */
needed_nodes++;
i--;
total_node_size = 0;
continue;
}
- // calculate number of item units which fit into node being
- // filled
+
+ /*
+ * calculate number of item units which fit into node being
+ * filled
+ */
{
int free_space;
@@ -482,17 +516,17 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
units =
op_check_left(vi, free_space, start_bytes,
skip_from_end);
+ /*
+ * nothing fits into current node, take new
+ * node and continue
+ */
if (units == -1) {
- /* nothing fits into current node, take new node and continue */
needed_nodes++, i--, total_node_size = 0;
continue;
}
}
/* something fits into the current node */
- //if (snum012[3] != -1 || needed_nodes != 1)
- // reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required");
- //snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units;
start_bytes += units;
snum012[needed_nodes - 1 + 3] = units;
@@ -508,9 +542,11 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
total_node_size = 0;
}
- // sum012[4] (if it is not -1) contains number of units of which
- // are to be in S1new, snum012[3] - to be in S0. They are supposed
- // to be S1bytes and S2bytes correspondingly, so recalculate
+ /*
+ * sum012[4] (if it is not -1) contains number of units of which
+ * are to be in S1new, snum012[3] - to be in S0. They are supposed
+ * to be S1bytes and S2bytes correspondingly, so recalculate
+ */
if (snum012[4] > 0) {
int split_item_num;
int bytes_to_r, bytes_to_l;
@@ -527,7 +563,7 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
((split_item_positions[0] ==
split_item_positions[1]) ? snum012[3] : 0);
- // s2bytes
+ /* s2bytes */
snum012[4] =
op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] -
bytes_to_r - bytes_to_l - bytes_to_S1new;
@@ -555,7 +591,7 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
((split_item_positions[0] == split_item_positions[1]
&& snum012[4] != -1) ? snum012[4] : 0);
- // s1bytes
+ /* s1bytes */
snum012[3] =
op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] -
bytes_to_r - bytes_to_l - bytes_to_S2new;
@@ -565,7 +601,8 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
}
-/* Set parameters for balancing.
+/*
+ * Set parameters for balancing.
* Performs write of results of analysis of balancing into structure tb,
* where it will later be used by the functions that actually do the balancing.
* Parameters:
@@ -575,11 +612,12 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
* rnum number of items from S[h] that must be shifted to R[h];
* blk_num number of blocks that S[h] will be splitted into;
* s012 number of items that fall into splitted nodes.
- * lbytes number of bytes which flow to the left neighbor from the item that is not
- * not shifted entirely
- * rbytes number of bytes which flow to the right neighbor from the item that is not
- * not shifted entirely
- * s1bytes number of bytes which flow to the first new node when S[0] splits (this number is contained in s012 array)
+ * lbytes number of bytes which flow to the left neighbor from the
+ * item that is not not shifted entirely
+ * rbytes number of bytes which flow to the right neighbor from the
+ * item that is not not shifted entirely
+ * s1bytes number of bytes which flow to the first new node when
+ * S[0] splits (this number is contained in s012 array)
*/
static void set_parameters(struct tree_balance *tb, int h, int lnum,
@@ -590,12 +628,14 @@ static void set_parameters(struct tree_balance *tb, int h, int lnum,
tb->rnum[h] = rnum;
tb->blknum[h] = blk_num;
- if (h == 0) { /* only for leaf level */
+ /* only for leaf level */
+ if (h == 0) {
if (s012 != NULL) {
- tb->s0num = *s012++,
- tb->s1num = *s012++, tb->s2num = *s012++;
- tb->s1bytes = *s012++;
- tb->s2bytes = *s012;
+ tb->s0num = *s012++;
+ tb->snum[0] = *s012++;
+ tb->snum[1] = *s012++;
+ tb->sbytes[0] = *s012++;
+ tb->sbytes[1] = *s012;
}
tb->lbytes = lb;
tb->rbytes = rb;
@@ -607,8 +647,10 @@ static void set_parameters(struct tree_balance *tb, int h, int lnum,
PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb);
}
-/* check, does node disappear if we shift tb->lnum[0] items to left
- neighbor and tb->rnum[0] to the right one. */
+/*
+ * check if node disappears if we shift tb->lnum[0] items to left
+ * neighbor and tb->rnum[0] to the right one.
+ */
static int is_leaf_removable(struct tree_balance *tb)
{
struct virtual_node *vn = tb->tb_vn;
@@ -616,8 +658,10 @@ static int is_leaf_removable(struct tree_balance *tb)
int size;
int remain_items;
- /* number of items, that will be shifted to left (right) neighbor
- entirely */
+ /*
+ * number of items that will be shifted to left (right) neighbor
+ * entirely
+ */
to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
remain_items = vn->vn_nr_item;
@@ -625,21 +669,21 @@ static int is_leaf_removable(struct tree_balance *tb)
/* how many items remain in S[0] after shiftings to neighbors */
remain_items -= (to_left + to_right);
+ /* all content of node can be shifted to neighbors */
if (remain_items < 1) {
- /* all content of node can be shifted to neighbors */
set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0,
NULL, -1, -1);
return 1;
}
+ /* S[0] is not removable */
if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
- /* S[0] is not removable */
return 0;
- /* check, whether we can divide 1 remaining item between neighbors */
+ /* check whether we can divide 1 remaining item between neighbors */
/* get size of remaining item (in item units) */
- size = op_unit_num(&(vn->vn_vi[to_left]));
+ size = op_unit_num(&vn->vn_vi[to_left]);
if (tb->lbytes + tb->rbytes >= size) {
set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL,
@@ -675,23 +719,28 @@ static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree)
"vs-8125: item number must be 1: it is %d",
B_NR_ITEMS(S0));
- ih = B_N_PITEM_HEAD(S0, 0);
+ ih = item_head(S0, 0);
if (tb->CFR[0]
- && !comp_short_le_keys(&(ih->ih_key),
- B_N_PDELIM_KEY(tb->CFR[0],
+ && !comp_short_le_keys(&ih->ih_key,
+ internal_key(tb->CFR[0],
tb->rkey[0])))
+ /*
+ * Directory must be in correct state here: that is
+ * somewhere at the left side should exist first
+ * directory item. But the item being deleted can
+ * not be that first one because its right neighbor
+ * is item of the same directory. (But first item
+ * always gets deleted in last turn). So, neighbors
+ * of deleted item can be merged, so we can save
+ * ih_size
+ */
if (is_direntry_le_ih(ih)) {
- /* Directory must be in correct state here: that is
- somewhere at the left side should exist first directory
- item. But the item being deleted can not be that first
- one because its right neighbor is item of the same
- directory. (But first item always gets deleted in last
- turn). So, neighbors of deleted item can be merged, so
- we can save ih_size */
ih_size = IH_SIZE;
- /* we might check that left neighbor exists and is of the
- same directory */
+ /*
+ * we might check that left neighbor exists
+ * and is of the same directory
+ */
RFALSE(le_ih_k_offset(ih) == DOT_OFFSET,
"vs-8130: first directory item can not be removed until directory is not empty");
}
@@ -770,7 +819,8 @@ static void free_buffers_in_tb(struct tree_balance *tb)
}
}
-/* Get new buffers for storing new nodes that are created while balancing.
+/*
+ * Get new buffers for storing new nodes that are created while balancing.
* Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
* CARRY_ON - schedule didn't occur while the function worked;
* NO_DISK_SPACE - no disk space.
@@ -778,28 +828,33 @@ static void free_buffers_in_tb(struct tree_balance *tb)
/* The function is NOT SCHEDULE-SAFE! */
static int get_empty_nodes(struct tree_balance *tb, int h)
{
- struct buffer_head *new_bh,
- *Sh = PATH_H_PBUFFER(tb->tb_path, h);
+ struct buffer_head *new_bh, *Sh = PATH_H_PBUFFER(tb->tb_path, h);
b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
- int counter, number_of_freeblk, amount_needed, /* number of needed empty blocks */
- retval = CARRY_ON;
+ int counter, number_of_freeblk;
+ int amount_needed; /* number of needed empty blocks */
+ int retval = CARRY_ON;
struct super_block *sb = tb->tb_sb;
- /* number_of_freeblk is the number of empty blocks which have been
- acquired for use by the balancing algorithm minus the number of
- empty blocks used in the previous levels of the analysis,
- number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs
- after empty blocks are acquired, and the balancing analysis is
- then restarted, amount_needed is the number needed by this level
- (h) of the balancing analysis.
-
- Note that for systems with many processes writing, it would be
- more layout optimal to calculate the total number needed by all
- levels and then to run reiserfs_new_blocks to get all of them at once. */
-
- /* Initiate number_of_freeblk to the amount acquired prior to the restart of
- the analysis or 0 if not restarted, then subtract the amount needed
- by all of the levels of the tree below h. */
+ /*
+ * number_of_freeblk is the number of empty blocks which have been
+ * acquired for use by the balancing algorithm minus the number of
+ * empty blocks used in the previous levels of the analysis,
+ * number_of_freeblk = tb->cur_blknum can be non-zero if a schedule
+ * occurs after empty blocks are acquired, and the balancing analysis
+ * is then restarted, amount_needed is the number needed by this
+ * level (h) of the balancing analysis.
+ *
+ * Note that for systems with many processes writing, it would be
+ * more layout optimal to calculate the total number needed by all
+ * levels and then to run reiserfs_new_blocks to get all of them at
+ * once.
+ */
+
+ /*
+ * Initiate number_of_freeblk to the amount acquired prior to the
+ * restart of the analysis or 0 if not restarted, then subtract the
+ * amount needed by all of the levels of the tree below h.
+ */
/* blknum includes S[h], so we subtract 1 in this calculation */
for (counter = 0, number_of_freeblk = tb->cur_blknum;
counter < h; counter++)
@@ -810,13 +865,19 @@ static int get_empty_nodes(struct tree_balance *tb, int h)
/* Allocate missing empty blocks. */
/* if Sh == 0 then we are getting a new root */
amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1;
- /* Amount_needed = the amount that we need more than the amount that we have. */
+ /*
+ * Amount_needed = the amount that we need more than the
+ * amount that we have.
+ */
if (amount_needed > number_of_freeblk)
amount_needed -= number_of_freeblk;
- else /* If we have enough already then there is nothing to do. */
+ else /* If we have enough already then there is nothing to do. */
return CARRY_ON;
- /* No need to check quota - is not allocated for blocks used for formatted nodes */
+ /*
+ * No need to check quota - is not allocated for blocks used
+ * for formatted nodes
+ */
if (reiserfs_new_form_blocknrs(tb, blocknrs,
amount_needed) == NO_DISK_SPACE)
return NO_DISK_SPACE;
@@ -849,8 +910,10 @@ static int get_empty_nodes(struct tree_balance *tb, int h)
return retval;
}
-/* Get free space of the left neighbor, which is stored in the parent
- * node of the left neighbor. */
+/*
+ * Get free space of the left neighbor, which is stored in the parent
+ * node of the left neighbor.
+ */
static int get_lfree(struct tree_balance *tb, int h)
{
struct buffer_head *l, *f;
@@ -870,7 +933,8 @@ static int get_lfree(struct tree_balance *tb, int h)
return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
}
-/* Get free space of the right neighbor,
+/*
+ * Get free space of the right neighbor,
* which is stored in the parent node of the right neighbor.
*/
static int get_rfree(struct tree_balance *tb, int h)
@@ -916,7 +980,10 @@ static int is_left_neighbor_in_cache(struct tree_balance *tb, int h)
"vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
father, tb->FL[h]);
- /* Get position of the pointer to the left neighbor into the left father. */
+ /*
+ * Get position of the pointer to the left neighbor
+ * into the left father.
+ */
left_neighbor_position = (father == tb->FL[h]) ?
tb->lkey[h] : B_NR_ITEMS(tb->FL[h]);
/* Get left neighbor block number. */
@@ -940,17 +1007,20 @@ static int is_left_neighbor_in_cache(struct tree_balance *tb, int h)
static void decrement_key(struct cpu_key *key)
{
- // call item specific function for this key
+ /* call item specific function for this key */
item_ops[cpu_key_k_type(key)]->decrement_key(key);
}
-/* Calculate far left/right parent of the left/right neighbor of the current node, that
- * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h].
+/*
+ * Calculate far left/right parent of the left/right neighbor of the
+ * current node, that is calculate the left/right (FL[h]/FR[h]) neighbor
+ * of the parent F[h].
* Calculate left/right common parent of the current node and L[h]/R[h].
* Calculate left/right delimiting key position.
- * Returns: PATH_INCORRECT - path in the tree is not correct;
- SCHEDULE_OCCURRED - schedule occurred while the function worked;
- * CARRY_ON - schedule didn't occur while the function worked;
+ * Returns: PATH_INCORRECT - path in the tree is not correct
+ * SCHEDULE_OCCURRED - schedule occurred while the function worked
+ * CARRY_ON - schedule didn't occur while the function
+ * worked
*/
static int get_far_parent(struct tree_balance *tb,
int h,
@@ -966,8 +1036,10 @@ static int get_far_parent(struct tree_balance *tb,
first_last_position = 0,
path_offset = PATH_H_PATH_OFFSET(path, h);
- /* Starting from F[h] go upwards in the tree, and look for the common
- ancestor of F[h], and its neighbor l/r, that should be obtained. */
+ /*
+ * Starting from F[h] go upwards in the tree, and look for the common
+ * ancestor of F[h], and its neighbor l/r, that should be obtained.
+ */
counter = path_offset;
@@ -975,21 +1047,33 @@ static int get_far_parent(struct tree_balance *tb,
"PAP-8180: invalid path length");
for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) {
- /* Check whether parent of the current buffer in the path is really parent in the tree. */
+ /*
+ * Check whether parent of the current buffer in the path
+ * is really parent in the tree.
+ */
if (!B_IS_IN_TREE
(parent = PATH_OFFSET_PBUFFER(path, counter - 1)))
return REPEAT_SEARCH;
+
/* Check whether position in the parent is correct. */
if ((position =
PATH_OFFSET_POSITION(path,
counter - 1)) >
B_NR_ITEMS(parent))
return REPEAT_SEARCH;
- /* Check whether parent at the path really points to the child. */
+
+ /*
+ * Check whether parent at the path really points
+ * to the child.
+ */
if (B_N_CHILD_NUM(parent, position) !=
PATH_OFFSET_PBUFFER(path, counter)->b_blocknr)
return REPEAT_SEARCH;
- /* Return delimiting key if position in the parent is not equal to first/last one. */
+
+ /*
+ * Return delimiting key if position in the parent is not
+ * equal to first/last one.
+ */
if (c_lr_par == RIGHT_PARENTS)
first_last_position = B_NR_ITEMS(parent);
if (position != first_last_position) {
@@ -1002,7 +1086,10 @@ static int get_far_parent(struct tree_balance *tb,
/* if we are in the root of the tree, then there is no common father */
if (counter == FIRST_PATH_ELEMENT_OFFSET) {
- /* Check whether first buffer in the path is the root of the tree. */
+ /*
+ * Check whether first buffer in the path is the
+ * root of the tree.
+ */
if (PATH_OFFSET_PBUFFER
(tb->tb_path,
FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
@@ -1031,12 +1118,15 @@ static int get_far_parent(struct tree_balance *tb,
}
}
- /* So, we got common parent of the current node and its left/right neighbor.
- Now we are geting the parent of the left/right neighbor. */
+ /*
+ * So, we got common parent of the current node and its
+ * left/right neighbor. Now we are getting the parent of the
+ * left/right neighbor.
+ */
/* Form key to get parent of the left/right neighbor. */
le_key2cpu_key(&s_lr_father_key,
- B_N_PDELIM_KEY(*pcom_father,
+ internal_key(*pcom_father,
(c_lr_par ==
LEFT_PARENTS) ? (tb->lkey[h - 1] =
position -
@@ -1050,7 +1140,7 @@ static int get_far_parent(struct tree_balance *tb,
if (search_by_key
(tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
h + 1) == IO_ERROR)
- // path is released
+ /* path is released */
return IO_ERROR;
if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -1071,12 +1161,15 @@ static int get_far_parent(struct tree_balance *tb,
return CARRY_ON;
}
-/* Get parents of neighbors of node in the path(S[path_offset]) and common parents of
- * S[path_offset] and L[path_offset]/R[path_offset]: F[path_offset], FL[path_offset],
- * FR[path_offset], CFL[path_offset], CFR[path_offset].
- * Calculate numbers of left and right delimiting keys position: lkey[path_offset], rkey[path_offset].
- * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
- * CARRY_ON - schedule didn't occur while the function worked;
+/*
+ * Get parents of neighbors of node in the path(S[path_offset]) and
+ * common parents of S[path_offset] and L[path_offset]/R[path_offset]:
+ * F[path_offset], FL[path_offset], FR[path_offset], CFL[path_offset],
+ * CFR[path_offset].
+ * Calculate numbers of left and right delimiting keys position:
+ * lkey[path_offset], rkey[path_offset].
+ * Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked
+ * CARRY_ON - schedule didn't occur while the function worked
*/
static int get_parents(struct tree_balance *tb, int h)
{
@@ -1088,8 +1181,11 @@ static int get_parents(struct tree_balance *tb, int h)
/* Current node is the root of the tree or will be root of the tree */
if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
- /* The root can not have parents.
- Release nodes which previously were obtained as parents of the current node neighbors. */
+ /*
+ * The root can not have parents.
+ * Release nodes which previously were obtained as
+ * parents of the current node neighbors.
+ */
brelse(tb->FL[h]);
brelse(tb->CFL[h]);
brelse(tb->FR[h]);
@@ -1111,10 +1207,14 @@ static int get_parents(struct tree_balance *tb, int h)
get_bh(curf);
tb->lkey[h] = position - 1;
} else {
- /* Calculate current parent of L[path_offset], which is the left neighbor of the current node.
- Calculate current common parent of L[path_offset] and the current node. Note that
- CFL[path_offset] not equal FL[path_offset] and CFL[path_offset] not equal F[path_offset].
- Calculate lkey[path_offset]. */
+ /*
+ * Calculate current parent of L[path_offset], which is the
+ * left neighbor of the current node. Calculate current
+ * common parent of L[path_offset] and the current node.
+ * Note that CFL[path_offset] not equal FL[path_offset] and
+ * CFL[path_offset] not equal F[path_offset].
+ * Calculate lkey[path_offset].
+ */
if ((ret = get_far_parent(tb, h + 1, &curf,
&curcf,
LEFT_PARENTS)) != CARRY_ON)
@@ -1130,19 +1230,22 @@ static int get_parents(struct tree_balance *tb, int h)
(curcf && !B_IS_IN_TREE(curcf)),
"PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf);
-/* Get parent FR[h] of R[h]. */
+ /* Get parent FR[h] of R[h]. */
-/* Current node is the last child of F[h]. FR[h] != F[h]. */
+ /* Current node is the last child of F[h]. FR[h] != F[h]. */
if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) {
-/* Calculate current parent of R[h], which is the right neighbor of F[h].
- Calculate current common parent of R[h] and current node. Note that CFR[h]
- not equal FR[path_offset] and CFR[h] not equal F[h]. */
+ /*
+ * Calculate current parent of R[h], which is the right
+ * neighbor of F[h]. Calculate current common parent of
+ * R[h] and current node. Note that CFR[h] not equal
+ * FR[path_offset] and CFR[h] not equal F[h].
+ */
if ((ret =
get_far_parent(tb, h + 1, &curf, &curcf,
RIGHT_PARENTS)) != CARRY_ON)
return ret;
} else {
-/* Current node is not the last child of its parent F[h]. */
+ /* Current node is not the last child of its parent F[h]. */
curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
get_bh(curf);
@@ -1165,8 +1268,10 @@ static int get_parents(struct tree_balance *tb, int h)
return CARRY_ON;
}
-/* it is possible to remove node as result of shiftings to
- neighbors even when we insert or paste item. */
+/*
+ * it is possible to remove node as result of shiftings to
+ * neighbors even when we insert or paste item.
+ */
static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
struct tree_balance *tb, int h)
{
@@ -1175,21 +1280,22 @@ static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
struct item_head *ih;
struct reiserfs_key *r_key = NULL;
- ih = B_N_PITEM_HEAD(Sh, 0);
+ ih = item_head(Sh, 0);
if (tb->CFR[h])
- r_key = B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]);
+ r_key = internal_key(tb->CFR[h], tb->rkey[h]);
if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
/* shifting may merge items which might save space */
-
((!h
- && op_is_left_mergeable(&(ih->ih_key), Sh->b_size)) ? IH_SIZE : 0)
+ && op_is_left_mergeable(&ih->ih_key, Sh->b_size)) ? IH_SIZE : 0)
-
((!h && r_key
&& op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0)
+ ((h) ? KEY_SIZE : 0)) {
/* node can not be removed */
- if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */
+ if (sfree >= levbytes) {
+ /* new item fits into node S[h] without any shifting */
if (!h)
tb->s0num =
B_NR_ITEMS(Sh) +
@@ -1202,7 +1308,8 @@ static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
return !NO_BALANCING_NEEDED;
}
-/* Check whether current node S[h] is balanced when increasing its size by
+/*
+ * Check whether current node S[h] is balanced when increasing its size by
* Inserting or Pasting.
* Calculate parameters for balancing for current level h.
* Parameters:
@@ -1219,39 +1326,48 @@ static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
static int ip_check_balance(struct tree_balance *tb, int h)
{
struct virtual_node *vn = tb->tb_vn;
- int levbytes, /* Number of bytes that must be inserted into (value
- is negative if bytes are deleted) buffer which
- contains node being balanced. The mnemonic is
- that the attempted change in node space used level
- is levbytes bytes. */
- ret;
+ /*
+ * Number of bytes that must be inserted into (value is negative
+ * if bytes are deleted) buffer which contains node being balanced.
+ * The mnemonic is that the attempted change in node space used
+ * level is levbytes bytes.
+ */
+ int levbytes;
+ int ret;
int lfree, sfree, rfree /* free space in L, S and R */ ;
- /* nver is short for number of vertixes, and lnver is the number if
- we shift to the left, rnver is the number if we shift to the
- right, and lrnver is the number if we shift in both directions.
- The goal is to minimize first the number of vertixes, and second,
- the number of vertixes whose contents are changed by shifting,
- and third the number of uncached vertixes whose contents are
- changed by shifting and must be read from disk. */
+ /*
+ * nver is short for number of vertixes, and lnver is the number if
+ * we shift to the left, rnver is the number if we shift to the
+ * right, and lrnver is the number if we shift in both directions.
+ * The goal is to minimize first the number of vertixes, and second,
+ * the number of vertixes whose contents are changed by shifting,
+ * and third the number of uncached vertixes whose contents are
+ * changed by shifting and must be read from disk.
+ */
int nver, lnver, rnver, lrnver;
- /* used at leaf level only, S0 = S[0] is the node being balanced,
- sInum [ I = 0,1,2 ] is the number of items that will
- remain in node SI after balancing. S1 and S2 are new
- nodes that might be created. */
+ /*
+ * used at leaf level only, S0 = S[0] is the node being balanced,
+ * sInum [ I = 0,1,2 ] is the number of items that will
+ * remain in node SI after balancing. S1 and S2 are new
+ * nodes that might be created.
+ */
- /* we perform 8 calls to get_num_ver(). For each call we calculate five parameters.
- where 4th parameter is s1bytes and 5th - s2bytes
+ /*
+ * we perform 8 calls to get_num_ver(). For each call we
+ * calculate five parameters. where 4th parameter is s1bytes
+ * and 5th - s2bytes
+ *
+ * s0num, s1num, s2num for 8 cases
+ * 0,1 - do not shift and do not shift but bottle
+ * 2 - shift only whole item to left
+ * 3 - shift to left and bottle as much as possible
+ * 4,5 - shift to right (whole items and as much as possible
+ * 6,7 - shift to both directions (whole items and as much as possible)
*/
- short snum012[40] = { 0, }; /* s0num, s1num, s2num for 8 cases
- 0,1 - do not shift and do not shift but bottle
- 2 - shift only whole item to left
- 3 - shift to left and bottle as much as possible
- 4,5 - shift to right (whole items and as much as possible
- 6,7 - shift to both directions (whole items and as much as possible)
- */
+ short snum012[40] = { 0, };
/* Sh is the node whose balance is currently being checked */
struct buffer_head *Sh;
@@ -1265,9 +1381,10 @@ static int ip_check_balance(struct tree_balance *tb, int h)
reiserfs_panic(tb->tb_sb, "vs-8210",
"S[0] can not be 0");
switch (ret = get_empty_nodes(tb, h)) {
+ /* no balancing for higher levels needed */
case CARRY_ON:
set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
- return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */
+ return NO_BALANCING_NEEDED;
case NO_DISK_SPACE:
case REPEAT_SEARCH:
@@ -1278,7 +1395,9 @@ static int ip_check_balance(struct tree_balance *tb, int h)
}
}
- if ((ret = get_parents(tb, h)) != CARRY_ON) /* get parents of S[h] neighbors. */
+ /* get parents of S[h] neighbors. */
+ ret = get_parents(tb, h);
+ if (ret != CARRY_ON)
return ret;
sfree = B_FREE_SPACE(Sh);
@@ -1287,38 +1406,44 @@ static int ip_check_balance(struct tree_balance *tb, int h)
rfree = get_rfree(tb, h);
lfree = get_lfree(tb, h);
+ /* and new item fits into node S[h] without any shifting */
if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) ==
NO_BALANCING_NEEDED)
- /* and new item fits into node S[h] without any shifting */
return NO_BALANCING_NEEDED;
create_virtual_node(tb, h);
/*
- determine maximal number of items we can shift to the left neighbor (in tb structure)
- and the maximal number of bytes that can flow to the left neighbor
- from the left most liquid item that cannot be shifted from S[0] entirely (returned value)
+ * determine maximal number of items we can shift to the left
+ * neighbor (in tb structure) and the maximal number of bytes
+ * that can flow to the left neighbor from the left most liquid
+ * item that cannot be shifted from S[0] entirely (returned value)
*/
check_left(tb, h, lfree);
/*
- determine maximal number of items we can shift to the right neighbor (in tb structure)
- and the maximal number of bytes that can flow to the right neighbor
- from the right most liquid item that cannot be shifted from S[0] entirely (returned value)
+ * determine maximal number of items we can shift to the right
+ * neighbor (in tb structure) and the maximal number of bytes
+ * that can flow to the right neighbor from the right most liquid
+ * item that cannot be shifted from S[0] entirely (returned value)
*/
check_right(tb, h, rfree);
- /* all contents of internal node S[h] can be moved into its
- neighbors, S[h] will be removed after balancing */
+ /*
+ * all contents of internal node S[h] can be moved into its
+ * neighbors, S[h] will be removed after balancing
+ */
if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
int to_r;
- /* Since we are working on internal nodes, and our internal
- nodes have fixed size entries, then we can balance by the
- number of items rather than the space they consume. In this
- routine we set the left node equal to the right node,
- allowing a difference of less than or equal to 1 child
- pointer. */
+ /*
+ * Since we are working on internal nodes, and our internal
+ * nodes have fixed size entries, then we can balance by the
+ * number of items rather than the space they consume. In this
+ * routine we set the left node equal to the right node,
+ * allowing a difference of less than or equal to 1 child
+ * pointer.
+ */
to_r =
((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
@@ -1328,7 +1453,10 @@ static int ip_check_balance(struct tree_balance *tb, int h)
return CARRY_ON;
}
- /* this checks balance condition, that any two neighboring nodes can not fit in one node */
+ /*
+ * this checks balance condition, that any two neighboring nodes
+ * can not fit in one node
+ */
RFALSE(h &&
(tb->lnum[h] >= vn->vn_nr_item + 1 ||
tb->rnum[h] >= vn->vn_nr_item + 1),
@@ -1337,16 +1465,22 @@ static int ip_check_balance(struct tree_balance *tb, int h)
(tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))),
"vs-8225: tree is not balanced on leaf level");
- /* all contents of S[0] can be moved into its neighbors
- S[0] will be removed after balancing. */
+ /*
+ * all contents of S[0] can be moved into its neighbors
+ * S[0] will be removed after balancing.
+ */
if (!h && is_leaf_removable(tb))
return CARRY_ON;
- /* why do we perform this check here rather than earlier??
- Answer: we can win 1 node in some cases above. Moreover we
- checked it above, when we checked, that S[0] is not removable
- in principle */
- if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */
+ /*
+ * why do we perform this check here rather than earlier??
+ * Answer: we can win 1 node in some cases above. Moreover we
+ * checked it above, when we checked, that S[0] is not removable
+ * in principle
+ */
+
+ /* new item fits into node S[h] without any shifting */
+ if (sfree >= levbytes) {
if (!h)
tb->s0num = vn->vn_nr_item;
set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
@@ -1355,18 +1489,19 @@ static int ip_check_balance(struct tree_balance *tb, int h)
{
int lpar, rpar, nset, lset, rset, lrset;
- /*
- * regular overflowing of the node
- */
+ /* regular overflowing of the node */
- /* get_num_ver works in 2 modes (FLOW & NO_FLOW)
- lpar, rpar - number of items we can shift to left/right neighbor (including splitting item)
- nset, lset, rset, lrset - shows, whether flowing items give better packing
+ /*
+ * get_num_ver works in 2 modes (FLOW & NO_FLOW)
+ * lpar, rpar - number of items we can shift to left/right
+ * neighbor (including splitting item)
+ * nset, lset, rset, lrset - shows, whether flowing items
+ * give better packing
*/
#define FLOW 1
#define NO_FLOW 0 /* do not any splitting */
- /* we choose one the following */
+ /* we choose one of the following */
#define NOTHING_SHIFT_NO_FLOW 0
#define NOTHING_SHIFT_FLOW 5
#define LEFT_SHIFT_NO_FLOW 10
@@ -1379,10 +1514,13 @@ static int ip_check_balance(struct tree_balance *tb, int h)
lpar = tb->lnum[h];
rpar = tb->rnum[h];
- /* calculate number of blocks S[h] must be split into when
- nothing is shifted to the neighbors,
- as well as number of items in each part of the split node (s012 numbers),
- and number of bytes (s1bytes) of the shared drop which flow to S1 if any */
+ /*
+ * calculate number of blocks S[h] must be split into when
+ * nothing is shifted to the neighbors, as well as number of
+ * items in each part of the split node (s012 numbers),
+ * and number of bytes (s1bytes) of the shared drop which
+ * flow to S1 if any
+ */
nset = NOTHING_SHIFT_NO_FLOW;
nver = get_num_ver(vn->vn_mode, tb, h,
0, -1, h ? vn->vn_nr_item : 0, -1,
@@ -1391,7 +1529,10 @@ static int ip_check_balance(struct tree_balance *tb, int h)
if (!h) {
int nver1;
- /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */
+ /*
+ * note, that in this case we try to bottle
+ * between S[0] and S1 (S1 - the first new node)
+ */
nver1 = get_num_ver(vn->vn_mode, tb, h,
0, -1, 0, -1,
snum012 + NOTHING_SHIFT_FLOW, FLOW);
@@ -1399,11 +1540,13 @@ static int ip_check_balance(struct tree_balance *tb, int h)
nset = NOTHING_SHIFT_FLOW, nver = nver1;
}
- /* calculate number of blocks S[h] must be split into when
- l_shift_num first items and l_shift_bytes of the right most
- liquid item to be shifted are shifted to the left neighbor,
- as well as number of items in each part of the splitted node (s012 numbers),
- and number of bytes (s1bytes) of the shared drop which flow to S1 if any
+ /*
+ * calculate number of blocks S[h] must be split into when
+ * l_shift_num first items and l_shift_bytes of the right
+ * most liquid item to be shifted are shifted to the left
+ * neighbor, as well as number of items in each part of the
+ * splitted node (s012 numbers), and number of bytes
+ * (s1bytes) of the shared drop which flow to S1 if any
*/
lset = LEFT_SHIFT_NO_FLOW;
lnver = get_num_ver(vn->vn_mode, tb, h,
@@ -1422,11 +1565,13 @@ static int ip_check_balance(struct tree_balance *tb, int h)
lset = LEFT_SHIFT_FLOW, lnver = lnver1;
}
- /* calculate number of blocks S[h] must be split into when
- r_shift_num first items and r_shift_bytes of the left most
- liquid item to be shifted are shifted to the right neighbor,
- as well as number of items in each part of the splitted node (s012 numbers),
- and number of bytes (s1bytes) of the shared drop which flow to S1 if any
+ /*
+ * calculate number of blocks S[h] must be split into when
+ * r_shift_num first items and r_shift_bytes of the left most
+ * liquid item to be shifted are shifted to the right neighbor,
+ * as well as number of items in each part of the splitted
+ * node (s012 numbers), and number of bytes (s1bytes) of the
+ * shared drop which flow to S1 if any
*/
rset = RIGHT_SHIFT_NO_FLOW;
rnver = get_num_ver(vn->vn_mode, tb, h,
@@ -1451,10 +1596,12 @@ static int ip_check_balance(struct tree_balance *tb, int h)
rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
}
- /* calculate number of blocks S[h] must be split into when
- items are shifted in both directions,
- as well as number of items in each part of the splitted node (s012 numbers),
- and number of bytes (s1bytes) of the shared drop which flow to S1 if any
+ /*
+ * calculate number of blocks S[h] must be split into when
+ * items are shifted in both directions, as well as number
+ * of items in each part of the splitted node (s012 numbers),
+ * and number of bytes (s1bytes) of the shared drop which
+ * flow to S1 if any
*/
lrset = LR_SHIFT_NO_FLOW;
lrnver = get_num_ver(vn->vn_mode, tb, h,
@@ -1481,10 +1628,12 @@ static int ip_check_balance(struct tree_balance *tb, int h)
lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
}
- /* Our general shifting strategy is:
- 1) to minimized number of new nodes;
- 2) to minimized number of neighbors involved in shifting;
- 3) to minimized number of disk reads; */
+ /*
+ * Our general shifting strategy is:
+ * 1) to minimized number of new nodes;
+ * 2) to minimized number of neighbors involved in shifting;
+ * 3) to minimized number of disk reads;
+ */
/* we can win TWO or ONE nodes by shifting in both directions */
if (lrnver < lnver && lrnver < rnver) {
@@ -1508,42 +1657,59 @@ static int ip_check_balance(struct tree_balance *tb, int h)
return CARRY_ON;
}
- /* if shifting doesn't lead to better packing then don't shift */
+ /*
+ * if shifting doesn't lead to better packing
+ * then don't shift
+ */
if (nver == lrnver) {
set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1,
-1);
return CARRY_ON;
}
- /* now we know that for better packing shifting in only one
- direction either to the left or to the right is required */
+ /*
+ * now we know that for better packing shifting in only one
+ * direction either to the left or to the right is required
+ */
- /* if shifting to the left is better than shifting to the right */
+ /*
+ * if shifting to the left is better than
+ * shifting to the right
+ */
if (lnver < rnver) {
SET_PAR_SHIFT_LEFT;
return CARRY_ON;
}
- /* if shifting to the right is better than shifting to the left */
+ /*
+ * if shifting to the right is better than
+ * shifting to the left
+ */
if (lnver > rnver) {
SET_PAR_SHIFT_RIGHT;
return CARRY_ON;
}
- /* now shifting in either direction gives the same number
- of nodes and we can make use of the cached neighbors */
+ /*
+ * now shifting in either direction gives the same number
+ * of nodes and we can make use of the cached neighbors
+ */
if (is_left_neighbor_in_cache(tb, h)) {
SET_PAR_SHIFT_LEFT;
return CARRY_ON;
}
- /* shift to the right independently on whether the right neighbor in cache or not */
+ /*
+ * shift to the right independently on whether the
+ * right neighbor in cache or not
+ */
SET_PAR_SHIFT_RIGHT;
return CARRY_ON;
}
}
-/* Check whether current node S[h] is balanced when Decreasing its size by
+/*
+ * Check whether current node S[h] is balanced when Decreasing its size by
* Deleting or Cutting for INTERNAL node of S+tree.
* Calculate parameters for balancing for current level h.
* Parameters:
@@ -1563,8 +1729,10 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
{
struct virtual_node *vn = tb->tb_vn;
- /* Sh is the node whose balance is currently being checked,
- and Fh is its father. */
+ /*
+ * Sh is the node whose balance is currently being checked,
+ * and Fh is its father.
+ */
struct buffer_head *Sh, *Fh;
int maxsize, ret;
int lfree, rfree /* free space in L and R */ ;
@@ -1574,19 +1742,25 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
maxsize = MAX_CHILD_SIZE(Sh);
-/* using tb->insert_size[h], which is negative in this case, create_virtual_node calculates: */
-/* new_nr_item = number of items node would have if operation is */
-/* performed without balancing (new_nr_item); */
+ /*
+ * using tb->insert_size[h], which is negative in this case,
+ * create_virtual_node calculates:
+ * new_nr_item = number of items node would have if operation is
+ * performed without balancing (new_nr_item);
+ */
create_virtual_node(tb, h);
if (!Fh) { /* S[h] is the root. */
+ /* no balancing for higher levels needed */
if (vn->vn_nr_item > 0) {
set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
- return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */
+ return NO_BALANCING_NEEDED;
}
- /* new_nr_item == 0.
+ /*
+ * new_nr_item == 0.
* Current root will be deleted resulting in
- * decrementing the tree height. */
+ * decrementing the tree height.
+ */
set_parameters(tb, h, 0, 0, 0, NULL, -1, -1);
return CARRY_ON;
}
@@ -1602,12 +1776,18 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
check_left(tb, h, lfree);
check_right(tb, h, rfree);
- if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) { /* Balance condition for the internal node is valid.
- * In this case we balance only if it leads to better packing. */
- if (vn->vn_nr_item == MIN_NR_KEY(Sh)) { /* Here we join S[h] with one of its neighbors,
- * which is impossible with greater values of new_nr_item. */
+ /*
+ * Balance condition for the internal node is valid.
+ * In this case we balance only if it leads to better packing.
+ */
+ if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) {
+ /*
+ * Here we join S[h] with one of its neighbors,
+ * which is impossible with greater values of new_nr_item.
+ */
+ if (vn->vn_nr_item == MIN_NR_KEY(Sh)) {
+ /* All contents of S[h] can be moved to L[h]. */
if (tb->lnum[h] >= vn->vn_nr_item + 1) {
- /* All contents of S[h] can be moved to L[h]. */
int n;
int order_L;
@@ -1623,8 +1803,8 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
return CARRY_ON;
}
+ /* All contents of S[h] can be moved to R[h]. */
if (tb->rnum[h] >= vn->vn_nr_item + 1) {
- /* All contents of S[h] can be moved to R[h]. */
int n;
int order_R;
@@ -1641,8 +1821,11 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
}
}
+ /*
+ * All contents of S[h] can be moved to the neighbors
+ * (L[h] & R[h]).
+ */
if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
- /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
int to_r;
to_r =
@@ -1659,7 +1842,10 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
return NO_BALANCING_NEEDED;
}
- /* Current node contain insufficient number of items. Balancing is required. */
+ /*
+ * Current node contain insufficient number of items.
+ * Balancing is required.
+ */
/* Check whether we can merge S[h] with left neighbor. */
if (tb->lnum[h] >= vn->vn_nr_item + 1)
if (is_left_neighbor_in_cache(tb, h)
@@ -1726,7 +1912,8 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h)
return CARRY_ON;
}
-/* Check whether current node S[h] is balanced when Decreasing its size by
+/*
+ * Check whether current node S[h] is balanced when Decreasing its size by
* Deleting or Truncating for LEAF node of S+tree.
* Calculate parameters for balancing for current level h.
* Parameters:
@@ -1743,15 +1930,21 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h)
{
struct virtual_node *vn = tb->tb_vn;
- /* Number of bytes that must be deleted from
- (value is negative if bytes are deleted) buffer which
- contains node being balanced. The mnemonic is that the
- attempted change in node space used level is levbytes bytes. */
+ /*
+ * Number of bytes that must be deleted from
+ * (value is negative if bytes are deleted) buffer which
+ * contains node being balanced. The mnemonic is that the
+ * attempted change in node space used level is levbytes bytes.
+ */
int levbytes;
+
/* the maximal item size */
int maxsize, ret;
- /* S0 is the node whose balance is currently being checked,
- and F0 is its father. */
+
+ /*
+ * S0 is the node whose balance is currently being checked,
+ * and F0 is its father.
+ */
struct buffer_head *S0, *F0;
int lfree, rfree /* free space in L and R */ ;
@@ -1784,9 +1977,11 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h)
if (are_leaves_removable(tb, lfree, rfree))
return CARRY_ON;
- /* determine maximal number of items we can shift to the left/right neighbor
- and the maximal number of bytes that can flow to the left/right neighbor
- from the left/right most liquid item that cannot be shifted from S[0] entirely
+ /*
+ * determine maximal number of items we can shift to the left/right
+ * neighbor and the maximal number of bytes that can flow to the
+ * left/right neighbor from the left/right most liquid item that
+ * cannot be shifted from S[0] entirely
*/
check_left(tb, h, lfree);
check_right(tb, h, rfree);
@@ -1810,7 +2005,10 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h)
return CARRY_ON;
}
- /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */
+ /*
+ * All contents of S[0] can be moved to the neighbors (L[0] & R[0]).
+ * Set parameters and return
+ */
if (is_leaf_removable(tb))
return CARRY_ON;
@@ -1820,7 +2018,8 @@ static int dc_check_balance_leaf(struct tree_balance *tb, int h)
return NO_BALANCING_NEEDED;
}
-/* Check whether current node S[h] is balanced when Decreasing its size by
+/*
+ * Check whether current node S[h] is balanced when Decreasing its size by
* Deleting or Cutting.
* Calculate parameters for balancing for current level h.
* Parameters:
@@ -1844,15 +2043,16 @@ static int dc_check_balance(struct tree_balance *tb, int h)
return dc_check_balance_leaf(tb, h);
}
-/* Check whether current node S[h] is balanced.
+/*
+ * Check whether current node S[h] is balanced.
* Calculate parameters for balancing for current level h.
* Parameters:
*
* tb tree_balance structure:
*
- * tb is a large structure that must be read about in the header file
- * at the same time as this procedure if the reader is to successfully
- * understand this procedure
+ * tb is a large structure that must be read about in the header
+ * file at the same time as this procedure if the reader is
+ * to successfully understand this procedure
*
* h current level of the node;
* inum item number in S[h];
@@ -1882,8 +2082,8 @@ static int check_balance(int mode,
RFALSE(mode == M_INSERT && !vn->vn_ins_ih,
"vs-8255: ins_ih can not be 0 in insert mode");
+ /* Calculate balance parameters when size of node is increasing. */
if (tb->insert_size[h] > 0)
- /* Calculate balance parameters when size of node is increasing. */
return ip_check_balance(tb, h);
/* Calculate balance parameters when size of node is decreasing. */
@@ -1911,21 +2111,23 @@ static int get_direct_parent(struct tree_balance *tb, int h)
PATH_OFFSET_POSITION(path, path_offset - 1) = 0;
return CARRY_ON;
}
- return REPEAT_SEARCH; /* Root is changed and we must recalculate the path. */
+ /* Root is changed and we must recalculate the path. */
+ return REPEAT_SEARCH;
}
+ /* Parent in the path is not in the tree. */
if (!B_IS_IN_TREE
(bh = PATH_OFFSET_PBUFFER(path, path_offset - 1)))
- return REPEAT_SEARCH; /* Parent in the path is not in the tree. */
+ return REPEAT_SEARCH;
if ((position =
PATH_OFFSET_POSITION(path,
path_offset - 1)) > B_NR_ITEMS(bh))
return REPEAT_SEARCH;
+ /* Parent in the path is not parent of the current node in the tree. */
if (B_N_CHILD_NUM(bh, position) !=
PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr)
- /* Parent in the path is not parent of the current node in the tree. */
return REPEAT_SEARCH;
if (buffer_locked(bh)) {
@@ -1936,10 +2138,15 @@ static int get_direct_parent(struct tree_balance *tb, int h)
return REPEAT_SEARCH;
}
- return CARRY_ON; /* Parent in the path is unlocked and really parent of the current node. */
+ /*
+ * Parent in the path is unlocked and really parent
+ * of the current node.
+ */
+ return CARRY_ON;
}
-/* Using lnum[h] and rnum[h] we should determine what neighbors
+/*
+ * Using lnum[h] and rnum[h] we should determine what neighbors
* of S[h] we
* need in order to balance S[h], and get them if necessary.
* Returns: SCHEDULE_OCCURRED - schedule occurred while the function worked;
@@ -1997,7 +2204,7 @@ static int get_neighbors(struct tree_balance *tb, int h)
}
/* We need right neighbor to balance S[path_offset]. */
- if (tb->rnum[h]) { /* We need right neighbor to balance S[path_offset]. */
+ if (tb->rnum[h]) {
PROC_INFO_INC(sb, need_r_neighbor[h]);
bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
@@ -2053,9 +2260,11 @@ static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
(max_num_of_entries - 1) * sizeof(__u16));
}
-/* maybe we should fail balancing we are going to perform when kmalloc
- fails several times. But now it will loop until kmalloc gets
- required memory */
+/*
+ * maybe we should fail balancing we are going to perform when kmalloc
+ * fails several times. But now it will loop until kmalloc gets
+ * required memory
+ */
static int get_mem_for_virtual_node(struct tree_balance *tb)
{
int check_fs = 0;
@@ -2064,8 +2273,8 @@ static int get_mem_for_virtual_node(struct tree_balance *tb)
size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path));
+ /* we have to allocate more memory for virtual node */
if (size > tb->vn_buf_size) {
- /* we have to allocate more memory for virtual node */
if (tb->vn_buf) {
/* free memory allocated before */
kfree(tb->vn_buf);
@@ -2079,10 +2288,12 @@ static int get_mem_for_virtual_node(struct tree_balance *tb)
/* get memory for virtual item */
buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
if (!buf) {
- /* getting memory with GFP_KERNEL priority may involve
- balancing now (due to indirect_to_direct conversion on
- dcache shrinking). So, release path and collected
- resources here */
+ /*
+ * getting memory with GFP_KERNEL priority may involve
+ * balancing now (due to indirect_to_direct conversion
+ * on dcache shrinking). So, release path and collected
+ * resources here
+ */
free_buffers_in_tb(tb);
buf = kmalloc(size, GFP_NOFS);
if (!buf) {
@@ -2168,8 +2379,10 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
for (i = tb->tb_path->path_length;
!locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) {
- /* if I understand correctly, we can only be sure the last buffer
- ** in the path is in the tree --clm
+ /*
+ * if I understand correctly, we can only
+ * be sure the last buffer in the path is
+ * in the tree --clm
*/
#ifdef CONFIG_REISERFS_CHECK
if (PATH_PLAST_BUFFER(tb->tb_path) ==
@@ -2256,13 +2469,15 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
}
}
}
- /* as far as I can tell, this is not required. The FEB list seems
- ** to be full of newly allocated nodes, which will never be locked,
- ** dirty, or anything else.
- ** To be safe, I'm putting in the checks and waits in. For the moment,
- ** they are needed to keep the code in journal.c from complaining
- ** about the buffer. That code is inside CONFIG_REISERFS_CHECK as well.
- ** --clm
+
+ /*
+ * as far as I can tell, this is not required. The FEB list
+ * seems to be full of newly allocated nodes, which will
+ * never be locked, dirty, or anything else.
+ * To be safe, I'm putting in the checks and waits in.
+ * For the moment, they are needed to keep the code in
+ * journal.c from complaining about the buffer.
+ * That code is inside CONFIG_REISERFS_CHECK as well. --clm
*/
for (i = 0; !locked && i < MAX_FEB_SIZE; i++) {
if (tb->FEB[i]) {
@@ -2300,7 +2515,8 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
return CARRY_ON;
}
-/* Prepare for balancing, that is
+/*
+ * Prepare for balancing, that is
* get all necessary parents, and neighbors;
* analyze what and where should be moved;
* get sufficient number of new nodes;
@@ -2309,13 +2525,14 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
* When ported to SMP kernels, only at the last moment after all needed nodes
* are collected in cache, will the resources be locked using the usual
* textbook ordered lock acquisition algorithms. Note that ensuring that
- * this code neither write locks what it does not need to write lock nor locks out of order
- * will be a pain in the butt that could have been avoided. Grumble grumble. -Hans
+ * this code neither write locks what it does not need to write lock nor locks
+ * out of order will be a pain in the butt that could have been avoided.
+ * Grumble grumble. -Hans
*
* fix is meant in the sense of render unchanging
*
- * Latency might be improved by first gathering a list of what buffers are needed
- * and then getting as many of them in parallel as possible? -Hans
+ * Latency might be improved by first gathering a list of what buffers
+ * are needed and then getting as many of them in parallel as possible? -Hans
*
* Parameters:
* op_mode i - insert, d - delete, c - cut (truncate), p - paste (append)
@@ -2335,8 +2552,9 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path);
int pos_in_item;
- /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared
- ** during wait_tb_buffers_run
+ /*
+ * we set wait_tb_buffers_run when we have to restore any dirty
+ * bits cleared during wait_tb_buffers_run
*/
int wait_tb_buffers_run = 0;
struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
@@ -2347,14 +2565,15 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
tb->fs_gen = get_generation(tb->tb_sb);
- /* we prepare and log the super here so it will already be in the
- ** transaction when do_balance needs to change it.
- ** This way do_balance won't have to schedule when trying to prepare
- ** the super for logging
+ /*
+ * we prepare and log the super here so it will already be in the
+ * transaction when do_balance needs to change it.
+ * This way do_balance won't have to schedule when trying to prepare
+ * the super for logging
*/
reiserfs_prepare_for_journal(tb->tb_sb,
SB_BUFFER_WITH_SB(tb->tb_sb), 1);
- journal_mark_dirty(tb->transaction_handle, tb->tb_sb,
+ journal_mark_dirty(tb->transaction_handle,
SB_BUFFER_WITH_SB(tb->tb_sb));
if (FILESYSTEM_CHANGED_TB(tb))
return REPEAT_SEARCH;
@@ -2408,7 +2627,7 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
#endif
if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH)
- // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat
+ /* FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat */
return REPEAT_SEARCH;
/* Starting from the leaf level; for all levels h of the tree. */
@@ -2427,7 +2646,10 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
goto repeat;
if (h != MAX_HEIGHT - 1)
tb->insert_size[h + 1] = 0;
- /* ok, analysis and resource gathering are complete */
+ /*
+ * ok, analysis and resource gathering
+ * are complete
+ */
break;
}
goto repeat;
@@ -2437,15 +2659,19 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
if (ret != CARRY_ON)
goto repeat;
- /* No disk space, or schedule occurred and analysis may be
- * invalid and needs to be redone. */
+ /*
+ * No disk space, or schedule occurred and analysis may be
+ * invalid and needs to be redone.
+ */
ret = get_empty_nodes(tb, h);
if (ret != CARRY_ON)
goto repeat;
+ /*
+ * We have a positive insert size but no nodes exist on this
+ * level, this means that we are creating a new root.
+ */
if (!PATH_H_PBUFFER(tb->tb_path, h)) {
- /* We have a positive insert size but no nodes exist on this
- level, this means that we are creating a new root. */
RFALSE(tb->blknum[h] != 1,
"PAP-8350: creating new empty root");
@@ -2453,11 +2679,13 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
if (h < MAX_HEIGHT - 1)
tb->insert_size[h + 1] = 0;
} else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) {
+ /*
+ * The tree needs to be grown, so this node S[h]
+ * which is the root node is split into two nodes,
+ * and a new node (S[h+1]) will be created to
+ * become the root node.
+ */
if (tb->blknum[h] > 1) {
- /* The tree needs to be grown, so this node S[h]
- which is the root node is split into two nodes,
- and a new node (S[h+1]) will be created to
- become the root node. */
RFALSE(h == MAX_HEIGHT - 1,
"PAP-8355: attempt to create too high of a tree");
@@ -2487,12 +2715,14 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
goto repeat;
}
- repeat:
- // fix_nodes was unable to perform its calculation due to
- // filesystem got changed under us, lack of free disk space or i/o
- // failure. If the first is the case - the search will be
- // repeated. For now - free all resources acquired so far except
- // for the new allocated nodes
+repeat:
+ /*
+ * fix_nodes was unable to perform its calculation due to
+ * filesystem got changed under us, lack of free disk space or i/o
+ * failure. If the first is the case - the search will be
+ * repeated. For now - free all resources acquired so far except
+ * for the new allocated nodes
+ */
{
int i;
@@ -2548,8 +2778,6 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
}
-/* Anatoly will probably forgive me renaming tb to tb. I just
- wanted to make lines shorter */
void unfix_nodes(struct tree_balance *tb)
{
int i;
@@ -2578,8 +2806,10 @@ void unfix_nodes(struct tree_balance *tb)
for (i = 0; i < MAX_FEB_SIZE; i++) {
if (tb->FEB[i]) {
b_blocknr_t blocknr = tb->FEB[i]->b_blocknr;
- /* de-allocated block which was not used by balancing and
- bforget about buffer for it */
+ /*
+ * de-allocated block which was not used by
+ * balancing and bforget about buffer for it
+ */
brelse(tb->FEB[i]);
reiserfs_free_block(tb->transaction_handle, NULL,
blocknr, 0);
diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c
index 91b0cc1242a..7a26c4fe6c4 100644
--- a/fs/reiserfs/hashes.c
+++ b/fs/reiserfs/hashes.c
@@ -12,12 +12,6 @@
* Yura's function is added (04/07/2000)
*/
-//
-// keyed_hash
-// yura_hash
-// r5_hash
-//
-
#include <linux/kernel.h>
#include "reiserfs.h"
#include <asm/types.h>
@@ -56,7 +50,7 @@ u32 keyed_hash(const signed char *msg, int len)
u32 pad;
int i;
- // assert(len >= 0 && len < 256);
+ /* assert(len >= 0 && len < 256); */
pad = (u32) len | ((u32) len << 8);
pad |= pad << 16;
@@ -127,9 +121,10 @@ u32 keyed_hash(const signed char *msg, int len)
return h0 ^ h1;
}
-/* What follows in this file is copyright 2000 by Hans Reiser, and the
- * licensing of what follows is governed by reiserfs/README */
-
+/*
+ * What follows in this file is copyright 2000 by Hans Reiser, and the
+ * licensing of what follows is governed by reiserfs/README
+ */
u32 yura_hash(const signed char *msg, int len)
{
int j, pow;
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
index e1978fd895f..73231b1ebdb 100644
--- a/fs/reiserfs/ibalance.c
+++ b/fs/reiserfs/ibalance.c
@@ -12,7 +12,10 @@
int balance_internal(struct tree_balance *,
int, int, struct item_head *, struct buffer_head **);
-/* modes of internal_shift_left, internal_shift_right and internal_insert_childs */
+/*
+ * modes of internal_shift_left, internal_shift_right and
+ * internal_insert_childs
+ */
#define INTERNAL_SHIFT_FROM_S_TO_L 0
#define INTERNAL_SHIFT_FROM_R_TO_S 1
#define INTERNAL_SHIFT_FROM_L_TO_S 2
@@ -32,7 +35,9 @@ static void internal_define_dest_src_infos(int shift_mode,
memset(src_bi, 0, sizeof(struct buffer_info));
/* define dest, src, dest parent, dest position */
switch (shift_mode) {
- case INTERNAL_SHIFT_FROM_S_TO_L: /* used in internal_shift_left */
+
+ /* used in internal_shift_left */
+ case INTERNAL_SHIFT_FROM_S_TO_L:
src_bi->tb = tb;
src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
@@ -52,12 +57,14 @@ static void internal_define_dest_src_infos(int shift_mode,
dest_bi->tb = tb;
dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
- dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1); /* dest position is analog of dest->b_item_order */
+ /* dest position is analog of dest->b_item_order */
+ dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
*d_key = tb->lkey[h];
*cf = tb->CFL[h];
break;
- case INTERNAL_SHIFT_FROM_R_TO_S: /* used in internal_shift_left */
+ /* used in internal_shift_left */
+ case INTERNAL_SHIFT_FROM_R_TO_S:
src_bi->tb = tb;
src_bi->bi_bh = tb->R[h];
src_bi->bi_parent = tb->FR[h];
@@ -111,7 +118,8 @@ static void internal_define_dest_src_infos(int shift_mode,
}
}
-/* Insert count node pointers into buffer cur before position to + 1.
+/*
+ * Insert count node pointers into buffer cur before position to + 1.
* Insert count items into buffer cur before position to.
* Items and node pointers are specified by inserted and bh respectively.
*/
@@ -146,14 +154,14 @@ static void internal_insert_childs(struct buffer_info *cur_bi,
/* copy to_be_insert disk children */
for (i = 0; i < count; i++) {
- put_dc_size(&(new_dc[i]),
+ put_dc_size(&new_dc[i],
MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i]));
- put_dc_block_number(&(new_dc[i]), bh[i]->b_blocknr);
+ put_dc_block_number(&new_dc[i], bh[i]->b_blocknr);
}
memcpy(dc, new_dc, DC_SIZE * count);
/* prepare space for count items */
- ih = B_N_PDELIM_KEY(cur, ((to == -1) ? 0 : to));
+ ih = internal_key(cur, ((to == -1) ? 0 : to));
memmove(ih + count, ih,
(nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE);
@@ -190,8 +198,10 @@ static void internal_insert_childs(struct buffer_info *cur_bi,
}
-/* Delete del_num items and node pointers from buffer cur starting from *
- * the first_i'th item and first_p'th pointers respectively. */
+/*
+ * Delete del_num items and node pointers from buffer cur starting from
+ * the first_i'th item and first_p'th pointers respectively.
+ */
static void internal_delete_pointers_items(struct buffer_info *cur_bi,
int first_p,
int first_i, int del_num)
@@ -233,7 +243,7 @@ static void internal_delete_pointers_items(struct buffer_info *cur_bi,
dc = B_N_CHILD(cur, first_p);
memmove(dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE);
- key = B_N_PDELIM_KEY(cur, first_i);
+ key = internal_key(cur, first_i);
memmove(key, key + del_num,
(nr - first_i - del_num) * KEY_SIZE + (nr + 1 -
del_num) * DC_SIZE);
@@ -270,22 +280,30 @@ static void internal_delete_childs(struct buffer_info *cur_bi, int from, int n)
i_from = (from == 0) ? from : from - 1;
- /* delete n pointers starting from `from' position in CUR;
- delete n keys starting from 'i_from' position in CUR;
+ /*
+ * delete n pointers starting from `from' position in CUR;
+ * delete n keys starting from 'i_from' position in CUR;
*/
internal_delete_pointers_items(cur_bi, from, i_from, n);
}
-/* copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest
-* last_first == FIRST_TO_LAST means, that we copy first items from src to tail of dest
- * last_first == LAST_TO_FIRST means, that we copy last items from src to head of dest
+/*
+ * copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer
+ * dest
+ * last_first == FIRST_TO_LAST means that we copy first items
+ * from src to tail of dest
+ * last_first == LAST_TO_FIRST means that we copy last items
+ * from src to head of dest
*/
static void internal_copy_pointers_items(struct buffer_info *dest_bi,
struct buffer_head *src,
int last_first, int cpy_num)
{
- /* ATTENTION! Number of node pointers in DEST is equal to number of items in DEST *
- * as delimiting key have already inserted to buffer dest.*/
+ /*
+ * ATTENTION! Number of node pointers in DEST is equal to number
+ * of items in DEST as delimiting key have already inserted to
+ * buffer dest.
+ */
struct buffer_head *dest = dest_bi->bi_bh;
int nr_dest, nr_src;
int dest_order, src_order;
@@ -330,13 +348,13 @@ static void internal_copy_pointers_items(struct buffer_info *dest_bi,
memcpy(dc, B_N_CHILD(src, src_order), DC_SIZE * cpy_num);
/* prepare space for cpy_num - 1 item headers */
- key = B_N_PDELIM_KEY(dest, dest_order);
+ key = internal_key(dest, dest_order);
memmove(key + cpy_num - 1, key,
KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest +
cpy_num));
/* insert headers */
- memcpy(key, B_N_PDELIM_KEY(src, src_order), KEY_SIZE * (cpy_num - 1));
+ memcpy(key, internal_key(src, src_order), KEY_SIZE * (cpy_num - 1));
/* sizes, item number */
set_blkh_nr_item(blkh, blkh_nr_item(blkh) + (cpy_num - 1));
@@ -366,7 +384,9 @@ static void internal_copy_pointers_items(struct buffer_info *dest_bi,
}
-/* Copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest.
+/*
+ * Copy cpy_num node pointers and cpy_num - 1 items from buffer src to
+ * buffer dest.
* Delete cpy_num - del_par items and node pointers from buffer src.
* last_first == FIRST_TO_LAST means, that we copy/delete first items from src.
* last_first == LAST_TO_FIRST means, that we copy/delete last items from src.
@@ -385,8 +405,10 @@ static void internal_move_pointers_items(struct buffer_info *dest_bi,
if (last_first == FIRST_TO_LAST) { /* shift_left occurs */
first_pointer = 0;
first_item = 0;
- /* delete cpy_num - del_par pointers and keys starting for pointers with first_pointer,
- for key - with first_item */
+ /*
+ * delete cpy_num - del_par pointers and keys starting for
+ * pointers with first_pointer, for key - with first_item
+ */
internal_delete_pointers_items(src_bi, first_pointer,
first_item, cpy_num - del_par);
} else { /* shift_right occurs */
@@ -404,7 +426,9 @@ static void internal_move_pointers_items(struct buffer_info *dest_bi,
}
/* Insert n_src'th key of buffer src before n_dest'th key of buffer dest. */
-static void internal_insert_key(struct buffer_info *dest_bi, int dest_position_before, /* insert key before key with n_dest number */
+static void internal_insert_key(struct buffer_info *dest_bi,
+ /* insert key before key with n_dest number */
+ int dest_position_before,
struct buffer_head *src, int src_position)
{
struct buffer_head *dest = dest_bi->bi_bh;
@@ -429,12 +453,12 @@ static void internal_insert_key(struct buffer_info *dest_bi, int dest_position_b
nr = blkh_nr_item(blkh);
/* prepare space for inserting key */
- key = B_N_PDELIM_KEY(dest, dest_position_before);
+ key = internal_key(dest, dest_position_before);
memmove(key + 1, key,
(nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE);
/* insert key */
- memcpy(key, B_N_PDELIM_KEY(src, src_position), KEY_SIZE);
+ memcpy(key, internal_key(src, src_position), KEY_SIZE);
/* Change dirt, free space, item number fields. */
@@ -453,13 +477,19 @@ static void internal_insert_key(struct buffer_info *dest_bi, int dest_position_b
}
}
-/* Insert d_key'th (delimiting) key from buffer cfl to tail of dest.
- * Copy pointer_amount node pointers and pointer_amount - 1 items from buffer src to buffer dest.
+/*
+ * Insert d_key'th (delimiting) key from buffer cfl to tail of dest.
+ * Copy pointer_amount node pointers and pointer_amount - 1 items from
+ * buffer src to buffer dest.
* Replace d_key'th key in buffer cfl.
* Delete pointer_amount items and node pointers from buffer src.
*/
/* this can be invoked both to shift from S to L and from R to S */
-static void internal_shift_left(int mode, /* INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S */
+static void internal_shift_left(
+ /*
+ * INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S
+ */
+ int mode,
struct tree_balance *tb,
int h, int pointer_amount)
{
@@ -473,7 +503,10 @@ static void internal_shift_left(int mode, /* INTERNAL_FROM_S_TO_L | INTERNAL_FRO
/*printk("pointer_amount = %d\n",pointer_amount); */
if (pointer_amount) {
- /* insert delimiting key from common father of dest and src to node dest into position B_NR_ITEM(dest) */
+ /*
+ * insert delimiting key from common father of dest and
+ * src to node dest into position B_NR_ITEM(dest)
+ */
internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf,
d_key_position);
@@ -492,7 +525,8 @@ static void internal_shift_left(int mode, /* INTERNAL_FROM_S_TO_L | INTERNAL_FRO
}
-/* Insert delimiting key to L[h].
+/*
+ * Insert delimiting key to L[h].
* Copy n node pointers and n - 1 items from buffer S[h] to L[h].
* Delete n - 1 items and node pointers from buffer S[h].
*/
@@ -507,23 +541,27 @@ static void internal_shift1_left(struct tree_balance *tb,
internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
&dest_bi, &src_bi, &d_key_position, &cf);
- if (pointer_amount > 0) /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */
+ /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */
+ if (pointer_amount > 0)
internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf,
d_key_position);
- /* internal_insert_key (tb->L[h], B_NR_ITEM(tb->L[h]), tb->CFL[h], tb->lkey[h]); */
/* last parameter is del_parameter */
internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST,
pointer_amount, 1);
- /* internal_move_pointers_items (tb->L[h], tb->S[h], FIRST_TO_LAST, pointer_amount, 1); */
}
-/* Insert d_key'th (delimiting) key from buffer cfr to head of dest.
+/*
+ * Insert d_key'th (delimiting) key from buffer cfr to head of dest.
* Copy n node pointers and n - 1 items from buffer src to buffer dest.
* Replace d_key'th key in buffer cfr.
* Delete n items and node pointers from buffer src.
*/
-static void internal_shift_right(int mode, /* INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S */
+static void internal_shift_right(
+ /*
+ * INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S
+ */
+ int mode,
struct tree_balance *tb,
int h, int pointer_amount)
{
@@ -538,7 +576,10 @@ static void internal_shift_right(int mode, /* INTERNAL_FROM_S_TO_R | INTERNAL_FR
nr = B_NR_ITEMS(src_bi.bi_bh);
if (pointer_amount > 0) {
- /* insert delimiting key from common father of dest and src to dest node into position 0 */
+ /*
+ * insert delimiting key from common father of dest
+ * and src to dest node into position 0
+ */
internal_insert_key(&dest_bi, 0, cf, d_key_position);
if (nr == pointer_amount - 1) {
RFALSE(src_bi.bi_bh != PATH_H_PBUFFER(tb->tb_path, h) /*tb->S[h] */ ||
@@ -559,7 +600,8 @@ static void internal_shift_right(int mode, /* INTERNAL_FROM_S_TO_R | INTERNAL_FR
pointer_amount, 0);
}
-/* Insert delimiting key to R[h].
+/*
+ * Insert delimiting key to R[h].
* Copy n node pointers and n - 1 items from buffer S[h] to R[h].
* Delete n - 1 items and node pointers from buffer S[h].
*/
@@ -574,18 +616,19 @@ static void internal_shift1_right(struct tree_balance *tb,
internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
&dest_bi, &src_bi, &d_key_position, &cf);
- if (pointer_amount > 0) /* insert rkey from CFR[h] to right neighbor R[h] */
+ /* insert rkey from CFR[h] to right neighbor R[h] */
+ if (pointer_amount > 0)
internal_insert_key(&dest_bi, 0, cf, d_key_position);
- /* internal_insert_key (tb->R[h], 0, tb->CFR[h], tb->rkey[h]); */
/* last parameter is del_parameter */
internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST,
pointer_amount, 1);
- /* internal_move_pointers_items (tb->R[h], tb->S[h], LAST_TO_FIRST, pointer_amount, 1); */
}
-/* Delete insert_num node pointers together with their left items
- * and balance current node.*/
+/*
+ * Delete insert_num node pointers together with their left items
+ * and balance current node.
+ */
static void balance_internal_when_delete(struct tree_balance *tb,
int h, int child_pos)
{
@@ -626,9 +669,11 @@ static void balance_internal_when_delete(struct tree_balance *tb,
new_root = tb->R[h - 1];
else
new_root = tb->L[h - 1];
- /* switch super block's tree root block number to the new value */
+ /*
+ * switch super block's tree root block
+ * number to the new value */
PUT_SB_ROOT_BLOCK(tb->tb_sb, new_root->b_blocknr);
- //REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --;
+ /*REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --; */
PUT_SB_TREE_HEIGHT(tb->tb_sb,
SB_TREE_HEIGHT(tb->tb_sb) - 1);
@@ -636,8 +681,8 @@ static void balance_internal_when_delete(struct tree_balance *tb,
REISERFS_SB(tb->tb_sb)->s_sbh,
1);
/*&&&&&&&&&&&&&&&&&&&&&& */
+ /* use check_internal if new root is an internal node */
if (h > 1)
- /* use check_internal if new root is an internal node */
check_internal(new_root);
/*&&&&&&&&&&&&&&&&&&&&&& */
@@ -648,7 +693,8 @@ static void balance_internal_when_delete(struct tree_balance *tb,
return;
}
- if (tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1) { /* join S[h] with L[h] */
+ /* join S[h] with L[h] */
+ if (tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1) {
RFALSE(tb->rnum[h] != 0,
"invalid tb->rnum[%d]==%d when joining S[h] with L[h]",
@@ -660,7 +706,8 @@ static void balance_internal_when_delete(struct tree_balance *tb,
return;
}
- if (tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1) { /* join S[h] with R[h] */
+ /* join S[h] with R[h] */
+ if (tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1) {
RFALSE(tb->lnum[h] != 0,
"invalid tb->lnum[%d]==%d when joining S[h] with R[h]",
h, tb->lnum[h]);
@@ -671,17 +718,18 @@ static void balance_internal_when_delete(struct tree_balance *tb,
return;
}
- if (tb->lnum[h] < 0) { /* borrow from left neighbor L[h] */
+ /* borrow from left neighbor L[h] */
+ if (tb->lnum[h] < 0) {
RFALSE(tb->rnum[h] != 0,
"wrong tb->rnum[%d]==%d when borrow from L[h]", h,
tb->rnum[h]);
- /*internal_shift_right (tb, h, tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], -tb->lnum[h]); */
internal_shift_right(INTERNAL_SHIFT_FROM_L_TO_S, tb, h,
-tb->lnum[h]);
return;
}
- if (tb->rnum[h] < 0) { /* borrow from right neighbor R[h] */
+ /* borrow from right neighbor R[h] */
+ if (tb->rnum[h] < 0) {
RFALSE(tb->lnum[h] != 0,
"invalid tb->lnum[%d]==%d when borrow from R[h]",
h, tb->lnum[h]);
@@ -689,7 +737,8 @@ static void balance_internal_when_delete(struct tree_balance *tb,
return;
}
- if (tb->lnum[h] > 0) { /* split S[h] into two parts and put them into neighbors */
+ /* split S[h] into two parts and put them into neighbors */
+ if (tb->lnum[h] > 0) {
RFALSE(tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1,
"invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them",
h, tb->lnum[h], h, tb->rnum[h], n);
@@ -717,7 +766,7 @@ static void replace_lkey(struct tree_balance *tb, int h, struct item_head *key)
if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0)
return;
- memcpy(B_N_PDELIM_KEY(tb->CFL[h], tb->lkey[h]), key, KEY_SIZE);
+ memcpy(internal_key(tb->CFL[h], tb->lkey[h]), key, KEY_SIZE);
do_balance_mark_internal_dirty(tb, tb->CFL[h], 0);
}
@@ -732,34 +781,41 @@ static void replace_rkey(struct tree_balance *tb, int h, struct item_head *key)
"R[h] can not be empty if it exists (item number=%d)",
B_NR_ITEMS(tb->R[h]));
- memcpy(B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]), key, KEY_SIZE);
+ memcpy(internal_key(tb->CFR[h], tb->rkey[h]), key, KEY_SIZE);
do_balance_mark_internal_dirty(tb, tb->CFR[h], 0);
}
-int balance_internal(struct tree_balance *tb, /* tree_balance structure */
- int h, /* level of the tree */
- int child_pos, struct item_head *insert_key, /* key for insertion on higher level */
- struct buffer_head **insert_ptr /* node for insertion on higher level */
- )
- /* if inserting/pasting
- {
- child_pos is the position of the node-pointer in S[h] that *
- pointed to S[h-1] before balancing of the h-1 level; *
- this means that new pointers and items must be inserted AFTER *
- child_pos
- }
- else
- {
- it is the position of the leftmost pointer that must be deleted (together with
- its corresponding key to the left of the pointer)
- as a result of the previous level's balancing.
- }
- */
+
+/*
+ * if inserting/pasting {
+ * child_pos is the position of the node-pointer in S[h] that
+ * pointed to S[h-1] before balancing of the h-1 level;
+ * this means that new pointers and items must be inserted AFTER
+ * child_pos
+ * } else {
+ * it is the position of the leftmost pointer that must be deleted
+ * (together with its corresponding key to the left of the pointer)
+ * as a result of the previous level's balancing.
+ * }
+ */
+
+int balance_internal(struct tree_balance *tb,
+ int h, /* level of the tree */
+ int child_pos,
+ /* key for insertion on higher level */
+ struct item_head *insert_key,
+ /* node for insertion on higher level */
+ struct buffer_head **insert_ptr)
{
struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h);
struct buffer_info bi;
- int order; /* we return this: it is 0 if there is no S[h], else it is tb->S[h]->b_item_order */
+
+ /*
+ * we return this: it is 0 if there is no S[h],
+ * else it is tb->S[h]->b_item_order
+ */
+ int order;
int insert_num, n, k;
struct buffer_head *S_new;
struct item_head new_insert_key;
@@ -774,8 +830,10 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
(tbSh) ? PATH_H_POSITION(tb->tb_path,
h + 1) /*tb->S[h]->b_item_order */ : 0;
- /* Using insert_size[h] calculate the number insert_num of items
- that must be inserted to or deleted from S[h]. */
+ /*
+ * Using insert_size[h] calculate the number insert_num of items
+ * that must be inserted to or deleted from S[h].
+ */
insert_num = tb->insert_size[h] / ((int)(KEY_SIZE + DC_SIZE));
/* Check whether insert_num is proper * */
@@ -794,23 +852,21 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
k = 0;
if (tb->lnum[h] > 0) {
- /* shift lnum[h] items from S[h] to the left neighbor L[h].
- check how many of new items fall into L[h] or CFL[h] after
- shifting */
+ /*
+ * shift lnum[h] items from S[h] to the left neighbor L[h].
+ * check how many of new items fall into L[h] or CFL[h] after
+ * shifting
+ */
n = B_NR_ITEMS(tb->L[h]); /* number of items in L[h] */
if (tb->lnum[h] <= child_pos) {
/* new items don't fall into L[h] or CFL[h] */
internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
tb->lnum[h]);
- /*internal_shift_left (tb->L[h],tb->CFL[h],tb->lkey[h],tbSh,tb->lnum[h]); */
child_pos -= tb->lnum[h];
} else if (tb->lnum[h] > child_pos + insert_num) {
/* all new items fall into L[h] */
internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
tb->lnum[h] - insert_num);
- /* internal_shift_left(tb->L[h],tb->CFL[h],tb->lkey[h],tbSh,
- tb->lnum[h]-insert_num);
- */
/* insert insert_num keys and node-pointers into L[h] */
bi.tb = tb;
bi.bi_bh = tb->L[h];
@@ -826,7 +882,10 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
} else {
struct disk_child *dc;
- /* some items fall into L[h] or CFL[h], but some don't fall */
+ /*
+ * some items fall into L[h] or CFL[h],
+ * but some don't fall
+ */
internal_shift1_left(tb, h, child_pos + 1);
/* calculate number of new items that fall into L[h] */
k = tb->lnum[h] - child_pos - 1;
@@ -841,7 +900,10 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
replace_lkey(tb, h, insert_key + k);
- /* replace the first node-ptr in S[h] by node-ptr to insert_ptr[k] */
+ /*
+ * replace the first node-ptr in S[h] by
+ * node-ptr to insert_ptr[k]
+ */
dc = B_N_CHILD(tbSh, 0);
put_dc_size(dc,
MAX_CHILD_SIZE(insert_ptr[k]) -
@@ -860,17 +922,17 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
/* tb->lnum[h] > 0 */
if (tb->rnum[h] > 0) {
/*shift rnum[h] items from S[h] to the right neighbor R[h] */
- /* check how many of new items fall into R or CFR after shifting */
+ /*
+ * check how many of new items fall into R or CFR
+ * after shifting
+ */
n = B_NR_ITEMS(tbSh); /* number of items in S[h] */
if (n - tb->rnum[h] >= child_pos)
/* new items fall into S[h] */
- /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h],tb->rnum[h]); */
internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
tb->rnum[h]);
else if (n + insert_num - tb->rnum[h] < child_pos) {
/* all new items fall into R[h] */
- /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h],
- tb->rnum[h] - insert_num); */
internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
tb->rnum[h] - insert_num);
@@ -904,7 +966,10 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
replace_rkey(tb, h, insert_key + insert_num - k - 1);
- /* replace the first node-ptr in R[h] by node-ptr insert_ptr[insert_num-k-1] */
+ /*
+ * replace the first node-ptr in R[h] by
+ * node-ptr insert_ptr[insert_num-k-1]
+ */
dc = B_N_CHILD(tb->R[h], 0);
put_dc_size(dc,
MAX_CHILD_SIZE(insert_ptr
@@ -921,7 +986,7 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
}
}
- /** Fill new node that appears instead of S[h] **/
+ /** Fill new node that appears instead of S[h] **/
RFALSE(tb->blknum[h] > 2, "blknum can not be > 2 for internal level");
RFALSE(tb->blknum[h] < 0, "blknum can not be < 0");
@@ -997,26 +1062,30 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
/* new items don't fall into S_new */
/* store the delimiting key for the next level */
/* new_insert_key = (n - snum)'th key in S[h] */
- memcpy(&new_insert_key, B_N_PDELIM_KEY(tbSh, n - snum),
+ memcpy(&new_insert_key, internal_key(tbSh, n - snum),
KEY_SIZE);
/* last parameter is del_par */
internal_move_pointers_items(&dest_bi, &src_bi,
LAST_TO_FIRST, snum, 0);
- /* internal_move_pointers_items(S_new, tbSh, LAST_TO_FIRST, snum, 0); */
} else if (n + insert_num - snum < child_pos) {
/* all new items fall into S_new */
/* store the delimiting key for the next level */
- /* new_insert_key = (n + insert_item - snum)'th key in S[h] */
+ /*
+ * new_insert_key = (n + insert_item - snum)'th
+ * key in S[h]
+ */
memcpy(&new_insert_key,
- B_N_PDELIM_KEY(tbSh, n + insert_num - snum),
+ internal_key(tbSh, n + insert_num - snum),
KEY_SIZE);
/* last parameter is del_par */
internal_move_pointers_items(&dest_bi, &src_bi,
LAST_TO_FIRST,
snum - insert_num, 0);
- /* internal_move_pointers_items(S_new,tbSh,1,snum - insert_num,0); */
- /* insert insert_num keys and node-pointers into S_new */
+ /*
+ * insert insert_num keys and node-pointers
+ * into S_new
+ */
internal_insert_childs(&dest_bi,
/*S_new,tb->S[h-1]->b_next, */
child_pos - n - insert_num +
@@ -1033,7 +1102,6 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
internal_move_pointers_items(&dest_bi, &src_bi,
LAST_TO_FIRST,
n - child_pos + 1, 1);
- /* internal_move_pointers_items(S_new,tbSh,1,n - child_pos + 1,1); */
/* calculate number of new items that fall into S_new */
k = snum - n + child_pos - 1;
@@ -1043,7 +1111,10 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
/* new_insert_key = insert_key[insert_num - k - 1] */
memcpy(&new_insert_key, insert_key + insert_num - k - 1,
KEY_SIZE);
- /* replace first node-ptr in S_new by node-ptr to insert_ptr[insert_num-k-1] */
+ /*
+ * replace first node-ptr in S_new by node-ptr
+ * to insert_ptr[insert_num-k-1]
+ */
dc = B_N_CHILD(S_new, 0);
put_dc_size(dc,
@@ -1066,7 +1137,7 @@ int balance_internal(struct tree_balance *tb, /* tree_balance structure
|| buffer_dirty(S_new), "cm-00001: bad S_new (%b)",
S_new);
- // S_new is released in unfix_nodes
+ /* S_new is released in unfix_nodes */
}
n = B_NR_ITEMS(tbSh); /*number of items in S[h] */
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ad62bdbb451..63b2b0ec49e 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -25,7 +25,10 @@ int reiserfs_commit_write(struct file *f, struct page *page,
void reiserfs_evict_inode(struct inode *inode)
{
- /* We need blocks for transaction + (user+group) quota update (possibly delete) */
+ /*
+ * We need blocks for transaction + (user+group) quota
+ * update (possibly delete)
+ */
int jbegin_count =
JOURNAL_PER_BALANCE_CNT * 2 +
2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
@@ -35,12 +38,16 @@ void reiserfs_evict_inode(struct inode *inode)
if (!inode->i_nlink && !is_bad_inode(inode))
dquot_initialize(inode);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
if (inode->i_nlink)
goto no_delete;
- /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
- if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */
+ /*
+ * The = 0 happens when we abort creating a new inode
+ * for some reason like lack of space..
+ * also handles bad_inode case
+ */
+ if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) {
reiserfs_delete_xattrs(inode);
@@ -54,34 +61,43 @@ void reiserfs_evict_inode(struct inode *inode)
err = reiserfs_delete_object(&th, inode);
- /* Do quota update inside a transaction for journaled quotas. We must do that
- * after delete_object so that quota updates go into the same transaction as
- * stat data deletion */
+ /*
+ * Do quota update inside a transaction for journaled quotas.
+ * We must do that after delete_object so that quota updates
+ * go into the same transaction as stat data deletion
+ */
if (!err) {
int depth = reiserfs_write_unlock_nested(inode->i_sb);
dquot_free_inode(inode);
reiserfs_write_lock_nested(inode->i_sb, depth);
}
- if (journal_end(&th, inode->i_sb, jbegin_count))
+ if (journal_end(&th))
goto out;
- /* check return value from reiserfs_delete_object after
+ /*
+ * check return value from reiserfs_delete_object after
* ending the transaction
*/
if (err)
goto out;
- /* all items of file are deleted, so we can remove "save" link */
- remove_save_link(inode, 0 /* not truncate */ ); /* we can't do anything
- * about an error here */
+ /*
+ * all items of file are deleted, so we can remove
+ * "save" link
+ * we can't do anything about an error here
+ */
+ remove_save_link(inode, 0 /* not truncate */);
out:
reiserfs_write_unlock(inode->i_sb);
} else {
/* no object items are in the tree */
;
}
- clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */
+
+ /* note this must go after the journal_end to prevent deadlock */
+ clear_inode(inode);
+
dquot_drop(inode);
inode->i_blocks = 0;
return;
@@ -103,8 +119,10 @@ static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
key->key_length = length;
}
-/* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set
- offset and type of key */
+/*
+ * take base of inode_key (it comes from inode always) (dirid, objectid)
+ * and version from an inode, set offset and type of key
+ */
void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset,
int type, int length)
{
@@ -114,9 +132,7 @@ void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset,
length);
}
-//
-// when key is 0, do not set version and short key
-//
+/* when key is 0, do not set version and short key */
inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
int version,
loff_t offset, int type, int length,
@@ -132,43 +148,47 @@ inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
set_le_ih_k_type(ih, type);
put_ih_item_len(ih, length);
/* set_ih_free_space (ih, 0); */
- // for directory items it is entry count, for directs and stat
- // datas - 0xffff, for indirects - 0
+ /*
+ * for directory items it is entry count, for directs and stat
+ * datas - 0xffff, for indirects - 0
+ */
put_ih_entry_count(ih, entry_count);
}
-//
-// FIXME: we might cache recently accessed indirect item
-
-// Ugh. Not too eager for that....
-// I cut the code until such time as I see a convincing argument (benchmark).
-// I don't want a bloated inode struct..., and I don't like code complexity....
-
-/* cutting the code is fine, since it really isn't in use yet and is easy
-** to add back in. But, Vladimir has a really good idea here. Think
-** about what happens for reading a file. For each page,
-** The VFS layer calls reiserfs_readpage, who searches the tree to find
-** an indirect item. This indirect item has X number of pointers, where
-** X is a big number if we've done the block allocation right. But,
-** we only use one or two of these pointers during each call to readpage,
-** needlessly researching again later on.
-**
-** The size of the cache could be dynamic based on the size of the file.
-**
-** I'd also like to see us cache the location the stat data item, since
-** we are needlessly researching for that frequently.
-**
-** --chris
-*/
+/*
+ * FIXME: we might cache recently accessed indirect item
+ * Ugh. Not too eager for that....
+ * I cut the code until such time as I see a convincing argument (benchmark).
+ * I don't want a bloated inode struct..., and I don't like code complexity....
+ */
-/* If this page has a file tail in it, and
-** it was read in by get_block_create_0, the page data is valid,
-** but tail is still sitting in a direct item, and we can't write to
-** it. So, look through this page, and check all the mapped buffers
-** to make sure they have valid block numbers. Any that don't need
-** to be unmapped, so that __block_write_begin will correctly call
-** reiserfs_get_block to convert the tail into an unformatted node
-*/
+/*
+ * cutting the code is fine, since it really isn't in use yet and is easy
+ * to add back in. But, Vladimir has a really good idea here. Think
+ * about what happens for reading a file. For each page,
+ * The VFS layer calls reiserfs_readpage, who searches the tree to find
+ * an indirect item. This indirect item has X number of pointers, where
+ * X is a big number if we've done the block allocation right. But,
+ * we only use one or two of these pointers during each call to readpage,
+ * needlessly researching again later on.
+ *
+ * The size of the cache could be dynamic based on the size of the file.
+ *
+ * I'd also like to see us cache the location the stat data item, since
+ * we are needlessly researching for that frequently.
+ *
+ * --chris
+ */
+
+/*
+ * If this page has a file tail in it, and
+ * it was read in by get_block_create_0, the page data is valid,
+ * but tail is still sitting in a direct item, and we can't write to
+ * it. So, look through this page, and check all the mapped buffers
+ * to make sure they have valid block numbers. Any that don't need
+ * to be unmapped, so that __block_write_begin will correctly call
+ * reiserfs_get_block to convert the tail into an unformatted node
+ */
static inline void fix_tail_page_for_writing(struct page *page)
{
struct buffer_head *head, *next, *bh;
@@ -186,8 +206,10 @@ static inline void fix_tail_page_for_writing(struct page *page)
}
}
-/* reiserfs_get_block does not need to allocate a block only if it has been
- done already or non-hole position has been found in the indirect item */
+/*
+ * reiserfs_get_block does not need to allocate a block only if it has been
+ * done already or non-hole position has been found in the indirect item
+ */
static inline int allocation_needed(int retval, b_blocknr_t allocated,
struct item_head *ih,
__le32 * item, int pos_in_item)
@@ -211,14 +233,16 @@ static inline void set_block_dev_mapped(struct buffer_head *bh,
map_bh(bh, inode->i_sb, block);
}
-//
-// files which were created in the earlier version can not be longer,
-// than 2 gb
-//
+/*
+ * files which were created in the earlier version can not be longer,
+ * than 2 gb
+ */
static int file_capable(struct inode *inode, sector_t block)
{
- if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 || // it is new file.
- block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is inside of 2gb
+ /* it is new file. */
+ if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 ||
+ /* old file, but 'block' is inside of 2gb */
+ block < (1 << (31 - inode->i_sb->s_blocksize_bits)))
return 1;
return 0;
@@ -228,7 +252,6 @@ static int restart_transaction(struct reiserfs_transaction_handle *th,
struct inode *inode, struct treepath *path)
{
struct super_block *s = th->t_super;
- int len = th->t_blocks_allocated;
int err;
BUG_ON(!th->t_trans_id);
@@ -241,7 +264,7 @@ static int restart_transaction(struct reiserfs_transaction_handle *th,
return 0;
}
reiserfs_update_sd(th, inode);
- err = journal_end(th, s, len);
+ err = journal_end(th);
if (!err) {
err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6);
if (!err)
@@ -250,14 +273,14 @@ static int restart_transaction(struct reiserfs_transaction_handle *th,
return err;
}
-// it is called by get_block when create == 0. Returns block number
-// for 'block'-th logical block of file. When it hits direct item it
-// returns 0 (being called from bmap) or read direct item into piece
-// of page (bh_result)
-
-// Please improve the english/clarity in the comment above, as it is
-// hard to understand.
-
+/*
+ * it is called by get_block when create == 0. Returns block number
+ * for 'block'-th logical block of file. When it hits direct item it
+ * returns 0 (being called from bmap) or read direct item into piece
+ * of page (bh_result)
+ * Please improve the english/clarity in the comment above, as it is
+ * hard to understand.
+ */
static int _get_block_create_0(struct inode *inode, sector_t block,
struct buffer_head *bh_result, int args)
{
@@ -273,7 +296,7 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
int done = 0;
unsigned long offset;
- // prepare the key to look for the 'block'-th block of file
+ /* prepare the key to look for the 'block'-th block of file */
make_cpu_key(&key, inode,
(loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
3);
@@ -285,23 +308,28 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
kunmap(bh_result->b_page);
if (result == IO_ERROR)
return -EIO;
- // We do not return -ENOENT if there is a hole but page is uptodate, because it means
- // That there is some MMAPED data associated with it that is yet to be written to disk.
+ /*
+ * We do not return -ENOENT if there is a hole but page is
+ * uptodate, because it means that there is some MMAPED data
+ * associated with it that is yet to be written to disk.
+ */
if ((args & GET_BLOCK_NO_HOLE)
&& !PageUptodate(bh_result->b_page)) {
return -ENOENT;
}
return 0;
}
- //
+
bh = get_last_bh(&path);
- ih = get_ih(&path);
+ ih = tp_item_head(&path);
if (is_indirect_le_ih(ih)) {
- __le32 *ind_item = (__le32 *) B_I_PITEM(bh, ih);
+ __le32 *ind_item = (__le32 *) ih_item_body(bh, ih);
- /* FIXME: here we could cache indirect item or part of it in
- the inode to avoid search_by_key in case of subsequent
- access to file */
+ /*
+ * FIXME: here we could cache indirect item or part of it in
+ * the inode to avoid search_by_key in case of subsequent
+ * access to file
+ */
blocknr = get_block_num(ind_item, path.pos_in_item);
ret = 0;
if (blocknr) {
@@ -311,8 +339,12 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
set_buffer_boundary(bh_result);
}
} else
- // We do not return -ENOENT if there is a hole but page is uptodate, because it means
- // That there is some MMAPED data associated with it that is yet to be written to disk.
+ /*
+ * We do not return -ENOENT if there is a hole but
+ * page is uptodate, because it means that there is
+ * some MMAPED data associated with it that is
+ * yet to be written to disk.
+ */
if ((args & GET_BLOCK_NO_HOLE)
&& !PageUptodate(bh_result->b_page)) {
ret = -ENOENT;
@@ -323,41 +355,45 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
kunmap(bh_result->b_page);
return ret;
}
- // requested data are in direct item(s)
+ /* requested data are in direct item(s) */
if (!(args & GET_BLOCK_READ_DIRECT)) {
- // we are called by bmap. FIXME: we can not map block of file
- // when it is stored in direct item(s)
+ /*
+ * we are called by bmap. FIXME: we can not map block of file
+ * when it is stored in direct item(s)
+ */
pathrelse(&path);
if (p)
kunmap(bh_result->b_page);
return -ENOENT;
}
- /* if we've got a direct item, and the buffer or page was uptodate,
- ** we don't want to pull data off disk again. skip to the
- ** end, where we map the buffer and return
+ /*
+ * if we've got a direct item, and the buffer or page was uptodate,
+ * we don't want to pull data off disk again. skip to the
+ * end, where we map the buffer and return
*/
if (buffer_uptodate(bh_result)) {
goto finished;
} else
/*
- ** grab_tail_page can trigger calls to reiserfs_get_block on up to date
- ** pages without any buffers. If the page is up to date, we don't want
- ** read old data off disk. Set the up to date bit on the buffer instead
- ** and jump to the end
+ * grab_tail_page can trigger calls to reiserfs_get_block on
+ * up to date pages without any buffers. If the page is up
+ * to date, we don't want read old data off disk. Set the up
+ * to date bit on the buffer instead and jump to the end
*/
if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
set_buffer_uptodate(bh_result);
goto finished;
}
- // read file tail into part of page
+ /* read file tail into part of page */
offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
copy_item_head(&tmp_ih, ih);
- /* we only want to kmap if we are reading the tail into the page.
- ** this is not the common case, so we don't kmap until we are
- ** sure we need to. But, this means the item might move if
- ** kmap schedules
+ /*
+ * we only want to kmap if we are reading the tail into the page.
+ * this is not the common case, so we don't kmap until we are
+ * sure we need to. But, this means the item might move if
+ * kmap schedules
*/
if (!p)
p = (char *)kmap(bh_result->b_page);
@@ -368,10 +404,11 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
if (!is_direct_le_ih(ih)) {
BUG();
}
- /* make sure we don't read more bytes than actually exist in
- ** the file. This can happen in odd cases where i_size isn't
- ** correct, and when direct item padding results in a few
- ** extra bytes at the end of the direct item
+ /*
+ * make sure we don't read more bytes than actually exist in
+ * the file. This can happen in odd cases where i_size isn't
+ * correct, and when direct item padding results in a few
+ * extra bytes at the end of the direct item
*/
if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
break;
@@ -383,40 +420,43 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
} else {
chars = ih_item_len(ih) - path.pos_in_item;
}
- memcpy(p, B_I_PITEM(bh, ih) + path.pos_in_item, chars);
+ memcpy(p, ih_item_body(bh, ih) + path.pos_in_item, chars);
if (done)
break;
p += chars;
+ /*
+ * we done, if read direct item is not the last item of
+ * node FIXME: we could try to check right delimiting key
+ * to see whether direct item continues in the right
+ * neighbor or rely on i_size
+ */
if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1))
- // we done, if read direct item is not the last item of
- // node FIXME: we could try to check right delimiting key
- // to see whether direct item continues in the right
- // neighbor or rely on i_size
break;
- // update key to look for the next piece
+ /* update key to look for the next piece */
set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars);
result = search_for_position_by_key(inode->i_sb, &key, &path);
if (result != POSITION_FOUND)
- // i/o error most likely
+ /* i/o error most likely */
break;
bh = get_last_bh(&path);
- ih = get_ih(&path);
+ ih = tp_item_head(&path);
} while (1);
flush_dcache_page(bh_result->b_page);
kunmap(bh_result->b_page);
- finished:
+finished:
pathrelse(&path);
if (result == IO_ERROR)
return -EIO;
- /* this buffer has valid data, but isn't valid for io. mapping it to
+ /*
+ * this buffer has valid data, but isn't valid for io. mapping it to
* block #0 tells the rest of reiserfs it just has a tail in it
*/
map_bh(bh_result, inode->i_sb, 0);
@@ -424,8 +464,10 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
return 0;
}
-// this is called to create file map. So, _get_block_create_0 will not
-// read direct item
+/*
+ * this is called to create file map. So, _get_block_create_0 will not
+ * read direct item
+ */
static int reiserfs_bmap(struct inode *inode, sector_t block,
struct buffer_head *bh_result, int create)
{
@@ -439,22 +481,23 @@ static int reiserfs_bmap(struct inode *inode, sector_t block,
return 0;
}
-/* special version of get_block that is only used by grab_tail_page right
-** now. It is sent to __block_write_begin, and when you try to get a
-** block past the end of the file (or a block from a hole) it returns
-** -ENOENT instead of a valid buffer. __block_write_begin expects to
-** be able to do i/o on the buffers returned, unless an error value
-** is also returned.
-**
-** So, this allows __block_write_begin to be used for reading a single block
-** in a page. Where it does not produce a valid page for holes, or past the
-** end of the file. This turns out to be exactly what we need for reading
-** tails for conversion.
-**
-** The point of the wrapper is forcing a certain value for create, even
-** though the VFS layer is calling this function with create==1. If you
-** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
-** don't use this function.
+/*
+ * special version of get_block that is only used by grab_tail_page right
+ * now. It is sent to __block_write_begin, and when you try to get a
+ * block past the end of the file (or a block from a hole) it returns
+ * -ENOENT instead of a valid buffer. __block_write_begin expects to
+ * be able to do i/o on the buffers returned, unless an error value
+ * is also returned.
+ *
+ * So, this allows __block_write_begin to be used for reading a single block
+ * in a page. Where it does not produce a valid page for holes, or past the
+ * end of the file. This turns out to be exactly what we need for reading
+ * tails for conversion.
+ *
+ * The point of the wrapper is forcing a certain value for create, even
+ * though the VFS layer is calling this function with create==1. If you
+ * don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block,
+ * don't use this function.
*/
static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
struct buffer_head *bh_result,
@@ -463,8 +506,10 @@ static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE);
}
-/* This is special helper for reiserfs_get_block in case we are executing
- direct_IO request. */
+/*
+ * This is special helper for reiserfs_get_block in case we are executing
+ * direct_IO request.
+ */
static int reiserfs_get_blocks_direct_io(struct inode *inode,
sector_t iblock,
struct buffer_head *bh_result,
@@ -474,9 +519,11 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
bh_result->b_page = NULL;
- /* We set the b_size before reiserfs_get_block call since it is
- referenced in convert_tail_for_hole() that may be called from
- reiserfs_get_block() */
+ /*
+ * We set the b_size before reiserfs_get_block call since it is
+ * referenced in convert_tail_for_hole() that may be called from
+ * reiserfs_get_block()
+ */
bh_result->b_size = (1 << inode->i_blkbits);
ret = reiserfs_get_block(inode, iblock, bh_result,
@@ -486,14 +533,18 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
/* don't allow direct io onto tail pages */
if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
- /* make sure future calls to the direct io funcs for this offset
- ** in the file fail by unmapping the buffer
+ /*
+ * make sure future calls to the direct io funcs for this
+ * offset in the file fail by unmapping the buffer
*/
clear_buffer_mapped(bh_result);
ret = -EINVAL;
}
- /* Possible unpacked tail. Flush the data before pages have
- disappeared */
+
+ /*
+ * Possible unpacked tail. Flush the data before pages have
+ * disappeared
+ */
if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
int err;
@@ -507,20 +558,20 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
if (err < 0)
ret = err;
}
- out:
+out:
return ret;
}
/*
-** helper function for when reiserfs_get_block is called for a hole
-** but the file tail is still in a direct item
-** bh_result is the buffer head for the hole
-** tail_offset is the offset of the start of the tail in the file
-**
-** This calls prepare_write, which will start a new transaction
-** you should not be in a transaction, or have any paths held when you
-** call this.
-*/
+ * helper function for when reiserfs_get_block is called for a hole
+ * but the file tail is still in a direct item
+ * bh_result is the buffer head for the hole
+ * tail_offset is the offset of the start of the tail in the file
+ *
+ * This calls prepare_write, which will start a new transaction
+ * you should not be in a transaction, or have any paths held when you
+ * call this.
+ */
static int convert_tail_for_hole(struct inode *inode,
struct buffer_head *bh_result,
loff_t tail_offset)
@@ -540,9 +591,10 @@ static int convert_tail_for_hole(struct inode *inode,
tail_end = (tail_start | (bh_result->b_size - 1)) + 1;
index = tail_offset >> PAGE_CACHE_SHIFT;
- /* hole_page can be zero in case of direct_io, we are sure
- that we cannot get here if we write with O_DIRECT into
- tail page */
+ /*
+ * hole_page can be zero in case of direct_io, we are sure
+ * that we cannot get here if we write with O_DIRECT into tail page
+ */
if (!hole_page || index != hole_page->index) {
tail_page = grab_cache_page(inode->i_mapping, index);
retval = -ENOMEM;
@@ -553,14 +605,15 @@ static int convert_tail_for_hole(struct inode *inode,
tail_page = hole_page;
}
- /* we don't have to make sure the conversion did not happen while
- ** we were locking the page because anyone that could convert
- ** must first take i_mutex.
- **
- ** We must fix the tail page for writing because it might have buffers
- ** that are mapped, but have a block number of 0. This indicates tail
- ** data that has been read directly into the page, and
- ** __block_write_begin won't trigger a get_block in this case.
+ /*
+ * we don't have to make sure the conversion did not happen while
+ * we were locking the page because anyone that could convert
+ * must first take i_mutex.
+ *
+ * We must fix the tail page for writing because it might have buffers
+ * that are mapped, but have a block number of 0. This indicates tail
+ * data that has been read directly into the page, and
+ * __block_write_begin won't trigger a get_block in this case.
*/
fix_tail_page_for_writing(tail_page);
retval = __reiserfs_write_begin(tail_page, tail_start,
@@ -573,12 +626,12 @@ static int convert_tail_for_hole(struct inode *inode,
retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end);
- unlock:
+unlock:
if (tail_page != hole_page) {
unlock_page(tail_page);
page_cache_release(tail_page);
}
- out:
+out:
return retval;
}
@@ -604,7 +657,8 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
struct buffer_head *bh_result, int create)
{
int repeat, retval = 0;
- b_blocknr_t allocated_block_nr = 0; // b_blocknr_t is (unsigned) 32 bit int
+ /* b_blocknr_t is (unsigned) 32 bit int*/
+ b_blocknr_t allocated_block_nr = 0;
INITIALIZE_PATH(path);
int pos_in_item;
struct cpu_key key;
@@ -614,12 +668,14 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
int done;
int fs_gen;
struct reiserfs_transaction_handle *th = NULL;
- /* space reserved in transaction batch:
- . 3 balancings in direct->indirect conversion
- . 1 block involved into reiserfs_update_sd()
- XXX in practically impossible worst case direct2indirect()
- can incur (much) more than 3 balancings.
- quota update for user, group */
+ /*
+ * space reserved in transaction batch:
+ * . 3 balancings in direct->indirect conversion
+ * . 1 block involved into reiserfs_update_sd()
+ * XXX in practically impossible worst case direct2indirect()
+ * can incur (much) more than 3 balancings.
+ * quota update for user, group
+ */
int jbegin_count =
JOURNAL_PER_BALANCE_CNT * 3 + 1 +
2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
@@ -636,8 +692,9 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
return -EFBIG;
}
- /* if !create, we aren't changing the FS, so we don't need to
- ** log anything, so we don't need to start a transaction
+ /*
+ * if !create, we aren't changing the FS, so we don't need to
+ * log anything, so we don't need to start a transaction
*/
if (!(create & GET_BLOCK_CREATE)) {
int ret;
@@ -647,6 +704,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
reiserfs_write_unlock(inode->i_sb);
return ret;
}
+
/*
* if we're already in a transaction, make sure to close
* any new transactions we start in this func
@@ -655,8 +713,10 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
reiserfs_transaction_running(inode->i_sb))
dangle = 0;
- /* If file is of such a size, that it might have a tail and tails are enabled
- ** we should mark it as possibly needing tail packing on close
+ /*
+ * If file is of such a size, that it might have a tail and
+ * tails are enabled we should mark it as possibly needing
+ * tail packing on close
*/
if ((have_large_tails(inode->i_sb)
&& inode->i_size < i_block_size(inode) * 4)
@@ -667,7 +727,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
/* set the key of the first byte in the 'block'-th block of file */
make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ );
if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
- start_trans:
+start_trans:
th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
if (!th) {
retval = -ENOMEM;
@@ -675,7 +735,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
}
reiserfs_update_inode_transaction(inode);
}
- research:
+research:
retval = search_for_position_by_key(inode->i_sb, &key, &path);
if (retval == IO_ERROR) {
@@ -684,8 +744,8 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
}
bh = get_last_bh(&path);
- ih = get_ih(&path);
- item = get_item(&path);
+ ih = tp_item_head(&path);
+ item = tp_item_body(&path);
pos_in_item = path.pos_in_item;
fs_gen = get_generation(inode->i_sb);
@@ -703,11 +763,12 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
_allocate_block(th, block, inode, &allocated_block_nr,
&path, create);
+ /*
+ * restart the transaction to give the journal a chance to free
+ * some blocks. releases the path, so we have to go back to
+ * research if we succeed on the second try
+ */
if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
- /* restart the transaction to give the journal a chance to free
- ** some blocks. releases the path, so we have to go back to
- ** research if we succeed on the second try
- */
SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
retval = restart_transaction(th, inode, &path);
if (retval)
@@ -734,9 +795,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
if (indirect_item_found(retval, ih)) {
b_blocknr_t unfm_ptr;
- /* 'block'-th block is in the file already (there is
- corresponding cell in some indirect item). But it may be
- zero unformatted node pointer (hole) */
+ /*
+ * 'block'-th block is in the file already (there is
+ * corresponding cell in some indirect item). But it may be
+ * zero unformatted node pointer (hole)
+ */
unfm_ptr = get_block_num(item, pos_in_item);
if (unfm_ptr == 0) {
/* use allocated block to plug the hole */
@@ -753,7 +816,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
reiserfs_add_ordered_list(inode, bh_result);
put_block_num(item, pos_in_item, allocated_block_nr);
unfm_ptr = allocated_block_nr;
- journal_mark_dirty(th, inode->i_sb, bh);
+ journal_mark_dirty(th, bh);
reiserfs_update_sd(th, inode);
}
set_block_dev_mapped(bh_result, unfm_ptr, inode);
@@ -764,9 +827,10 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
reiserfs_write_unlock(inode->i_sb);
- /* the item was found, so new blocks were not added to the file
- ** there is no need to make sure the inode is updated with this
- ** transaction
+ /*
+ * the item was found, so new blocks were not added to the file
+ * there is no need to make sure the inode is updated with this
+ * transaction
*/
return retval;
}
@@ -776,9 +840,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
goto start_trans;
}
- /* desired position is not found or is in the direct item. We have
- to append file with holes up to 'block'-th block converting
- direct items to indirect one if necessary */
+ /*
+ * desired position is not found or is in the direct item. We have
+ * to append file with holes up to 'block'-th block converting
+ * direct items to indirect one if necessary
+ */
done = 0;
do {
if (is_statdata_le_ih(ih)) {
@@ -790,16 +856,18 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
TYPE_INDIRECT, UNFM_P_SIZE,
0 /* free_space */ );
+ /*
+ * we are going to add 'block'-th block to the file.
+ * Use allocated block for that
+ */
if (cpu_key_k_offset(&key) == 1) {
- /* we are going to add 'block'-th block to the file. Use
- allocated block for that */
unp = cpu_to_le32(allocated_block_nr);
set_block_dev_mapped(bh_result,
allocated_block_nr, inode);
set_buffer_new(bh_result);
done = 1;
}
- tmp_key = key; // ;)
+ tmp_key = key; /* ;) */
set_cpu_key_k_offset(&tmp_key, 1);
PATH_LAST_POSITION(&path)++;
@@ -809,9 +877,12 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
if (retval) {
reiserfs_free_block(th, inode,
allocated_block_nr, 1);
- goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
+ /*
+ * retval == -ENOSPC, -EDQUOT or -EIO
+ * or -EEXIST
+ */
+ goto failure;
}
- //mark_tail_converted (inode);
} else if (is_direct_le_ih(ih)) {
/* direct item has to be converted */
loff_t tail_offset;
@@ -819,18 +890,24 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
tail_offset =
((le_ih_k_offset(ih) -
1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
+
+ /*
+ * direct item we just found fits into block we have
+ * to map. Convert it into unformatted node: use
+ * bh_result for the conversion
+ */
if (tail_offset == cpu_key_k_offset(&key)) {
- /* direct item we just found fits into block we have
- to map. Convert it into unformatted node: use
- bh_result for the conversion */
set_block_dev_mapped(bh_result,
allocated_block_nr, inode);
unbh = bh_result;
done = 1;
} else {
- /* we have to padd file tail stored in direct item(s)
- up to block size and convert it to unformatted
- node. FIXME: this should also get into page cache */
+ /*
+ * we have to pad file tail stored in direct
+ * item(s) up to block size and convert it
+ * to unformatted node. FIXME: this should
+ * also get into page cache
+ */
pathrelse(&path);
/*
@@ -859,7 +936,10 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
inode->i_ino,
retval);
if (allocated_block_nr) {
- /* the bitmap, the super, and the stat data == 3 */
+ /*
+ * the bitmap, the super,
+ * and the stat data == 3
+ */
if (!th)
th = reiserfs_persistent_transaction(inode->i_sb, 3);
if (th)
@@ -881,43 +961,57 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
allocated_block_nr, 1);
goto failure;
}
- /* it is important the set_buffer_uptodate is done after
- ** the direct2indirect. The buffer might contain valid
- ** data newer than the data on disk (read by readpage, changed,
- ** and then sent here by writepage). direct2indirect needs
- ** to know if unbh was already up to date, so it can decide
- ** if the data in unbh needs to be replaced with data from
- ** the disk
+ /*
+ * it is important the set_buffer_uptodate is done
+ * after the direct2indirect. The buffer might
+ * contain valid data newer than the data on disk
+ * (read by readpage, changed, and then sent here by
+ * writepage). direct2indirect needs to know if unbh
+ * was already up to date, so it can decide if the
+ * data in unbh needs to be replaced with data from
+ * the disk
*/
set_buffer_uptodate(unbh);
- /* unbh->b_page == NULL in case of DIRECT_IO request, this means
- buffer will disappear shortly, so it should not be added to
+ /*
+ * unbh->b_page == NULL in case of DIRECT_IO request,
+ * this means buffer will disappear shortly, so it
+ * should not be added to
*/
if (unbh->b_page) {
- /* we've converted the tail, so we must
- ** flush unbh before the transaction commits
+ /*
+ * we've converted the tail, so we must
+ * flush unbh before the transaction commits
*/
reiserfs_add_tail_list(inode, unbh);
- /* mark it dirty now to prevent commit_write from adding
- ** this buffer to the inode's dirty buffer list
+ /*
+ * mark it dirty now to prevent commit_write
+ * from adding this buffer to the inode's
+ * dirty buffer list
*/
/*
- * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
- * It's still atomic, but it sets the page dirty too,
- * which makes it eligible for writeback at any time by the
- * VM (which was also the case with __mark_buffer_dirty())
+ * AKPM: changed __mark_buffer_dirty to
+ * mark_buffer_dirty(). It's still atomic,
+ * but it sets the page dirty too, which makes
+ * it eligible for writeback at any time by the
+ * VM (which was also the case with
+ * __mark_buffer_dirty())
*/
mark_buffer_dirty(unbh);
}
} else {
- /* append indirect item with holes if needed, when appending
- pointer to 'block'-th block use block, which is already
- allocated */
+ /*
+ * append indirect item with holes if needed, when
+ * appending pointer to 'block'-th block use block,
+ * which is already allocated
+ */
struct cpu_key tmp_key;
- unp_t unf_single = 0; // We use this in case we need to allocate only
- // one block which is a fastpath
+ /*
+ * We use this in case we need to allocate
+ * only one block which is a fastpath
+ */
+ unp_t unf_single = 0;
unp_t *un;
__u64 max_to_insert =
MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
@@ -926,14 +1020,17 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
"vs-804: invalid position for append");
- /* indirect item has to be appended, set up key of that position */
+ /*
+ * indirect item has to be appended,
+ * set up key of that position
+ * (key type is unimportant)
+ */
make_cpu_key(&tmp_key, inode,
le_key_k_offset(version,
- &(ih->ih_key)) +
+ &ih->ih_key) +
op_bytes_number(ih,
inode->i_sb->s_blocksize),
- //pos_in_item * inode->i_sb->s_blocksize,
- TYPE_INDIRECT, 3); // key type is unimportant
+ TYPE_INDIRECT, 3);
RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key),
"green-805: invalid offset");
@@ -954,8 +1051,10 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
}
}
if (blocks_needed <= max_to_insert) {
- /* we are going to add target block to the file. Use allocated
- block for that */
+ /*
+ * we are going to add target block to
+ * the file. Use allocated block for that
+ */
un[blocks_needed - 1] =
cpu_to_le32(allocated_block_nr);
set_block_dev_mapped(bh_result,
@@ -964,8 +1063,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
done = 1;
} else {
/* paste hole to the indirect item */
- /* If kmalloc failed, max_to_insert becomes zero and it means we
- only have space for one block */
+ /*
+ * If kmalloc failed, max_to_insert becomes
+ * zero and it means we only have space for
+ * one block
+ */
blocks_needed =
max_to_insert ? max_to_insert : 1;
}
@@ -984,9 +1086,12 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
goto failure;
}
if (!done) {
- /* We need to mark new file size in case this function will be
- interrupted/aborted later on. And we may do this only for
- holes. */
+ /*
+ * We need to mark new file size in case
+ * this function will be interrupted/aborted
+ * later on. And we may do this only for
+ * holes.
+ */
inode->i_size +=
inode->i_sb->s_blocksize * blocks_needed;
}
@@ -995,13 +1100,13 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
if (done == 1)
break;
- /* this loop could log more blocks than we had originally asked
- ** for. So, we have to allow the transaction to end if it is
- ** too big or too full. Update the inode so things are
- ** consistent if we crash before the function returns
- **
- ** release the path so that anybody waiting on the path before
- ** ending their transaction will be able to continue.
+ /*
+ * this loop could log more blocks than we had originally
+ * asked for. So, we have to allow the transaction to end
+ * if it is too big or too full. Update the inode so things
+ * are consistent if we crash before the function returns
+ * release the path so that anybody waiting on the path before
+ * ending their transaction will be able to continue.
*/
if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
retval = restart_transaction(th, inode, &path);
@@ -1031,14 +1136,14 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
goto failure;
}
bh = get_last_bh(&path);
- ih = get_ih(&path);
- item = get_item(&path);
+ ih = tp_item_head(&path);
+ item = tp_item_body(&path);
pos_in_item = path.pos_in_item;
} while (1);
retval = 0;
- failure:
+failure:
if (th && (!dangle || (retval && !th->t_trans_id))) {
int err;
if (th->t_trans_id)
@@ -1060,8 +1165,10 @@ reiserfs_readpages(struct file *file, struct address_space *mapping,
return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
}
-/* Compute real number of used bytes by file
- * Following three functions can go away when we'll have enough space in stat item
+/*
+ * Compute real number of used bytes by file
+ * Following three functions can go away when we'll have enough space in
+ * stat item
*/
static int real_space_diff(struct inode *inode, int sd_size)
{
@@ -1071,13 +1178,14 @@ static int real_space_diff(struct inode *inode, int sd_size)
if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
return sd_size;
- /* End of file is also in full block with indirect reference, so round
- ** up to the next block.
- **
- ** there is just no way to know if the tail is actually packed
- ** on the file, so we have to assume it isn't. When we pack the
- ** tail, we add 4 bytes to pretend there really is an unformatted
- ** node pointer
+ /*
+ * End of file is also in full block with indirect reference, so round
+ * up to the next block.
+ *
+ * there is just no way to know if the tail is actually packed
+ * on the file, so we have to assume it isn't. When we pack the
+ * tail, we add 4 bytes to pretend there really is an unformatted
+ * node pointer
*/
bytes =
((inode->i_size +
@@ -1108,36 +1216,36 @@ static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
bytes += (loff_t) 511;
}
- /* files from before the quota patch might i_blocks such that
- ** bytes < real_space. Deal with that here to prevent it from
- ** going negative.
+ /*
+ * files from before the quota patch might i_blocks such that
+ * bytes < real_space. Deal with that here to prevent it from
+ * going negative.
*/
if (bytes < real_space)
return 0;
return (bytes - real_space) >> 9;
}
-//
-// BAD: new directories have stat data of new type and all other items
-// of old type. Version stored in the inode says about body items, so
-// in update_stat_data we can not rely on inode, but have to check
-// item version directly
-//
+/*
+ * BAD: new directories have stat data of new type and all other items
+ * of old type. Version stored in the inode says about body items, so
+ * in update_stat_data we can not rely on inode, but have to check
+ * item version directly
+ */
-// called by read_locked_inode
+/* called by read_locked_inode */
static void init_inode(struct inode *inode, struct treepath *path)
{
struct buffer_head *bh;
struct item_head *ih;
__u32 rdev;
- //int version = ITEM_VERSION_1;
bh = PATH_PLAST_BUFFER(path);
- ih = PATH_PITEM_HEAD(path);
+ ih = tp_item_head(path);
- copy_key(INODE_PKEY(inode), &(ih->ih_key));
+ copy_key(INODE_PKEY(inode), &ih->ih_key);
- INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
+ INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list);
REISERFS_I(inode)->i_flags = 0;
REISERFS_I(inode)->i_prealloc_block = 0;
REISERFS_I(inode)->i_prealloc_count = 0;
@@ -1147,7 +1255,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
if (stat_data_v1(ih)) {
struct stat_data_v1 *sd =
- (struct stat_data_v1 *)B_I_PITEM(bh, ih);
+ (struct stat_data_v1 *)ih_item_body(bh, ih);
unsigned long blocks;
set_inode_item_key_version(inode, KEY_FORMAT_3_5);
@@ -1168,20 +1276,26 @@ static void init_inode(struct inode *inode, struct treepath *path)
inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
blocks = (inode->i_size + 511) >> 9;
blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9);
+
+ /*
+ * there was a bug in <=3.5.23 when i_blocks could take
+ * negative values. Starting from 3.5.17 this value could
+ * even be stored in stat data. For such files we set
+ * i_blocks based on file size. Just 2 notes: this can be
+ * wrong for sparse files. On-disk value will be only
+ * updated if file's inode will ever change
+ */
if (inode->i_blocks > blocks) {
- // there was a bug in <=3.5.23 when i_blocks could take negative
- // values. Starting from 3.5.17 this value could even be stored in
- // stat data. For such files we set i_blocks based on file
- // size. Just 2 notes: this can be wrong for sparce files. On-disk value will be
- // only updated if file's inode will ever change
inode->i_blocks = blocks;
}
rdev = sd_v1_rdev(sd);
REISERFS_I(inode)->i_first_direct_byte =
sd_v1_first_direct_byte(sd);
- /* an early bug in the quota code can give us an odd number for the
- ** block count. This is incorrect, fix it here.
+
+ /*
+ * an early bug in the quota code can give us an odd
+ * number for the block count. This is incorrect, fix it here.
*/
if (inode->i_blocks & 1) {
inode->i_blocks++;
@@ -1189,13 +1303,17 @@ static void init_inode(struct inode *inode, struct treepath *path)
inode_set_bytes(inode,
to_real_used_space(inode, inode->i_blocks,
SD_V1_SIZE));
- /* nopack is initially zero for v1 objects. For v2 objects,
- nopack is initialised from sd_attrs */
+ /*
+ * nopack is initially zero for v1 objects. For v2 objects,
+ * nopack is initialised from sd_attrs
+ */
REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
} else {
- // new stat data found, but object may have old items
- // (directories and symlinks)
- struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih);
+ /*
+ * new stat data found, but object may have old items
+ * (directories and symlinks)
+ */
+ struct stat_data *sd = (struct stat_data *)ih_item_body(bh, ih);
inode->i_mode = sd_v2_mode(sd);
set_nlink(inode, sd_v2_nlink(sd));
@@ -1225,8 +1343,10 @@ static void init_inode(struct inode *inode, struct treepath *path)
inode_set_bytes(inode,
to_real_used_space(inode, inode->i_blocks,
SD_V2_SIZE));
- /* read persistent inode attributes from sd and initialise
- generic inode flags from them */
+ /*
+ * read persistent inode attributes from sd and initialise
+ * generic inode flags from them
+ */
REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
}
@@ -1249,7 +1369,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
}
}
-// update new stat data with inode fields
+/* update new stat data with inode fields */
static void inode2sd(void *sd, struct inode *inode, loff_t size)
{
struct stat_data *sd_v2 = (struct stat_data *)sd;
@@ -1273,7 +1393,7 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size)
set_sd_v2_attrs(sd_v2, flags);
}
-// used to copy inode's fields to old stat data
+/* used to copy inode's fields to old stat data */
static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
{
struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd;
@@ -1292,14 +1412,15 @@ static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
else
set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
- // Sigh. i_first_direct_byte is back
+ /* Sigh. i_first_direct_byte is back */
set_sd_v1_first_direct_byte(sd_v1,
REISERFS_I(inode)->i_first_direct_byte);
}
-/* NOTE, you must prepare the buffer head before sending it here,
-** and then log it after the call
-*/
+/*
+ * NOTE, you must prepare the buffer head before sending it here,
+ * and then log it after the call
+ */
static void update_stat_data(struct treepath *path, struct inode *inode,
loff_t size)
{
@@ -1307,17 +1428,17 @@ static void update_stat_data(struct treepath *path, struct inode *inode,
struct item_head *ih;
bh = PATH_PLAST_BUFFER(path);
- ih = PATH_PITEM_HEAD(path);
+ ih = tp_item_head(path);
if (!is_statdata_le_ih(ih))
reiserfs_panic(inode->i_sb, "vs-13065", "key %k, found item %h",
INODE_PKEY(inode), ih);
+ /* path points to old stat data */
if (stat_data_v1(ih)) {
- // path points to old stat data
- inode2sd_v1(B_I_PITEM(bh, ih), inode, size);
+ inode2sd_v1(ih_item_body(bh, ih), inode, size);
} else {
- inode2sd(B_I_PITEM(bh, ih), inode, size);
+ inode2sd(ih_item_body(bh, ih), inode, size);
}
return;
@@ -1335,7 +1456,8 @@ void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
BUG_ON(!th->t_trans_id);
- make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3); //key type is unimportant
+ /* key type is unimportant */
+ make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);
for (;;) {
int pos;
@@ -1363,45 +1485,48 @@ void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
return;
}
- /* sigh, prepare_for_journal might schedule. When it schedules the
- ** FS might change. We have to detect that, and loop back to the
- ** search if the stat data item has moved
+ /*
+ * sigh, prepare_for_journal might schedule. When it
+ * schedules the FS might change. We have to detect that,
+ * and loop back to the search if the stat data item has moved
*/
bh = get_last_bh(&path);
- ih = get_ih(&path);
+ ih = tp_item_head(&path);
copy_item_head(&tmp_ih, ih);
fs_gen = get_generation(inode->i_sb);
reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
+
+ /* Stat_data item has been moved after scheduling. */
if (fs_changed(fs_gen, inode->i_sb)
&& item_moved(&tmp_ih, &path)) {
reiserfs_restore_prepared_buffer(inode->i_sb, bh);
- continue; /* Stat_data item has been moved after scheduling. */
+ continue;
}
break;
}
update_stat_data(&path, inode, size);
- journal_mark_dirty(th, th->t_super, bh);
+ journal_mark_dirty(th, bh);
pathrelse(&path);
return;
}
-/* reiserfs_read_locked_inode is called to read the inode off disk, and it
-** does a make_bad_inode when things go wrong. But, we need to make sure
-** and clear the key in the private portion of the inode, otherwise a
-** corresponding iput might try to delete whatever object the inode last
-** represented.
-*/
+/*
+ * reiserfs_read_locked_inode is called to read the inode off disk, and it
+ * does a make_bad_inode when things go wrong. But, we need to make sure
+ * and clear the key in the private portion of the inode, otherwise a
+ * corresponding iput might try to delete whatever object the inode last
+ * represented.
+ */
static void reiserfs_make_bad_inode(struct inode *inode)
{
memset(INODE_PKEY(inode), 0, KEY_SIZE);
make_bad_inode(inode);
}
-//
-// initially this function was derived from minix or ext2's analog and
-// evolved as the prototype did
-//
-
+/*
+ * initially this function was derived from minix or ext2's analog and
+ * evolved as the prototype did
+ */
int reiserfs_init_locked_inode(struct inode *inode, void *p)
{
struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p;
@@ -1410,8 +1535,10 @@ int reiserfs_init_locked_inode(struct inode *inode, void *p)
return 0;
}
-/* looks for stat data in the tree, and fills up the fields of in-core
- inode stat data fields */
+/*
+ * looks for stat data in the tree, and fills up the fields of in-core
+ * inode stat data fields
+ */
void reiserfs_read_locked_inode(struct inode *inode,
struct reiserfs_iget_args *args)
{
@@ -1422,8 +1549,10 @@ void reiserfs_read_locked_inode(struct inode *inode,
dirino = args->dirid;
- /* set version 1, version 2 could be used too, because stat data
- key is the same in both versions */
+ /*
+ * set version 1, version 2 could be used too, because stat data
+ * key is the same in both versions
+ */
key.version = KEY_FORMAT_3_5;
key.on_disk_key.k_dir_id = dirino;
key.on_disk_key.k_objectid = inode->i_ino;
@@ -1439,8 +1568,9 @@ void reiserfs_read_locked_inode(struct inode *inode,
reiserfs_make_bad_inode(inode);
return;
}
+
+ /* a stale NFS handle can trigger this without it being an error */
if (retval != ITEM_FOUND) {
- /* a stale NFS handle can trigger this without it being an error */
pathrelse(&path_to_sd);
reiserfs_make_bad_inode(inode);
clear_nlink(inode);
@@ -1449,20 +1579,25 @@ void reiserfs_read_locked_inode(struct inode *inode,
init_inode(inode, &path_to_sd);
- /* It is possible that knfsd is trying to access inode of a file
- that is being removed from the disk by some other thread. As we
- update sd on unlink all that is required is to check for nlink
- here. This bug was first found by Sizif when debugging
- SquidNG/Butterfly, forgotten, and found again after Philippe
- Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
-
- More logical fix would require changes in fs/inode.c:iput() to
- remove inode from hash-table _after_ fs cleaned disk stuff up and
- in iget() to return NULL if I_FREEING inode is found in
- hash-table. */
- /* Currently there is one place where it's ok to meet inode with
- nlink==0: processing of open-unlinked and half-truncated files
- during mount (fs/reiserfs/super.c:finish_unfinished()). */
+ /*
+ * It is possible that knfsd is trying to access inode of a file
+ * that is being removed from the disk by some other thread. As we
+ * update sd on unlink all that is required is to check for nlink
+ * here. This bug was first found by Sizif when debugging
+ * SquidNG/Butterfly, forgotten, and found again after Philippe
+ * Gramoulle <philippe.gramoulle@mmania.com> reproduced it.
+
+ * More logical fix would require changes in fs/inode.c:iput() to
+ * remove inode from hash-table _after_ fs cleaned disk stuff up and
+ * in iget() to return NULL if I_FREEING inode is found in
+ * hash-table.
+ */
+
+ /*
+ * Currently there is one place where it's ok to meet inode with
+ * nlink==0: processing of open-unlinked and half-truncated files
+ * during mount (fs/reiserfs/super.c:finish_unfinished()).
+ */
if ((inode->i_nlink == 0) &&
!REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) {
reiserfs_warning(inode->i_sb, "vs-13075",
@@ -1472,7 +1607,8 @@ void reiserfs_read_locked_inode(struct inode *inode,
reiserfs_make_bad_inode(inode);
}
- reiserfs_check_path(&path_to_sd); /* init inode should be relsing */
+ /* init inode should be relsing */
+ reiserfs_check_path(&path_to_sd);
/*
* Stat data v1 doesn't support ACLs.
@@ -1481,7 +1617,7 @@ void reiserfs_read_locked_inode(struct inode *inode,
cache_no_acl(inode);
}
-/**
+/*
* reiserfs_find_actor() - "find actor" reiserfs supplies to iget5_locked().
*
* @inode: inode from hash table to check
@@ -1556,7 +1692,8 @@ static struct dentry *reiserfs_get_dentry(struct super_block *sb,
struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type)
{
- /* fhtype happens to reflect the number of u32s encoded.
+ /*
+ * fhtype happens to reflect the number of u32s encoded.
* due to a bug in earlier code, fhtype might indicate there
* are more u32s then actually fitted.
* so if fhtype seems to be more than len, reduce fhtype.
@@ -1625,13 +1762,16 @@ int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
return *lenp;
}
-/* looks for stat data, then copies fields to it, marks the buffer
- containing stat data as dirty */
-/* reiserfs inodes are never really dirty, since the dirty inode call
-** always logs them. This call allows the VFS inode marking routines
-** to properly mark inodes for datasync and such, but only actually
-** does something when called for a synchronous update.
-*/
+/*
+ * looks for stat data, then copies fields to it, marks the buffer
+ * containing stat data as dirty
+ */
+/*
+ * reiserfs inodes are never really dirty, since the dirty inode call
+ * always logs them. This call allows the VFS inode marking routines
+ * to properly mark inodes for datasync and such, but only actually
+ * does something when called for a synchronous update.
+ */
int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct reiserfs_transaction_handle th;
@@ -1639,24 +1779,28 @@ int reiserfs_write_inode(struct inode *inode, struct writeback_control *wbc)
if (inode->i_sb->s_flags & MS_RDONLY)
return -EROFS;
- /* memory pressure can sometimes initiate write_inode calls with sync == 1,
- ** these cases are just when the system needs ram, not when the
- ** inode needs to reach disk for safety, and they can safely be
- ** ignored because the altered inode has already been logged.
+ /*
+ * memory pressure can sometimes initiate write_inode calls with
+ * sync == 1,
+ * these cases are just when the system needs ram, not when the
+ * inode needs to reach disk for safety, and they can safely be
+ * ignored because the altered inode has already been logged.
*/
if (wbc->sync_mode == WB_SYNC_ALL && !(current->flags & PF_MEMALLOC)) {
reiserfs_write_lock(inode->i_sb);
if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
reiserfs_update_sd(&th, inode);
- journal_end_sync(&th, inode->i_sb, jbegin_count);
+ journal_end_sync(&th);
}
reiserfs_write_unlock(inode->i_sb);
}
return 0;
}
-/* stat data of new object is inserted already, this inserts the item
- containing "." and ".." entries */
+/*
+ * stat data of new object is inserted already, this inserts the item
+ * containing "." and ".." entries
+ */
static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
struct inode *inode,
struct item_head *ih, struct treepath *path,
@@ -1674,9 +1818,11 @@ static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET,
TYPE_DIRENTRY, 3 /*key length */ );
- /* compose item head for new item. Directories consist of items of
- old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
- is done by reiserfs_new_inode */
+ /*
+ * compose item head for new item. Directories consist of items of
+ * old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
+ * is done by reiserfs_new_inode
+ */
if (old_format_only(sb)) {
make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
@@ -1714,9 +1860,12 @@ static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
return reiserfs_insert_item(th, path, &key, ih, inode, body);
}
-/* stat data of object has been inserted, this inserts the item
- containing the body of symlink */
-static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode, /* Inode of symlink */
+/*
+ * stat data of object has been inserted, this inserts the item
+ * containing the body of symlink
+ */
+static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th,
+ struct inode *inode,
struct item_head *ih,
struct treepath *path, const char *symname,
int item_len)
@@ -1754,15 +1903,26 @@ static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct i
return reiserfs_insert_item(th, path, &key, ih, inode, symname);
}
-/* inserts the stat data into the tree, and then calls
- reiserfs_new_directory (to insert ".", ".." item if new object is
- directory) or reiserfs_new_symlink (to insert symlink body if new
- object is symlink) or nothing (if new object is regular file)
-
- NOTE! uid and gid must already be set in the inode. If we return
- non-zero due to an error, we have to drop the quota previously allocated
- for the fresh inode. This can only be done outside a transaction, so
- if we return non-zero, we also end the transaction. */
+/*
+ * inserts the stat data into the tree, and then calls
+ * reiserfs_new_directory (to insert ".", ".." item if new object is
+ * directory) or reiserfs_new_symlink (to insert symlink body if new
+ * object is symlink) or nothing (if new object is regular file)
+
+ * NOTE! uid and gid must already be set in the inode. If we return
+ * non-zero due to an error, we have to drop the quota previously allocated
+ * for the fresh inode. This can only be done outside a transaction, so
+ * if we return non-zero, we also end the transaction.
+ *
+ * @th: active transaction handle
+ * @dir: parent directory for new inode
+ * @mode: mode of new inode
+ * @symname: symlink contents if inode is symlink
+ * @isize: 0 for regular file, EMPTY_DIR_SIZE for dirs, strlen(symname) for
+ * symlinks
+ * @inode: inode to be filled
+ * @security: optional security context to associate with this inode
+ */
int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
struct inode *dir, umode_t mode, const char *symname,
/* 0 for regular, EMTRY_DIR_SIZE for dirs,
@@ -1807,7 +1967,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
else
make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
- memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
+ memcpy(INODE_PKEY(inode), &ih.ih_key, KEY_SIZE);
args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
depth = reiserfs_write_unlock_nested(inode->i_sb);
@@ -1820,10 +1980,11 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
}
if (old_format_only(sb))
- /* not a perfect generation count, as object ids can be reused, but
- ** this is as good as reiserfs can do right now.
- ** note that the private part of inode isn't filled in yet, we have
- ** to use the directory.
+ /*
+ * not a perfect generation count, as object ids can be reused,
+ * but this is as good as reiserfs can do right now.
+ * note that the private part of inode isn't filled in yet,
+ * we have to use the directory.
*/
inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid);
else
@@ -1850,7 +2011,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ;
- INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
+ INIT_LIST_HEAD(&REISERFS_I(inode)->i_prealloc_list);
REISERFS_I(inode)->i_flags = 0;
REISERFS_I(inode)->i_prealloc_block = 0;
REISERFS_I(inode)->i_prealloc_count = 0;
@@ -1878,9 +2039,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
goto out_bad_inode;
}
if (old_format_only(sb)) {
+ /* i_uid or i_gid is too big to be stored in stat data v3.5 */
if (i_uid_read(inode) & ~0xffff || i_gid_read(inode) & ~0xffff) {
pathrelse(&path_to_key);
- /* i_uid or i_gid is too big to be stored in stat data v3.5 */
err = -EINVAL;
goto out_bad_inode;
}
@@ -1888,9 +2049,11 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
} else {
inode2sd(&sd, inode, inode->i_size);
}
- // store in in-core inode the key of stat data and version all
- // object items will have (directory items will have old offset
- // format, other new objects will consist of new items)
+ /*
+ * store in in-core inode the key of stat data and version all
+ * object items will have (directory items will have old offset
+ * format, other new objects will consist of new items)
+ */
if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
set_inode_item_key_version(inode, KEY_FORMAT_3_5);
else
@@ -1934,7 +2097,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
if (retval) {
err = retval;
reiserfs_check_path(&path_to_key);
- journal_end(th, th->t_super, th->t_blocks_allocated);
+ journal_end(th);
goto out_inserted_sd;
}
@@ -1945,7 +2108,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
if (retval) {
err = retval;
reiserfs_check_path(&path_to_key);
- journal_end(th, th->t_super, th->t_blocks_allocated);
+ journal_end(th);
goto out_inserted_sd;
}
} else if (inode->i_sb->s_flags & MS_POSIXACL) {
@@ -1962,8 +2125,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
if (retval) {
err = retval;
reiserfs_check_path(&path_to_key);
- retval = journal_end(th, th->t_super,
- th->t_blocks_allocated);
+ retval = journal_end(th);
if (retval)
err = retval;
goto out_inserted_sd;
@@ -1975,11 +2137,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
return 0;
-/* it looks like you can easily compress these two goto targets into
- * one. Keeping it like this doesn't actually hurt anything, and they
- * are place holders for what the quota code actually needs.
- */
- out_bad_inode:
+out_bad_inode:
/* Invalidate the object, nothing was inserted yet */
INODE_PKEY(inode)->k_objectid = 0;
@@ -1988,16 +2146,19 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
dquot_free_inode(inode);
reiserfs_write_lock_nested(inode->i_sb, depth);
- out_end_trans:
- journal_end(th, th->t_super, th->t_blocks_allocated);
- /* Drop can be outside and it needs more credits so it's better to have it outside */
+out_end_trans:
+ journal_end(th);
+ /*
+ * Drop can be outside and it needs more credits so it's better
+ * to have it outside
+ */
depth = reiserfs_write_unlock_nested(inode->i_sb);
dquot_drop(inode);
reiserfs_write_lock_nested(inode->i_sb, depth);
inode->i_flags |= S_NOQUOTA;
make_bad_inode(inode);
- out_inserted_sd:
+out_inserted_sd:
clear_nlink(inode);
th->t_trans_id = 0; /* so the caller can't use this handle later */
unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
@@ -2006,25 +2167,26 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
}
/*
-** finds the tail page in the page cache,
-** reads the last block in.
-**
-** On success, page_result is set to a locked, pinned page, and bh_result
-** is set to an up to date buffer for the last block in the file. returns 0.
-**
-** tail conversion is not done, so bh_result might not be valid for writing
-** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
-** trying to write the block.
-**
-** on failure, nonzero is returned, page_result and bh_result are untouched.
-*/
+ * finds the tail page in the page cache,
+ * reads the last block in.
+ *
+ * On success, page_result is set to a locked, pinned page, and bh_result
+ * is set to an up to date buffer for the last block in the file. returns 0.
+ *
+ * tail conversion is not done, so bh_result might not be valid for writing
+ * check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before
+ * trying to write the block.
+ *
+ * on failure, nonzero is returned, page_result and bh_result are untouched.
+ */
static int grab_tail_page(struct inode *inode,
struct page **page_result,
struct buffer_head **bh_result)
{
- /* we want the page with the last byte in the file,
- ** not the page that will hold the next byte for appending
+ /*
+ * we want the page with the last byte in the file,
+ * not the page that will hold the next byte for appending
*/
unsigned long index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
unsigned long pos = 0;
@@ -2036,10 +2198,11 @@ static int grab_tail_page(struct inode *inode,
struct page *page;
int error;
- /* we know that we are only called with inode->i_size > 0.
- ** we also know that a file tail can never be as big as a block
- ** If i_size % blocksize == 0, our file is currently block aligned
- ** and it won't need converting or zeroing after a truncate.
+ /*
+ * we know that we are only called with inode->i_size > 0.
+ * we also know that a file tail can never be as big as a block
+ * If i_size % blocksize == 0, our file is currently block aligned
+ * and it won't need converting or zeroing after a truncate.
*/
if ((offset & (blocksize - 1)) == 0) {
return -ENOENT;
@@ -2068,10 +2231,11 @@ static int grab_tail_page(struct inode *inode,
} while (bh != head);
if (!buffer_uptodate(bh)) {
- /* note, this should never happen, prepare_write should
- ** be taking care of this for us. If the buffer isn't up to date,
- ** I've screwed up the code to find the buffer, or the code to
- ** call prepare_write
+ /*
+ * note, this should never happen, prepare_write should be
+ * taking care of this for us. If the buffer isn't up to
+ * date, I've screwed up the code to find the buffer, or the
+ * code to call prepare_write
*/
reiserfs_error(inode->i_sb, "clm-6000",
"error reading block %lu", bh->b_blocknr);
@@ -2081,21 +2245,21 @@ static int grab_tail_page(struct inode *inode,
*bh_result = bh;
*page_result = page;
- out:
+out:
return error;
- unlock:
+unlock:
unlock_page(page);
page_cache_release(page);
return error;
}
/*
-** vfs version of truncate file. Must NOT be called with
-** a transaction already started.
-**
-** some code taken from block_truncate_page
-*/
+ * vfs version of truncate file. Must NOT be called with
+ * a transaction already started.
+ *
+ * some code taken from block_truncate_page
+ */
int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
{
struct reiserfs_transaction_handle th;
@@ -2113,9 +2277,11 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
if (inode->i_size > 0) {
error = grab_tail_page(inode, &page, &bh);
if (error) {
- // -ENOENT means we truncated past the end of the file,
- // and get_block_create_0 could not find a block to read in,
- // which is ok.
+ /*
+ * -ENOENT means we truncated past the end of the
+ * file, and get_block_create_0 could not find a
+ * block to read in, which is ok.
+ */
if (error != -ENOENT)
reiserfs_error(inode->i_sb, "clm-6001",
"grab_tail_page failed %d",
@@ -2125,29 +2291,33 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
}
}
- /* so, if page != NULL, we have a buffer head for the offset at
- ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
- ** then we have an unformatted node. Otherwise, we have a direct item,
- ** and no zeroing is required on disk. We zero after the truncate,
- ** because the truncate might pack the item anyway
- ** (it will unmap bh if it packs).
+ /*
+ * so, if page != NULL, we have a buffer head for the offset at
+ * the end of the file. if the bh is mapped, and bh->b_blocknr != 0,
+ * then we have an unformatted node. Otherwise, we have a direct item,
+ * and no zeroing is required on disk. We zero after the truncate,
+ * because the truncate might pack the item anyway
+ * (it will unmap bh if it packs).
+ *
+ * it is enough to reserve space in transaction for 2 balancings:
+ * one for "save" link adding and another for the first
+ * cut_from_item. 1 is for update_sd
*/
- /* it is enough to reserve space in transaction for 2 balancings:
- one for "save" link adding and another for the first
- cut_from_item. 1 is for update_sd */
error = journal_begin(&th, inode->i_sb,
JOURNAL_PER_BALANCE_CNT * 2 + 1);
if (error)
goto out;
reiserfs_update_inode_transaction(inode);
if (update_timestamps)
- /* we are doing real truncate: if the system crashes before the last
- transaction of truncating gets committed - on reboot the file
- either appears truncated properly or not truncated at all */
+ /*
+ * we are doing real truncate: if the system crashes
+ * before the last transaction of truncating gets committed
+ * - on reboot the file either appears truncated properly
+ * or not truncated at all
+ */
add_save_link(&th, inode, 1);
err2 = reiserfs_do_truncate(&th, inode, page, update_timestamps);
- error =
- journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
+ error = journal_end(&th);
if (error)
goto out;
@@ -2180,7 +2350,7 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
reiserfs_write_unlock(inode->i_sb);
return 0;
- out:
+out:
if (page) {
unlock_page(page);
page_cache_release(page);
@@ -2212,7 +2382,10 @@ static int map_block_for_writepage(struct inode *inode,
int copy_size;
int trans_running = 0;
- /* catch places below that try to log something without starting a trans */
+ /*
+ * catch places below that try to log something without
+ * starting a trans
+ */
th.t_trans_id = 0;
if (!buffer_uptodate(bh_result)) {
@@ -2220,11 +2393,11 @@ static int map_block_for_writepage(struct inode *inode,
}
kmap(bh_result->b_page);
- start_over:
+start_over:
reiserfs_write_lock(inode->i_sb);
make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3);
- research:
+research:
retval = search_for_position_by_key(inode->i_sb, &key, &path);
if (retval != POSITION_FOUND) {
use_get_block = 1;
@@ -2232,8 +2405,8 @@ static int map_block_for_writepage(struct inode *inode,
}
bh = get_last_bh(&path);
- ih = get_ih(&path);
- item = get_item(&path);
+ ih = tp_item_head(&path);
+ item = tp_item_body(&path);
pos_in_item = path.pos_in_item;
/* we've found an unformatted node */
@@ -2281,10 +2454,10 @@ static int map_block_for_writepage(struct inode *inode,
goto research;
}
- memcpy(B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied,
+ memcpy(ih_item_body(bh, ih) + pos_in_item, p + bytes_copied,
copy_size);
- journal_mark_dirty(&th, inode->i_sb, bh);
+ journal_mark_dirty(&th, bh);
bytes_copied += copy_size;
set_block_dev_mapped(bh_result, 0, inode);
@@ -2304,10 +2477,10 @@ static int map_block_for_writepage(struct inode *inode,
}
retval = 0;
- out:
+out:
pathrelse(&path);
if (trans_running) {
- int err = journal_end(&th, inode->i_sb, jbegin_count);
+ int err = journal_end(&th);
if (err)
retval = err;
trans_running = 0;
@@ -2331,7 +2504,8 @@ static int map_block_for_writepage(struct inode *inode,
kunmap(bh_result->b_page);
if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
- /* we've copied data from the page into the direct item, so the
+ /*
+ * we've copied data from the page into the direct item, so the
* buffer in the page is now clean, mark it to reflect that.
*/
lock_buffer(bh_result);
@@ -2370,7 +2544,8 @@ static int reiserfs_write_full_page(struct page *page,
return 0;
}
- /* The page dirty bit is cleared before writepage is called, which
+ /*
+ * The page dirty bit is cleared before writepage is called, which
* means we have to tell create_empty_buffers to make dirty buffers
* The page really should be up to date at this point, so tossing
* in the BH_Uptodate is just a sanity check.
@@ -2381,8 +2556,9 @@ static int reiserfs_write_full_page(struct page *page,
}
head = page_buffers(page);
- /* last page in the file, zero out any contents past the
- ** last byte in the file
+ /*
+ * last page in the file, zero out any contents past the
+ * last byte in the file
*/
if (page->index >= end_index) {
unsigned last_offset;
@@ -2412,7 +2588,8 @@ static int reiserfs_write_full_page(struct page *page,
(!buffer_mapped(bh) || (buffer_mapped(bh)
&& bh->b_blocknr ==
0))) {
- /* not mapped yet, or it points to a direct item, search
+ /*
+ * not mapped yet, or it points to a direct item, search
* the btree for the mapping info, and log any direct
* items found
*/
@@ -2450,10 +2627,11 @@ static int reiserfs_write_full_page(struct page *page,
if (checked) {
reiserfs_prepare_for_journal(s, bh, 1);
- journal_mark_dirty(&th, s, bh);
+ journal_mark_dirty(&th, bh);
continue;
}
- /* from this point on, we know the buffer is mapped to a
+ /*
+ * from this point on, we know the buffer is mapped to a
* real block and not a direct item
*/
if (wbc->sync_mode != WB_SYNC_NONE) {
@@ -2472,7 +2650,7 @@ static int reiserfs_write_full_page(struct page *page,
} while ((bh = bh->b_this_page) != head);
if (checked) {
- error = journal_end(&th, s, bh_per_page + 1);
+ error = journal_end(&th);
reiserfs_write_unlock(s);
if (error)
goto fail;
@@ -2497,7 +2675,7 @@ static int reiserfs_write_full_page(struct page *page,
} while (bh != head);
error = 0;
- done:
+done:
if (nr == 0) {
/*
* if this page only had a direct item, it is very possible for
@@ -2519,8 +2697,9 @@ static int reiserfs_write_full_page(struct page *page,
}
return error;
- fail:
- /* catches various errors, we need to make sure any valid dirty blocks
+fail:
+ /*
+ * catches various errors, we need to make sure any valid dirty blocks
* get to the media. The page is currently locked and not marked for
* writeback
*/
@@ -2533,8 +2712,8 @@ static int reiserfs_write_full_page(struct page *page,
mark_buffer_async_write(bh);
} else {
/*
- * clear any dirty bits that might have come from getting
- * attached to a dirty page
+ * clear any dirty bits that might have come from
+ * getting attached to a dirty page
*/
clear_buffer_dirty(bh);
}
@@ -2614,15 +2793,18 @@ static int reiserfs_write_begin(struct file *file,
ret = __block_write_begin(page, pos, len, reiserfs_get_block);
if (ret && reiserfs_transaction_running(inode->i_sb)) {
struct reiserfs_transaction_handle *th = current->journal_info;
- /* this gets a little ugly. If reiserfs_get_block returned an
- * error and left a transacstion running, we've got to close it,
- * and we've got to free handle if it was a persistent transaction.
+ /*
+ * this gets a little ugly. If reiserfs_get_block returned an
+ * error and left a transacstion running, we've got to close
+ * it, and we've got to free handle if it was a persistent
+ * transaction.
*
* But, if we had nested into an existing transaction, we need
* to just drop the ref count on the handle.
*
* If old_ref == 0, the transaction is from reiserfs_get_block,
- * and it was a persistent trans. Otherwise, it was nested above.
+ * and it was a persistent trans. Otherwise, it was nested
+ * above.
*/
if (th->t_refcount > old_ref) {
if (old_ref)
@@ -2671,15 +2853,18 @@ int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len)
ret = __block_write_begin(page, from, len, reiserfs_get_block);
if (ret && reiserfs_transaction_running(inode->i_sb)) {
struct reiserfs_transaction_handle *th = current->journal_info;
- /* this gets a little ugly. If reiserfs_get_block returned an
- * error and left a transacstion running, we've got to close it,
- * and we've got to free handle if it was a persistent transaction.
+ /*
+ * this gets a little ugly. If reiserfs_get_block returned an
+ * error and left a transacstion running, we've got to close
+ * it, and we've got to free handle if it was a persistent
+ * transaction.
*
* But, if we had nested into an existing transaction, we need
* to just drop the ref count on the handle.
*
* If old_ref == 0, the transaction is from reiserfs_get_block,
- * and it was a persistent trans. Otherwise, it was nested above.
+ * and it was a persistent trans. Otherwise, it was nested
+ * above.
*/
if (th->t_refcount > old_ref) {
if (old_ref)
@@ -2734,17 +2919,20 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
reiserfs_commit_page(inode, page, start, start + copied);
- /* generic_commit_write does this for us, but does not update the
- ** transaction tracking stuff when the size changes. So, we have
- ** to do the i_size updates here.
+ /*
+ * generic_commit_write does this for us, but does not update the
+ * transaction tracking stuff when the size changes. So, we have
+ * to do the i_size updates here.
*/
if (pos + copied > inode->i_size) {
struct reiserfs_transaction_handle myth;
reiserfs_write_lock(inode->i_sb);
locked = true;
- /* If the file have grown beyond the border where it
- can have a tail, unmark it as needing a tail
- packing */
+ /*
+ * If the file have grown beyond the border where it
+ * can have a tail, unmark it as needing a tail
+ * packing
+ */
if ((have_large_tails(inode->i_sb)
&& inode->i_size > i_block_size(inode) * 4)
|| (have_small_tails(inode->i_sb)
@@ -2759,13 +2947,13 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
inode->i_size = pos + copied;
/*
* this will just nest into our transaction. It's important
- * to use mark_inode_dirty so the inode gets pushed around on the
- * dirty lists, and so that O_SYNC works as expected
+ * to use mark_inode_dirty so the inode gets pushed around on
+ * the dirty lists, and so that O_SYNC works as expected
*/
mark_inode_dirty(inode);
reiserfs_update_sd(&myth, inode);
update_sd = 1;
- ret = journal_end(&myth, inode->i_sb, 1);
+ ret = journal_end(&myth);
if (ret)
goto journal_error;
}
@@ -2781,7 +2969,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
goto out;
}
- out:
+out:
if (locked)
reiserfs_write_unlock(inode->i_sb);
unlock_page(page);
@@ -2792,7 +2980,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
return ret == 0 ? copied : ret;
- journal_error:
+journal_error:
reiserfs_write_unlock(inode->i_sb);
locked = false;
if (th) {
@@ -2822,15 +3010,18 @@ int reiserfs_commit_write(struct file *f, struct page *page,
}
reiserfs_commit_page(inode, page, from, to);
- /* generic_commit_write does this for us, but does not update the
- ** transaction tracking stuff when the size changes. So, we have
- ** to do the i_size updates here.
+ /*
+ * generic_commit_write does this for us, but does not update the
+ * transaction tracking stuff when the size changes. So, we have
+ * to do the i_size updates here.
*/
if (pos > inode->i_size) {
struct reiserfs_transaction_handle myth;
- /* If the file have grown beyond the border where it
- can have a tail, unmark it as needing a tail
- packing */
+ /*
+ * If the file have grown beyond the border where it
+ * can have a tail, unmark it as needing a tail
+ * packing
+ */
if ((have_large_tails(inode->i_sb)
&& inode->i_size > i_block_size(inode) * 4)
|| (have_small_tails(inode->i_sb)
@@ -2845,13 +3036,13 @@ int reiserfs_commit_write(struct file *f, struct page *page,
inode->i_size = pos;
/*
* this will just nest into our transaction. It's important
- * to use mark_inode_dirty so the inode gets pushed around on the
- * dirty lists, and so that O_SYNC works as expected
+ * to use mark_inode_dirty so the inode gets pushed around
+ * on the dirty lists, and so that O_SYNC works as expected
*/
mark_inode_dirty(inode);
reiserfs_update_sd(&myth, inode);
update_sd = 1;
- ret = journal_end(&myth, inode->i_sb, 1);
+ ret = journal_end(&myth);
if (ret)
goto journal_error;
}
@@ -2863,10 +3054,10 @@ int reiserfs_commit_write(struct file *f, struct page *page,
goto out;
}
- out:
+out:
return ret;
- journal_error:
+journal_error:
if (th) {
if (!update_sd)
reiserfs_update_sd(th, inode);
@@ -2924,9 +3115,10 @@ void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs)
}
}
-/* decide if this buffer needs to stay around for data logging or ordered
-** write purposes
-*/
+/*
+ * decide if this buffer needs to stay around for data logging or ordered
+ * write purposes
+ */
static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
{
int ret = 1;
@@ -2937,7 +3129,8 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
if (!buffer_mapped(bh)) {
goto free_jh;
}
- /* the page is locked, and the only places that log a data buffer
+ /*
+ * the page is locked, and the only places that log a data buffer
* also lock the page.
*/
if (reiserfs_file_data_log(inode)) {
@@ -2952,7 +3145,8 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
struct reiserfs_journal_list *jl;
struct reiserfs_jh *jh = bh->b_private;
- /* why is this safe?
+ /*
+ * why is this safe?
* reiserfs_setattr updates i_size in the on disk
* stat data before allowing vmtruncate to be called.
*
@@ -2969,7 +3163,7 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
&& jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
ret = 0;
}
- free_jh:
+free_jh:
if (ret && bh->b_private) {
reiserfs_free_jh(bh);
}
@@ -3028,7 +3222,7 @@ static void reiserfs_invalidatepage(struct page *page, unsigned int offset,
ret = try_to_release_page(page, 0);
/* maybe should BUG_ON(!ret); - neilb */
}
- out:
+out:
return;
}
@@ -3080,18 +3274,20 @@ static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
return ret;
}
-/* We thank Mingming Cao for helping us understand in great detail what
- to do in this section of the code. */
+/*
+ * We thank Mingming Cao for helping us understand in great detail what
+ * to do in this section of the code.
+ */
static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov, loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter, loff_t offset)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
+ size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
- reiserfs_get_blocks_direct_io);
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
+ reiserfs_get_blocks_direct_io);
/*
* In case of error extending write may have instantiated a few
@@ -3099,7 +3295,7 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
*/
if (unlikely((rw & WRITE) && ret < 0)) {
loff_t isize = i_size_read(inode);
- loff_t end = offset + iov_length(iov, nr_segs);
+ loff_t end = offset + count;
if ((end > isize) && inode_newsize_ok(inode, isize) == 0) {
truncate_setsize(inode, isize);
@@ -3127,8 +3323,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
dquot_initialize(inode);
reiserfs_write_lock(inode->i_sb);
if (attr->ia_valid & ATTR_SIZE) {
- /* version 2 items will be caught by the s_maxbytes check
- ** done for us in vmtruncate
+ /*
+ * version 2 items will be caught by the s_maxbytes check
+ * done for us in vmtruncate
*/
if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
attr->ia_size > MAX_NON_LFS) {
@@ -3149,7 +3346,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
err = journal_begin(&th, inode->i_sb, 4);
if (!err) {
reiserfs_discard_prealloc(&th, inode);
- err = journal_end(&th, inode->i_sb, 4);
+ err = journal_end(&th);
}
if (err)
error = err;
@@ -3189,7 +3386,10 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
- /* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */
+ /*
+ * (user+group)*(old+new) structure - we count quota
+ * info and , inode write (sb, inode)
+ */
reiserfs_write_lock(inode->i_sb);
error = journal_begin(&th, inode->i_sb, jbegin_count);
reiserfs_write_unlock(inode->i_sb);
@@ -3198,19 +3398,21 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
error = dquot_transfer(inode, attr);
reiserfs_write_lock(inode->i_sb);
if (error) {
- journal_end(&th, inode->i_sb, jbegin_count);
+ journal_end(&th);
reiserfs_write_unlock(inode->i_sb);
goto out;
}
- /* Update corresponding info in inode so that everything is in
- * one transaction */
+ /*
+ * Update corresponding info in inode so that everything
+ * is in one transaction
+ */
if (attr->ia_valid & ATTR_UID)
inode->i_uid = attr->ia_uid;
if (attr->ia_valid & ATTR_GID)
inode->i_gid = attr->ia_gid;
mark_inode_dirty(inode);
- error = journal_end(&th, inode->i_sb, jbegin_count);
+ error = journal_end(&th);
reiserfs_write_unlock(inode->i_sb);
if (error)
goto out;
@@ -3220,8 +3422,14 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_size != i_size_read(inode)) {
error = inode_newsize_ok(inode, attr->ia_size);
if (!error) {
+ /*
+ * Could race against reiserfs_file_release
+ * if called from NFS, so take tailpack mutex.
+ */
+ mutex_lock(&REISERFS_I(inode)->tailpack);
truncate_setsize(inode, attr->ia_size);
- reiserfs_vfs_truncate_file(inode);
+ reiserfs_truncate_file(inode, 1);
+ mutex_unlock(&REISERFS_I(inode)->tailpack);
}
}
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 946ccbf5b5a..501ed6811a2 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -15,7 +15,8 @@
* reiserfs_ioctl - handler for ioctl for inode
* supported commands:
* 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
- * and prevent packing file (argument arg has to be non-zero)
+ * and prevent packing file (argument arg has t
+ * be non-zero)
* 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
* 3) That's all for a while ...
*/
@@ -132,7 +133,10 @@ setversion_out:
long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
- /* These are just misnamed, they actually get/put from/to user an int */
+ /*
+ * These are just misnamed, they actually
+ * get/put from/to user an int
+ */
switch (cmd) {
case REISERFS_IOC32_UNPACK:
cmd = REISERFS_IOC_UNPACK;
@@ -160,10 +164,10 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
int reiserfs_commit_write(struct file *f, struct page *page,
unsigned from, unsigned to);
/*
-** reiserfs_unpack
-** Function try to convert tail from direct item into indirect.
-** It set up nopack attribute in the REISERFS_I(inode)->nopack
-*/
+ * reiserfs_unpack
+ * Function try to convert tail from direct item into indirect.
+ * It set up nopack attribute in the REISERFS_I(inode)->nopack
+ */
int reiserfs_unpack(struct inode *inode, struct file *filp)
{
int retval = 0;
@@ -194,9 +198,10 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
goto out;
}
- /* we unpack by finding the page with the tail, and calling
- ** __reiserfs_write_begin on that page. This will force a
- ** reiserfs_get_block to unpack the tail for us.
+ /*
+ * we unpack by finding the page with the tail, and calling
+ * __reiserfs_write_begin on that page. This will force a
+ * reiserfs_get_block to unpack the tail for us.
*/
index = inode->i_size >> PAGE_CACHE_SHIFT;
mapping = inode->i_mapping;
@@ -214,11 +219,11 @@ int reiserfs_unpack(struct inode *inode, struct file *filp)
retval = reiserfs_commit_write(NULL, page, write_from, write_from);
REISERFS_I(inode)->i_flags |= i_nopack_mask;
- out_unlock:
+out_unlock:
unlock_page(page);
page_cache_release(page);
- out:
+out:
mutex_unlock(&inode->i_mutex);
reiserfs_write_unlock(inode->i_sb);
return retval;
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
index ee382ef3d30..cfaee912ee0 100644
--- a/fs/reiserfs/item_ops.c
+++ b/fs/reiserfs/item_ops.c
@@ -5,15 +5,17 @@
#include <linux/time.h>
#include "reiserfs.h"
-// this contains item handlers for old item types: sd, direct,
-// indirect, directory
+/*
+ * this contains item handlers for old item types: sd, direct,
+ * indirect, directory
+ */
-/* and where are the comments? how about saying where we can find an
- explanation of each item handler method? -Hans */
+/*
+ * and where are the comments? how about saying where we can find an
+ * explanation of each item handler method? -Hans
+ */
-//////////////////////////////////////////////////////////////////////////////
-// stat data functions
-//
+/* stat data functions */
static int sd_bytes_number(struct item_head *ih, int block_size)
{
return 0;
@@ -60,7 +62,7 @@ static void sd_print_item(struct item_head *ih, char *item)
static void sd_check_item(struct item_head *ih, char *item)
{
- // FIXME: type something here!
+ /* unused */
}
static int sd_create_vi(struct virtual_node *vn,
@@ -68,7 +70,6 @@ static int sd_create_vi(struct virtual_node *vn,
int is_affected, int insert_size)
{
vi->vi_index = TYPE_STAT_DATA;
- //vi->vi_type |= VI_TYPE_STAT_DATA;// not needed?
return 0;
}
@@ -117,15 +118,13 @@ static struct item_operations stat_data_ops = {
.print_vi = sd_print_vi
};
-//////////////////////////////////////////////////////////////////////////////
-// direct item functions
-//
+/* direct item functions */
static int direct_bytes_number(struct item_head *ih, int block_size)
{
return ih_item_len(ih);
}
-// FIXME: this should probably switch to indirect as well
+/* FIXME: this should probably switch to indirect as well */
static void direct_decrement_key(struct cpu_key *key)
{
cpu_key_k_offset_dec(key);
@@ -144,7 +143,7 @@ static void direct_print_item(struct item_head *ih, char *item)
{
int j = 0;
-// return;
+/* return; */
printk("\"");
while (j < ih_item_len(ih))
printk("%c", item[j++]);
@@ -153,7 +152,7 @@ static void direct_print_item(struct item_head *ih, char *item)
static void direct_check_item(struct item_head *ih, char *item)
{
- // FIXME: type something here!
+ /* unused */
}
static int direct_create_vi(struct virtual_node *vn,
@@ -161,7 +160,6 @@ static int direct_create_vi(struct virtual_node *vn,
int is_affected, int insert_size)
{
vi->vi_index = TYPE_DIRECT;
- //vi->vi_type |= VI_TYPE_DIRECT;
return 0;
}
@@ -211,16 +209,13 @@ static struct item_operations direct_ops = {
.print_vi = direct_print_vi
};
-//////////////////////////////////////////////////////////////////////////////
-// indirect item functions
-//
-
+/* indirect item functions */
static int indirect_bytes_number(struct item_head *ih, int block_size)
{
- return ih_item_len(ih) / UNFM_P_SIZE * block_size; //- get_ih_free_space (ih);
+ return ih_item_len(ih) / UNFM_P_SIZE * block_size;
}
-// decrease offset, if it becomes 0, change type to stat data
+/* decrease offset, if it becomes 0, change type to stat data */
static void indirect_decrement_key(struct cpu_key *key)
{
cpu_key_k_offset_dec(key);
@@ -228,7 +223,7 @@ static void indirect_decrement_key(struct cpu_key *key)
set_cpu_key_k_type(key, TYPE_STAT_DATA);
}
-// if it is not first item of the body, then it is mergeable
+/* if it is not first item of the body, then it is mergeable */
static int indirect_is_left_mergeable(struct reiserfs_key *key,
unsigned long bsize)
{
@@ -236,7 +231,7 @@ static int indirect_is_left_mergeable(struct reiserfs_key *key,
return (le_key_k_offset(version, key) != 1);
}
-// printing of indirect item
+/* printing of indirect item */
static void start_new_sequence(__u32 * start, int *len, __u32 new)
{
*start = new;
@@ -295,7 +290,7 @@ static void indirect_print_item(struct item_head *ih, char *item)
static void indirect_check_item(struct item_head *ih, char *item)
{
- // FIXME: type something here!
+ /* unused */
}
static int indirect_create_vi(struct virtual_node *vn,
@@ -303,7 +298,6 @@ static int indirect_create_vi(struct virtual_node *vn,
int is_affected, int insert_size)
{
vi->vi_index = TYPE_INDIRECT;
- //vi->vi_type |= VI_TYPE_INDIRECT;
return 0;
}
@@ -321,16 +315,19 @@ static int indirect_check_right(struct virtual_item *vi, int free)
return indirect_check_left(vi, free, 0, 0);
}
-// return size in bytes of 'units' units. If first == 0 - calculate from the head (left), otherwise - from tail (right)
+/*
+ * return size in bytes of 'units' units. If first == 0 - calculate
+ * from the head (left), otherwise - from tail (right)
+ */
static int indirect_part_size(struct virtual_item *vi, int first, int units)
{
- // unit of indirect item is byte (yet)
+ /* unit of indirect item is byte (yet) */
return units;
}
static int indirect_unit_num(struct virtual_item *vi)
{
- // unit of indirect item is byte (yet)
+ /* unit of indirect item is byte (yet) */
return vi->vi_item_len - IH_SIZE;
}
@@ -356,10 +353,7 @@ static struct item_operations indirect_ops = {
.print_vi = indirect_print_vi
};
-//////////////////////////////////////////////////////////////////////////////
-// direntry functions
-//
-
+/* direntry functions */
static int direntry_bytes_number(struct item_head *ih, int block_size)
{
reiserfs_warning(NULL, "vs-16090",
@@ -396,7 +390,7 @@ static void direntry_print_item(struct item_head *ih, char *item)
deh = (struct reiserfs_de_head *)item;
- for (i = 0; i < I_ENTRY_COUNT(ih); i++, deh++) {
+ for (i = 0; i < ih_entry_count(ih); i++, deh++) {
namelen =
(i ? (deh_location(deh - 1)) : ih_item_len(ih)) -
deh_location(deh);
@@ -428,9 +422,9 @@ static void direntry_check_item(struct item_head *ih, char *item)
int i;
struct reiserfs_de_head *deh;
- // FIXME: type something here!
+ /* unused */
deh = (struct reiserfs_de_head *)item;
- for (i = 0; i < I_ENTRY_COUNT(ih); i++, deh++) {
+ for (i = 0; i < ih_entry_count(ih); i++, deh++) {
;
}
}
@@ -439,7 +433,8 @@ static void direntry_check_item(struct item_head *ih, char *item)
/*
* function returns old entry number in directory item in real node
- * using new entry number in virtual item in virtual node */
+ * using new entry number in virtual item in virtual node
+ */
static inline int old_entry_num(int is_affected, int virtual_entry_num,
int pos_in_item, int mode)
{
@@ -463,9 +458,11 @@ static inline int old_entry_num(int is_affected, int virtual_entry_num,
return virtual_entry_num - 1;
}
-/* Create an array of sizes of directory entries for virtual
- item. Return space used by an item. FIXME: no control over
- consuming of space used by this item handler */
+/*
+ * Create an array of sizes of directory entries for virtual
+ * item. Return space used by an item. FIXME: no control over
+ * consuming of space used by this item handler
+ */
static int direntry_create_vi(struct virtual_node *vn,
struct virtual_item *vi,
int is_affected, int insert_size)
@@ -494,8 +491,8 @@ static int direntry_create_vi(struct virtual_node *vn,
j = old_entry_num(is_affected, i, vn->vn_pos_in_item,
vn->vn_mode);
dir_u->entry_sizes[i] =
- (j ? deh_location(&(deh[j - 1])) : ih_item_len(vi->vi_ih)) -
- deh_location(&(deh[j])) + DEH_SIZE;
+ (j ? deh_location(&deh[j - 1]) : ih_item_len(vi->vi_ih)) -
+ deh_location(&deh[j]) + DEH_SIZE;
}
size += (dir_u->entry_count * sizeof(short));
@@ -529,10 +526,10 @@ static int direntry_create_vi(struct virtual_node *vn,
}
-//
-// return number of entries which may fit into specified amount of
-// free space, or -1 if free space is not enough even for 1 entry
-//
+/*
+ * return number of entries which may fit into specified amount of
+ * free space, or -1 if free space is not enough even for 1 entry
+ */
static int direntry_check_left(struct virtual_item *vi, int free,
int start_skip, int end_skip)
{
@@ -541,8 +538,8 @@ static int direntry_check_left(struct virtual_item *vi, int free,
struct direntry_uarea *dir_u = vi->vi_uarea;
for (i = start_skip; i < dir_u->entry_count - end_skip; i++) {
+ /* i-th entry doesn't fit into the remaining free space */
if (dir_u->entry_sizes[i] > free)
- /* i-th entry doesn't fit into the remaining free space */
break;
free -= dir_u->entry_sizes[i];
@@ -570,8 +567,8 @@ static int direntry_check_right(struct virtual_item *vi, int free)
struct direntry_uarea *dir_u = vi->vi_uarea;
for (i = dir_u->entry_count - 1; i >= 0; i--) {
+ /* i-th entry doesn't fit into the remaining free space */
if (dir_u->entry_sizes[i] > free)
- /* i-th entry doesn't fit into the remaining free space */
break;
free -= dir_u->entry_sizes[i];
@@ -643,9 +640,7 @@ static struct item_operations direntry_ops = {
.print_vi = direntry_print_vi
};
-//////////////////////////////////////////////////////////////////////////////
-// Error catching functions to catch errors caused by incorrect item types.
-//
+/* Error catching functions to catch errors caused by incorrect item types. */
static int errcatch_bytes_number(struct item_head *ih, int block_size)
{
reiserfs_warning(NULL, "green-16001",
@@ -685,8 +680,12 @@ static int errcatch_create_vi(struct virtual_node *vn,
{
reiserfs_warning(NULL, "green-16006",
"Invalid item type observed, run fsck ASAP");
- return 0; // We might return -1 here as well, but it won't help as create_virtual_node() from where
- // this operation is called from is of return type void.
+ /*
+ * We might return -1 here as well, but it won't help as
+ * create_virtual_node() from where this operation is called
+ * from is of return type void.
+ */
+ return 0;
}
static int errcatch_check_left(struct virtual_item *vi, int free,
@@ -739,9 +738,6 @@ static struct item_operations errcatch_ops = {
errcatch_print_vi
};
-//////////////////////////////////////////////////////////////////////////////
-//
-//
#if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3)
#error Item types must use disk-format assigned values.
#endif
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index fd777032c2b..e8870de4627 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1,38 +1,38 @@
/*
-** Write ahead logging implementation copyright Chris Mason 2000
-**
-** The background commits make this code very interrelated, and
-** overly complex. I need to rethink things a bit....The major players:
-**
-** journal_begin -- call with the number of blocks you expect to log.
-** If the current transaction is too
-** old, it will block until the current transaction is
-** finished, and then start a new one.
-** Usually, your transaction will get joined in with
-** previous ones for speed.
-**
-** journal_join -- same as journal_begin, but won't block on the current
-** transaction regardless of age. Don't ever call
-** this. Ever. There are only two places it should be
-** called from, and they are both inside this file.
-**
-** journal_mark_dirty -- adds blocks into this transaction. clears any flags
-** that might make them get sent to disk
-** and then marks them BH_JDirty. Puts the buffer head
-** into the current transaction hash.
-**
-** journal_end -- if the current transaction is batchable, it does nothing
-** otherwise, it could do an async/synchronous commit, or
-** a full flush of all log and real blocks in the
-** transaction.
-**
-** flush_old_commits -- if the current transaction is too old, it is ended and
-** commit blocks are sent to disk. Forces commit blocks
-** to disk for all backgrounded commits that have been
-** around too long.
-** -- Note, if you call this as an immediate flush from
-** from within kupdate, it will ignore the immediate flag
-*/
+ * Write ahead logging implementation copyright Chris Mason 2000
+ *
+ * The background commits make this code very interrelated, and
+ * overly complex. I need to rethink things a bit....The major players:
+ *
+ * journal_begin -- call with the number of blocks you expect to log.
+ * If the current transaction is too
+ * old, it will block until the current transaction is
+ * finished, and then start a new one.
+ * Usually, your transaction will get joined in with
+ * previous ones for speed.
+ *
+ * journal_join -- same as journal_begin, but won't block on the current
+ * transaction regardless of age. Don't ever call
+ * this. Ever. There are only two places it should be
+ * called from, and they are both inside this file.
+ *
+ * journal_mark_dirty -- adds blocks into this transaction. clears any flags
+ * that might make them get sent to disk
+ * and then marks them BH_JDirty. Puts the buffer head
+ * into the current transaction hash.
+ *
+ * journal_end -- if the current transaction is batchable, it does nothing
+ * otherwise, it could do an async/synchronous commit, or
+ * a full flush of all log and real blocks in the
+ * transaction.
+ *
+ * flush_old_commits -- if the current transaction is too old, it is ended and
+ * commit blocks are sent to disk. Forces commit blocks
+ * to disk for all backgrounded commits that have been
+ * around too long.
+ * -- Note, if you call this as an immediate flush from
+ * from within kupdate, it will ignore the immediate flag
+ */
#include <linux/time.h>
#include <linux/semaphore.h>
@@ -58,23 +58,19 @@
#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
j_working_list))
-/* the number of mounted filesystems. This is used to decide when to
-** start and kill the commit workqueue
-*/
-static int reiserfs_mounted_fs_count;
-
-static struct workqueue_struct *commit_wq;
-
-#define JOURNAL_TRANS_HALF 1018 /* must be correct to keep the desc and commit
- structs at 4k */
+/* must be correct to keep the desc and commit structs at 4k */
+#define JOURNAL_TRANS_HALF 1018
#define BUFNR 64 /*read ahead */
/* cnode stat bits. Move these into reiserfs_fs.h */
-#define BLOCK_FREED 2 /* this block was freed, and can't be written. */
-#define BLOCK_FREED_HOLDER 3 /* this block was freed during this transaction, and can't be written */
+/* this block was freed, and can't be written. */
+#define BLOCK_FREED 2
+/* this block was freed during this transaction, and can't be written */
+#define BLOCK_FREED_HOLDER 3
-#define BLOCK_NEEDS_FLUSH 4 /* used in flush_journal_list */
+/* used in flush_journal_list */
+#define BLOCK_NEEDS_FLUSH 4
#define BLOCK_DIRTIED 5
/* journal list state bits */
@@ -87,16 +83,14 @@ static struct workqueue_struct *commit_wq;
#define COMMIT_NOW 2 /* end and commit this transaction */
#define WAIT 4 /* wait for the log blocks to hit the disk */
-static int do_journal_end(struct reiserfs_transaction_handle *,
- struct super_block *, unsigned long nblocks,
- int flags);
+static int do_journal_end(struct reiserfs_transaction_handle *, int flags);
static int flush_journal_list(struct super_block *s,
struct reiserfs_journal_list *jl, int flushall);
static int flush_commit_list(struct super_block *s,
struct reiserfs_journal_list *jl, int flushall);
static int can_dirty(struct reiserfs_journal_cnode *cn);
static int journal_join(struct reiserfs_transaction_handle *th,
- struct super_block *sb, unsigned long nblocks);
+ struct super_block *sb);
static void release_journal_dev(struct super_block *super,
struct reiserfs_journal *journal);
static int dirty_one_transaction(struct super_block *s,
@@ -107,8 +101,10 @@ static void queue_log_writer(struct super_block *s);
/* values for join in do_journal_begin_r */
enum {
JBEGIN_REG = 0, /* regular journal begin */
- JBEGIN_JOIN = 1, /* join the running transaction if at all possible */
- JBEGIN_ABORT = 2, /* called from cleanup code, ignores aborted flag */
+ /* join the running transaction if at all possible */
+ JBEGIN_JOIN = 1,
+ /* called from cleanup code, ignores aborted flag */
+ JBEGIN_ABORT = 2,
};
static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
@@ -123,10 +119,11 @@ static void init_journal_hash(struct super_block *sb)
}
/*
-** clears BH_Dirty and sticks the buffer on the clean list. Called because I can't allow refile_buffer to
-** make schedule happen after I've freed a block. Look at remove_from_transaction and journal_mark_freed for
-** more details.
-*/
+ * clears BH_Dirty and sticks the buffer on the clean list. Called because
+ * I can't allow refile_buffer to make schedule happen after I've freed a
+ * block. Look at remove_from_transaction and journal_mark_freed for
+ * more details.
+ */
static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
{
if (bh) {
@@ -163,7 +160,7 @@ static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *sb)
struct list_head *entry = journal->j_bitmap_nodes.next;
journal->j_used_bitmap_nodes++;
- repeat:
+repeat:
if (entry != &journal->j_bitmap_nodes) {
bn = list_entry(entry, struct reiserfs_bitmap_node, list);
@@ -204,7 +201,8 @@ static void allocate_bitmap_nodes(struct super_block *sb)
list_add(&bn->list, &journal->j_bitmap_nodes);
journal->j_free_bitmap_nodes++;
} else {
- break; /* this is ok, we'll try again when more are needed */
+ /* this is ok, we'll try again when more are needed */
+ break;
}
}
}
@@ -239,8 +237,8 @@ static void cleanup_bitmap_list(struct super_block *sb,
}
/*
-** only call this on FS unmount.
-*/
+ * only call this on FS unmount.
+ */
static int free_list_bitmaps(struct super_block *sb,
struct reiserfs_list_bitmap *jb_array)
{
@@ -275,9 +273,9 @@ static int free_bitmap_nodes(struct super_block *sb)
}
/*
-** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps.
-** jb_array is the array to be filled in.
-*/
+ * get memory for JOURNAL_NUM_BITMAPS worth of bitmaps.
+ * jb_array is the array to be filled in.
+ */
int reiserfs_allocate_list_bitmaps(struct super_block *sb,
struct reiserfs_list_bitmap *jb_array,
unsigned int bmap_nr)
@@ -306,9 +304,9 @@ int reiserfs_allocate_list_bitmaps(struct super_block *sb,
}
/*
-** find an available list bitmap. If you can't find one, flush a commit list
-** and try again
-*/
+ * find an available list bitmap. If you can't find one, flush a commit list
+ * and try again
+ */
static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb,
struct reiserfs_journal_list
*jl)
@@ -332,18 +330,18 @@ static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *sb,
break;
}
}
- if (jb->journal_list) { /* double check to make sure if flushed correctly */
+ /* double check to make sure if flushed correctly */
+ if (jb->journal_list)
return NULL;
- }
jb->journal_list = jl;
return jb;
}
/*
-** allocates a new chunk of X nodes, and links them all together as a list.
-** Uses the cnode->next and cnode->prev pointers
-** returns NULL on failure
-*/
+ * allocates a new chunk of X nodes, and links them all together as a list.
+ * Uses the cnode->next and cnode->prev pointers
+ * returns NULL on failure
+ */
static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes)
{
struct reiserfs_journal_cnode *head;
@@ -365,9 +363,7 @@ static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes)
return head;
}
-/*
-** pulls a cnode off the free list, or returns NULL on failure
-*/
+/* pulls a cnode off the free list, or returns NULL on failure */
static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb)
{
struct reiserfs_journal_cnode *cn;
@@ -393,8 +389,8 @@ static struct reiserfs_journal_cnode *get_cnode(struct super_block *sb)
}
/*
-** returns a cnode to the free list
-*/
+ * returns a cnode to the free list
+ */
static void free_cnode(struct super_block *sb,
struct reiserfs_journal_cnode *cn)
{
@@ -419,7 +415,10 @@ static void clear_prepared_bits(struct buffer_head *bh)
clear_buffer_journal_restore_dirty(bh);
}
-/* return a cnode with same dev, block number and size in table, or null if not found */
+/*
+ * return a cnode with same dev, block number and size in table,
+ * or null if not found
+ */
static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
super_block
*sb,
@@ -439,23 +438,24 @@ static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
}
/*
-** this actually means 'can this block be reallocated yet?'. If you set search_all, a block can only be allocated
-** if it is not in the current transaction, was not freed by the current transaction, and has no chance of ever
-** being overwritten by a replay after crashing.
-**
-** If you don't set search_all, a block can only be allocated if it is not in the current transaction. Since deleting
-** a block removes it from the current transaction, this case should never happen. If you don't set search_all, make
-** sure you never write the block without logging it.
-**
-** next_zero_bit is a suggestion about the next block to try for find_forward.
-** when bl is rejected because it is set in a journal list bitmap, we search
-** for the next zero bit in the bitmap that rejected bl. Then, we return that
-** through next_zero_bit for find_forward to try.
-**
-** Just because we return something in next_zero_bit does not mean we won't
-** reject it on the next call to reiserfs_in_journal
-**
-*/
+ * this actually means 'can this block be reallocated yet?'. If you set
+ * search_all, a block can only be allocated if it is not in the current
+ * transaction, was not freed by the current transaction, and has no chance
+ * of ever being overwritten by a replay after crashing.
+ *
+ * If you don't set search_all, a block can only be allocated if it is not
+ * in the current transaction. Since deleting a block removes it from the
+ * current transaction, this case should never happen. If you don't set
+ * search_all, make sure you never write the block without logging it.
+ *
+ * next_zero_bit is a suggestion about the next block to try for find_forward.
+ * when bl is rejected because it is set in a journal list bitmap, we search
+ * for the next zero bit in the bitmap that rejected bl. Then, we return
+ * that through next_zero_bit for find_forward to try.
+ *
+ * Just because we return something in next_zero_bit does not mean we won't
+ * reject it on the next call to reiserfs_in_journal
+ */
int reiserfs_in_journal(struct super_block *sb,
unsigned int bmap_nr, int bit_nr, int search_all,
b_blocknr_t * next_zero_bit)
@@ -469,9 +469,11 @@ int reiserfs_in_journal(struct super_block *sb,
*next_zero_bit = 0; /* always start this at zero. */
PROC_INFO_INC(sb, journal.in_journal);
- /* If we aren't doing a search_all, this is a metablock, and it will be logged before use.
- ** if we crash before the transaction that freed it commits, this transaction won't
- ** have committed either, and the block will never be written
+ /*
+ * If we aren't doing a search_all, this is a metablock, and it
+ * will be logged before use. if we crash before the transaction
+ * that freed it commits, this transaction won't have committed
+ * either, and the block will never be written
*/
if (search_all) {
for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
@@ -511,8 +513,7 @@ int reiserfs_in_journal(struct super_block *sb,
return 0;
}
-/* insert cn into table
-*/
+/* insert cn into table */
static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
struct reiserfs_journal_cnode *cn)
{
@@ -558,10 +559,10 @@ static inline void put_journal_list(struct super_block *s,
}
/*
-** this used to be much more involved, and I'm keeping it just in case things get ugly again.
-** it gets called by flush_commit_list, and cleans up any data stored about blocks freed during a
-** transaction.
-*/
+ * this used to be much more involved, and I'm keeping it just in case
+ * things get ugly again. it gets called by flush_commit_list, and
+ * cleans up any data stored about blocks freed during a transaction.
+ */
static void cleanup_freed_for_journal_list(struct super_block *sb,
struct reiserfs_journal_list *jl)
{
@@ -756,11 +757,12 @@ static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh,
jh = bh->b_private;
list_del_init(&jh->list);
} else {
- no_jh:
+no_jh:
get_bh(bh);
jh = alloc_jh();
spin_lock(&j->j_dirty_buffers_lock);
- /* buffer must be locked for __add_jh, should be able to have
+ /*
+ * buffer must be locked for __add_jh, should be able to have
* two adds at the same time
*/
BUG_ON(bh->b_private);
@@ -818,7 +820,8 @@ static int write_ordered_buffers(spinlock_t * lock,
spin_lock(lock);
goto loop_next;
}
- /* in theory, dirty non-uptodate buffers should never get here,
+ /*
+ * in theory, dirty non-uptodate buffers should never get here,
* but the upper layer io error paths still have a few quirks.
* Handle them here as gracefully as we can
*/
@@ -833,7 +836,7 @@ static int write_ordered_buffers(spinlock_t * lock,
reiserfs_free_jh(bh);
unlock_buffer(bh);
}
- loop_next:
+loop_next:
put_bh(bh);
cond_resched_lock(lock);
}
@@ -856,13 +859,14 @@ static int write_ordered_buffers(spinlock_t * lock,
if (!buffer_uptodate(bh)) {
ret = -EIO;
}
- /* ugly interaction with invalidatepage here.
- * reiserfs_invalidate_page will pin any buffer that has a valid
- * journal head from an older transaction. If someone else sets
- * our buffer dirty after we write it in the first loop, and
- * then someone truncates the page away, nobody will ever write
- * the buffer. We're safe if we write the page one last time
- * after freeing the journal header.
+ /*
+ * ugly interaction with invalidatepage here.
+ * reiserfs_invalidate_page will pin any buffer that has a
+ * valid journal head from an older transaction. If someone
+ * else sets our buffer dirty after we write it in the first
+ * loop, and then someone truncates the page away, nobody
+ * will ever write the buffer. We're safe if we write the
+ * page one last time after freeing the journal header.
*/
if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) {
spin_unlock(lock);
@@ -887,7 +891,7 @@ static int flush_older_commits(struct super_block *s,
unsigned int other_trans_id;
unsigned int first_trans_id;
- find_first:
+find_first:
/*
* first we walk backwards to find the oldest uncommitted transation
*/
@@ -923,9 +927,11 @@ static int flush_older_commits(struct super_block *s,
if (!journal_list_still_alive(s, trans_id))
return 1;
- /* the one we just flushed is gone, this means all
- * older lists are also gone, so first_jl is no longer
- * valid either. Go back to the beginning.
+ /*
+ * the one we just flushed is gone, this means
+ * all older lists are also gone, so first_jl
+ * is no longer valid either. Go back to the
+ * beginning.
*/
if (!journal_list_still_alive
(s, other_trans_id)) {
@@ -958,12 +964,12 @@ static int reiserfs_async_progress_wait(struct super_block *s)
}
/*
-** if this journal list still has commit blocks unflushed, send them to disk.
-**
-** log areas must be flushed in order (transaction 2 can't commit before transaction 1)
-** Before the commit block can by written, every other log block must be safely on disk
-**
-*/
+ * if this journal list still has commit blocks unflushed, send them to disk.
+ *
+ * log areas must be flushed in order (transaction 2 can't commit before
+ * transaction 1) Before the commit block can by written, every other log
+ * block must be safely on disk
+ */
static int flush_commit_list(struct super_block *s,
struct reiserfs_journal_list *jl, int flushall)
{
@@ -982,8 +988,9 @@ static int flush_commit_list(struct super_block *s,
return 0;
}
- /* before we can put our commit blocks on disk, we have to make sure everyone older than
- ** us is on disk too
+ /*
+ * before we can put our commit blocks on disk, we have to make
+ * sure everyone older than us is on disk too
*/
BUG_ON(jl->j_len <= 0);
BUG_ON(trans_id == journal->j_trans_id);
@@ -991,7 +998,10 @@ static int flush_commit_list(struct super_block *s,
get_journal_list(jl);
if (flushall) {
if (flush_older_commits(s, jl) == 1) {
- /* list disappeared during flush_older_commits. return */
+ /*
+ * list disappeared during flush_older_commits.
+ * return
+ */
goto put_jl;
}
}
@@ -1006,9 +1016,9 @@ static int flush_commit_list(struct super_block *s,
BUG_ON(jl->j_trans_id == 0);
/* this commit is done, exit */
- if (atomic_read(&(jl->j_commit_left)) <= 0) {
+ if (atomic_read(&jl->j_commit_left) <= 0) {
if (flushall) {
- atomic_set(&(jl->j_older_commits_done), 1);
+ atomic_set(&jl->j_older_commits_done, 1);
}
mutex_unlock(&jl->j_commit_mutex);
goto put_jl;
@@ -1063,9 +1073,10 @@ static int flush_commit_list(struct super_block *s,
depth = reiserfs_write_unlock_nested(s);
__wait_on_buffer(tbh);
reiserfs_write_lock_nested(s, depth);
- // since we're using ll_rw_blk above, it might have skipped over
- // a locked buffer. Double check here
- //
+ /*
+ * since we're using ll_rw_blk above, it might have skipped
+ * over a locked buffer. Double check here
+ */
/* redundant, sync_dirty_buffer() checks */
if (buffer_dirty(tbh)) {
depth = reiserfs_write_unlock_nested(s);
@@ -1079,17 +1090,21 @@ static int flush_commit_list(struct super_block *s,
#endif
retval = -EIO;
}
- put_bh(tbh); /* once for journal_find_get_block */
- put_bh(tbh); /* once due to original getblk in do_journal_end */
- atomic_dec(&(jl->j_commit_left));
+ /* once for journal_find_get_block */
+ put_bh(tbh);
+ /* once due to original getblk in do_journal_end */
+ put_bh(tbh);
+ atomic_dec(&jl->j_commit_left);
}
- BUG_ON(atomic_read(&(jl->j_commit_left)) != 1);
+ BUG_ON(atomic_read(&jl->j_commit_left) != 1);
- /* If there was a write error in the journal - we can't commit
+ /*
+ * If there was a write error in the journal - we can't commit
* this transaction - it will be invalid and, if successful,
* will just end up propagating the write error out to
- * the file system. */
+ * the file system.
+ */
if (likely(!retval && !reiserfs_is_journal_aborted (journal))) {
if (buffer_dirty(jl->j_commit_bh))
BUG();
@@ -1102,9 +1117,11 @@ static int flush_commit_list(struct super_block *s,
reiserfs_write_lock_nested(s, depth);
}
- /* If there was a write error in the journal - we can't commit this
+ /*
+ * If there was a write error in the journal - we can't commit this
* transaction - it will be invalid and, if successful, will just end
- * up propagating the write error out to the filesystem. */
+ * up propagating the write error out to the filesystem.
+ */
if (unlikely(!buffer_uptodate(jl->j_commit_bh))) {
#ifdef CONFIG_REISERFS_CHECK
reiserfs_warning(s, "journal-615", "buffer write failed");
@@ -1119,7 +1136,10 @@ static int flush_commit_list(struct super_block *s,
}
journal->j_last_commit_id = jl->j_trans_id;
- /* now, every commit block is on the disk. It is safe to allow blocks freed during this transaction to be reallocated */
+ /*
+ * now, every commit block is on the disk. It is safe to allow
+ * blocks freed during this transaction to be reallocated
+ */
cleanup_freed_for_journal_list(s, jl);
retval = retval ? retval : journal->j_errno;
@@ -1127,13 +1147,13 @@ static int flush_commit_list(struct super_block *s,
/* mark the metadata dirty */
if (!retval)
dirty_one_transaction(s, jl);
- atomic_dec(&(jl->j_commit_left));
+ atomic_dec(&jl->j_commit_left);
if (flushall) {
- atomic_set(&(jl->j_older_commits_done), 1);
+ atomic_set(&jl->j_older_commits_done, 1);
}
mutex_unlock(&jl->j_commit_mutex);
- put_jl:
+put_jl:
put_journal_list(s, jl);
if (retval)
@@ -1143,9 +1163,9 @@ static int flush_commit_list(struct super_block *s,
}
/*
-** flush_journal_list frequently needs to find a newer transaction for a given block. This does that, or
-** returns NULL if it can't find anything
-*/
+ * flush_journal_list frequently needs to find a newer transaction for a
+ * given block. This does that, or returns NULL if it can't find anything
+ */
static struct reiserfs_journal_list *find_newer_jl_for_cn(struct
reiserfs_journal_cnode
*cn)
@@ -1169,10 +1189,11 @@ static void remove_journal_hash(struct super_block *,
int);
/*
-** once all the real blocks have been flushed, it is safe to remove them from the
-** journal list for this transaction. Aside from freeing the cnode, this also allows the
-** block to be reallocated for data blocks if it had been deleted.
-*/
+ * once all the real blocks have been flushed, it is safe to remove them
+ * from the journal list for this transaction. Aside from freeing the
+ * cnode, this also allows the block to be reallocated for data blocks
+ * if it had been deleted.
+ */
static void remove_all_from_journal_list(struct super_block *sb,
struct reiserfs_journal_list *jl,
int debug)
@@ -1181,8 +1202,9 @@ static void remove_all_from_journal_list(struct super_block *sb,
struct reiserfs_journal_cnode *cn, *last;
cn = jl->j_realblock;
- /* which is better, to lock once around the whole loop, or
- ** to lock for each call to remove_journal_hash?
+ /*
+ * which is better, to lock once around the whole loop, or
+ * to lock for each call to remove_journal_hash?
*/
while (cn) {
if (cn->blocknr != 0) {
@@ -1204,12 +1226,13 @@ static void remove_all_from_journal_list(struct super_block *sb,
}
/*
-** if this timestamp is greater than the timestamp we wrote last to the header block, write it to the header block.
-** once this is done, I can safely say the log area for this transaction won't ever be replayed, and I can start
-** releasing blocks in this transaction for reuse as data blocks.
-** called by flush_journal_list, before it calls remove_all_from_journal_list
-**
-*/
+ * if this timestamp is greater than the timestamp we wrote last to the
+ * header block, write it to the header block. once this is done, I can
+ * safely say the log area for this transaction won't ever be replayed,
+ * and I can start releasing blocks in this transaction for reuse as data
+ * blocks. called by flush_journal_list, before it calls
+ * remove_all_from_journal_list
+ */
static int _update_journal_header_block(struct super_block *sb,
unsigned long offset,
unsigned int trans_id)
@@ -1279,10 +1302,11 @@ static int flush_older_journal_lists(struct super_block *sb,
struct reiserfs_journal *journal = SB_JOURNAL(sb);
unsigned int trans_id = jl->j_trans_id;
- /* we know we are the only ones flushing things, no extra race
+ /*
+ * we know we are the only ones flushing things, no extra race
* protection is required.
*/
- restart:
+restart:
entry = journal->j_journal_list.next;
/* Did we wrap? */
if (entry == &journal->j_journal_list)
@@ -1309,15 +1333,16 @@ static void del_from_work_list(struct super_block *s,
}
}
-/* flush a journal list, both commit and real blocks
-**
-** always set flushall to 1, unless you are calling from inside
-** flush_journal_list
-**
-** IMPORTANT. This can only be called while there are no journal writers,
-** and the journal is locked. That means it can only be called from
-** do_journal_end, or by journal_release
-*/
+/*
+ * flush a journal list, both commit and real blocks
+ *
+ * always set flushall to 1, unless you are calling from inside
+ * flush_journal_list
+ *
+ * IMPORTANT. This can only be called while there are no journal writers,
+ * and the journal is locked. That means it can only be called from
+ * do_journal_end, or by journal_release
+ */
static int flush_journal_list(struct super_block *s,
struct reiserfs_journal_list *jl, int flushall)
{
@@ -1354,13 +1379,14 @@ static int flush_journal_list(struct super_block *s,
}
/* if all the work is already done, get out of here */
- if (atomic_read(&(jl->j_nonzerolen)) <= 0 &&
- atomic_read(&(jl->j_commit_left)) <= 0) {
+ if (atomic_read(&jl->j_nonzerolen) <= 0 &&
+ atomic_read(&jl->j_commit_left) <= 0) {
goto flush_older_and_return;
}
- /* start by putting the commit list on disk. This will also flush
- ** the commit lists of any olders transactions
+ /*
+ * start by putting the commit list on disk. This will also flush
+ * the commit lists of any olders transactions
*/
flush_commit_list(s, jl, 1);
@@ -1369,15 +1395,16 @@ static int flush_journal_list(struct super_block *s,
BUG();
/* are we done now? */
- if (atomic_read(&(jl->j_nonzerolen)) <= 0 &&
- atomic_read(&(jl->j_commit_left)) <= 0) {
+ if (atomic_read(&jl->j_nonzerolen) <= 0 &&
+ atomic_read(&jl->j_commit_left) <= 0) {
goto flush_older_and_return;
}
- /* loop through each cnode, see if we need to write it,
- ** or wait on a more recent transaction, or just ignore it
+ /*
+ * loop through each cnode, see if we need to write it,
+ * or wait on a more recent transaction, or just ignore it
*/
- if (atomic_read(&(journal->j_wcount)) != 0) {
+ if (atomic_read(&journal->j_wcount) != 0) {
reiserfs_panic(s, "journal-844", "journal list is flushing, "
"wcount is not 0");
}
@@ -1391,20 +1418,25 @@ static int flush_journal_list(struct super_block *s,
goto free_cnode;
}
- /* This transaction failed commit. Don't write out to the disk */
+ /*
+ * This transaction failed commit.
+ * Don't write out to the disk
+ */
if (!(jl->j_state & LIST_DIRTY))
goto free_cnode;
pjl = find_newer_jl_for_cn(cn);
- /* the order is important here. We check pjl to make sure we
- ** don't clear BH_JDirty_wait if we aren't the one writing this
- ** block to disk
+ /*
+ * the order is important here. We check pjl to make sure we
+ * don't clear BH_JDirty_wait if we aren't the one writing this
+ * block to disk
*/
if (!pjl && cn->bh) {
saved_bh = cn->bh;
- /* we do this to make sure nobody releases the buffer while
- ** we are working with it
+ /*
+ * we do this to make sure nobody releases the
+ * buffer while we are working with it
*/
get_bh(saved_bh);
@@ -1413,13 +1445,17 @@ static int flush_journal_list(struct super_block *s,
was_jwait = 1;
was_dirty = 1;
} else if (can_dirty(cn)) {
- /* everything with !pjl && jwait should be writable */
+ /*
+ * everything with !pjl && jwait
+ * should be writable
+ */
BUG();
}
}
- /* if someone has this block in a newer transaction, just make
- ** sure they are committed, and don't try writing it to disk
+ /*
+ * if someone has this block in a newer transaction, just make
+ * sure they are committed, and don't try writing it to disk
*/
if (pjl) {
if (atomic_read(&pjl->j_commit_left))
@@ -1427,16 +1463,18 @@ static int flush_journal_list(struct super_block *s,
goto free_cnode;
}
- /* bh == NULL when the block got to disk on its own, OR,
- ** the block got freed in a future transaction
+ /*
+ * bh == NULL when the block got to disk on its own, OR,
+ * the block got freed in a future transaction
*/
if (saved_bh == NULL) {
goto free_cnode;
}
- /* this should never happen. kupdate_one_transaction has this list
- ** locked while it works, so we should never see a buffer here that
- ** is not marked JDirty_wait
+ /*
+ * this should never happen. kupdate_one_transaction has
+ * this list locked while it works, so we should never see a
+ * buffer here that is not marked JDirty_wait
*/
if ((!was_jwait) && !buffer_locked(saved_bh)) {
reiserfs_warning(s, "journal-813",
@@ -1447,7 +1485,10 @@ static int flush_journal_list(struct super_block *s,
was_jwait ? ' ' : '!');
}
if (was_dirty) {
- /* we inc again because saved_bh gets decremented at free_cnode */
+ /*
+ * we inc again because saved_bh gets decremented
+ * at free_cnode
+ */
get_bh(saved_bh);
set_bit(BLOCK_NEEDS_FLUSH, &cn->state);
lock_buffer(saved_bh);
@@ -1463,13 +1504,16 @@ static int flush_journal_list(struct super_block *s,
(unsigned long long)saved_bh->
b_blocknr, __func__);
}
- free_cnode:
+free_cnode:
last = cn;
cn = cn->next;
if (saved_bh) {
- /* we incremented this to keep others from taking the buffer head away */
+ /*
+ * we incremented this to keep others from
+ * taking the buffer head away
+ */
put_bh(saved_bh);
- if (atomic_read(&(saved_bh->b_count)) < 0) {
+ if (atomic_read(&saved_bh->b_count) < 0) {
reiserfs_warning(s, "journal-945",
"saved_bh->b_count < 0");
}
@@ -1499,8 +1543,10 @@ static int flush_journal_list(struct super_block *s,
#endif
err = -EIO;
}
- /* note, we must clear the JDirty_wait bit after the up to date
- ** check, otherwise we race against our flushpage routine
+ /*
+ * note, we must clear the JDirty_wait bit
+ * after the up to date check, otherwise we
+ * race against our flushpage routine
*/
BUG_ON(!test_clear_buffer_journal_dirty
(cn->bh));
@@ -1518,25 +1564,27 @@ static int flush_journal_list(struct super_block *s,
reiserfs_abort(s, -EIO,
"Write error while pushing transaction to disk in %s",
__func__);
- flush_older_and_return:
+flush_older_and_return:
- /* before we can update the journal header block, we _must_ flush all
- ** real blocks from all older transactions to disk. This is because
- ** once the header block is updated, this transaction will not be
- ** replayed after a crash
+ /*
+ * before we can update the journal header block, we _must_ flush all
+ * real blocks from all older transactions to disk. This is because
+ * once the header block is updated, this transaction will not be
+ * replayed after a crash
*/
if (flushall) {
flush_older_journal_lists(s, jl);
}
err = journal->j_errno;
- /* before we can remove everything from the hash tables for this
- ** transaction, we must make sure it can never be replayed
- **
- ** since we are only called from do_journal_end, we know for sure there
- ** are no allocations going on while we are flushing journal lists. So,
- ** we only need to update the journal header block for the last list
- ** being flushed
+ /*
+ * before we can remove everything from the hash tables for this
+ * transaction, we must make sure it can never be replayed
+ *
+ * since we are only called from do_journal_end, we know for sure there
+ * are no allocations going on while we are flushing journal lists. So,
+ * we only need to update the journal header block for the last list
+ * being flushed
*/
if (!err && flushall) {
err =
@@ -1561,11 +1609,12 @@ static int flush_journal_list(struct super_block *s,
}
journal->j_last_flush_id = jl->j_trans_id;
- /* not strictly required since we are freeing the list, but it should
+ /*
+ * not strictly required since we are freeing the list, but it should
* help find code using dead lists later on
*/
jl->j_len = 0;
- atomic_set(&(jl->j_nonzerolen), 0);
+ atomic_set(&jl->j_nonzerolen, 0);
jl->j_start = 0;
jl->j_realblock = NULL;
jl->j_commit_bh = NULL;
@@ -1592,15 +1641,17 @@ static int write_one_transaction(struct super_block *s,
cn = jl->j_realblock;
while (cn) {
- /* if the blocknr == 0, this has been cleared from the hash,
- ** skip it
+ /*
+ * if the blocknr == 0, this has been cleared from the hash,
+ * skip it
*/
if (cn->blocknr == 0) {
goto next;
}
if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) {
struct buffer_head *tmp_bh;
- /* we can race against journal_mark_freed when we try
+ /*
+ * we can race against journal_mark_freed when we try
* to lock_buffer(cn->bh), so we have to inc the buffer
* count, and recheck things after locking
*/
@@ -1619,7 +1670,7 @@ static int write_one_transaction(struct super_block *s,
}
put_bh(tmp_bh);
}
- next:
+next:
cn = cn->next;
cond_resched();
}
@@ -1637,15 +1688,17 @@ static int dirty_one_transaction(struct super_block *s,
jl->j_state |= LIST_DIRTY;
cn = jl->j_realblock;
while (cn) {
- /* look for a more recent transaction that logged this
- ** buffer. Only the most recent transaction with a buffer in
- ** it is allowed to send that buffer to disk
+ /*
+ * look for a more recent transaction that logged this
+ * buffer. Only the most recent transaction with a buffer in
+ * it is allowed to send that buffer to disk
*/
pjl = find_newer_jl_for_cn(cn);
if (!pjl && cn->blocknr && cn->bh
&& buffer_journal_dirty(cn->bh)) {
BUG_ON(!can_dirty(cn));
- /* if the buffer is prepared, it will either be logged
+ /*
+ * if the buffer is prepared, it will either be logged
* or restored. If restored, we need to make sure
* it actually gets marked dirty
*/
@@ -1682,7 +1735,8 @@ static int kupdate_transactions(struct super_block *s,
goto done;
}
- /* we've got j_flush_mutex held, nobody is going to delete any
+ /*
+ * we've got j_flush_mutex held, nobody is going to delete any
* of these lists out from underneath us
*/
while ((num_trans && transactions_flushed < num_trans) ||
@@ -1716,20 +1770,21 @@ static int kupdate_transactions(struct super_block *s,
write_chunk(&chunk);
}
- done:
+done:
mutex_unlock(&journal->j_flush_mutex);
return ret;
}
-/* for o_sync and fsync heavy applications, they tend to use
-** all the journa list slots with tiny transactions. These
-** trigger lots and lots of calls to update the header block, which
-** adds seeks and slows things down.
-**
-** This function tries to clear out a large chunk of the journal lists
-** at once, which makes everything faster since only the newest journal
-** list updates the header block
-*/
+/*
+ * for o_sync and fsync heavy applications, they tend to use
+ * all the journa list slots with tiny transactions. These
+ * trigger lots and lots of calls to update the header block, which
+ * adds seeks and slows things down.
+ *
+ * This function tries to clear out a large chunk of the journal lists
+ * at once, which makes everything faster since only the newest journal
+ * list updates the header block
+ */
static int flush_used_journal_lists(struct super_block *s,
struct reiserfs_journal_list *jl)
{
@@ -1766,9 +1821,11 @@ static int flush_used_journal_lists(struct super_block *s,
}
get_journal_list(jl);
get_journal_list(flush_jl);
- /* try to find a group of blocks we can flush across all the
- ** transactions, but only bother if we've actually spanned
- ** across multiple lists
+
+ /*
+ * try to find a group of blocks we can flush across all the
+ * transactions, but only bother if we've actually spanned
+ * across multiple lists
*/
if (flush_jl != jl) {
ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
@@ -1780,9 +1837,9 @@ static int flush_used_journal_lists(struct super_block *s,
}
/*
-** removes any nodes in table with name block and dev as bh.
-** only touchs the hnext and hprev pointers.
-*/
+ * removes any nodes in table with name block and dev as bh.
+ * only touchs the hnext and hprev pointers.
+ */
void remove_journal_hash(struct super_block *sb,
struct reiserfs_journal_cnode **table,
struct reiserfs_journal_list *jl,
@@ -1811,8 +1868,12 @@ void remove_journal_hash(struct super_block *sb,
cur->blocknr = 0;
cur->sb = NULL;
cur->state = 0;
- if (cur->bh && cur->jlist) /* anybody who clears the cur->bh will also dec the nonzerolen */
- atomic_dec(&(cur->jlist->j_nonzerolen));
+ /*
+ * anybody who clears the cur->bh will also
+ * dec the nonzerolen
+ */
+ if (cur->bh && cur->jlist)
+ atomic_dec(&cur->jlist->j_nonzerolen);
cur->bh = NULL;
cur->jlist = NULL;
}
@@ -1832,17 +1893,18 @@ static void free_journal_ram(struct super_block *sb)
if (journal->j_header_bh) {
brelse(journal->j_header_bh);
}
- /* j_header_bh is on the journal dev, make sure not to release the journal
- * dev until we brelse j_header_bh
+ /*
+ * j_header_bh is on the journal dev, make sure
+ * not to release the journal dev until we brelse j_header_bh
*/
release_journal_dev(sb, journal);
vfree(journal);
}
/*
-** call on unmount. Only set error to 1 if you haven't made your way out
-** of read_super() yet. Any other caller must keep error at 0.
-*/
+ * call on unmount. Only set error to 1 if you haven't made your way out
+ * of read_super() yet. Any other caller must keep error at 0.
+ */
static int do_journal_release(struct reiserfs_transaction_handle *th,
struct super_block *sb, int error)
{
@@ -1850,21 +1912,25 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
int flushed = 0;
struct reiserfs_journal *journal = SB_JOURNAL(sb);
- /* we only want to flush out transactions if we were called with error == 0
+ /*
+ * we only want to flush out transactions if we were
+ * called with error == 0
*/
if (!error && !(sb->s_flags & MS_RDONLY)) {
/* end the current trans */
BUG_ON(!th->t_trans_id);
- do_journal_end(th, sb, 10, FLUSH_ALL);
+ do_journal_end(th, FLUSH_ALL);
- /* make sure something gets logged to force our way into the flush code */
- if (!journal_join(&myth, sb, 1)) {
+ /*
+ * make sure something gets logged to force
+ * our way into the flush code
+ */
+ if (!journal_join(&myth, sb)) {
reiserfs_prepare_for_journal(sb,
SB_BUFFER_WITH_SB(sb),
1);
- journal_mark_dirty(&myth, sb,
- SB_BUFFER_WITH_SB(sb));
- do_journal_end(&myth, sb, 1, FLUSH_ALL);
+ journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
+ do_journal_end(&myth, FLUSH_ALL);
flushed = 1;
}
}
@@ -1872,17 +1938,15 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
/* this also catches errors during the do_journal_end above */
if (!error && reiserfs_is_journal_aborted(journal)) {
memset(&myth, 0, sizeof(myth));
- if (!journal_join_abort(&myth, sb, 1)) {
+ if (!journal_join_abort(&myth, sb)) {
reiserfs_prepare_for_journal(sb,
SB_BUFFER_WITH_SB(sb),
1);
- journal_mark_dirty(&myth, sb,
- SB_BUFFER_WITH_SB(sb));
- do_journal_end(&myth, sb, 1, FLUSH_ALL);
+ journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb));
+ do_journal_end(&myth, FLUSH_ALL);
}
}
- reiserfs_mounted_fs_count--;
/* wait for all commits to finish */
cancel_delayed_work(&SB_JOURNAL(sb)->j_work);
@@ -1893,12 +1957,7 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
reiserfs_write_unlock(sb);
cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work);
- flush_workqueue(commit_wq);
-
- if (!reiserfs_mounted_fs_count) {
- destroy_workqueue(commit_wq);
- commit_wq = NULL;
- }
+ flush_workqueue(REISERFS_SB(sb)->commit_wq);
free_journal_ram(sb);
@@ -1907,25 +1966,24 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
return 0;
}
-/*
-** call on unmount. flush all journal trans, release all alloc'd ram
-*/
+/* * call on unmount. flush all journal trans, release all alloc'd ram */
int journal_release(struct reiserfs_transaction_handle *th,
struct super_block *sb)
{
return do_journal_release(th, sb, 0);
}
-/*
-** only call from an error condition inside reiserfs_read_super!
-*/
+/* only call from an error condition inside reiserfs_read_super! */
int journal_release_error(struct reiserfs_transaction_handle *th,
struct super_block *sb)
{
return do_journal_release(th, sb, 1);
}
-/* compares description block with commit block. returns 1 if they differ, 0 if they are the same */
+/*
+ * compares description block with commit block.
+ * returns 1 if they differ, 0 if they are the same
+ */
static int journal_compare_desc_commit(struct super_block *sb,
struct reiserfs_journal_desc *desc,
struct reiserfs_journal_commit *commit)
@@ -1939,11 +1997,12 @@ static int journal_compare_desc_commit(struct super_block *sb,
return 0;
}
-/* returns 0 if it did not find a description block
-** returns -1 if it found a corrupt commit block
-** returns 1 if both desc and commit were valid
-** NOTE: only called during fs mount
-*/
+/*
+ * returns 0 if it did not find a description block
+ * returns -1 if it found a corrupt commit block
+ * returns 1 if both desc and commit were valid
+ * NOTE: only called during fs mount
+ */
static int journal_transaction_is_valid(struct super_block *sb,
struct buffer_head *d_bh,
unsigned int *oldest_invalid_trans_id,
@@ -1989,7 +2048,10 @@ static int journal_transaction_is_valid(struct super_block *sb,
}
offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
- /* ok, we have a journal description block, lets see if the transaction was valid */
+ /*
+ * ok, we have a journal description block,
+ * let's see if the transaction was valid
+ */
c_bh =
journal_bread(sb,
SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
@@ -2041,11 +2103,11 @@ static void brelse_array(struct buffer_head **heads, int num)
}
/*
-** given the start, and values for the oldest acceptable transactions,
-** this either reads in a replays a transaction, or returns because the
-** transaction is invalid, or too old.
-** NOTE: only called during fs mount
-*/
+ * given the start, and values for the oldest acceptable transactions,
+ * this either reads in a replays a transaction, or returns because the
+ * transaction is invalid, or too old.
+ * NOTE: only called during fs mount
+ */
static int journal_read_transaction(struct super_block *sb,
unsigned long cur_dblock,
unsigned long oldest_start,
@@ -2119,7 +2181,10 @@ static int journal_read_transaction(struct super_block *sb,
}
trans_id = get_desc_trans_id(desc);
- /* now we know we've got a good transaction, and it was inside the valid time ranges */
+ /*
+ * now we know we've got a good transaction, and it was
+ * inside the valid time ranges
+ */
log_blocks = kmalloc(get_desc_trans_len(desc) *
sizeof(struct buffer_head *), GFP_NOFS);
real_blocks = kmalloc(get_desc_trans_len(desc) *
@@ -2164,7 +2229,7 @@ static int journal_read_transaction(struct super_block *sb,
reiserfs_warning(sb, "journal-1204",
"REPLAY FAILURE fsck required! "
"Trying to replay onto a log block");
- abort_replay:
+abort_replay:
brelse_array(log_blocks, i);
brelse_array(real_blocks, i);
brelse(c_bh);
@@ -2226,7 +2291,10 @@ static int journal_read_transaction(struct super_block *sb,
"journal-1095: setting journal " "start to offset %ld",
cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb));
- /* init starting values for the first transaction, in case this is the last transaction to be replayed. */
+ /*
+ * init starting values for the first transaction, in case
+ * this is the last transaction to be replayed.
+ */
journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(sb);
journal->j_last_flush_trans_id = trans_id;
journal->j_trans_id = trans_id + 1;
@@ -2240,12 +2308,14 @@ static int journal_read_transaction(struct super_block *sb,
return 0;
}
-/* This function reads blocks starting from block and to max_block of bufsize
- size (but no more than BUFNR blocks at a time). This proved to improve
- mounting speed on self-rebuilding raid5 arrays at least.
- Right now it is only used from journal code. But later we might use it
- from other places.
- Note: Do not use journal_getblk/sb_getblk functions here! */
+/*
+ * This function reads blocks starting from block and to max_block of bufsize
+ * size (but no more than BUFNR blocks at a time). This proved to improve
+ * mounting speed on self-rebuilding raid5 arrays at least.
+ * Right now it is only used from journal code. But later we might use it
+ * from other places.
+ * Note: Do not use journal_getblk/sb_getblk functions here!
+ */
static struct buffer_head *reiserfs_breada(struct block_device *dev,
b_blocknr_t block, int bufsize,
b_blocknr_t max_block)
@@ -2284,15 +2354,17 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev,
}
/*
-** read and replay the log
-** on a clean unmount, the journal header's next unflushed pointer will
-** be to an invalid transaction. This tests that before finding all the
-** transactions in the log, which makes normal mount times fast.
-** After a crash, this starts with the next unflushed transaction, and
-** replays until it finds one too old, or invalid.
-** On exit, it sets things up so the first transaction will work correctly.
-** NOTE: only called during fs mount
-*/
+ * read and replay the log
+ * on a clean unmount, the journal header's next unflushed pointer will be
+ * to an invalid transaction. This tests that before finding all the
+ * transactions in the log, which makes normal mount times fast.
+ *
+ * After a crash, this starts with the next unflushed transaction, and
+ * replays until it finds one too old, or invalid.
+ *
+ * On exit, it sets things up so the first transaction will work correctly.
+ * NOTE: only called during fs mount
+ */
static int journal_read(struct super_block *sb)
{
struct reiserfs_journal *journal = SB_JOURNAL(sb);
@@ -2316,9 +2388,10 @@ static int journal_read(struct super_block *sb)
bdevname(journal->j_dev_bd, b));
start = get_seconds();
- /* step 1, read in the journal header block. Check the transaction it says
- ** is the first unflushed, and if that transaction is not valid,
- ** replay is done
+ /*
+ * step 1, read in the journal header block. Check the transaction
+ * it says is the first unflushed, and if that transaction is not
+ * valid, replay is done
*/
journal->j_header_bh = journal_bread(sb,
SB_ONDISK_JOURNAL_1st_BLOCK(sb)
@@ -2342,9 +2415,10 @@ static int journal_read(struct super_block *sb)
le32_to_cpu(jh->j_last_flush_trans_id));
valid_journal_header = 1;
- /* now, we try to read the first unflushed offset. If it is not valid,
- ** there is nothing more we can do, and it makes no sense to read
- ** through the whole log.
+ /*
+ * now, we try to read the first unflushed offset. If it
+ * is not valid, there is nothing more we can do, and it
+ * makes no sense to read through the whole log.
*/
d_bh =
journal_bread(sb,
@@ -2358,15 +2432,19 @@ static int journal_read(struct super_block *sb)
goto start_log_replay;
}
- /* ok, there are transactions that need to be replayed. start with the first log block, find
- ** all the valid transactions, and pick out the oldest.
+ /*
+ * ok, there are transactions that need to be replayed. start
+ * with the first log block, find all the valid transactions, and
+ * pick out the oldest.
*/
while (continue_replay
&& cur_dblock <
(SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
SB_ONDISK_JOURNAL_SIZE(sb))) {
- /* Note that it is required for blocksize of primary fs device and journal
- device to be the same */
+ /*
+ * Note that it is required for blocksize of primary fs
+ * device and journal device to be the same
+ */
d_bh =
reiserfs_breada(journal->j_dev_bd, cur_dblock,
sb->s_blocksize,
@@ -2413,7 +2491,7 @@ static int journal_read(struct super_block *sb)
brelse(d_bh);
}
- start_log_replay:
+start_log_replay:
cur_dblock = oldest_start;
if (oldest_trans_id) {
reiserfs_debug(sb, REISERFS_DEBUG_CODE,
@@ -2444,9 +2522,11 @@ static int journal_read(struct super_block *sb)
reiserfs_debug(sb, REISERFS_DEBUG_CODE,
"journal-1225: No valid " "transactions found");
}
- /* j_start does not get set correctly if we don't replay any transactions.
- ** if we had a valid journal_header, set j_start to the first unflushed transaction value,
- ** copy the trans_id from the header
+ /*
+ * j_start does not get set correctly if we don't replay any
+ * transactions. if we had a valid journal_header, set j_start
+ * to the first unflushed transaction value, copy the trans_id
+ * from the header
*/
if (valid_journal_header && replay_count == 0) {
journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset);
@@ -2475,8 +2555,9 @@ static int journal_read(struct super_block *sb)
_update_journal_header_block(sb, journal->j_start,
journal->j_last_flush_trans_id)) {
reiserfs_write_unlock(sb);
- /* replay failed, caller must call free_journal_ram and abort
- ** the mount
+ /*
+ * replay failed, caller must call free_journal_ram and abort
+ * the mount
*/
return -1;
}
@@ -2569,7 +2650,7 @@ static int journal_init_dev(struct super_block *super,
return 0;
}
-/**
+/*
* When creating/tuning a file system user can assign some
* journal params within boundaries which depend on the ratio
* blocksize/standard_blocksize.
@@ -2587,8 +2668,7 @@ static int check_advise_trans_params(struct super_block *sb,
struct reiserfs_journal *journal)
{
if (journal->j_trans_max) {
- /* Non-default journal params.
- Do sanity check for them. */
+ /* Non-default journal params. Do sanity check for them. */
int ratio = 1;
if (sb->s_blocksize < REISERFS_STANDARD_BLKSIZE)
ratio = REISERFS_STANDARD_BLKSIZE / sb->s_blocksize;
@@ -2610,10 +2690,12 @@ static int check_advise_trans_params(struct super_block *sb,
return 1;
}
} else {
- /* Default journal params.
- The file system was created by old version
- of mkreiserfs, so some fields contain zeros,
- and we need to advise proper values for them */
+ /*
+ * Default journal params.
+ * The file system was created by old version
+ * of mkreiserfs, so some fields contain zeros,
+ * and we need to advise proper values for them
+ */
if (sb->s_blocksize != REISERFS_STANDARD_BLKSIZE) {
reiserfs_warning(sb, "sh-464", "bad blocksize (%u)",
sb->s_blocksize);
@@ -2626,9 +2708,7 @@ static int check_advise_trans_params(struct super_block *sb,
return 0;
}
-/*
-** must be called once on fs mount. calls journal_read for you
-*/
+/* must be called once on fs mount. calls journal_read for you */
int journal_init(struct super_block *sb, const char *j_dev_name,
int old_format, unsigned int commit_max_age)
{
@@ -2667,8 +2747,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
REISERFS_DISK_OFFSET_IN_BYTES /
sb->s_blocksize + 2);
- /* Sanity check to see is the standard journal fitting within first bitmap
- (actual for small blocksizes) */
+ /*
+ * Sanity check to see is the standard journal fitting
+ * within first bitmap (actual for small blocksizes)
+ */
if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
(SB_JOURNAL_1st_RESERVED_BLOCK(sb) +
SB_ONDISK_JOURNAL_SIZE(sb) > sb->s_blocksize * 8)) {
@@ -2754,20 +2836,20 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
journal->j_start = 0;
journal->j_len = 0;
journal->j_len_alloc = 0;
- atomic_set(&(journal->j_wcount), 0);
- atomic_set(&(journal->j_async_throttle), 0);
+ atomic_set(&journal->j_wcount, 0);
+ atomic_set(&journal->j_async_throttle, 0);
journal->j_bcount = 0;
journal->j_trans_start_time = 0;
journal->j_last = NULL;
journal->j_first = NULL;
- init_waitqueue_head(&(journal->j_join_wait));
+ init_waitqueue_head(&journal->j_join_wait);
mutex_init(&journal->j_mutex);
mutex_init(&journal->j_flush_mutex);
journal->j_trans_id = 10;
journal->j_mount_id = 10;
journal->j_state = 0;
- atomic_set(&(journal->j_jlock), 0);
+ atomic_set(&journal->j_jlock, 0);
journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
journal->j_cnode_free_orig = journal->j_cnode_free_list;
journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
@@ -2807,23 +2889,19 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
goto free_and_return;
}
- reiserfs_mounted_fs_count++;
- if (reiserfs_mounted_fs_count <= 1)
- commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0);
-
INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
journal->j_work_sb = sb;
return 0;
- free_and_return:
+free_and_return:
free_journal_ram(sb);
return 1;
}
/*
-** test for a polite end of the current transaction. Used by file_write, and should
-** be used by delete to make sure they don't write more than can fit inside a single
-** transaction
-*/
+ * test for a polite end of the current transaction. Used by file_write,
+ * and should be used by delete to make sure they don't write more than
+ * can fit inside a single transaction
+ */
int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
int new_alloc)
{
@@ -2835,7 +2913,7 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
return 0;
if (journal->j_must_wait > 0 ||
(journal->j_len_alloc + new_alloc) >= journal->j_max_batch ||
- atomic_read(&(journal->j_jlock)) ||
+ atomic_read(&journal->j_jlock) ||
(now - journal->j_trans_start_time) > journal->j_max_trans_age ||
journal->j_cnode_free < (journal->j_trans_max * 3)) {
return 1;
@@ -2846,8 +2924,7 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
return 0;
}
-/* this must be called inside a transaction
-*/
+/* this must be called inside a transaction */
void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
{
struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
@@ -2857,8 +2934,7 @@ void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
return;
}
-/* this must be called without a transaction started
-*/
+/* this must be called without a transaction started */
void reiserfs_allow_writes(struct super_block *s)
{
struct reiserfs_journal *journal = SB_JOURNAL(s);
@@ -2866,8 +2942,7 @@ void reiserfs_allow_writes(struct super_block *s)
wake_up(&journal->j_join_wait);
}
-/* this must be called without a transaction started
-*/
+/* this must be called without a transaction started */
void reiserfs_wait_on_write_block(struct super_block *s)
{
struct reiserfs_journal *journal = SB_JOURNAL(s);
@@ -2929,11 +3004,12 @@ static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
}
}
-/* join == true if you must join an existing transaction.
-** join == false if you can deal with waiting for others to finish
-**
-** this will block until the transaction is joinable. send the number of blocks you
-** expect to use in nblocks.
+/*
+ * join == true if you must join an existing transaction.
+ * join == false if you can deal with waiting for others to finish
+ *
+ * this will block until the transaction is joinable. send the number of
+ * blocks you expect to use in nblocks.
*/
static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
struct super_block *sb, unsigned long nblocks,
@@ -2955,7 +3031,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
th->t_refcount = 1;
th->t_super = sb;
- relock:
+relock:
lock_journal(sb);
if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) {
unlock_journal(sb);
@@ -2974,9 +3050,11 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
}
now = get_seconds();
- /* if there is no room in the journal OR
- ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning
- ** we don't sleep if there aren't other writers
+ /*
+ * if there is no room in the journal OR
+ * if this transaction is too old, and we weren't called joinable,
+ * wait for it to finish before beginning we don't sleep if there
+ * aren't other writers
*/
if ((!join && journal->j_must_wait > 0) ||
@@ -2990,7 +3068,8 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
|| (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) {
old_trans_id = journal->j_trans_id;
- unlock_journal(sb); /* allow others to finish this transaction */
+ /* allow others to finish this transaction */
+ unlock_journal(sb);
if (!join && (journal->j_len_alloc + nblocks + 2) >=
journal->j_max_batch &&
@@ -3002,8 +3081,9 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
goto relock;
}
}
- /* don't mess with joining the transaction if all we have to do is
- * wait for someone else to do a commit
+ /*
+ * don't mess with joining the transaction if all we
+ * have to do is wait for someone else to do a commit
*/
if (atomic_read(&journal->j_jlock)) {
while (journal->j_trans_id == old_trans_id &&
@@ -3012,15 +3092,15 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
}
goto relock;
}
- retval = journal_join(&myth, sb, 1);
+ retval = journal_join(&myth, sb);
if (retval)
goto out_fail;
/* someone might have ended the transaction while we joined */
if (old_trans_id != journal->j_trans_id) {
- retval = do_journal_end(&myth, sb, 1, 0);
+ retval = do_journal_end(&myth, 0);
} else {
- retval = do_journal_end(&myth, sb, 1, COMMIT_NOW);
+ retval = do_journal_end(&myth, COMMIT_NOW);
}
if (retval)
@@ -3033,7 +3113,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
if (journal->j_trans_start_time == 0) {
journal->j_trans_start_time = get_seconds();
}
- atomic_inc(&(journal->j_wcount));
+ atomic_inc(&journal->j_wcount);
journal->j_len_alloc += nblocks;
th->t_blocks_logged = 0;
th->t_blocks_allocated = nblocks;
@@ -3042,11 +3122,13 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
INIT_LIST_HEAD(&th->t_list);
return 0;
- out_fail:
+out_fail:
memset(th, 0, sizeof(*th));
- /* Re-set th->t_super, so we can properly keep track of how many
+ /*
+ * Re-set th->t_super, so we can properly keep track of how many
* persistent transactions there are. We need to do this so if this
- * call is part of a failed restart_transaction, we can free it later */
+ * call is part of a failed restart_transaction, we can free it later
+ */
th->t_super = sb;
return retval;
}
@@ -3059,14 +3141,15 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
int ret;
struct reiserfs_transaction_handle *th;
- /* if we're nesting into an existing transaction. It will be
- ** persistent on its own
+ /*
+ * if we're nesting into an existing transaction. It will be
+ * persistent on its own
*/
if (reiserfs_transaction_running(s)) {
th = current->journal_info;
th->t_refcount++;
BUG_ON(th->t_refcount < 2);
-
+
return th;
}
th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS);
@@ -3087,7 +3170,7 @@ int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th)
struct super_block *s = th->t_super;
int ret = 0;
if (th->t_trans_id)
- ret = journal_end(th, th->t_super, th->t_blocks_allocated);
+ ret = journal_end(th);
else
ret = -EIO;
if (th->t_refcount == 0) {
@@ -3098,29 +3181,31 @@ int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th)
}
static int journal_join(struct reiserfs_transaction_handle *th,
- struct super_block *sb, unsigned long nblocks)
+ struct super_block *sb)
{
struct reiserfs_transaction_handle *cur_th = current->journal_info;
- /* this keeps do_journal_end from NULLing out the current->journal_info
- ** pointer
+ /*
+ * this keeps do_journal_end from NULLing out the
+ * current->journal_info pointer
*/
th->t_handle_save = cur_th;
BUG_ON(cur_th && cur_th->t_refcount > 1);
- return do_journal_begin_r(th, sb, nblocks, JBEGIN_JOIN);
+ return do_journal_begin_r(th, sb, 1, JBEGIN_JOIN);
}
int journal_join_abort(struct reiserfs_transaction_handle *th,
- struct super_block *sb, unsigned long nblocks)
+ struct super_block *sb)
{
struct reiserfs_transaction_handle *cur_th = current->journal_info;
- /* this keeps do_journal_end from NULLing out the current->journal_info
- ** pointer
+ /*
+ * this keeps do_journal_end from NULLing out the
+ * current->journal_info pointer
*/
th->t_handle_save = cur_th;
BUG_ON(cur_th && cur_th->t_refcount > 1);
- return do_journal_begin_r(th, sb, nblocks, JBEGIN_ABORT);
+ return do_journal_begin_r(th, sb, 1, JBEGIN_ABORT);
}
int journal_begin(struct reiserfs_transaction_handle *th,
@@ -3142,9 +3227,10 @@ int journal_begin(struct reiserfs_transaction_handle *th,
"journal_info != 0");
return 0;
} else {
- /* we've ended up with a handle from a different filesystem.
- ** save it and restore on journal_end. This should never
- ** really happen...
+ /*
+ * we've ended up with a handle from a different
+ * filesystem. save it and restore on journal_end.
+ * This should never really happen...
*/
reiserfs_warning(sb, "clm-2100",
"nesting info a different FS");
@@ -3157,9 +3243,10 @@ int journal_begin(struct reiserfs_transaction_handle *th,
ret = do_journal_begin_r(th, sb, nblocks, JBEGIN_REG);
BUG_ON(current->journal_info != th);
- /* I guess this boils down to being the reciprocal of clm-2100 above.
- * If do_journal_begin_r fails, we need to put it back, since journal_end
- * won't be called to do it. */
+ /*
+ * I guess this boils down to being the reciprocal of clm-2100 above.
+ * If do_journal_begin_r fails, we need to put it back, since
+ * journal_end won't be called to do it. */
if (ret)
current->journal_info = th->t_handle_save;
else
@@ -3169,17 +3256,19 @@ int journal_begin(struct reiserfs_transaction_handle *th,
}
/*
-** puts bh into the current transaction. If it was already there, reorders removes the
-** old pointers from the hash, and puts new ones in (to make sure replay happen in the right order).
-**
-** if it was dirty, cleans and files onto the clean list. I can't let it be dirty again until the
-** transaction is committed.
-**
-** if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len.
-*/
+ * puts bh into the current transaction. If it was already there, reorders
+ * removes the old pointers from the hash, and puts new ones in (to make
+ * sure replay happen in the right order).
+ *
+ * if it was dirty, cleans and files onto the clean list. I can't let it
+ * be dirty again until the transaction is committed.
+ *
+ * if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len.
+ */
int journal_mark_dirty(struct reiserfs_transaction_handle *th,
- struct super_block *sb, struct buffer_head *bh)
+ struct buffer_head *bh)
{
+ struct super_block *sb = th->t_super;
struct reiserfs_journal *journal = SB_JOURNAL(sb);
struct reiserfs_journal_cnode *cn = NULL;
int count_already_incd = 0;
@@ -3201,9 +3290,10 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
return 0;
}
- /* this must be turned into a panic instead of a warning. We can't allow
- ** a dirty or journal_dirty or locked buffer to be logged, as some changes
- ** could get to disk too early. NOT GOOD.
+ /*
+ * this must be turned into a panic instead of a warning. We can't
+ * allow a dirty or journal_dirty or locked buffer to be logged, as
+ * some changes could get to disk too early. NOT GOOD.
*/
if (!prepared || buffer_dirty(bh)) {
reiserfs_warning(sb, "journal-1777",
@@ -3216,14 +3306,16 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
buffer_journal_dirty(bh) ? ' ' : '!');
}
- if (atomic_read(&(journal->j_wcount)) <= 0) {
+ if (atomic_read(&journal->j_wcount) <= 0) {
reiserfs_warning(sb, "journal-1409",
"returning because j_wcount was %d",
- atomic_read(&(journal->j_wcount)));
+ atomic_read(&journal->j_wcount));
return 1;
}
- /* this error means I've screwed up, and we've overflowed the transaction.
- ** Nothing can be done here, except make the FS readonly or panic.
+ /*
+ * this error means I've screwed up, and we've overflowed
+ * the transaction. Nothing can be done here, except make the
+ * FS readonly or panic.
*/
if (journal->j_len >= journal->j_trans_max) {
reiserfs_panic(th->t_super, "journal-1413",
@@ -3280,9 +3372,9 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
return 0;
}
-int journal_end(struct reiserfs_transaction_handle *th,
- struct super_block *sb, unsigned long nblocks)
+int journal_end(struct reiserfs_transaction_handle *th)
{
+ struct super_block *sb = th->t_super;
if (!current->journal_info && th->t_refcount > 1)
reiserfs_warning(sb, "REISER-NESTING",
"th NULL, refcount %d", th->t_refcount);
@@ -3297,8 +3389,9 @@ int journal_end(struct reiserfs_transaction_handle *th,
struct reiserfs_transaction_handle *cur_th =
current->journal_info;
- /* we aren't allowed to close a nested transaction on a different
- ** filesystem from the one in the task struct
+ /*
+ * we aren't allowed to close a nested transaction on a
+ * different filesystem from the one in the task struct
*/
BUG_ON(cur_th->t_super != th->t_super);
@@ -3308,17 +3401,18 @@ int journal_end(struct reiserfs_transaction_handle *th,
}
return 0;
} else {
- return do_journal_end(th, sb, nblocks, 0);
+ return do_journal_end(th, 0);
}
}
-/* removes from the current transaction, relsing and descrementing any counters.
-** also files the removed buffer directly onto the clean list
-**
-** called by journal_mark_freed when a block has been deleted
-**
-** returns 1 if it cleaned and relsed the buffer. 0 otherwise
-*/
+/*
+ * removes from the current transaction, relsing and descrementing any counters.
+ * also files the removed buffer directly onto the clean list
+ *
+ * called by journal_mark_freed when a block has been deleted
+ *
+ * returns 1 if it cleaned and relsed the buffer. 0 otherwise
+ */
static int remove_from_transaction(struct super_block *sb,
b_blocknr_t blocknr, int already_cleaned)
{
@@ -3354,7 +3448,7 @@ static int remove_from_transaction(struct super_block *sb,
clear_buffer_dirty(bh);
clear_buffer_journal_test(bh);
put_bh(bh);
- if (atomic_read(&(bh->b_count)) < 0) {
+ if (atomic_read(&bh->b_count) < 0) {
reiserfs_warning(sb, "journal-1752",
"b_count < 0");
}
@@ -3367,15 +3461,16 @@ static int remove_from_transaction(struct super_block *sb,
}
/*
-** for any cnode in a journal list, it can only be dirtied of all the
-** transactions that include it are committed to disk.
-** this checks through each transaction, and returns 1 if you are allowed to dirty,
-** and 0 if you aren't
-**
-** it is called by dirty_journal_list, which is called after flush_commit_list has gotten all the log
-** blocks for a given transaction on disk
-**
-*/
+ * for any cnode in a journal list, it can only be dirtied of all the
+ * transactions that include it are committed to disk.
+ * this checks through each transaction, and returns 1 if you are allowed
+ * to dirty, and 0 if you aren't
+ *
+ * it is called by dirty_journal_list, which is called after
+ * flush_commit_list has gotten all the log blocks for a given
+ * transaction on disk
+ *
+ */
static int can_dirty(struct reiserfs_journal_cnode *cn)
{
struct super_block *sb = cn->sb;
@@ -3383,9 +3478,10 @@ static int can_dirty(struct reiserfs_journal_cnode *cn)
struct reiserfs_journal_cnode *cur = cn->hprev;
int can_dirty = 1;
- /* first test hprev. These are all newer than cn, so any node here
- ** with the same block number and dev means this node can't be sent
- ** to disk right now.
+ /*
+ * first test hprev. These are all newer than cn, so any node here
+ * with the same block number and dev means this node can't be sent
+ * to disk right now.
*/
while (cur && can_dirty) {
if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb &&
@@ -3394,13 +3490,14 @@ static int can_dirty(struct reiserfs_journal_cnode *cn)
}
cur = cur->hprev;
}
- /* then test hnext. These are all older than cn. As long as they
- ** are committed to the log, it is safe to write cn to disk
+ /*
+ * then test hnext. These are all older than cn. As long as they
+ * are committed to the log, it is safe to write cn to disk
*/
cur = cn->hnext;
while (cur && can_dirty) {
if (cur->jlist && cur->jlist->j_len > 0 &&
- atomic_read(&(cur->jlist->j_commit_left)) > 0 && cur->bh &&
+ atomic_read(&cur->jlist->j_commit_left) > 0 && cur->bh &&
cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) {
can_dirty = 0;
}
@@ -3409,12 +3506,13 @@ static int can_dirty(struct reiserfs_journal_cnode *cn)
return can_dirty;
}
-/* syncs the commit blocks, but does not force the real buffers to disk
-** will wait until the current transaction is done/committed before returning
-*/
-int journal_end_sync(struct reiserfs_transaction_handle *th,
- struct super_block *sb, unsigned long nblocks)
+/*
+ * syncs the commit blocks, but does not force the real buffers to disk
+ * will wait until the current transaction is done/committed before returning
+ */
+int journal_end_sync(struct reiserfs_transaction_handle *th)
{
+ struct super_block *sb = th->t_super;
struct reiserfs_journal *journal = SB_JOURNAL(sb);
BUG_ON(!th->t_trans_id);
@@ -3423,14 +3521,12 @@ int journal_end_sync(struct reiserfs_transaction_handle *th,
if (journal->j_len == 0) {
reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
1);
- journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb));
+ journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
}
- return do_journal_end(th, sb, nblocks, COMMIT_NOW | WAIT);
+ return do_journal_end(th, COMMIT_NOW | WAIT);
}
-/*
-** writeback the pending async commits to disk
-*/
+/* writeback the pending async commits to disk */
static void flush_async_commits(struct work_struct *work)
{
struct reiserfs_journal *journal =
@@ -3450,9 +3546,9 @@ static void flush_async_commits(struct work_struct *work)
}
/*
-** flushes any old transactions to disk
-** ends the current transaction if it is too old
-*/
+ * flushes any old transactions to disk
+ * ends the current transaction if it is too old
+ */
void reiserfs_flush_old_commits(struct super_block *sb)
{
time_t now;
@@ -3460,48 +3556,53 @@ void reiserfs_flush_old_commits(struct super_block *sb)
struct reiserfs_journal *journal = SB_JOURNAL(sb);
now = get_seconds();
- /* safety check so we don't flush while we are replaying the log during
+ /*
+ * safety check so we don't flush while we are replaying the log during
* mount
*/
if (list_empty(&journal->j_journal_list))
return;
- /* check the current transaction. If there are no writers, and it is
+ /*
+ * check the current transaction. If there are no writers, and it is
* too old, finish it, and force the commit blocks to disk
*/
if (atomic_read(&journal->j_wcount) <= 0 &&
journal->j_trans_start_time > 0 &&
journal->j_len > 0 &&
(now - journal->j_trans_start_time) > journal->j_max_trans_age) {
- if (!journal_join(&th, sb, 1)) {
+ if (!journal_join(&th, sb)) {
reiserfs_prepare_for_journal(sb,
SB_BUFFER_WITH_SB(sb),
1);
- journal_mark_dirty(&th, sb,
- SB_BUFFER_WITH_SB(sb));
+ journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
- /* we're only being called from kreiserfsd, it makes no sense to do
- ** an async commit so that kreiserfsd can do it later
+ /*
+ * we're only being called from kreiserfsd, it makes
+ * no sense to do an async commit so that kreiserfsd
+ * can do it later
*/
- do_journal_end(&th, sb, 1, COMMIT_NOW | WAIT);
+ do_journal_end(&th, COMMIT_NOW | WAIT);
}
}
}
/*
-** returns 0 if do_journal_end should return right away, returns 1 if do_journal_end should finish the commit
-**
-** if the current transaction is too old, but still has writers, this will wait on j_join_wait until all
-** the writers are done. By the time it wakes up, the transaction it was called has already ended, so it just
-** flushes the commit list and returns 0.
-**
-** Won't batch when flush or commit_now is set. Also won't batch when others are waiting on j_join_wait.
-**
-** Note, we can't allow the journal_end to proceed while there are still writers in the log.
-*/
-static int check_journal_end(struct reiserfs_transaction_handle *th,
- struct super_block *sb, unsigned long nblocks,
- int flags)
+ * returns 0 if do_journal_end should return right away, returns 1 if
+ * do_journal_end should finish the commit
+ *
+ * if the current transaction is too old, but still has writers, this will
+ * wait on j_join_wait until all the writers are done. By the time it
+ * wakes up, the transaction it was called has already ended, so it just
+ * flushes the commit list and returns 0.
+ *
+ * Won't batch when flush or commit_now is set. Also won't batch when
+ * others are waiting on j_join_wait.
+ *
+ * Note, we can't allow the journal_end to proceed while there are still
+ * writers in the log.
+ */
+static int check_journal_end(struct reiserfs_transaction_handle *th, int flags)
{
time_t now;
@@ -3509,6 +3610,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
int commit_now = flags & COMMIT_NOW;
int wait_on_commit = flags & WAIT;
struct reiserfs_journal_list *jl;
+ struct super_block *sb = th->t_super;
struct reiserfs_journal *journal = SB_JOURNAL(sb);
BUG_ON(!th->t_trans_id);
@@ -3520,23 +3622,27 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
}
journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged);
- if (atomic_read(&(journal->j_wcount)) > 0) { /* <= 0 is allowed. unmounting might not call begin */
- atomic_dec(&(journal->j_wcount));
- }
+ /* <= 0 is allowed. unmounting might not call begin */
+ if (atomic_read(&journal->j_wcount) > 0)
+ atomic_dec(&journal->j_wcount);
- /* BUG, deal with case where j_len is 0, but people previously freed blocks need to be released
- ** will be dealt with by next transaction that actually writes something, but should be taken
- ** care of in this trans
+ /*
+ * BUG, deal with case where j_len is 0, but people previously
+ * freed blocks need to be released will be dealt with by next
+ * transaction that actually writes something, but should be taken
+ * care of in this trans
*/
BUG_ON(journal->j_len == 0);
- /* if wcount > 0, and we are called to with flush or commit_now,
- ** we wait on j_join_wait. We will wake up when the last writer has
- ** finished the transaction, and started it on its way to the disk.
- ** Then, we flush the commit or journal list, and just return 0
- ** because the rest of journal end was already done for this transaction.
+ /*
+ * if wcount > 0, and we are called to with flush or commit_now,
+ * we wait on j_join_wait. We will wake up when the last writer has
+ * finished the transaction, and started it on its way to the disk.
+ * Then, we flush the commit or journal list, and just return 0
+ * because the rest of journal end was already done for this
+ * transaction.
*/
- if (atomic_read(&(journal->j_wcount)) > 0) {
+ if (atomic_read(&journal->j_wcount) > 0) {
if (flush || commit_now) {
unsigned trans_id;
@@ -3544,27 +3650,30 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
trans_id = jl->j_trans_id;
if (wait_on_commit)
jl->j_state |= LIST_COMMIT_PENDING;
- atomic_set(&(journal->j_jlock), 1);
+ atomic_set(&journal->j_jlock, 1);
if (flush) {
journal->j_next_full_flush = 1;
}
unlock_journal(sb);
- /* sleep while the current transaction is still j_jlocked */
+ /*
+ * sleep while the current transaction is
+ * still j_jlocked
+ */
while (journal->j_trans_id == trans_id) {
if (atomic_read(&journal->j_jlock)) {
queue_log_writer(sb);
} else {
lock_journal(sb);
if (journal->j_trans_id == trans_id) {
- atomic_set(&(journal->j_jlock),
+ atomic_set(&journal->j_jlock,
1);
}
unlock_journal(sb);
}
}
BUG_ON(journal->j_trans_id == trans_id);
-
+
if (commit_now
&& journal_list_still_alive(sb, trans_id)
&& wait_on_commit) {
@@ -3584,7 +3693,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
}
/* don't batch when someone is waiting on j_join_wait */
/* don't batch when syncing the commit or flushing the whole trans */
- if (!(journal->j_must_wait > 0) && !(atomic_read(&(journal->j_jlock)))
+ if (!(journal->j_must_wait > 0) && !(atomic_read(&journal->j_jlock))
&& !flush && !commit_now && (journal->j_len < journal->j_max_batch)
&& journal->j_len_alloc < journal->j_max_batch
&& journal->j_cnode_free > (journal->j_trans_max * 3)) {
@@ -3602,19 +3711,22 @@ static int check_journal_end(struct reiserfs_transaction_handle *th,
}
/*
-** Does all the work that makes deleting blocks safe.
-** when deleting a block mark BH_JNew, just remove it from the current transaction, clean it's buffer_head and move on.
-**
-** otherwise:
-** set a bit for the block in the journal bitmap. That will prevent it from being allocated for unformatted nodes
-** before this transaction has finished.
-**
-** mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers. That will prevent any old transactions with
-** this block from trying to flush to the real location. Since we aren't removing the cnode from the journal_list_hash,
-** the block can't be reallocated yet.
-**
-** Then remove it from the current transaction, decrementing any counters and filing it on the clean list.
-*/
+ * Does all the work that makes deleting blocks safe.
+ * when deleting a block mark BH_JNew, just remove it from the current
+ * transaction, clean it's buffer_head and move on.
+ *
+ * otherwise:
+ * set a bit for the block in the journal bitmap. That will prevent it from
+ * being allocated for unformatted nodes before this transaction has finished.
+ *
+ * mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers.
+ * That will prevent any old transactions with this block from trying to flush
+ * to the real location. Since we aren't removing the cnode from the
+ * journal_list_hash, *the block can't be reallocated yet.
+ *
+ * Then remove it from the current transaction, decrementing any counters and
+ * filing it on the clean list.
+ */
int journal_mark_freed(struct reiserfs_transaction_handle *th,
struct super_block *sb, b_blocknr_t blocknr)
{
@@ -3637,7 +3749,10 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th,
reiserfs_clean_and_file_buffer(bh);
cleaned = remove_from_transaction(sb, blocknr, cleaned);
} else {
- /* set the bit for this block in the journal bitmap for this transaction */
+ /*
+ * set the bit for this block in the journal bitmap
+ * for this transaction
+ */
jb = journal->j_current_jl->j_list_bitmap;
if (!jb) {
reiserfs_panic(sb, "journal-1702",
@@ -3653,17 +3768,22 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th,
}
cleaned = remove_from_transaction(sb, blocknr, cleaned);
- /* find all older transactions with this block, make sure they don't try to write it out */
+ /*
+ * find all older transactions with this block,
+ * make sure they don't try to write it out
+ */
cn = get_journal_hash_dev(sb, journal->j_list_hash_table,
blocknr);
while (cn) {
if (sb == cn->sb && blocknr == cn->blocknr) {
set_bit(BLOCK_FREED, &cn->state);
if (cn->bh) {
+ /*
+ * remove_from_transaction will brelse
+ * the buffer if it was in the current
+ * trans
+ */
if (!cleaned) {
- /* remove_from_transaction will brelse the buffer if it was
- ** in the current trans
- */
clear_buffer_journal_dirty(cn->
bh);
clear_buffer_dirty(cn->bh);
@@ -3672,16 +3792,19 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th,
cleaned = 1;
put_bh(cn->bh);
if (atomic_read
- (&(cn->bh->b_count)) < 0) {
+ (&cn->bh->b_count) < 0) {
reiserfs_warning(sb,
"journal-2138",
"cn->bh->b_count < 0");
}
}
- if (cn->jlist) { /* since we are clearing the bh, we MUST dec nonzerolen */
- atomic_dec(&
- (cn->jlist->
- j_nonzerolen));
+ /*
+ * since we are clearing the bh,
+ * we MUST dec nonzerolen
+ */
+ if (cn->jlist) {
+ atomic_dec(&cn->jlist->
+ j_nonzerolen);
}
cn->bh = NULL;
}
@@ -3714,10 +3837,16 @@ static int __commit_trans_jl(struct inode *inode, unsigned long id,
struct reiserfs_journal *journal = SB_JOURNAL(sb);
int ret = 0;
- /* is it from the current transaction, or from an unknown transaction? */
+ /*
+ * is it from the current transaction,
+ * or from an unknown transaction?
+ */
if (id == journal->j_trans_id) {
jl = journal->j_current_jl;
- /* try to let other writers come in and grow this transaction */
+ /*
+ * try to let other writers come in and
+ * grow this transaction
+ */
let_transaction_grow(sb, id);
if (journal->j_trans_id != id) {
goto flush_commit_only;
@@ -3731,21 +3860,22 @@ static int __commit_trans_jl(struct inode *inode, unsigned long id,
if (journal->j_trans_id != id) {
reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
1);
- journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb));
- ret = journal_end(&th, sb, 1);
+ journal_mark_dirty(&th, SB_BUFFER_WITH_SB(sb));
+ ret = journal_end(&th);
goto flush_commit_only;
}
- ret = journal_end_sync(&th, sb, 1);
+ ret = journal_end_sync(&th);
if (!ret)
ret = 1;
} else {
- /* this gets tricky, we have to make sure the journal list in
+ /*
+ * this gets tricky, we have to make sure the journal list in
* the inode still exists. We know the list is still around
* if we've got a larger transaction id than the oldest list
*/
- flush_commit_only:
+flush_commit_only:
if (journal_list_still_alive(inode->i_sb, id)) {
/*
* we only set ret to 1 when we know for sure
@@ -3768,7 +3898,8 @@ int reiserfs_commit_for_inode(struct inode *inode)
unsigned int id = REISERFS_I(inode)->i_trans_id;
struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
- /* for the whole inode, assume unset id means it was
+ /*
+ * for the whole inode, assume unset id means it was
* changed in the current transaction. More conservative
*/
if (!id || !jl) {
@@ -3806,12 +3937,11 @@ void reiserfs_restore_prepared_buffer(struct super_block *sb,
extern struct tree_balance *cur_tb;
/*
-** before we can change a metadata block, we have to make sure it won't
-** be written to disk while we are altering it. So, we must:
-** clean it
-** wait on it.
-**
-*/
+ * before we can change a metadata block, we have to make sure it won't
+ * be written to disk while we are altering it. So, we must:
+ * clean it
+ * wait on it.
+ */
int reiserfs_prepare_for_journal(struct super_block *sb,
struct buffer_head *bh, int wait)
{
@@ -3832,19 +3962,18 @@ int reiserfs_prepare_for_journal(struct super_block *sb,
}
/*
-** long and ugly. If flush, will not return until all commit
-** blocks and all real buffers in the trans are on disk.
-** If no_async, won't return until all commit blocks are on disk.
-**
-** keep reading, there are comments as you go along
-**
-** If the journal is aborted, we just clean up. Things like flushing
-** journal lists, etc just won't happen.
-*/
-static int do_journal_end(struct reiserfs_transaction_handle *th,
- struct super_block *sb, unsigned long nblocks,
- int flags)
+ * long and ugly. If flush, will not return until all commit
+ * blocks and all real buffers in the trans are on disk.
+ * If no_async, won't return until all commit blocks are on disk.
+ *
+ * keep reading, there are comments as you go along
+ *
+ * If the journal is aborted, we just clean up. Things like flushing
+ * journal lists, etc just won't happen.
+ */
+static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
{
+ struct super_block *sb = th->t_super;
struct reiserfs_journal *journal = SB_JOURNAL(sb);
struct reiserfs_journal_cnode *cn, *next, *jl_cn;
struct reiserfs_journal_cnode *last_cn = NULL;
@@ -3866,9 +3995,12 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
BUG_ON(th->t_refcount > 1);
BUG_ON(!th->t_trans_id);
+ BUG_ON(!th->t_super);
- /* protect flush_older_commits from doing mistakes if the
- transaction ID counter gets overflowed. */
+ /*
+ * protect flush_older_commits from doing mistakes if the
+ * transaction ID counter gets overflowed.
+ */
if (th->t_trans_id == ~0U)
flags |= FLUSH_ALL | COMMIT_NOW | WAIT;
flush = flags & FLUSH_ALL;
@@ -3879,7 +4011,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
if (journal->j_len == 0) {
reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
1);
- journal_mark_dirty(th, sb, SB_BUFFER_WITH_SB(sb));
+ journal_mark_dirty(th, SB_BUFFER_WITH_SB(sb));
}
lock_journal(sb);
@@ -3892,10 +4024,12 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
wait_on_commit = 1;
}
- /* check_journal_end locks the journal, and unlocks if it does not return 1
- ** it tells us if we should continue with the journal_end, or just return
+ /*
+ * check_journal_end locks the journal, and unlocks if it does
+ * not return 1 it tells us if we should continue with the
+ * journal_end, or just return
*/
- if (!check_journal_end(th, sb, nblocks, flags)) {
+ if (!check_journal_end(th, flags)) {
reiserfs_schedule_old_flush(sb);
wake_queued_writers(sb);
reiserfs_async_progress_wait(sb);
@@ -3908,19 +4042,23 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
}
/*
- ** j must wait means we have to flush the log blocks, and the real blocks for
- ** this transaction
+ * j must wait means we have to flush the log blocks, and the
+ * real blocks for this transaction
*/
if (journal->j_must_wait > 0) {
flush = 1;
}
#ifdef REISERFS_PREALLOCATE
- /* quota ops might need to nest, setup the journal_info pointer for them
- * and raise the refcount so that it is > 0. */
+ /*
+ * quota ops might need to nest, setup the journal_info pointer
+ * for them and raise the refcount so that it is > 0.
+ */
current->journal_info = th;
th->t_refcount++;
- reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into
- * the transaction */
+
+ /* it should not involve new blocks into the transaction */
+ reiserfs_discard_all_prealloc(th);
+
th->t_refcount--;
current->journal_info = th->t_handle_save;
#endif
@@ -3936,7 +4074,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8);
set_desc_trans_id(desc, journal->j_trans_id);
- /* setup commit block. Don't write (keep it clean too) this one until after everyone else is written */
+ /*
+ * setup commit block. Don't write (keep it clean too) this one
+ * until after everyone else is written
+ */
c_bh = journal_getblk(sb, SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
((journal->j_start + journal->j_len +
1) % SB_ONDISK_JOURNAL_SIZE(sb)));
@@ -3948,7 +4089,8 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
/* init this journal list */
jl = journal->j_current_jl;
- /* we lock the commit before doing anything because
+ /*
+ * we lock the commit before doing anything because
* we want to make sure nobody tries to run flush_commit_list until
* the new transaction is fully setup, and we've already flushed the
* ordered bh list
@@ -3968,9 +4110,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
atomic_set(&jl->j_commit_left, journal->j_len + 2);
jl->j_realblock = NULL;
- /* The ENTIRE FOR LOOP MUST not cause schedule to occur.
- ** for each real block, add it to the journal list hash,
- ** copy into real block index array in the commit or desc block
+ /*
+ * The ENTIRE FOR LOOP MUST not cause schedule to occur.
+ * for each real block, add it to the journal list hash,
+ * copy into real block index array in the commit or desc block
*/
trans_half = journal_trans_half(sb->s_blocksize);
for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) {
@@ -3989,9 +4132,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
last_cn->next = jl_cn;
}
last_cn = jl_cn;
- /* make sure the block we are trying to log is not a block
- of journal or reserved area */
-
+ /*
+ * make sure the block we are trying to log
+ * is not a block of journal or reserved area
+ */
if (is_block_in_log_or_reserved_area
(sb, cn->bh->b_blocknr)) {
reiserfs_panic(sb, "journal-2332",
@@ -4021,19 +4165,26 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
set_desc_trans_id(desc, journal->j_trans_id);
set_commit_trans_len(commit, journal->j_len);
- /* special check in case all buffers in the journal were marked for not logging */
+ /*
+ * special check in case all buffers in the journal
+ * were marked for not logging
+ */
BUG_ON(journal->j_len == 0);
- /* we're about to dirty all the log blocks, mark the description block
+ /*
+ * we're about to dirty all the log blocks, mark the description block
* dirty now too. Don't mark the commit block dirty until all the
* others are on disk
*/
mark_buffer_dirty(d_bh);
- /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */
+ /*
+ * first data block is j_start + 1, so add one to
+ * cur_write_start wherever you use it
+ */
cur_write_start = journal->j_start;
cn = journal->j_first;
- jindex = 1; /* start at one so we don't get the desc again */
+ jindex = 1; /* start at one so we don't get the desc again */
while (cn) {
clear_buffer_journal_new(cn->bh);
/* copy all the real blocks into log area. dirty log blocks */
@@ -4059,7 +4210,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
set_buffer_journal_dirty(cn->bh);
clear_buffer_journaled(cn->bh);
} else {
- /* JDirty cleared sometime during transaction. don't log this one */
+ /*
+ * JDirty cleared sometime during transaction.
+ * don't log this one
+ */
reiserfs_warning(sb, "journal-2048",
"BAD, buffer in journal hash, "
"but not JDirty!");
@@ -4071,9 +4225,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
reiserfs_cond_resched(sb);
}
- /* we are done with both the c_bh and d_bh, but
- ** c_bh must be written after all other commit blocks,
- ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
+ /*
+ * we are done with both the c_bh and d_bh, but
+ * c_bh must be written after all other commit blocks,
+ * so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
*/
journal->j_current_jl = alloc_journal_list(sb);
@@ -4088,7 +4243,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
journal->j_start =
(journal->j_start + journal->j_len +
2) % SB_ONDISK_JOURNAL_SIZE(sb);
- atomic_set(&(journal->j_wcount), 0);
+ atomic_set(&journal->j_wcount, 0);
journal->j_bcount = 0;
journal->j_last = NULL;
journal->j_first = NULL;
@@ -4104,15 +4259,18 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
journal->j_next_async_flush = 0;
init_journal_hash(sb);
- // make sure reiserfs_add_jh sees the new current_jl before we
- // write out the tails
+ /*
+ * make sure reiserfs_add_jh sees the new current_jl before we
+ * write out the tails
+ */
smp_mb();
- /* tail conversion targets have to hit the disk before we end the
+ /*
+ * tail conversion targets have to hit the disk before we end the
* transaction. Otherwise a later transaction might repack the tail
- * before this transaction commits, leaving the data block unflushed and
- * clean, if we crash before the later transaction commits, the data block
- * is lost.
+ * before this transaction commits, leaving the data block unflushed
+ * and clean, if we crash before the later transaction commits, the
+ * data block is lost.
*/
if (!list_empty(&jl->j_tail_bh_list)) {
depth = reiserfs_write_unlock_nested(sb);
@@ -4123,24 +4281,27 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
BUG_ON(!list_empty(&jl->j_tail_bh_list));
mutex_unlock(&jl->j_commit_mutex);
- /* honor the flush wishes from the caller, simple commits can
- ** be done outside the journal lock, they are done below
- **
- ** if we don't flush the commit list right now, we put it into
- ** the work queue so the people waiting on the async progress work
- ** queue don't wait for this proc to flush journal lists and such.
+ /*
+ * honor the flush wishes from the caller, simple commits can
+ * be done outside the journal lock, they are done below
+ *
+ * if we don't flush the commit list right now, we put it into
+ * the work queue so the people waiting on the async progress work
+ * queue don't wait for this proc to flush journal lists and such.
*/
if (flush) {
flush_commit_list(sb, jl, 1);
flush_journal_list(sb, jl, 1);
} else if (!(jl->j_state & LIST_COMMIT_PENDING))
- queue_delayed_work(commit_wq, &journal->j_work, HZ / 10);
+ queue_delayed_work(REISERFS_SB(sb)->commit_wq,
+ &journal->j_work, HZ / 10);
- /* if the next transaction has any chance of wrapping, flush
- ** transactions that might get overwritten. If any journal lists are very
- ** old flush them as well.
+ /*
+ * if the next transaction has any chance of wrapping, flush
+ * transactions that might get overwritten. If any journal lists
+ * are very old flush them as well.
*/
- first_jl:
+first_jl:
list_for_each_safe(entry, safe, &journal->j_journal_list) {
temp_jl = JOURNAL_LIST_ENTRY(entry);
if (journal->j_start <= temp_jl->j_start) {
@@ -4151,8 +4312,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
} else if ((journal->j_start +
journal->j_trans_max + 1) <
SB_ONDISK_JOURNAL_SIZE(sb)) {
- /* if we don't cross into the next transaction and we don't
- * wrap, there is no way we can overlap any later transactions
+ /*
+ * if we don't cross into the next
+ * transaction and we don't wrap, there is
+ * no way we can overlap any later transactions
* break now
*/
break;
@@ -4166,10 +4329,12 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
flush_used_journal_lists(sb, temp_jl);
goto first_jl;
} else {
- /* we don't overlap anything from out start to the end of the
- * log, and our wrapped portion doesn't overlap anything at
- * the start of the log. We can break
- */
+ /*
+ * we don't overlap anything from out start
+ * to the end of the log, and our wrapped
+ * portion doesn't overlap anything at
+ * the start of the log. We can break
+ */
break;
}
}
@@ -4183,23 +4348,25 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
"could not get a list bitmap");
}
- atomic_set(&(journal->j_jlock), 0);
+ atomic_set(&journal->j_jlock, 0);
unlock_journal(sb);
/* wake up any body waiting to join. */
clear_bit(J_WRITERS_QUEUED, &journal->j_state);
- wake_up(&(journal->j_join_wait));
+ wake_up(&journal->j_join_wait);
if (!flush && wait_on_commit &&
journal_list_still_alive(sb, commit_trans_id)) {
flush_commit_list(sb, jl, 1);
}
- out:
+out:
reiserfs_check_lock_depth(sb, "journal end2");
memset(th, 0, sizeof(*th));
- /* Re-set th->t_super, so we can properly keep track of how many
+ /*
+ * Re-set th->t_super, so we can properly keep track of how many
* persistent transactions there are. We need to do this so if this
- * call is part of a failed restart_transaction, we can free it later */
+ * call is part of a failed restart_transaction, we can free it later
+ */
th->t_super = sb;
return journal->j_errno;
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index 79e5a8b4c22..d6744c8b24e 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -8,46 +8,42 @@
#include "reiserfs.h"
#include <linux/buffer_head.h>
-/* these are used in do_balance.c */
-
-/* leaf_move_items
- leaf_shift_left
- leaf_shift_right
- leaf_delete_items
- leaf_insert_into_buf
- leaf_paste_in_buffer
- leaf_cut_from_buffer
- leaf_paste_entries
- */
-
-/* copy copy_count entries from source directory item to dest buffer (creating new item if needed) */
+/*
+ * copy copy_count entries from source directory item to dest buffer
+ * (creating new item if needed)
+ */
static void leaf_copy_dir_entries(struct buffer_info *dest_bi,
struct buffer_head *source, int last_first,
int item_num, int from, int copy_count)
{
struct buffer_head *dest = dest_bi->bi_bh;
- int item_num_in_dest; /* either the number of target item,
- or if we must create a new item,
- the number of the item we will
- create it next to */
+ /*
+ * either the number of target item, or if we must create a
+ * new item, the number of the item we will create it next to
+ */
+ int item_num_in_dest;
+
struct item_head *ih;
struct reiserfs_de_head *deh;
int copy_records_len; /* length of all records in item to be copied */
char *records;
- ih = B_N_PITEM_HEAD(source, item_num);
+ ih = item_head(source, item_num);
RFALSE(!is_direntry_le_ih(ih), "vs-10000: item must be directory item");
- /* length of all record to be copied and first byte of the last of them */
+ /*
+ * length of all record to be copied and first byte of
+ * the last of them
+ */
deh = B_I_DEH(source, ih);
if (copy_count) {
- copy_records_len = (from ? deh_location(&(deh[from - 1])) :
+ copy_records_len = (from ? deh_location(&deh[from - 1]) :
ih_item_len(ih)) -
- deh_location(&(deh[from + copy_count - 1]));
+ deh_location(&deh[from + copy_count - 1]);
records =
source->b_data + ih_location(ih) +
- deh_location(&(deh[from + copy_count - 1]));
+ deh_location(&deh[from + copy_count - 1]);
} else {
copy_records_len = 0;
records = NULL;
@@ -59,12 +55,15 @@ static void leaf_copy_dir_entries(struct buffer_info *dest_bi,
LAST_TO_FIRST) ? ((B_NR_ITEMS(dest)) ? 0 : -1) : (B_NR_ITEMS(dest)
- 1);
- /* if there are no items in dest or the first/last item in dest is not item of the same directory */
+ /*
+ * if there are no items in dest or the first/last item in
+ * dest is not item of the same directory
+ */
if ((item_num_in_dest == -1) ||
(last_first == FIRST_TO_LAST && le_ih_k_offset(ih) == DOT_OFFSET) ||
(last_first == LAST_TO_FIRST
&& comp_short_le_keys /*COMP_SHORT_KEYS */ (&ih->ih_key,
- B_N_PKEY(dest,
+ leaf_key(dest,
item_num_in_dest))))
{
/* create new item in dest */
@@ -80,16 +79,22 @@ static void leaf_copy_dir_entries(struct buffer_info *dest_bi,
if (last_first == LAST_TO_FIRST) {
/* form key by the following way */
- if (from < I_ENTRY_COUNT(ih)) {
+ if (from < ih_entry_count(ih)) {
set_le_ih_k_offset(&new_ih,
- deh_offset(&(deh[from])));
- /*memcpy (&new_ih.ih_key.k_offset, &deh[from].deh_offset, SHORT_KEY_SIZE); */
+ deh_offset(&deh[from]));
} else {
- /* no entries will be copied to this item in this function */
+ /*
+ * no entries will be copied to this
+ * item in this function
+ */
set_le_ih_k_offset(&new_ih, U32_MAX);
- /* this item is not yet valid, but we want I_IS_DIRECTORY_ITEM to return 1 for it, so we -1 */
+ /*
+ * this item is not yet valid, but we
+ * want I_IS_DIRECTORY_ITEM to return 1
+ * for it, so we -1
+ */
}
- set_le_key_k_type(KEY_FORMAT_3_5, &(new_ih.ih_key),
+ set_le_key_k_type(KEY_FORMAT_3_5, &new_ih.ih_key,
TYPE_DIRENTRY);
}
@@ -113,36 +118,44 @@ static void leaf_copy_dir_entries(struct buffer_info *dest_bi,
leaf_paste_entries(dest_bi, item_num_in_dest,
(last_first ==
- FIRST_TO_LAST) ? I_ENTRY_COUNT(B_N_PITEM_HEAD(dest,
+ FIRST_TO_LAST) ? ih_entry_count(item_head(dest,
item_num_in_dest))
: 0, copy_count, deh + from, records,
DEH_SIZE * copy_count + copy_records_len);
}
-/* Copy the first (if last_first == FIRST_TO_LAST) or last (last_first == LAST_TO_FIRST) item or
- part of it or nothing (see the return 0 below) from SOURCE to the end
- (if last_first) or beginning (!last_first) of the DEST */
+/*
+ * Copy the first (if last_first == FIRST_TO_LAST) or last
+ * (last_first == LAST_TO_FIRST) item or part of it or nothing
+ * (see the return 0 below) from SOURCE to the end (if last_first)
+ * or beginning (!last_first) of the DEST
+ */
/* returns 1 if anything was copied, else 0 */
static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
struct buffer_head *src, int last_first,
int bytes_or_entries)
{
struct buffer_head *dest = dest_bi->bi_bh;
- int dest_nr_item, src_nr_item; /* number of items in the source and destination buffers */
+ /* number of items in the source and destination buffers */
+ int dest_nr_item, src_nr_item;
struct item_head *ih;
struct item_head *dih;
dest_nr_item = B_NR_ITEMS(dest);
+ /*
+ * if ( DEST is empty or first item of SOURCE and last item of
+ * DEST are the items of different objects or of different types )
+ * then there is no need to treat this item differently from the
+ * other items that we copy, so we return
+ */
if (last_first == FIRST_TO_LAST) {
- /* if ( DEST is empty or first item of SOURCE and last item of DEST are the items of different objects
- or of different types ) then there is no need to treat this item differently from the other items
- that we copy, so we return */
- ih = B_N_PITEM_HEAD(src, 0);
- dih = B_N_PITEM_HEAD(dest, dest_nr_item - 1);
+ ih = item_head(src, 0);
+ dih = item_head(dest, dest_nr_item - 1);
+
+ /* there is nothing to merge */
if (!dest_nr_item
- || (!op_is_left_mergeable(&(ih->ih_key), src->b_size)))
- /* there is nothing to merge */
+ || (!op_is_left_mergeable(&ih->ih_key, src->b_size)))
return 0;
RFALSE(!ih_item_len(ih),
@@ -157,8 +170,11 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
return 1;
}
- /* copy part of the body of the first item of SOURCE to the end of the body of the last item of the DEST
- part defined by 'bytes_or_entries'; if bytes_or_entries == -1 copy whole body; don't create new item header
+ /*
+ * copy part of the body of the first item of SOURCE
+ * to the end of the body of the last item of the DEST
+ * part defined by 'bytes_or_entries'; if bytes_or_entries
+ * == -1 copy whole body; don't create new item header
*/
if (bytes_or_entries == -1)
bytes_or_entries = ih_item_len(ih);
@@ -176,11 +192,13 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
}
#endif
- /* merge first item (or its part) of src buffer with the last
- item of dest buffer. Both are of the same file */
+ /*
+ * merge first item (or its part) of src buffer with the last
+ * item of dest buffer. Both are of the same file
+ */
leaf_paste_in_buffer(dest_bi,
dest_nr_item - 1, ih_item_len(dih),
- bytes_or_entries, B_I_PITEM(src, ih), 0);
+ bytes_or_entries, ih_item_body(src, ih), 0);
if (is_indirect_le_ih(dih)) {
RFALSE(get_ih_free_space(dih),
@@ -195,19 +213,23 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
/* copy boundary item to right (last_first == LAST_TO_FIRST) */
- /* ( DEST is empty or last item of SOURCE and first item of DEST
- are the items of different object or of different types )
+ /*
+ * (DEST is empty or last item of SOURCE and first item of DEST
+ * are the items of different object or of different types)
*/
src_nr_item = B_NR_ITEMS(src);
- ih = B_N_PITEM_HEAD(src, src_nr_item - 1);
- dih = B_N_PITEM_HEAD(dest, 0);
+ ih = item_head(src, src_nr_item - 1);
+ dih = item_head(dest, 0);
- if (!dest_nr_item || !op_is_left_mergeable(&(dih->ih_key), src->b_size))
+ if (!dest_nr_item || !op_is_left_mergeable(&dih->ih_key, src->b_size))
return 0;
if (is_direntry_le_ih(ih)) {
+ /*
+ * bytes_or_entries = entries number in last
+ * item body of SOURCE
+ */
if (bytes_or_entries == -1)
- /* bytes_or_entries = entries number in last item body of SOURCE */
bytes_or_entries = ih_entry_count(ih);
leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
@@ -217,9 +239,11 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
return 1;
}
- /* copy part of the body of the last item of SOURCE to the begin of the body of the first item of the DEST;
- part defined by 'bytes_or_entries'; if byte_or_entriess == -1 copy whole body; change first item key of the DEST;
- don't create new item header
+ /*
+ * copy part of the body of the last item of SOURCE to the
+ * begin of the body of the first item of the DEST; part defined
+ * by 'bytes_or_entries'; if byte_or_entriess == -1 copy whole body;
+ * change first item key of the DEST; don't create new item header
*/
RFALSE(is_indirect_le_ih(ih) && get_ih_free_space(ih),
@@ -270,15 +294,18 @@ static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
}
leaf_paste_in_buffer(dest_bi, 0, 0, bytes_or_entries,
- B_I_PITEM(src,
+ ih_item_body(src,
ih) + ih_item_len(ih) - bytes_or_entries,
0);
return 1;
}
-/* copy cpy_mun items from buffer src to buffer dest
- * last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning from first-th item in src to tail of dest
- * last_first == LAST_TO_FIRST means, that we copy cpy_num items beginning from first-th item in src to head of dest
+/*
+ * copy cpy_mun items from buffer src to buffer dest
+ * last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning
+ * from first-th item in src to tail of dest
+ * last_first == LAST_TO_FIRST means, that we copy cpy_num items beginning
+ * from first-th item in src to head of dest
*/
static void leaf_copy_items_entirely(struct buffer_info *dest_bi,
struct buffer_head *src, int last_first,
@@ -311,11 +338,14 @@ static void leaf_copy_items_entirely(struct buffer_info *dest_bi,
nr = blkh_nr_item(blkh);
free_space = blkh_free_space(blkh);
- /* we will insert items before 0-th or nr-th item in dest buffer. It depends of last_first parameter */
+ /*
+ * we will insert items before 0-th or nr-th item in dest buffer.
+ * It depends of last_first parameter
+ */
dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr;
/* location of head of first new item */
- ih = B_N_PITEM_HEAD(dest, dest_before);
+ ih = item_head(dest, dest_before);
RFALSE(blkh_free_space(blkh) < cpy_num * IH_SIZE,
"vs-10140: not enough free space for headers %d (needed %d)",
@@ -325,7 +355,7 @@ static void leaf_copy_items_entirely(struct buffer_info *dest_bi,
memmove(ih + cpy_num, ih, (nr - dest_before) * IH_SIZE);
/* copy item headers */
- memcpy(ih, B_N_PITEM_HEAD(src, first), cpy_num * IH_SIZE);
+ memcpy(ih, item_head(src, first), cpy_num * IH_SIZE);
free_space -= (IH_SIZE * cpy_num);
set_blkh_free_space(blkh, free_space);
@@ -338,8 +368,8 @@ static void leaf_copy_items_entirely(struct buffer_info *dest_bi,
}
/* prepare space for items */
- last_loc = ih_location(&(ih[nr + cpy_num - 1 - dest_before]));
- last_inserted_loc = ih_location(&(ih[cpy_num - 1]));
+ last_loc = ih_location(&ih[nr + cpy_num - 1 - dest_before]);
+ last_inserted_loc = ih_location(&ih[cpy_num - 1]);
/* check free space */
RFALSE(free_space < j - last_inserted_loc,
@@ -352,7 +382,8 @@ static void leaf_copy_items_entirely(struct buffer_info *dest_bi,
/* copy items */
memcpy(dest->b_data + last_inserted_loc,
- B_N_PITEM(src, (first + cpy_num - 1)), j - last_inserted_loc);
+ item_body(src, (first + cpy_num - 1)),
+ j - last_inserted_loc);
/* sizes, item number */
set_blkh_nr_item(blkh, nr + cpy_num);
@@ -376,8 +407,10 @@ static void leaf_copy_items_entirely(struct buffer_info *dest_bi,
}
}
-/* This function splits the (liquid) item into two items (useful when
- shifting part of an item into another node.) */
+/*
+ * This function splits the (liquid) item into two items (useful when
+ * shifting part of an item into another node.)
+ */
static void leaf_item_bottle(struct buffer_info *dest_bi,
struct buffer_head *src, int last_first,
int item_num, int cpy_bytes)
@@ -389,17 +422,22 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
"vs-10170: bytes == - 1 means: do not split item");
if (last_first == FIRST_TO_LAST) {
- /* if ( if item in position item_num in buffer SOURCE is directory item ) */
- ih = B_N_PITEM_HEAD(src, item_num);
+ /*
+ * if ( if item in position item_num in buffer SOURCE
+ * is directory item )
+ */
+ ih = item_head(src, item_num);
if (is_direntry_le_ih(ih))
leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST,
item_num, 0, cpy_bytes);
else {
struct item_head n_ih;
- /* copy part of the body of the item number 'item_num' of SOURCE to the end of the DEST
- part defined by 'cpy_bytes'; create new item header; change old item_header (????);
- n_ih = new item_header;
+ /*
+ * copy part of the body of the item number 'item_num'
+ * of SOURCE to the end of the DEST part defined by
+ * 'cpy_bytes'; create new item header; change old
+ * item_header (????); n_ih = new item_header;
*/
memcpy(&n_ih, ih, IH_SIZE);
put_ih_item_len(&n_ih, cpy_bytes);
@@ -411,30 +449,36 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
set_ih_free_space(&n_ih, 0);
}
- RFALSE(op_is_left_mergeable(&(ih->ih_key), src->b_size),
+ RFALSE(op_is_left_mergeable(&ih->ih_key, src->b_size),
"vs-10190: bad mergeability of item %h", ih);
n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */
leaf_insert_into_buf(dest_bi, B_NR_ITEMS(dest), &n_ih,
- B_N_PITEM(src, item_num), 0);
+ item_body(src, item_num), 0);
}
} else {
- /* if ( if item in position item_num in buffer SOURCE is directory item ) */
- ih = B_N_PITEM_HEAD(src, item_num);
+ /*
+ * if ( if item in position item_num in buffer
+ * SOURCE is directory item )
+ */
+ ih = item_head(src, item_num);
if (is_direntry_le_ih(ih))
leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
item_num,
- I_ENTRY_COUNT(ih) - cpy_bytes,
+ ih_entry_count(ih) - cpy_bytes,
cpy_bytes);
else {
struct item_head n_ih;
- /* copy part of the body of the item number 'item_num' of SOURCE to the begin of the DEST
- part defined by 'cpy_bytes'; create new item header;
- n_ih = new item_header;
+ /*
+ * copy part of the body of the item number 'item_num'
+ * of SOURCE to the begin of the DEST part defined by
+ * 'cpy_bytes'; create new item header;
+ * n_ih = new item_header;
*/
memcpy(&n_ih, ih, SHORT_KEY_SIZE);
- n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */
+ /* Endian safe, both le */
+ n_ih.ih_version = ih->ih_version;
if (is_direct_le_ih(ih)) {
set_le_ih_k_offset(&n_ih,
@@ -458,20 +502,22 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
/* set item length */
put_ih_item_len(&n_ih, cpy_bytes);
- n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */
+ /* Endian safe, both le */
+ n_ih.ih_version = ih->ih_version;
leaf_insert_into_buf(dest_bi, 0, &n_ih,
- B_N_PITEM(src,
- item_num) +
- ih_item_len(ih) - cpy_bytes, 0);
+ item_body(src, item_num) +
+ ih_item_len(ih) - cpy_bytes, 0);
}
}
}
-/* If cpy_bytes equals minus one than copy cpy_num whole items from SOURCE to DEST.
- If cpy_bytes not equal to minus one than copy cpy_num-1 whole items from SOURCE to DEST.
- From last item copy cpy_num bytes for regular item and cpy_num directory entries for
- directory item. */
+/*
+ * If cpy_bytes equals minus one than copy cpy_num whole items from SOURCE
+ * to DEST. If cpy_bytes not equal to minus one than copy cpy_num-1 whole
+ * items from SOURCE to DEST. From last item copy cpy_num bytes for regular
+ * item and cpy_num directory entries for directory item.
+ */
static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src,
int last_first, int cpy_num, int cpy_bytes)
{
@@ -498,22 +544,34 @@ static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src,
else
bytes = -1;
- /* copy the first item or it part or nothing to the end of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes)) */
+ /*
+ * copy the first item or it part or nothing to the end of
+ * the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes))
+ */
i = leaf_copy_boundary_item(dest_bi, src, FIRST_TO_LAST, bytes);
cpy_num -= i;
if (cpy_num == 0)
return i;
pos += i;
if (cpy_bytes == -1)
- /* copy first cpy_num items starting from position 'pos' of SOURCE to end of DEST */
+ /*
+ * copy first cpy_num items starting from position
+ * 'pos' of SOURCE to end of DEST
+ */
leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST,
pos, cpy_num);
else {
- /* copy first cpy_num-1 items starting from position 'pos-1' of the SOURCE to the end of the DEST */
+ /*
+ * copy first cpy_num-1 items starting from position
+ * 'pos-1' of the SOURCE to the end of the DEST
+ */
leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST,
pos, cpy_num - 1);
- /* copy part of the item which number is cpy_num+pos-1 to the end of the DEST */
+ /*
+ * copy part of the item which number is
+ * cpy_num+pos-1 to the end of the DEST
+ */
leaf_item_bottle(dest_bi, src, FIRST_TO_LAST,
cpy_num + pos - 1, cpy_bytes);
}
@@ -525,7 +583,11 @@ static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src,
else
bytes = -1;
- /* copy the last item or it part or nothing to the begin of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes)); */
+ /*
+ * copy the last item or it part or nothing to the
+ * begin of the DEST
+ * (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes));
+ */
i = leaf_copy_boundary_item(dest_bi, src, LAST_TO_FIRST, bytes);
cpy_num -= i;
@@ -534,15 +596,24 @@ static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src,
pos = src_nr_item - cpy_num - i;
if (cpy_bytes == -1) {
- /* starting from position 'pos' copy last cpy_num items of SOURCE to begin of DEST */
+ /*
+ * starting from position 'pos' copy last cpy_num
+ * items of SOURCE to begin of DEST
+ */
leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST,
pos, cpy_num);
} else {
- /* copy last cpy_num-1 items starting from position 'pos+1' of the SOURCE to the begin of the DEST; */
+ /*
+ * copy last cpy_num-1 items starting from position
+ * 'pos+1' of the SOURCE to the begin of the DEST;
+ */
leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST,
pos + 1, cpy_num - 1);
- /* copy part of the item which number is pos to the begin of the DEST */
+ /*
+ * copy part of the item which number is pos to
+ * the begin of the DEST
+ */
leaf_item_bottle(dest_bi, src, LAST_TO_FIRST, pos,
cpy_bytes);
}
@@ -550,9 +621,11 @@ static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src,
return i;
}
-/* there are types of coping: from S[0] to L[0], from S[0] to R[0],
- from R[0] to L[0]. for each of these we have to define parent and
- positions of destination and source buffers */
+/*
+ * there are types of coping: from S[0] to L[0], from S[0] to R[0],
+ * from R[0] to L[0]. for each of these we have to define parent and
+ * positions of destination and source buffers
+ */
static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb,
struct buffer_info *dest_bi,
struct buffer_info *src_bi,
@@ -568,7 +641,9 @@ static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb,
src_bi->tb = tb;
src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
- src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); /* src->b_item_order */
+
+ /* src->b_item_order */
+ src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
dest_bi->tb = tb;
dest_bi->bi_bh = tb->L[0];
dest_bi->bi_parent = tb->FL[0];
@@ -633,8 +708,10 @@ static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb,
shift_mode, src_bi->bi_bh, dest_bi->bi_bh);
}
-/* copy mov_num items and mov_bytes of the (mov_num-1)th item to
- neighbor. Delete them from source */
+/*
+ * copy mov_num items and mov_bytes of the (mov_num-1)th item to
+ * neighbor. Delete them from source
+ */
int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num,
int mov_bytes, struct buffer_head *Snew)
{
@@ -657,18 +734,24 @@ int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num,
return ret_value;
}
-/* Shift shift_num items (and shift_bytes of last shifted item if shift_bytes != -1)
- from S[0] to L[0] and replace the delimiting key */
+/*
+ * Shift shift_num items (and shift_bytes of last shifted item if
+ * shift_bytes != -1) from S[0] to L[0] and replace the delimiting key
+ */
int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes)
{
struct buffer_head *S0 = PATH_PLAST_BUFFER(tb->tb_path);
int i;
- /* move shift_num (and shift_bytes bytes) items from S[0] to left neighbor L[0] */
+ /*
+ * move shift_num (and shift_bytes bytes) items from S[0]
+ * to left neighbor L[0]
+ */
i = leaf_move_items(LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL);
if (shift_num) {
- if (B_NR_ITEMS(S0) == 0) { /* number of items in S[0] == 0 */
+ /* number of items in S[0] == 0 */
+ if (B_NR_ITEMS(S0) == 0) {
RFALSE(shift_bytes != -1,
"vs-10270: S0 is empty now, but shift_bytes != -1 (%d)",
@@ -691,10 +774,10 @@ int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes)
replace_key(tb, tb->CFL[0], tb->lkey[0], S0, 0);
RFALSE((shift_bytes != -1 &&
- !(is_direntry_le_ih(B_N_PITEM_HEAD(S0, 0))
- && !I_ENTRY_COUNT(B_N_PITEM_HEAD(S0, 0)))) &&
+ !(is_direntry_le_ih(item_head(S0, 0))
+ && !ih_entry_count(item_head(S0, 0)))) &&
(!op_is_left_mergeable
- (B_N_PKEY(S0, 0), S0->b_size)),
+ (leaf_key(S0, 0), S0->b_size)),
"vs-10280: item must be mergeable");
}
}
@@ -704,13 +787,18 @@ int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes)
/* CLEANING STOPPED HERE */
-/* Shift shift_num (shift_bytes) items from S[0] to the right neighbor, and replace the delimiting key */
+/*
+ * Shift shift_num (shift_bytes) items from S[0] to the right neighbor,
+ * and replace the delimiting key
+ */
int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes)
{
- // struct buffer_head * S0 = PATH_PLAST_BUFFER (tb->tb_path);
int ret_value;
- /* move shift_num (and shift_bytes) items from S[0] to right neighbor R[0] */
+ /*
+ * move shift_num (and shift_bytes) items from S[0] to
+ * right neighbor R[0]
+ */
ret_value =
leaf_move_items(LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL);
@@ -725,12 +813,16 @@ int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes)
static void leaf_delete_items_entirely(struct buffer_info *bi,
int first, int del_num);
-/* If del_bytes == -1, starting from position 'first' delete del_num items in whole in buffer CUR.
- If not.
- If last_first == 0. Starting from position 'first' delete del_num-1 items in whole. Delete part of body of
- the first item. Part defined by del_bytes. Don't delete first item header
- If last_first == 1. Starting from position 'first+1' delete del_num-1 items in whole. Delete part of body of
- the last item . Part defined by del_bytes. Don't delete last item header.
+/*
+ * If del_bytes == -1, starting from position 'first' delete del_num
+ * items in whole in buffer CUR.
+ * If not.
+ * If last_first == 0. Starting from position 'first' delete del_num-1
+ * items in whole. Delete part of body of the first item. Part defined by
+ * del_bytes. Don't delete first item header
+ * If last_first == 1. Starting from position 'first+1' delete del_num-1
+ * items in whole. Delete part of body of the last item . Part defined by
+ * del_bytes. Don't delete last item header.
*/
void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
int first, int del_num, int del_bytes)
@@ -761,32 +853,43 @@ void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
leaf_delete_items_entirely(cur_bi, first, del_num);
else {
if (last_first == FIRST_TO_LAST) {
- /* delete del_num-1 items beginning from item in position first */
+ /*
+ * delete del_num-1 items beginning from
+ * item in position first
+ */
leaf_delete_items_entirely(cur_bi, first, del_num - 1);
- /* delete the part of the first item of the bh
- do not delete item header
+ /*
+ * delete the part of the first item of the bh
+ * do not delete item header
*/
leaf_cut_from_buffer(cur_bi, 0, 0, del_bytes);
} else {
struct item_head *ih;
int len;
- /* delete del_num-1 items beginning from item in position first+1 */
+ /*
+ * delete del_num-1 items beginning from
+ * item in position first+1
+ */
leaf_delete_items_entirely(cur_bi, first + 1,
del_num - 1);
- ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1);
+ ih = item_head(bh, B_NR_ITEMS(bh) - 1);
if (is_direntry_le_ih(ih))
/* the last item is directory */
- /* len = numbers of directory entries in this item */
+ /*
+ * len = numbers of directory entries
+ * in this item
+ */
len = ih_entry_count(ih);
else
/* len = body len of item */
len = ih_item_len(ih);
- /* delete the part of the last item of the bh
- do not delete item header
+ /*
+ * delete the part of the last item of the bh
+ * do not delete item header
*/
leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1,
len - del_bytes, del_bytes);
@@ -820,10 +923,10 @@ void leaf_insert_into_buf(struct buffer_info *bi, int before,
zeros_number, ih_item_len(inserted_item_ih));
/* get item new item must be inserted before */
- ih = B_N_PITEM_HEAD(bh, before);
+ ih = item_head(bh, before);
/* prepare space for the body of new item */
- last_loc = nr ? ih_location(&(ih[nr - before - 1])) : bh->b_size;
+ last_loc = nr ? ih_location(&ih[nr - before - 1]) : bh->b_size;
unmoved_loc = before ? ih_location(ih - 1) : bh->b_size;
memmove(bh->b_data + last_loc - ih_item_len(inserted_item_ih),
@@ -846,8 +949,8 @@ void leaf_insert_into_buf(struct buffer_info *bi, int before,
/* change locations */
for (i = before; i < nr + 1; i++) {
- unmoved_loc -= ih_item_len(&(ih[i - before]));
- put_ih_location(&(ih[i - before]), unmoved_loc);
+ unmoved_loc -= ih_item_len(&ih[i - before]);
+ put_ih_location(&ih[i - before], unmoved_loc);
}
/* sizes, free space, item number */
@@ -867,8 +970,10 @@ void leaf_insert_into_buf(struct buffer_info *bi, int before,
}
}
-/* paste paste_size bytes to affected_item_num-th item.
- When item is a directory, this only prepare space for new entries */
+/*
+ * paste paste_size bytes to affected_item_num-th item.
+ * When item is a directory, this only prepare space for new entries
+ */
void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num,
int pos_in_item, int paste_size,
const char *body, int zeros_number)
@@ -902,9 +1007,9 @@ void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num,
#endif /* CONFIG_REISERFS_CHECK */
/* item to be appended */
- ih = B_N_PITEM_HEAD(bh, affected_item_num);
+ ih = item_head(bh, affected_item_num);
- last_loc = ih_location(&(ih[nr - affected_item_num - 1]));
+ last_loc = ih_location(&ih[nr - affected_item_num - 1]);
unmoved_loc = affected_item_num ? ih_location(ih - 1) : bh->b_size;
/* prepare space */
@@ -913,8 +1018,8 @@ void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num,
/* change locations */
for (i = affected_item_num; i < nr; i++)
- put_ih_location(&(ih[i - affected_item_num]),
- ih_location(&(ih[i - affected_item_num])) -
+ put_ih_location(&ih[i - affected_item_num],
+ ih_location(&ih[i - affected_item_num]) -
paste_size);
if (body) {
@@ -957,10 +1062,12 @@ void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num,
}
}
-/* cuts DEL_COUNT entries beginning from FROM-th entry. Directory item
- does not have free space, so it moves DEHs and remaining records as
- necessary. Return value is size of removed part of directory item
- in bytes. */
+/*
+ * cuts DEL_COUNT entries beginning from FROM-th entry. Directory item
+ * does not have free space, so it moves DEHs and remaining records as
+ * necessary. Return value is size of removed part of directory item
+ * in bytes.
+ */
static int leaf_cut_entries(struct buffer_head *bh,
struct item_head *ih, int from, int del_count)
{
@@ -971,12 +1078,14 @@ static int leaf_cut_entries(struct buffer_head *bh,
int cut_records_len; /* length of all removed records */
int i;
- /* make sure, that item is directory and there are enough entries to
- remove */
+ /*
+ * make sure that item is directory and there are enough entries to
+ * remove
+ */
RFALSE(!is_direntry_le_ih(ih), "10180: item is not directory item");
- RFALSE(I_ENTRY_COUNT(ih) < from + del_count,
+ RFALSE(ih_entry_count(ih) < from + del_count,
"10185: item contains not enough entries: entry_count = %d, from = %d, to delete = %d",
- I_ENTRY_COUNT(ih), from, del_count);
+ ih_entry_count(ih), from, del_count);
if (del_count == 0)
return 0;
@@ -987,22 +1096,24 @@ static int leaf_cut_entries(struct buffer_head *bh,
/* entry head array */
deh = B_I_DEH(bh, ih);
- /* first byte of remaining entries, those are BEFORE cut entries
- (prev_record) and length of all removed records (cut_records_len) */
+ /*
+ * first byte of remaining entries, those are BEFORE cut entries
+ * (prev_record) and length of all removed records (cut_records_len)
+ */
prev_record_offset =
- (from ? deh_location(&(deh[from - 1])) : ih_item_len(ih));
+ (from ? deh_location(&deh[from - 1]) : ih_item_len(ih));
cut_records_len = prev_record_offset /*from_record */ -
- deh_location(&(deh[from + del_count - 1]));
+ deh_location(&deh[from + del_count - 1]);
prev_record = item + prev_record_offset;
/* adjust locations of remaining entries */
- for (i = I_ENTRY_COUNT(ih) - 1; i > from + del_count - 1; i--)
- put_deh_location(&(deh[i]),
+ for (i = ih_entry_count(ih) - 1; i > from + del_count - 1; i--)
+ put_deh_location(&deh[i],
deh_location(&deh[i]) -
(DEH_SIZE * del_count));
for (i = 0; i < from; i++)
- put_deh_location(&(deh[i]),
+ put_deh_location(&deh[i],
deh_location(&deh[i]) - (DEH_SIZE * del_count +
cut_records_len));
@@ -1021,14 +1132,15 @@ static int leaf_cut_entries(struct buffer_head *bh,
return DEH_SIZE * del_count + cut_records_len;
}
-/* when cut item is part of regular file
- pos_in_item - first byte that must be cut
- cut_size - number of bytes to be cut beginning from pos_in_item
-
- when cut item is part of directory
- pos_in_item - number of first deleted entry
- cut_size - count of deleted entries
- */
+/*
+ * when cut item is part of regular file
+ * pos_in_item - first byte that must be cut
+ * cut_size - number of bytes to be cut beginning from pos_in_item
+ *
+ * when cut item is part of directory
+ * pos_in_item - number of first deleted entry
+ * cut_size - count of deleted entries
+ */
void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
int pos_in_item, int cut_size)
{
@@ -1043,7 +1155,7 @@ void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
nr = blkh_nr_item(blkh);
/* item head of truncated item */
- ih = B_N_PITEM_HEAD(bh, cut_item_num);
+ ih = item_head(bh, cut_item_num);
if (is_direntry_le_ih(ih)) {
/* first cut entry () */
@@ -1055,7 +1167,6 @@ void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
cut_item_num);
/* change item key by key of first entry in the item */
set_le_ih_k_offset(ih, deh_offset(B_I_DEH(bh, ih)));
- /*memcpy (&ih->ih_key.k_offset, &(B_I_DEH (bh, ih)->deh_offset), SHORT_KEY_SIZE); */
}
} else {
/* item is direct or indirect */
@@ -1089,7 +1200,7 @@ void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
}
/* location of the last item */
- last_loc = ih_location(&(ih[nr - cut_item_num - 1]));
+ last_loc = ih_location(&ih[nr - cut_item_num - 1]);
/* location of the item, which is remaining at the same place */
unmoved_loc = cut_item_num ? ih_location(ih - 1) : bh->b_size;
@@ -1108,7 +1219,7 @@ void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
/* change locations */
for (i = cut_item_num; i < nr; i++)
- put_ih_location(&(ih[i - cut_item_num]),
+ put_ih_location(&ih[i - cut_item_num],
ih_location(&ih[i - cut_item_num]) + cut_size);
/* size, free space */
@@ -1156,14 +1267,14 @@ static void leaf_delete_items_entirely(struct buffer_info *bi,
return;
}
- ih = B_N_PITEM_HEAD(bh, first);
+ ih = item_head(bh, first);
/* location of unmovable item */
j = (first == 0) ? bh->b_size : ih_location(ih - 1);
/* delete items */
- last_loc = ih_location(&(ih[nr - 1 - first]));
- last_removed_loc = ih_location(&(ih[del_num - 1]));
+ last_loc = ih_location(&ih[nr - 1 - first]);
+ last_removed_loc = ih_location(&ih[del_num - 1]);
memmove(bh->b_data + last_loc + j - last_removed_loc,
bh->b_data + last_loc, last_removed_loc - last_loc);
@@ -1173,8 +1284,8 @@ static void leaf_delete_items_entirely(struct buffer_info *bi,
/* change item location */
for (i = first; i < nr - del_num; i++)
- put_ih_location(&(ih[i - first]),
- ih_location(&(ih[i - first])) + (j -
+ put_ih_location(&ih[i - first],
+ ih_location(&ih[i - first]) + (j -
last_removed_loc));
/* sizes, item number */
@@ -1195,7 +1306,10 @@ static void leaf_delete_items_entirely(struct buffer_info *bi,
}
}
-/* paste new_entry_count entries (new_dehs, records) into position before to item_num-th item */
+/*
+ * paste new_entry_count entries (new_dehs, records) into position
+ * before to item_num-th item
+ */
void leaf_paste_entries(struct buffer_info *bi,
int item_num,
int before,
@@ -1213,13 +1327,16 @@ void leaf_paste_entries(struct buffer_info *bi,
if (new_entry_count == 0)
return;
- ih = B_N_PITEM_HEAD(bh, item_num);
+ ih = item_head(bh, item_num);
- /* make sure, that item is directory, and there are enough records in it */
+ /*
+ * make sure, that item is directory, and there are enough
+ * records in it
+ */
RFALSE(!is_direntry_le_ih(ih), "10225: item is not directory item");
- RFALSE(I_ENTRY_COUNT(ih) < before,
+ RFALSE(ih_entry_count(ih) < before,
"10230: there are no entry we paste entries before. entry_count = %d, before = %d",
- I_ENTRY_COUNT(ih), before);
+ ih_entry_count(ih), before);
/* first byte of dest item */
item = bh->b_data + ih_location(ih);
@@ -1230,21 +1347,21 @@ void leaf_paste_entries(struct buffer_info *bi,
/* new records will be pasted at this point */
insert_point =
item +
- (before ? deh_location(&(deh[before - 1]))
+ (before ? deh_location(&deh[before - 1])
: (ih_item_len(ih) - paste_size));
/* adjust locations of records that will be AFTER new records */
- for (i = I_ENTRY_COUNT(ih) - 1; i >= before; i--)
- put_deh_location(&(deh[i]),
- deh_location(&(deh[i])) +
+ for (i = ih_entry_count(ih) - 1; i >= before; i--)
+ put_deh_location(&deh[i],
+ deh_location(&deh[i]) +
(DEH_SIZE * new_entry_count));
/* adjust locations of records that will be BEFORE new records */
for (i = 0; i < before; i++)
- put_deh_location(&(deh[i]),
- deh_location(&(deh[i])) + paste_size);
+ put_deh_location(&deh[i],
+ deh_location(&deh[i]) + paste_size);
- old_entry_num = I_ENTRY_COUNT(ih);
+ old_entry_num = ih_entry_count(ih);
put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count);
/* prepare space for pasted records */
@@ -1266,10 +1383,10 @@ void leaf_paste_entries(struct buffer_info *bi,
/* set locations of new records */
for (i = 0; i < new_entry_count; i++) {
- put_deh_location(&(deh[i]),
- deh_location(&(deh[i])) +
+ put_deh_location(&deh[i],
+ deh_location(&deh[i]) +
(-deh_location
- (&(new_dehs[new_entry_count - 1])) +
+ (&new_dehs[new_entry_count - 1]) +
insert_point + DEH_SIZE * new_entry_count -
item));
}
@@ -1277,28 +1394,26 @@ void leaf_paste_entries(struct buffer_info *bi,
/* change item key if necessary (when we paste before 0-th entry */
if (!before) {
set_le_ih_k_offset(ih, deh_offset(new_dehs));
-/* memcpy (&ih->ih_key.k_offset,
- &new_dehs->deh_offset, SHORT_KEY_SIZE);*/
}
#ifdef CONFIG_REISERFS_CHECK
{
int prev, next;
/* check record locations */
deh = B_I_DEH(bh, ih);
- for (i = 0; i < I_ENTRY_COUNT(ih); i++) {
+ for (i = 0; i < ih_entry_count(ih); i++) {
next =
(i <
- I_ENTRY_COUNT(ih) -
- 1) ? deh_location(&(deh[i + 1])) : 0;
- prev = (i != 0) ? deh_location(&(deh[i - 1])) : 0;
+ ih_entry_count(ih) -
+ 1) ? deh_location(&deh[i + 1]) : 0;
+ prev = (i != 0) ? deh_location(&deh[i - 1]) : 0;
- if (prev && prev <= deh_location(&(deh[i])))
+ if (prev && prev <= deh_location(&deh[i]))
reiserfs_error(sb_from_bi(bi), "vs-10240",
"directory item (%h) "
"corrupted (prev %a, "
"cur(%d) %a)",
ih, deh + i - 1, i, deh + i);
- if (next && next >= deh_location(&(deh[i])))
+ if (next && next >= deh_location(&deh[i]))
reiserfs_error(sb_from_bi(bi), "vs-10250",
"directory item (%h) "
"corrupted (cur(%d) %a, "
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index dc5236f6de1..cd11358b10c 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -22,8 +22,10 @@
#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { inc_nlink(i); if (i->i_nlink >= REISERFS_LINK_MAX) set_nlink(i, 1); }
#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) drop_nlink(i);
-// directory item contains array of entry headers. This performs
-// binary search through that array
+/*
+ * directory item contains array of entry headers. This performs
+ * binary search through that array
+ */
static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off)
{
struct item_head *ih = de->de_ih;
@@ -31,7 +33,7 @@ static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off)
int rbound, lbound, j;
lbound = 0;
- rbound = I_ENTRY_COUNT(ih) - 1;
+ rbound = ih_entry_count(ih) - 1;
for (j = (rbound + lbound) / 2; lbound <= rbound;
j = (rbound + lbound) / 2) {
@@ -43,7 +45,7 @@ static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off)
lbound = j + 1;
continue;
}
- // this is not name found, but matched third key component
+ /* this is not name found, but matched third key component */
de->de_entry_num = j;
return NAME_FOUND;
}
@@ -52,17 +54,21 @@ static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off)
return NAME_NOT_FOUND;
}
-// comment? maybe something like set de to point to what the path points to?
+/*
+ * comment? maybe something like set de to point to what the path points to?
+ */
static inline void set_de_item_location(struct reiserfs_dir_entry *de,
struct treepath *path)
{
de->de_bh = get_last_bh(path);
- de->de_ih = get_ih(path);
+ de->de_ih = tp_item_head(path);
de->de_deh = B_I_DEH(de->de_bh, de->de_ih);
de->de_item_num = PATH_LAST_POSITION(path);
}
-// de_bh, de_ih, de_deh (points to first element of array), de_item_num is set
+/*
+ * de_bh, de_ih, de_deh (points to first element of array), de_item_num is set
+ */
inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de)
{
struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num;
@@ -71,17 +77,17 @@ inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de)
de->de_entrylen = entry_length(de->de_bh, de->de_ih, de->de_entry_num);
de->de_namelen = de->de_entrylen - (de_with_sd(deh) ? SD_SIZE : 0);
- de->de_name = B_I_PITEM(de->de_bh, de->de_ih) + deh_location(deh);
+ de->de_name = ih_item_body(de->de_bh, de->de_ih) + deh_location(deh);
if (de->de_name[de->de_namelen - 1] == 0)
de->de_namelen = strlen(de->de_name);
}
-// what entry points to
+/* what entry points to */
static inline void set_de_object_key(struct reiserfs_dir_entry *de)
{
BUG_ON(de->de_entry_num >= ih_entry_count(de->de_ih));
- de->de_dir_id = deh_dir_id(&(de->de_deh[de->de_entry_num]));
- de->de_objectid = deh_objectid(&(de->de_deh[de->de_entry_num]));
+ de->de_dir_id = deh_dir_id(&de->de_deh[de->de_entry_num]);
+ de->de_objectid = deh_objectid(&de->de_deh[de->de_entry_num]);
}
static inline void store_de_entry_key(struct reiserfs_dir_entry *de)
@@ -96,21 +102,20 @@ static inline void store_de_entry_key(struct reiserfs_dir_entry *de)
le32_to_cpu(de->de_ih->ih_key.k_dir_id);
de->de_entry_key.on_disk_key.k_objectid =
le32_to_cpu(de->de_ih->ih_key.k_objectid);
- set_cpu_key_k_offset(&(de->de_entry_key), deh_offset(deh));
- set_cpu_key_k_type(&(de->de_entry_key), TYPE_DIRENTRY);
+ set_cpu_key_k_offset(&de->de_entry_key, deh_offset(deh));
+ set_cpu_key_k_type(&de->de_entry_key, TYPE_DIRENTRY);
}
-/* We assign a key to each directory item, and place multiple entries
-in a single directory item. A directory item has a key equal to the
-key of the first directory entry in it.
-
-This function first calls search_by_key, then, if item whose first
-entry matches is not found it looks for the entry inside directory
-item found by search_by_key. Fills the path to the entry, and to the
-entry position in the item
-
-*/
-
+/*
+ * We assign a key to each directory item, and place multiple entries in a
+ * single directory item. A directory item has a key equal to the key of
+ * the first directory entry in it.
+
+ * This function first calls search_by_key, then, if item whose first entry
+ * matches is not found it looks for the entry inside directory item found
+ * by search_by_key. Fills the path to the entry, and to the entry position
+ * in the item
+ */
/* The function is NOT SCHEDULE-SAFE! */
int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
struct treepath *path, struct reiserfs_dir_entry *de)
@@ -144,7 +149,7 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
#ifdef CONFIG_REISERFS_CHECK
if (!is_direntry_le_ih(de->de_ih) ||
- COMP_SHORT_KEYS(&(de->de_ih->ih_key), key)) {
+ COMP_SHORT_KEYS(&de->de_ih->ih_key, key)) {
print_block(de->de_bh, 0, -1, -1);
reiserfs_panic(sb, "vs-7005", "found item %h is not directory "
"item or does not belong to the same directory "
@@ -152,12 +157,17 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
}
#endif /* CONFIG_REISERFS_CHECK */
- /* binary search in directory item by third componen t of the
- key. sets de->de_entry_num of de */
+ /*
+ * binary search in directory item by third component of the
+ * key. sets de->de_entry_num of de
+ */
retval = bin_search_in_dir_item(de, cpu_key_k_offset(key));
path->pos_in_item = de->de_entry_num;
if (retval != NAME_NOT_FOUND) {
- // ugly, but rename needs de_bh, de_deh, de_name, de_namelen, de_objectid set
+ /*
+ * ugly, but rename needs de_bh, de_deh, de_name,
+ * de_namelen, de_objectid set
+ */
set_de_name_and_namelen(de);
set_de_object_key(de);
}
@@ -166,11 +176,12 @@ int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
/* Keyed 32-bit hash function using TEA in a Davis-Meyer function */
-/* The third component is hashed, and you can choose from more than
- one hash function. Per directory hashes are not yet implemented
- but are thought about. This function should be moved to hashes.c
- Jedi, please do so. -Hans */
-
+/*
+ * The third component is hashed, and you can choose from more than
+ * one hash function. Per directory hashes are not yet implemented
+ * but are thought about. This function should be moved to hashes.c
+ * Jedi, please do so. -Hans
+ */
static __u32 get_third_component(struct super_block *s,
const char *name, int len)
{
@@ -183,11 +194,13 @@ static __u32 get_third_component(struct super_block *s,
res = REISERFS_SB(s)->s_hash_function(name, len);
- // take bits from 7-th to 30-th including both bounds
+ /* take bits from 7-th to 30-th including both bounds */
res = GET_HASH_VALUE(res);
if (res == 0)
- // needed to have no names before "." and ".." those have hash
- // value == 0 and generation conters 1 and 2 accordingly
+ /*
+ * needed to have no names before "." and ".." those have hash
+ * value == 0 and generation conters 1 and 2 accordingly
+ */
res = 128;
return res + MAX_GENERATION_NUMBER;
}
@@ -208,7 +221,7 @@ static int reiserfs_match(struct reiserfs_dir_entry *de,
/* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */
- /* used when hash collisions exist */
+/* used when hash collisions exist */
static int linear_search_in_dir_item(struct cpu_key *key,
struct reiserfs_dir_entry *de,
@@ -220,7 +233,7 @@ static int linear_search_in_dir_item(struct cpu_key *key,
i = de->de_entry_num;
- if (i == I_ENTRY_COUNT(de->de_ih) ||
+ if (i == ih_entry_count(de->de_ih) ||
GET_HASH_VALUE(deh_offset(deh + i)) !=
GET_HASH_VALUE(cpu_key_k_offset(key))) {
i--;
@@ -232,43 +245,50 @@ static int linear_search_in_dir_item(struct cpu_key *key,
deh += i;
for (; i >= 0; i--, deh--) {
+ /* hash value does not match, no need to check whole name */
if (GET_HASH_VALUE(deh_offset(deh)) !=
GET_HASH_VALUE(cpu_key_k_offset(key))) {
- // hash value does not match, no need to check whole name
return NAME_NOT_FOUND;
}
- /* mark, that this generation number is used */
+ /* mark that this generation number is used */
if (de->de_gen_number_bit_string)
set_bit(GET_GENERATION_NUMBER(deh_offset(deh)),
de->de_gen_number_bit_string);
- // calculate pointer to name and namelen
+ /* calculate pointer to name and namelen */
de->de_entry_num = i;
set_de_name_and_namelen(de);
+ /*
+ * de's de_name, de_namelen, de_recordlen are set.
+ * Fill the rest.
+ */
if ((retval =
reiserfs_match(de, name, namelen)) != NAME_NOT_FOUND) {
- // de's de_name, de_namelen, de_recordlen are set. Fill the rest:
- // key of pointed object
+ /* key of pointed object */
set_de_object_key(de);
store_de_entry_key(de);
- // retval can be NAME_FOUND or NAME_FOUND_INVISIBLE
+ /* retval can be NAME_FOUND or NAME_FOUND_INVISIBLE */
return retval;
}
}
if (GET_GENERATION_NUMBER(le_ih_k_offset(de->de_ih)) == 0)
- /* we have reached left most entry in the node. In common we
- have to go to the left neighbor, but if generation counter
- is 0 already, we know for sure, that there is no name with
- the same hash value */
- // FIXME: this work correctly only because hash value can not
- // be 0. Btw, in case of Yura's hash it is probably possible,
- // so, this is a bug
+ /*
+ * we have reached left most entry in the node. In common we
+ * have to go to the left neighbor, but if generation counter
+ * is 0 already, we know for sure, that there is no name with
+ * the same hash value
+ */
+ /*
+ * FIXME: this work correctly only because hash value can not
+ * be 0. Btw, in case of Yura's hash it is probably possible,
+ * so, this is a bug
+ */
return NAME_NOT_FOUND;
RFALSE(de->de_item_num,
@@ -277,8 +297,10 @@ static int linear_search_in_dir_item(struct cpu_key *key,
return GOTO_PREVIOUS_ITEM;
}
-// may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND
-// FIXME: should add something like IOERROR
+/*
+ * may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND
+ * FIXME: should add something like IOERROR
+ */
static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen,
struct treepath *path_to_entry,
struct reiserfs_dir_entry *de)
@@ -307,13 +329,19 @@ static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen,
retval =
linear_search_in_dir_item(&key_to_search, de, name,
namelen);
+ /*
+ * there is no need to scan directory anymore.
+ * Given entry found or does not exist
+ */
if (retval != GOTO_PREVIOUS_ITEM) {
- /* there is no need to scan directory anymore. Given entry found or does not exist */
path_to_entry->pos_in_item = de->de_entry_num;
return retval;
}
- /* there is left neighboring item of this directory and given entry can be there */
+ /*
+ * there is left neighboring item of this directory
+ * and given entry can be there
+ */
set_cpu_key_k_offset(&key_to_search,
le_ih_k_offset(de->de_ih) - 1);
pathrelse(path_to_entry);
@@ -341,14 +369,16 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
pathrelse(&path_to_entry);
if (retval == NAME_FOUND) {
inode = reiserfs_iget(dir->i_sb,
- (struct cpu_key *)&(de.de_dir_id));
+ (struct cpu_key *)&de.de_dir_id);
if (!inode || IS_ERR(inode)) {
reiserfs_write_unlock(dir->i_sb);
return ERR_PTR(-EACCES);
}
- /* Propagate the private flag so we know we're
- * in the priv tree */
+ /*
+ * Propagate the private flag so we know we're
+ * in the priv tree
+ */
if (IS_PRIVATE(dir))
inode->i_flags |= S_PRIVATE;
}
@@ -361,9 +391,9 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
}
/*
-** looks up the dentry of the parent directory for child.
-** taken from ext2_get_parent
-*/
+ * looks up the dentry of the parent directory for child.
+ * taken from ext2_get_parent
+ */
struct dentry *reiserfs_get_parent(struct dentry *child)
{
int retval;
@@ -384,7 +414,7 @@ struct dentry *reiserfs_get_parent(struct dentry *child)
reiserfs_write_unlock(dir->i_sb);
return ERR_PTR(-ENOENT);
}
- inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id));
+ inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&de.de_dir_id);
reiserfs_write_unlock(dir->i_sb);
return d_obtain_alias(inode);
@@ -406,8 +436,13 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
struct reiserfs_dir_entry de;
DECLARE_BITMAP(bit_string, MAX_GENERATION_NUMBER + 1);
int gen_number;
- char small_buf[32 + DEH_SIZE]; /* 48 bytes now and we avoid kmalloc
- if we create file with short name */
+
+ /*
+ * 48 bytes now and we avoid kmalloc if we
+ * create file with short name
+ */
+ char small_buf[32 + DEH_SIZE];
+
char *buffer;
int buflen, paste_size;
int retval;
@@ -439,21 +474,30 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
(get_inode_sd_version(dir) ==
STAT_DATA_V1) ? (DEH_SIZE + namelen) : buflen;
- /* fill buffer : directory entry head, name[, dir objectid | , stat data | ,stat data, dir objectid ] */
+ /*
+ * fill buffer : directory entry head, name[, dir objectid | ,
+ * stat data | ,stat data, dir objectid ]
+ */
deh = (struct reiserfs_de_head *)buffer;
deh->deh_location = 0; /* JDM Endian safe if 0 */
put_deh_offset(deh, cpu_key_k_offset(&entry_key));
deh->deh_state = 0; /* JDM Endian safe if 0 */
/* put key (ino analog) to de */
- deh->deh_dir_id = INODE_PKEY(inode)->k_dir_id; /* safe: k_dir_id is le */
- deh->deh_objectid = INODE_PKEY(inode)->k_objectid; /* safe: k_objectid is le */
+
+ /* safe: k_dir_id is le */
+ deh->deh_dir_id = INODE_PKEY(inode)->k_dir_id;
+ /* safe: k_objectid is le */
+ deh->deh_objectid = INODE_PKEY(inode)->k_objectid;
/* copy name */
memcpy((char *)(deh + 1), name, namelen);
/* padd by 0s to the 4 byte boundary */
padd_item((char *)(deh + 1), ROUND_UP(namelen), namelen);
- /* entry is ready to be pasted into tree, set 'visibility' and 'stat data in entry' attributes */
+ /*
+ * entry is ready to be pasted into tree, set 'visibility'
+ * and 'stat data in entry' attributes
+ */
mark_de_without_sd(deh);
visible ? mark_de_visible(deh) : mark_de_hidden(deh);
@@ -499,7 +543,8 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
/* update max-hash-collisions counter in reiserfs_sb_info */
PROC_INFO_MAX(th->t_super, max_hash_collisions, gen_number);
- if (gen_number != 0) { /* we need to re-search for the insertion point */
+ /* we need to re-search for the insertion point */
+ if (gen_number != 0) {
if (search_by_entry_key(dir->i_sb, &entry_key, &path, &de) !=
NAME_NOT_FOUND) {
reiserfs_warning(dir->i_sb, "vs-7032",
@@ -527,18 +572,19 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
dir->i_size += paste_size;
dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
if (!S_ISDIR(inode->i_mode) && visible)
- // reiserfs_mkdir or reiserfs_rename will do that by itself
+ /* reiserfs_mkdir or reiserfs_rename will do that by itself */
reiserfs_update_sd(th, dir);
reiserfs_check_path(&path);
return 0;
}
-/* quota utility function, call if you've had to abort after calling
-** new_inode_init, and have not called reiserfs_new_inode yet.
-** This should only be called on inodes that do not have stat data
-** inserted into the tree yet.
-*/
+/*
+ * quota utility function, call if you've had to abort after calling
+ * new_inode_init, and have not called reiserfs_new_inode yet.
+ * This should only be called on inodes that do not have stat data
+ * inserted into the tree yet.
+ */
static int drop_new_inode(struct inode *inode)
{
dquot_drop(inode);
@@ -548,18 +594,23 @@ static int drop_new_inode(struct inode *inode)
return 0;
}
-/* utility function that does setup for reiserfs_new_inode.
-** dquot_initialize needs lots of credits so it's better to have it
-** outside of a transaction, so we had to pull some bits of
-** reiserfs_new_inode out into this func.
-*/
+/*
+ * utility function that does setup for reiserfs_new_inode.
+ * dquot_initialize needs lots of credits so it's better to have it
+ * outside of a transaction, so we had to pull some bits of
+ * reiserfs_new_inode out into this func.
+ */
static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode)
{
- /* Make inode invalid - just in case we are going to drop it before
- * the initialization happens */
+ /*
+ * Make inode invalid - just in case we are going to drop it before
+ * the initialization happens
+ */
INODE_PKEY(inode)->k_objectid = 0;
- /* the quota init calls have to know who to charge the quota to, so
- ** we have to set uid and gid here
+
+ /*
+ * the quota init calls have to know who to charge the quota to, so
+ * we have to set uid and gid here
*/
inode_init_owner(inode, dir, mode);
dquot_initialize(inode);
@@ -571,7 +622,10 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod
{
int retval;
struct inode *inode;
- /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
+ /*
+ * We need blocks for transaction + (user+group)*(quotas
+ * for new inode + update of quota for directory owner)
+ */
int jbegin_count =
JOURNAL_PER_BALANCE_CNT * 2 +
2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
@@ -618,7 +672,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod
int err;
drop_nlink(inode);
reiserfs_update_sd(&th, inode);
- err = journal_end(&th, dir->i_sb, jbegin_count);
+ err = journal_end(&th);
if (err)
retval = err;
unlock_new_inode(inode);
@@ -630,9 +684,9 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod
unlock_new_inode(inode);
d_instantiate(dentry, inode);
- retval = journal_end(&th, dir->i_sb, jbegin_count);
+ retval = journal_end(&th);
- out_failed:
+out_failed:
reiserfs_write_unlock(dir->i_sb);
return retval;
}
@@ -644,7 +698,10 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode
struct inode *inode;
struct reiserfs_transaction_handle th;
struct reiserfs_security_handle security;
- /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
+ /*
+ * We need blocks for transaction + (user+group)*(quotas
+ * for new inode + update of quota for directory owner)
+ */
int jbegin_count =
JOURNAL_PER_BALANCE_CNT * 3 +
2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
@@ -685,7 +742,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode
inode->i_op = &reiserfs_special_inode_operations;
init_special_inode(inode, inode->i_mode, rdev);
- //FIXME: needed for block and char devices only
+ /* FIXME: needed for block and char devices only */
reiserfs_update_sd(&th, inode);
reiserfs_update_inode_transaction(inode);
@@ -698,7 +755,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode
int err;
drop_nlink(inode);
reiserfs_update_sd(&th, inode);
- err = journal_end(&th, dir->i_sb, jbegin_count);
+ err = journal_end(&th);
if (err)
retval = err;
unlock_new_inode(inode);
@@ -708,9 +765,9 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode
unlock_new_inode(inode);
d_instantiate(dentry, inode);
- retval = journal_end(&th, dir->i_sb, jbegin_count);
+ retval = journal_end(&th);
- out_failed:
+out_failed:
reiserfs_write_unlock(dir->i_sb);
return retval;
}
@@ -721,7 +778,10 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
struct inode *inode;
struct reiserfs_transaction_handle th;
struct reiserfs_security_handle security;
- /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
+ /*
+ * We need blocks for transaction + (user+group)*(quotas
+ * for new inode + update of quota for directory owner)
+ */
int jbegin_count =
JOURNAL_PER_BALANCE_CNT * 3 +
2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
@@ -730,7 +790,10 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
dquot_initialize(dir);
#ifdef DISPLACE_NEW_PACKING_LOCALITIES
- /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */
+ /*
+ * set flag that new packing locality created and new blocks
+ * for the content of that directory are not displaced yet
+ */
REISERFS_I(dir)->new_packing_locality = 1;
#endif
mode = S_IFDIR | mode;
@@ -754,8 +817,9 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
goto out_failed;
}
- /* inc the link count now, so another writer doesn't overflow it while
- ** we sleep later on.
+ /*
+ * inc the link count now, so another writer doesn't overflow
+ * it while we sleep later on.
*/
INC_DIR_INODE_NLINK(dir)
@@ -774,7 +838,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
inode->i_op = &reiserfs_dir_inode_operations;
inode->i_fop = &reiserfs_dir_operations;
- // note, _this_ add_entry will not update dir's stat data
+ /* note, _this_ add_entry will not update dir's stat data */
retval =
reiserfs_add_entry(&th, dir, dentry->d_name.name,
dentry->d_name.len, inode, 1 /*visible */ );
@@ -783,19 +847,19 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
clear_nlink(inode);
DEC_DIR_INODE_NLINK(dir);
reiserfs_update_sd(&th, inode);
- err = journal_end(&th, dir->i_sb, jbegin_count);
+ err = journal_end(&th);
if (err)
retval = err;
unlock_new_inode(inode);
iput(inode);
goto out_failed;
}
- // the above add_entry did not update dir's stat data
+ /* the above add_entry did not update dir's stat data */
reiserfs_update_sd(&th, dir);
unlock_new_inode(inode);
d_instantiate(dentry, inode);
- retval = journal_end(&th, dir->i_sb, jbegin_count);
+ retval = journal_end(&th);
out_failed:
reiserfs_write_unlock(dir->i_sb);
return retval;
@@ -803,10 +867,11 @@ out_failed:
static inline int reiserfs_empty_dir(struct inode *inode)
{
- /* we can cheat because an old format dir cannot have
- ** EMPTY_DIR_SIZE, and a new format dir cannot have
- ** EMPTY_DIR_SIZE_V1. So, if the inode is either size,
- ** regardless of disk format version, the directory is empty.
+ /*
+ * we can cheat because an old format dir cannot have
+ * EMPTY_DIR_SIZE, and a new format dir cannot have
+ * EMPTY_DIR_SIZE_V1. So, if the inode is either size,
+ * regardless of disk format version, the directory is empty.
*/
if (inode->i_size != EMPTY_DIR_SIZE &&
inode->i_size != EMPTY_DIR_SIZE_V1) {
@@ -824,10 +889,12 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
INITIALIZE_PATH(path);
struct reiserfs_dir_entry de;
- /* we will be doing 2 balancings and update 2 stat data, we change quotas
- * of the owner of the directory and of the owner of the parent directory.
- * The quota structure is possibly deleted only on last iput => outside
- * of this transaction */
+ /*
+ * we will be doing 2 balancings and update 2 stat data, we
+ * change quotas of the owner of the directory and of the owner
+ * of the parent directory. The quota structure is possibly
+ * deleted only on last iput => outside of this transaction
+ */
jbegin_count =
JOURNAL_PER_BALANCE_CNT * 2 + 2 +
4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
@@ -856,8 +923,9 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
reiserfs_update_inode_transaction(dir);
if (de.de_objectid != inode->i_ino) {
- // FIXME: compare key of an object and a key found in the
- // entry
+ /*
+ * FIXME: compare key of an object and a key found in the entry
+ */
retval = -EIO;
goto end_rmdir;
}
@@ -867,7 +935,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
}
/* cut entry from dir directory */
- retval = reiserfs_cut_from_item(&th, &path, &(de.de_entry_key), dir, NULL, /* page */
+ retval = reiserfs_cut_from_item(&th, &path, &de.de_entry_key,
+ dir, NULL, /* page */
0 /*new file size - not used here */ );
if (retval < 0)
goto end_rmdir;
@@ -888,18 +957,20 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
/* prevent empty directory from getting lost */
add_save_link(&th, inode, 0 /* not truncate */ );
- retval = journal_end(&th, dir->i_sb, jbegin_count);
+ retval = journal_end(&th);
reiserfs_check_path(&path);
- out_rmdir:
+out_rmdir:
reiserfs_write_unlock(dir->i_sb);
return retval;
- end_rmdir:
- /* we must release path, because we did not call
- reiserfs_cut_from_item, or reiserfs_cut_from_item does not
- release path if operation was not complete */
+end_rmdir:
+ /*
+ * we must release path, because we did not call
+ * reiserfs_cut_from_item, or reiserfs_cut_from_item does not
+ * release path if operation was not complete
+ */
pathrelse(&path);
- err = journal_end(&th, dir->i_sb, jbegin_count);
+ err = journal_end(&th);
reiserfs_write_unlock(dir->i_sb);
return err ? err : retval;
}
@@ -918,10 +989,13 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
inode = dentry->d_inode;
- /* in this transaction we can be doing at max two balancings and update
- * two stat datas, we change quotas of the owner of the directory and of
- * the owner of the parent directory. The quota structure is possibly
- * deleted only on iput => outside of this transaction */
+ /*
+ * in this transaction we can be doing at max two balancings and
+ * update two stat datas, we change quotas of the owner of the
+ * directory and of the owner of the parent directory. The quota
+ * structure is possibly deleted only on iput => outside of
+ * this transaction
+ */
jbegin_count =
JOURNAL_PER_BALANCE_CNT * 2 + 2 +
4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
@@ -946,8 +1020,9 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
reiserfs_update_inode_transaction(dir);
if (de.de_objectid != inode->i_ino) {
- // FIXME: compare key of an object and a key found in the
- // entry
+ /*
+ * FIXME: compare key of an object and a key found in the entry
+ */
retval = -EIO;
goto end_unlink;
}
@@ -968,7 +1043,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
savelink = inode->i_nlink;
retval =
- reiserfs_cut_from_item(&th, &path, &(de.de_entry_key), dir, NULL,
+ reiserfs_cut_from_item(&th, &path, &de.de_entry_key, dir, NULL,
0);
if (retval < 0) {
inc_nlink(inode);
@@ -985,18 +1060,18 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
/* prevent file from getting lost */
add_save_link(&th, inode, 0 /* not truncate */ );
- retval = journal_end(&th, dir->i_sb, jbegin_count);
+ retval = journal_end(&th);
reiserfs_check_path(&path);
reiserfs_write_unlock(dir->i_sb);
return retval;
- end_unlink:
+end_unlink:
pathrelse(&path);
- err = journal_end(&th, dir->i_sb, jbegin_count);
+ err = journal_end(&th);
reiserfs_check_path(&path);
if (err)
retval = err;
- out_unlink:
+out_unlink:
reiserfs_write_unlock(dir->i_sb);
return retval;
}
@@ -1011,7 +1086,10 @@ static int reiserfs_symlink(struct inode *parent_dir,
struct reiserfs_transaction_handle th;
struct reiserfs_security_handle security;
int mode = S_IFLNK | S_IRWXUGO;
- /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
+ /*
+ * We need blocks for transaction + (user+group)*(quotas for
+ * new inode + update of quota for directory owner)
+ */
int jbegin_count =
JOURNAL_PER_BALANCE_CNT * 3 +
2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) +
@@ -1070,17 +1148,13 @@ static int reiserfs_symlink(struct inode *parent_dir,
inode->i_op = &reiserfs_symlink_inode_operations;
inode->i_mapping->a_ops = &reiserfs_address_space_operations;
- // must be sure this inode is written with this transaction
- //
- //reiserfs_update_sd (&th, inode, READ_BLOCKS);
-
retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name,
dentry->d_name.len, inode, 1 /*visible */ );
if (retval) {
int err;
drop_nlink(inode);
reiserfs_update_sd(&th, inode);
- err = journal_end(&th, parent_dir->i_sb, jbegin_count);
+ err = journal_end(&th);
if (err)
retval = err;
unlock_new_inode(inode);
@@ -1090,8 +1164,8 @@ static int reiserfs_symlink(struct inode *parent_dir,
unlock_new_inode(inode);
d_instantiate(dentry, inode);
- retval = journal_end(&th, parent_dir->i_sb, jbegin_count);
- out_failed:
+ retval = journal_end(&th);
+out_failed:
reiserfs_write_unlock(parent_dir->i_sb);
return retval;
}
@@ -1102,7 +1176,10 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
int retval;
struct inode *inode = old_dentry->d_inode;
struct reiserfs_transaction_handle th;
- /* We need blocks for transaction + update of quotas for the owners of the directory */
+ /*
+ * We need blocks for transaction + update of quotas for
+ * the owners of the directory
+ */
int jbegin_count =
JOURNAL_PER_BALANCE_CNT * 3 +
2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
@@ -1111,7 +1188,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
reiserfs_write_lock(dir->i_sb);
if (inode->i_nlink >= REISERFS_LINK_MAX) {
- //FIXME: sd_nlink is 32 bit for new files
+ /* FIXME: sd_nlink is 32 bit for new files */
reiserfs_write_unlock(dir->i_sb);
return -EMLINK;
}
@@ -1137,7 +1214,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
if (retval) {
int err;
drop_nlink(inode);
- err = journal_end(&th, dir->i_sb, jbegin_count);
+ err = journal_end(&th);
reiserfs_write_unlock(dir->i_sb);
return err ? err : retval;
}
@@ -1147,7 +1224,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
ihold(inode);
d_instantiate(dentry, inode);
- retval = journal_end(&th, dir->i_sb, jbegin_count);
+ retval = journal_end(&th);
reiserfs_write_unlock(dir->i_sb);
return retval;
}
@@ -1158,9 +1235,9 @@ static int de_still_valid(const char *name, int len,
{
struct reiserfs_dir_entry tmp = *de;
- // recalculate pointer to name and name length
+ /* recalculate pointer to name and name length */
set_de_name_and_namelen(&tmp);
- // FIXME: could check more
+ /* FIXME: could check more */
if (tmp.de_namelen != len || memcmp(name, de->de_name, len))
return 0;
return 1;
@@ -1217,14 +1294,16 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
unsigned long savelink = 1;
struct timespec ctime;
- /* three balancings: (1) old name removal, (2) new name insertion
- and (3) maybe "save" link insertion
- stat data updates: (1) old directory,
- (2) new directory and (3) maybe old object stat data (when it is
- directory) and (4) maybe stat data of object to which new entry
- pointed initially and (5) maybe block containing ".." of
- renamed directory
- quota updates: two parent directories */
+ /*
+ * three balancings: (1) old name removal, (2) new name insertion
+ * and (3) maybe "save" link insertion
+ * stat data updates: (1) old directory,
+ * (2) new directory and (3) maybe old object stat data (when it is
+ * directory) and (4) maybe stat data of object to which new entry
+ * pointed initially and (5) maybe block containing ".." of
+ * renamed directory
+ * quota updates: two parent directories
+ */
jbegin_count =
JOURNAL_PER_BALANCE_CNT * 3 + 5 +
4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb);
@@ -1235,8 +1314,10 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
old_inode = old_dentry->d_inode;
new_dentry_inode = new_dentry->d_inode;
- // make sure, that oldname still exists and points to an object we
- // are going to rename
+ /*
+ * make sure that oldname still exists and points to an object we
+ * are going to rename
+ */
old_de.de_gen_number_bit_string = NULL;
reiserfs_write_lock(old_dir->i_sb);
retval =
@@ -1256,10 +1337,11 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
old_inode_mode = old_inode->i_mode;
if (S_ISDIR(old_inode_mode)) {
- // make sure, that directory being renamed has correct ".."
- // and that its new parent directory has not too many links
- // already
-
+ /*
+ * make sure that directory being renamed has correct ".."
+ * and that its new parent directory has not too many links
+ * already
+ */
if (new_dentry_inode) {
if (!reiserfs_empty_dir(new_dentry_inode)) {
reiserfs_write_unlock(old_dir->i_sb);
@@ -1267,8 +1349,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
}
- /* directory is renamed, its parent directory will be changed,
- ** so find ".." entry
+ /*
+ * directory is renamed, its parent directory will be changed,
+ * so find ".." entry
*/
dot_dot_de.de_gen_number_bit_string = NULL;
retval =
@@ -1303,7 +1386,7 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
"new entry is found, new inode == 0");
}
} else if (retval) {
- int err = journal_end(&th, old_dir->i_sb, jbegin_count);
+ int err = journal_end(&th);
reiserfs_write_unlock(old_dir->i_sb);
return err ? err : retval;
}
@@ -1311,8 +1394,9 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
reiserfs_update_inode_transaction(old_dir);
reiserfs_update_inode_transaction(new_dir);
- /* this makes it so an fsync on an open fd for the old name will
- ** commit the rename operation
+ /*
+ * this makes it so an fsync on an open fd for the old name will
+ * commit the rename operation
*/
reiserfs_update_inode_transaction(old_inode);
@@ -1320,38 +1404,45 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
reiserfs_update_inode_transaction(new_dentry_inode);
while (1) {
- // look for old name using corresponding entry key (found by reiserfs_find_entry)
+ /*
+ * look for old name using corresponding entry key
+ * (found by reiserfs_find_entry)
+ */
if ((retval =
search_by_entry_key(new_dir->i_sb, &old_de.de_entry_key,
&old_entry_path,
&old_de)) != NAME_FOUND) {
pathrelse(&old_entry_path);
- journal_end(&th, old_dir->i_sb, jbegin_count);
+ journal_end(&th);
reiserfs_write_unlock(old_dir->i_sb);
return -EIO;
}
- copy_item_head(&old_entry_ih, get_ih(&old_entry_path));
+ copy_item_head(&old_entry_ih, tp_item_head(&old_entry_path));
reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1);
- // look for new name by reiserfs_find_entry
+ /* look for new name by reiserfs_find_entry */
new_de.de_gen_number_bit_string = NULL;
retval =
reiserfs_find_entry(new_dir, new_dentry->d_name.name,
new_dentry->d_name.len, &new_entry_path,
&new_de);
- // reiserfs_add_entry should not return IO_ERROR, because it is called with essentially same parameters from
- // reiserfs_add_entry above, and we'll catch any i/o errors before we get here.
+ /*
+ * reiserfs_add_entry should not return IO_ERROR,
+ * because it is called with essentially same parameters from
+ * reiserfs_add_entry above, and we'll catch any i/o errors
+ * before we get here.
+ */
if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) {
pathrelse(&new_entry_path);
pathrelse(&old_entry_path);
- journal_end(&th, old_dir->i_sb, jbegin_count);
+ journal_end(&th);
reiserfs_write_unlock(old_dir->i_sb);
return -EIO;
}
- copy_item_head(&new_entry_ih, get_ih(&new_entry_path));
+ copy_item_head(&new_entry_ih, tp_item_head(&new_entry_path));
reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1);
@@ -1364,28 +1455,32 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
pathrelse(&dot_dot_entry_path);
pathrelse(&new_entry_path);
pathrelse(&old_entry_path);
- journal_end(&th, old_dir->i_sb, jbegin_count);
+ journal_end(&th);
reiserfs_write_unlock(old_dir->i_sb);
return -EIO;
}
copy_item_head(&dot_dot_ih,
- get_ih(&dot_dot_entry_path));
- // node containing ".." gets into transaction
+ tp_item_head(&dot_dot_entry_path));
+ /* node containing ".." gets into transaction */
reiserfs_prepare_for_journal(old_inode->i_sb,
dot_dot_de.de_bh, 1);
}
- /* we should check seals here, not do
- this stuff, yes? Then, having
- gathered everything into RAM we
- should lock the buffers, yes? -Hans */
- /* probably. our rename needs to hold more
- ** than one path at once. The seals would
- ** have to be written to deal with multi-path
- ** issues -chris
+ /*
+ * we should check seals here, not do
+ * this stuff, yes? Then, having
+ * gathered everything into RAM we
+ * should lock the buffers, yes? -Hans
*/
- /* sanity checking before doing the rename - avoid races many
- ** of the above checks could have scheduled. We have to be
- ** sure our items haven't been shifted by another process.
+ /*
+ * probably. our rename needs to hold more
+ * than one path at once. The seals would
+ * have to be written to deal with multi-path
+ * issues -chris
+ */
+ /*
+ * sanity checking before doing the rename - avoid races many
+ * of the above checks could have scheduled. We have to be
+ * sure our items haven't been shifted by another process.
*/
if (item_moved(&new_entry_ih, &new_entry_path) ||
!entry_points_to_object(new_dentry->d_name.name,
@@ -1430,24 +1525,28 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
break;
}
- /* ok, all the changes can be done in one fell swoop when we
- have claimed all the buffers needed. */
+ /*
+ * ok, all the changes can be done in one fell swoop when we
+ * have claimed all the buffers needed.
+ */
mark_de_visible(new_de.de_deh + new_de.de_entry_num);
set_ino_in_dir_entry(&new_de, INODE_PKEY(old_inode));
- journal_mark_dirty(&th, old_dir->i_sb, new_de.de_bh);
+ journal_mark_dirty(&th, new_de.de_bh);
mark_de_hidden(old_de.de_deh + old_de.de_entry_num);
- journal_mark_dirty(&th, old_dir->i_sb, old_de.de_bh);
+ journal_mark_dirty(&th, old_de.de_bh);
ctime = CURRENT_TIME_SEC;
old_dir->i_ctime = old_dir->i_mtime = ctime;
new_dir->i_ctime = new_dir->i_mtime = ctime;
- /* thanks to Alex Adriaanse <alex_a@caltech.edu> for patch which adds ctime update of
- renamed object */
+ /*
+ * thanks to Alex Adriaanse <alex_a@caltech.edu> for patch
+ * which adds ctime update of renamed object
+ */
old_inode->i_ctime = ctime;
if (new_dentry_inode) {
- // adjust link number of the victim
+ /* adjust link number of the victim */
if (S_ISDIR(new_dentry_inode->i_mode)) {
clear_nlink(new_dentry_inode);
} else {
@@ -1460,25 +1559,32 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (S_ISDIR(old_inode_mode)) {
/* adjust ".." of renamed directory */
set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir));
- journal_mark_dirty(&th, new_dir->i_sb, dot_dot_de.de_bh);
+ journal_mark_dirty(&th, dot_dot_de.de_bh);
+ /*
+ * there (in new_dir) was no directory, so it got new link
+ * (".." of renamed directory)
+ */
if (!new_dentry_inode)
- /* there (in new_dir) was no directory, so it got new link
- (".." of renamed directory) */
INC_DIR_INODE_NLINK(new_dir);
/* old directory lost one link - ".. " of renamed directory */
DEC_DIR_INODE_NLINK(old_dir);
}
- // looks like in 2.3.99pre3 brelse is atomic. so we can use pathrelse
+ /*
+ * looks like in 2.3.99pre3 brelse is atomic.
+ * so we can use pathrelse
+ */
pathrelse(&new_entry_path);
pathrelse(&dot_dot_entry_path);
- // FIXME: this reiserfs_cut_from_item's return value may screw up
- // anybody, but it will panic if will not be able to find the
- // entry. This needs one more clean up
+ /*
+ * FIXME: this reiserfs_cut_from_item's return value may screw up
+ * anybody, but it will panic if will not be able to find the
+ * entry. This needs one more clean up
+ */
if (reiserfs_cut_from_item
- (&th, &old_entry_path, &(old_de.de_entry_key), old_dir, NULL,
+ (&th, &old_entry_path, &old_de.de_entry_key, old_dir, NULL,
0) < 0)
reiserfs_error(old_dir->i_sb, "vs-7060",
"couldn't not cut old name. Fsck later?");
@@ -1496,16 +1602,13 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
reiserfs_update_sd(&th, new_dentry_inode);
}
- retval = journal_end(&th, old_dir->i_sb, jbegin_count);
+ retval = journal_end(&th);
reiserfs_write_unlock(old_dir->i_sb);
return retval;
}
-/*
- * directories can handle most operations...
- */
+/* directories can handle most operations... */
const struct inode_operations reiserfs_dir_inode_operations = {
- //&reiserfs_dir_operations, /* default_file_ops */
.create = reiserfs_create,
.lookup = reiserfs_lookup,
.link = reiserfs_link,
@@ -1522,6 +1625,7 @@ const struct inode_operations reiserfs_dir_inode_operations = {
.removexattr = reiserfs_removexattr,
.permission = reiserfs_permission,
.get_acl = reiserfs_get_acl,
+ .set_acl = reiserfs_set_acl,
};
/*
@@ -1538,8 +1642,6 @@ const struct inode_operations reiserfs_symlink_inode_operations = {
.listxattr = reiserfs_listxattr,
.removexattr = reiserfs_removexattr,
.permission = reiserfs_permission,
- .get_acl = reiserfs_get_acl,
-
};
/*
@@ -1553,4 +1655,5 @@ const struct inode_operations reiserfs_special_inode_operations = {
.removexattr = reiserfs_removexattr,
.permission = reiserfs_permission,
.get_acl = reiserfs_get_acl,
+ .set_acl = reiserfs_set_acl,
};
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
index f732d6a5251..99a5d5dae46 100644
--- a/fs/reiserfs/objectid.c
+++ b/fs/reiserfs/objectid.c
@@ -7,7 +7,7 @@
#include <linux/time.h>
#include "reiserfs.h"
-// find where objectid map starts
+/* find where objectid map starts */
#define objectid_map(s,rs) (old_format_only (s) ? \
(__le32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) :\
(__le32 *)((rs) + 1))
@@ -20,7 +20,7 @@ static void check_objectid_map(struct super_block *s, __le32 * map)
reiserfs_panic(s, "vs-15010", "map corrupted: %lx",
(long unsigned int)le32_to_cpu(map[0]));
- // FIXME: add something else here
+ /* FIXME: add something else here */
}
#else
@@ -29,19 +29,21 @@ static void check_objectid_map(struct super_block *s, __le32 * map)
}
#endif
-/* When we allocate objectids we allocate the first unused objectid.
- Each sequence of objectids in use (the odd sequences) is followed
- by a sequence of objectids not in use (the even sequences). We
- only need to record the last objectid in each of these sequences
- (both the odd and even sequences) in order to fully define the
- boundaries of the sequences. A consequence of allocating the first
- objectid not in use is that under most conditions this scheme is
- extremely compact. The exception is immediately after a sequence
- of operations which deletes a large number of objects of
- non-sequential objectids, and even then it will become compact
- again as soon as more objects are created. Note that many
- interesting optimizations of layout could result from complicating
- objectid assignment, but we have deferred making them for now. */
+/*
+ * When we allocate objectids we allocate the first unused objectid.
+ * Each sequence of objectids in use (the odd sequences) is followed
+ * by a sequence of objectids not in use (the even sequences). We
+ * only need to record the last objectid in each of these sequences
+ * (both the odd and even sequences) in order to fully define the
+ * boundaries of the sequences. A consequence of allocating the first
+ * objectid not in use is that under most conditions this scheme is
+ * extremely compact. The exception is immediately after a sequence
+ * of operations which deletes a large number of objects of
+ * non-sequential objectids, and even then it will become compact
+ * again as soon as more objects are created. Note that many
+ * interesting optimizations of layout could result from complicating
+ * objectid assignment, but we have deferred making them for now.
+ */
/* get unique object identifier */
__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th)
@@ -64,26 +66,30 @@ __u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th)
return 0;
}
- /* This incrementation allocates the first unused objectid. That
- is to say, the first entry on the objectid map is the first
- unused objectid, and by incrementing it we use it. See below
- where we check to see if we eliminated a sequence of unused
- objectids.... */
+ /*
+ * This incrementation allocates the first unused objectid. That
+ * is to say, the first entry on the objectid map is the first
+ * unused objectid, and by incrementing it we use it. See below
+ * where we check to see if we eliminated a sequence of unused
+ * objectids....
+ */
map[1] = cpu_to_le32(unused_objectid + 1);
- /* Now we check to see if we eliminated the last remaining member of
- the first even sequence (and can eliminate the sequence by
- eliminating its last objectid from oids), and can collapse the
- first two odd sequences into one sequence. If so, then the net
- result is to eliminate a pair of objectids from oids. We do this
- by shifting the entire map to the left. */
+ /*
+ * Now we check to see if we eliminated the last remaining member of
+ * the first even sequence (and can eliminate the sequence by
+ * eliminating its last objectid from oids), and can collapse the
+ * first two odd sequences into one sequence. If so, then the net
+ * result is to eliminate a pair of objectids from oids. We do this
+ * by shifting the entire map to the left.
+ */
if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) {
memmove(map + 1, map + 3,
(sb_oid_cursize(rs) - 3) * sizeof(__u32));
set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2);
}
- journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s));
+ journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
return unused_objectid;
}
@@ -97,30 +103,33 @@ void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
int i = 0;
BUG_ON(!th->t_trans_id);
- //return;
+ /*return; */
check_objectid_map(s, map);
reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
- journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s));
-
- /* start at the beginning of the objectid map (i = 0) and go to
- the end of it (i = disk_sb->s_oid_cursize). Linear search is
- what we use, though it is possible that binary search would be
- more efficient after performing lots of deletions (which is
- when oids is large.) We only check even i's. */
+ journal_mark_dirty(th, SB_BUFFER_WITH_SB(s));
+
+ /*
+ * start at the beginning of the objectid map (i = 0) and go to
+ * the end of it (i = disk_sb->s_oid_cursize). Linear search is
+ * what we use, though it is possible that binary search would be
+ * more efficient after performing lots of deletions (which is
+ * when oids is large.) We only check even i's.
+ */
while (i < sb_oid_cursize(rs)) {
if (objectid_to_release == le32_to_cpu(map[i])) {
/* This incrementation unallocates the objectid. */
- //map[i]++;
le32_add_cpu(&map[i], 1);
- /* Did we unallocate the last member of an odd sequence, and can shrink oids? */
+ /*
+ * Did we unallocate the last member of an
+ * odd sequence, and can shrink oids?
+ */
if (map[i] == map[i + 1]) {
/* shrink objectid map */
memmove(map + i, map + i + 2,
(sb_oid_cursize(rs) - i -
2) * sizeof(__u32));
- //disk_sb->s_oid_cursize -= 2;
set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2);
RFALSE(sb_oid_cursize(rs) < 2 ||
@@ -135,14 +144,19 @@ void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
objectid_to_release < le32_to_cpu(map[i + 1])) {
/* size of objectid map is not changed */
if (objectid_to_release + 1 == le32_to_cpu(map[i + 1])) {
- //objectid_map[i+1]--;
le32_add_cpu(&map[i + 1], -1);
return;
}
- /* JDM comparing two little-endian values for equality -- safe */
+ /*
+ * JDM comparing two little-endian values for
+ * equality -- safe
+ */
+ /*
+ * objectid map must be expanded, but
+ * there is no space
+ */
if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) {
- /* objectid map must be expanded, but there is no space */
PROC_INFO_INC(s, leaked_oid);
return;
}
@@ -178,8 +192,9 @@ int reiserfs_convert_objectid_map_v1(struct super_block *s)
new_objectid_map = (__le32 *) (disk_sb + 1);
if (cur_size > new_size) {
- /* mark everyone used that was listed as free at the end of the objectid
- ** map
+ /*
+ * mark everyone used that was listed as free at
+ * the end of the objectid map
*/
objectid_map[new_size - 1] = objectid_map[cur_size - 1];
set_sb_oid_cursize(disk_sb, new_size);
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 54944d5a4a6..c9b47e91baf 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -172,18 +172,19 @@ static char *is_there_reiserfs_struct(char *fmt, int *what)
return k;
}
-/* debugging reiserfs we used to print out a lot of different
- variables, like keys, item headers, buffer heads etc. Values of
- most fields matter. So it took a long time just to write
- appropriative printk. With this reiserfs_warning you can use format
- specification for complex structures like you used to do with
- printfs for integers, doubles and pointers. For instance, to print
- out key structure you have to write just:
- reiserfs_warning ("bad key %k", key);
- instead of
- printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid,
- key->k_offset, key->k_uniqueness);
-*/
+/*
+ * debugging reiserfs we used to print out a lot of different
+ * variables, like keys, item headers, buffer heads etc. Values of
+ * most fields matter. So it took a long time just to write
+ * appropriative printk. With this reiserfs_warning you can use format
+ * specification for complex structures like you used to do with
+ * printfs for integers, doubles and pointers. For instance, to print
+ * out key structure you have to write just:
+ * reiserfs_warning ("bad key %k", key);
+ * instead of
+ * printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid,
+ * key->k_offset, key->k_uniqueness);
+ */
static DEFINE_SPINLOCK(error_lock);
static void prepare_error_buf(const char *fmt, va_list args)
{
@@ -243,15 +244,16 @@ static void prepare_error_buf(const char *fmt, va_list args)
}
-/* in addition to usual conversion specifiers this accepts reiserfs
- specific conversion specifiers:
- %k to print little endian key,
- %K to print cpu key,
- %h to print item_head,
- %t to print directory entry
- %z to print block head (arg must be struct buffer_head *
- %b to print buffer_head
-*/
+/*
+ * in addition to usual conversion specifiers this accepts reiserfs
+ * specific conversion specifiers:
+ * %k to print little endian key,
+ * %K to print cpu key,
+ * %h to print item_head,
+ * %t to print directory entry
+ * %z to print block head (arg must be struct buffer_head *
+ * %b to print buffer_head
+ */
#define do_reiserfs_warning(fmt)\
{\
@@ -304,50 +306,52 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
#endif
}
-/* The format:
-
- maintainer-errorid: [function-name:] message
-
- where errorid is unique to the maintainer and function-name is
- optional, is recommended, so that anyone can easily find the bug
- with a simple grep for the short to type string
- maintainer-errorid. Don't bother with reusing errorids, there are
- lots of numbers out there.
-
- Example:
-
- reiserfs_panic(
- p_sb, "reiser-29: reiserfs_new_blocknrs: "
- "one of search_start or rn(%d) is equal to MAX_B_NUM,"
- "which means that we are optimizing location based on the bogus location of a temp buffer (%p).",
- rn, bh
- );
-
- Regular panic()s sometimes clear the screen before the message can
- be read, thus the need for the while loop.
-
- Numbering scheme for panic used by Vladimir and Anatoly( Hans completely ignores this scheme, and considers it
- pointless complexity):
-
- panics in reiserfs.h have numbers from 1000 to 1999
- super.c 2000 to 2999
- preserve.c (unused) 3000 to 3999
- bitmap.c 4000 to 4999
- stree.c 5000 to 5999
- prints.c 6000 to 6999
- namei.c 7000 to 7999
- fix_nodes.c 8000 to 8999
- dir.c 9000 to 9999
- lbalance.c 10000 to 10999
- ibalance.c 11000 to 11999 not ready
- do_balan.c 12000 to 12999
- inode.c 13000 to 13999
- file.c 14000 to 14999
- objectid.c 15000 - 15999
- buffer.c 16000 - 16999
- symlink.c 17000 - 17999
-
- . */
+/*
+ * The format:
+ *
+ * maintainer-errorid: [function-name:] message
+ *
+ * where errorid is unique to the maintainer and function-name is
+ * optional, is recommended, so that anyone can easily find the bug
+ * with a simple grep for the short to type string
+ * maintainer-errorid. Don't bother with reusing errorids, there are
+ * lots of numbers out there.
+ *
+ * Example:
+ *
+ * reiserfs_panic(
+ * p_sb, "reiser-29: reiserfs_new_blocknrs: "
+ * "one of search_start or rn(%d) is equal to MAX_B_NUM,"
+ * "which means that we are optimizing location based on the "
+ * "bogus location of a temp buffer (%p).",
+ * rn, bh
+ * );
+ *
+ * Regular panic()s sometimes clear the screen before the message can
+ * be read, thus the need for the while loop.
+ *
+ * Numbering scheme for panic used by Vladimir and Anatoly( Hans completely
+ * ignores this scheme, and considers it pointless complexity):
+ *
+ * panics in reiserfs_fs.h have numbers from 1000 to 1999
+ * super.c 2000 to 2999
+ * preserve.c (unused) 3000 to 3999
+ * bitmap.c 4000 to 4999
+ * stree.c 5000 to 5999
+ * prints.c 6000 to 6999
+ * namei.c 7000 to 7999
+ * fix_nodes.c 8000 to 8999
+ * dir.c 9000 to 9999
+ * lbalance.c 10000 to 10999
+ * ibalance.c 11000 to 11999 not ready
+ * do_balan.c 12000 to 12999
+ * inode.c 13000 to 13999
+ * file.c 14000 to 14999
+ * objectid.c 15000 - 15999
+ * buffer.c 16000 - 16999
+ * symlink.c 17000 - 17999
+ *
+ * . */
void __reiserfs_panic(struct super_block *sb, const char *id,
const char *function, const char *fmt, ...)
@@ -411,9 +415,11 @@ void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...)
reiserfs_abort_journal(sb, errno);
}
-/* this prints internal nodes (4 keys/items in line) (dc_number,
- dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number,
- dc_size)...*/
+/*
+ * this prints internal nodes (4 keys/items in line) (dc_number,
+ * dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number,
+ * dc_size)...
+ */
static int print_internal(struct buffer_head *bh, int first, int last)
{
struct reiserfs_key *key;
@@ -439,7 +445,7 @@ static int print_internal(struct buffer_head *bh, int first, int last)
dc = B_N_CHILD(bh, from);
reiserfs_printk("PTR %d: %y ", from, dc);
- for (i = from, key = B_N_PDELIM_KEY(bh, from), dc++; i < to;
+ for (i = from, key = internal_key(bh, from), dc++; i < to;
i++, key++, dc++) {
reiserfs_printk("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc);
if (i && i % 4 == 0)
@@ -463,7 +469,7 @@ static int print_leaf(struct buffer_head *bh, int print_mode, int first,
check_leaf(bh);
blkh = B_BLK_HEAD(bh);
- ih = B_N_PITEM_HEAD(bh, 0);
+ ih = item_head(bh, 0);
nr = blkh_nr_item(blkh);
printk
@@ -496,7 +502,7 @@ static int print_leaf(struct buffer_head *bh, int print_mode, int first,
("-------------------------------------------------------------------------------\n");
reiserfs_printk("|%2d| %h |\n", i, ih);
if (print_mode & PRINT_LEAF_ITEMS)
- op_print_item(ih, B_I_PITEM(bh, ih));
+ op_print_item(ih, ih_item_body(bh, ih));
}
printk
@@ -543,9 +549,11 @@ static int print_super_block(struct buffer_head *bh)
printk("Block count %u\n", sb_block_count(rs));
printk("Blocksize %d\n", sb_blocksize(rs));
printk("Free blocks %u\n", sb_free_blocks(rs));
- // FIXME: this would be confusing if
- // someone stores reiserfs super block in some data block ;)
+ /*
+ * FIXME: this would be confusing if
+ * someone stores reiserfs super block in some data block ;)
// skipped = (bh->b_blocknr * bh->b_size) / sb_blocksize(rs);
+ */
skipped = bh->b_blocknr;
data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) -
(!is_reiserfs_jr(rs) ? sb_jp_journal_size(rs) +
@@ -581,8 +589,8 @@ static int print_desc_block(struct buffer_head *bh)
return 0;
}
-
-void print_block(struct buffer_head *bh, ...) //int print_mode, int first, int last)
+/* ..., int print_mode, int first, int last) */
+void print_block(struct buffer_head *bh, ...)
{
va_list args;
int mode, first, last;
@@ -644,11 +652,11 @@ void store_print_tb(struct tree_balance *tb)
"* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n",
h,
(tbSh) ? (long long)(tbSh->b_blocknr) : (-1LL),
- (tbSh) ? atomic_read(&(tbSh->b_count)) : -1,
+ (tbSh) ? atomic_read(&tbSh->b_count) : -1,
(tb->L[h]) ? (long long)(tb->L[h]->b_blocknr) : (-1LL),
- (tb->L[h]) ? atomic_read(&(tb->L[h]->b_count)) : -1,
+ (tb->L[h]) ? atomic_read(&tb->L[h]->b_count) : -1,
(tb->R[h]) ? (long long)(tb->R[h]->b_blocknr) : (-1LL),
- (tb->R[h]) ? atomic_read(&(tb->R[h]->b_count)) : -1,
+ (tb->R[h]) ? atomic_read(&tb->R[h]->b_count) : -1,
(tbFh) ? (long long)(tbFh->b_blocknr) : (-1LL),
(tb->FL[h]) ? (long long)(tb->FL[h]->
b_blocknr) : (-1LL),
@@ -665,9 +673,9 @@ void store_print_tb(struct tree_balance *tb)
"* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n"
"* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n",
tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0],
- tb->rbytes, tb->blknum[0], tb->s0num, tb->s1num, tb->s1bytes,
- tb->s2num, tb->s2bytes, tb->cur_blknum, tb->lkey[0],
- tb->rkey[0]);
+ tb->rbytes, tb->blknum[0], tb->s0num, tb->snum[0],
+ tb->sbytes[0], tb->snum[1], tb->sbytes[1],
+ tb->cur_blknum, tb->lkey[0], tb->rkey[0]);
/* this prints balance parameters for non-leaf levels */
h = 0;
@@ -690,7 +698,7 @@ void store_print_tb(struct tree_balance *tb)
"%p (%llu %d)%s", tb->FEB[i],
tb->FEB[i] ? (unsigned long long)tb->FEB[i]->
b_blocknr : 0ULL,
- tb->FEB[i] ? atomic_read(&(tb->FEB[i]->b_count)) : 0,
+ tb->FEB[i] ? atomic_read(&tb->FEB[i]->b_count) : 0,
(i == ARRAY_SIZE(tb->FEB) - 1) ? "\n" : ", ");
sprintf(print_tb_buf + strlen(print_tb_buf),
@@ -744,8 +752,8 @@ void check_leaf(struct buffer_head *bh)
if (!bh)
return;
check_leaf_block_head(bh);
- for (i = 0, ih = B_N_PITEM_HEAD(bh, 0); i < B_NR_ITEMS(bh); i++, ih++)
- op_check_item(ih, B_I_PITEM(bh, ih));
+ for (i = 0, ih = item_head(bh, 0); i < B_NR_ITEMS(bh); i++, ih++)
+ op_check_item(ih, ih_item_body(bh, ih));
}
void check_internal(struct buffer_head *bh)
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index a958444a75f..02b0b7d0f7d 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -419,7 +419,7 @@ int reiserfs_proc_info_init(struct super_block *sb)
char *s;
/* Some block devices use /'s */
- strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE);
+ strlcpy(b, sb->s_id, BDEVNAME_SIZE);
s = strchr(b, '/');
if (s)
*s = '!';
@@ -449,7 +449,7 @@ int reiserfs_proc_info_done(struct super_block *sb)
char *s;
/* Some block devices use /'s */
- strlcpy(b, reiserfs_bdevname(sb), BDEVNAME_SIZE);
+ strlcpy(b, sb->s_id, BDEVNAME_SIZE);
s = strchr(b, '/');
if (s)
*s = '!';
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index f8adaee537c..bf53888c7f5 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -1,5 +1,6 @@
/*
- * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for licensing and copyright details
+ * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for
+ * licensing and copyright details
*/
#include <linux/reiserfs_fs.h>
@@ -23,52 +24,73 @@
struct reiserfs_journal_list;
-/** bitmasks for i_flags field in reiserfs-specific part of inode */
+/* bitmasks for i_flags field in reiserfs-specific part of inode */
typedef enum {
- /** this says what format of key do all items (but stat data) of
- an object have. If this is set, that format is 3.6 otherwise
- - 3.5 */
+ /*
+ * this says what format of key do all items (but stat data) of
+ * an object have. If this is set, that format is 3.6 otherwise - 3.5
+ */
i_item_key_version_mask = 0x0001,
- /** If this is unset, object has 3.5 stat data, otherwise, it has
- 3.6 stat data with 64bit size, 32bit nlink etc. */
+
+ /*
+ * If this is unset, object has 3.5 stat data, otherwise,
+ * it has 3.6 stat data with 64bit size, 32bit nlink etc.
+ */
i_stat_data_version_mask = 0x0002,
- /** file might need tail packing on close */
+
+ /* file might need tail packing on close */
i_pack_on_close_mask = 0x0004,
- /** don't pack tail of file */
+
+ /* don't pack tail of file */
i_nopack_mask = 0x0008,
- /** If those is set, "safe link" was created for this file during
- truncate or unlink. Safe link is used to avoid leakage of disk
- space on crash with some files open, but unlinked. */
+
+ /*
+ * If either of these are set, "safe link" was created for this
+ * file during truncate or unlink. Safe link is used to avoid
+ * leakage of disk space on crash with some files open, but unlinked.
+ */
i_link_saved_unlink_mask = 0x0010,
i_link_saved_truncate_mask = 0x0020,
+
i_has_xattr_dir = 0x0040,
i_data_log = 0x0080,
} reiserfs_inode_flags;
struct reiserfs_inode_info {
__u32 i_key[4]; /* key is still 4 32 bit integers */
- /** transient inode flags that are never stored on disk. Bitmasks
- for this field are defined above. */
+
+ /*
+ * transient inode flags that are never stored on disk. Bitmasks
+ * for this field are defined above.
+ */
__u32 i_flags;
- __u32 i_first_direct_byte; // offset of first byte stored in direct item.
+ /* offset of first byte stored in direct item. */
+ __u32 i_first_direct_byte;
/* copy of persistent inode flags read from sd_attrs. */
__u32 i_attrs;
- int i_prealloc_block; /* first unused block of a sequence of unused blocks */
+ /* first unused block of a sequence of unused blocks */
+ int i_prealloc_block;
int i_prealloc_count; /* length of that sequence */
- struct list_head i_prealloc_list; /* per-transaction list of inodes which
- * have preallocated blocks */
- unsigned new_packing_locality:1; /* new_packig_locality is created; new blocks
- * for the contents of this directory should be
- * displaced */
+ /* per-transaction list of inodes which have preallocated blocks */
+ struct list_head i_prealloc_list;
- /* we use these for fsync or O_SYNC to decide which transaction
- ** needs to be committed in order for this inode to be properly
- ** flushed */
+ /*
+ * new_packing_locality is created; new blocks for the contents
+ * of this directory should be displaced
+ */
+ unsigned new_packing_locality:1;
+
+ /*
+ * we use these for fsync or O_SYNC to decide which transaction
+ * needs to be committed in order for this inode to be properly
+ * flushed
+ */
unsigned int i_trans_id;
+
struct reiserfs_journal_list *i_jl;
atomic_t openers;
struct mutex tailpack;
@@ -82,9 +104,10 @@ typedef enum {
reiserfs_attrs_cleared = 0x00000001,
} reiserfs_super_block_flags;
-/* struct reiserfs_super_block accessors/mutators
- * since this is a disk structure, it will always be in
- * little endian format. */
+/*
+ * struct reiserfs_super_block accessors/mutators since this is a disk
+ * structure, it will always be in little endian format.
+ */
#define sb_block_count(sbp) (le32_to_cpu((sbp)->s_v1.s_block_count))
#define set_sb_block_count(sbp,v) ((sbp)->s_v1.s_block_count = cpu_to_le32(v))
#define sb_free_blocks(sbp) (le32_to_cpu((sbp)->s_v1.s_free_blocks))
@@ -152,48 +175,61 @@ typedef enum {
/* LOGGING -- */
-/* These all interelate for performance.
-**
-** If the journal block count is smaller than n transactions, you lose speed.
-** I don't know what n is yet, I'm guessing 8-16.
-**
-** typical transaction size depends on the application, how often fsync is
-** called, and how many metadata blocks you dirty in a 30 second period.
-** The more small files (<16k) you use, the larger your transactions will
-** be.
-**
-** If your journal fills faster than dirty buffers get flushed to disk, it must flush them before allowing the journal
-** to wrap, which slows things down. If you need high speed meta data updates, the journal should be big enough
-** to prevent wrapping before dirty meta blocks get to disk.
-**
-** If the batch max is smaller than the transaction max, you'll waste space at the end of the journal
-** because journal_end sets the next transaction to start at 0 if the next transaction has any chance of wrapping.
-**
-** The large the batch max age, the better the speed, and the more meta data changes you'll lose after a crash.
-**
-*/
+/*
+ * These all interelate for performance.
+ *
+ * If the journal block count is smaller than n transactions, you lose speed.
+ * I don't know what n is yet, I'm guessing 8-16.
+ *
+ * typical transaction size depends on the application, how often fsync is
+ * called, and how many metadata blocks you dirty in a 30 second period.
+ * The more small files (<16k) you use, the larger your transactions will
+ * be.
+ *
+ * If your journal fills faster than dirty buffers get flushed to disk, it
+ * must flush them before allowing the journal to wrap, which slows things
+ * down. If you need high speed meta data updates, the journal should be
+ * big enough to prevent wrapping before dirty meta blocks get to disk.
+ *
+ * If the batch max is smaller than the transaction max, you'll waste space
+ * at the end of the journal because journal_end sets the next transaction
+ * to start at 0 if the next transaction has any chance of wrapping.
+ *
+ * The large the batch max age, the better the speed, and the more meta
+ * data changes you'll lose after a crash.
+ */
/* don't mess with these for a while */
- /* we have a node size define somewhere in reiserfs_fs.h. -Hans */
+/* we have a node size define somewhere in reiserfs_fs.h. -Hans */
#define JOURNAL_BLOCK_SIZE 4096 /* BUG gotta get rid of this */
#define JOURNAL_MAX_CNODE 1500 /* max cnodes to allocate. */
#define JOURNAL_HASH_SIZE 8192
-#define JOURNAL_NUM_BITMAPS 5 /* number of copies of the bitmaps to have floating. Must be >= 2 */
-
-/* One of these for every block in every transaction
-** Each one is in two hash tables. First, a hash of the current transaction, and after journal_end, a
-** hash of all the in memory transactions.
-** next and prev are used by the current transaction (journal_hash).
-** hnext and hprev are used by journal_list_hash. If a block is in more than one transaction, the journal_list_hash
-** links it in multiple times. This allows flush_journal_list to remove just the cnode belonging
-** to a given transaction.
-*/
+
+/* number of copies of the bitmaps to have floating. Must be >= 2 */
+#define JOURNAL_NUM_BITMAPS 5
+
+/*
+ * One of these for every block in every transaction
+ * Each one is in two hash tables. First, a hash of the current transaction,
+ * and after journal_end, a hash of all the in memory transactions.
+ * next and prev are used by the current transaction (journal_hash).
+ * hnext and hprev are used by journal_list_hash. If a block is in more
+ * than one transaction, the journal_list_hash links it in multiple times.
+ * This allows flush_journal_list to remove just the cnode belonging to a
+ * given transaction.
+ */
struct reiserfs_journal_cnode {
struct buffer_head *bh; /* real buffer head */
struct super_block *sb; /* dev of real buffer head */
- __u32 blocknr; /* block number of real buffer head, == 0 when buffer on disk */
+
+ /* block number of real buffer head, == 0 when buffer on disk */
+ __u32 blocknr;
+
unsigned long state;
- struct reiserfs_journal_list *jlist; /* journal list this cnode lives in */
+
+ /* journal list this cnode lives in */
+ struct reiserfs_journal_list *jlist;
+
struct reiserfs_journal_cnode *next; /* next in transaction list */
struct reiserfs_journal_cnode *prev; /* prev in transaction list */
struct reiserfs_journal_cnode *hprev; /* prev in hash list */
@@ -212,18 +248,22 @@ struct reiserfs_list_bitmap {
};
/*
-** one of these for each transaction. The most important part here is the j_realblock.
-** this list of cnodes is used to hash all the blocks in all the commits, to mark all the
-** real buffer heads dirty once all the commits hit the disk,
-** and to make sure every real block in a transaction is on disk before allowing the log area
-** to be overwritten */
+ * one of these for each transaction. The most important part here is the
+ * j_realblock. this list of cnodes is used to hash all the blocks in all
+ * the commits, to mark all the real buffer heads dirty once all the commits
+ * hit the disk, and to make sure every real block in a transaction is on
+ * disk before allowing the log area to be overwritten
+ */
struct reiserfs_journal_list {
unsigned long j_start;
unsigned long j_state;
unsigned long j_len;
atomic_t j_nonzerolen;
atomic_t j_commit_left;
- atomic_t j_older_commits_done; /* all commits older than this on disk */
+
+ /* all commits older than this on disk */
+ atomic_t j_older_commits_done;
+
struct mutex j_commit_mutex;
unsigned int j_trans_id;
time_t j_timestamp;
@@ -234,11 +274,15 @@ struct reiserfs_journal_list {
/* time ordered list of all active transactions */
struct list_head j_list;
- /* time ordered list of all transactions we haven't tried to flush yet */
+ /*
+ * time ordered list of all transactions we haven't tried
+ * to flush yet
+ */
struct list_head j_working_list;
/* list of tail conversion targets in need of flush before commit */
struct list_head j_tail_bh_list;
+
/* list of data=ordered buffers in need of flush before commit */
struct list_head j_bh_list;
int j_refcount;
@@ -246,46 +290,83 @@ struct reiserfs_journal_list {
struct reiserfs_journal {
struct buffer_head **j_ap_blocks; /* journal blocks on disk */
- struct reiserfs_journal_cnode *j_last; /* newest journal block */
- struct reiserfs_journal_cnode *j_first; /* oldest journal block. start here for traverse */
+ /* newest journal block */
+ struct reiserfs_journal_cnode *j_last;
+
+ /* oldest journal block. start here for traverse */
+ struct reiserfs_journal_cnode *j_first;
struct block_device *j_dev_bd;
fmode_t j_dev_mode;
- int j_1st_reserved_block; /* first block on s_dev of reserved area journal */
+
+ /* first block on s_dev of reserved area journal */
+ int j_1st_reserved_block;
unsigned long j_state;
unsigned int j_trans_id;
unsigned long j_mount_id;
- unsigned long j_start; /* start of current waiting commit (index into j_ap_blocks) */
+
+ /* start of current waiting commit (index into j_ap_blocks) */
+ unsigned long j_start;
unsigned long j_len; /* length of current waiting commit */
- unsigned long j_len_alloc; /* number of buffers requested by journal_begin() */
+
+ /* number of buffers requested by journal_begin() */
+ unsigned long j_len_alloc;
+
atomic_t j_wcount; /* count of writers for current commit */
- unsigned long j_bcount; /* batch count. allows turning X transactions into 1 */
- unsigned long j_first_unflushed_offset; /* first unflushed transactions offset */
- unsigned j_last_flush_trans_id; /* last fully flushed journal timestamp */
+
+ /* batch count. allows turning X transactions into 1 */
+ unsigned long j_bcount;
+
+ /* first unflushed transactions offset */
+ unsigned long j_first_unflushed_offset;
+
+ /* last fully flushed journal timestamp */
+ unsigned j_last_flush_trans_id;
+
struct buffer_head *j_header_bh;
time_t j_trans_start_time; /* time this transaction started */
struct mutex j_mutex;
struct mutex j_flush_mutex;
- wait_queue_head_t j_join_wait; /* wait for current transaction to finish before starting new one */
- atomic_t j_jlock; /* lock for j_join_wait */
+
+ /* wait for current transaction to finish before starting new one */
+ wait_queue_head_t j_join_wait;
+
+ atomic_t j_jlock; /* lock for j_join_wait */
int j_list_bitmap_index; /* number of next list bitmap to use */
- int j_must_wait; /* no more journal begins allowed. MUST sleep on j_join_wait */
- int j_next_full_flush; /* next journal_end will flush all journal list */
- int j_next_async_flush; /* next journal_end will flush all async commits */
+
+ /* no more journal begins allowed. MUST sleep on j_join_wait */
+ int j_must_wait;
+
+ /* next journal_end will flush all journal list */
+ int j_next_full_flush;
+
+ /* next journal_end will flush all async commits */
+ int j_next_async_flush;
int j_cnode_used; /* number of cnodes on the used list */
int j_cnode_free; /* number of cnodes on the free list */
- unsigned int j_trans_max; /* max number of blocks in a transaction. */
- unsigned int j_max_batch; /* max number of blocks to batch into a trans */
- unsigned int j_max_commit_age; /* in seconds, how old can an async commit be */
- unsigned int j_max_trans_age; /* in seconds, how old can a transaction be */
- unsigned int j_default_max_commit_age; /* the default for the max commit age */
+ /* max number of blocks in a transaction. */
+ unsigned int j_trans_max;
+
+ /* max number of blocks to batch into a trans */
+ unsigned int j_max_batch;
+
+ /* in seconds, how old can an async commit be */
+ unsigned int j_max_commit_age;
+
+ /* in seconds, how old can a transaction be */
+ unsigned int j_max_trans_age;
+
+ /* the default for the max commit age */
+ unsigned int j_default_max_commit_age;
struct reiserfs_journal_cnode *j_cnode_free_list;
- struct reiserfs_journal_cnode *j_cnode_free_orig; /* orig pointer returned from vmalloc */
+
+ /* orig pointer returned from vmalloc */
+ struct reiserfs_journal_cnode *j_cnode_free_orig;
struct reiserfs_journal_list *j_current_jl;
int j_free_bitmap_nodes;
@@ -306,14 +387,21 @@ struct reiserfs_journal {
/* list of all active transactions */
struct list_head j_journal_list;
+
/* lists that haven't been touched by writeback attempts */
struct list_head j_working_list;
- struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS]; /* array of bitmaps to record the deleted blocks */
- struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE]; /* hash table for real buffer heads in current trans */
- struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE]; /* hash table for all the real buffer heads in all
- the transactions */
- struct list_head j_prealloc_list; /* list of inodes which have preallocated blocks */
+ /* hash table for real buffer heads in current trans */
+ struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE];
+
+ /* hash table for all the real buffer heads in all the transactions */
+ struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE];
+
+ /* array of bitmaps to record the deleted blocks */
+ struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS];
+
+ /* list of inodes which have preallocated blocks */
+ struct list_head j_prealloc_list;
int j_persistent_trans;
unsigned long j_max_trans_size;
unsigned long j_max_batch_size;
@@ -328,11 +416,12 @@ struct reiserfs_journal {
enum journal_state_bits {
J_WRITERS_BLOCKED = 1, /* set when new writers not allowed */
- J_WRITERS_QUEUED, /* set when log is full due to too many writers */
- J_ABORTED, /* set when log is aborted */
+ J_WRITERS_QUEUED, /* set when log is full due to too many writers */
+ J_ABORTED, /* set when log is aborted */
};
-#define JOURNAL_DESC_MAGIC "ReIsErLB" /* ick. magic string to find desc blocks in the journal */
+/* ick. magic string to find desc blocks in the journal */
+#define JOURNAL_DESC_MAGIC "ReIsErLB"
typedef __u32(*hashf_t) (const signed char *, int);
@@ -364,7 +453,10 @@ typedef struct reiserfs_proc_info_data {
stat_cnt_t leaked_oid;
stat_cnt_t leaves_removable;
- /* balances per level. Use explicit 5 as MAX_HEIGHT is not visible yet. */
+ /*
+ * balances per level.
+ * Use explicit 5 as MAX_HEIGHT is not visible yet.
+ */
stat_cnt_t balance_at[5]; /* XXX */
/* sbk == search_by_key */
stat_cnt_t sbk_read_at[5]; /* XXX */
@@ -416,47 +508,75 @@ typedef struct reiserfs_proc_info_data {
/* reiserfs union of in-core super block data */
struct reiserfs_sb_info {
- struct buffer_head *s_sbh; /* Buffer containing the super block */
- /* both the comment and the choice of
- name are unclear for s_rs -Hans */
- struct reiserfs_super_block *s_rs; /* Pointer to the super block in the buffer */
+ /* Buffer containing the super block */
+ struct buffer_head *s_sbh;
+
+ /* Pointer to the on-disk super block in the buffer */
+ struct reiserfs_super_block *s_rs;
struct reiserfs_bitmap_info *s_ap_bitmap;
- struct reiserfs_journal *s_journal; /* pointer to journal information */
+
+ /* pointer to journal information */
+ struct reiserfs_journal *s_journal;
+
unsigned short s_mount_state; /* reiserfs state (valid, invalid) */
/* Serialize writers access, replace the old bkl */
struct mutex lock;
+
/* Owner of the lock (can be recursive) */
struct task_struct *lock_owner;
+
/* Depth of the lock, start from -1 like the bkl */
int lock_depth;
+ struct workqueue_struct *commit_wq;
+
/* Comment? -Hans */
void (*end_io_handler) (struct buffer_head *, int);
- hashf_t s_hash_function; /* pointer to function which is used
- to sort names in directory. Set on
- mount */
- unsigned long s_mount_opt; /* reiserfs's mount options are set
- here (currently - NOTAIL, NOLOG,
- REPLAYONLY) */
-
- struct { /* This is a structure that describes block allocator options */
- unsigned long bits; /* Bitfield for enable/disable kind of options */
- unsigned long large_file_size; /* size started from which we consider file to be a large one(in blocks) */
+
+ /*
+ * pointer to function which is used to sort names in directory.
+ * Set on mount
+ */
+ hashf_t s_hash_function;
+
+ /* reiserfs's mount options are set here */
+ unsigned long s_mount_opt;
+
+ /* This is a structure that describes block allocator options */
+ struct {
+ /* Bitfield for enable/disable kind of options */
+ unsigned long bits;
+
+ /*
+ * size started from which we consider file
+ * to be a large one (in blocks)
+ */
+ unsigned long large_file_size;
+
int border; /* percentage of disk, border takes */
- int preallocmin; /* Minimal file size (in blocks) starting from which we do preallocations */
- int preallocsize; /* Number of blocks we try to prealloc when file
- reaches preallocmin size (in blocks) or
- prealloc_list is empty. */
+
+ /*
+ * Minimal file size (in blocks) starting
+ * from which we do preallocations
+ */
+ int preallocmin;
+
+ /*
+ * Number of blocks we try to prealloc when file
+ * reaches preallocmin size (in blocks) or prealloc_list
+ is empty.
+ */
+ int preallocsize;
} s_alloc_options;
/* Comment? -Hans */
wait_queue_head_t s_wait;
- /* To be obsoleted soon by per buffer seals.. -Hans */
- atomic_t s_generation_counter; // increased by one every time the
- // tree gets re-balanced
- unsigned long s_properties; /* File system properties. Currently holds
- on-disk FS format */
+ /* increased by one every time the tree gets re-balanced */
+ atomic_t s_generation_counter;
+
+ /* File system properties. Currently holds on-disk FS format */
+ unsigned long s_properties;
/* session statistics */
int s_disk_reads;
@@ -469,14 +589,23 @@ struct reiserfs_sb_info {
int s_bmaps_without_search;
int s_direct2indirect;
int s_indirect2direct;
- /* set up when it's ok for reiserfs_read_inode2() to read from
- disk inode with nlink==0. Currently this is only used during
- finish_unfinished() processing at mount time */
+
+ /*
+ * set up when it's ok for reiserfs_read_inode2() to read from
+ * disk inode with nlink==0. Currently this is only used during
+ * finish_unfinished() processing at mount time
+ */
int s_is_unlinked_ok;
+
reiserfs_proc_info_data_t s_proc_info_data;
struct proc_dir_entry *procdir;
- int reserved_blocks; /* amount of blocks reserved for further allocations */
- spinlock_t bitmap_lock; /* this lock on now only used to protect reserved_blocks variable */
+
+ /* amount of blocks reserved for further allocations */
+ int reserved_blocks;
+
+
+ /* this lock on now only used to protect reserved_blocks variable */
+ spinlock_t bitmap_lock;
struct dentry *priv_root; /* root of /.reiserfs_priv */
struct dentry *xattr_root; /* root of /.reiserfs_priv/xattrs */
int j_errno;
@@ -492,14 +621,13 @@ struct reiserfs_sb_info {
char *s_jdev; /* Stored jdev for mount option showing */
#ifdef CONFIG_REISERFS_CHECK
- struct tree_balance *cur_tb; /*
- * Detects whether more than one
- * copy of tb exists per superblock
- * as a means of checking whether
- * do_balance is executing concurrently
- * against another tree reader/writer
- * on a same mount point.
- */
+ /*
+ * Detects whether more than one copy of tb exists per superblock
+ * as a means of checking whether do_balance is executing
+ * concurrently against another tree reader/writer on a same
+ * mount point.
+ */
+ struct tree_balance *cur_tb;
#endif
};
@@ -508,25 +636,36 @@ struct reiserfs_sb_info {
#define REISERFS_3_6 1
#define REISERFS_OLD_FORMAT 2
-enum reiserfs_mount_options {
/* Mount options */
- REISERFS_LARGETAIL, /* large tails will be created in a session */
- REISERFS_SMALLTAIL, /* small (for files less than block size) tails will be created in a session */
- REPLAYONLY, /* replay journal and return 0. Use by fsck */
- REISERFS_CONVERT, /* -o conv: causes conversion of old
- format super block to the new
- format. If not specified - old
- partition will be dealt with in a
- manner of 3.5.x */
-
-/* -o hash={tea, rupasov, r5, detect} is meant for properly mounting
-** reiserfs disks from 3.5.19 or earlier. 99% of the time, this option
-** is not required. If the normal autodection code can't determine which
-** hash to use (because both hashes had the same value for a file)
-** use this option to force a specific hash. It won't allow you to override
-** the existing hash on the FS, so if you have a tea hash disk, and mount
-** with -o hash=rupasov, the mount will fail.
-*/
+enum reiserfs_mount_options {
+ /* large tails will be created in a session */
+ REISERFS_LARGETAIL,
+ /*
+ * small (for files less than block size) tails will
+ * be created in a session
+ */
+ REISERFS_SMALLTAIL,
+
+ /* replay journal and return 0. Use by fsck */
+ REPLAYONLY,
+
+ /*
+ * -o conv: causes conversion of old format super block to the
+ * new format. If not specified - old partition will be dealt
+ * with in a manner of 3.5.x
+ */
+ REISERFS_CONVERT,
+
+ /*
+ * -o hash={tea, rupasov, r5, detect} is meant for properly mounting
+ * reiserfs disks from 3.5.19 or earlier. 99% of the time, this
+ * option is not required. If the normal autodection code can't
+ * determine which hash to use (because both hashes had the same
+ * value for a file) use this option to force a specific hash.
+ * It won't allow you to override the existing hash on the FS, so
+ * if you have a tea hash disk, and mount with -o hash=rupasov,
+ * the mount will fail.
+ */
FORCE_TEA_HASH, /* try to force tea hash on mount */
FORCE_RUPASOV_HASH, /* try to force rupasov hash on mount */
FORCE_R5_HASH, /* try to force rupasov hash on mount */
@@ -536,9 +675,11 @@ enum reiserfs_mount_options {
REISERFS_DATA_ORDERED,
REISERFS_DATA_WRITEBACK,
-/* used for testing experimental features, makes benchmarking new
- features with and without more convenient, should never be used by
- users in any code shipped to users (ideally) */
+ /*
+ * used for testing experimental features, makes benchmarking new
+ * features with and without more convenient, should never be used by
+ * users in any code shipped to users (ideally)
+ */
REISERFS_NO_BORDER,
REISERFS_NO_UNHASHED_RELOCATION,
@@ -608,14 +749,6 @@ int reiserfs_resize(struct super_block *, unsigned long);
#define SB_DISK_JOURNAL_HEAD(s) (SB_JOURNAL(s)->j_header_bh->)
-/* A safe version of the "bdevname", which returns the "s_id" field of
- * a superblock or else "Null superblock" if the super block is NULL.
- */
-static inline char *reiserfs_bdevname(struct super_block *s)
-{
- return (s == NULL) ? "Null superblock" : s->s_id;
-}
-
#define reiserfs_is_journal_aborted(journal) (unlikely (__reiserfs_is_journal_aborted (journal)))
static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal
*journal)
@@ -713,28 +846,28 @@ static inline void reiserfs_cond_resched(struct super_block *s)
struct fid;
-/* in reading the #defines, it may help to understand that they employ
- the following abbreviations:
-
- B = Buffer
- I = Item header
- H = Height within the tree (should be changed to LEV)
- N = Number of the item in the node
- STAT = stat data
- DEH = Directory Entry Header
- EC = Entry Count
- E = Entry number
- UL = Unsigned Long
- BLKH = BLocK Header
- UNFM = UNForMatted node
- DC = Disk Child
- P = Path
-
- These #defines are named by concatenating these abbreviations,
- where first comes the arguments, and last comes the return value,
- of the macro.
-
-*/
+/*
+ * in reading the #defines, it may help to understand that they employ
+ * the following abbreviations:
+ *
+ * B = Buffer
+ * I = Item header
+ * H = Height within the tree (should be changed to LEV)
+ * N = Number of the item in the node
+ * STAT = stat data
+ * DEH = Directory Entry Header
+ * EC = Entry Count
+ * E = Entry number
+ * UL = Unsigned Long
+ * BLKH = BLocK Header
+ * UNFM = UNForMatted node
+ * DC = Disk Child
+ * P = Path
+ *
+ * These #defines are named by concatenating these abbreviations,
+ * where first comes the arguments, and last comes the return value,
+ * of the macro.
+ */
#define USE_INODE_GENERATION_COUNTER
@@ -745,14 +878,17 @@ struct fid;
/* n must be power of 2 */
#define _ROUND_UP(x,n) (((x)+(n)-1u) & ~((n)-1u))
-// to be ok for alpha and others we have to align structures to 8 byte
-// boundary.
-// FIXME: do not change 4 by anything else: there is code which relies on that
+/*
+ * to be ok for alpha and others we have to align structures to 8 byte
+ * boundary.
+ * FIXME: do not change 4 by anything else: there is code which relies on that
+ */
#define ROUND_UP(x) _ROUND_UP(x,8LL)
-/* debug levels. Right now, CONFIG_REISERFS_CHECK means print all debug
-** messages.
-*/
+/*
+ * debug levels. Right now, CONFIG_REISERFS_CHECK means print all debug
+ * messages.
+ */
#define REISERFS_DEBUG_CODE 5 /* extra messages to help find/debug errors */
void __reiserfs_warning(struct super_block *s, const char *id,
@@ -761,7 +897,7 @@ void __reiserfs_warning(struct super_block *s, const char *id,
__reiserfs_warning(s, id, __func__, fmt, ##args)
/* assertions handling */
-/** always check a condition and panic if it's false. */
+/* always check a condition and panic if it's false. */
#define __RASSERT(cond, scond, format, args...) \
do { \
if (!(cond)) \
@@ -784,35 +920,48 @@ do { \
* Disk Data Structures
*/
-/***************************************************************************/
-/* SUPER BLOCK */
-/***************************************************************************/
+/***************************************************************************
+ * SUPER BLOCK *
+ ***************************************************************************/
/*
- * Structure of super block on disk, a version of which in RAM is often accessed as REISERFS_SB(s)->s_rs
- * the version in RAM is part of a larger structure containing fields never written to disk.
+ * Structure of super block on disk, a version of which in RAM is often
+ * accessed as REISERFS_SB(s)->s_rs. The version in RAM is part of a larger
+ * structure containing fields never written to disk.
*/
-#define UNSET_HASH 0 // read_super will guess about, what hash names
- // in directories were sorted with
+#define UNSET_HASH 0 /* Detect hash on disk */
#define TEA_HASH 1
#define YURA_HASH 2
#define R5_HASH 3
#define DEFAULT_HASH R5_HASH
struct journal_params {
- __le32 jp_journal_1st_block; /* where does journal start from on its
- * device */
- __le32 jp_journal_dev; /* journal device st_rdev */
- __le32 jp_journal_size; /* size of the journal */
- __le32 jp_journal_trans_max; /* max number of blocks in a transaction. */
- __le32 jp_journal_magic; /* random value made on fs creation (this
- * was sb_journal_block_count) */
- __le32 jp_journal_max_batch; /* max number of blocks to batch into a
- * trans */
- __le32 jp_journal_max_commit_age; /* in seconds, how old can an async
- * commit be */
- __le32 jp_journal_max_trans_age; /* in seconds, how old can a transaction
- * be */
+ /* where does journal start from on its * device */
+ __le32 jp_journal_1st_block;
+
+ /* journal device st_rdev */
+ __le32 jp_journal_dev;
+
+ /* size of the journal */
+ __le32 jp_journal_size;
+
+ /* max number of blocks in a transaction. */
+ __le32 jp_journal_trans_max;
+
+ /*
+ * random value made on fs creation
+ * (this was sb_journal_block_count)
+ */
+ __le32 jp_journal_magic;
+
+ /* max number of blocks to batch into a trans */
+ __le32 jp_journal_max_batch;
+
+ /* in seconds, how old can an async commit be */
+ __le32 jp_journal_max_commit_age;
+
+ /* in seconds, how old can a transaction be */
+ __le32 jp_journal_max_trans_age;
};
/* this is the super from 3.5.X, where X >= 10 */
@@ -822,26 +971,48 @@ struct reiserfs_super_block_v1 {
__le32 s_root_block; /* root block number */
struct journal_params s_journal;
__le16 s_blocksize; /* block size */
- __le16 s_oid_maxsize; /* max size of object id array, see
- * get_objectid() commentary */
+
+ /* max size of object id array, see get_objectid() commentary */
+ __le16 s_oid_maxsize;
__le16 s_oid_cursize; /* current size of object id array */
- __le16 s_umount_state; /* this is set to 1 when filesystem was
- * umounted, to 2 - when not */
- char s_magic[10]; /* reiserfs magic string indicates that
- * file system is reiserfs:
- * "ReIsErFs" or "ReIsEr2Fs" or "ReIsEr3Fs" */
- __le16 s_fs_state; /* it is set to used by fsck to mark which
- * phase of rebuilding is done */
- __le32 s_hash_function_code; /* indicate, what hash function is being use
- * to sort names in a directory*/
+
+ /* this is set to 1 when filesystem was umounted, to 2 - when not */
+ __le16 s_umount_state;
+
+ /*
+ * reiserfs magic string indicates that file system is reiserfs:
+ * "ReIsErFs" or "ReIsEr2Fs" or "ReIsEr3Fs"
+ */
+ char s_magic[10];
+
+ /*
+ * it is set to used by fsck to mark which
+ * phase of rebuilding is done
+ */
+ __le16 s_fs_state;
+ /*
+ * indicate, what hash function is being use
+ * to sort names in a directory
+ */
+ __le32 s_hash_function_code;
__le16 s_tree_height; /* height of disk tree */
- __le16 s_bmap_nr; /* amount of bitmap blocks needed to address
- * each block of file system */
- __le16 s_version; /* this field is only reliable on filesystem
- * with non-standard journal */
- __le16 s_reserved_for_journal; /* size in blocks of journal area on main
- * device, we need to keep after
- * making fs with non-standard journal */
+
+ /*
+ * amount of bitmap blocks needed to address
+ * each block of file system
+ */
+ __le16 s_bmap_nr;
+
+ /*
+ * this field is only reliable on filesystem with non-standard journal
+ */
+ __le16 s_version;
+
+ /*
+ * size in blocks of journal area on main device, we need to
+ * keep after making fs with non-standard journal
+ */
+ __le16 s_reserved_for_journal;
} __attribute__ ((__packed__));
#define SB_SIZE_V1 (sizeof(struct reiserfs_super_block_v1))
@@ -850,17 +1021,21 @@ struct reiserfs_super_block_v1 {
struct reiserfs_super_block {
struct reiserfs_super_block_v1 s_v1;
__le32 s_inode_generation;
- __le32 s_flags; /* Right now used only by inode-attributes, if enabled */
+
+ /* Right now used only by inode-attributes, if enabled */
+ __le32 s_flags;
+
unsigned char s_uuid[16]; /* filesystem unique identifier */
unsigned char s_label[16]; /* filesystem volume label */
__le16 s_mnt_count; /* Count of mounts since last fsck */
__le16 s_max_mnt_count; /* Maximum mounts before check */
__le32 s_lastcheck; /* Timestamp of last fsck */
__le32 s_check_interval; /* Interval between checks */
- char s_unused[76]; /* zero filled by mkreiserfs and
- * reiserfs_convert_objectid_map_v1()
- * so any additions must be updated
- * there as well. */
+
+ /*
+ * zero filled by mkreiserfs and reiserfs_convert_objectid_map_v1()
+ * so any additions must be updated there as well. */
+ char s_unused[76];
} __attribute__ ((__packed__));
#define SB_SIZE (sizeof(struct reiserfs_super_block))
@@ -868,7 +1043,7 @@ struct reiserfs_super_block {
#define REISERFS_VERSION_1 0
#define REISERFS_VERSION_2 2
-// on-disk super block fields converted to cpu form
+/* on-disk super block fields converted to cpu form */
#define SB_DISK_SUPER_BLOCK(s) (REISERFS_SB(s)->s_rs)
#define SB_V1_DISK_SUPER_BLOCK(s) (&(SB_DISK_SUPER_BLOCK(s)->s_v1))
#define SB_BLOCKSIZE(s) \
@@ -923,11 +1098,13 @@ int is_reiserfs_3_5(struct reiserfs_super_block *rs);
int is_reiserfs_3_6(struct reiserfs_super_block *rs);
int is_reiserfs_jr(struct reiserfs_super_block *rs);
-/* ReiserFS leaves the first 64k unused, so that partition labels have
- enough space. If someone wants to write a fancy bootloader that
- needs more than 64k, let us know, and this will be increased in size.
- This number must be larger than than the largest block size on any
- platform, or code will break. -Hans */
+/*
+ * ReiserFS leaves the first 64k unused, so that partition labels have
+ * enough space. If someone wants to write a fancy bootloader that
+ * needs more than 64k, let us know, and this will be increased in size.
+ * This number must be larger than than the largest block size on any
+ * platform, or code will break. -Hans
+ */
#define REISERFS_DISK_OFFSET_IN_BYTES (64 * 1024)
#define REISERFS_FIRST_BLOCK unused_define
#define REISERFS_JOURNAL_OFFSET_IN_BYTES REISERFS_DISK_OFFSET_IN_BYTES
@@ -952,8 +1129,7 @@ struct unfm_nodeinfo {
unsigned short unfm_freespace;
};
-/* there are two formats of keys: 3.5 and 3.6
- */
+/* there are two formats of keys: 3.5 and 3.6 */
#define KEY_FORMAT_3_5 0
#define KEY_FORMAT_3_6 1
@@ -971,8 +1147,10 @@ static inline struct reiserfs_sb_info *REISERFS_SB(const struct super_block *sb)
return sb->s_fs_info;
}
-/* Don't trust REISERFS_SB(sb)->s_bmap_nr, it's a u16
- * which overflows on large file systems. */
+/*
+ * Don't trust REISERFS_SB(sb)->s_bmap_nr, it's a u16
+ * which overflows on large file systems.
+ */
static inline __u32 reiserfs_bmap_count(struct super_block *sb)
{
return (SB_BLOCK_COUNT(sb) - 1) / (sb->s_blocksize * 8) + 1;
@@ -983,8 +1161,10 @@ static inline int bmap_would_wrap(unsigned bmap_nr)
return bmap_nr > ((1LL << 16) - 1);
}
-/** this says about version of key of all items (but stat data) the
- object consists of */
+/*
+ * this says about version of key of all items (but stat data) the
+ * object consists of
+ */
#define get_inode_item_key_version( inode ) \
((REISERFS_I(inode)->i_flags & i_item_key_version_mask) ? KEY_FORMAT_3_6 : KEY_FORMAT_3_5)
@@ -1003,16 +1183,18 @@ static inline int bmap_would_wrap(unsigned bmap_nr)
else \
REISERFS_I(inode)->i_flags &= ~i_stat_data_version_mask; })
-/* This is an aggressive tail suppression policy, I am hoping it
- improves our benchmarks. The principle behind it is that percentage
- space saving is what matters, not absolute space saving. This is
- non-intuitive, but it helps to understand it if you consider that the
- cost to access 4 blocks is not much more than the cost to access 1
- block, if you have to do a seek and rotate. A tail risks a
- non-linear disk access that is significant as a percentage of total
- time cost for a 4 block file and saves an amount of space that is
- less significant as a percentage of space, or so goes the hypothesis.
- -Hans */
+/*
+ * This is an aggressive tail suppression policy, I am hoping it
+ * improves our benchmarks. The principle behind it is that percentage
+ * space saving is what matters, not absolute space saving. This is
+ * non-intuitive, but it helps to understand it if you consider that the
+ * cost to access 4 blocks is not much more than the cost to access 1
+ * block, if you have to do a seek and rotate. A tail risks a
+ * non-linear disk access that is significant as a percentage of total
+ * time cost for a 4 block file and saves an amount of space that is
+ * less significant as a percentage of space, or so goes the hypothesis.
+ * -Hans
+ */
#define STORE_TAIL_IN_UNFM_S1(n_file_size,n_tail_size,n_block_size) \
(\
(!(n_tail_size)) || \
@@ -1026,10 +1208,11 @@ static inline int bmap_would_wrap(unsigned bmap_nr)
( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size) * 3)/4) ) ) \
)
-/* Another strategy for tails, this one means only create a tail if all the
- file would fit into one DIRECT item.
- Primary intention for this one is to increase performance by decreasing
- seeking.
+/*
+ * Another strategy for tails, this one means only create a tail if all the
+ * file would fit into one DIRECT item.
+ * Primary intention for this one is to increase performance by decreasing
+ * seeking.
*/
#define STORE_TAIL_IN_UNFM_S2(n_file_size,n_tail_size,n_block_size) \
(\
@@ -1043,23 +1226,21 @@ static inline int bmap_would_wrap(unsigned bmap_nr)
#define REISERFS_VALID_FS 1
#define REISERFS_ERROR_FS 2
-//
-// there are 5 item types currently
-//
+/*
+ * there are 5 item types currently
+ */
#define TYPE_STAT_DATA 0
#define TYPE_INDIRECT 1
#define TYPE_DIRECT 2
#define TYPE_DIRENTRY 3
#define TYPE_MAXTYPE 3
-#define TYPE_ANY 15 // FIXME: comment is required
+#define TYPE_ANY 15 /* FIXME: comment is required */
-/***************************************************************************/
-/* KEY & ITEM HEAD */
-/***************************************************************************/
+/***************************************************************************
+ * KEY & ITEM HEAD *
+ ***************************************************************************/
-//
-// directories use this key as well as old files
-//
+/* * directories use this key as well as old files */
struct offset_v1 {
__le32 k_offset;
__le32 k_uniqueness;
@@ -1092,11 +1273,14 @@ static inline void set_offset_v2_k_offset(struct offset_v2 *v2, loff_t offset)
v2->v = (v2->v & cpu_to_le64(15ULL << 60)) | cpu_to_le64(offset);
}
-/* Key of an item determines its location in the S+tree, and
- is composed of 4 components */
+/*
+ * Key of an item determines its location in the S+tree, and
+ * is composed of 4 components
+ */
struct reiserfs_key {
- __le32 k_dir_id; /* packing locality: by default parent
- directory object id */
+ /* packing locality: by default parent directory object id */
+ __le32 k_dir_id;
+
__le32 k_objectid; /* object identifier */
union {
struct offset_v1 k_offset_v1;
@@ -1105,8 +1289,8 @@ struct reiserfs_key {
} __attribute__ ((__packed__));
struct in_core_key {
- __u32 k_dir_id; /* packing locality: by default parent
- directory object id */
+ /* packing locality: by default parent directory object id */
+ __u32 k_dir_id;
__u32 k_objectid; /* object identifier */
__u64 k_offset;
__u8 k_type;
@@ -1115,14 +1299,16 @@ struct in_core_key {
struct cpu_key {
struct in_core_key on_disk_key;
int version;
- int key_length; /* 3 in all cases but direct2indirect and
- indirect2direct conversion */
+ /* 3 in all cases but direct2indirect and indirect2direct conversion */
+ int key_length;
};
-/* Our function for comparing keys can compare keys of different
- lengths. It takes as a parameter the length of the keys it is to
- compare. These defines are used in determining what is to be passed
- to it as that parameter. */
+/*
+ * Our function for comparing keys can compare keys of different
+ * lengths. It takes as a parameter the length of the keys it is to
+ * compare. These defines are used in determining what is to be passed
+ * to it as that parameter.
+ */
#define REISERFS_FULL_KEY_LEN 4
#define REISERFS_SHORT_KEY_LEN 2
@@ -1151,40 +1337,52 @@ struct cpu_key {
#define POSITION_FOUND 1
#define POSITION_NOT_FOUND 0
-// return values for reiserfs_find_entry and search_by_entry_key
+/* return values for reiserfs_find_entry and search_by_entry_key */
#define NAME_FOUND 1
#define NAME_NOT_FOUND 0
#define GOTO_PREVIOUS_ITEM 2
#define NAME_FOUND_INVISIBLE 3
-/* Everything in the filesystem is stored as a set of items. The
- item head contains the key of the item, its free space (for
- indirect items) and specifies the location of the item itself
- within the block. */
+/*
+ * Everything in the filesystem is stored as a set of items. The
+ * item head contains the key of the item, its free space (for
+ * indirect items) and specifies the location of the item itself
+ * within the block.
+ */
struct item_head {
- /* Everything in the tree is found by searching for it based on
- * its key.*/
+ /*
+ * Everything in the tree is found by searching for it based on
+ * its key.
+ */
struct reiserfs_key ih_key;
union {
- /* The free space in the last unformatted node of an
- indirect item if this is an indirect item. This
- equals 0xFFFF iff this is a direct item or stat data
- item. Note that the key, not this field, is used to
- determine the item type, and thus which field this
- union contains. */
+ /*
+ * The free space in the last unformatted node of an
+ * indirect item if this is an indirect item. This
+ * equals 0xFFFF iff this is a direct item or stat data
+ * item. Note that the key, not this field, is used to
+ * determine the item type, and thus which field this
+ * union contains.
+ */
__le16 ih_free_space_reserved;
- /* Iff this is a directory item, this field equals the
- number of directory entries in the directory item. */
+
+ /*
+ * Iff this is a directory item, this field equals the
+ * number of directory entries in the directory item.
+ */
__le16 ih_entry_count;
} __attribute__ ((__packed__)) u;
__le16 ih_item_len; /* total size of the item body */
- __le16 ih_item_location; /* an offset to the item body
- * within the block */
- __le16 ih_version; /* 0 for all old items, 2 for new
- ones. Highest bit is set by fsck
- temporary, cleaned after all
- done */
+
+ /* an offset to the item body within the block */
+ __le16 ih_item_location;
+
+ /*
+ * 0 for all old items, 2 for new ones. Highest bit is set by fsck
+ * temporary, cleaned after all done
+ */
+ __le16 ih_version;
} __attribute__ ((__packed__));
/* size of item header */
#define IH_SIZE (sizeof(struct item_head))
@@ -1206,27 +1404,24 @@ struct item_head {
#define get_ih_free_space(ih) (ih_version (ih) == KEY_FORMAT_3_6 ? 0 : ih_free_space (ih))
#define set_ih_free_space(ih,val) put_ih_free_space((ih), ((ih_version(ih) == KEY_FORMAT_3_6) ? 0 : (val)))
-/* these operate on indirect items, where you've got an array of ints
-** at a possibly unaligned location. These are a noop on ia32
-**
-** p is the array of __u32, i is the index into the array, v is the value
-** to store there.
-*/
+/*
+ * these operate on indirect items, where you've got an array of ints
+ * at a possibly unaligned location. These are a noop on ia32
+ *
+ * p is the array of __u32, i is the index into the array, v is the value
+ * to store there.
+ */
#define get_block_num(p, i) get_unaligned_le32((p) + (i))
#define put_block_num(p, i, v) put_unaligned_le32((v), (p) + (i))
-//
-// in old version uniqueness field shows key type
-//
+/* * in old version uniqueness field shows key type */
#define V1_SD_UNIQUENESS 0
#define V1_INDIRECT_UNIQUENESS 0xfffffffe
#define V1_DIRECT_UNIQUENESS 0xffffffff
#define V1_DIRENTRY_UNIQUENESS 500
-#define V1_ANY_UNIQUENESS 555 // FIXME: comment is required
+#define V1_ANY_UNIQUENESS 555 /* FIXME: comment is required */
-//
-// here are conversion routines
-//
+/* here are conversion routines */
static inline int uniqueness2type(__u32 uniqueness) CONSTF;
static inline int uniqueness2type(__u32 uniqueness)
{
@@ -1263,11 +1458,11 @@ static inline __u32 type2uniqueness(int type)
}
}
-//
-// key is pointer to on disk key which is stored in le, result is cpu,
-// there is no way to get version of object from key, so, provide
-// version to these defines
-//
+/*
+ * key is pointer to on disk key which is stored in le, result is cpu,
+ * there is no way to get version of object from key, so, provide
+ * version to these defines
+ */
static inline loff_t le_key_k_offset(int version,
const struct reiserfs_key *key)
{
@@ -1283,9 +1478,11 @@ static inline loff_t le_ih_k_offset(const struct item_head *ih)
static inline loff_t le_key_k_type(int version, const struct reiserfs_key *key)
{
- return (version == KEY_FORMAT_3_5) ?
- uniqueness2type(le32_to_cpu(key->u.k_offset_v1.k_uniqueness)) :
- offset_v2_k_type(&(key->u.k_offset_v2));
+ if (version == KEY_FORMAT_3_5) {
+ loff_t val = le32_to_cpu(key->u.k_offset_v1.k_uniqueness);
+ return uniqueness2type(val);
+ } else
+ return offset_v2_k_type(&(key->u.k_offset_v2));
}
static inline loff_t le_ih_k_type(const struct item_head *ih)
@@ -1296,8 +1493,22 @@ static inline loff_t le_ih_k_type(const struct item_head *ih)
static inline void set_le_key_k_offset(int version, struct reiserfs_key *key,
loff_t offset)
{
- (version == KEY_FORMAT_3_5) ? (void)(key->u.k_offset_v1.k_offset = cpu_to_le32(offset)) : /* jdm check */
- (void)(set_offset_v2_k_offset(&(key->u.k_offset_v2), offset));
+ if (version == KEY_FORMAT_3_5)
+ key->u.k_offset_v1.k_offset = cpu_to_le32(offset);
+ else
+ set_offset_v2_k_offset(&key->u.k_offset_v2, offset);
+}
+
+static inline void add_le_key_k_offset(int version, struct reiserfs_key *key,
+ loff_t offset)
+{
+ set_le_key_k_offset(version, key,
+ le_key_k_offset(version, key) + offset);
+}
+
+static inline void add_le_ih_k_offset(struct item_head *ih, loff_t offset)
+{
+ add_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset);
}
static inline void set_le_ih_k_offset(struct item_head *ih, loff_t offset)
@@ -1308,10 +1519,11 @@ static inline void set_le_ih_k_offset(struct item_head *ih, loff_t offset)
static inline void set_le_key_k_type(int version, struct reiserfs_key *key,
int type)
{
- (version == KEY_FORMAT_3_5) ?
- (void)(key->u.k_offset_v1.k_uniqueness =
- cpu_to_le32(type2uniqueness(type)))
- : (void)(set_offset_v2_k_type(&(key->u.k_offset_v2), type));
+ if (version == KEY_FORMAT_3_5) {
+ type = type2uniqueness(type);
+ key->u.k_offset_v1.k_uniqueness = cpu_to_le32(type);
+ } else
+ set_offset_v2_k_type(&key->u.k_offset_v2, type);
}
static inline void set_le_ih_k_type(struct item_head *ih, int type)
@@ -1339,9 +1551,7 @@ static inline int is_statdata_le_key(int version, struct reiserfs_key *key)
return le_key_k_type(version, key) == TYPE_STAT_DATA;
}
-//
-// item header has version.
-//
+/* item header has version. */
static inline int is_direntry_le_ih(struct item_head *ih)
{
return is_direntry_le_key(ih_version(ih), &ih->ih_key);
@@ -1362,9 +1572,7 @@ static inline int is_statdata_le_ih(struct item_head *ih)
return is_statdata_le_key(ih_version(ih), &ih->ih_key);
}
-//
-// key is pointer to cpu key, result is cpu
-//
+/* key is pointer to cpu key, result is cpu */
static inline loff_t cpu_key_k_offset(const struct cpu_key *key)
{
return key->on_disk_key.k_offset;
@@ -1415,7 +1623,7 @@ static inline void cpu_key_k_offset_dec(struct cpu_key *key)
extern struct reiserfs_key root_key;
-/*
+/*
* Picture represents a leaf of the S+tree
* ______________________________________________________
* | | Array of | | |
@@ -1424,15 +1632,19 @@ extern struct reiserfs_key root_key;
* |______|_______________|___________________|___________|
*/
-/* Header of a disk block. More precisely, header of a formatted leaf
- or internal node, and not the header of an unformatted node. */
+/*
+ * Header of a disk block. More precisely, header of a formatted leaf
+ * or internal node, and not the header of an unformatted node.
+ */
struct block_head {
__le16 blk_level; /* Level of a block in the tree. */
__le16 blk_nr_item; /* Number of keys/items in a block. */
__le16 blk_free_space; /* Block free space in bytes. */
__le16 blk_reserved;
/* dump this in v4/planA */
- struct reiserfs_key blk_right_delim_key; /* kept only for compatibility */
+
+ /* kept only for compatibility */
+ struct reiserfs_key blk_right_delim_key;
};
#define BLKH_SIZE (sizeof(struct block_head))
@@ -1447,18 +1659,20 @@ struct block_head {
#define blkh_right_delim_key(p_blkh) ((p_blkh)->blk_right_delim_key)
#define set_blkh_right_delim_key(p_blkh,val) ((p_blkh)->blk_right_delim_key = val)
+/* values for blk_level field of the struct block_head */
+
/*
- * values for blk_level field of the struct block_head
+ * When node gets removed from the tree its blk_level is set to FREE_LEVEL.
+ * It is then used to see whether the node is still in the tree
*/
-
-#define FREE_LEVEL 0 /* when node gets removed from the tree its
- blk_level is set to FREE_LEVEL. It is then
- used to see whether the node is still in the
- tree */
+#define FREE_LEVEL 0
#define DISK_LEAF_NODE_LEVEL 1 /* Leaf node level. */
-/* Given the buffer head of a formatted node, resolve to the block head of that node. */
+/*
+ * Given the buffer head of a formatted node, resolve to the
+ * block head of that node.
+ */
#define B_BLK_HEAD(bh) ((struct block_head *)((bh)->b_data))
/* Number of items that are in buffer. */
#define B_NR_ITEMS(bh) (blkh_nr_item(B_BLK_HEAD(bh)))
@@ -1479,14 +1693,14 @@ struct block_head {
#define B_IS_KEYS_LEVEL(bh) (B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL \
&& B_LEVEL(bh) <= MAX_HEIGHT)
-/***************************************************************************/
-/* STAT DATA */
-/***************************************************************************/
+/***************************************************************************
+ * STAT DATA *
+ ***************************************************************************/
-//
-// old stat data is 32 bytes long. We are going to distinguish new one by
-// different size
-//
+/*
+ * old stat data is 32 bytes long. We are going to distinguish new one by
+ * different size
+*/
struct stat_data_v1 {
__le16 sd_mode; /* file type, permissions */
__le16 sd_nlink; /* number of hard links */
@@ -1495,20 +1709,25 @@ struct stat_data_v1 {
__le32 sd_size; /* file size */
__le32 sd_atime; /* time of last access */
__le32 sd_mtime; /* time file was last modified */
- __le32 sd_ctime; /* time inode (stat data) was last changed (except changes to sd_atime and sd_mtime) */
+
+ /*
+ * time inode (stat data) was last changed
+ * (except changes to sd_atime and sd_mtime)
+ */
+ __le32 sd_ctime;
union {
__le32 sd_rdev;
__le32 sd_blocks; /* number of blocks file uses */
} __attribute__ ((__packed__)) u;
- __le32 sd_first_direct_byte; /* first byte of file which is stored
- in a direct item: except that if it
- equals 1 it is a symlink and if it
- equals ~(__u32)0 there is no
- direct item. The existence of this
- field really grates on me. Let's
- replace it with a macro based on
- sd_size and our tail suppression
- policy. Someday. -Hans */
+
+ /*
+ * first byte of file which is stored in a direct item: except that if
+ * it equals 1 it is a symlink and if it equals ~(__u32)0 there is no
+ * direct item. The existence of this field really grates on me.
+ * Let's replace it with a macro based on sd_size and our tail
+ * suppression policy. Someday. -Hans
+ */
+ __le32 sd_first_direct_byte;
} __attribute__ ((__packed__));
#define SD_V1_SIZE (sizeof(struct stat_data_v1))
@@ -1540,8 +1759,10 @@ struct stat_data_v1 {
/* inode flags stored in sd_attrs (nee sd_reserved) */
-/* we want common flags to have the same values as in ext2,
- so chattr(1) will work without problems */
+/*
+ * we want common flags to have the same values as in ext2,
+ * so chattr(1) will work without problems
+ */
#define REISERFS_IMMUTABLE_FL FS_IMMUTABLE_FL
#define REISERFS_APPEND_FL FS_APPEND_FL
#define REISERFS_SYNC_FL FS_SYNC_FL
@@ -1561,8 +1782,10 @@ struct stat_data_v1 {
REISERFS_COMPR_FL | \
REISERFS_NOTAIL_FL )
-/* Stat Data on disk (reiserfs version of UFS disk inode minus the
- address blocks) */
+/*
+ * Stat Data on disk (reiserfs version of UFS disk inode minus the
+ * address blocks)
+ */
struct stat_data {
__le16 sd_mode; /* file type, permissions */
__le16 sd_attrs; /* persistent inode flags */
@@ -1572,25 +1795,20 @@ struct stat_data {
__le32 sd_gid; /* group */
__le32 sd_atime; /* time of last access */
__le32 sd_mtime; /* time file was last modified */
- __le32 sd_ctime; /* time inode (stat data) was last changed (except changes to sd_atime and sd_mtime) */
+
+ /*
+ * time inode (stat data) was last changed
+ * (except changes to sd_atime and sd_mtime)
+ */
+ __le32 sd_ctime;
__le32 sd_blocks;
union {
__le32 sd_rdev;
__le32 sd_generation;
- //__le32 sd_first_direct_byte;
- /* first byte of file which is stored in a
- direct item: except that if it equals 1
- it is a symlink and if it equals
- ~(__u32)0 there is no direct item. The
- existence of this field really grates
- on me. Let's replace it with a macro
- based on sd_size and our tail
- suppression policy? */
} __attribute__ ((__packed__)) u;
} __attribute__ ((__packed__));
-//
-// this is 44 bytes long
-//
+
+/* this is 44 bytes long */
#define SD_SIZE (sizeof(struct stat_data))
#define SD_V2_SIZE SD_SIZE
#define stat_data_v2(ih) (ih_version (ih) == KEY_FORMAT_3_6)
@@ -1621,48 +1839,61 @@ struct stat_data {
#define sd_v2_attrs(sdp) (le16_to_cpu((sdp)->sd_attrs))
#define set_sd_v2_attrs(sdp,v) ((sdp)->sd_attrs = cpu_to_le16(v))
-/***************************************************************************/
-/* DIRECTORY STRUCTURE */
-/***************************************************************************/
-/*
- Picture represents the structure of directory items
- ________________________________________________
- | Array of | | | | | |
- | directory |N-1| N-2 | .... | 1st |0th|
- | entry headers | | | | | |
- |_______________|___|_____|________|_______|___|
- <---- directory entries ------>
-
- First directory item has k_offset component 1. We store "." and ".."
- in one item, always, we never split "." and ".." into differing
- items. This makes, among other things, the code for removing
- directories simpler. */
+/***************************************************************************
+ * DIRECTORY STRUCTURE *
+ ***************************************************************************/
+/*
+ * Picture represents the structure of directory items
+ * ________________________________________________
+ * | Array of | | | | | |
+ * | directory |N-1| N-2 | .... | 1st |0th|
+ * | entry headers | | | | | |
+ * |_______________|___|_____|________|_______|___|
+ * <---- directory entries ------>
+ *
+ * First directory item has k_offset component 1. We store "." and ".."
+ * in one item, always, we never split "." and ".." into differing
+ * items. This makes, among other things, the code for removing
+ * directories simpler.
+ */
#define SD_OFFSET 0
#define SD_UNIQUENESS 0
#define DOT_OFFSET 1
#define DOT_DOT_OFFSET 2
#define DIRENTRY_UNIQUENESS 500
-/* */
#define FIRST_ITEM_OFFSET 1
/*
- Q: How to get key of object pointed to by entry from entry?
-
- A: Each directory entry has its header. This header has deh_dir_id and deh_objectid fields, those are key
- of object, entry points to */
+ * Q: How to get key of object pointed to by entry from entry?
+ *
+ * A: Each directory entry has its header. This header has deh_dir_id
+ * and deh_objectid fields, those are key of object, entry points to
+ */
-/* NOT IMPLEMENTED:
- Directory will someday contain stat data of object */
+/*
+ * NOT IMPLEMENTED:
+ * Directory will someday contain stat data of object
+ */
struct reiserfs_de_head {
__le32 deh_offset; /* third component of the directory entry key */
- __le32 deh_dir_id; /* objectid of the parent directory of the object, that is referenced
- by directory entry */
- __le32 deh_objectid; /* objectid of the object, that is referenced by directory entry */
+
+ /*
+ * objectid of the parent directory of the object, that is referenced
+ * by directory entry
+ */
+ __le32 deh_dir_id;
+
+ /* objectid of the object, that is referenced by directory entry */
+ __le32 deh_objectid;
__le16 deh_location; /* offset of name in the whole item */
- __le16 deh_state; /* whether 1) entry contains stat data (for future), and 2) whether
- entry is hidden (unlinked) */
+
+ /*
+ * whether 1) entry contains stat data (for future), and
+ * 2) whether entry is hidden (unlinked)
+ */
+ __le16 deh_state;
} __attribute__ ((__packed__));
#define DEH_SIZE sizeof(struct reiserfs_de_head)
#define deh_offset(p_deh) (le32_to_cpu((p_deh)->deh_offset))
@@ -1692,9 +1923,11 @@ struct reiserfs_de_head {
# define ADDR_UNALIGNED_BITS (3)
#endif
-/* These are only used to manipulate deh_state.
+/*
+ * These are only used to manipulate deh_state.
* Because of this, we'll use the ext2_ bit routines,
- * since they are little endian */
+ * since they are little endian
+ */
#ifdef ADDR_UNALIGNED_BITS
# define aligned_address(addr) ((void *)((long)(addr) & ~((1UL << ADDR_UNALIGNED_BITS) - 1)))
@@ -1729,46 +1962,16 @@ extern void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid,
extern void make_empty_dir_item(char *body, __le32 dirid, __le32 objid,
__le32 par_dirid, __le32 par_objid);
-/* array of the entry headers */
- /* get item body */
-#define B_I_PITEM(bh,ih) ( (bh)->b_data + ih_location(ih) )
-#define B_I_DEH(bh,ih) ((struct reiserfs_de_head *)(B_I_PITEM(bh,ih)))
-
-/* length of the directory entry in directory item. This define
- calculates length of i-th directory entry using directory entry
- locations from dir entry head. When it calculates length of 0-th
- directory entry, it uses length of whole item in place of entry
- location of the non-existent following entry in the calculation.
- See picture above.*/
-/*
-#define I_DEH_N_ENTRY_LENGTH(ih,deh,i) \
-((i) ? (deh_location((deh)-1) - deh_location((deh))) : (ih_item_len((ih)) - deh_location((deh))))
-*/
-static inline int entry_length(const struct buffer_head *bh,
- const struct item_head *ih, int pos_in_item)
-{
- struct reiserfs_de_head *deh;
-
- deh = B_I_DEH(bh, ih) + pos_in_item;
- if (pos_in_item)
- return deh_location(deh - 1) - deh_location(deh);
-
- return ih_item_len(ih) - deh_location(deh);
-}
-
-/* number of entries in the directory item, depends on ENTRY_COUNT being at the start of directory dynamic data. */
-#define I_ENTRY_COUNT(ih) (ih_entry_count((ih)))
-
-/* name by bh, ih and entry_num */
-#define B_I_E_NAME(bh,ih,entry_num) ((char *)(bh->b_data + ih_location(ih) + deh_location(B_I_DEH(bh,ih)+(entry_num))))
-
-// two entries per block (at least)
+/* two entries per block (at least) */
#define REISERFS_MAX_NAME(block_size) 255
-/* this structure is used for operations on directory entries. It is
- not a disk structure. */
-/* When reiserfs_find_entry or search_by_entry_key find directory
- entry, they return filled reiserfs_dir_entry structure */
+/*
+ * this structure is used for operations on directory entries. It is
+ * not a disk structure.
+ *
+ * When reiserfs_find_entry or search_by_entry_key find directory
+ * entry, they return filled reiserfs_dir_entry structure
+ */
struct reiserfs_dir_entry {
struct buffer_head *de_bh;
int de_item_num;
@@ -1786,10 +1989,14 @@ struct reiserfs_dir_entry {
struct cpu_key de_entry_key;
};
-/* these defines are useful when a particular member of a reiserfs_dir_entry is needed */
+/*
+ * these defines are useful when a particular member of
+ * a reiserfs_dir_entry is needed
+ */
/* pointer to file name, stored in entry */
-#define B_I_DEH_ENTRY_FILE_NAME(bh,ih,deh) (B_I_PITEM (bh, ih) + deh_location(deh))
+#define B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh) \
+ (ih_item_body(bh, ih) + deh_location(deh))
/* length of name */
#define I_DEH_N_ENTRY_FILE_NAME_LENGTH(ih,deh,entry_num) \
@@ -1812,11 +2019,13 @@ struct reiserfs_dir_entry {
* |______|_______________|___________________|___________|
*/
-/***************************************************************************/
-/* DISK CHILD */
-/***************************************************************************/
-/* Disk child pointer: The pointer from an internal node of the tree
- to a node that is on disk. */
+/***************************************************************************
+ * DISK CHILD *
+ ***************************************************************************/
+/*
+ * Disk child pointer:
+ * The pointer from an internal node of the tree to a node that is on disk.
+ */
struct disk_child {
__le32 dc_block_number; /* Disk child's block number. */
__le16 dc_size; /* Disk child's used space. */
@@ -1849,47 +2058,66 @@ struct disk_child {
#define MAX_NR_KEY(bh) ( (MAX_CHILD_SIZE(bh)-DC_SIZE)/(KEY_SIZE+DC_SIZE) )
#define MIN_NR_KEY(bh) (MAX_NR_KEY(bh)/2)
-/***************************************************************************/
-/* PATH STRUCTURES AND DEFINES */
-/***************************************************************************/
+/***************************************************************************
+ * PATH STRUCTURES AND DEFINES *
+ ***************************************************************************/
-/* Search_by_key fills up the path from the root to the leaf as it descends the tree looking for the
- key. It uses reiserfs_bread to try to find buffers in the cache given their block number. If it
- does not find them in the cache it reads them from disk. For each node search_by_key finds using
- reiserfs_bread it then uses bin_search to look through that node. bin_search will find the
- position of the block_number of the next node if it is looking through an internal node. If it
- is looking through a leaf node bin_search will find the position of the item which has key either
- equal to given key, or which is the maximal key less than the given key. */
+/*
+ * search_by_key fills up the path from the root to the leaf as it descends
+ * the tree looking for the key. It uses reiserfs_bread to try to find
+ * buffers in the cache given their block number. If it does not find
+ * them in the cache it reads them from disk. For each node search_by_key
+ * finds using reiserfs_bread it then uses bin_search to look through that
+ * node. bin_search will find the position of the block_number of the next
+ * node if it is looking through an internal node. If it is looking through
+ * a leaf node bin_search will find the position of the item which has key
+ * either equal to given key, or which is the maximal key less than the
+ * given key.
+ */
struct path_element {
- struct buffer_head *pe_buffer; /* Pointer to the buffer at the path in the tree. */
- int pe_position; /* Position in the tree node which is placed in the */
- /* buffer above. */
+ /* Pointer to the buffer at the path in the tree. */
+ struct buffer_head *pe_buffer;
+ /* Position in the tree node which is placed in the buffer above. */
+ int pe_position;
};
-#define MAX_HEIGHT 5 /* maximal height of a tree. don't change this without changing JOURNAL_PER_BALANCE_CNT */
-#define EXTENDED_MAX_HEIGHT 7 /* Must be equals MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */
-#define FIRST_PATH_ELEMENT_OFFSET 2 /* Must be equal to at least 2. */
-
-#define ILLEGAL_PATH_ELEMENT_OFFSET 1 /* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */
-#define MAX_FEB_SIZE 6 /* this MUST be MAX_HEIGHT + 1. See about FEB below */
-
-/* We need to keep track of who the ancestors of nodes are. When we
- perform a search we record which nodes were visited while
- descending the tree looking for the node we searched for. This list
- of nodes is called the path. This information is used while
- performing balancing. Note that this path information may become
- invalid, and this means we must check it when using it to see if it
- is still valid. You'll need to read search_by_key and the comments
- in it, especially about decrement_counters_in_path(), to understand
- this structure.
-
-Paths make the code so much harder to work with and debug.... An
-enormous number of bugs are due to them, and trying to write or modify
-code that uses them just makes my head hurt. They are based on an
-excessive effort to avoid disturbing the precious VFS code.:-( The
-gods only know how we are going to SMP the code that uses them.
-znodes are the way! */
+/*
+ * maximal height of a tree. don't change this without
+ * changing JOURNAL_PER_BALANCE_CNT
+ */
+#define MAX_HEIGHT 5
+
+/* Must be equals MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */
+#define EXTENDED_MAX_HEIGHT 7
+
+/* Must be equal to at least 2. */
+#define FIRST_PATH_ELEMENT_OFFSET 2
+
+/* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */
+#define ILLEGAL_PATH_ELEMENT_OFFSET 1
+
+/* this MUST be MAX_HEIGHT + 1. See about FEB below */
+#define MAX_FEB_SIZE 6
+
+/*
+ * We need to keep track of who the ancestors of nodes are. When we
+ * perform a search we record which nodes were visited while
+ * descending the tree looking for the node we searched for. This list
+ * of nodes is called the path. This information is used while
+ * performing balancing. Note that this path information may become
+ * invalid, and this means we must check it when using it to see if it
+ * is still valid. You'll need to read search_by_key and the comments
+ * in it, especially about decrement_counters_in_path(), to understand
+ * this structure.
+ *
+ * Paths make the code so much harder to work with and debug.... An
+ * enormous number of bugs are due to them, and trying to write or modify
+ * code that uses them just makes my head hurt. They are based on an
+ * excessive effort to avoid disturbing the precious VFS code.:-( The
+ * gods only know how we are going to SMP the code that uses them.
+ * znodes are the way!
+ */
#define PATH_READA 0x1 /* do read ahead */
#define PATH_READA_BACK 0x2 /* read backwards */
@@ -1897,7 +2125,8 @@ znodes are the way! */
struct treepath {
int path_length; /* Length of the array above. */
int reada;
- struct path_element path_elements[EXTENDED_MAX_HEIGHT]; /* Array of the path elements. */
+ /* Array of the path elements. */
+ struct path_element path_elements[EXTENDED_MAX_HEIGHT];
int pos_in_item;
};
@@ -1916,41 +2145,124 @@ struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,}
#define PATH_OFFSET_POSITION(path, n_offset) (PATH_OFFSET_PELEMENT(path, n_offset)->pe_position)
#define PATH_PLAST_BUFFER(path) (PATH_OFFSET_PBUFFER((path), (path)->path_length))
- /* you know, to the person who didn't
- write this the macro name does not
- at first suggest what it does.
- Maybe POSITION_FROM_PATH_END? Or
- maybe we should just focus on
- dumping paths... -Hans */
+
+/*
+ * you know, to the person who didn't write this the macro name does not
+ * at first suggest what it does. Maybe POSITION_FROM_PATH_END? Or
+ * maybe we should just focus on dumping paths... -Hans
+ */
#define PATH_LAST_POSITION(path) (PATH_OFFSET_POSITION((path), (path)->path_length))
-#define PATH_PITEM_HEAD(path) B_N_PITEM_HEAD(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path))
+/*
+ * in do_balance leaf has h == 0 in contrast with path structure,
+ * where root has level == 0. That is why we need these defines
+ */
+
+/* tb->S[h] */
+#define PATH_H_PBUFFER(path, h) \
+ PATH_OFFSET_PBUFFER(path, path->path_length - (h))
+
+/* tb->F[h] or tb->S[0]->b_parent */
+#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER(path, (h) + 1)
-/* in do_balance leaf has h == 0 in contrast with path structure,
- where root has level == 0. That is why we need these defines */
-#define PATH_H_PBUFFER(path, h) PATH_OFFSET_PBUFFER (path, path->path_length - (h)) /* tb->S[h] */
-#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER (path, (h) + 1) /* tb->F[h] or tb->S[0]->b_parent */
-#define PATH_H_POSITION(path, h) PATH_OFFSET_POSITION (path, path->path_length - (h))
-#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, h + 1) /* tb->S[h]->b_item_order */
+#define PATH_H_POSITION(path, h) \
+ PATH_OFFSET_POSITION(path, path->path_length - (h))
+
+/* tb->S[h]->b_item_order */
+#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, h + 1)
#define PATH_H_PATH_OFFSET(path, n_h) ((path)->path_length - (n_h))
+static inline void *reiserfs_node_data(const struct buffer_head *bh)
+{
+ return bh->b_data + sizeof(struct block_head);
+}
+
+/* get key from internal node */
+static inline struct reiserfs_key *internal_key(struct buffer_head *bh,
+ int item_num)
+{
+ struct reiserfs_key *key = reiserfs_node_data(bh);
+
+ return &key[item_num];
+}
+
+/* get the item header from leaf node */
+static inline struct item_head *item_head(const struct buffer_head *bh,
+ int item_num)
+{
+ struct item_head *ih = reiserfs_node_data(bh);
+
+ return &ih[item_num];
+}
+
+/* get the key from leaf node */
+static inline struct reiserfs_key *leaf_key(const struct buffer_head *bh,
+ int item_num)
+{
+ return &item_head(bh, item_num)->ih_key;
+}
+
+static inline void *ih_item_body(const struct buffer_head *bh,
+ const struct item_head *ih)
+{
+ return bh->b_data + ih_location(ih);
+}
+
+/* get item body from leaf node */
+static inline void *item_body(const struct buffer_head *bh, int item_num)
+{
+ return ih_item_body(bh, item_head(bh, item_num));
+}
+
+static inline struct item_head *tp_item_head(const struct treepath *path)
+{
+ return item_head(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path));
+}
+
+static inline void *tp_item_body(const struct treepath *path)
+{
+ return item_body(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION(path));
+}
+
#define get_last_bh(path) PATH_PLAST_BUFFER(path)
-#define get_ih(path) PATH_PITEM_HEAD(path)
#define get_item_pos(path) PATH_LAST_POSITION(path)
-#define get_item(path) ((void *)B_N_PITEM(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION (path)))
#define item_moved(ih,path) comp_items(ih, path)
#define path_changed(ih,path) comp_items (ih, path)
-/***************************************************************************/
-/* MISC */
-/***************************************************************************/
+/* array of the entry headers */
+ /* get item body */
+#define B_I_DEH(bh, ih) ((struct reiserfs_de_head *)(ih_item_body(bh, ih)))
+
+/*
+ * length of the directory entry in directory item. This define
+ * calculates length of i-th directory entry using directory entry
+ * locations from dir entry head. When it calculates length of 0-th
+ * directory entry, it uses length of whole item in place of entry
+ * location of the non-existent following entry in the calculation.
+ * See picture above.
+ */
+static inline int entry_length(const struct buffer_head *bh,
+ const struct item_head *ih, int pos_in_item)
+{
+ struct reiserfs_de_head *deh;
+
+ deh = B_I_DEH(bh, ih) + pos_in_item;
+ if (pos_in_item)
+ return deh_location(deh - 1) - deh_location(deh);
+
+ return ih_item_len(ih) - deh_location(deh);
+}
+
+/***************************************************************************
+ * MISC *
+ ***************************************************************************/
/* Size of pointer to the unformatted node. */
#define UNFM_P_SIZE (sizeof(unp_t))
#define UNFM_P_SHIFT 2
-// in in-core inode key is stored on le form
+/* in in-core inode key is stored on le form */
#define INODE_PKEY(inode) ((struct reiserfs_key *)(REISERFS_I(inode)->i_key))
#define MAX_UL_INT 0xffffffff
@@ -1958,8 +2270,6 @@ struct treepath var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,}
#define MAX_US_INT 0xffff
// reiserfs version 2 has max offset 60 bits. Version 1 - 32 bit offset
-#define U32_MAX (~(__u32)0)
-
static inline loff_t max_reiserfs_offset(struct inode *inode)
{
if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5)
@@ -1968,7 +2278,6 @@ static inline loff_t max_reiserfs_offset(struct inode *inode)
return (loff_t) ((~(__u64) 0) >> 4);
}
-/*#define MAX_KEY_UNIQUENESS MAX_UL_INT*/
#define MAX_KEY_OBJECTID MAX_UL_INT
#define MAX_B_NUM MAX_UL_INT
@@ -1977,9 +2286,12 @@ static inline loff_t max_reiserfs_offset(struct inode *inode)
/* the purpose is to detect overflow of an unsigned short */
#define REISERFS_LINK_MAX (MAX_US_INT - 1000)
-/* The following defines are used in reiserfs_insert_item and reiserfs_append_item */
-#define REISERFS_KERNEL_MEM 0 /* reiserfs kernel memory mode */
-#define REISERFS_USER_MEM 1 /* reiserfs user memory mode */
+/*
+ * The following defines are used in reiserfs_insert_item
+ * and reiserfs_append_item
+ */
+#define REISERFS_KERNEL_MEM 0 /* kernel memory mode */
+#define REISERFS_USER_MEM 1 /* user memory mode */
#define fs_generation(s) (REISERFS_SB(s)->s_generation_counter)
#define get_generation(s) atomic_read (&fs_generation(s))
@@ -1991,46 +2303,65 @@ static inline loff_t max_reiserfs_offset(struct inode *inode)
__fs_changed(gen, s); \
})
-/***************************************************************************/
-/* FIXATE NODES */
-/***************************************************************************/
+/***************************************************************************
+ * FIXATE NODES *
+ ***************************************************************************/
#define VI_TYPE_LEFT_MERGEABLE 1
#define VI_TYPE_RIGHT_MERGEABLE 2
-/* To make any changes in the tree we always first find node, that
- contains item to be changed/deleted or place to insert a new
- item. We call this node S. To do balancing we need to decide what
- we will shift to left/right neighbor, or to a new node, where new
- item will be etc. To make this analysis simpler we build virtual
- node. Virtual node is an array of items, that will replace items of
- node S. (For instance if we are going to delete an item, virtual
- node does not contain it). Virtual node keeps information about
- item sizes and types, mergeability of first and last items, sizes
- of all entries in directory item. We use this array of items when
- calculating what we can shift to neighbors and how many nodes we
- have to have if we do not any shiftings, if we shift to left/right
- neighbor or to both. */
+/*
+ * To make any changes in the tree we always first find node, that
+ * contains item to be changed/deleted or place to insert a new
+ * item. We call this node S. To do balancing we need to decide what
+ * we will shift to left/right neighbor, or to a new node, where new
+ * item will be etc. To make this analysis simpler we build virtual
+ * node. Virtual node is an array of items, that will replace items of
+ * node S. (For instance if we are going to delete an item, virtual
+ * node does not contain it). Virtual node keeps information about
+ * item sizes and types, mergeability of first and last items, sizes
+ * of all entries in directory item. We use this array of items when
+ * calculating what we can shift to neighbors and how many nodes we
+ * have to have if we do not any shiftings, if we shift to left/right
+ * neighbor or to both.
+ */
struct virtual_item {
- int vi_index; // index in the array of item operations
- unsigned short vi_type; // left/right mergeability
- unsigned short vi_item_len; /* length of item that it will have after balancing */
+ int vi_index; /* index in the array of item operations */
+ unsigned short vi_type; /* left/right mergeability */
+
+ /* length of item that it will have after balancing */
+ unsigned short vi_item_len;
+
struct item_head *vi_ih;
- const char *vi_item; // body of item (old or new)
- const void *vi_new_data; // 0 always but paste mode
- void *vi_uarea; // item specific area
+ const char *vi_item; /* body of item (old or new) */
+ const void *vi_new_data; /* 0 always but paste mode */
+ void *vi_uarea; /* item specific area */
};
struct virtual_node {
- char *vn_free_ptr; /* this is a pointer to the free space in the buffer */
+ /* this is a pointer to the free space in the buffer */
+ char *vn_free_ptr;
+
unsigned short vn_nr_item; /* number of items in virtual node */
- short vn_size; /* size of node , that node would have if it has unlimited size and no balancing is performed */
- short vn_mode; /* mode of balancing (paste, insert, delete, cut) */
+
+ /*
+ * size of node , that node would have if it has
+ * unlimited size and no balancing is performed
+ */
+ short vn_size;
+
+ /* mode of balancing (paste, insert, delete, cut) */
+ short vn_mode;
+
short vn_affected_item_num;
short vn_pos_in_item;
- struct item_head *vn_ins_ih; /* item header of inserted item, 0 for other modes */
+
+ /* item header of inserted item, 0 for other modes */
+ struct item_head *vn_ins_ih;
const void *vn_data;
- struct virtual_item *vn_vi; /* array of items (including a new one, excluding item to be deleted) */
+
+ /* array of items (including a new one, excluding item to be deleted) */
+ struct virtual_item *vn_vi;
};
/* used by directory items when creating virtual nodes */
@@ -2040,22 +2371,25 @@ struct direntry_uarea {
__u16 entry_sizes[1];
} __attribute__ ((__packed__));
-/***************************************************************************/
-/* TREE BALANCE */
-/***************************************************************************/
+/***************************************************************************
+ * TREE BALANCE *
+ ***************************************************************************/
-/* This temporary structure is used in tree balance algorithms, and
- constructed as we go to the extent that its various parts are
- needed. It contains arrays of nodes that can potentially be
- involved in the balancing of node S, and parameters that define how
- each of the nodes must be balanced. Note that in these algorithms
- for balancing the worst case is to need to balance the current node
- S and the left and right neighbors and all of their parents plus
- create a new node. We implement S1 balancing for the leaf nodes
- and S0 balancing for the internal nodes (S1 and S0 are defined in
- our papers.)*/
+/*
+ * This temporary structure is used in tree balance algorithms, and
+ * constructed as we go to the extent that its various parts are
+ * needed. It contains arrays of nodes that can potentially be
+ * involved in the balancing of node S, and parameters that define how
+ * each of the nodes must be balanced. Note that in these algorithms
+ * for balancing the worst case is to need to balance the current node
+ * S and the left and right neighbors and all of their parents plus
+ * create a new node. We implement S1 balancing for the leaf nodes
+ * and S0 balancing for the internal nodes (S1 and S0 are defined in
+ * our papers.)
+ */
-#define MAX_FREE_BLOCK 7 /* size of the array of buffers to free at end of do_balance */
+/* size of the array of buffers to free at end of do_balance */
+#define MAX_FREE_BLOCK 7
/* maximum number of FEB blocknrs on a single level */
#define MAX_AMOUNT_NEEDED 2
@@ -2067,64 +2401,144 @@ struct tree_balance {
struct super_block *tb_sb;
struct reiserfs_transaction_handle *transaction_handle;
struct treepath *tb_path;
- struct buffer_head *L[MAX_HEIGHT]; /* array of left neighbors of nodes in the path */
- struct buffer_head *R[MAX_HEIGHT]; /* array of right neighbors of nodes in the path */
- struct buffer_head *FL[MAX_HEIGHT]; /* array of fathers of the left neighbors */
- struct buffer_head *FR[MAX_HEIGHT]; /* array of fathers of the right neighbors */
- struct buffer_head *CFL[MAX_HEIGHT]; /* array of common parents of center node and its left neighbor */
- struct buffer_head *CFR[MAX_HEIGHT]; /* array of common parents of center node and its right neighbor */
-
- struct buffer_head *FEB[MAX_FEB_SIZE]; /* array of empty buffers. Number of buffers in array equals
- cur_blknum. */
+
+ /* array of left neighbors of nodes in the path */
+ struct buffer_head *L[MAX_HEIGHT];
+
+ /* array of right neighbors of nodes in the path */
+ struct buffer_head *R[MAX_HEIGHT];
+
+ /* array of fathers of the left neighbors */
+ struct buffer_head *FL[MAX_HEIGHT];
+
+ /* array of fathers of the right neighbors */
+ struct buffer_head *FR[MAX_HEIGHT];
+ /* array of common parents of center node and its left neighbor */
+ struct buffer_head *CFL[MAX_HEIGHT];
+
+ /* array of common parents of center node and its right neighbor */
+ struct buffer_head *CFR[MAX_HEIGHT];
+
+ /*
+ * array of empty buffers. Number of buffers in array equals
+ * cur_blknum.
+ */
+ struct buffer_head *FEB[MAX_FEB_SIZE];
struct buffer_head *used[MAX_FEB_SIZE];
struct buffer_head *thrown[MAX_FEB_SIZE];
- int lnum[MAX_HEIGHT]; /* array of number of items which must be
- shifted to the left in order to balance the
- current node; for leaves includes item that
- will be partially shifted; for internal
- nodes, it is the number of child pointers
- rather than items. It includes the new item
- being created. The code sometimes subtracts
- one to get the number of wholly shifted
- items for other purposes. */
- int rnum[MAX_HEIGHT]; /* substitute right for left in comment above */
- int lkey[MAX_HEIGHT]; /* array indexed by height h mapping the key delimiting L[h] and
- S[h] to its item number within the node CFL[h] */
- int rkey[MAX_HEIGHT]; /* substitute r for l in comment above */
- int insert_size[MAX_HEIGHT]; /* the number of bytes by we are trying to add or remove from
- S[h]. A negative value means removing. */
- int blknum[MAX_HEIGHT]; /* number of nodes that will replace node S[h] after
- balancing on the level h of the tree. If 0 then S is
- being deleted, if 1 then S is remaining and no new nodes
- are being created, if 2 or 3 then 1 or 2 new nodes is
- being created */
+
+ /*
+ * array of number of items which must be shifted to the left in
+ * order to balance the current node; for leaves includes item that
+ * will be partially shifted; for internal nodes, it is the number
+ * of child pointers rather than items. It includes the new item
+ * being created. The code sometimes subtracts one to get the
+ * number of wholly shifted items for other purposes.
+ */
+ int lnum[MAX_HEIGHT];
+
+ /* substitute right for left in comment above */
+ int rnum[MAX_HEIGHT];
+
+ /*
+ * array indexed by height h mapping the key delimiting L[h] and
+ * S[h] to its item number within the node CFL[h]
+ */
+ int lkey[MAX_HEIGHT];
+
+ /* substitute r for l in comment above */
+ int rkey[MAX_HEIGHT];
+
+ /*
+ * the number of bytes by we are trying to add or remove from
+ * S[h]. A negative value means removing.
+ */
+ int insert_size[MAX_HEIGHT];
+
+ /*
+ * number of nodes that will replace node S[h] after balancing
+ * on the level h of the tree. If 0 then S is being deleted,
+ * if 1 then S is remaining and no new nodes are being created,
+ * if 2 or 3 then 1 or 2 new nodes is being created
+ */
+ int blknum[MAX_HEIGHT];
/* fields that are used only for balancing leaves of the tree */
- int cur_blknum; /* number of empty blocks having been already allocated */
- int s0num; /* number of items that fall into left most node when S[0] splits */
- int s1num; /* number of items that fall into first new node when S[0] splits */
- int s2num; /* number of items that fall into second new node when S[0] splits */
- int lbytes; /* number of bytes which can flow to the left neighbor from the left */
- /* most liquid item that cannot be shifted from S[0] entirely */
- /* if -1 then nothing will be partially shifted */
- int rbytes; /* number of bytes which will flow to the right neighbor from the right */
- /* most liquid item that cannot be shifted from S[0] entirely */
- /* if -1 then nothing will be partially shifted */
- int s1bytes; /* number of bytes which flow to the first new node when S[0] splits */
- /* note: if S[0] splits into 3 nodes, then items do not need to be cut */
- int s2bytes;
- struct buffer_head *buf_to_free[MAX_FREE_BLOCK]; /* buffers which are to be freed after do_balance finishes by unfix_nodes */
- char *vn_buf; /* kmalloced memory. Used to create
- virtual node and keep map of
- dirtied bitmap blocks */
+
+ /* number of empty blocks having been already allocated */
+ int cur_blknum;
+
+ /* number of items that fall into left most node when S[0] splits */
+ int s0num;
+
+ /*
+ * number of bytes which can flow to the left neighbor from the left
+ * most liquid item that cannot be shifted from S[0] entirely
+ * if -1 then nothing will be partially shifted
+ */
+ int lbytes;
+
+ /*
+ * number of bytes which will flow to the right neighbor from the right
+ * most liquid item that cannot be shifted from S[0] entirely
+ * if -1 then nothing will be partially shifted
+ */
+ int rbytes;
+
+
+ /*
+ * index into the array of item headers in
+ * S[0] of the affected item
+ */
+ int item_pos;
+
+ /* new nodes allocated to hold what could not fit into S */
+ struct buffer_head *S_new[2];
+
+ /*
+ * number of items that will be placed into nodes in S_new
+ * when S[0] splits
+ */
+ int snum[2];
+
+ /*
+ * number of bytes which flow to nodes in S_new when S[0] splits
+ * note: if S[0] splits into 3 nodes, then items do not need to be cut
+ */
+ int sbytes[2];
+
+ int pos_in_item;
+ int zeroes_num;
+
+ /*
+ * buffers which are to be freed after do_balance finishes
+ * by unfix_nodes
+ */
+ struct buffer_head *buf_to_free[MAX_FREE_BLOCK];
+
+ /*
+ * kmalloced memory. Used to create virtual node and keep
+ * map of dirtied bitmap blocks
+ */
+ char *vn_buf;
+
int vn_buf_size; /* size of the vn_buf */
- struct virtual_node *tb_vn; /* VN starts after bitmap of bitmap blocks */
- int fs_gen; /* saved value of `reiserfs_generation' counter
- see FILESYSTEM_CHANGED() macro in reiserfs_fs.h */
+ /* VN starts after bitmap of bitmap blocks */
+ struct virtual_node *tb_vn;
+
+ /*
+ * saved value of `reiserfs_generation' counter see
+ * FILESYSTEM_CHANGED() macro in reiserfs_fs.h
+ */
+ int fs_gen;
+
#ifdef DISPLACE_NEW_PACKING_LOCALITIES
- struct in_core_key key; /* key pointer, to pass to block allocator or
- another low-level subsystem */
+ /*
+ * key pointer, to pass to block allocator or
+ * another low-level subsystem
+ */
+ struct in_core_key key;
#endif
};
@@ -2132,20 +2546,24 @@ struct tree_balance {
/* When inserting an item. */
#define M_INSERT 'i'
-/* When inserting into (directories only) or appending onto an already
- existent item. */
+/*
+ * When inserting into (directories only) or appending onto an already
+ * existent item.
+ */
#define M_PASTE 'p'
/* When deleting an item. */
#define M_DELETE 'd'
/* When truncating an item or removing an entry from a (directory) item. */
-#define M_CUT 'c'
+#define M_CUT 'c'
/* used when balancing on leaf level skipped (in reiserfsck) */
#define M_INTERNAL 'n'
-/* When further balancing is not needed, then do_balance does not need
- to be called. */
-#define M_SKIP_BALANCING 's'
+/*
+ * When further balancing is not needed, then do_balance does not need
+ * to be called.
+ */
+#define M_SKIP_BALANCING 's'
#define M_CONVERT 'v'
/* modes of leaf_move_items */
@@ -2158,8 +2576,10 @@ struct tree_balance {
#define FIRST_TO_LAST 0
#define LAST_TO_FIRST 1
-/* used in do_balance for passing parent of node information that has
- been gotten from tb struct */
+/*
+ * used in do_balance for passing parent of node information that has
+ * been gotten from tb struct
+ */
struct buffer_info {
struct tree_balance *tb;
struct buffer_head *bi_bh;
@@ -2177,20 +2597,24 @@ static inline struct super_block *sb_from_bi(struct buffer_info *bi)
return bi ? sb_from_tb(bi->tb) : NULL;
}
-/* there are 4 types of items: stat data, directory item, indirect, direct.
-+-------------------+------------+--------------+------------+
-| | k_offset | k_uniqueness | mergeable? |
-+-------------------+------------+--------------+------------+
-| stat data | 0 | 0 | no |
-+-------------------+------------+--------------+------------+
-| 1st directory item| DOT_OFFSET |DIRENTRY_UNIQUENESS| no |
-| non 1st directory | hash value | | yes |
-| item | | | |
-+-------------------+------------+--------------+------------+
-| indirect item | offset + 1 |TYPE_INDIRECT | if this is not the first indirect item of the object
-+-------------------+------------+--------------+------------+
-| direct item | offset + 1 |TYPE_DIRECT | if not this is not the first direct item of the object
-+-------------------+------------+--------------+------------+
+/*
+ * there are 4 types of items: stat data, directory item, indirect, direct.
+ * +-------------------+------------+--------------+------------+
+ * | | k_offset | k_uniqueness | mergeable? |
+ * +-------------------+------------+--------------+------------+
+ * | stat data | 0 | 0 | no |
+ * +-------------------+------------+--------------+------------+
+ * | 1st directory item| DOT_OFFSET | DIRENTRY_ .. | no |
+ * | non 1st directory | hash value | UNIQUENESS | yes |
+ * | item | | | |
+ * +-------------------+------------+--------------+------------+
+ * | indirect item | offset + 1 |TYPE_INDIRECT | [1] |
+ * +-------------------+------------+--------------+------------+
+ * | direct item | offset + 1 |TYPE_DIRECT | [2] |
+ * +-------------------+------------+--------------+------------+
+ *
+ * [1] if this is not the first indirect item of the object
+ * [2] if this is not the first direct item of the object
*/
struct item_operations {
@@ -2229,49 +2653,43 @@ extern struct item_operations *item_ops[TYPE_ANY + 1];
/* number of blocks pointed to by the indirect item */
#define I_UNFM_NUM(ih) (ih_item_len(ih) / UNFM_P_SIZE)
-/* the used space within the unformatted node corresponding to pos within the item pointed to by ih */
+/*
+ * the used space within the unformatted node corresponding
+ * to pos within the item pointed to by ih
+ */
#define I_POS_UNFM_SIZE(ih,pos,size) (((pos) == I_UNFM_NUM(ih) - 1 ) ? (size) - ih_free_space(ih) : (size))
-/* number of bytes contained by the direct item or the unformatted nodes the indirect item points to */
-
-/* get the item header */
-#define B_N_PITEM_HEAD(bh,item_num) ( (struct item_head * )((bh)->b_data + BLKH_SIZE) + (item_num) )
-
-/* get key */
-#define B_N_PDELIM_KEY(bh,item_num) ( (struct reiserfs_key * )((bh)->b_data + BLKH_SIZE) + (item_num) )
-
-/* get the key */
-#define B_N_PKEY(bh,item_num) ( &(B_N_PITEM_HEAD(bh,item_num)->ih_key) )
-
-/* get item body */
-#define B_N_PITEM(bh,item_num) ( (bh)->b_data + ih_location(B_N_PITEM_HEAD((bh),(item_num))))
-
-/* get the stat data by the buffer header and the item order */
-#define B_N_STAT_DATA(bh,nr) \
-( (struct stat_data *)((bh)->b_data + ih_location(B_N_PITEM_HEAD((bh),(nr))) ) )
+/*
+ * number of bytes contained by the direct item or the
+ * unformatted nodes the indirect item points to
+ */
- /* following defines use reiserfs buffer header and item header */
+/* following defines use reiserfs buffer header and item header */
/* get stat-data */
#define B_I_STAT_DATA(bh, ih) ( (struct stat_data * )((bh)->b_data + ih_location(ih)) )
-// this is 3976 for size==4096
+/* this is 3976 for size==4096 */
#define MAX_DIRECT_ITEM_LEN(size) ((size) - BLKH_SIZE - 2*IH_SIZE - SD_SIZE - UNFM_P_SIZE)
-/* indirect items consist of entries which contain blocknrs, pos
- indicates which entry, and B_I_POS_UNFM_POINTER resolves to the
- blocknr contained by the entry pos points to */
-#define B_I_POS_UNFM_POINTER(bh,ih,pos) le32_to_cpu(*(((unp_t *)B_I_PITEM(bh,ih)) + (pos)))
-#define PUT_B_I_POS_UNFM_POINTER(bh,ih,pos, val) do {*(((unp_t *)B_I_PITEM(bh,ih)) + (pos)) = cpu_to_le32(val); } while (0)
+/*
+ * indirect items consist of entries which contain blocknrs, pos
+ * indicates which entry, and B_I_POS_UNFM_POINTER resolves to the
+ * blocknr contained by the entry pos points to
+ */
+#define B_I_POS_UNFM_POINTER(bh, ih, pos) \
+ le32_to_cpu(*(((unp_t *)ih_item_body(bh, ih)) + (pos)))
+#define PUT_B_I_POS_UNFM_POINTER(bh, ih, pos, val) \
+ (*(((unp_t *)ih_item_body(bh, ih)) + (pos)) = cpu_to_le32(val))
struct reiserfs_iget_args {
__u32 objectid;
__u32 dirid;
};
-/***************************************************************************/
-/* FUNCTION DECLARATIONS */
-/***************************************************************************/
+/***************************************************************************
+ * FUNCTION DECLARATIONS *
+ ***************************************************************************/
#define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12)
@@ -2283,7 +2701,10 @@ struct reiserfs_iget_args {
/* first block written in a commit. */
struct reiserfs_journal_desc {
__le32 j_trans_id; /* id of commit */
- __le32 j_len; /* length of commit. len +1 is the commit block */
+
+ /* length of commit. len +1 is the commit block */
+ __le32 j_len;
+
__le32 j_mount_id; /* mount id of this trans */
__le32 j_realblock[1]; /* real locations for each block */
};
@@ -2310,22 +2731,35 @@ struct reiserfs_journal_commit {
#define set_commit_trans_id(c,val) do { (c)->j_trans_id = cpu_to_le32 (val); } while (0)
#define set_commit_trans_len(c,val) do { (c)->j_len = cpu_to_le32 (val); } while (0)
-/* this header block gets written whenever a transaction is considered fully flushed, and is more recent than the
-** last fully flushed transaction. fully flushed means all the log blocks and all the real blocks are on disk,
-** and this transaction does not need to be replayed.
-*/
+/*
+ * this header block gets written whenever a transaction is considered
+ * fully flushed, and is more recent than the last fully flushed transaction.
+ * fully flushed means all the log blocks and all the real blocks are on
+ * disk, and this transaction does not need to be replayed.
+ */
struct reiserfs_journal_header {
- __le32 j_last_flush_trans_id; /* id of last fully flushed transaction */
- __le32 j_first_unflushed_offset; /* offset in the log of where to start replay after a crash */
+ /* id of last fully flushed transaction */
+ __le32 j_last_flush_trans_id;
+
+ /* offset in the log of where to start replay after a crash */
+ __le32 j_first_unflushed_offset;
+
__le32 j_mount_id;
/* 12 */ struct journal_params jh_journal;
};
/* biggest tunable defines are right here */
#define JOURNAL_BLOCK_COUNT 8192 /* number of blocks in the journal */
-#define JOURNAL_TRANS_MAX_DEFAULT 1024 /* biggest possible single transaction, don't change for now (8/3/99) */
+
+/* biggest possible single transaction, don't change for now (8/3/99) */
+#define JOURNAL_TRANS_MAX_DEFAULT 1024
#define JOURNAL_TRANS_MIN_DEFAULT 256
-#define JOURNAL_MAX_BATCH_DEFAULT 900 /* max blocks to batch into one transaction, don't make this any bigger than 900 */
+
+/*
+ * max blocks to batch into one transaction,
+ * don't make this any bigger than 900
+ */
+#define JOURNAL_MAX_BATCH_DEFAULT 900
#define JOURNAL_MIN_RATIO 2
#define JOURNAL_MAX_COMMIT_AGE 30
#define JOURNAL_MAX_TRANS_AGE 30
@@ -2350,16 +2784,18 @@ struct reiserfs_journal_header {
#define REISERFS_QUOTA_DEL_BLOCKS(s) 0
#endif
-/* both of these can be as low as 1, or as high as you want. The min is the
-** number of 4k bitmap nodes preallocated on mount. New nodes are allocated
-** as needed, and released when transactions are committed. On release, if
-** the current number of nodes is > max, the node is freed, otherwise,
-** it is put on a free list for faster use later.
+/*
+ * both of these can be as low as 1, or as high as you want. The min is the
+ * number of 4k bitmap nodes preallocated on mount. New nodes are allocated
+ * as needed, and released when transactions are committed. On release, if
+ * the current number of nodes is > max, the node is freed, otherwise,
+ * it is put on a free list for faster use later.
*/
#define REISERFS_MIN_BITMAP_NODES 10
#define REISERFS_MAX_BITMAP_NODES 100
-#define JBH_HASH_SHIFT 13 /* these are based on journal hash size of 8192 */
+/* these are based on journal hash size of 8192 */
+#define JBH_HASH_SHIFT 13
#define JBH_HASH_MASK 8191
#define _jhashfn(sb,block) \
@@ -2367,7 +2803,7 @@ struct reiserfs_journal_header {
(((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12))))
#define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK])
-// We need these to make journal.c code more readable
+/* We need these to make journal.c code more readable */
#define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
@@ -2375,12 +2811,14 @@ struct reiserfs_journal_header {
enum reiserfs_bh_state_bits {
BH_JDirty = BH_PrivateStart, /* buffer is in current transaction */
BH_JDirty_wait,
- BH_JNew, /* disk block was taken off free list before
- * being in a finished transaction, or
- * written to disk. Can be reused immed. */
+ /*
+ * disk block was taken off free list before being in a
+ * finished transaction, or written to disk. Can be reused immed.
+ */
+ BH_JNew,
BH_JPrepared,
BH_JRestore_dirty,
- BH_JTest, // debugging only will go away
+ BH_JTest, /* debugging only will go away */
};
BUFFER_FNS(JDirty, journaled);
@@ -2396,27 +2834,36 @@ TAS_BUFFER_FNS(JRestore_dirty, journal_restore_dirty);
BUFFER_FNS(JTest, journal_test);
TAS_BUFFER_FNS(JTest, journal_test);
-/*
-** transaction handle which is passed around for all journal calls
-*/
+/* transaction handle which is passed around for all journal calls */
struct reiserfs_transaction_handle {
- struct super_block *t_super; /* super for this FS when journal_begin was
- called. saves calls to reiserfs_get_super
- also used by nested transactions to make
- sure they are nesting on the right FS
- _must_ be first in the handle
- */
+ /*
+ * super for this FS when journal_begin was called. saves calls to
+ * reiserfs_get_super also used by nested transactions to make
+ * sure they are nesting on the right FS _must_ be first
+ * in the handle
+ */
+ struct super_block *t_super;
+
int t_refcount;
int t_blocks_logged; /* number of blocks this writer has logged */
int t_blocks_allocated; /* number of blocks this writer allocated */
- unsigned int t_trans_id; /* sanity check, equals the current trans id */
+
+ /* sanity check, equals the current trans id */
+ unsigned int t_trans_id;
+
void *t_handle_save; /* save existing current->journal_info */
- unsigned displace_new_blocks:1; /* if new block allocation occurres, that block
- should be displaced from others */
+
+ /*
+ * if new block allocation occurres, that block
+ * should be displaced from others
+ */
+ unsigned displace_new_blocks:1;
+
struct list_head t_list;
};
-/* used to keep track of ordered and tail writes, attached to the buffer
+/*
+ * used to keep track of ordered and tail writes, attached to the buffer
* head through b_journal_head.
*/
struct reiserfs_jh {
@@ -2429,7 +2876,7 @@ void reiserfs_free_jh(struct buffer_head *bh);
int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh);
int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh);
int journal_mark_dirty(struct reiserfs_transaction_handle *,
- struct super_block *, struct buffer_head *bh);
+ struct buffer_head *bh);
static inline int reiserfs_file_data_log(struct inode *inode)
{
@@ -2479,10 +2926,8 @@ int journal_init(struct super_block *, const char *j_dev_name, int old_format,
int journal_release(struct reiserfs_transaction_handle *, struct super_block *);
int journal_release_error(struct reiserfs_transaction_handle *,
struct super_block *);
-int journal_end(struct reiserfs_transaction_handle *, struct super_block *,
- unsigned long);
-int journal_end_sync(struct reiserfs_transaction_handle *, struct super_block *,
- unsigned long);
+int journal_end(struct reiserfs_transaction_handle *);
+int journal_end_sync(struct reiserfs_transaction_handle *);
int journal_mark_freed(struct reiserfs_transaction_handle *,
struct super_block *, b_blocknr_t blocknr);
int journal_transaction_should_end(struct reiserfs_transaction_handle *, int);
@@ -2491,7 +2936,7 @@ int reiserfs_in_journal(struct super_block *sb, unsigned int bmap_nr,
int journal_begin(struct reiserfs_transaction_handle *,
struct super_block *sb, unsigned long);
int journal_join_abort(struct reiserfs_transaction_handle *,
- struct super_block *sb, unsigned long);
+ struct super_block *sb);
void reiserfs_abort_journal(struct super_block *sb, int errno);
void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...);
int reiserfs_allocate_list_bitmaps(struct super_block *s,
@@ -2513,20 +2958,18 @@ int B_IS_IN_TREE(const struct buffer_head *);
extern void copy_item_head(struct item_head *to,
const struct item_head *from);
-// first key is in cpu form, second - le
+/* first key is in cpu form, second - le */
extern int comp_short_keys(const struct reiserfs_key *le_key,
const struct cpu_key *cpu_key);
extern void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from);
-// both are in le form
+/* both are in le form */
extern int comp_le_keys(const struct reiserfs_key *,
const struct reiserfs_key *);
extern int comp_short_le_keys(const struct reiserfs_key *,
const struct reiserfs_key *);
-//
-// get key version from on disk key - kludge
-//
+/* * get key version from on disk key - kludge */
static inline int le_key_version(const struct reiserfs_key *key)
{
int type;
@@ -2603,12 +3046,12 @@ void padd_item(char *item, int total_length, int length);
/* inode.c */
/* args for the create parameter of reiserfs_get_block */
-#define GET_BLOCK_NO_CREATE 0 /* don't create new blocks or convert tails */
-#define GET_BLOCK_CREATE 1 /* add anything you need to find block */
-#define GET_BLOCK_NO_HOLE 2 /* return -ENOENT for file holes */
-#define GET_BLOCK_READ_DIRECT 4 /* read the tail if indirect item not found */
-#define GET_BLOCK_NO_IMUX 8 /* i_mutex is not held, don't preallocate */
-#define GET_BLOCK_NO_DANGLE 16 /* don't leave any transactions running */
+#define GET_BLOCK_NO_CREATE 0 /* don't create new blocks or convert tails */
+#define GET_BLOCK_CREATE 1 /* add anything you need to find block */
+#define GET_BLOCK_NO_HOLE 2 /* return -ENOENT for file holes */
+#define GET_BLOCK_READ_DIRECT 4 /* read the tail if indirect item not found */
+#define GET_BLOCK_NO_IMUX 8 /* i_mutex is not held, don't preallocate */
+#define GET_BLOCK_NO_DANGLE 16 /* don't leave any transactions running */
void reiserfs_read_locked_inode(struct inode *inode,
struct reiserfs_iget_args *args);
@@ -2807,25 +3250,49 @@ struct buffer_head *get_FEB(struct tree_balance *);
/* bitmap.c */
-/* structure contains hints for block allocator, and it is a container for
- * arguments, such as node, search path, transaction_handle, etc. */
+/*
+ * structure contains hints for block allocator, and it is a container for
+ * arguments, such as node, search path, transaction_handle, etc.
+ */
struct __reiserfs_blocknr_hint {
- struct inode *inode; /* inode passed to allocator, if we allocate unf. nodes */
+ /* inode passed to allocator, if we allocate unf. nodes */
+ struct inode *inode;
+
sector_t block; /* file offset, in blocks */
struct in_core_key key;
- struct treepath *path; /* search path, used by allocator to deternine search_start by
- * various ways */
- struct reiserfs_transaction_handle *th; /* transaction handle is needed to log super blocks and
- * bitmap blocks changes */
+
+ /*
+ * search path, used by allocator to deternine search_start by
+ * various ways
+ */
+ struct treepath *path;
+
+ /*
+ * transaction handle is needed to log super blocks
+ * and bitmap blocks changes
+ */
+ struct reiserfs_transaction_handle *th;
+
b_blocknr_t beg, end;
- b_blocknr_t search_start; /* a field used to transfer search start value (block number)
- * between different block allocator procedures
- * (determine_search_start() and others) */
- int prealloc_size; /* is set in determine_prealloc_size() function, used by underlayed
- * function that do actual allocation */
-
- unsigned formatted_node:1; /* the allocator uses different polices for getting disk space for
- * formatted/unformatted blocks with/without preallocation */
+
+ /*
+ * a field used to transfer search start value (block number)
+ * between different block allocator procedures
+ * (determine_search_start() and others)
+ */
+ b_blocknr_t search_start;
+
+ /*
+ * is set in determine_prealloc_size() function,
+ * used by underlayed function that do actual allocation
+ */
+ int prealloc_size;
+
+ /*
+ * the allocator uses different polices for getting disk
+ * space for formatted/unformatted blocks with/without preallocation
+ */
+ unsigned formatted_node:1;
unsigned preallocate:1;
};
@@ -2841,6 +3308,7 @@ void reiserfs_init_alloc_options(struct super_block *s);
*/
__le32 reiserfs_choose_packing(struct inode *dir);
+void show_alloc_options(struct seq_file *seq, struct super_block *s);
int reiserfs_init_bitmap_cache(struct super_block *sb);
void reiserfs_free_bitmap_cache(struct super_block *sb);
void reiserfs_cache_bitmap_metadata(struct super_block *sb, struct buffer_head *bh, struct reiserfs_bitmap_info *info);
@@ -2918,13 +3386,15 @@ __u32 r5_hash(const signed char *msg, int len);
#define reiserfs_test_le_bit test_bit_le
#define reiserfs_find_next_zero_le_bit find_next_zero_bit_le
-/* sometimes reiserfs_truncate may require to allocate few new blocks
- to perform indirect2direct conversion. People probably used to
- think, that truncate should work without problems on a filesystem
- without free disk space. They may complain that they can not
- truncate due to lack of free disk space. This spare space allows us
- to not worry about it. 500 is probably too much, but it should be
- absolutely safe */
+/*
+ * sometimes reiserfs_truncate may require to allocate few new blocks
+ * to perform indirect2direct conversion. People probably used to
+ * think, that truncate should work without problems on a filesystem
+ * without free disk space. They may complain that they can not
+ * truncate due to lack of free disk space. This spare space allows us
+ * to not worry about it. 500 is probably too much, but it should be
+ * absolutely safe
+ */
#define SPARE_SPACE 500
/* prototypes from ioctl.c */
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index a4ef5cd606e..6052d323bc9 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -53,8 +53,10 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
}
bforget(bh);
- /* old disk layout detection; those partitions can be mounted, but
- * cannot be resized */
+ /*
+ * old disk layout detection; those partitions can be mounted, but
+ * cannot be resized
+ */
if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size
!= REISERFS_DISK_OFFSET_IN_BYTES) {
printk
@@ -86,12 +88,14 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
("reiserfs_resize: unable to allocate memory for journal bitmaps\n");
return -ENOMEM;
}
- /* the new journal bitmaps are zero filled, now we copy in the bitmap
- ** node pointers from the old journal bitmap structs, and then
- ** transfer the new data structures into the journal struct.
- **
- ** using the copy_size var below allows this code to work for
- ** both shrinking and expanding the FS.
+ /*
+ * the new journal bitmaps are zero filled, now we copy i
+ * the bitmap node pointers from the old journal bitmap
+ * structs, and then transfer the new data structures
+ * into the journal struct.
+ *
+ * using the copy_size var below allows this code to work for
+ * both shrinking and expanding the FS.
*/
copy_size = bmap_nr_new < bmap_nr ? bmap_nr_new : bmap_nr;
copy_size =
@@ -101,36 +105,45 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
jb = SB_JOURNAL(s)->j_list_bitmap + i;
memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size);
- /* just in case vfree schedules on us, copy the new
- ** pointer into the journal struct before freeing the
- ** old one
+ /*
+ * just in case vfree schedules on us, copy the new
+ * pointer into the journal struct before freeing the
+ * old one
*/
node_tmp = jb->bitmaps;
jb->bitmaps = jbitmap[i].bitmaps;
vfree(node_tmp);
}
- /* allocate additional bitmap blocks, reallocate array of bitmap
- * block pointers */
+ /*
+ * allocate additional bitmap blocks, reallocate
+ * array of bitmap block pointers
+ */
bitmap =
vzalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new);
if (!bitmap) {
- /* Journal bitmaps are still supersized, but the memory isn't
- * leaked, so I guess it's ok */
+ /*
+ * Journal bitmaps are still supersized, but the
+ * memory isn't leaked, so I guess it's ok
+ */
printk("reiserfs_resize: unable to allocate memory.\n");
return -ENOMEM;
}
for (i = 0; i < bmap_nr; i++)
bitmap[i] = old_bitmap[i];
- /* This doesn't go through the journal, but it doesn't have to.
- * The changes are still atomic: We're synced up when the journal
- * transaction begins, and the new bitmaps don't matter if the
- * transaction fails. */
+ /*
+ * This doesn't go through the journal, but it doesn't have to.
+ * The changes are still atomic: We're synced up when the
+ * journal transaction begins, and the new bitmaps don't
+ * matter if the transaction fails.
+ */
for (i = bmap_nr; i < bmap_nr_new; i++) {
int depth;
- /* don't use read_bitmap_block since it will cache
- * the uninitialized bitmap */
+ /*
+ * don't use read_bitmap_block since it will cache
+ * the uninitialized bitmap
+ */
depth = reiserfs_write_unlock_nested(s);
bh = sb_bread(s, i * s->s_blocksize * 8);
reiserfs_write_lock_nested(s, depth);
@@ -147,7 +160,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
depth = reiserfs_write_unlock_nested(s);
sync_dirty_buffer(bh);
reiserfs_write_lock_nested(s, depth);
- // update bitmap_info stuff
+ /* update bitmap_info stuff */
bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
brelse(bh);
}
@@ -156,9 +169,11 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
vfree(old_bitmap);
}
- /* begin transaction, if there was an error, it's fine. Yes, we have
+ /*
+ * begin transaction, if there was an error, it's fine. Yes, we have
* incorrect bitmaps now, but none of it is ever going to touch the
- * disk anyway. */
+ * disk anyway.
+ */
err = journal_begin(&th, s, 10);
if (err)
return err;
@@ -167,7 +182,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
info = SB_AP_BITMAP(s) + bmap_nr - 1;
bh = reiserfs_read_bitmap_block(s, bmap_nr - 1);
if (!bh) {
- int jerr = journal_end(&th, s, 10);
+ int jerr = journal_end(&th);
if (jerr)
return jerr;
return -EIO;
@@ -178,14 +193,14 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
reiserfs_clear_le_bit(i, bh->b_data);
info->free_count += s->s_blocksize * 8 - block_r;
- journal_mark_dirty(&th, s, bh);
+ journal_mark_dirty(&th, bh);
brelse(bh);
/* Correct new last bitmap block - It may not be full */
info = SB_AP_BITMAP(s) + bmap_nr_new - 1;
bh = reiserfs_read_bitmap_block(s, bmap_nr_new - 1);
if (!bh) {
- int jerr = journal_end(&th, s, 10);
+ int jerr = journal_end(&th);
if (jerr)
return jerr;
return -EIO;
@@ -194,7 +209,7 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
reiserfs_prepare_for_journal(s, bh, 1);
for (i = block_r_new; i < s->s_blocksize * 8; i++)
reiserfs_set_le_bit(i, bh->b_data);
- journal_mark_dirty(&th, s, bh);
+ journal_mark_dirty(&th, bh);
brelse(bh);
info->free_count -= s->s_blocksize * 8 - block_r_new;
@@ -207,8 +222,8 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
PUT_SB_BLOCK_COUNT(s, block_count_new);
PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new);
- journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
+ journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
SB_JOURNAL(s)->j_must_wait = 1;
- return journal_end(&th, s, 10);
+ return journal_end(&th);
}
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index b14706a05d5..dd44468edc2 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -8,46 +8,6 @@
* Pereslavl-Zalessky Russia
*/
-/*
- * This file contains functions dealing with S+tree
- *
- * B_IS_IN_TREE
- * copy_item_head
- * comp_short_keys
- * comp_keys
- * comp_short_le_keys
- * le_key2cpu_key
- * comp_le_keys
- * bin_search
- * get_lkey
- * get_rkey
- * key_in_buffer
- * decrement_bcount
- * reiserfs_check_path
- * pathrelse_and_restore
- * pathrelse
- * search_by_key_reada
- * search_by_key
- * search_for_position_by_key
- * comp_items
- * prepare_for_direct_item
- * prepare_for_direntry_item
- * prepare_for_delete_or_cut
- * calc_deleted_bytes_number
- * init_tb_struct
- * padd_item
- * reiserfs_delete_item
- * reiserfs_delete_solid_item
- * reiserfs_delete_object
- * maybe_indirect_to_direct
- * indirect_to_direct_roll_back
- * reiserfs_cut_from_item
- * truncate_directory
- * reiserfs_do_truncate
- * reiserfs_paste_into_item
- * reiserfs_insert_item
- */
-
#include <linux/time.h>
#include <linux/string.h>
#include <linux/pagemap.h>
@@ -65,21 +25,21 @@ inline int B_IS_IN_TREE(const struct buffer_head *bh)
return (B_LEVEL(bh) != FREE_LEVEL);
}
-//
-// to gets item head in le form
-//
+/* to get item head in le form */
inline void copy_item_head(struct item_head *to,
const struct item_head *from)
{
memcpy(to, from, IH_SIZE);
}
-/* k1 is pointer to on-disk structure which is stored in little-endian
- form. k2 is pointer to cpu variable. For key of items of the same
- object this returns 0.
- Returns: -1 if key1 < key2
- 0 if key1 == key2
- 1 if key1 > key2 */
+/*
+ * k1 is pointer to on-disk structure which is stored in little-endian
+ * form. k2 is pointer to cpu variable. For key of items of the same
+ * object this returns 0.
+ * Returns: -1 if key1 < key2
+ * 0 if key1 == key2
+ * 1 if key1 > key2
+ */
inline int comp_short_keys(const struct reiserfs_key *le_key,
const struct cpu_key *cpu_key)
{
@@ -97,11 +57,13 @@ inline int comp_short_keys(const struct reiserfs_key *le_key,
return 0;
}
-/* k1 is pointer to on-disk structure which is stored in little-endian
- form. k2 is pointer to cpu variable.
- Compare keys using all 4 key fields.
- Returns: -1 if key1 < key2 0
- if key1 = key2 1 if key1 > key2 */
+/*
+ * k1 is pointer to on-disk structure which is stored in little-endian
+ * form. k2 is pointer to cpu variable.
+ * Compare keys using all 4 key fields.
+ * Returns: -1 if key1 < key2 0
+ * if key1 = key2 1 if key1 > key2
+ */
static inline int comp_keys(const struct reiserfs_key *le_key,
const struct cpu_key *cpu_key)
{
@@ -155,15 +117,17 @@ inline void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from)
to->on_disk_key.k_dir_id = le32_to_cpu(from->k_dir_id);
to->on_disk_key.k_objectid = le32_to_cpu(from->k_objectid);
- // find out version of the key
+ /* find out version of the key */
version = le_key_version(from);
to->version = version;
to->on_disk_key.k_offset = le_key_k_offset(version, from);
to->on_disk_key.k_type = le_key_k_type(version, from);
}
-// this does not say which one is bigger, it only returns 1 if keys
-// are not equal, 0 otherwise
+/*
+ * this does not say which one is bigger, it only returns 1 if keys
+ * are not equal, 0 otherwise
+ */
inline int comp_le_keys(const struct reiserfs_key *k1,
const struct reiserfs_key *k2)
{
@@ -177,24 +141,27 @@ inline int comp_le_keys(const struct reiserfs_key *k1,
* *pos = number of the searched element if found, else the *
* number of the first element that is larger than key. *
**************************************************************************/
-/* For those not familiar with binary search: lbound is the leftmost item that it
- could be, rbound the rightmost item that it could be. We examine the item
- halfway between lbound and rbound, and that tells us either that we can increase
- lbound, or decrease rbound, or that we have found it, or if lbound <= rbound that
- there are no possible items, and we have not found it. With each examination we
- cut the number of possible items it could be by one more than half rounded down,
- or we find it. */
+/*
+ * For those not familiar with binary search: lbound is the leftmost item
+ * that it could be, rbound the rightmost item that it could be. We examine
+ * the item halfway between lbound and rbound, and that tells us either
+ * that we can increase lbound, or decrease rbound, or that we have found it,
+ * or if lbound <= rbound that there are no possible items, and we have not
+ * found it. With each examination we cut the number of possible items it
+ * could be by one more than half rounded down, or we find it.
+ */
static inline int bin_search(const void *key, /* Key to search for. */
const void *base, /* First item in the array. */
int num, /* Number of items in the array. */
- int width, /* Item size in the array.
- searched. Lest the reader be
- confused, note that this is crafted
- as a general function, and when it
- is applied specifically to the array
- of item headers in a node, width
- is actually the item header size not
- the item size. */
+ /*
+ * Item size in the array. searched. Lest the
+ * reader be confused, note that this is crafted
+ * as a general function, and when it is applied
+ * specifically to the array of item headers in a
+ * node, width is actually the item header size
+ * not the item size.
+ */
+ int width,
int *pos /* Number of the searched for element. */
)
{
@@ -216,8 +183,10 @@ static inline int bin_search(const void *key, /* Key to search for. */
return ITEM_FOUND; /* Key found in the array. */
}
- /* bin_search did not find given key, it returns position of key,
- that is minimal and greater than the given one. */
+ /*
+ * bin_search did not find given key, it returns position of key,
+ * that is minimal and greater than the given one.
+ */
*pos = lbound;
return ITEM_NOT_FOUND;
}
@@ -228,16 +197,20 @@ const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} };
/* Maximal possible key. It is never in the tree. */
static const struct reiserfs_key MAX_KEY = {
- __constant_cpu_to_le32(0xffffffff),
- __constant_cpu_to_le32(0xffffffff),
- {{__constant_cpu_to_le32(0xffffffff),
- __constant_cpu_to_le32(0xffffffff)},}
+ cpu_to_le32(0xffffffff),
+ cpu_to_le32(0xffffffff),
+ {{cpu_to_le32(0xffffffff),
+ cpu_to_le32(0xffffffff)},}
};
-/* Get delimiting key of the buffer by looking for it in the buffers in the path, starting from the bottom
- of the path, and going upwards. We must check the path's validity at each step. If the key is not in
- the path, there is no delimiting key in the tree (buffer is first or last buffer in tree), and in this
- case we return a special key, either MIN_KEY or MAX_KEY. */
+/*
+ * Get delimiting key of the buffer by looking for it in the buffers in the
+ * path, starting from the bottom of the path, and going upwards. We must
+ * check the path's validity at each step. If the key is not in the path,
+ * there is no delimiting key in the tree (buffer is first or last buffer
+ * in tree), and in this case we return a special key, either MIN_KEY or
+ * MAX_KEY.
+ */
static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_path,
const struct super_block *sb)
{
@@ -270,9 +243,12 @@ static inline const struct reiserfs_key *get_lkey(const struct treepath *chk_pat
PATH_OFFSET_PBUFFER(chk_path,
path_offset + 1)->b_blocknr)
return &MAX_KEY;
- /* Return delimiting key if position in the parent is not equal to zero. */
+ /*
+ * Return delimiting key if position in the parent
+ * is not equal to zero.
+ */
if (position)
- return B_N_PDELIM_KEY(parent, position - 1);
+ return internal_key(parent, position - 1);
}
/* Return MIN_KEY if we are in the root of the buffer tree. */
if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
@@ -308,15 +284,23 @@ inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path,
path_offset)) >
B_NR_ITEMS(parent))
return &MIN_KEY;
- /* Check whether parent at the path really points to the child. */
+ /*
+ * Check whether parent at the path really points
+ * to the child.
+ */
if (B_N_CHILD_NUM(parent, position) !=
PATH_OFFSET_PBUFFER(chk_path,
path_offset + 1)->b_blocknr)
return &MIN_KEY;
- /* Return delimiting key if position in the parent is not the last one. */
+
+ /*
+ * Return delimiting key if position in the parent
+ * is not the last one.
+ */
if (position != B_NR_ITEMS(parent))
- return B_N_PDELIM_KEY(parent, position);
+ return internal_key(parent, position);
}
+
/* Return MAX_KEY if we are in the root of the buffer tree. */
if (PATH_OFFSET_PBUFFER(chk_path, FIRST_PATH_ELEMENT_OFFSET)->
b_blocknr == SB_ROOT_BLOCK(sb))
@@ -324,13 +308,20 @@ inline const struct reiserfs_key *get_rkey(const struct treepath *chk_path,
return &MIN_KEY;
}
-/* Check whether a key is contained in the tree rooted from a buffer at a path. */
-/* This works by looking at the left and right delimiting keys for the buffer in the last path_element in
- the path. These delimiting keys are stored at least one level above that buffer in the tree. If the
- buffer is the first or last node in the tree order then one of the delimiting keys may be absent, and in
- this case get_lkey and get_rkey return a special key which is MIN_KEY or MAX_KEY. */
-static inline int key_in_buffer(struct treepath *chk_path, /* Path which should be checked. */
- const struct cpu_key *key, /* Key which should be checked. */
+/*
+ * Check whether a key is contained in the tree rooted from a buffer at a path.
+ * This works by looking at the left and right delimiting keys for the buffer
+ * in the last path_element in the path. These delimiting keys are stored
+ * at least one level above that buffer in the tree. If the buffer is the
+ * first or last node in the tree order then one of the delimiting keys may
+ * be absent, and in this case get_lkey and get_rkey return a special key
+ * which is MIN_KEY or MAX_KEY.
+ */
+static inline int key_in_buffer(
+ /* Path which should be checked. */
+ struct treepath *chk_path,
+ /* Key which should be checked. */
+ const struct cpu_key *key,
struct super_block *sb
)
{
@@ -359,9 +350,11 @@ int reiserfs_check_path(struct treepath *p)
return 0;
}
-/* Drop the reference to each buffer in a path and restore
+/*
+ * Drop the reference to each buffer in a path and restore
* dirty bits clean when preparing the buffer for the log.
- * This version should only be called from fix_nodes() */
+ * This version should only be called from fix_nodes()
+ */
void pathrelse_and_restore(struct super_block *sb,
struct treepath *search_path)
{
@@ -418,14 +411,17 @@ static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
}
ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1;
used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih));
+
+ /* free space does not match to calculated amount of use space */
if (used_space != blocksize - blkh_free_space(blkh)) {
- /* free space does not match to calculated amount of use space */
reiserfs_warning(NULL, "reiserfs-5082",
"free space seems wrong: %z", bh);
return 0;
}
- // FIXME: it is_leaf will hit performance too much - we may have
- // return 1 here
+ /*
+ * FIXME: it is_leaf will hit performance too much - we may have
+ * return 1 here
+ */
/* check tables of item heads */
ih = (struct item_head *)(buf + BLKH_SIZE);
@@ -460,7 +456,7 @@ static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
prev_location = ih_location(ih);
}
- // one may imagine much more checks
+ /* one may imagine many more checks */
return 1;
}
@@ -481,8 +477,8 @@ static int is_internal(char *buf, int blocksize, struct buffer_head *bh)
}
nr = blkh_nr_item(blkh);
+ /* for internal which is not root we might check min number of keys */
if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) {
- /* for internal which is not root we might check min number of keys */
reiserfs_warning(NULL, "reiserfs-5088",
"number of key seems wrong: %z", bh);
return 0;
@@ -494,12 +490,15 @@ static int is_internal(char *buf, int blocksize, struct buffer_head *bh)
"free space seems wrong: %z", bh);
return 0;
}
- // one may imagine much more checks
+
+ /* one may imagine many more checks */
return 1;
}
-// make sure that bh contains formatted node of reiserfs tree of
-// 'level'-th level
+/*
+ * make sure that bh contains formatted node of reiserfs tree of
+ * 'level'-th level
+ */
static int is_tree_node(struct buffer_head *bh, int level)
{
if (B_LEVEL(bh) != level) {
@@ -546,7 +545,8 @@ static int search_by_key_reada(struct super_block *s,
for (j = 0; j < i; j++) {
/*
* note, this needs attention if we are getting rid of the BKL
- * you have to make sure the prepared bit isn't set on this buffer
+ * you have to make sure the prepared bit isn't set on this
+ * buffer
*/
if (!buffer_uptodate(bh[j])) {
if (depth == -1)
@@ -558,39 +558,34 @@ static int search_by_key_reada(struct super_block *s,
return depth;
}
-/**************************************************************************
- * Algorithm SearchByKey *
- * look for item in the Disk S+Tree by its key *
- * Input: sb - super block *
- * key - pointer to the key to search *
- * Output: ITEM_FOUND, ITEM_NOT_FOUND or IO_ERROR *
- * search_path - path from the root to the needed leaf *
- **************************************************************************/
-
-/* This function fills up the path from the root to the leaf as it
- descends the tree looking for the key. It uses reiserfs_bread to
- try to find buffers in the cache given their block number. If it
- does not find them in the cache it reads them from disk. For each
- node search_by_key finds using reiserfs_bread it then uses
- bin_search to look through that node. bin_search will find the
- position of the block_number of the next node if it is looking
- through an internal node. If it is looking through a leaf node
- bin_search will find the position of the item which has key either
- equal to given key, or which is the maximal key less than the given
- key. search_by_key returns a path that must be checked for the
- correctness of the top of the path but need not be checked for the
- correctness of the bottom of the path */
-/* The function is NOT SCHEDULE-SAFE! */
-int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to search. */
- struct treepath *search_path,/* This structure was
- allocated and initialized
- by the calling
- function. It is filled up
- by this function. */
- int stop_level /* How far down the tree to search. To
- stop at leaf level - set to
- DISK_LEAF_NODE_LEVEL */
- )
+/*
+ * This function fills up the path from the root to the leaf as it
+ * descends the tree looking for the key. It uses reiserfs_bread to
+ * try to find buffers in the cache given their block number. If it
+ * does not find them in the cache it reads them from disk. For each
+ * node search_by_key finds using reiserfs_bread it then uses
+ * bin_search to look through that node. bin_search will find the
+ * position of the block_number of the next node if it is looking
+ * through an internal node. If it is looking through a leaf node
+ * bin_search will find the position of the item which has key either
+ * equal to given key, or which is the maximal key less than the given
+ * key. search_by_key returns a path that must be checked for the
+ * correctness of the top of the path but need not be checked for the
+ * correctness of the bottom of the path
+ */
+/*
+ * search_by_key - search for key (and item) in stree
+ * @sb: superblock
+ * @key: pointer to key to search for
+ * @search_path: Allocated and initialized struct treepath; Returned filled
+ * on success.
+ * @stop_level: How far down the tree to search, Use DISK_LEAF_NODE_LEVEL to
+ * stop at leaf level.
+ *
+ * The function is NOT SCHEDULE-SAFE!
+ */
+int search_by_key(struct super_block *sb, const struct cpu_key *key,
+ struct treepath *search_path, int stop_level)
{
b_blocknr_t block_number;
int expected_level;
@@ -609,17 +604,22 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
PROC_INFO_INC(sb, search_by_key);
- /* As we add each node to a path we increase its count. This means that
- we must be careful to release all nodes in a path before we either
- discard the path struct or re-use the path struct, as we do here. */
+ /*
+ * As we add each node to a path we increase its count. This means
+ * that we must be careful to release all nodes in a path before we
+ * either discard the path struct or re-use the path struct, as we
+ * do here.
+ */
pathrelse(search_path);
right_neighbor_of_leaf_node = 0;
- /* With each iteration of this loop we search through the items in the
- current node, and calculate the next current node(next path element)
- for the next iteration of this loop.. */
+ /*
+ * With each iteration of this loop we search through the items in the
+ * current node, and calculate the next current node(next path element)
+ * for the next iteration of this loop..
+ */
block_number = SB_ROOT_BLOCK(sb);
expected_level = -1;
while (1) {
@@ -639,8 +639,10 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
++search_path->path_length);
fs_gen = get_generation(sb);
- /* Read the next tree node, and set the last element in the path to
- have a pointer to it. */
+ /*
+ * Read the next tree node, and set the last element
+ * in the path to have a pointer to it.
+ */
if ((bh = last_element->pe_buffer =
sb_getblk(sb, block_number))) {
@@ -666,7 +668,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
if (!buffer_uptodate(bh))
goto io_error;
} else {
- io_error:
+io_error:
search_path->path_length--;
pathrelse(search_path);
return IO_ERROR;
@@ -676,9 +678,12 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
expected_level = SB_TREE_HEIGHT(sb);
expected_level--;
- /* It is possible that schedule occurred. We must check whether the key
- to search is still in the tree rooted from the current buffer. If
- not then repeat search from the root. */
+ /*
+ * It is possible that schedule occurred. We must check
+ * whether the key to search is still in the tree rooted
+ * from the current buffer. If not then repeat search
+ * from the root.
+ */
if (fs_changed(fs_gen, sb) &&
(!B_IS_IN_TREE(bh) ||
B_LEVEL(bh) != expected_level ||
@@ -689,8 +694,10 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
sbk_restarted[expected_level - 1]);
pathrelse(search_path);
- /* Get the root block number so that we can repeat the search
- starting from the root. */
+ /*
+ * Get the root block number so that we can
+ * repeat the search starting from the root.
+ */
block_number = SB_ROOT_BLOCK(sb);
expected_level = -1;
right_neighbor_of_leaf_node = 0;
@@ -699,9 +706,11 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
continue;
}
- /* only check that the key is in the buffer if key is not
- equal to the MAX_KEY. Latter case is only possible in
- "finish_unfinished()" processing during mount. */
+ /*
+ * only check that the key is in the buffer if key is not
+ * equal to the MAX_KEY. Latter case is only possible in
+ * "finish_unfinished()" processing during mount.
+ */
RFALSE(comp_keys(&MAX_KEY, key) &&
!key_in_buffer(search_path, key, sb),
"PAP-5130: key is not in the buffer");
@@ -713,8 +722,10 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
}
#endif
- // make sure, that the node contents look like a node of
- // certain level
+ /*
+ * make sure, that the node contents look like a node of
+ * certain level
+ */
if (!is_tree_node(bh, expected_level)) {
reiserfs_error(sb, "vs-5150",
"invalid format found in block %ld. "
@@ -732,32 +743,42 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
"vs-5152: tree level (%d) is less than stop level (%d)",
node_level, stop_level);
- retval = bin_search(key, B_N_PITEM_HEAD(bh, 0),
+ retval = bin_search(key, item_head(bh, 0),
B_NR_ITEMS(bh),
(node_level ==
DISK_LEAF_NODE_LEVEL) ? IH_SIZE :
KEY_SIZE,
- &(last_element->pe_position));
+ &last_element->pe_position);
if (node_level == stop_level) {
return retval;
}
/* we are not in the stop level */
+ /*
+ * item has been found, so we choose the pointer which
+ * is to the right of the found one
+ */
if (retval == ITEM_FOUND)
- /* item has been found, so we choose the pointer which is to the right of the found one */
last_element->pe_position++;
- /* if item was not found we choose the position which is to
- the left of the found item. This requires no code,
- bin_search did it already. */
+ /*
+ * if item was not found we choose the position which is to
+ * the left of the found item. This requires no code,
+ * bin_search did it already.
+ */
- /* So we have chosen a position in the current node which is
- an internal node. Now we calculate child block number by
- position in the node. */
+ /*
+ * So we have chosen a position in the current node which is
+ * an internal node. Now we calculate child block number by
+ * position in the node.
+ */
block_number =
B_N_CHILD_NUM(bh, last_element->pe_position);
- /* if we are going to read leaf nodes, try for read ahead as well */
+ /*
+ * if we are going to read leaf nodes, try for read
+ * ahead as well
+ */
if ((search_path->reada & PATH_READA) &&
node_level == DISK_LEAF_NODE_LEVEL + 1) {
int pos = last_element->pe_position;
@@ -779,7 +800,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
/*
* check to make sure we're in the same object
*/
- le_key = B_N_PDELIM_KEY(bh, pos);
+ le_key = internal_key(bh, pos);
if (le32_to_cpu(le_key->k_objectid) !=
key->on_disk_key.k_objectid) {
break;
@@ -789,26 +810,28 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
}
}
-/* Form the path to an item and position in this item which contains
- file byte defined by key. If there is no such item
- corresponding to the key, we point the path to the item with
- maximal key less than key, and *pos_in_item is set to one
- past the last entry/byte in the item. If searching for entry in a
- directory item, and it is not found, *pos_in_item is set to one
- entry more than the entry with maximal key which is less than the
- sought key.
-
- Note that if there is no entry in this same node which is one more,
- then we point to an imaginary entry. for direct items, the
- position is in units of bytes, for indirect items the position is
- in units of blocknr entries, for directory items the position is in
- units of directory entries. */
-
+/*
+ * Form the path to an item and position in this item which contains
+ * file byte defined by key. If there is no such item
+ * corresponding to the key, we point the path to the item with
+ * maximal key less than key, and *pos_in_item is set to one
+ * past the last entry/byte in the item. If searching for entry in a
+ * directory item, and it is not found, *pos_in_item is set to one
+ * entry more than the entry with maximal key which is less than the
+ * sought key.
+ *
+ * Note that if there is no entry in this same node which is one more,
+ * then we point to an imaginary entry. for direct items, the
+ * position is in units of bytes, for indirect items the position is
+ * in units of blocknr entries, for directory items the position is in
+ * units of directory entries.
+ */
/* The function is NOT SCHEDULE-SAFE! */
-int search_for_position_by_key(struct super_block *sb, /* Pointer to the super block. */
- const struct cpu_key *p_cpu_key, /* Key to search (cpu variable) */
- struct treepath *search_path /* Filled up by this function. */
- )
+int search_for_position_by_key(struct super_block *sb,
+ /* Key to search (cpu variable) */
+ const struct cpu_key *p_cpu_key,
+ /* Filled up by this function. */
+ struct treepath *search_path)
{
struct item_head *p_le_ih; /* pointer to on-disk structure */
int blk_size;
@@ -830,7 +853,7 @@ int search_for_position_by_key(struct super_block *sb, /* Pointer to the super b
if (retval == ITEM_FOUND) {
RFALSE(!ih_item_len
- (B_N_PITEM_HEAD
+ (item_head
(PATH_PLAST_BUFFER(search_path),
PATH_LAST_POSITION(search_path))),
"PAP-5165: item length equals zero");
@@ -844,14 +867,14 @@ int search_for_position_by_key(struct super_block *sb, /* Pointer to the super b
/* Item is not found. Set path to the previous item. */
p_le_ih =
- B_N_PITEM_HEAD(PATH_PLAST_BUFFER(search_path),
+ item_head(PATH_PLAST_BUFFER(search_path),
--PATH_LAST_POSITION(search_path));
blk_size = sb->s_blocksize;
- if (comp_short_keys(&(p_le_ih->ih_key), p_cpu_key)) {
+ if (comp_short_keys(&p_le_ih->ih_key, p_cpu_key))
return FILE_NOT_FOUND;
- }
- // FIXME: quite ugly this far
+
+ /* FIXME: quite ugly this far */
item_offset = le_ih_k_offset(p_le_ih);
offset = cpu_key_k_offset(p_cpu_key);
@@ -866,8 +889,10 @@ int search_for_position_by_key(struct super_block *sb, /* Pointer to the super b
return POSITION_FOUND;
}
- /* Needed byte is not contained in the item pointed to by the
- path. Set pos_in_item out of the item. */
+ /*
+ * Needed byte is not contained in the item pointed to by the
+ * path. Set pos_in_item out of the item.
+ */
if (is_indirect_le_ih(p_le_ih))
pos_in_item(search_path) =
ih_item_len(p_le_ih) / UNFM_P_SIZE;
@@ -892,19 +917,17 @@ int comp_items(const struct item_head *stored_ih, const struct treepath *path)
return 1;
/* we need only to know, whether it is the same item */
- ih = get_ih(path);
+ ih = tp_item_head(path);
return memcmp(stored_ih, ih, IH_SIZE);
}
-/* unformatted nodes are not logged anymore, ever. This is safe
-** now
-*/
+/* unformatted nodes are not logged anymore, ever. This is safe now */
#define held_by_others(bh) (atomic_read(&(bh)->b_count) > 1)
-// block can not be forgotten as it is in I/O or held by someone
+/* block can not be forgotten as it is in I/O or held by someone */
#define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh)))
-// prepare for delete or cut of direct item
+/* prepare for delete or cut of direct item */
static inline int prepare_for_direct_item(struct treepath *path,
struct item_head *le_ih,
struct inode *inode,
@@ -917,9 +940,8 @@ static inline int prepare_for_direct_item(struct treepath *path,
*cut_size = -(IH_SIZE + ih_item_len(le_ih));
return M_DELETE;
}
- // new file gets truncated
+ /* new file gets truncated */
if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) {
- //
round_len = ROUND_UP(new_file_length);
/* this was new_file_length < le_ih ... */
if (round_len < le_ih_k_offset(le_ih)) {
@@ -933,12 +955,13 @@ static inline int prepare_for_direct_item(struct treepath *path,
return M_CUT; /* Cut from this item. */
}
- // old file: items may have any length
+ /* old file: items may have any length */
if (new_file_length < le_ih_k_offset(le_ih)) {
*cut_size = -(IH_SIZE + ih_item_len(le_ih));
return M_DELETE; /* Delete this item. */
}
+
/* Calculate first position and size for cutting from item. */
*cut_size = -(ih_item_len(le_ih) -
(pos_in_item(path) =
@@ -957,12 +980,15 @@ static inline int prepare_for_direntry_item(struct treepath *path,
RFALSE(ih_entry_count(le_ih) != 2,
"PAP-5220: incorrect empty directory item (%h)", le_ih);
*cut_size = -(IH_SIZE + ih_item_len(le_ih));
- return M_DELETE; /* Delete the directory item containing "." and ".." entry. */
+ /* Delete the directory item containing "." and ".." entry. */
+ return M_DELETE;
}
if (ih_entry_count(le_ih) == 1) {
- /* Delete the directory item such as there is one record only
- in this item */
+ /*
+ * Delete the directory item such as there is one record only
+ * in this item
+ */
*cut_size = -(IH_SIZE + ih_item_len(le_ih));
return M_DELETE;
}
@@ -976,18 +1002,34 @@ static inline int prepare_for_direntry_item(struct treepath *path,
#define JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD (2 * JOURNAL_PER_BALANCE_CNT + 1)
-/* If the path points to a directory or direct item, calculate mode and the size cut, for balance.
- If the path points to an indirect item, remove some number of its unformatted nodes.
- In case of file truncate calculate whether this item must be deleted/truncated or last
- unformatted node of this item will be converted to a direct item.
- This function returns a determination of what balance mode the calling function should employ. */
-static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, struct inode *inode, struct treepath *path, const struct cpu_key *item_key, int *removed, /* Number of unformatted nodes which were removed
- from end of the file. */
- int *cut_size, unsigned long long new_file_length /* MAX_KEY_OFFSET in case of delete. */
+/*
+ * If the path points to a directory or direct item, calculate mode
+ * and the size cut, for balance.
+ * If the path points to an indirect item, remove some number of its
+ * unformatted nodes.
+ * In case of file truncate calculate whether this item must be
+ * deleted/truncated or last unformatted node of this item will be
+ * converted to a direct item.
+ * This function returns a determination of what balance mode the
+ * calling function should employ.
+ */
+static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th,
+ struct inode *inode,
+ struct treepath *path,
+ const struct cpu_key *item_key,
+ /*
+ * Number of unformatted nodes
+ * which were removed from end
+ * of the file.
+ */
+ int *removed,
+ int *cut_size,
+ /* MAX_KEY_OFFSET in case of delete. */
+ unsigned long long new_file_length
)
{
struct super_block *sb = inode->i_sb;
- struct item_head *p_le_ih = PATH_PITEM_HEAD(path);
+ struct item_head *p_le_ih = tp_item_head(path);
struct buffer_head *bh = PATH_PLAST_BUFFER(path);
BUG_ON(!th->t_trans_id);
@@ -1023,8 +1065,10 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
int pos = 0;
if ( new_file_length == max_reiserfs_offset (inode) ) {
- /* prepare_for_delete_or_cut() is called by
- * reiserfs_delete_item() */
+ /*
+ * prepare_for_delete_or_cut() is called by
+ * reiserfs_delete_item()
+ */
new_file_length = 0;
delete = 1;
}
@@ -1033,27 +1077,30 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
need_re_search = 0;
*cut_size = 0;
bh = PATH_PLAST_BUFFER(path);
- copy_item_head(&s_ih, PATH_PITEM_HEAD(path));
+ copy_item_head(&s_ih, tp_item_head(path));
pos = I_UNFM_NUM(&s_ih);
while (le_ih_k_offset (&s_ih) + (pos - 1) * blk_size > new_file_length) {
__le32 *unfm;
__u32 block;
- /* Each unformatted block deletion may involve one additional
- * bitmap block into the transaction, thereby the initial
- * journal space reservation might not be enough. */
+ /*
+ * Each unformatted block deletion may involve
+ * one additional bitmap block into the transaction,
+ * thereby the initial journal space reservation
+ * might not be enough.
+ */
if (!delete && (*cut_size) != 0 &&
reiserfs_transaction_free_space(th) < JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD)
break;
- unfm = (__le32 *)B_I_PITEM(bh, &s_ih) + pos - 1;
+ unfm = (__le32 *)ih_item_body(bh, &s_ih) + pos - 1;
block = get_block_num(unfm, 0);
if (block != 0) {
reiserfs_prepare_for_journal(sb, bh, 1);
put_block_num(unfm, 0, 0);
- journal_mark_dirty(th, sb, bh);
+ journal_mark_dirty(th, bh);
reiserfs_free_block(th, inode, block, 1);
}
@@ -1074,17 +1121,21 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
break;
}
}
- /* a trick. If the buffer has been logged, this will do nothing. If
- ** we've broken the loop without logging it, it will restore the
- ** buffer */
+ /*
+ * a trick. If the buffer has been logged, this will
+ * do nothing. If we've broken the loop without logging
+ * it, it will restore the buffer
+ */
reiserfs_restore_prepared_buffer(sb, bh);
} while (need_re_search &&
search_for_position_by_key(sb, item_key, path) == POSITION_FOUND);
pos_in_item(path) = pos * UNFM_P_SIZE;
if (*cut_size == 0) {
- /* Nothing were cut. maybe convert last unformatted node to the
- * direct item? */
+ /*
+ * Nothing was cut. maybe convert last unformatted node to the
+ * direct item?
+ */
result = M_CONVERT;
}
return result;
@@ -1095,7 +1146,7 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
static int calc_deleted_bytes_number(struct tree_balance *tb, char mode)
{
int del_size;
- struct item_head *p_le_ih = PATH_PITEM_HEAD(tb->tb_path);
+ struct item_head *p_le_ih = tp_item_head(tb->tb_path);
if (is_statdata_le_ih(p_le_ih))
return 0;
@@ -1104,9 +1155,11 @@ static int calc_deleted_bytes_number(struct tree_balance *tb, char mode)
(mode ==
M_DELETE) ? ih_item_len(p_le_ih) : -tb->insert_size[0];
if (is_direntry_le_ih(p_le_ih)) {
- /* return EMPTY_DIR_SIZE; We delete emty directoris only.
- * we can't use EMPTY_DIR_SIZE, as old format dirs have a different
- * empty size. ick. FIXME, is this right? */
+ /*
+ * return EMPTY_DIR_SIZE; We delete emty directories only.
+ * we can't use EMPTY_DIR_SIZE, as old format dirs have a
+ * different empty size. ick. FIXME, is this right?
+ */
return del_size;
}
@@ -1169,7 +1222,8 @@ char head2type(struct item_head *ih)
}
#endif
-/* Delete object item.
+/*
+ * Delete object item.
* th - active transaction handle
* path - path to the deleted item
* item_key - key to search for the deleted item
@@ -1212,7 +1266,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
RFALSE(mode != M_DELETE, "PAP-5320: mode must be M_DELETE");
- copy_item_head(&s_ih, PATH_PITEM_HEAD(path));
+ copy_item_head(&s_ih, tp_item_head(path));
s_del_balance.insert_size[0] = del_size;
ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL);
@@ -1221,7 +1275,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
PROC_INFO_INC(sb, delete_item_restarted);
- // file system changed, repeat search
+ /* file system changed, repeat search */
ret_value =
search_for_position_by_key(sb, item_key, path);
if (ret_value == IO_ERROR)
@@ -1238,16 +1292,18 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
unfix_nodes(&s_del_balance);
return 0;
}
- // reiserfs_delete_item returns item length when success
+
+ /* reiserfs_delete_item returns item length when success */
ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE);
- q_ih = get_ih(path);
+ q_ih = tp_item_head(path);
quota_cut_bytes = ih_item_len(q_ih);
- /* hack so the quota code doesn't have to guess if the file
- ** has a tail. On tail insert, we allocate quota for 1 unformatted node.
- ** We test the offset because the tail might have been
- ** split into multiple items, and we only want to decrement for
- ** the unfm node once
+ /*
+ * hack so the quota code doesn't have to guess if the file has a
+ * tail. On tail insert, we allocate quota for 1 unformatted node.
+ * We test the offset because the tail might have been
+ * split into multiple items, and we only want to decrement for
+ * the unfm node once
*/
if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(q_ih)) {
if ((le_ih_k_offset(q_ih) & (sb->s_blocksize - 1)) == 1) {
@@ -1261,33 +1317,38 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
int off;
char *data;
- /* We are in direct2indirect conversion, so move tail contents
- to the unformatted node */
- /* note, we do the copy before preparing the buffer because we
- ** don't care about the contents of the unformatted node yet.
- ** the only thing we really care about is the direct item's data
- ** is in the unformatted node.
- **
- ** Otherwise, we would have to call reiserfs_prepare_for_journal on
- ** the unformatted node, which might schedule, meaning we'd have to
- ** loop all the way back up to the start of the while loop.
- **
- ** The unformatted node must be dirtied later on. We can't be
- ** sure here if the entire tail has been deleted yet.
- **
- ** un_bh is from the page cache (all unformatted nodes are
- ** from the page cache) and might be a highmem page. So, we
- ** can't use un_bh->b_data.
- ** -clm
+ /*
+ * We are in direct2indirect conversion, so move tail contents
+ * to the unformatted node
+ */
+ /*
+ * note, we do the copy before preparing the buffer because we
+ * don't care about the contents of the unformatted node yet.
+ * the only thing we really care about is the direct item's
+ * data is in the unformatted node.
+ *
+ * Otherwise, we would have to call
+ * reiserfs_prepare_for_journal on the unformatted node,
+ * which might schedule, meaning we'd have to loop all the
+ * way back up to the start of the while loop.
+ *
+ * The unformatted node must be dirtied later on. We can't be
+ * sure here if the entire tail has been deleted yet.
+ *
+ * un_bh is from the page cache (all unformatted nodes are
+ * from the page cache) and might be a highmem page. So, we
+ * can't use un_bh->b_data.
+ * -clm
*/
data = kmap_atomic(un_bh->b_page);
off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1));
memcpy(data + off,
- B_I_PITEM(PATH_PLAST_BUFFER(path), &s_ih),
+ ih_item_body(PATH_PLAST_BUFFER(path), &s_ih),
ret_value);
kunmap_atomic(data);
}
+
/* Perform balancing after all resources have been collected at once. */
do_balance(&s_del_balance, NULL, NULL, M_DELETE);
@@ -1304,20 +1365,21 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
return ret_value;
}
-/* Summary Of Mechanisms For Handling Collisions Between Processes:
-
- deletion of the body of the object is performed by iput(), with the
- result that if multiple processes are operating on a file, the
- deletion of the body of the file is deferred until the last process
- that has an open inode performs its iput().
-
- writes and truncates are protected from collisions by use of
- semaphores.
-
- creates, linking, and mknod are protected from collisions with other
- processes by making the reiserfs_add_entry() the last step in the
- creation, and then rolling back all changes if there was a collision.
- - Hans
+/*
+ * Summary Of Mechanisms For Handling Collisions Between Processes:
+ *
+ * deletion of the body of the object is performed by iput(), with the
+ * result that if multiple processes are operating on a file, the
+ * deletion of the body of the file is deferred until the last process
+ * that has an open inode performs its iput().
+ *
+ * writes and truncates are protected from collisions by use of
+ * semaphores.
+ *
+ * creates, linking, and mknod are protected from collisions with other
+ * processes by making the reiserfs_add_entry() the last step in the
+ * creation, and then rolling back all changes if there was a collision.
+ * - Hans
*/
/* this deletes item which never gets split */
@@ -1347,7 +1409,11 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
}
if (retval != ITEM_FOUND) {
pathrelse(&path);
- // No need for a warning, if there is just no free space to insert '..' item into the newly-created subdir
+ /*
+ * No need for a warning, if there is just no free
+ * space to insert '..' item into the
+ * newly-created subdir
+ */
if (!
((unsigned long long)
GET_HASH_VALUE(le_key_k_offset
@@ -1362,11 +1428,11 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
}
if (!tb_init) {
tb_init = 1;
- item_len = ih_item_len(PATH_PITEM_HEAD(&path));
+ item_len = ih_item_len(tp_item_head(&path));
init_tb_struct(th, &tb, th->t_super, &path,
-(IH_SIZE + item_len));
}
- quota_cut_bytes = ih_item_len(PATH_PITEM_HEAD(&path));
+ quota_cut_bytes = ih_item_len(tp_item_head(&path));
retval = fix_nodes(M_DELETE, &tb, NULL, NULL);
if (retval == REPEAT_SEARCH) {
@@ -1376,7 +1442,11 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
if (retval == CARRY_ON) {
do_balance(&tb, NULL, NULL, M_DELETE);
- if (inode) { /* Should we count quota for item? (we don't count quotas for save-links) */
+ /*
+ * Should we count quota for item? (we don't
+ * count quotas for save-links)
+ */
+ if (inode) {
int depth;
#ifdef REISERQUOTA_DEBUG
reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
@@ -1391,7 +1461,8 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
}
break;
}
- // IO_ERROR, NO_DISK_SPACE, etc
+
+ /* IO_ERROR, NO_DISK_SPACE, etc */
reiserfs_warning(th->t_super, "vs-5360",
"could not delete %K due to fix_nodes failure",
&cpu_key);
@@ -1447,11 +1518,13 @@ static void unmap_buffers(struct page *page, loff_t pos)
do {
next = bh->b_this_page;
- /* we want to unmap the buffers that contain the tail, and
- ** all the buffers after it (since the tail must be at the
- ** end of the file). We don't want to unmap file data
- ** before the tail, since it might be dirty and waiting to
- ** reach disk
+ /*
+ * we want to unmap the buffers that contain
+ * the tail, and all the buffers after it
+ * (since the tail must be at the end of the
+ * file). We don't want to unmap file data
+ * before the tail, since it might be dirty
+ * and waiting to reach disk
*/
cur_index += bh->b_size;
if (cur_index > tail_index) {
@@ -1476,9 +1549,10 @@ static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th,
BUG_ON(!th->t_trans_id);
BUG_ON(new_file_size != inode->i_size);
- /* the page being sent in could be NULL if there was an i/o error
- ** reading in the last block. The user will hit problems trying to
- ** read the file, but for now we just skip the indirect2direct
+ /*
+ * the page being sent in could be NULL if there was an i/o error
+ * reading in the last block. The user will hit problems trying to
+ * read the file, but for now we just skip the indirect2direct
*/
if (atomic_read(&inode->i_count) > 1 ||
!tail_has_to_be_packed(inode) ||
@@ -1490,17 +1564,18 @@ static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th,
pathrelse(path);
return cut_bytes;
}
+
/* Perform the conversion to a direct_item. */
- /* return indirect_to_direct(inode, path, item_key,
- new_file_size, mode); */
return indirect2direct(th, inode, page, path, item_key,
new_file_size, mode);
}
-/* we did indirect_to_direct conversion. And we have inserted direct
- item successesfully, but there were no disk space to cut unfm
- pointer being converted. Therefore we have to delete inserted
- direct item(s) */
+/*
+ * we did indirect_to_direct conversion. And we have inserted direct
+ * item successesfully, but there were no disk space to cut unfm
+ * pointer being converted. Therefore we have to delete inserted
+ * direct item(s)
+ */
static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th,
struct inode *inode, struct treepath *path)
{
@@ -1509,7 +1584,7 @@ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th,
int removed;
BUG_ON(!th->t_trans_id);
- make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4); // !!!!
+ make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4);
tail_key.key_length = 4;
tail_len =
@@ -1521,7 +1596,7 @@ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th,
reiserfs_panic(inode->i_sb, "vs-5615",
"found invalid item");
RFALSE(path->pos_in_item !=
- ih_item_len(PATH_PITEM_HEAD(path)) - 1,
+ ih_item_len(tp_item_head(path)) - 1,
"vs-5616: appended bytes found");
PATH_LAST_POSITION(path)--;
@@ -1539,7 +1614,6 @@ static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th,
reiserfs_warning(inode->i_sb, "reiserfs-5091", "indirect_to_direct "
"conversion has been rolled back due to "
"lack of disk space");
- //mark_file_without_tail (inode);
mark_inode_dirty(inode);
}
@@ -1551,15 +1625,18 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
struct page *page, loff_t new_file_size)
{
struct super_block *sb = inode->i_sb;
- /* Every function which is going to call do_balance must first
- create a tree_balance structure. Then it must fill up this
- structure by using the init_tb_struct and fix_nodes functions.
- After that we can make tree balancing. */
+ /*
+ * Every function which is going to call do_balance must first
+ * create a tree_balance structure. Then it must fill up this
+ * structure by using the init_tb_struct and fix_nodes functions.
+ * After that we can make tree balancing.
+ */
struct tree_balance s_cut_balance;
struct item_head *p_le_ih;
- int cut_size = 0, /* Amount to be cut. */
- ret_value = CARRY_ON, removed = 0, /* Number of the removed unformatted nodes. */
- is_inode_locked = 0;
+ int cut_size = 0; /* Amount to be cut. */
+ int ret_value = CARRY_ON;
+ int removed = 0; /* Number of the removed unformatted nodes. */
+ int is_inode_locked = 0;
char mode; /* Mode of the balance. */
int retval2 = -1;
int quota_cut_bytes;
@@ -1571,21 +1648,27 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
init_tb_struct(th, &s_cut_balance, inode->i_sb, path,
cut_size);
- /* Repeat this loop until we either cut the item without needing
- to balance, or we fix_nodes without schedule occurring */
+ /*
+ * Repeat this loop until we either cut the item without needing
+ * to balance, or we fix_nodes without schedule occurring
+ */
while (1) {
- /* Determine the balance mode, position of the first byte to
- be cut, and size to be cut. In case of the indirect item
- free unformatted nodes which are pointed to by the cut
- pointers. */
+ /*
+ * Determine the balance mode, position of the first byte to
+ * be cut, and size to be cut. In case of the indirect item
+ * free unformatted nodes which are pointed to by the cut
+ * pointers.
+ */
mode =
prepare_for_delete_or_cut(th, inode, path,
item_key, &removed,
&cut_size, new_file_size);
if (mode == M_CONVERT) {
- /* convert last unformatted node to direct item or leave
- tail in the unformatted node */
+ /*
+ * convert last unformatted node to direct item or
+ * leave tail in the unformatted node
+ */
RFALSE(ret_value != CARRY_ON,
"PAP-5570: can not convert twice");
@@ -1599,15 +1682,20 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
is_inode_locked = 1;
- /* removing of last unformatted node will change value we
- have to return to truncate. Save it */
+ /*
+ * removing of last unformatted node will
+ * change value we have to return to truncate.
+ * Save it
+ */
retval2 = ret_value;
- /*retval2 = sb->s_blocksize - (new_file_size & (sb->s_blocksize - 1)); */
- /* So, we have performed the first part of the conversion:
- inserting the new direct item. Now we are removing the
- last unformatted node pointer. Set key to search for
- it. */
+ /*
+ * So, we have performed the first part of the
+ * conversion:
+ * inserting the new direct item. Now we are
+ * removing the last unformatted node pointer.
+ * Set key to search for it.
+ */
set_cpu_key_k_type(item_key, TYPE_INDIRECT);
item_key->key_length = 4;
new_file_size -=
@@ -1650,11 +1738,13 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
return (ret_value == IO_ERROR) ? -EIO : -ENOENT;
} /* while */
- // check fix_nodes results (IO_ERROR or NO_DISK_SPACE)
+ /* check fix_nodes results (IO_ERROR or NO_DISK_SPACE) */
if (ret_value != CARRY_ON) {
if (is_inode_locked) {
- // FIXME: this seems to be not needed: we are always able
- // to cut item
+ /*
+ * FIXME: this seems to be not needed: we are always
+ * able to cut item
+ */
indirect_to_direct_roll_back(th, inode, path);
}
if (ret_value == NO_DISK_SPACE)
@@ -1671,22 +1761,23 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
/* Calculate number of bytes that need to be cut from the item. */
quota_cut_bytes =
(mode ==
- M_DELETE) ? ih_item_len(get_ih(path)) : -s_cut_balance.
+ M_DELETE) ? ih_item_len(tp_item_head(path)) : -s_cut_balance.
insert_size[0];
if (retval2 == -1)
ret_value = calc_deleted_bytes_number(&s_cut_balance, mode);
else
ret_value = retval2;
- /* For direct items, we only change the quota when deleting the last
- ** item.
+ /*
+ * For direct items, we only change the quota when deleting the last
+ * item.
*/
- p_le_ih = PATH_PITEM_HEAD(s_cut_balance.tb_path);
+ p_le_ih = tp_item_head(s_cut_balance.tb_path);
if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_le_ih)) {
if (mode == M_DELETE &&
(le_ih_k_offset(p_le_ih) & (sb->s_blocksize - 1)) ==
1) {
- // FIXME: this is to keep 3.5 happy
+ /* FIXME: this is to keep 3.5 happy */
REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
quota_cut_bytes = sb->s_blocksize + UNFM_P_SIZE;
} else {
@@ -1696,10 +1787,12 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
#ifdef CONFIG_REISERFS_CHECK
if (is_inode_locked) {
struct item_head *le_ih =
- PATH_PITEM_HEAD(s_cut_balance.tb_path);
- /* we are going to complete indirect2direct conversion. Make
- sure, that we exactly remove last unformatted node pointer
- of the item */
+ tp_item_head(s_cut_balance.tb_path);
+ /*
+ * we are going to complete indirect2direct conversion. Make
+ * sure, that we exactly remove last unformatted node pointer
+ * of the item
+ */
if (!is_indirect_le_ih(le_ih))
reiserfs_panic(sb, "vs-5652",
"item must be indirect %h", le_ih);
@@ -1717,17 +1810,20 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
"(CUT, insert_size==%d)",
le_ih, s_cut_balance.insert_size[0]);
}
- /* it would be useful to make sure, that right neighboring
- item is direct item of this file */
+ /*
+ * it would be useful to make sure, that right neighboring
+ * item is direct item of this file
+ */
}
#endif
do_balance(&s_cut_balance, NULL, NULL, mode);
if (is_inode_locked) {
- /* we've done an indirect->direct conversion. when the data block
- ** was freed, it was removed from the list of blocks that must
- ** be flushed before the transaction commits, make sure to
- ** unmap and invalidate it
+ /*
+ * we've done an indirect->direct conversion. when the
+ * data block was freed, it was removed from the list of
+ * blocks that must be flushed before the transaction
+ * commits, make sure to unmap and invalidate it
*/
unmap_buffers(page, tail_pos);
REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
@@ -1758,20 +1854,25 @@ static void truncate_directory(struct reiserfs_transaction_handle *th,
set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_STAT_DATA);
}
-/* Truncate file to the new size. Note, this must be called with a transaction
- already started */
+/*
+ * Truncate file to the new size. Note, this must be called with a
+ * transaction already started
+ */
int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
- struct inode *inode, /* ->i_size contains new size */
+ struct inode *inode, /* ->i_size contains new size */
struct page *page, /* up to date for last block */
- int update_timestamps /* when it is called by
- file_release to convert
- the tail - no timestamps
- should be updated */
+ /*
+ * when it is called by file_release to convert
+ * the tail - no timestamps should be updated
+ */
+ int update_timestamps
)
{
INITIALIZE_PATH(s_search_path); /* Path to the current object item. */
struct item_head *p_le_ih; /* Pointer to an item header. */
- struct cpu_key s_item_key; /* Key to search for a previous file item. */
+
+ /* Key to search for a previous file item. */
+ struct cpu_key s_item_key;
loff_t file_size, /* Old file size. */
new_file_size; /* New file size. */
int deleted; /* Number of deleted or truncated bytes. */
@@ -1784,8 +1885,8 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
|| S_ISLNK(inode->i_mode)))
return 0;
+ /* deletion of directory - no need to update timestamps */
if (S_ISDIR(inode->i_mode)) {
- // deletion of directory - no need to update timestamps
truncate_directory(th, inode);
return 0;
}
@@ -1793,7 +1894,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
/* Get new file size. */
new_file_size = inode->i_size;
- // FIXME: note, that key type is unimportant here
+ /* FIXME: note, that key type is unimportant here */
make_cpu_key(&s_item_key, inode, max_reiserfs_offset(inode),
TYPE_DIRECT, 3);
@@ -1819,7 +1920,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
s_search_path.pos_in_item--;
/* Get real file size (total length of all file items) */
- p_le_ih = PATH_PITEM_HEAD(&s_search_path);
+ p_le_ih = tp_item_head(&s_search_path);
if (is_statdata_le_ih(p_le_ih))
file_size = 0;
else {
@@ -1827,9 +1928,11 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
int bytes =
op_bytes_number(p_le_ih, inode->i_sb->s_blocksize);
- /* this may mismatch with real file size: if last direct item
- had no padding zeros and last unformatted node had no free
- space, this file would have this file size */
+ /*
+ * this may mismatch with real file size: if last direct item
+ * had no padding zeros and last unformatted node had no free
+ * space, this file would have this file size
+ */
file_size = offset + bytes - 1;
}
/*
@@ -1867,18 +1970,20 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
set_cpu_key_k_offset(&s_item_key, file_size);
- /* While there are bytes to truncate and previous file item is presented in the tree. */
+ /*
+ * While there are bytes to truncate and previous
+ * file item is presented in the tree.
+ */
/*
- ** This loop could take a really long time, and could log
- ** many more blocks than a transaction can hold. So, we do a polite
- ** journal end here, and if the transaction needs ending, we make
- ** sure the file is consistent before ending the current trans
- ** and starting a new one
+ * This loop could take a really long time, and could log
+ * many more blocks than a transaction can hold. So, we do
+ * a polite journal end here, and if the transaction needs
+ * ending, we make sure the file is consistent before ending
+ * the current trans and starting a new one
*/
if (journal_transaction_should_end(th, 0) ||
reiserfs_transaction_free_space(th) <= JOURNAL_FOR_FREE_BLOCK_AND_UPDATE_SD) {
- int orig_len_alloc = th->t_blocks_allocated;
pathrelse(&s_search_path);
if (update_timestamps) {
@@ -1887,7 +1992,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
}
reiserfs_update_sd(th, inode);
- err = journal_end(th, inode->i_sb, orig_len_alloc);
+ err = journal_end(th);
if (err)
goto out;
err = journal_begin(th, inode->i_sb,
@@ -1904,25 +2009,25 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
"PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d",
new_file_size, file_size, s_item_key.on_disk_key.k_objectid);
- update_and_out:
+update_and_out:
if (update_timestamps) {
- // this is truncate, not file closing
+ /* this is truncate, not file closing */
inode->i_mtime = CURRENT_TIME_SEC;
inode->i_ctime = CURRENT_TIME_SEC;
}
reiserfs_update_sd(th, inode);
- out:
+out:
pathrelse(&s_search_path);
return err;
}
#ifdef CONFIG_REISERFS_CHECK
-// this makes sure, that we __append__, not overwrite or add holes
+/* this makes sure, that we __append__, not overwrite or add holes */
static void check_research_for_paste(struct treepath *path,
const struct cpu_key *key)
{
- struct item_head *found_ih = get_ih(path);
+ struct item_head *found_ih = tp_item_head(path);
if (is_direct_le_ih(found_ih)) {
if (le_ih_k_offset(found_ih) +
@@ -1952,13 +2057,22 @@ static void check_research_for_paste(struct treepath *path,
}
#endif /* config reiserfs check */
-/* Paste bytes to the existing item. Returns bytes number pasted into the item. */
-int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct treepath *search_path, /* Path to the pasted item. */
- const struct cpu_key *key, /* Key to search for the needed item. */
- struct inode *inode, /* Inode item belongs to */
- const char *body, /* Pointer to the bytes to paste. */
+/*
+ * Paste bytes to the existing item.
+ * Returns bytes number pasted into the item.
+ */
+int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th,
+ /* Path to the pasted item. */
+ struct treepath *search_path,
+ /* Key to search for the needed item. */
+ const struct cpu_key *key,
+ /* Inode item belongs to */
+ struct inode *inode,
+ /* Pointer to the bytes to paste. */
+ const char *body,
+ /* Size of pasted bytes. */
int pasted_size)
-{ /* Size of pasted bytes. */
+{
struct super_block *sb = inode->i_sb;
struct tree_balance s_paste_balance;
int retval;
@@ -1973,7 +2087,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
"reiserquota paste_into_item(): allocating %u id=%u type=%c",
pasted_size, inode->i_uid,
- key2type(&(key->on_disk_key)));
+ key2type(&key->on_disk_key));
#endif
depth = reiserfs_write_unlock_nested(sb);
@@ -1997,7 +2111,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
while ((retval =
fix_nodes(M_PASTE, &s_paste_balance, NULL,
body)) == REPEAT_SEARCH) {
- search_again:
+search_again:
/* file system changed while we were in the fix_nodes */
PROC_INFO_INC(th->t_super, paste_into_item_restarted);
retval =
@@ -2019,21 +2133,23 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
#endif
}
- /* Perform balancing after all resources are collected by fix_nodes, and
- accessing them will not risk triggering schedule. */
+ /*
+ * Perform balancing after all resources are collected by fix_nodes,
+ * and accessing them will not risk triggering schedule.
+ */
if (retval == CARRY_ON) {
do_balance(&s_paste_balance, NULL /*ih */ , body, M_PASTE);
return 0;
}
retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
- error_out:
+error_out:
/* this also releases the path */
unfix_nodes(&s_paste_balance);
#ifdef REISERQUOTA_DEBUG
reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
"reiserquota paste_into_item(): freeing %u id=%u type=%c",
pasted_size, inode->i_uid,
- key2type(&(key->on_disk_key)));
+ key2type(&key->on_disk_key));
#endif
depth = reiserfs_write_unlock_nested(sb);
dquot_free_space_nodirty(inode, pasted_size);
@@ -2041,7 +2157,8 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
return retval;
}
-/* Insert new item into the buffer at the path.
+/*
+ * Insert new item into the buffer at the path.
* th - active transaction handle
* path - path to the inserted item
* ih - pointer to the item header to insert
@@ -2064,8 +2181,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
fs_gen = get_generation(inode->i_sb);
quota_bytes = ih_item_len(ih);
- /* hack so the quota code doesn't have to guess if the file has
- ** a tail, links are always tails, so there's no guessing needed
+ /*
+ * hack so the quota code doesn't have to guess
+ * if the file has a tail, links are always tails,
+ * so there's no guessing needed
*/
if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(ih))
quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE;
@@ -2074,8 +2193,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
"reiserquota insert_item(): allocating %u id=%u type=%c",
quota_bytes, inode->i_uid, head2type(ih));
#endif
- /* We can't dirty inode here. It would be immediately written but
- * appropriate stat item isn't inserted yet... */
+ /*
+ * We can't dirty inode here. It would be immediately
+ * written but appropriate stat item isn't inserted yet...
+ */
depth = reiserfs_write_unlock_nested(inode->i_sb);
retval = dquot_alloc_space_nodirty(inode, quota_bytes);
reiserfs_write_lock_nested(inode->i_sb, depth);
@@ -2089,7 +2210,10 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
#ifdef DISPLACE_NEW_PACKING_LOCALITIES
s_ins_balance.key = key->on_disk_key;
#endif
- /* DQUOT_* can schedule, must check to be sure calling fix_nodes is safe */
+ /*
+ * DQUOT_* can schedule, must check to be sure calling
+ * fix_nodes is safe
+ */
if (inode && fs_changed(fs_gen, inode->i_sb)) {
goto search_again;
}
@@ -2097,7 +2221,7 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
while ((retval =
fix_nodes(M_INSERT, &s_ins_balance, ih,
body)) == REPEAT_SEARCH) {
- search_again:
+search_again:
/* file system changed while we were in the fix_nodes */
PROC_INFO_INC(th->t_super, insert_item_restarted);
retval = search_item(th->t_super, key, path);
@@ -2121,7 +2245,7 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
}
retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
- error_out:
+error_out:
/* also releases the path */
unfix_nodes(&s_ins_balance);
#ifdef REISERQUOTA_DEBUG
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3ead145dadc..a392cef6acc 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -62,7 +62,6 @@ static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs)
static int reiserfs_remount(struct super_block *s, int *flags, char *data);
static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
-void show_alloc_options(struct seq_file *seq, struct super_block *s);
static int reiserfs_sync_fs(struct super_block *s, int wait)
{
@@ -75,7 +74,7 @@ static int reiserfs_sync_fs(struct super_block *s, int wait)
dquot_writeback_dquots(s, -1);
reiserfs_write_lock(s);
if (!journal_begin(&th, s, 1))
- if (!journal_end_sync(&th, s, 1))
+ if (!journal_end_sync(&th))
reiserfs_flush_old_commits(s);
reiserfs_write_unlock(s);
return 0;
@@ -137,9 +136,9 @@ static int reiserfs_freeze(struct super_block *s)
} else {
reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
1);
- journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
+ journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
reiserfs_block_writes(&th);
- journal_end_sync(&th, s, 1);
+ journal_end_sync(&th);
}
}
reiserfs_write_unlock(s);
@@ -154,13 +153,15 @@ static int reiserfs_unfreeze(struct super_block *s)
extern const struct in_core_key MAX_IN_CORE_KEY;
-/* this is used to delete "save link" when there are no items of a
- file it points to. It can either happen if unlink is completed but
- "save unlink" removal, or if file has both unlink and truncate
- pending and as unlink completes first (because key of "save link"
- protecting unlink is bigger that a key lf "save link" which
- protects truncate), so there left no items to make truncate
- completion on */
+/*
+ * this is used to delete "save link" when there are no items of a
+ * file it points to. It can either happen if unlink is completed but
+ * "save unlink" removal, or if file has both unlink and truncate
+ * pending and as unlink completes first (because key of "save link"
+ * protecting unlink is bigger that a key lf "save link" which
+ * protects truncate), so there left no items to make truncate
+ * completion on
+ */
static int remove_save_link_only(struct super_block *s,
struct reiserfs_key *key, int oid_free)
{
@@ -177,7 +178,7 @@ static int remove_save_link_only(struct super_block *s,
/* removals are protected by direct items */
reiserfs_release_objectid(&th, le32_to_cpu(key->k_objectid));
- return journal_end(&th, s, JOURNAL_PER_BALANCE_CNT);
+ return journal_end(&th);
}
#ifdef CONFIG_QUOTA
@@ -259,7 +260,7 @@ static int finish_unfinished(struct super_block *s)
break;
}
item_pos--;
- ih = B_N_PITEM_HEAD(bh, item_pos);
+ ih = item_head(bh, item_pos);
if (le32_to_cpu(ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID)
/* there are no "save" links anymore */
@@ -272,7 +273,7 @@ static int finish_unfinished(struct super_block *s)
truncate = 0;
/* reiserfs_iget needs k_dirid and k_objectid only */
- item = B_I_PITEM(bh, ih);
+ item = ih_item_body(bh, ih);
obj_key.on_disk_key.k_dir_id = le32_to_cpu(*(__le32 *) item);
obj_key.on_disk_key.k_objectid =
le32_to_cpu(ih->ih_key.k_objectid);
@@ -283,8 +284,10 @@ static int finish_unfinished(struct super_block *s)
inode = reiserfs_iget(s, &obj_key);
if (!inode) {
- /* the unlink almost completed, it just did not manage to remove
- "save" link and release objectid */
+ /*
+ * the unlink almost completed, it just did not
+ * manage to remove "save" link and release objectid
+ */
reiserfs_warning(s, "vs-2180", "iget failed for %K",
&obj_key);
retval = remove_save_link_only(s, &save_link_key, 1);
@@ -304,10 +307,13 @@ static int finish_unfinished(struct super_block *s)
reiserfs_write_lock_nested(inode->i_sb, depth);
if (truncate && S_ISDIR(inode->i_mode)) {
- /* We got a truncate request for a dir which is impossible.
- The only imaginable way is to execute unfinished truncate request
- then boot into old kernel, remove the file and create dir with
- the same key. */
+ /*
+ * We got a truncate request for a dir which
+ * is impossible. The only imaginable way is to
+ * execute unfinished truncate request then boot
+ * into old kernel, remove the file and create dir
+ * with the same key.
+ */
reiserfs_warning(s, "green-2101",
"impossible truncate on a "
"directory %k. Please report",
@@ -321,14 +327,16 @@ static int finish_unfinished(struct super_block *s)
if (truncate) {
REISERFS_I(inode)->i_flags |=
i_link_saved_truncate_mask;
- /* not completed truncate found. New size was committed together
- with "save" link */
+ /*
+ * not completed truncate found. New size was
+ * committed together with "save" link
+ */
reiserfs_info(s, "Truncating %k to %Ld ..",
INODE_PKEY(inode), inode->i_size);
- reiserfs_truncate_file(inode,
- 0
- /*don't update modification time */
- );
+
+ /* don't update modification time */
+ reiserfs_truncate_file(inode, 0);
+
retval = remove_save_link(inode, truncate);
} else {
REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask;
@@ -374,10 +382,12 @@ static int finish_unfinished(struct super_block *s)
return retval;
}
-/* to protect file being unlinked from getting lost we "safe" link files
- being unlinked. This link will be deleted in the same transaction with last
- item of file. mounting the filesystem we scan all these links and remove
- files which almost got lost */
+/*
+ * to protect file being unlinked from getting lost we "safe" link files
+ * being unlinked. This link will be deleted in the same transaction with last
+ * item of file. mounting the filesystem we scan all these links and remove
+ * files which almost got lost
+ */
void add_save_link(struct reiserfs_transaction_handle *th,
struct inode *inode, int truncate)
{
@@ -496,7 +506,7 @@ int remove_save_link(struct inode *inode, int truncate)
} else
REISERFS_I(inode)->i_flags &= ~i_link_saved_truncate_mask;
- return journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT);
+ return journal_end(&th);
}
static void reiserfs_kill_sb(struct super_block *s)
@@ -531,19 +541,23 @@ static void reiserfs_put_super(struct super_block *s)
reiserfs_write_lock(s);
- /* change file system state to current state if it was mounted with read-write permissions */
+ /*
+ * change file system state to current state if it was mounted
+ * with read-write permissions
+ */
if (!(s->s_flags & MS_RDONLY)) {
if (!journal_begin(&th, s, 10)) {
reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
1);
set_sb_umount_state(SB_DISK_SUPER_BLOCK(s),
REISERFS_SB(s)->s_mount_state);
- journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
+ journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
}
}
- /* note, journal_release checks for readonly mount, and can decide not
- ** to do a journal_end
+ /*
+ * note, journal_release checks for readonly mount, and can
+ * decide not to do a journal_end
*/
journal_release(&th, s);
@@ -560,6 +574,7 @@ static void reiserfs_put_super(struct super_block *s)
reiserfs_write_unlock(s);
mutex_destroy(&REISERFS_SB(s)->lock);
+ destroy_workqueue(REISERFS_SB(s)->commit_wq);
kfree(s->s_fs_info);
s->s_fs_info = NULL;
}
@@ -597,7 +612,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache",
sizeof(struct
@@ -635,15 +650,16 @@ static void reiserfs_dirty_inode(struct inode *inode, int flags)
}
reiserfs_write_lock(inode->i_sb);
- /* this is really only used for atime updates, so they don't have
- ** to be included in O_SYNC or fsync
+ /*
+ * this is really only used for atime updates, so they don't have
+ * to be included in O_SYNC or fsync
*/
err = journal_begin(&th, inode->i_sb, 1);
if (err)
goto out;
reiserfs_update_sd(&th, inode);
- journal_end(&th, inode->i_sb, 1);
+ journal_end(&th);
out:
reiserfs_write_unlock(inode->i_sb);
@@ -789,31 +805,53 @@ static const struct export_operations reiserfs_export_ops = {
.get_parent = reiserfs_get_parent,
};
-/* this struct is used in reiserfs_getopt () for containing the value for those
- mount options that have values rather than being toggles. */
+/*
+ * this struct is used in reiserfs_getopt () for containing the value for
+ * those mount options that have values rather than being toggles.
+ */
typedef struct {
char *value;
- int setmask; /* bitmask which is to set on mount_options bitmask when this
- value is found, 0 is no bits are to be changed. */
- int clrmask; /* bitmask which is to clear on mount_options bitmask when this
- value is found, 0 is no bits are to be changed. This is
- applied BEFORE setmask */
+ /*
+ * bitmask which is to set on mount_options bitmask
+ * when this value is found, 0 is no bits are to be changed.
+ */
+ int setmask;
+ /*
+ * bitmask which is to clear on mount_options bitmask
+ * when this value is found, 0 is no bits are to be changed.
+ * This is applied BEFORE setmask
+ */
+ int clrmask;
} arg_desc_t;
/* Set this bit in arg_required to allow empty arguments */
#define REISERFS_OPT_ALLOWEMPTY 31
-/* this struct is used in reiserfs_getopt() for describing the set of reiserfs
- mount options */
+/*
+ * this struct is used in reiserfs_getopt() for describing the
+ * set of reiserfs mount options
+ */
typedef struct {
char *option_name;
- int arg_required; /* 0 if argument is not required, not 0 otherwise */
- const arg_desc_t *values; /* list of values accepted by an option */
- int setmask; /* bitmask which is to set on mount_options bitmask when this
- value is found, 0 is no bits are to be changed. */
- int clrmask; /* bitmask which is to clear on mount_options bitmask when this
- value is found, 0 is no bits are to be changed. This is
- applied BEFORE setmask */
+
+ /* 0 if argument is not required, not 0 otherwise */
+ int arg_required;
+
+ /* list of values accepted by an option */
+ const arg_desc_t *values;
+
+ /*
+ * bitmask which is to set on mount_options bitmask
+ * when this value is found, 0 is no bits are to be changed.
+ */
+ int setmask;
+
+ /*
+ * bitmask which is to clear on mount_options bitmask
+ * when this value is found, 0 is no bits are to be changed.
+ * This is applied BEFORE setmask
+ */
+ int clrmask;
} opt_desc_t;
/* possible values for -o data= */
@@ -834,8 +872,10 @@ static const arg_desc_t barrier_mode[] = {
{.value = NULL}
};
-/* possible values for "-o block-allocator=" and bits which are to be set in
- s_mount_opt of reiserfs specific part of in-core super block */
+/*
+ * possible values for "-o block-allocator=" and bits which are to be set in
+ * s_mount_opt of reiserfs specific part of in-core super block
+ */
static const arg_desc_t balloc[] = {
{"noborder", 1 << REISERFS_NO_BORDER, 0},
{"border", 0, 1 << REISERFS_NO_BORDER},
@@ -865,21 +905,25 @@ static const arg_desc_t error_actions[] = {
{NULL, 0, 0},
};
-/* proceed only one option from a list *cur - string containing of mount options
- opts - array of options which are accepted
- opt_arg - if option is found and requires an argument and if it is specifed
- in the input - pointer to the argument is stored here
- bit_flags - if option requires to set a certain bit - it is set here
- return -1 if unknown option is found, opt->arg_required otherwise */
+/*
+ * proceed only one option from a list *cur - string containing of mount
+ * options
+ * opts - array of options which are accepted
+ * opt_arg - if option is found and requires an argument and if it is specifed
+ * in the input - pointer to the argument is stored here
+ * bit_flags - if option requires to set a certain bit - it is set here
+ * return -1 if unknown option is found, opt->arg_required otherwise
+ */
static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
char **opt_arg, unsigned long *bit_flags)
{
char *p;
- /* foo=bar,
- ^ ^ ^
- | | +-- option_end
- | +-- arg_start
- +-- option_start
+ /*
+ * foo=bar,
+ * ^ ^ ^
+ * | | +-- option_end
+ * | +-- arg_start
+ * +-- option_start
*/
const opt_desc_t *opt;
const arg_desc_t *arg;
@@ -894,9 +938,12 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
}
if (!strncmp(p, "alloc=", 6)) {
- /* Ugly special case, probably we should redo options parser so that
- it can understand several arguments for some options, also so that
- it can fill several bitfields with option values. */
+ /*
+ * Ugly special case, probably we should redo options
+ * parser so that it can understand several arguments for
+ * some options, also so that it can fill several bitfields
+ * with option values.
+ */
if (reiserfs_parse_alloc_options(s, p + 6)) {
return -1;
} else {
@@ -959,7 +1006,10 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
return -1;
}
- /* move to the argument, or to next option if argument is not required */
+ /*
+ * move to the argument, or to next option if argument is not
+ * required
+ */
p++;
if (opt->arg_required
@@ -996,12 +1046,20 @@ static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
}
/* returns 0 if something is wrong in option string, 1 - otherwise */
-static int reiserfs_parse_options(struct super_block *s, char *options, /* string given via mount's -o */
+static int reiserfs_parse_options(struct super_block *s,
+
+ /* string given via mount's -o */
+ char *options,
+
+ /*
+ * after the parsing phase, contains the
+ * collection of bitflags defining what
+ * mount options were selected.
+ */
unsigned long *mount_options,
- /* after the parsing phase, contains the
- collection of bitflags defining what
- mount options were selected. */
- unsigned long *blocks, /* strtol-ed from NNN of resize=NNN */
+
+ /* strtol-ed from NNN of resize=NNN */
+ unsigned long *blocks,
char **jdev_name,
unsigned int *commit_max_age,
char **qf_names,
@@ -1011,7 +1069,10 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
char *arg = NULL;
char *pos;
opt_desc_t opts[] = {
- /* Compatibility stuff, so that -o notail for old setups still work */
+ /*
+ * Compatibility stuff, so that -o notail for old
+ * setups still work
+ */
{"tails",.arg_required = 't',.values = tails},
{"notail",.clrmask =
(1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)},
@@ -1056,8 +1117,10 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
*blocks = 0;
if (!options || !*options)
- /* use default configuration: create tails, journaling on, no
- conversion to newest format */
+ /*
+ * use default configuration: create tails, journaling on, no
+ * conversion to newest format
+ */
return 1;
for (pos = options; pos;) {
@@ -1110,7 +1173,8 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
if (c == 'j') {
if (arg && *arg && jdev_name) {
- if (*jdev_name) { //Hm, already assigned?
+ /* Hm, already assigned? */
+ if (*jdev_name) {
reiserfs_warning(s, "super-6510",
"journal device was "
"already specified to "
@@ -1319,6 +1383,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
int i;
#endif
+ sync_filesystem(s);
reiserfs_write_lock(s);
#ifdef CONFIG_QUOTA
@@ -1362,8 +1427,10 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
safe_mask |= 1 << REISERFS_USRQUOTA;
safe_mask |= 1 << REISERFS_GRPQUOTA;
- /* Update the bitmask, taking care to keep
- * the bits we're not allowed to change here */
+ /*
+ * Update the bitmask, taking care to keep
+ * the bits we're not allowed to change here
+ */
REISERFS_SB(s)->s_mount_opt =
(REISERFS_SB(s)->
s_mount_opt & ~safe_mask) | (mount_options & safe_mask);
@@ -1410,7 +1477,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
/* Mounting a rw partition read-only. */
reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
set_sb_umount_state(rs, REISERFS_SB(s)->s_mount_state);
- journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
+ journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
} else {
/* remount read-write */
if (!(s->s_flags & MS_RDONLY)) {
@@ -1427,7 +1494,9 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
handle_data_mode(s, mount_options);
handle_barrier_mode(s, mount_options);
REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
- s->s_flags &= ~MS_RDONLY; /* now it is safe to call journal_begin */
+
+ /* now it is safe to call journal_begin */
+ s->s_flags &= ~MS_RDONLY;
err = journal_begin(&th, s, 10);
if (err)
goto out_err_unlock;
@@ -1440,12 +1509,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
if (!old_format_only(s))
set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
/* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */
- journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
+ journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS;
}
/* this will force a full flush of all journal lists */
SB_JOURNAL(s)->j_must_wait = 1;
- err = journal_end(&th, s, 10);
+ err = journal_end(&th);
if (err)
goto out_err_unlock;
@@ -1479,7 +1548,7 @@ static int read_super_block(struct super_block *s, int offset)
if (!bh) {
reiserfs_warning(s, "sh-2006",
"bread failed (dev %s, block %lu, size %lu)",
- reiserfs_bdevname(s), offset / s->s_blocksize,
+ s->s_id, offset / s->s_blocksize,
s->s_blocksize);
return 1;
}
@@ -1489,9 +1558,9 @@ static int read_super_block(struct super_block *s, int offset)
brelse(bh);
return 1;
}
- //
- // ok, reiserfs signature (old or new) found in at the given offset
- //
+ /*
+ * ok, reiserfs signature (old or new) found in at the given offset
+ */
fs_blocksize = sb_blocksize(rs);
brelse(bh);
sb_set_blocksize(s, fs_blocksize);
@@ -1500,7 +1569,7 @@ static int read_super_block(struct super_block *s, int offset)
if (!bh) {
reiserfs_warning(s, "sh-2007",
"bread failed (dev %s, block %lu, size %lu)",
- reiserfs_bdevname(s), offset / s->s_blocksize,
+ s->s_id, offset / s->s_blocksize,
s->s_blocksize);
return 1;
}
@@ -1509,7 +1578,7 @@ static int read_super_block(struct super_block *s, int offset)
if (sb_blocksize(rs) != s->s_blocksize) {
reiserfs_warning(s, "sh-2011", "can't find a reiserfs "
"filesystem on (dev %s, block %Lu, size %lu)",
- reiserfs_bdevname(s),
+ s->s_id,
(unsigned long long)bh->b_blocknr,
s->s_blocksize);
brelse(bh);
@@ -1529,9 +1598,11 @@ static int read_super_block(struct super_block *s, int offset)
SB_BUFFER_WITH_SB(s) = bh;
SB_DISK_SUPER_BLOCK(s) = rs;
+ /*
+ * magic is of non-standard journal filesystem, look at s_version to
+ * find which format is in use
+ */
if (is_reiserfs_jr(rs)) {
- /* magic is of non-standard journal filesystem, look at s_version to
- find which format is in use */
if (sb_version(rs) == REISERFS_VERSION_2)
reiserfs_info(s, "found reiserfs format \"3.6\""
" with non-standard journal\n");
@@ -1545,8 +1616,10 @@ static int read_super_block(struct super_block *s, int offset)
return 1;
}
} else
- /* s_version of standard format may contain incorrect information,
- so we just look at the magic string */
+ /*
+ * s_version of standard format may contain incorrect
+ * information, so we just look at the magic string
+ */
reiserfs_info(s,
"found reiserfs format \"%s\" with standard journal\n",
is_reiserfs_3_5(rs) ? "3.5" : "3.6");
@@ -1558,8 +1631,9 @@ static int read_super_block(struct super_block *s, int offset)
s->dq_op = &reiserfs_quota_operations;
#endif
- /* new format is limited by the 32 bit wide i_blocks field, want to
- ** be one full block below that.
+ /*
+ * new format is limited by the 32 bit wide i_blocks field, want to
+ * be one full block below that.
*/
s->s_maxbytes = (512LL << 32) - s->s_blocksize;
return 0;
@@ -1568,7 +1642,7 @@ static int read_super_block(struct super_block *s, int offset)
/* after journal replay, reread all bitmap and super blocks */
static int reread_meta_blocks(struct super_block *s)
{
- ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s)));
+ ll_rw_block(READ, 1, &SB_BUFFER_WITH_SB(s));
wait_on_buffer(SB_BUFFER_WITH_SB(s));
if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
reiserfs_warning(s, "reiserfs-2504", "error reading the super");
@@ -1578,14 +1652,15 @@ static int reread_meta_blocks(struct super_block *s)
return 0;
}
-/////////////////////////////////////////////////////
-// hash detection stuff
+/* hash detection stuff */
-// if root directory is empty - we set default - Yura's - hash and
-// warn about it
-// FIXME: we look for only one name in a directory. If tea and yura
-// bith have the same value - we ask user to send report to the
-// mailing list
+/*
+ * if root directory is empty - we set default - Yura's - hash and
+ * warn about it
+ * FIXME: we look for only one name in a directory. If tea and yura
+ * both have the same value - we ask user to send report to the
+ * mailing list
+ */
static __u32 find_hash_out(struct super_block *s)
{
int retval;
@@ -1593,92 +1668,83 @@ static __u32 find_hash_out(struct super_block *s)
struct cpu_key key;
INITIALIZE_PATH(path);
struct reiserfs_dir_entry de;
+ struct reiserfs_de_head *deh;
__u32 hash = DEFAULT_HASH;
+ __u32 deh_hashval, teahash, r5hash, yurahash;
inode = s->s_root->d_inode;
- do { // Some serious "goto"-hater was there ;)
- u32 teahash, r5hash, yurahash;
+ make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3);
+ retval = search_by_entry_key(s, &key, &path, &de);
+ if (retval == IO_ERROR) {
+ pathrelse(&path);
+ return UNSET_HASH;
+ }
+ if (retval == NAME_NOT_FOUND)
+ de.de_entry_num--;
+
+ set_de_name_and_namelen(&de);
+ deh = de.de_deh + de.de_entry_num;
- make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3);
- retval = search_by_entry_key(s, &key, &path, &de);
- if (retval == IO_ERROR) {
- pathrelse(&path);
- return UNSET_HASH;
- }
- if (retval == NAME_NOT_FOUND)
- de.de_entry_num--;
- set_de_name_and_namelen(&de);
- if (deh_offset(&(de.de_deh[de.de_entry_num])) == DOT_DOT_OFFSET) {
- /* allow override in this case */
- if (reiserfs_rupasov_hash(s)) {
- hash = YURA_HASH;
- }
- reiserfs_info(s, "FS seems to be empty, autodetect "
- "is using the default hash\n");
- break;
- }
- r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen));
- teahash = GET_HASH_VALUE(keyed_hash(de.de_name, de.de_namelen));
- yurahash = GET_HASH_VALUE(yura_hash(de.de_name, de.de_namelen));
- if (((teahash == r5hash)
- &&
- (GET_HASH_VALUE(deh_offset(&(de.de_deh[de.de_entry_num])))
- == r5hash)) || ((teahash == yurahash)
- && (yurahash ==
- GET_HASH_VALUE(deh_offset
- (&
- (de.
- de_deh[de.
- de_entry_num])))))
- || ((r5hash == yurahash)
- && (yurahash ==
- GET_HASH_VALUE(deh_offset
- (&(de.de_deh[de.de_entry_num])))))) {
- reiserfs_warning(s, "reiserfs-2506", "Unable to "
- "automatically detect hash function. "
- "Please mount with -o "
- "hash={tea,rupasov,r5}");
- hash = UNSET_HASH;
- break;
- }
- if (GET_HASH_VALUE(deh_offset(&(de.de_deh[de.de_entry_num]))) ==
- yurahash)
+ if (deh_offset(deh) == DOT_DOT_OFFSET) {
+ /* allow override in this case */
+ if (reiserfs_rupasov_hash(s))
hash = YURA_HASH;
- else if (GET_HASH_VALUE
- (deh_offset(&(de.de_deh[de.de_entry_num]))) == teahash)
- hash = TEA_HASH;
- else if (GET_HASH_VALUE
- (deh_offset(&(de.de_deh[de.de_entry_num]))) == r5hash)
- hash = R5_HASH;
- else {
- reiserfs_warning(s, "reiserfs-2506",
- "Unrecognised hash function");
- hash = UNSET_HASH;
- }
- } while (0);
+ reiserfs_info(s, "FS seems to be empty, autodetect is using the default hash\n");
+ goto out;
+ }
+
+ deh_hashval = GET_HASH_VALUE(deh_offset(deh));
+ r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen));
+ teahash = GET_HASH_VALUE(keyed_hash(de.de_name, de.de_namelen));
+ yurahash = GET_HASH_VALUE(yura_hash(de.de_name, de.de_namelen));
+
+ if ((teahash == r5hash && deh_hashval == r5hash) ||
+ (teahash == yurahash && deh_hashval == yurahash) ||
+ (r5hash == yurahash && deh_hashval == yurahash)) {
+ reiserfs_warning(s, "reiserfs-2506",
+ "Unable to automatically detect hash "
+ "function. Please mount with -o "
+ "hash={tea,rupasov,r5}");
+ hash = UNSET_HASH;
+ goto out;
+ }
+ if (deh_hashval == yurahash)
+ hash = YURA_HASH;
+ else if (deh_hashval == teahash)
+ hash = TEA_HASH;
+ else if (deh_hashval == r5hash)
+ hash = R5_HASH;
+ else {
+ reiserfs_warning(s, "reiserfs-2506",
+ "Unrecognised hash function");
+ hash = UNSET_HASH;
+ }
+out:
pathrelse(&path);
return hash;
}
-// finds out which hash names are sorted with
+/* finds out which hash names are sorted with */
static int what_hash(struct super_block *s)
{
__u32 code;
code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s));
- /* reiserfs_hash_detect() == true if any of the hash mount options
- ** were used. We must check them to make sure the user isn't
- ** using a bad hash value
+ /*
+ * reiserfs_hash_detect() == true if any of the hash mount options
+ * were used. We must check them to make sure the user isn't
+ * using a bad hash value
*/
if (code == UNSET_HASH || reiserfs_hash_detect(s))
code = find_hash_out(s);
if (code != UNSET_HASH && reiserfs_hash_detect(s)) {
- /* detection has found the hash, and we must check against the
- ** mount options
+ /*
+ * detection has found the hash, and we must check against the
+ * mount options
*/
if (reiserfs_rupasov_hash(s) && code != YURA_HASH) {
reiserfs_warning(s, "reiserfs-2507",
@@ -1700,7 +1766,10 @@ static int what_hash(struct super_block *s)
code = UNSET_HASH;
}
} else {
- /* find_hash_out was not called or could not determine the hash */
+ /*
+ * find_hash_out was not called or
+ * could not determine the hash
+ */
if (reiserfs_rupasov_hash(s)) {
code = YURA_HASH;
} else if (reiserfs_tea_hash(s)) {
@@ -1710,8 +1779,9 @@ static int what_hash(struct super_block *s)
}
}
- /* if we are mounted RW, and we have a new valid hash code, update
- ** the super
+ /*
+ * if we are mounted RW, and we have a new valid hash code, update
+ * the super
*/
if (code != UNSET_HASH &&
!(s->s_flags & MS_RDONLY) &&
@@ -1721,7 +1791,7 @@ static int what_hash(struct super_block *s)
return code;
}
-// return pointer to appropriate function
+/* return pointer to appropriate function */
static hashf_t hash_function(struct super_block *s)
{
switch (what_hash(s)) {
@@ -1738,7 +1808,7 @@ static hashf_t hash_function(struct super_block *s)
return NULL;
}
-// this is used to set up correct value for old partitions
+/* this is used to set up correct value for old partitions */
static int function2code(hashf_t func)
{
if (func == keyed_hash)
@@ -1748,7 +1818,7 @@ static int function2code(hashf_t func)
if (func == r5_hash)
return R5_HASH;
- BUG(); // should never happen
+ BUG(); /* should never happen */
return 0;
}
@@ -1783,8 +1853,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
sbi->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
sbi->s_mount_opt |= (1 << REISERFS_ERROR_RO);
sbi->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH);
- /* no preallocation minimum, be smart in
- reiserfs_file_write instead */
+ /* no preallocation minimum, be smart in reiserfs_file_write instead */
sbi->s_alloc_options.preallocmin = 0;
/* Preallocate by 16 blocks (17-1) at once */
sbi->s_alloc_options.preallocsize = 17;
@@ -1796,9 +1865,17 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
mutex_init(&sbi->lock);
sbi->lock_depth = -1;
+ sbi->commit_wq = alloc_workqueue("reiserfs/%s", WQ_MEM_RECLAIM, 0,
+ s->s_id);
+ if (!sbi->commit_wq) {
+ SWARN(silent, s, "", "Cannot allocate commit workqueue");
+ errval = -ENOMEM;
+ goto error_unlocked;
+ }
+
jdev_name = NULL;
if (reiserfs_parse_options
- (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name,
+ (s, (char *)data, &sbi->s_mount_opt, &blocks, &jdev_name,
&commit_max_age, qf_names, &qfmt) == 0) {
goto error_unlocked;
}
@@ -1819,20 +1896,29 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
goto error_unlocked;
}
- /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */
+ /*
+ * try old format (undistributed bitmap, super block in 8-th 1k
+ * block of a device)
+ */
if (!read_super_block(s, REISERFS_OLD_DISK_OFFSET_IN_BYTES))
old_format = 1;
- /* try new format (64-th 1k block), which can contain reiserfs super block */
+
+ /*
+ * try new format (64-th 1k block), which can contain reiserfs
+ * super block
+ */
else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) {
SWARN(silent, s, "sh-2021", "can not find reiserfs on %s",
- reiserfs_bdevname(s));
+ s->s_id);
goto error_unlocked;
}
rs = SB_DISK_SUPER_BLOCK(s);
- /* Let's do basic sanity check to verify that underlying device is not
- smaller than the filesystem. If the check fails then abort and scream,
- because bad stuff will happen otherwise. */
+ /*
+ * Let's do basic sanity check to verify that underlying device is not
+ * smaller than the filesystem. If the check fails then abort and
+ * scream, because bad stuff will happen otherwise.
+ */
if (s->s_bdev && s->s_bdev->bd_inode
&& i_size_read(s->s_bdev->bd_inode) <
sb_block_count(rs) * sb_blocksize(rs)) {
@@ -1876,15 +1962,16 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
printk("reiserfs: using flush barriers\n");
}
- // set_device_ro(s->s_dev, 1) ;
if (journal_init(s, jdev_name, old_format, commit_max_age)) {
SWARN(silent, s, "sh-2022",
"unable to initialize journal space");
goto error_unlocked;
} else {
- jinit_done = 1; /* once this is set, journal_release must be called
- ** if we error out of the mount
- */
+ /*
+ * once this is set, journal_release must be called
+ * if we error out of the mount
+ */
+ jinit_done = 1;
}
if (reread_meta_blocks(s)) {
@@ -1905,7 +1992,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
args.dirid = REISERFS_ROOT_PARENT_OBJECTID;
root_inode =
iget5_locked(s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor,
- reiserfs_init_locked_inode, (void *)(&args));
+ reiserfs_init_locked_inode, (void *)&args);
if (!root_inode) {
SWARN(silent, s, "jmacd-10", "get root inode failed");
goto error_unlocked;
@@ -1929,7 +2016,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
s->s_root = d_make_root(root_inode);
if (!s->s_root)
goto error;
- // define and initialize hash function
+ /* define and initialize hash function */
sbi->s_hash_function = hash_function(s);
if (sbi->s_hash_function == NULL) {
dput(s->s_root);
@@ -1939,11 +2026,11 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
if (is_reiserfs_3_5(rs)
|| (is_reiserfs_jr(rs) && SB_VERSION(s) == REISERFS_VERSION_1))
- set_bit(REISERFS_3_5, &(sbi->s_properties));
+ set_bit(REISERFS_3_5, &sbi->s_properties);
else if (old_format)
- set_bit(REISERFS_OLD_FORMAT, &(sbi->s_properties));
+ set_bit(REISERFS_OLD_FORMAT, &sbi->s_properties);
else
- set_bit(REISERFS_3_6, &(sbi->s_properties));
+ set_bit(REISERFS_3_6, &sbi->s_properties);
if (!(s->s_flags & MS_RDONLY)) {
@@ -1958,10 +2045,12 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
set_sb_umount_state(rs, REISERFS_ERROR_FS);
set_sb_fs_state(rs, 0);
- /* Clear out s_bmap_nr if it would wrap. We can handle this
+ /*
+ * Clear out s_bmap_nr if it would wrap. We can handle this
* case, but older revisions can't. This will cause the
* file system to fail mount on those older implementations,
- * avoiding corruption. -jeffm */
+ * avoiding corruption. -jeffm
+ */
if (bmap_would_wrap(reiserfs_bmap_count(s)) &&
sb_bmap_nr(rs) != 0) {
reiserfs_warning(s, "super-2030", "This file system "
@@ -1974,8 +2063,10 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
}
if (old_format_only(s)) {
- /* filesystem of format 3.5 either with standard or non-standard
- journal */
+ /*
+ * filesystem of format 3.5 either with standard
+ * or non-standard journal
+ */
if (convert_reiserfs(s)) {
/* and -o conv is given */
if (!silent)
@@ -1983,8 +2074,11 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
"converting 3.5 filesystem to the 3.6 format");
if (is_reiserfs_3_5(rs))
- /* put magic string of 3.6 format. 2.2 will not be able to
- mount this filesystem anymore */
+ /*
+ * put magic string of 3.6 format.
+ * 2.2 will not be able to
+ * mount this filesystem anymore
+ */
memcpy(rs->s_v1.s_magic,
reiserfs_3_6_magic_string,
sizeof
@@ -1992,8 +2086,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
set_sb_version(rs, REISERFS_VERSION_2);
reiserfs_convert_objectid_map_v1(s);
- set_bit(REISERFS_3_6, &(sbi->s_properties));
- clear_bit(REISERFS_3_5, &(sbi->s_properties));
+ set_bit(REISERFS_3_6, &sbi->s_properties);
+ clear_bit(REISERFS_3_5, &sbi->s_properties);
} else if (!silent) {
reiserfs_info(s, "using 3.5.x disk format\n");
}
@@ -2001,8 +2095,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
set_sb_mnt_count(rs, sb_mnt_count(rs) + 1);
- journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
- errval = journal_end(&th, s, 1);
+ journal_mark_dirty(&th, SB_BUFFER_WITH_SB(s));
+ errval = journal_end(&th);
if (errval) {
dput(s->s_root);
s->s_root = NULL;
@@ -2018,7 +2112,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
}
reiserfs_write_lock(s);
- /* look for files which were to be removed in previous session */
+ /*
+ * look for files which were to be removed in previous session
+ */
finish_unfinished(s);
} else {
if (old_format_only(s) && !silent) {
@@ -2034,7 +2130,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
}
reiserfs_write_lock(s);
}
- // mark hash in super block: it could be unset. overwrite should be ok
+ /*
+ * mark hash in super block: it could be unset. overwrite should be ok
+ */
set_sb_hash_function_code(rs, function2code(sbi->s_hash_function));
handle_attrs(s);
@@ -2111,9 +2209,7 @@ static int reiserfs_write_dquot(struct dquot *dquot)
depth = reiserfs_write_unlock_nested(dquot->dq_sb);
ret = dquot_commit(dquot);
reiserfs_write_lock_nested(dquot->dq_sb, depth);
- err =
- journal_end(&th, dquot->dq_sb,
- REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
+ err = journal_end(&th);
if (!ret && err)
ret = err;
out:
@@ -2136,9 +2232,7 @@ static int reiserfs_acquire_dquot(struct dquot *dquot)
depth = reiserfs_write_unlock_nested(dquot->dq_sb);
ret = dquot_acquire(dquot);
reiserfs_write_lock_nested(dquot->dq_sb, depth);
- err =
- journal_end(&th, dquot->dq_sb,
- REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
+ err = journal_end(&th);
if (!ret && err)
ret = err;
out:
@@ -2163,9 +2257,7 @@ static int reiserfs_release_dquot(struct dquot *dquot)
}
ret = dquot_release(dquot);
reiserfs_write_lock(dquot->dq_sb);
- err =
- journal_end(&th, dquot->dq_sb,
- REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
+ err = journal_end(&th);
if (!ret && err)
ret = err;
reiserfs_write_unlock(dquot->dq_sb);
@@ -2198,7 +2290,7 @@ static int reiserfs_write_info(struct super_block *sb, int type)
depth = reiserfs_write_unlock_nested(sb);
ret = dquot_commit_info(sb, type);
reiserfs_write_lock_nested(sb, depth);
- err = journal_end(&th, sb, 2);
+ err = journal_end(&th);
if (!ret && err)
ret = err;
out:
@@ -2238,7 +2330,10 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
goto out;
}
inode = path->dentry->d_inode;
- /* We must not pack tails for quota files on reiserfs for quota IO to work */
+ /*
+ * We must not pack tails for quota files on reiserfs for quota
+ * IO to work
+ */
if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) {
err = reiserfs_unpack(inode, NULL);
if (err) {
@@ -2268,7 +2363,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
err = journal_begin(&th, sb, 1);
if (err)
goto out;
- err = journal_end_sync(&th, sb, 1);
+ err = journal_end_sync(&th);
if (err)
goto out;
}
@@ -2279,10 +2374,12 @@ out:
return err;
}
-/* Read data from quotafile - avoid pagecache and such because we cannot afford
+/*
+ * Read data from quotafile - avoid pagecache and such because we cannot afford
* acquiring the locks... As quota files are never truncated and quota code
* itself serializes the operations (and no one else should touch the files)
- * we don't have to be afraid of races */
+ * we don't have to be afraid of races
+ */
static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off)
{
@@ -2303,7 +2400,10 @@ static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data,
sb->s_blocksize - offset <
toread ? sb->s_blocksize - offset : toread;
tmp_bh.b_state = 0;
- /* Quota files are without tails so we can safely use this function */
+ /*
+ * Quota files are without tails so we can safely
+ * use this function
+ */
reiserfs_write_lock(sb);
err = reiserfs_get_block(inode, blk, &tmp_bh, 0);
reiserfs_write_unlock(sb);
@@ -2326,8 +2426,10 @@ static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data,
return len;
}
-/* Write to quotafile (we know the transaction is already started and has
- * enough credits) */
+/*
+ * Write to quotafile (we know the transaction is already started and has
+ * enough credits)
+ */
static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
const char *data, size_t len, loff_t off)
{
@@ -2368,7 +2470,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
unlock_buffer(bh);
reiserfs_write_lock(sb);
reiserfs_prepare_for_journal(sb, bh, 1);
- journal_mark_dirty(current->journal_info, sb, bh);
+ journal_mark_dirty(current->journal_info, bh);
if (!journal_quota)
reiserfs_add_ordered_list(inode, bh);
reiserfs_write_unlock(sb);
@@ -2402,18 +2504,18 @@ static int __init init_reiserfs_fs(void)
{
int ret;
- if ((ret = init_inodecache())) {
+ ret = init_inodecache();
+ if (ret)
return ret;
- }
reiserfs_proc_info_global_init();
ret = register_filesystem(&reiserfs_fs_type);
+ if (ret)
+ goto out;
- if (ret == 0) {
- return 0;
- }
-
+ return 0;
+out:
reiserfs_proc_info_global_done();
destroy_inodecache();
diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c
index 5e2624d12f7..f41e19b4bb4 100644
--- a/fs/reiserfs/tail_conversion.c
+++ b/fs/reiserfs/tail_conversion.c
@@ -1,5 +1,6 @@
/*
- * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright details
+ * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright
+ * details
*/
#include <linux/time.h>
@@ -7,29 +8,41 @@
#include <linux/buffer_head.h>
#include "reiserfs.h"
-/* access to tail : when one is going to read tail it must make sure, that is not running.
- direct2indirect and indirect2direct can not run concurrently */
+/*
+ * access to tail : when one is going to read tail it must make sure, that is
+ * not running. direct2indirect and indirect2direct can not run concurrently
+ */
-/* Converts direct items to an unformatted node. Panics if file has no
- tail. -ENOSPC if no disk space for conversion */
-/* path points to first direct item of the file regarless of how many of
- them are there */
+/*
+ * Converts direct items to an unformatted node. Panics if file has no
+ * tail. -ENOSPC if no disk space for conversion
+ */
+/*
+ * path points to first direct item of the file regardless of how many of
+ * them are there
+ */
int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
struct treepath *path, struct buffer_head *unbh,
loff_t tail_offset)
{
struct super_block *sb = inode->i_sb;
struct buffer_head *up_to_date_bh;
- struct item_head *p_le_ih = PATH_PITEM_HEAD(path);
+ struct item_head *p_le_ih = tp_item_head(path);
unsigned long total_tail = 0;
- struct cpu_key end_key; /* Key to search for the last byte of the
- converted item. */
- struct item_head ind_ih; /* new indirect item to be inserted or
- key of unfm pointer to be pasted */
- int blk_size, retval; /* returned value for reiserfs_insert_item and clones */
- unp_t unfm_ptr; /* Handle on an unformatted node
- that will be inserted in the
- tree. */
+
+ /* Key to search for the last byte of the converted item. */
+ struct cpu_key end_key;
+
+ /*
+ * new indirect item to be inserted or key
+ * of unfm pointer to be pasted
+ */
+ struct item_head ind_ih;
+ int blk_size;
+ /* returned value for reiserfs_insert_item and clones */
+ int retval;
+ /* Handle on an unformatted node that will be inserted in the tree. */
+ unp_t unfm_ptr;
BUG_ON(!th->t_trans_id);
@@ -37,8 +50,10 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
blk_size = sb->s_blocksize;
- /* and key to search for append or insert pointer to the new
- unformatted node. */
+ /*
+ * and key to search for append or insert pointer to the new
+ * unformatted node.
+ */
copy_item_head(&ind_ih, p_le_ih);
set_le_ih_k_offset(&ind_ih, tail_offset);
set_le_ih_k_type(&ind_ih, TYPE_INDIRECT);
@@ -55,7 +70,7 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
return -EIO;
}
- p_le_ih = PATH_PITEM_HEAD(path);
+ p_le_ih = tp_item_head(path);
unfm_ptr = cpu_to_le32(unbh->b_blocknr);
@@ -76,36 +91,43 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
if (retval) {
return retval;
}
- // note: from here there are two keys which have matching first
- // three key components. They only differ by the fourth one.
+ /*
+ * note: from here there are two keys which have matching first
+ * three key components. They only differ by the fourth one.
+ */
/* Set the key to search for the direct items of the file */
make_cpu_key(&end_key, inode, max_reiserfs_offset(inode), TYPE_DIRECT,
4);
- /* Move bytes from the direct items to the new unformatted node
- and delete them. */
+ /*
+ * Move bytes from the direct items to the new unformatted node
+ * and delete them.
+ */
while (1) {
int tail_size;
- /* end_key.k_offset is set so, that we will always have found
- last item of the file */
+ /*
+ * end_key.k_offset is set so, that we will always have found
+ * last item of the file
+ */
if (search_for_position_by_key(sb, &end_key, path) ==
POSITION_FOUND)
reiserfs_panic(sb, "PAP-14050",
"direct item (%K) not found", &end_key);
- p_le_ih = PATH_PITEM_HEAD(path);
+ p_le_ih = tp_item_head(path);
RFALSE(!is_direct_le_ih(p_le_ih),
"vs-14055: direct item expected(%K), found %h",
&end_key, p_le_ih);
tail_size = (le_ih_k_offset(p_le_ih) & (blk_size - 1))
+ ih_item_len(p_le_ih) - 1;
- /* we only send the unbh pointer if the buffer is not up to date.
- ** this avoids overwriting good data from writepage() with old data
- ** from the disk or buffer cache
- ** Special case: unbh->b_page will be NULL if we are coming through
- ** DIRECT_IO handler here.
+ /*
+ * we only send the unbh pointer if the buffer is not
+ * up to date. this avoids overwriting good data from
+ * writepage() with old data from the disk or buffer cache
+ * Special case: unbh->b_page will be NULL if we are coming
+ * through DIRECT_IO handler here.
*/
if (!unbh->b_page || buffer_uptodate(unbh)
|| PageUptodate(unbh->b_page)) {
@@ -117,13 +139,15 @@ int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
up_to_date_bh);
total_tail += retval;
+
+ /* done: file does not have direct items anymore */
if (tail_size == retval)
- // done: file does not have direct items anymore
break;
}
- /* if we've copied bytes from disk into the page, we need to zero
- ** out the unused part of the block (it was not up to date before)
+ /*
+ * if we've copied bytes from disk into the page, we need to zero
+ * out the unused part of the block (it was not up to date before)
*/
if (up_to_date_bh) {
unsigned pgoff =
@@ -146,9 +170,11 @@ void reiserfs_unmap_buffer(struct buffer_head *bh)
BUG();
}
clear_buffer_dirty(bh);
- /* Remove the buffer from whatever list it belongs to. We are mostly
- interested in removing it from per-sb j_dirty_buffers list, to avoid
- BUG() on attempt to write not mapped buffer */
+ /*
+ * Remove the buffer from whatever list it belongs to. We are mostly
+ * interested in removing it from per-sb j_dirty_buffers list, to avoid
+ * BUG() on attempt to write not mapped buffer
+ */
if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) {
struct inode *inode = bh->b_page->mapping->host;
struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
@@ -164,12 +190,14 @@ void reiserfs_unmap_buffer(struct buffer_head *bh)
unlock_buffer(bh);
}
-/* this first locks inode (neither reads nor sync are permitted),
- reads tail through page cache, insert direct item. When direct item
- inserted successfully inode is left locked. Return value is always
- what we expect from it (number of cut bytes). But when tail remains
- in the unformatted node, we set mode to SKIP_BALANCING and unlock
- inode */
+/*
+ * this first locks inode (neither reads nor sync are permitted),
+ * reads tail through page cache, insert direct item. When direct item
+ * inserted successfully inode is left locked. Return value is always
+ * what we expect from it (number of cut bytes). But when tail remains
+ * in the unformatted node, we set mode to SKIP_BALANCING and unlock
+ * inode
+ */
int indirect2direct(struct reiserfs_transaction_handle *th,
struct inode *inode, struct page *page,
struct treepath *path, /* path to the indirect item. */
@@ -194,7 +222,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th,
*mode = M_SKIP_BALANCING;
/* store item head path points to. */
- copy_item_head(&s_ih, PATH_PITEM_HEAD(path));
+ copy_item_head(&s_ih, tp_item_head(path));
tail_len = (n_new_file_size & (block_size - 1));
if (get_inode_sd_version(inode) == STAT_DATA_V2)
@@ -207,9 +235,11 @@ int indirect2direct(struct reiserfs_transaction_handle *th,
1) * sb->s_blocksize;
pos1 = pos;
- // we are protected by i_mutex. The tail can not disapper, not
- // append can be done either
- // we are in truncate or packing tail in file_release
+ /*
+ * we are protected by i_mutex. The tail can not disapper, not
+ * append can be done either
+ * we are in truncate or packing tail in file_release
+ */
tail = (char *)kmap(page); /* this can schedule */
@@ -220,7 +250,7 @@ int indirect2direct(struct reiserfs_transaction_handle *th,
reiserfs_panic(sb, "PAP-5520",
"item to be converted %K does not exist",
item_key);
- copy_item_head(&s_ih, PATH_PITEM_HEAD(path));
+ copy_item_head(&s_ih, tp_item_head(path));
#ifdef CONFIG_REISERFS_CHECK
pos = le_ih_k_offset(&s_ih) - 1 +
(ih_item_len(&s_ih) / UNFM_P_SIZE -
@@ -236,9 +266,10 @@ int indirect2direct(struct reiserfs_transaction_handle *th,
pos1 + 1, TYPE_DIRECT, round_tail_len,
0xffff /*ih_free_space */ );
- /* we want a pointer to the first byte of the tail in the page.
- ** the page was locked and this part of the page was up to date when
- ** indirect2direct was called, so we know the bytes are still valid
+ /*
+ * we want a pointer to the first byte of the tail in the page.
+ * the page was locked and this part of the page was up to date when
+ * indirect2direct was called, so we know the bytes are still valid
*/
tail = tail + (pos & (PAGE_CACHE_SIZE - 1));
@@ -250,12 +281,14 @@ int indirect2direct(struct reiserfs_transaction_handle *th,
/* Insert tail as new direct item in the tree */
if (reiserfs_insert_item(th, path, &key, &s_ih, inode,
tail ? tail : NULL) < 0) {
- /* No disk memory. So we can not convert last unformatted node
- to the direct item. In this case we used to adjust
- indirect items's ih_free_space. Now ih_free_space is not
- used, it would be ideal to write zeros to corresponding
- unformatted node. For now i_size is considered as guard for
- going out of file size */
+ /*
+ * No disk memory. So we can not convert last unformatted node
+ * to the direct item. In this case we used to adjust
+ * indirect items's ih_free_space. Now ih_free_space is not
+ * used, it would be ideal to write zeros to corresponding
+ * unformatted node. For now i_size is considered as guard for
+ * going out of file size
+ */
kunmap(page);
return block_size - round_tail_len;
}
@@ -264,12 +297,16 @@ int indirect2direct(struct reiserfs_transaction_handle *th,
/* make sure to get the i_blocks changes from reiserfs_insert_item */
reiserfs_update_sd(th, inode);
- // note: we have now the same as in above direct2indirect
- // conversion: there are two keys which have matching first three
- // key components. They only differ by the fouhth one.
+ /*
+ * note: we have now the same as in above direct2indirect
+ * conversion: there are two keys which have matching first three
+ * key components. They only differ by the fourth one.
+ */
- /* We have inserted new direct item and must remove last
- unformatted node. */
+ /*
+ * We have inserted new direct item and must remove last
+ * unformatted node.
+ */
*mode = M_CUT;
/* we store position of first direct item in the in-core inode */
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 8a9e2dcfe00..ca416d099e7 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -50,14 +50,17 @@
#include <linux/stat.h>
#include <linux/quotaops.h>
#include <linux/security.h>
+#include <linux/posix_acl_xattr.h>
#define PRIVROOT_NAME ".reiserfs_priv"
#define XAROOT_NAME "xattrs"
-/* Helpers for inode ops. We do this so that we don't have all the VFS
+/*
+ * Helpers for inode ops. We do this so that we don't have all the VFS
* overhead and also for proper i_mutex annotation.
- * dir->i_mutex must be held for all of them. */
+ * dir->i_mutex must be held for all of them.
+ */
#ifdef CONFIG_REISERFS_FS_XATTR
static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
{
@@ -72,10 +75,12 @@ static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
return dir->i_op->mkdir(dir, dentry, mode);
}
-/* We use I_MUTEX_CHILD here to silence lockdep. It's safe because xattr
+/*
+ * We use I_MUTEX_CHILD here to silence lockdep. It's safe because xattr
* mutation ops aren't called during rename or splace, which are the
* only other users of I_MUTEX_CHILD. It violates the ordering, but that's
- * better than allocating another subclass just for this code. */
+ * better than allocating another subclass just for this code.
+ */
static int xattr_unlink(struct inode *dir, struct dentry *dentry)
{
int error;
@@ -165,9 +170,11 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
return xadir;
}
-/* The following are side effects of other operations that aren't explicitly
+/*
+ * The following are side effects of other operations that aren't explicitly
* modifying extended attributes. This includes operations such as permissions
- * or ownership changes, object deletions, etc. */
+ * or ownership changes, object deletions, etc.
+ */
struct reiserfs_dentry_buf {
struct dir_context ctx;
struct dentry *xadir;
@@ -266,11 +273,13 @@ static int reiserfs_for_each_xattr(struct inode *inode,
cleanup_dentry_buf(&buf);
if (!err) {
- /* We start a transaction here to avoid a ABBA situation
+ /*
+ * We start a transaction here to avoid a ABBA situation
* between the xattr root's i_mutex and the journal lock.
* This doesn't incur much additional overhead since the
* new transaction will just nest inside the
- * outer transaction. */
+ * outer transaction.
+ */
int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 +
4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
struct reiserfs_transaction_handle th;
@@ -283,7 +292,7 @@ static int reiserfs_for_each_xattr(struct inode *inode,
I_MUTEX_XATTR);
err = action(dir, data);
reiserfs_write_lock(inode->i_sb);
- jerror = journal_end(&th, inode->i_sb, blocks);
+ jerror = journal_end(&th);
reiserfs_write_unlock(inode->i_sb);
mutex_unlock(&dir->d_parent->d_inode->i_mutex);
err = jerror ?: err;
@@ -348,9 +357,11 @@ int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs)
}
#ifdef CONFIG_REISERFS_FS_XATTR
-/* Returns a dentry corresponding to a specific extended attribute file
+/*
+ * Returns a dentry corresponding to a specific extended attribute file
* for the inode. If flags allow, the file is created. Otherwise, a
- * valid or negative dentry, or an error is returned. */
+ * valid or negative dentry, or an error is returned.
+ */
static struct dentry *xattr_lookup(struct inode *inode, const char *name,
int flags)
{
@@ -399,8 +410,10 @@ static struct page *reiserfs_get_page(struct inode *dir, size_t n)
{
struct address_space *mapping = dir->i_mapping;
struct page *page;
- /* We can deadlock if we try to free dentries,
- and an unlink/rmdir has just occurred - GFP_NOFS avoids this */
+ /*
+ * We can deadlock if we try to free dentries,
+ * and an unlink/rmdir has just occurred - GFP_NOFS avoids this
+ */
mapping_set_gfp_mask(mapping, GFP_NOFS);
page = read_mapping_page(mapping, n >> PAGE_CACHE_SHIFT, NULL);
if (!IS_ERR(page)) {
@@ -410,7 +423,7 @@ static struct page *reiserfs_get_page(struct inode *dir, size_t n)
}
return page;
- fail:
+fail:
reiserfs_put_page(page);
return ERR_PTR(-EIO);
}
@@ -588,7 +601,7 @@ int reiserfs_xattr_set(struct inode *inode, const char *name,
buffer, buffer_size, flags);
reiserfs_write_lock(inode->i_sb);
- error2 = journal_end(&th, inode->i_sb, jbegin_count);
+ error2 = journal_end(&th);
reiserfs_write_unlock(inode->i_sb);
if (error == 0)
error = error2;
@@ -614,8 +627,10 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer,
if (name == NULL)
return -EINVAL;
- /* We can't have xattrs attached to v1 items since they don't have
- * generation numbers */
+ /*
+ * We can't have xattrs attached to v1 items since they don't have
+ * generation numbers
+ */
if (get_inode_sd_version(inode) == STAT_DATA_V1)
return -EOPNOTSUPP;
@@ -904,20 +919,24 @@ static const struct xattr_handler *reiserfs_xattr_handlers[] = {
&reiserfs_xattr_security_handler,
#endif
#ifdef CONFIG_REISERFS_FS_POSIX_ACL
- &reiserfs_posix_acl_access_handler,
- &reiserfs_posix_acl_default_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
NULL
};
static int xattr_mount_check(struct super_block *s)
{
- /* We need generation numbers to ensure that the oid mapping is correct
- * v3.5 filesystems don't have them. */
+ /*
+ * We need generation numbers to ensure that the oid mapping is correct
+ * v3.5 filesystems don't have them.
+ */
if (old_format_only(s)) {
if (reiserfs_xattrs_optional(s)) {
- /* Old format filesystem, but optional xattrs have
- * been enabled. Error out. */
+ /*
+ * Old format filesystem, but optional xattrs have
+ * been enabled. Error out.
+ */
reiserfs_warning(s, "jdm-2005",
"xattrs/ACLs not supported "
"on pre-v3.6 format filesystems. "
@@ -971,9 +990,11 @@ int reiserfs_lookup_privroot(struct super_block *s)
return err;
}
-/* We need to take a copy of the mount flags since things like
+/*
+ * We need to take a copy of the mount flags since things like
* MS_RDONLY don't get set until *after* we're called.
- * mount_flags != mount_options */
+ * mount_flags != mount_options
+ */
int reiserfs_xattr_init(struct super_block *s, int mount_flags)
{
int err = 0;
@@ -1006,8 +1027,8 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
error:
if (err) {
- clear_bit(REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt));
- clear_bit(REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt));
+ clear_bit(REISERFS_XATTRS_USER, &REISERFS_SB(s)->s_mount_opt);
+ clear_bit(REISERFS_POSIXACL, &REISERFS_SB(s)->s_mount_opt);
}
/* The super_block MS_POSIXACL must mirror the (no)acl mount option. */
diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h
index f59626c5d33..857ec7e3016 100644
--- a/fs/reiserfs/xattr.h
+++ b/fs/reiserfs/xattr.h
@@ -61,7 +61,8 @@ static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size)
return ret;
}
-/* We may have to create up to 3 objects: xattr root, xattr dir, xattr file.
+/*
+ * We may have to create up to 3 objects: xattr root, xattr dir, xattr file.
* Let's try to be smart about it.
* xattr root: We cache it. If it's not cached, we may need to create it.
* xattr dir: If anything has been loaded for this inode, we can set a flag
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 06c04f73da6..44503e29379 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -11,38 +11,24 @@
#include "acl.h"
#include <asm/uaccess.h>
-static int reiserfs_set_acl(struct reiserfs_transaction_handle *th,
+static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th,
struct inode *inode, int type,
struct posix_acl *acl);
-static int
-posix_acl_set(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags, int type)
+
+int
+reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- struct inode *inode = dentry->d_inode;
- struct posix_acl *acl;
int error, error2;
struct reiserfs_transaction_handle th;
size_t jcreate_blocks;
- if (!reiserfs_posixacl(inode->i_sb))
- return -EOPNOTSUPP;
- if (!inode_owner_or_capable(inode))
- return -EPERM;
-
- if (value) {
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (IS_ERR(acl)) {
- return PTR_ERR(acl);
- } else if (acl) {
- error = posix_acl_valid(acl);
- if (error)
- goto release_and_out;
- }
- } else
- acl = NULL;
+ int size = acl ? posix_acl_xattr_size(acl->a_count) : 0;
+
- /* Pessimism: We can't assume that anything from the xattr root up
- * has been created. */
+ /*
+ * Pessimism: We can't assume that anything from the xattr root up
+ * has been created.
+ */
jcreate_blocks = reiserfs_xattr_jcreate_nblocks(inode) +
reiserfs_xattr_nblocks(inode, size) * 2;
@@ -51,44 +37,21 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value,
error = journal_begin(&th, inode->i_sb, jcreate_blocks);
reiserfs_write_unlock(inode->i_sb);
if (error == 0) {
- error = reiserfs_set_acl(&th, inode, type, acl);
+ error = __reiserfs_set_acl(&th, inode, type, acl);
reiserfs_write_lock(inode->i_sb);
- error2 = journal_end(&th, inode->i_sb, jcreate_blocks);
+ error2 = journal_end(&th);
reiserfs_write_unlock(inode->i_sb);
if (error2)
error = error2;
}
- release_and_out:
- posix_acl_release(acl);
- return error;
-}
-
-static int
-posix_acl_get(struct dentry *dentry, const char *name, void *buffer,
- size_t size, int type)
-{
- struct posix_acl *acl;
- int error;
-
- if (!reiserfs_posixacl(dentry->d_sb))
- return -EOPNOTSUPP;
-
- acl = reiserfs_get_acl(dentry->d_inode, type);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
- error = posix_acl_to_xattr(&init_user_ns, acl, buffer, size);
- posix_acl_release(acl);
-
return error;
}
/*
* Convert from filesystem to in-memory representation.
*/
-static struct posix_acl *posix_acl_from_disk(const void *value, size_t size)
+static struct posix_acl *reiserfs_posix_acl_from_disk(const void *value, size_t size)
{
const char *end = (char *)value + size;
int n, count;
@@ -150,7 +113,7 @@ static struct posix_acl *posix_acl_from_disk(const void *value, size_t size)
goto fail;
return acl;
- fail:
+fail:
posix_acl_release(acl);
return ERR_PTR(-EINVAL);
}
@@ -158,7 +121,7 @@ static struct posix_acl *posix_acl_from_disk(const void *value, size_t size)
/*
* Convert from in-memory to filesystem representation.
*/
-static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size)
+static void *reiserfs_posix_acl_to_disk(const struct posix_acl *acl, size_t * size)
{
reiserfs_acl_header *ext_acl;
char *e;
@@ -203,7 +166,7 @@ static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size)
}
return (char *)ext_acl;
- fail:
+fail:
kfree(ext_acl);
return ERR_PTR(-EINVAL);
}
@@ -221,10 +184,6 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
int size;
int retval;
- acl = get_cached_acl(inode, type);
- if (acl != ACL_NOT_CACHED)
- return acl;
-
switch (type) {
case ACL_TYPE_ACCESS:
name = POSIX_ACL_XATTR_ACCESS;
@@ -251,13 +210,15 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
retval = reiserfs_xattr_get(inode, name, value, size);
if (retval == -ENODATA || retval == -ENOSYS) {
- /* This shouldn't actually happen as it should have
- been caught above.. but just in case */
+ /*
+ * This shouldn't actually happen as it should have
+ * been caught above.. but just in case
+ */
acl = NULL;
} else if (retval < 0) {
acl = ERR_PTR(retval);
} else {
- acl = posix_acl_from_disk(value, retval);
+ acl = reiserfs_posix_acl_from_disk(value, retval);
}
if (!IS_ERR(acl))
set_cached_acl(inode, type, acl);
@@ -273,7 +234,7 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
* BKL held [before 2.5.x]
*/
static int
-reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
+__reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
int type, struct posix_acl *acl)
{
char *name;
@@ -281,9 +242,6 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
size_t size = 0;
int error;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
switch (type) {
case ACL_TYPE_ACCESS:
name = POSIX_ACL_XATTR_ACCESS;
@@ -307,7 +265,7 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
}
if (acl) {
- value = posix_acl_to_disk(acl, &size);
+ value = reiserfs_posix_acl_to_disk(acl, &size);
if (IS_ERR(value))
return (int)PTR_ERR(value);
}
@@ -336,64 +294,61 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
return error;
}
-/* dir->i_mutex: locked,
- * inode is new and not released into the wild yet */
+/*
+ * dir->i_mutex: locked,
+ * inode is new and not released into the wild yet
+ */
int
reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
struct inode *dir, struct dentry *dentry,
struct inode *inode)
{
- struct posix_acl *acl;
+ struct posix_acl *default_acl, *acl;
int err = 0;
/* ACLs only get applied to files and directories */
if (S_ISLNK(inode->i_mode))
return 0;
- /* ACLs can only be used on "new" objects, so if it's an old object
- * there is nothing to inherit from */
+ /*
+ * ACLs can only be used on "new" objects, so if it's an old object
+ * there is nothing to inherit from
+ */
if (get_inode_sd_version(dir) == STAT_DATA_V1)
goto apply_umask;
- /* Don't apply ACLs to objects in the .reiserfs_priv tree.. This
+ /*
+ * Don't apply ACLs to objects in the .reiserfs_priv tree.. This
* would be useless since permissions are ignored, and a pain because
- * it introduces locking cycles */
+ * it introduces locking cycles
+ */
if (IS_PRIVATE(dir)) {
inode->i_flags |= S_PRIVATE;
goto apply_umask;
}
- acl = reiserfs_get_acl(dir, ACL_TYPE_DEFAULT);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
+ err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (err)
+ return err;
+ if (default_acl) {
+ err = __reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT,
+ default_acl);
+ posix_acl_release(default_acl);
+ }
if (acl) {
- /* Copy the default ACL to the default ACL of a new directory */
- if (S_ISDIR(inode->i_mode)) {
- err = reiserfs_set_acl(th, inode, ACL_TYPE_DEFAULT,
- acl);
- if (err)
- goto cleanup;
- }
-
- /* Now we reconcile the new ACL and the mode,
- potentially modifying both */
- err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
- if (err < 0)
- return err;
-
- /* If we need an ACL.. */
- if (err > 0)
- err = reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS, acl);
- cleanup:
+ if (!err)
+ err = __reiserfs_set_acl(th, inode, ACL_TYPE_ACCESS,
+ acl);
posix_acl_release(acl);
- } else {
- apply_umask:
- /* no ACL, apply umask */
- inode->i_mode &= ~current_umask();
}
return err;
+
+apply_umask:
+ /* no ACL, apply umask */
+ inode->i_mode &= ~current_umask();
+ return err;
}
/* This is used to cache the default acl before a new object is created.
@@ -442,84 +397,11 @@ int reiserfs_cache_default_acl(struct inode *inode)
*/
int reiserfs_acl_chmod(struct inode *inode)
{
- struct reiserfs_transaction_handle th;
- struct posix_acl *acl;
- size_t size;
- int error;
-
if (IS_PRIVATE(inode))
return 0;
-
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
if (get_inode_sd_version(inode) == STAT_DATA_V1 ||
- !reiserfs_posixacl(inode->i_sb)) {
- return 0;
- }
-
- acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
- if (!acl)
+ !reiserfs_posixacl(inode->i_sb))
return 0;
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- error = posix_acl_chmod(&acl, GFP_NOFS, inode->i_mode);
- if (error)
- return error;
- size = reiserfs_xattr_nblocks(inode, reiserfs_acl_size(acl->a_count));
- reiserfs_write_lock(inode->i_sb);
- error = journal_begin(&th, inode->i_sb, size * 2);
- reiserfs_write_unlock(inode->i_sb);
- if (!error) {
- int error2;
- error = reiserfs_set_acl(&th, inode, ACL_TYPE_ACCESS, acl);
- reiserfs_write_lock(inode->i_sb);
- error2 = journal_end(&th, inode->i_sb, size * 2);
- reiserfs_write_unlock(inode->i_sb);
- if (error2)
- error = error2;
- }
- posix_acl_release(acl);
- return error;
-}
-
-static size_t posix_acl_access_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name,
- size_t name_len, int type)
-{
- const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
- if (!reiserfs_posixacl(dentry->d_sb))
- return 0;
- if (list && size <= list_size)
- memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
- return size;
-}
-
-const struct xattr_handler reiserfs_posix_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .get = posix_acl_get,
- .set = posix_acl_set,
- .list = posix_acl_access_list,
-};
-
-static size_t posix_acl_default_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name,
- size_t name_len, int type)
-{
- const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
- if (!reiserfs_posixacl(dentry->d_sb))
- return 0;
- if (list && size <= list_size)
- memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
- return size;
+ return posix_acl_chmod(inode, inode->i_mode);
}
-
-const struct xattr_handler reiserfs_posix_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .get = posix_acl_get,
- .set = posix_acl_set,
- .list = posix_acl_default_list,
-};
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index f373bde8f54..ea06c755486 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -72,8 +72,8 @@ static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
const struct file_operations romfs_ro_fops = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
.splice_read = generic_file_splice_read,
.mmap = romfs_mmap,
.get_unmapped_area = romfs_get_unmapped_area,
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index ff1d3d42e72..ef90e8bca95 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -432,6 +432,7 @@ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
*/
static int romfs_remount(struct super_block *sb, int *flags, char *data)
{
+ sync_filesystem(sb);
*flags |= MS_RDONLY;
return 0;
}
@@ -533,16 +534,14 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
root = romfs_iget(sb, pos);
if (IS_ERR(root))
- goto error;
+ return PTR_ERR(root);
sb->s_root = d_make_root(root);
if (!sb->s_root)
- goto error;
+ return -ENOMEM;
return 0;
-error:
- return -EINVAL;
error_rsb_inval:
ret = -EINVAL;
error_rsb:
diff --git a/fs/select.c b/fs/select.c
index 35d4adc749d..467bb1cb3ea 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -238,8 +238,7 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
set_current_state(state);
if (!pwq->triggered)
- rc = freezable_schedule_hrtimeout_range(expires, slack,
- HRTIMER_MODE_ABS);
+ rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
__set_current_state(TASK_RUNNING);
/*
@@ -455,7 +454,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
const struct file_operations *f_op;
f_op = f.file->f_op;
mask = DEFAULT_POLLMASK;
- if (f_op && f_op->poll) {
+ if (f_op->poll) {
wait_key_set(wait, in, out,
bit, busy_flag);
mask = (*f_op->poll)(f.file, wait);
@@ -762,7 +761,7 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
mask = POLLNVAL;
if (f.file) {
mask = DEFAULT_POLLMASK;
- if (f.file->f_op && f.file->f_op->poll) {
+ if (f.file->f_op->poll) {
pwait->_key = pollfd->events|POLLERR|POLLHUP;
pwait->_key |= busy_flag;
mask = f.file->f_op->poll(f.file, pwait);
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 3135c2525c7..3857b720cb1 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -8,8 +8,10 @@
#include <linux/fs.h>
#include <linux/export.h>
#include <linux/seq_file.h>
+#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/cred.h>
+#include <linux/mm.h>
#include <asm/uaccess.h>
#include <asm/page.h>
@@ -30,6 +32,16 @@ static void seq_set_overflow(struct seq_file *m)
m->count = m->size;
}
+static void *seq_buf_alloc(unsigned long size)
+{
+ void *buf;
+
+ buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);
+ if (!buf && size > PAGE_SIZE)
+ buf = vmalloc(size);
+ return buf;
+}
+
/**
* seq_open - initialize sequential file
* @file: file we initialize
@@ -96,7 +108,7 @@ static int traverse(struct seq_file *m, loff_t offset)
return 0;
}
if (!m->buf) {
- m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
+ m->buf = seq_buf_alloc(m->size = PAGE_SIZE);
if (!m->buf)
return -ENOMEM;
}
@@ -135,8 +147,9 @@ static int traverse(struct seq_file *m, loff_t offset)
Eoverflow:
m->op->stop(m, p);
- kfree(m->buf);
- m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
+ kvfree(m->buf);
+ m->count = 0;
+ m->buf = seq_buf_alloc(m->size <<= 1);
return !m->buf ? -ENOMEM : -EAGAIN;
}
@@ -191,7 +204,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
/* grab buffer if we didn't have one */
if (!m->buf) {
- m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
+ m->buf = seq_buf_alloc(m->size = PAGE_SIZE);
if (!m->buf)
goto Enomem;
}
@@ -231,11 +244,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
if (m->count < m->size)
goto Fill;
m->op->stop(m, p);
- kfree(m->buf);
- m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
+ kvfree(m->buf);
+ m->count = 0;
+ m->buf = seq_buf_alloc(m->size <<= 1);
if (!m->buf)
goto Enomem;
- m->count = 0;
m->version = 0;
pos = m->index;
p = m->op->start(m, &pos);
@@ -328,6 +341,8 @@ loff_t seq_lseek(struct file *file, loff_t offset, int whence)
m->read_pos = offset;
retval = file->f_pos = offset;
}
+ } else {
+ file->f_pos = offset;
}
}
file->f_version = m->version;
@@ -347,7 +362,7 @@ EXPORT_SYMBOL(seq_lseek);
int seq_release(struct inode *inode, struct file *file)
{
struct seq_file *m = file->private_data;
- kfree(m->buf);
+ kvfree(m->buf);
kfree(m);
return 0;
}
@@ -602,13 +617,13 @@ EXPORT_SYMBOL(single_open);
int single_open_size(struct file *file, int (*show)(struct seq_file *, void *),
void *data, size_t size)
{
- char *buf = kmalloc(size, GFP_KERNEL);
+ char *buf = seq_buf_alloc(size);
int ret;
if (!buf)
return -ENOMEM;
ret = single_open(file, show, data);
if (ret) {
- kfree(buf);
+ kvfree(buf);
return ret;
}
((struct seq_file *)file->private_data)->buf = buf;
@@ -764,6 +779,21 @@ int seq_write(struct seq_file *seq, const void *data, size_t len)
}
EXPORT_SYMBOL(seq_write);
+/**
+ * seq_pad - write padding spaces to buffer
+ * @m: seq_file identifying the buffer to which data should be written
+ * @c: the byte to append after padding if non-zero
+ */
+void seq_pad(struct seq_file *m, char c)
+{
+ int size = m->pad_until - m->count;
+ if (size > 0)
+ seq_printf(m, "%*s", size, "");
+ if (c)
+ seq_putc(m, c);
+}
+EXPORT_SYMBOL(seq_pad);
+
struct list_head *seq_list_start(struct list_head *head, loff_t pos)
{
struct list_head *lh;
diff --git a/fs/splice.c b/fs/splice.c
index 3b7ee656f3a..f5cb9ba8451 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -32,6 +32,7 @@
#include <linux/gfp.h>
#include <linux/socket.h>
#include <linux/compat.h>
+#include <linux/aio.h>
#include "internal.h"
/*
@@ -136,8 +137,6 @@ error:
const struct pipe_buf_operations page_cache_pipe_buf_ops = {
.can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = page_cache_pipe_buf_confirm,
.release = page_cache_pipe_buf_release,
.steal = page_cache_pipe_buf_steal,
@@ -156,8 +155,6 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
static const struct pipe_buf_operations user_page_pipe_buf_ops = {
.can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
.release = page_cache_pipe_buf_release,
.steal = user_page_pipe_buf_steal,
@@ -547,14 +544,28 @@ EXPORT_SYMBOL(generic_file_splice_read);
static const struct pipe_buf_operations default_pipe_buf_ops = {
.can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
.release = generic_pipe_buf_release,
.steal = generic_pipe_buf_steal,
.get = generic_pipe_buf_get,
};
+static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
+{
+ return 1;
+}
+
+/* Pipe buffer operations for a socket and similar. */
+const struct pipe_buf_operations nosteal_pipe_buf_ops = {
+ .can_merge = 0,
+ .confirm = generic_pipe_buf_confirm,
+ .release = generic_pipe_buf_release,
+ .steal = generic_pipe_buf_nosteal,
+ .get = generic_pipe_buf_get,
+};
+EXPORT_SYMBOL(nosteal_pipe_buf_ops);
+
static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
unsigned long vlen, loff_t offset)
{
@@ -695,7 +706,7 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
loff_t pos = sd->pos;
int more;
- if (!likely(file->f_op && file->f_op->sendpage))
+ if (!likely(file->f_op->sendpage))
return -EINVAL;
more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
@@ -707,63 +718,6 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe,
sd->len, &pos, more);
}
-/*
- * This is a little more tricky than the file -> pipe splicing. There are
- * basically three cases:
- *
- * - Destination page already exists in the address space and there
- * are users of it. For that case we have no other option that
- * copying the data. Tough luck.
- * - Destination page already exists in the address space, but there
- * are no users of it. Make sure it's uptodate, then drop it. Fall
- * through to last case.
- * - Destination page does not exist, we can add the pipe page to
- * the page cache and avoid the copy.
- *
- * If asked to move pages to the output file (SPLICE_F_MOVE is set in
- * sd->flags), we attempt to migrate pages from the pipe to the output
- * file address space page cache. This is possible if no one else has
- * the pipe page referenced outside of the pipe and page cache. If
- * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
- * a new page in the output file page cache and fill/dirty that.
- */
-int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
- struct splice_desc *sd)
-{
- struct file *file = sd->u.file;
- struct address_space *mapping = file->f_mapping;
- unsigned int offset, this_len;
- struct page *page;
- void *fsdata;
- int ret;
-
- offset = sd->pos & ~PAGE_CACHE_MASK;
-
- this_len = sd->len;
- if (this_len + offset > PAGE_CACHE_SIZE)
- this_len = PAGE_CACHE_SIZE - offset;
-
- ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
- AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
- if (unlikely(ret))
- goto out;
-
- if (buf->page != page) {
- char *src = buf->ops->map(pipe, buf, 1);
- char *dst = kmap_atomic(page);
-
- memcpy(dst + offset, src + buf->offset, this_len);
- flush_dcache_page(page);
- kunmap_atomic(dst);
- buf->ops->unmap(pipe, buf, src);
- }
- ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
- page, fsdata);
-out:
- return ret;
-}
-EXPORT_SYMBOL(pipe_to_file);
-
static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
smp_mb();
@@ -792,7 +746,7 @@ static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
* locking is required around copying the pipe buffers to the
* destination.
*/
-int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
+static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
splice_actor *actor)
{
int ret;
@@ -839,7 +793,6 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
return 1;
}
-EXPORT_SYMBOL(splice_from_pipe_feed);
/**
* splice_from_pipe_next - wait for some data to splice from
@@ -851,7 +804,7 @@ EXPORT_SYMBOL(splice_from_pipe_feed);
* value (one) if pipe buffers are available. It will return zero
* or -errno if no more data needs to be spliced.
*/
-int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
+static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
while (!pipe->nrbufs) {
if (!pipe->writers)
@@ -876,7 +829,6 @@ int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
return 1;
}
-EXPORT_SYMBOL(splice_from_pipe_next);
/**
* splice_from_pipe_begin - start splicing from pipe
@@ -887,12 +839,11 @@ EXPORT_SYMBOL(splice_from_pipe_next);
* splice_from_pipe_next() and splice_from_pipe_feed() to
* initialize the necessary fields of @sd.
*/
-void splice_from_pipe_begin(struct splice_desc *sd)
+static void splice_from_pipe_begin(struct splice_desc *sd)
{
sd->num_spliced = 0;
sd->need_wakeup = false;
}
-EXPORT_SYMBOL(splice_from_pipe_begin);
/**
* splice_from_pipe_end - finish splicing from pipe
@@ -904,12 +855,11 @@ EXPORT_SYMBOL(splice_from_pipe_begin);
* be called after a loop containing splice_from_pipe_next() and
* splice_from_pipe_feed().
*/
-void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
+static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
if (sd->need_wakeup)
wakeup_pipe_writers(pipe);
}
-EXPORT_SYMBOL(splice_from_pipe_end);
/**
* __splice_from_pipe - splice data from a pipe to given actor
@@ -975,7 +925,7 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
}
/**
- * generic_file_splice_write - splice data from a pipe to a file
+ * iter_file_splice_write - splice data from a pipe to a file
* @pipe: pipe info
* @out: file to write to
* @ppos: position in @out
@@ -985,40 +935,122 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
* Description:
* Will either move or copy pages (determined by @flags options) from
* the given pipe inode to the given file.
+ * This one is ->write_iter-based.
*
*/
ssize_t
-generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
+iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
loff_t *ppos, size_t len, unsigned int flags)
{
- struct address_space *mapping = out->f_mapping;
- struct inode *inode = mapping->host;
struct splice_desc sd = {
.total_len = len,
.flags = flags,
.pos = *ppos,
.u.file = out,
};
+ int nbufs = pipe->buffers;
+ struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
+ GFP_KERNEL);
ssize_t ret;
+ if (unlikely(!array))
+ return -ENOMEM;
+
pipe_lock(pipe);
splice_from_pipe_begin(&sd);
- do {
+ while (sd.total_len) {
+ struct iov_iter from;
+ struct kiocb kiocb;
+ size_t left;
+ int n, idx;
+
ret = splice_from_pipe_next(pipe, &sd);
if (ret <= 0)
break;
- mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
- ret = file_remove_suid(out);
- if (!ret) {
- ret = file_update_time(out);
- if (!ret)
- ret = splice_from_pipe_feed(pipe, &sd,
- pipe_to_file);
+ if (unlikely(nbufs < pipe->buffers)) {
+ kfree(array);
+ nbufs = pipe->buffers;
+ array = kcalloc(nbufs, sizeof(struct bio_vec),
+ GFP_KERNEL);
+ if (!array) {
+ ret = -ENOMEM;
+ break;
+ }
}
- mutex_unlock(&inode->i_mutex);
- } while (ret > 0);
+
+ /* build the vector */
+ left = sd.total_len;
+ for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) {
+ struct pipe_buffer *buf = pipe->bufs + idx;
+ size_t this_len = buf->len;
+
+ if (this_len > left)
+ this_len = left;
+
+ if (idx == pipe->buffers - 1)
+ idx = -1;
+
+ ret = buf->ops->confirm(pipe, buf);
+ if (unlikely(ret)) {
+ if (ret == -ENODATA)
+ ret = 0;
+ goto done;
+ }
+
+ array[n].bv_page = buf->page;
+ array[n].bv_len = this_len;
+ array[n].bv_offset = buf->offset;
+ left -= this_len;
+ }
+
+ /* ... iov_iter */
+ from.type = ITER_BVEC | WRITE;
+ from.bvec = array;
+ from.nr_segs = n;
+ from.count = sd.total_len - left;
+ from.iov_offset = 0;
+
+ /* ... and iocb */
+ init_sync_kiocb(&kiocb, out);
+ kiocb.ki_pos = sd.pos;
+ kiocb.ki_nbytes = sd.total_len - left;
+
+ /* now, send it */
+ ret = out->f_op->write_iter(&kiocb, &from);
+ if (-EIOCBQUEUED == ret)
+ ret = wait_on_sync_kiocb(&kiocb);
+
+ if (ret <= 0)
+ break;
+
+ sd.num_spliced += ret;
+ sd.total_len -= ret;
+ *ppos = sd.pos = kiocb.ki_pos;
+
+ /* dismiss the fully eaten buffers, adjust the partial one */
+ while (ret) {
+ struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
+ if (ret >= buf->len) {
+ const struct pipe_buf_operations *ops = buf->ops;
+ ret -= buf->len;
+ buf->len = 0;
+ buf->ops = NULL;
+ ops->release(pipe, buf);
+ pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
+ pipe->nrbufs--;
+ if (pipe->files)
+ sd.need_wakeup = true;
+ } else {
+ buf->offset += ret;
+ buf->len -= ret;
+ ret = 0;
+ }
+ }
+ }
+done:
+ kfree(array);
splice_from_pipe_end(pipe, &sd);
pipe_unlock(pipe);
@@ -1026,21 +1058,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
if (sd.num_spliced)
ret = sd.num_spliced;
- if (ret > 0) {
- int err;
-
- err = generic_write_sync(out, *ppos, ret);
- if (err)
- ret = err;
- else
- *ppos += ret;
- balance_dirty_pages_ratelimited(mapping);
- }
-
return ret;
}
-EXPORT_SYMBOL(generic_file_splice_write);
+EXPORT_SYMBOL(iter_file_splice_write);
static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
struct splice_desc *sd)
@@ -1049,9 +1070,9 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
void *data;
loff_t tmp = sd->pos;
- data = buf->ops->map(pipe, buf, 0);
+ data = kmap(buf->page);
ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
- buf->ops->unmap(pipe, buf, data);
+ kunmap(buf->page);
return ret;
}
@@ -1099,7 +1120,7 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
loff_t *, size_t, unsigned int);
- if (out->f_op && out->f_op->splice_write)
+ if (out->f_op->splice_write)
splice_write = out->f_op->splice_write;
else
splice_write = default_file_splice_write;
@@ -1125,7 +1146,7 @@ static long do_splice_to(struct file *in, loff_t *ppos,
if (unlikely(ret < 0))
return ret;
- if (in->f_op && in->f_op->splice_read)
+ if (in->f_op->splice_read)
splice_read = in->f_op->splice_read;
else
splice_read = default_file_splice_read;
@@ -1510,116 +1531,50 @@ static int get_iovec_page_array(const struct iovec __user *iov,
static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
struct splice_desc *sd)
{
- char *src;
- int ret;
-
- /*
- * See if we can use the atomic maps, by prefaulting in the
- * pages and doing an atomic copy
- */
- if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
- src = buf->ops->map(pipe, buf, 1);
- ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
- sd->len);
- buf->ops->unmap(pipe, buf, src);
- if (!ret) {
- ret = sd->len;
- goto out;
- }
- }
-
- /*
- * No dice, use slow non-atomic map and copy
- */
- src = buf->ops->map(pipe, buf, 0);
-
- ret = sd->len;
- if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
- ret = -EFAULT;
-
- buf->ops->unmap(pipe, buf, src);
-out:
- if (ret > 0)
- sd->u.userptr += ret;
- return ret;
+ int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
+ return n == sd->len ? n : -EFAULT;
}
/*
* For lack of a better implementation, implement vmsplice() to userspace
* as a simple copy of the pipes pages to the user iov.
*/
-static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
+static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
unsigned long nr_segs, unsigned int flags)
{
struct pipe_inode_info *pipe;
struct splice_desc sd;
- ssize_t size;
- int error;
long ret;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ ssize_t count;
pipe = get_pipe_info(file);
if (!pipe)
return -EBADF;
- pipe_lock(pipe);
-
- error = ret = 0;
- while (nr_segs) {
- void __user *base;
- size_t len;
-
- /*
- * Get user address base and length for this iovec.
- */
- error = get_user(base, &iov->iov_base);
- if (unlikely(error))
- break;
- error = get_user(len, &iov->iov_len);
- if (unlikely(error))
- break;
-
- /*
- * Sanity check this iovec. 0 read succeeds.
- */
- if (unlikely(!len))
- break;
- if (unlikely(!base)) {
- error = -EFAULT;
- break;
- }
-
- if (unlikely(!access_ok(VERIFY_WRITE, base, len))) {
- error = -EFAULT;
- break;
- }
-
- sd.len = 0;
- sd.total_len = len;
- sd.flags = flags;
- sd.u.userptr = base;
- sd.pos = 0;
-
- size = __splice_from_pipe(pipe, &sd, pipe_to_user);
- if (size < 0) {
- if (!ret)
- ret = size;
-
- break;
- }
-
- ret += size;
+ ret = rw_copy_check_uvector(READ, uiov, nr_segs,
+ ARRAY_SIZE(iovstack), iovstack, &iov);
+ if (ret <= 0)
+ goto out;
- if (size < len)
- break;
+ count = ret;
+ iov_iter_init(&iter, READ, iov, nr_segs, count);
- nr_segs--;
- iov++;
- }
+ sd.len = 0;
+ sd.total_len = count;
+ sd.flags = flags;
+ sd.u.data = &iter;
+ sd.pos = 0;
+ pipe_lock(pipe);
+ ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
pipe_unlock(pipe);
- if (!ret)
- ret = error;
+out:
+ if (iov != iovstack)
+ kfree(iov);
return ret;
}
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index c70111ebefd..b6fa8657dcb 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -25,6 +25,78 @@ config SQUASHFS
If unsure, say N.
+choice
+ prompt "File decompression options"
+ depends on SQUASHFS
+ help
+ Squashfs now supports two options for decompressing file
+ data. Traditionally Squashfs has decompressed into an
+ intermediate buffer and then memcopied it into the page cache.
+ Squashfs now supports the ability to decompress directly into
+ the page cache.
+
+ If unsure, select "Decompress file data into an intermediate buffer"
+
+config SQUASHFS_FILE_CACHE
+ bool "Decompress file data into an intermediate buffer"
+ help
+ Decompress file data into an intermediate buffer and then
+ memcopy it into the page cache.
+
+config SQUASHFS_FILE_DIRECT
+ bool "Decompress files directly into the page cache"
+ help
+ Directly decompress file data into the page cache.
+ Doing so can significantly improve performance because
+ it eliminates a memcpy and it also removes the lock contention
+ on the single buffer.
+
+endchoice
+
+choice
+ prompt "Decompressor parallelisation options"
+ depends on SQUASHFS
+ help
+ Squashfs now supports three parallelisation options for
+ decompression. Each one exhibits various trade-offs between
+ decompression performance and CPU and memory usage.
+
+ If in doubt, select "Single threaded compression"
+
+config SQUASHFS_DECOMP_SINGLE
+ bool "Single threaded compression"
+ help
+ Traditionally Squashfs has used single-threaded decompression.
+ Only one block (data or metadata) can be decompressed at any
+ one time. This limits CPU and memory usage to a minimum.
+
+config SQUASHFS_DECOMP_MULTI
+ bool "Use multiple decompressors for parallel I/O"
+ help
+ By default Squashfs uses a single decompressor but it gives
+ poor performance on parallel I/O workloads when using multiple CPU
+ machines due to waiting on decompressor availability.
+
+ If you have a parallel I/O workload and your system has enough memory,
+ using this option may improve overall I/O performance.
+
+ This decompressor implementation uses up to two parallel
+ decompressors per core. It dynamically allocates decompressors
+ on a demand basis.
+
+config SQUASHFS_DECOMP_MULTI_PERCPU
+ bool "Use percpu multiple decompressors for parallel I/O"
+ help
+ By default Squashfs uses a single decompressor but it gives
+ poor performance on parallel I/O workloads when using multiple CPU
+ machines due to waiting on decompressor availability.
+
+ This decompressor implementation uses a maximum of one
+ decompressor per core. It uses percpu variables to ensure
+ decompression is load-balanced across the cores.
+
+endchoice
+
config SQUASHFS_XATTR
bool "Squashfs XATTR support"
depends on SQUASHFS
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 110b0476f3b..4132520b4ff 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,6 +5,11 @@
obj-$(CONFIG_SQUASHFS) += squashfs.o
squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
squashfs-y += namei.o super.o symlink.o decompressor.o
+squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o
+squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o
+squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o
+squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o
+squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o
squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 41d108ecc9b..0cea9b9236d 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -36,6 +36,7 @@
#include "squashfs_fs_sb.h"
#include "squashfs.h"
#include "decompressor.h"
+#include "page_actor.h"
/*
* Read the metadata block length, this is stored in the first two
@@ -86,16 +87,16 @@ static struct buffer_head *get_block_length(struct super_block *sb,
* generated a larger block - this does occasionally happen with compression
* algorithms).
*/
-int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
- int length, u64 *next_index, int srclength, int pages)
+int squashfs_read_data(struct super_block *sb, u64 index, int length,
+ u64 *next_index, struct squashfs_page_actor *output)
{
struct squashfs_sb_info *msblk = sb->s_fs_info;
struct buffer_head **bh;
int offset = index & ((1 << msblk->devblksize_log2) - 1);
u64 cur_index = index >> msblk->devblksize_log2;
- int bytes, compressed, b = 0, k = 0, page = 0, avail;
+ int bytes, compressed, b = 0, k = 0, avail, i;
- bh = kcalloc(((srclength + msblk->devblksize - 1)
+ bh = kcalloc(((output->length + msblk->devblksize - 1)
>> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL);
if (bh == NULL)
return -ENOMEM;
@@ -111,9 +112,9 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
*next_index = index + length;
TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n",
- index, compressed ? "" : "un", length, srclength);
+ index, compressed ? "" : "un", length, output->length);
- if (length < 0 || length > srclength ||
+ if (length < 0 || length > output->length ||
(index + length) > msblk->bytes_used)
goto read_failure;
@@ -145,7 +146,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
TRACE("Block @ 0x%llx, %scompressed size %d\n", index,
compressed ? "" : "un", length);
- if (length < 0 || length > srclength ||
+ if (length < 0 || length > output->length ||
(index + length) > msblk->bytes_used)
goto block_release;
@@ -158,9 +159,15 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
ll_rw_block(READ, b - 1, bh + 1);
}
+ for (i = 0; i < b; i++) {
+ wait_on_buffer(bh[i]);
+ if (!buffer_uptodate(bh[i]))
+ goto block_release;
+ }
+
if (compressed) {
- length = squashfs_decompress(msblk, buffer, bh, b, offset,
- length, srclength, pages);
+ length = squashfs_decompress(msblk, bh, b, offset, length,
+ output);
if (length < 0)
goto read_failure;
} else {
@@ -168,22 +175,20 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
* Block is uncompressed.
*/
int in, pg_offset = 0;
+ void *data = squashfs_first_page(output);
for (bytes = length; k < b; k++) {
in = min(bytes, msblk->devblksize - offset);
bytes -= in;
- wait_on_buffer(bh[k]);
- if (!buffer_uptodate(bh[k]))
- goto block_release;
while (in) {
if (pg_offset == PAGE_CACHE_SIZE) {
- page++;
+ data = squashfs_next_page(output);
pg_offset = 0;
}
avail = min_t(int, in, PAGE_CACHE_SIZE -
pg_offset);
- memcpy(buffer[page] + pg_offset,
- bh[k]->b_data + offset, avail);
+ memcpy(data + pg_offset, bh[k]->b_data + offset,
+ avail);
in -= avail;
pg_offset += avail;
offset += avail;
@@ -191,6 +196,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
offset = 0;
put_bh(bh[k]);
}
+ squashfs_finish_page(output);
}
kfree(bh);
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index af0b7380259..1cb70a0b216 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -56,6 +56,7 @@
#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
#include "squashfs.h"
+#include "page_actor.h"
/*
* Look-up block in cache, and increment usage count. If not in cache, read
@@ -119,9 +120,8 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
entry->error = 0;
spin_unlock(&cache->lock);
- entry->length = squashfs_read_data(sb, entry->data,
- block, length, &entry->next_index,
- cache->block_size, cache->pages);
+ entry->length = squashfs_read_data(sb, block, length,
+ &entry->next_index, entry->actor);
spin_lock(&cache->lock);
@@ -220,6 +220,7 @@ void squashfs_cache_delete(struct squashfs_cache *cache)
kfree(cache->entry[i].data[j]);
kfree(cache->entry[i].data);
}
+ kfree(cache->entry[i].actor);
}
kfree(cache->entry);
@@ -280,6 +281,13 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
goto cleanup;
}
}
+
+ entry->actor = squashfs_page_actor_init(entry->data,
+ cache->pages, 0);
+ if (entry->actor == NULL) {
+ ERROR("Failed to allocate %s cache entry\n", name);
+ goto cleanup;
+ }
}
return cache;
@@ -410,6 +418,7 @@ void *squashfs_read_table(struct super_block *sb, u64 block, int length)
int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
int i, res;
void *table, *buffer, **data;
+ struct squashfs_page_actor *actor;
table = buffer = kmalloc(length, GFP_KERNEL);
if (table == NULL)
@@ -421,19 +430,28 @@ void *squashfs_read_table(struct super_block *sb, u64 block, int length)
goto failed;
}
+ actor = squashfs_page_actor_init(data, pages, length);
+ if (actor == NULL) {
+ res = -ENOMEM;
+ goto failed2;
+ }
+
for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
data[i] = buffer;
- res = squashfs_read_data(sb, data, block, length |
- SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages);
+ res = squashfs_read_data(sb, block, length |
+ SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, actor);
kfree(data);
+ kfree(actor);
if (res < 0)
goto failed;
return table;
+failed2:
+ kfree(data);
failed:
kfree(table);
return ERR_PTR(res);
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 3f6271d86ab..ac22fe73b0a 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -30,6 +30,7 @@
#include "squashfs_fs_sb.h"
#include "decompressor.h"
#include "squashfs.h"
+#include "page_actor.h"
/*
* This file (and decompressor.h) implements a decompressor framework for
@@ -37,29 +38,29 @@
*/
static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
- NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
+ NULL, NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
};
#ifndef CONFIG_SQUASHFS_LZO
static const struct squashfs_decompressor squashfs_lzo_comp_ops = {
- NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
+ NULL, NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
};
#endif
#ifndef CONFIG_SQUASHFS_XZ
static const struct squashfs_decompressor squashfs_xz_comp_ops = {
- NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0
+ NULL, NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0
};
#endif
#ifndef CONFIG_SQUASHFS_ZLIB
static const struct squashfs_decompressor squashfs_zlib_comp_ops = {
- NULL, NULL, NULL, ZLIB_COMPRESSION, "zlib", 0
+ NULL, NULL, NULL, NULL, ZLIB_COMPRESSION, "zlib", 0
};
#endif
static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
- NULL, NULL, NULL, 0, "unknown", 0
+ NULL, NULL, NULL, NULL, 0, "unknown", 0
};
static const struct squashfs_decompressor *decompressor[] = {
@@ -83,10 +84,11 @@ const struct squashfs_decompressor *squashfs_lookup_decompressor(int id)
}
-void *squashfs_decompressor_init(struct super_block *sb, unsigned short flags)
+static void *get_comp_opts(struct super_block *sb, unsigned short flags)
{
struct squashfs_sb_info *msblk = sb->s_fs_info;
- void *strm, *buffer = NULL;
+ void *buffer = NULL, *comp_opts;
+ struct squashfs_page_actor *actor = NULL;
int length = 0;
/*
@@ -94,23 +96,46 @@ void *squashfs_decompressor_init(struct super_block *sb, unsigned short flags)
*/
if (SQUASHFS_COMP_OPTS(flags)) {
buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
- if (buffer == NULL)
- return ERR_PTR(-ENOMEM);
+ if (buffer == NULL) {
+ comp_opts = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ actor = squashfs_page_actor_init(&buffer, 1, 0);
+ if (actor == NULL) {
+ comp_opts = ERR_PTR(-ENOMEM);
+ goto out;
+ }
- length = squashfs_read_data(sb, &buffer,
- sizeof(struct squashfs_super_block), 0, NULL,
- PAGE_CACHE_SIZE, 1);
+ length = squashfs_read_data(sb,
+ sizeof(struct squashfs_super_block), 0, NULL, actor);
if (length < 0) {
- strm = ERR_PTR(length);
- goto finished;
+ comp_opts = ERR_PTR(length);
+ goto out;
}
}
- strm = msblk->decompressor->init(msblk, buffer, length);
+ comp_opts = squashfs_comp_opts(msblk, buffer, length);
-finished:
+out:
+ kfree(actor);
kfree(buffer);
+ return comp_opts;
+}
+
+
+void *squashfs_decompressor_setup(struct super_block *sb, unsigned short flags)
+{
+ struct squashfs_sb_info *msblk = sb->s_fs_info;
+ void *stream, *comp_opts = get_comp_opts(sb, flags);
+
+ if (IS_ERR(comp_opts))
+ return comp_opts;
+
+ stream = squashfs_decompressor_create(msblk, comp_opts);
+ if (IS_ERR(stream))
+ kfree(comp_opts);
- return strm;
+ return stream;
}
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 330073e2902..af098532180 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -24,28 +24,22 @@
*/
struct squashfs_decompressor {
- void *(*init)(struct squashfs_sb_info *, void *, int);
+ void *(*init)(struct squashfs_sb_info *, void *);
+ void *(*comp_opts)(struct squashfs_sb_info *, void *, int);
void (*free)(void *);
- int (*decompress)(struct squashfs_sb_info *, void **,
- struct buffer_head **, int, int, int, int, int);
+ int (*decompress)(struct squashfs_sb_info *, void *,
+ struct buffer_head **, int, int, int,
+ struct squashfs_page_actor *);
int id;
char *name;
int supported;
};
-static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk,
- void *s)
+static inline void *squashfs_comp_opts(struct squashfs_sb_info *msblk,
+ void *buff, int length)
{
- if (msblk->decompressor)
- msblk->decompressor->free(s);
-}
-
-static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
- void **buffer, struct buffer_head **bh, int b, int offset, int length,
- int srclength, int pages)
-{
- return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
- length, srclength, pages);
+ return msblk->decompressor->comp_opts ?
+ msblk->decompressor->comp_opts(msblk, buff, length) : NULL;
}
#ifdef CONFIG_SQUASHFS_XZ
diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c
new file mode 100644
index 00000000000..d6008a63647
--- /dev/null
+++ b/fs/squashfs/decompressor_multi.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2013
+ * Minchan Kim <minchan@kernel.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/cpumask.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "decompressor.h"
+#include "squashfs.h"
+
+/*
+ * This file implements multi-threaded decompression in the
+ * decompressor framework
+ */
+
+
+/*
+ * The reason that multiply two is that a CPU can request new I/O
+ * while it is waiting previous request.
+ */
+#define MAX_DECOMPRESSOR (num_online_cpus() * 2)
+
+
+int squashfs_max_decompressors(void)
+{
+ return MAX_DECOMPRESSOR;
+}
+
+
+struct squashfs_stream {
+ void *comp_opts;
+ struct list_head strm_list;
+ struct mutex mutex;
+ int avail_decomp;
+ wait_queue_head_t wait;
+};
+
+
+struct decomp_stream {
+ void *stream;
+ struct list_head list;
+};
+
+
+static void put_decomp_stream(struct decomp_stream *decomp_strm,
+ struct squashfs_stream *stream)
+{
+ mutex_lock(&stream->mutex);
+ list_add(&decomp_strm->list, &stream->strm_list);
+ mutex_unlock(&stream->mutex);
+ wake_up(&stream->wait);
+}
+
+void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+ void *comp_opts)
+{
+ struct squashfs_stream *stream;
+ struct decomp_stream *decomp_strm = NULL;
+ int err = -ENOMEM;
+
+ stream = kzalloc(sizeof(*stream), GFP_KERNEL);
+ if (!stream)
+ goto out;
+
+ stream->comp_opts = comp_opts;
+ mutex_init(&stream->mutex);
+ INIT_LIST_HEAD(&stream->strm_list);
+ init_waitqueue_head(&stream->wait);
+
+ /*
+ * We should have a decompressor at least as default
+ * so if we fail to allocate new decompressor dynamically,
+ * we could always fall back to default decompressor and
+ * file system works.
+ */
+ decomp_strm = kmalloc(sizeof(*decomp_strm), GFP_KERNEL);
+ if (!decomp_strm)
+ goto out;
+
+ decomp_strm->stream = msblk->decompressor->init(msblk,
+ stream->comp_opts);
+ if (IS_ERR(decomp_strm->stream)) {
+ err = PTR_ERR(decomp_strm->stream);
+ goto out;
+ }
+
+ list_add(&decomp_strm->list, &stream->strm_list);
+ stream->avail_decomp = 1;
+ return stream;
+
+out:
+ kfree(decomp_strm);
+ kfree(stream);
+ return ERR_PTR(err);
+}
+
+
+void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+{
+ struct squashfs_stream *stream = msblk->stream;
+ if (stream) {
+ struct decomp_stream *decomp_strm;
+
+ while (!list_empty(&stream->strm_list)) {
+ decomp_strm = list_entry(stream->strm_list.prev,
+ struct decomp_stream, list);
+ list_del(&decomp_strm->list);
+ msblk->decompressor->free(decomp_strm->stream);
+ kfree(decomp_strm);
+ stream->avail_decomp--;
+ }
+ WARN_ON(stream->avail_decomp);
+ kfree(stream->comp_opts);
+ kfree(stream);
+ }
+}
+
+
+static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk,
+ struct squashfs_stream *stream)
+{
+ struct decomp_stream *decomp_strm;
+
+ while (1) {
+ mutex_lock(&stream->mutex);
+
+ /* There is available decomp_stream */
+ if (!list_empty(&stream->strm_list)) {
+ decomp_strm = list_entry(stream->strm_list.prev,
+ struct decomp_stream, list);
+ list_del(&decomp_strm->list);
+ mutex_unlock(&stream->mutex);
+ break;
+ }
+
+ /*
+ * If there is no available decomp and already full,
+ * let's wait for releasing decomp from other users.
+ */
+ if (stream->avail_decomp >= MAX_DECOMPRESSOR)
+ goto wait;
+
+ /* Let's allocate new decomp */
+ decomp_strm = kmalloc(sizeof(*decomp_strm), GFP_KERNEL);
+ if (!decomp_strm)
+ goto wait;
+
+ decomp_strm->stream = msblk->decompressor->init(msblk,
+ stream->comp_opts);
+ if (IS_ERR(decomp_strm->stream)) {
+ kfree(decomp_strm);
+ goto wait;
+ }
+
+ stream->avail_decomp++;
+ WARN_ON(stream->avail_decomp > MAX_DECOMPRESSOR);
+
+ mutex_unlock(&stream->mutex);
+ break;
+wait:
+ /*
+ * If system memory is tough, let's for other's
+ * releasing instead of hurting VM because it could
+ * make page cache thrashing.
+ */
+ mutex_unlock(&stream->mutex);
+ wait_event(stream->wait,
+ !list_empty(&stream->strm_list));
+ }
+
+ return decomp_strm;
+}
+
+
+int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
+ int b, int offset, int length, struct squashfs_page_actor *output)
+{
+ int res;
+ struct squashfs_stream *stream = msblk->stream;
+ struct decomp_stream *decomp_stream = get_decomp_stream(msblk, stream);
+ res = msblk->decompressor->decompress(msblk, decomp_stream->stream,
+ bh, b, offset, length, output);
+ put_decomp_stream(decomp_stream, stream);
+ if (res < 0)
+ ERROR("%s decompression failed, data probably corrupt\n",
+ msblk->decompressor->name);
+ return res;
+}
diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c
new file mode 100644
index 00000000000..23a9c28ad8e
--- /dev/null
+++ b/fs/squashfs/decompressor_multi_percpu.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/percpu.h>
+#include <linux/buffer_head.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "decompressor.h"
+#include "squashfs.h"
+
+/*
+ * This file implements multi-threaded decompression using percpu
+ * variables, one thread per cpu core.
+ */
+
+struct squashfs_stream {
+ void *stream;
+};
+
+void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+ void *comp_opts)
+{
+ struct squashfs_stream *stream;
+ struct squashfs_stream __percpu *percpu;
+ int err, cpu;
+
+ percpu = alloc_percpu(struct squashfs_stream);
+ if (percpu == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ for_each_possible_cpu(cpu) {
+ stream = per_cpu_ptr(percpu, cpu);
+ stream->stream = msblk->decompressor->init(msblk, comp_opts);
+ if (IS_ERR(stream->stream)) {
+ err = PTR_ERR(stream->stream);
+ goto out;
+ }
+ }
+
+ kfree(comp_opts);
+ return (__force void *) percpu;
+
+out:
+ for_each_possible_cpu(cpu) {
+ stream = per_cpu_ptr(percpu, cpu);
+ if (!IS_ERR_OR_NULL(stream->stream))
+ msblk->decompressor->free(stream->stream);
+ }
+ free_percpu(percpu);
+ return ERR_PTR(err);
+}
+
+void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+{
+ struct squashfs_stream __percpu *percpu =
+ (struct squashfs_stream __percpu *) msblk->stream;
+ struct squashfs_stream *stream;
+ int cpu;
+
+ if (msblk->stream) {
+ for_each_possible_cpu(cpu) {
+ stream = per_cpu_ptr(percpu, cpu);
+ msblk->decompressor->free(stream->stream);
+ }
+ free_percpu(percpu);
+ }
+}
+
+int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
+ int b, int offset, int length, struct squashfs_page_actor *output)
+{
+ struct squashfs_stream __percpu *percpu =
+ (struct squashfs_stream __percpu *) msblk->stream;
+ struct squashfs_stream *stream = get_cpu_ptr(percpu);
+ int res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
+ offset, length, output);
+ put_cpu_ptr(stream);
+
+ if (res < 0)
+ ERROR("%s decompression failed, data probably corrupt\n",
+ msblk->decompressor->name);
+
+ return res;
+}
+
+int squashfs_max_decompressors(void)
+{
+ return num_possible_cpus();
+}
diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c
new file mode 100644
index 00000000000..a6c75929a00
--- /dev/null
+++ b/fs/squashfs/decompressor_single.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/buffer_head.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "decompressor.h"
+#include "squashfs.h"
+
+/*
+ * This file implements single-threaded decompression in the
+ * decompressor framework
+ */
+
+struct squashfs_stream {
+ void *stream;
+ struct mutex mutex;
+};
+
+void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
+ void *comp_opts)
+{
+ struct squashfs_stream *stream;
+ int err = -ENOMEM;
+
+ stream = kmalloc(sizeof(*stream), GFP_KERNEL);
+ if (stream == NULL)
+ goto out;
+
+ stream->stream = msblk->decompressor->init(msblk, comp_opts);
+ if (IS_ERR(stream->stream)) {
+ err = PTR_ERR(stream->stream);
+ goto out;
+ }
+
+ kfree(comp_opts);
+ mutex_init(&stream->mutex);
+ return stream;
+
+out:
+ kfree(stream);
+ return ERR_PTR(err);
+}
+
+void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
+{
+ struct squashfs_stream *stream = msblk->stream;
+
+ if (stream) {
+ msblk->decompressor->free(stream->stream);
+ kfree(stream);
+ }
+}
+
+int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
+ int b, int offset, int length, struct squashfs_page_actor *output)
+{
+ int res;
+ struct squashfs_stream *stream = msblk->stream;
+
+ mutex_lock(&stream->mutex);
+ res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
+ offset, length, output);
+ mutex_unlock(&stream->mutex);
+
+ if (res < 0)
+ ERROR("%s decompression failed, data probably corrupt\n",
+ msblk->decompressor->name);
+
+ return res;
+}
+
+int squashfs_max_decompressors(void)
+{
+ return 1;
+}
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 8ca62c28fe1..e5c9689062b 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -370,77 +370,15 @@ static int read_blocklist(struct inode *inode, int index, u64 *block)
return le32_to_cpu(size);
}
-
-static int squashfs_readpage(struct file *file, struct page *page)
+/* Copy data into page cache */
+void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer,
+ int bytes, int offset)
{
struct inode *inode = page->mapping->host;
struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
- int bytes, i, offset = 0, sparse = 0;
- struct squashfs_cache_entry *buffer = NULL;
void *pageaddr;
-
- int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
- int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
- int start_index = page->index & ~mask;
- int end_index = start_index | mask;
- int file_end = i_size_read(inode) >> msblk->block_log;
-
- TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
- page->index, squashfs_i(inode)->start);
-
- if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
- PAGE_CACHE_SHIFT))
- goto out;
-
- if (index < file_end || squashfs_i(inode)->fragment_block ==
- SQUASHFS_INVALID_BLK) {
- /*
- * Reading a datablock from disk. Need to read block list
- * to get location and block size.
- */
- u64 block = 0;
- int bsize = read_blocklist(inode, index, &block);
- if (bsize < 0)
- goto error_out;
-
- if (bsize == 0) { /* hole */
- bytes = index == file_end ?
- (i_size_read(inode) & (msblk->block_size - 1)) :
- msblk->block_size;
- sparse = 1;
- } else {
- /*
- * Read and decompress datablock.
- */
- buffer = squashfs_get_datablock(inode->i_sb,
- block, bsize);
- if (buffer->error) {
- ERROR("Unable to read page, block %llx, size %x"
- "\n", block, bsize);
- squashfs_cache_put(buffer);
- goto error_out;
- }
- bytes = buffer->length;
- }
- } else {
- /*
- * Datablock is stored inside a fragment (tail-end packed
- * block).
- */
- buffer = squashfs_get_fragment(inode->i_sb,
- squashfs_i(inode)->fragment_block,
- squashfs_i(inode)->fragment_size);
-
- if (buffer->error) {
- ERROR("Unable to read page, block %llx, size %x\n",
- squashfs_i(inode)->fragment_block,
- squashfs_i(inode)->fragment_size);
- squashfs_cache_put(buffer);
- goto error_out;
- }
- bytes = i_size_read(inode) & (msblk->block_size - 1);
- offset = squashfs_i(inode)->fragment_offset;
- }
+ int i, mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
+ int start_index = page->index & ~mask, end_index = start_index | mask;
/*
* Loop copying datablock into pages. As the datablock likely covers
@@ -451,7 +389,7 @@ static int squashfs_readpage(struct file *file, struct page *page)
for (i = start_index; i <= end_index && bytes > 0; i++,
bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
struct page *push_page;
- int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE);
+ int avail = buffer ? min_t(int, bytes, PAGE_CACHE_SIZE) : 0;
TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail);
@@ -475,11 +413,75 @@ skip_page:
if (i != page->index)
page_cache_release(push_page);
}
+}
+
+/* Read datablock stored packed inside a fragment (tail-end packed block) */
+static int squashfs_readpage_fragment(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+ struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb,
+ squashfs_i(inode)->fragment_block,
+ squashfs_i(inode)->fragment_size);
+ int res = buffer->error;
+
+ if (res)
+ ERROR("Unable to read page, block %llx, size %x\n",
+ squashfs_i(inode)->fragment_block,
+ squashfs_i(inode)->fragment_size);
+ else
+ squashfs_copy_cache(page, buffer, i_size_read(inode) &
+ (msblk->block_size - 1),
+ squashfs_i(inode)->fragment_offset);
+
+ squashfs_cache_put(buffer);
+ return res;
+}
- if (!sparse)
- squashfs_cache_put(buffer);
+static int squashfs_readpage_sparse(struct page *page, int index, int file_end)
+{
+ struct inode *inode = page->mapping->host;
+ struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+ int bytes = index == file_end ?
+ (i_size_read(inode) & (msblk->block_size - 1)) :
+ msblk->block_size;
+ squashfs_copy_cache(page, NULL, bytes, 0);
return 0;
+}
+
+static int squashfs_readpage(struct file *file, struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+ int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
+ int file_end = i_size_read(inode) >> msblk->block_log;
+ int res;
+ void *pageaddr;
+
+ TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
+ page->index, squashfs_i(inode)->start);
+
+ if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT))
+ goto out;
+
+ if (index < file_end || squashfs_i(inode)->fragment_block ==
+ SQUASHFS_INVALID_BLK) {
+ u64 block = 0;
+ int bsize = read_blocklist(inode, index, &block);
+ if (bsize < 0)
+ goto error_out;
+
+ if (bsize == 0)
+ res = squashfs_readpage_sparse(page, index, file_end);
+ else
+ res = squashfs_readpage_block(page, block, bsize);
+ } else
+ res = squashfs_readpage_fragment(page);
+
+ if (!res)
+ return 0;
error_out:
SetPageError(page);
diff --git a/fs/squashfs/file_cache.c b/fs/squashfs/file_cache.c
new file mode 100644
index 00000000000..f2310d2a201
--- /dev/null
+++ b/fs/squashfs/file_cache.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/mutex.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+
+/* Read separately compressed datablock and memcopy into page cache */
+int squashfs_readpage_block(struct page *page, u64 block, int bsize)
+{
+ struct inode *i = page->mapping->host;
+ struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb,
+ block, bsize);
+ int res = buffer->error;
+
+ if (res)
+ ERROR("Unable to read page, block %llx, size %x\n", block,
+ bsize);
+ else
+ squashfs_copy_cache(page, buffer, buffer->length, 0);
+
+ squashfs_cache_put(buffer);
+ return res;
+}
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
new file mode 100644
index 00000000000..62a0de6632e
--- /dev/null
+++ b/fs/squashfs/file_direct.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/pagemap.h>
+#include <linux/mutex.h>
+
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+#include "page_actor.h"
+
+static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
+ int pages, struct page **page);
+
+/* Read separately compressed datablock directly into page cache */
+int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
+
+{
+ struct inode *inode = target_page->mapping->host;
+ struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+
+ int file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
+ int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
+ int start_index = target_page->index & ~mask;
+ int end_index = start_index | mask;
+ int i, n, pages, missing_pages, bytes, res = -ENOMEM;
+ struct page **page;
+ struct squashfs_page_actor *actor;
+ void *pageaddr;
+
+ if (end_index > file_end)
+ end_index = file_end;
+
+ pages = end_index - start_index + 1;
+
+ page = kmalloc(sizeof(void *) * pages, GFP_KERNEL);
+ if (page == NULL)
+ return res;
+
+ /*
+ * Create a "page actor" which will kmap and kunmap the
+ * page cache pages appropriately within the decompressor
+ */
+ actor = squashfs_page_actor_init_special(page, pages, 0);
+ if (actor == NULL)
+ goto out;
+
+ /* Try to grab all the pages covered by the Squashfs block */
+ for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) {
+ page[i] = (n == target_page->index) ? target_page :
+ grab_cache_page_nowait(target_page->mapping, n);
+
+ if (page[i] == NULL) {
+ missing_pages++;
+ continue;
+ }
+
+ if (PageUptodate(page[i])) {
+ unlock_page(page[i]);
+ page_cache_release(page[i]);
+ page[i] = NULL;
+ missing_pages++;
+ }
+ }
+
+ if (missing_pages) {
+ /*
+ * Couldn't get one or more pages, this page has either
+ * been VM reclaimed, but others are still in the page cache
+ * and uptodate, or we're racing with another thread in
+ * squashfs_readpage also trying to grab them. Fall back to
+ * using an intermediate buffer.
+ */
+ res = squashfs_read_cache(target_page, block, bsize, pages,
+ page);
+ if (res < 0)
+ goto mark_errored;
+
+ goto out;
+ }
+
+ /* Decompress directly into the page cache buffers */
+ res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
+ if (res < 0)
+ goto mark_errored;
+
+ /* Last page may have trailing bytes not filled */
+ bytes = res % PAGE_CACHE_SIZE;
+ if (bytes) {
+ pageaddr = kmap_atomic(page[pages - 1]);
+ memset(pageaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
+ kunmap_atomic(pageaddr);
+ }
+
+ /* Mark pages as uptodate, unlock and release */
+ for (i = 0; i < pages; i++) {
+ flush_dcache_page(page[i]);
+ SetPageUptodate(page[i]);
+ unlock_page(page[i]);
+ if (page[i] != target_page)
+ page_cache_release(page[i]);
+ }
+
+ kfree(actor);
+ kfree(page);
+
+ return 0;
+
+mark_errored:
+ /* Decompression failed, mark pages as errored. Target_page is
+ * dealt with by the caller
+ */
+ for (i = 0; i < pages; i++) {
+ if (page[i] == NULL || page[i] == target_page)
+ continue;
+ flush_dcache_page(page[i]);
+ SetPageError(page[i]);
+ unlock_page(page[i]);
+ page_cache_release(page[i]);
+ }
+
+out:
+ kfree(actor);
+ kfree(page);
+ return res;
+}
+
+
+static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
+ int pages, struct page **page)
+{
+ struct inode *i = target_page->mapping->host;
+ struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb,
+ block, bsize);
+ int bytes = buffer->length, res = buffer->error, n, offset = 0;
+ void *pageaddr;
+
+ if (res) {
+ ERROR("Unable to read page, block %llx, size %x\n", block,
+ bsize);
+ goto out;
+ }
+
+ for (n = 0; n < pages && bytes > 0; n++,
+ bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
+ int avail = min_t(int, bytes, PAGE_CACHE_SIZE);
+
+ if (page[n] == NULL)
+ continue;
+
+ pageaddr = kmap_atomic(page[n]);
+ squashfs_copy_data(pageaddr, buffer, offset, avail);
+ memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
+ kunmap_atomic(pageaddr);
+ flush_dcache_page(page[n]);
+ SetPageUptodate(page[n]);
+ unlock_page(page[n]);
+ if (page[n] != target_page)
+ page_cache_release(page[n]);
+ }
+
+out:
+ squashfs_cache_put(buffer);
+ return res;
+}
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 00f4dfc5f08..244b9fbfff7 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -31,13 +31,14 @@
#include "squashfs_fs_sb.h"
#include "squashfs.h"
#include "decompressor.h"
+#include "page_actor.h"
struct squashfs_lzo {
void *input;
void *output;
};
-static void *lzo_init(struct squashfs_sb_info *msblk, void *buff, int len)
+static void *lzo_init(struct squashfs_sb_info *msblk, void *buff)
{
int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
@@ -74,22 +75,16 @@ static void lzo_free(void *strm)
}
-static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer,
- struct buffer_head **bh, int b, int offset, int length, int srclength,
- int pages)
+static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm,
+ struct buffer_head **bh, int b, int offset, int length,
+ struct squashfs_page_actor *output)
{
- struct squashfs_lzo *stream = msblk->stream;
- void *buff = stream->input;
+ struct squashfs_lzo *stream = strm;
+ void *buff = stream->input, *data;
int avail, i, bytes = length, res;
- size_t out_len = srclength;
-
- mutex_lock(&msblk->read_data_mutex);
+ size_t out_len = output->length;
for (i = 0; i < b; i++) {
- wait_on_buffer(bh[i]);
- if (!buffer_uptodate(bh[i]))
- goto block_release;
-
avail = min(bytes, msblk->devblksize - offset);
memcpy(buff, bh[i]->b_data + offset, avail);
buff += avail;
@@ -104,24 +99,24 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer,
goto failed;
res = bytes = (int)out_len;
- for (i = 0, buff = stream->output; bytes && i < pages; i++) {
- avail = min_t(int, bytes, PAGE_CACHE_SIZE);
- memcpy(buffer[i], buff, avail);
- buff += avail;
- bytes -= avail;
+ data = squashfs_first_page(output);
+ buff = stream->output;
+ while (data) {
+ if (bytes <= PAGE_CACHE_SIZE) {
+ memcpy(data, buff, bytes);
+ break;
+ } else {
+ memcpy(data, buff, PAGE_CACHE_SIZE);
+ buff += PAGE_CACHE_SIZE;
+ bytes -= PAGE_CACHE_SIZE;
+ data = squashfs_next_page(output);
+ }
}
+ squashfs_finish_page(output);
- mutex_unlock(&msblk->read_data_mutex);
return res;
-block_release:
- for (; i < b; i++)
- put_bh(bh[i]);
-
failed:
- mutex_unlock(&msblk->read_data_mutex);
-
- ERROR("lzo decompression failed, data probably corrupt\n");
return -EIO;
}
diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c
new file mode 100644
index 00000000000..5a1c11f5644
--- /dev/null
+++ b/fs/squashfs/page_actor.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include "page_actor.h"
+
+/*
+ * This file contains implementations of page_actor for decompressing into
+ * an intermediate buffer, and for decompressing directly into the
+ * page cache.
+ *
+ * Calling code should avoid sleeping between calls to squashfs_first_page()
+ * and squashfs_finish_page().
+ */
+
+/* Implementation of page_actor for decompressing into intermediate buffer */
+static void *cache_first_page(struct squashfs_page_actor *actor)
+{
+ actor->next_page = 1;
+ return actor->buffer[0];
+}
+
+static void *cache_next_page(struct squashfs_page_actor *actor)
+{
+ if (actor->next_page == actor->pages)
+ return NULL;
+
+ return actor->buffer[actor->next_page++];
+}
+
+static void cache_finish_page(struct squashfs_page_actor *actor)
+{
+ /* empty */
+}
+
+struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
+ int pages, int length)
+{
+ struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
+
+ if (actor == NULL)
+ return NULL;
+
+ actor->length = length ? : pages * PAGE_CACHE_SIZE;
+ actor->buffer = buffer;
+ actor->pages = pages;
+ actor->next_page = 0;
+ actor->squashfs_first_page = cache_first_page;
+ actor->squashfs_next_page = cache_next_page;
+ actor->squashfs_finish_page = cache_finish_page;
+ return actor;
+}
+
+/* Implementation of page_actor for decompressing directly into page cache. */
+static void *direct_first_page(struct squashfs_page_actor *actor)
+{
+ actor->next_page = 1;
+ return actor->pageaddr = kmap_atomic(actor->page[0]);
+}
+
+static void *direct_next_page(struct squashfs_page_actor *actor)
+{
+ if (actor->pageaddr)
+ kunmap_atomic(actor->pageaddr);
+
+ return actor->pageaddr = actor->next_page == actor->pages ? NULL :
+ kmap_atomic(actor->page[actor->next_page++]);
+}
+
+static void direct_finish_page(struct squashfs_page_actor *actor)
+{
+ if (actor->pageaddr)
+ kunmap_atomic(actor->pageaddr);
+}
+
+struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page,
+ int pages, int length)
+{
+ struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
+
+ if (actor == NULL)
+ return NULL;
+
+ actor->length = length ? : pages * PAGE_CACHE_SIZE;
+ actor->page = page;
+ actor->pages = pages;
+ actor->next_page = 0;
+ actor->pageaddr = NULL;
+ actor->squashfs_first_page = direct_first_page;
+ actor->squashfs_next_page = direct_next_page;
+ actor->squashfs_finish_page = direct_finish_page;
+ return actor;
+}
diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h
new file mode 100644
index 00000000000..26dd82008b8
--- /dev/null
+++ b/fs/squashfs/page_actor.h
@@ -0,0 +1,81 @@
+#ifndef PAGE_ACTOR_H
+#define PAGE_ACTOR_H
+/*
+ * Copyright (c) 2013
+ * Phillip Lougher <phillip@squashfs.org.uk>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#ifndef CONFIG_SQUASHFS_FILE_DIRECT
+struct squashfs_page_actor {
+ void **page;
+ int pages;
+ int length;
+ int next_page;
+};
+
+static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page,
+ int pages, int length)
+{
+ struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
+
+ if (actor == NULL)
+ return NULL;
+
+ actor->length = length ? : pages * PAGE_CACHE_SIZE;
+ actor->page = page;
+ actor->pages = pages;
+ actor->next_page = 0;
+ return actor;
+}
+
+static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
+{
+ actor->next_page = 1;
+ return actor->page[0];
+}
+
+static inline void *squashfs_next_page(struct squashfs_page_actor *actor)
+{
+ return actor->next_page == actor->pages ? NULL :
+ actor->page[actor->next_page++];
+}
+
+static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
+{
+ /* empty */
+}
+#else
+struct squashfs_page_actor {
+ union {
+ void **buffer;
+ struct page **page;
+ };
+ void *pageaddr;
+ void *(*squashfs_first_page)(struct squashfs_page_actor *);
+ void *(*squashfs_next_page)(struct squashfs_page_actor *);
+ void (*squashfs_finish_page)(struct squashfs_page_actor *);
+ int pages;
+ int length;
+ int next_page;
+};
+
+extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int);
+extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page
+ **, int, int);
+static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
+{
+ return actor->squashfs_first_page(actor);
+}
+static inline void *squashfs_next_page(struct squashfs_page_actor *actor)
+{
+ return actor->squashfs_next_page(actor);
+}
+static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
+{
+ actor->squashfs_finish_page(actor);
+}
+#endif
+#endif
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index d1266516ed0..887d6d27008 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -25,11 +25,11 @@
#define ERROR(s, args...) pr_err("SQUASHFS error: "s, ## args)
-#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args)
+#define WARNING(s, args...) pr_warn("SQUASHFS: "s, ## args)
/* block.c */
-extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
- int, int);
+extern int squashfs_read_data(struct super_block *, u64, int, u64 *,
+ struct squashfs_page_actor *);
/* cache.c */
extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
@@ -48,7 +48,14 @@ extern void *squashfs_read_table(struct super_block *, u64, int);
/* decompressor.c */
extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
-extern void *squashfs_decompressor_init(struct super_block *, unsigned short);
+extern void *squashfs_decompressor_setup(struct super_block *, unsigned short);
+
+/* decompressor_xxx.c */
+extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *);
+extern void squashfs_decompressor_destroy(struct squashfs_sb_info *);
+extern int squashfs_decompress(struct squashfs_sb_info *, struct buffer_head **,
+ int, int, int, struct squashfs_page_actor *);
+extern int squashfs_max_decompressors(void);
/* export.c */
extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64,
@@ -59,6 +66,13 @@ extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *);
extern __le64 *squashfs_read_fragment_index_table(struct super_block *,
u64, u64, unsigned int);
+/* file.c */
+void squashfs_copy_cache(struct page *, struct squashfs_cache_entry *, int,
+ int);
+
+/* file_xxx.c */
+extern int squashfs_readpage_block(struct page *, u64, int);
+
/* id.c */
extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *);
extern __le64 *squashfs_read_id_index_table(struct super_block *, u64, u64,
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 52934a22f29..1da565cb50c 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -50,6 +50,7 @@ struct squashfs_cache_entry {
wait_queue_head_t wait_queue;
struct squashfs_cache *cache;
void **data;
+ struct squashfs_page_actor *actor;
};
struct squashfs_sb_info {
@@ -63,10 +64,9 @@ struct squashfs_sb_info {
__le64 *id_table;
__le64 *fragment_index;
__le64 *xattr_id_table;
- struct mutex read_data_mutex;
struct mutex meta_index_mutex;
struct meta_index *meta_index;
- void *stream;
+ struct squashfs_stream *stream;
__le64 *inode_lookup_table;
u64 inode_table;
u64 directory_table;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 60553a9053c..031c8d67fd5 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -98,7 +98,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE);
msblk->devblksize_log2 = ffz(~msblk->devblksize);
- mutex_init(&msblk->read_data_mutex);
mutex_init(&msblk->meta_index_mutex);
/*
@@ -206,13 +205,14 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
/* Allocate read_page block */
- msblk->read_page = squashfs_cache_init("data", 1, msblk->block_size);
+ msblk->read_page = squashfs_cache_init("data",
+ squashfs_max_decompressors(), msblk->block_size);
if (msblk->read_page == NULL) {
ERROR("Failed to allocate read_page block\n");
goto failed_mount;
}
- msblk->stream = squashfs_decompressor_init(sb, flags);
+ msblk->stream = squashfs_decompressor_setup(sb, flags);
if (IS_ERR(msblk->stream)) {
err = PTR_ERR(msblk->stream);
msblk->stream = NULL;
@@ -336,7 +336,7 @@ failed_mount:
squashfs_cache_delete(msblk->block_cache);
squashfs_cache_delete(msblk->fragment_cache);
squashfs_cache_delete(msblk->read_page);
- squashfs_decompressor_free(msblk, msblk->stream);
+ squashfs_decompressor_destroy(msblk);
kfree(msblk->inode_lookup_table);
kfree(msblk->fragment_index);
kfree(msblk->id_table);
@@ -371,6 +371,7 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf)
static int squashfs_remount(struct super_block *sb, int *flags, char *data)
{
+ sync_filesystem(sb);
*flags |= MS_RDONLY;
return 0;
}
@@ -383,7 +384,7 @@ static void squashfs_put_super(struct super_block *sb)
squashfs_cache_delete(sbi->block_cache);
squashfs_cache_delete(sbi->fragment_cache);
squashfs_cache_delete(sbi->read_page);
- squashfs_decompressor_free(sbi, sbi->stream);
+ squashfs_decompressor_destroy(sbi);
kfree(sbi->id_table);
kfree(sbi->fragment_index);
kfree(sbi->meta_index);
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index 1760b7d108f..c609624e4b8 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -32,44 +32,70 @@
#include "squashfs_fs_sb.h"
#include "squashfs.h"
#include "decompressor.h"
+#include "page_actor.h"
struct squashfs_xz {
struct xz_dec *state;
struct xz_buf buf;
};
-struct comp_opts {
+struct disk_comp_opts {
__le32 dictionary_size;
__le32 flags;
};
-static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff,
- int len)
+struct comp_opts {
+ int dict_size;
+};
+
+static void *squashfs_xz_comp_opts(struct squashfs_sb_info *msblk,
+ void *buff, int len)
{
- struct comp_opts *comp_opts = buff;
- struct squashfs_xz *stream;
- int dict_size = msblk->block_size;
- int err, n;
+ struct disk_comp_opts *comp_opts = buff;
+ struct comp_opts *opts;
+ int err = 0, n;
+
+ opts = kmalloc(sizeof(*opts), GFP_KERNEL);
+ if (opts == NULL) {
+ err = -ENOMEM;
+ goto out2;
+ }
if (comp_opts) {
/* check compressor options are the expected length */
if (len < sizeof(*comp_opts)) {
err = -EIO;
- goto failed;
+ goto out;
}
- dict_size = le32_to_cpu(comp_opts->dictionary_size);
+ opts->dict_size = le32_to_cpu(comp_opts->dictionary_size);
/* the dictionary size should be 2^n or 2^n+2^(n+1) */
- n = ffs(dict_size) - 1;
- if (dict_size != (1 << n) && dict_size != (1 << n) +
+ n = ffs(opts->dict_size) - 1;
+ if (opts->dict_size != (1 << n) && opts->dict_size != (1 << n) +
(1 << (n + 1))) {
err = -EIO;
- goto failed;
+ goto out;
}
- }
+ } else
+ /* use defaults */
+ opts->dict_size = max_t(int, msblk->block_size,
+ SQUASHFS_METADATA_SIZE);
+
+ return opts;
+
+out:
+ kfree(opts);
+out2:
+ return ERR_PTR(err);
+}
+
- dict_size = max_t(int, dict_size, SQUASHFS_METADATA_SIZE);
+static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff)
+{
+ struct comp_opts *comp_opts = buff;
+ struct squashfs_xz *stream;
+ int err;
stream = kmalloc(sizeof(*stream), GFP_KERNEL);
if (stream == NULL) {
@@ -77,7 +103,7 @@ static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff,
goto failed;
}
- stream->state = xz_dec_init(XZ_PREALLOC, dict_size);
+ stream->state = xz_dec_init(XZ_PREALLOC, comp_opts->dict_size);
if (stream->state == NULL) {
kfree(stream);
err = -ENOMEM;
@@ -103,42 +129,37 @@ static void squashfs_xz_free(void *strm)
}
-static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer,
- struct buffer_head **bh, int b, int offset, int length, int srclength,
- int pages)
+static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
+ struct buffer_head **bh, int b, int offset, int length,
+ struct squashfs_page_actor *output)
{
enum xz_ret xz_err;
- int avail, total = 0, k = 0, page = 0;
- struct squashfs_xz *stream = msblk->stream;
-
- mutex_lock(&msblk->read_data_mutex);
+ int avail, total = 0, k = 0;
+ struct squashfs_xz *stream = strm;
xz_dec_reset(stream->state);
stream->buf.in_pos = 0;
stream->buf.in_size = 0;
stream->buf.out_pos = 0;
stream->buf.out_size = PAGE_CACHE_SIZE;
- stream->buf.out = buffer[page++];
+ stream->buf.out = squashfs_first_page(output);
do {
if (stream->buf.in_pos == stream->buf.in_size && k < b) {
avail = min(length, msblk->devblksize - offset);
length -= avail;
- wait_on_buffer(bh[k]);
- if (!buffer_uptodate(bh[k]))
- goto release_mutex;
-
stream->buf.in = bh[k]->b_data + offset;
stream->buf.in_size = avail;
stream->buf.in_pos = 0;
offset = 0;
}
- if (stream->buf.out_pos == stream->buf.out_size
- && page < pages) {
- stream->buf.out = buffer[page++];
- stream->buf.out_pos = 0;
- total += PAGE_CACHE_SIZE;
+ if (stream->buf.out_pos == stream->buf.out_size) {
+ stream->buf.out = squashfs_next_page(output);
+ if (stream->buf.out != NULL) {
+ stream->buf.out_pos = 0;
+ total += PAGE_CACHE_SIZE;
+ }
}
xz_err = xz_dec_run(stream->state, &stream->buf);
@@ -147,23 +168,14 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer,
put_bh(bh[k++]);
} while (xz_err == XZ_OK);
- if (xz_err != XZ_STREAM_END) {
- ERROR("xz_dec_run error, data probably corrupt\n");
- goto release_mutex;
- }
-
- if (k < b) {
- ERROR("xz_uncompress error, input remaining\n");
- goto release_mutex;
- }
+ squashfs_finish_page(output);
- total += stream->buf.out_pos;
- mutex_unlock(&msblk->read_data_mutex);
- return total;
+ if (xz_err != XZ_STREAM_END || k < b)
+ goto out;
-release_mutex:
- mutex_unlock(&msblk->read_data_mutex);
+ return total + stream->buf.out_pos;
+out:
for (; k < b; k++)
put_bh(bh[k]);
@@ -172,6 +184,7 @@ release_mutex:
const struct squashfs_decompressor squashfs_xz_comp_ops = {
.init = squashfs_xz_init,
+ .comp_opts = squashfs_xz_comp_opts,
.free = squashfs_xz_free,
.decompress = squashfs_xz_uncompress,
.id = XZ_COMPRESSION,
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 55d918fd2d8..8727caba688 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -32,8 +32,9 @@
#include "squashfs_fs_sb.h"
#include "squashfs.h"
#include "decompressor.h"
+#include "page_actor.h"
-static void *zlib_init(struct squashfs_sb_info *dummy, void *buff, int len)
+static void *zlib_init(struct squashfs_sb_info *dummy, void *buff)
{
z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL);
if (stream == NULL)
@@ -61,44 +62,37 @@ static void zlib_free(void *strm)
}
-static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
- struct buffer_head **bh, int b, int offset, int length, int srclength,
- int pages)
+static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
+ struct buffer_head **bh, int b, int offset, int length,
+ struct squashfs_page_actor *output)
{
- int zlib_err, zlib_init = 0;
- int k = 0, page = 0;
- z_stream *stream = msblk->stream;
-
- mutex_lock(&msblk->read_data_mutex);
+ int zlib_err, zlib_init = 0, k = 0;
+ z_stream *stream = strm;
- stream->avail_out = 0;
+ stream->avail_out = PAGE_CACHE_SIZE;
+ stream->next_out = squashfs_first_page(output);
stream->avail_in = 0;
do {
if (stream->avail_in == 0 && k < b) {
int avail = min(length, msblk->devblksize - offset);
length -= avail;
- wait_on_buffer(bh[k]);
- if (!buffer_uptodate(bh[k]))
- goto release_mutex;
-
stream->next_in = bh[k]->b_data + offset;
stream->avail_in = avail;
offset = 0;
}
- if (stream->avail_out == 0 && page < pages) {
- stream->next_out = buffer[page++];
- stream->avail_out = PAGE_CACHE_SIZE;
+ if (stream->avail_out == 0) {
+ stream->next_out = squashfs_next_page(output);
+ if (stream->next_out != NULL)
+ stream->avail_out = PAGE_CACHE_SIZE;
}
if (!zlib_init) {
zlib_err = zlib_inflateInit(stream);
if (zlib_err != Z_OK) {
- ERROR("zlib_inflateInit returned unexpected "
- "result 0x%x, srclength %d\n",
- zlib_err, srclength);
- goto release_mutex;
+ squashfs_finish_page(output);
+ goto out;
}
zlib_init = 1;
}
@@ -109,29 +103,21 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
put_bh(bh[k++]);
} while (zlib_err == Z_OK);
- if (zlib_err != Z_STREAM_END) {
- ERROR("zlib_inflate error, data probably corrupt\n");
- goto release_mutex;
- }
+ squashfs_finish_page(output);
- zlib_err = zlib_inflateEnd(stream);
- if (zlib_err != Z_OK) {
- ERROR("zlib_inflate error, data probably corrupt\n");
- goto release_mutex;
- }
+ if (zlib_err != Z_STREAM_END)
+ goto out;
- if (k < b) {
- ERROR("zlib_uncompress error, data remaining\n");
- goto release_mutex;
- }
+ zlib_err = zlib_inflateEnd(stream);
+ if (zlib_err != Z_OK)
+ goto out;
- length = stream->total_out;
- mutex_unlock(&msblk->read_data_mutex);
- return length;
+ if (k < b)
+ goto out;
-release_mutex:
- mutex_unlock(&msblk->read_data_mutex);
+ return stream->total_out;
+out:
for (; k < b; k++)
put_bh(bh[k]);
diff --git a/fs/stat.c b/fs/stat.c
index d0ea7ef75e2..ae0c3cef992 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -37,14 +37,21 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)
EXPORT_SYMBOL(generic_fillattr);
-int vfs_getattr(struct path *path, struct kstat *stat)
+/**
+ * vfs_getattr_nosec - getattr without security checks
+ * @path: file to get attributes from
+ * @stat: structure to return attributes in
+ *
+ * Get attributes without calling security_inode_getattr.
+ *
+ * Currently the only caller other than vfs_getattr is internal to the
+ * filehandle lookup code, which uses only the inode number and returns
+ * no attributes to any user. Any other code probably wants
+ * vfs_getattr.
+ */
+int vfs_getattr_nosec(struct path *path, struct kstat *stat)
{
struct inode *inode = path->dentry->d_inode;
- int retval;
-
- retval = security_inode_getattr(path->mnt, path->dentry);
- if (retval)
- return retval;
if (inode->i_op->getattr)
return inode->i_op->getattr(path->mnt, path->dentry, stat);
@@ -53,6 +60,18 @@ int vfs_getattr(struct path *path, struct kstat *stat)
return 0;
}
+EXPORT_SYMBOL(vfs_getattr_nosec);
+
+int vfs_getattr(struct path *path, struct kstat *stat)
+{
+ int retval;
+
+ retval = security_inode_getattr(path->mnt, path->dentry);
+ if (retval)
+ return retval;
+ return vfs_getattr_nosec(path, stat);
+}
+
EXPORT_SYMBOL(vfs_getattr);
int vfs_fstat(unsigned int fd, struct kstat *stat)
diff --git a/fs/statfs.c b/fs/statfs.c
index c219e733f55..083dc0ac914 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -94,7 +94,7 @@ retry:
int fd_statfs(int fd, struct kstatfs *st)
{
- struct fd f = fdget(fd);
+ struct fd f = fdget_raw(fd);
int error = -EBADF;
if (f.file) {
error = vfs_statfs(&f.file->f_path, st);
diff --git a/fs/super.c b/fs/super.c
index 0225c20f877..d20d5b11ded 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -112,9 +112,14 @@ static unsigned long super_cache_count(struct shrinker *shrink,
sb = container_of(shrink, struct super_block, s_shrink);
- if (!grab_super_passive(sb))
- return 0;
-
+ /*
+ * Don't call grab_super_passive as it is a potential
+ * scalability bottleneck. The counts could get updated
+ * between super_cache_count and super_cache_scan anyway.
+ * Call to super_cache_count with shrinker_rwsem held
+ * ensures the safety of call to list_lru_count_node() and
+ * s_op->nr_cached_objects().
+ */
if (sb->s_op && sb->s_op->nr_cached_objects)
total_objects = sb->s_op->nr_cached_objects(sb,
sc->nid);
@@ -125,37 +130,27 @@ static unsigned long super_cache_count(struct shrinker *shrink,
sc->nid);
total_objects = vfs_pressure_ratio(total_objects);
- drop_super(sb);
return total_objects;
}
-static int init_sb_writers(struct super_block *s, struct file_system_type *type)
-{
- int err;
- int i;
-
- for (i = 0; i < SB_FREEZE_LEVELS; i++) {
- err = percpu_counter_init(&s->s_writers.counter[i], 0);
- if (err < 0)
- goto err_out;
- lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
- &type->s_writers_key[i], 0);
- }
- init_waitqueue_head(&s->s_writers.wait);
- init_waitqueue_head(&s->s_writers.wait_unfrozen);
- return 0;
-err_out:
- while (--i >= 0)
- percpu_counter_destroy(&s->s_writers.counter[i]);
- return err;
-}
-
-static void destroy_sb_writers(struct super_block *s)
+/**
+ * destroy_super - frees a superblock
+ * @s: superblock to free
+ *
+ * Frees a superblock.
+ */
+static void destroy_super(struct super_block *s)
{
int i;
-
+ list_lru_destroy(&s->s_dentry_lru);
+ list_lru_destroy(&s->s_inode_lru);
for (i = 0; i < SB_FREEZE_LEVELS; i++)
percpu_counter_destroy(&s->s_writers.counter[i]);
+ security_sb_free(s);
+ WARN_ON(!list_empty(&s->s_mounts));
+ kfree(s->s_subtype);
+ kfree(s->s_options);
+ kfree_rcu(s, rcu);
}
/**
@@ -170,111 +165,75 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
{
struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
static const struct super_operations default_op;
+ int i;
- if (s) {
- if (security_sb_alloc(s))
- goto out_free_sb;
+ if (!s)
+ return NULL;
-#ifdef CONFIG_SMP
- s->s_files = alloc_percpu(struct list_head);
- if (!s->s_files)
- goto err_out;
- else {
- int i;
+ INIT_LIST_HEAD(&s->s_mounts);
- for_each_possible_cpu(i)
- INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
- }
-#else
- INIT_LIST_HEAD(&s->s_files);
-#endif
- if (init_sb_writers(s, type))
- goto err_out;
- s->s_flags = flags;
- s->s_bdi = &default_backing_dev_info;
- INIT_HLIST_NODE(&s->s_instances);
- INIT_HLIST_BL_HEAD(&s->s_anon);
- INIT_LIST_HEAD(&s->s_inodes);
-
- if (list_lru_init(&s->s_dentry_lru))
- goto err_out;
- if (list_lru_init(&s->s_inode_lru))
- goto err_out_dentry_lru;
-
- INIT_LIST_HEAD(&s->s_mounts);
- init_rwsem(&s->s_umount);
- lockdep_set_class(&s->s_umount, &type->s_umount_key);
- /*
- * sget() can have s_umount recursion.
- *
- * When it cannot find a suitable sb, it allocates a new
- * one (this one), and tries again to find a suitable old
- * one.
- *
- * In case that succeeds, it will acquire the s_umount
- * lock of the old one. Since these are clearly distrinct
- * locks, and this object isn't exposed yet, there's no
- * risk of deadlocks.
- *
- * Annotate this by putting this lock in a different
- * subclass.
- */
- down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
- s->s_count = 1;
- atomic_set(&s->s_active, 1);
- mutex_init(&s->s_vfs_rename_mutex);
- lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
- mutex_init(&s->s_dquot.dqio_mutex);
- mutex_init(&s->s_dquot.dqonoff_mutex);
- init_rwsem(&s->s_dquot.dqptr_sem);
- s->s_maxbytes = MAX_NON_LFS;
- s->s_op = &default_op;
- s->s_time_gran = 1000000000;
- s->cleancache_poolid = -1;
-
- s->s_shrink.seeks = DEFAULT_SEEKS;
- s->s_shrink.scan_objects = super_cache_scan;
- s->s_shrink.count_objects = super_cache_count;
- s->s_shrink.batch = 1024;
- s->s_shrink.flags = SHRINKER_NUMA_AWARE;
+ if (security_sb_alloc(s))
+ goto fail;
+
+ for (i = 0; i < SB_FREEZE_LEVELS; i++) {
+ if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0)
+ goto fail;
+ lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
+ &type->s_writers_key[i], 0);
}
-out:
+ init_waitqueue_head(&s->s_writers.wait);
+ init_waitqueue_head(&s->s_writers.wait_unfrozen);
+ s->s_flags = flags;
+ s->s_bdi = &default_backing_dev_info;
+ INIT_HLIST_NODE(&s->s_instances);
+ INIT_HLIST_BL_HEAD(&s->s_anon);
+ INIT_LIST_HEAD(&s->s_inodes);
+
+ if (list_lru_init(&s->s_dentry_lru))
+ goto fail;
+ if (list_lru_init(&s->s_inode_lru))
+ goto fail;
+
+ init_rwsem(&s->s_umount);
+ lockdep_set_class(&s->s_umount, &type->s_umount_key);
+ /*
+ * sget() can have s_umount recursion.
+ *
+ * When it cannot find a suitable sb, it allocates a new
+ * one (this one), and tries again to find a suitable old
+ * one.
+ *
+ * In case that succeeds, it will acquire the s_umount
+ * lock of the old one. Since these are clearly distrinct
+ * locks, and this object isn't exposed yet, there's no
+ * risk of deadlocks.
+ *
+ * Annotate this by putting this lock in a different
+ * subclass.
+ */
+ down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
+ s->s_count = 1;
+ atomic_set(&s->s_active, 1);
+ mutex_init(&s->s_vfs_rename_mutex);
+ lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
+ mutex_init(&s->s_dquot.dqio_mutex);
+ mutex_init(&s->s_dquot.dqonoff_mutex);
+ init_rwsem(&s->s_dquot.dqptr_sem);
+ s->s_maxbytes = MAX_NON_LFS;
+ s->s_op = &default_op;
+ s->s_time_gran = 1000000000;
+ s->cleancache_poolid = -1;
+
+ s->s_shrink.seeks = DEFAULT_SEEKS;
+ s->s_shrink.scan_objects = super_cache_scan;
+ s->s_shrink.count_objects = super_cache_count;
+ s->s_shrink.batch = 1024;
+ s->s_shrink.flags = SHRINKER_NUMA_AWARE;
return s;
-err_out_dentry_lru:
- list_lru_destroy(&s->s_dentry_lru);
-err_out:
- security_sb_free(s);
-#ifdef CONFIG_SMP
- if (s->s_files)
- free_percpu(s->s_files);
-#endif
- destroy_sb_writers(s);
-out_free_sb:
- kfree(s);
- s = NULL;
- goto out;
-}
-
-/**
- * destroy_super - frees a superblock
- * @s: superblock to free
- *
- * Frees a superblock.
- */
-static inline void destroy_super(struct super_block *s)
-{
- list_lru_destroy(&s->s_dentry_lru);
- list_lru_destroy(&s->s_inode_lru);
-#ifdef CONFIG_SMP
- free_percpu(s->s_files);
-#endif
- destroy_sb_writers(s);
- security_sb_free(s);
- WARN_ON(!list_empty(&s->s_mounts));
- kfree(s->s_subtype);
- kfree(s->s_options);
- kfree(s);
+fail:
+ destroy_super(s);
+ return NULL;
}
/* Superblock refcounting */
@@ -321,10 +280,8 @@ void deactivate_locked_super(struct super_block *s)
struct file_system_type *fs = s->s_type;
if (atomic_dec_and_test(&s->s_active)) {
cleancache_invalidate_fs(s);
- fs->kill_sb(s);
-
- /* caches are now gone, we can safely kill the shrinker now */
unregister_shrinker(&s->s_shrink);
+ fs->kill_sb(s);
put_filesystem(fs);
put_super(s);
@@ -748,7 +705,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
if (flags & MS_RDONLY)
acct_auto_close(sb);
shrink_dcache_sb(sb);
- sync_filesystem(sb);
remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
@@ -756,7 +712,8 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
make sure there are no rw files opened */
if (remount_ro) {
if (force) {
- mark_files_ro(sb);
+ sb->s_readonly_remount = 1;
+ smp_wmb();
} else {
retval = sb_prepare_remount_readonly(sb);
if (retval)
@@ -845,7 +802,10 @@ void emergency_remount(void)
static DEFINE_IDA(unnamed_dev_ida);
static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */
-static int unnamed_dev_start = 0; /* don't bother trying below it */
+/* Many userspace utilities consider an FSID of 0 invalid.
+ * Always return at least 1 from get_anon_bdev.
+ */
+static int unnamed_dev_start = 1;
int get_anon_bdev(dev_t *p)
{
diff --git a/fs/sync.c b/fs/sync.c
index 905f3f6b3d8..b28d1dd10e8 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -177,7 +177,7 @@ SYSCALL_DEFINE1(syncfs, int, fd)
*/
int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
{
- if (!file->f_op || !file->f_op->fsync)
+ if (!file->f_op->fsync)
return -EINVAL;
return file->f_op->fsync(file, start, end, datasync);
}
@@ -219,23 +219,6 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
return do_fsync(fd, 1);
}
-/**
- * generic_write_sync - perform syncing after a write if file / inode is sync
- * @file: file to which the write happened
- * @pos: offset where the write started
- * @count: length of the write
- *
- * This is just a simple wrapper about our general syncing function.
- */
-int generic_write_sync(struct file *file, loff_t pos, loff_t count)
-{
- if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
- return 0;
- return vfs_fsync_range(file, pos, pos + count - 1,
- (file->f_flags & __O_SYNC) ? 0 : 1);
-}
-EXPORT_SYMBOL(generic_write_sync);
-
/*
* sys_sync_file_range() permits finely controlled syncing over a segment of
* a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is
diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig
index 8c41feacbac..b2756014508 100644
--- a/fs/sysfs/Kconfig
+++ b/fs/sysfs/Kconfig
@@ -1,6 +1,7 @@
config SYSFS
bool "sysfs file system support" if EXPERT
default y
+ select KERNFS
help
The sysfs filesystem is a virtual filesystem that the kernel uses to
export internal kernel objects, their attributes, and their
diff --git a/fs/sysfs/Makefile b/fs/sysfs/Makefile
index 7a1ceb946b8..6eff6e1205a 100644
--- a/fs/sysfs/Makefile
+++ b/fs/sysfs/Makefile
@@ -2,5 +2,4 @@
# Makefile for the sysfs virtual filesystem
#
-obj-y := inode.o file.o dir.o symlink.o mount.o bin.o \
- group.o
+obj-y := file.o dir.o symlink.o mount.o group.o
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
deleted file mode 100644
index c590cabd57b..00000000000
--- a/fs/sysfs/bin.c
+++ /dev/null
@@ -1,502 +0,0 @@
-/*
- * fs/sysfs/bin.c - sysfs binary file implementation
- *
- * Copyright (c) 2003 Patrick Mochel
- * Copyright (c) 2003 Matthew Wilcox
- * Copyright (c) 2004 Silicon Graphics, Inc.
- * Copyright (c) 2007 SUSE Linux Products GmbH
- * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
- *
- * This file is released under the GPLv2.
- *
- * Please see Documentation/filesystems/sysfs.txt for more information.
- */
-
-#undef DEBUG
-
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/kernel.h>
-#include <linux/kobject.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/mutex.h>
-#include <linux/mm.h>
-#include <linux/uaccess.h>
-
-#include "sysfs.h"
-
-/*
- * There's one bin_buffer for each open file.
- *
- * filp->private_data points to bin_buffer and
- * sysfs_dirent->s_bin_attr.buffers points to a the bin_buffer s
- * sysfs_dirent->s_bin_attr.buffers is protected by sysfs_bin_lock
- */
-static DEFINE_MUTEX(sysfs_bin_lock);
-
-struct bin_buffer {
- struct mutex mutex;
- void *buffer;
- int mmapped;
- const struct vm_operations_struct *vm_ops;
- struct file *file;
- struct hlist_node list;
-};
-
-static int
-fill_read(struct file *file, char *buffer, loff_t off, size_t count)
-{
- struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
- struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
- struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
- int rc;
-
- /* need attr_sd for attr, its parent for kobj */
- if (!sysfs_get_active(attr_sd))
- return -ENODEV;
-
- rc = -EIO;
- if (attr->read)
- rc = attr->read(file, kobj, attr, buffer, off, count);
-
- sysfs_put_active(attr_sd);
-
- return rc;
-}
-
-static ssize_t
-read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off)
-{
- struct bin_buffer *bb = file->private_data;
- int size = file_inode(file)->i_size;
- loff_t offs = *off;
- int count = min_t(size_t, bytes, PAGE_SIZE);
- char *temp;
-
- if (!bytes)
- return 0;
-
- if (size) {
- if (offs > size)
- return 0;
- if (offs + count > size)
- count = size - offs;
- }
-
- temp = kmalloc(count, GFP_KERNEL);
- if (!temp)
- return -ENOMEM;
-
- mutex_lock(&bb->mutex);
-
- count = fill_read(file, bb->buffer, offs, count);
- if (count < 0) {
- mutex_unlock(&bb->mutex);
- goto out_free;
- }
-
- memcpy(temp, bb->buffer, count);
-
- mutex_unlock(&bb->mutex);
-
- if (copy_to_user(userbuf, temp, count)) {
- count = -EFAULT;
- goto out_free;
- }
-
- pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count);
-
- *off = offs + count;
-
- out_free:
- kfree(temp);
- return count;
-}
-
-static int
-flush_write(struct file *file, char *buffer, loff_t offset, size_t count)
-{
- struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
- struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
- struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
- int rc;
-
- /* need attr_sd for attr, its parent for kobj */
- if (!sysfs_get_active(attr_sd))
- return -ENODEV;
-
- rc = -EIO;
- if (attr->write)
- rc = attr->write(file, kobj, attr, buffer, offset, count);
-
- sysfs_put_active(attr_sd);
-
- return rc;
-}
-
-static ssize_t write(struct file *file, const char __user *userbuf,
- size_t bytes, loff_t *off)
-{
- struct bin_buffer *bb = file->private_data;
- int size = file_inode(file)->i_size;
- loff_t offs = *off;
- int count = min_t(size_t, bytes, PAGE_SIZE);
- char *temp;
-
- if (!bytes)
- return 0;
-
- if (size) {
- if (offs > size)
- return 0;
- if (offs + count > size)
- count = size - offs;
- }
-
- temp = memdup_user(userbuf, count);
- if (IS_ERR(temp))
- return PTR_ERR(temp);
-
- mutex_lock(&bb->mutex);
-
- memcpy(bb->buffer, temp, count);
-
- count = flush_write(file, bb->buffer, offs, count);
- mutex_unlock(&bb->mutex);
-
- if (count > 0)
- *off = offs + count;
-
- kfree(temp);
- return count;
-}
-
-static void bin_vma_open(struct vm_area_struct *vma)
-{
- struct file *file = vma->vm_file;
- struct bin_buffer *bb = file->private_data;
- struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-
- if (!bb->vm_ops)
- return;
-
- if (!sysfs_get_active(attr_sd))
- return;
-
- if (bb->vm_ops->open)
- bb->vm_ops->open(vma);
-
- sysfs_put_active(attr_sd);
-}
-
-static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct file *file = vma->vm_file;
- struct bin_buffer *bb = file->private_data;
- struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
- int ret;
-
- if (!bb->vm_ops)
- return VM_FAULT_SIGBUS;
-
- if (!sysfs_get_active(attr_sd))
- return VM_FAULT_SIGBUS;
-
- ret = VM_FAULT_SIGBUS;
- if (bb->vm_ops->fault)
- ret = bb->vm_ops->fault(vma, vmf);
-
- sysfs_put_active(attr_sd);
- return ret;
-}
-
-static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct file *file = vma->vm_file;
- struct bin_buffer *bb = file->private_data;
- struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
- int ret;
-
- if (!bb->vm_ops)
- return VM_FAULT_SIGBUS;
-
- if (!sysfs_get_active(attr_sd))
- return VM_FAULT_SIGBUS;
-
- ret = 0;
- if (bb->vm_ops->page_mkwrite)
- ret = bb->vm_ops->page_mkwrite(vma, vmf);
- else
- file_update_time(file);
-
- sysfs_put_active(attr_sd);
- return ret;
-}
-
-static int bin_access(struct vm_area_struct *vma, unsigned long addr,
- void *buf, int len, int write)
-{
- struct file *file = vma->vm_file;
- struct bin_buffer *bb = file->private_data;
- struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
- int ret;
-
- if (!bb->vm_ops)
- return -EINVAL;
-
- if (!sysfs_get_active(attr_sd))
- return -EINVAL;
-
- ret = -EINVAL;
- if (bb->vm_ops->access)
- ret = bb->vm_ops->access(vma, addr, buf, len, write);
-
- sysfs_put_active(attr_sd);
- return ret;
-}
-
-#ifdef CONFIG_NUMA
-static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
-{
- struct file *file = vma->vm_file;
- struct bin_buffer *bb = file->private_data;
- struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
- int ret;
-
- if (!bb->vm_ops)
- return 0;
-
- if (!sysfs_get_active(attr_sd))
- return -EINVAL;
-
- ret = 0;
- if (bb->vm_ops->set_policy)
- ret = bb->vm_ops->set_policy(vma, new);
-
- sysfs_put_active(attr_sd);
- return ret;
-}
-
-static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
- unsigned long addr)
-{
- struct file *file = vma->vm_file;
- struct bin_buffer *bb = file->private_data;
- struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
- struct mempolicy *pol;
-
- if (!bb->vm_ops)
- return vma->vm_policy;
-
- if (!sysfs_get_active(attr_sd))
- return vma->vm_policy;
-
- pol = vma->vm_policy;
- if (bb->vm_ops->get_policy)
- pol = bb->vm_ops->get_policy(vma, addr);
-
- sysfs_put_active(attr_sd);
- return pol;
-}
-
-static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
- const nodemask_t *to, unsigned long flags)
-{
- struct file *file = vma->vm_file;
- struct bin_buffer *bb = file->private_data;
- struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
- int ret;
-
- if (!bb->vm_ops)
- return 0;
-
- if (!sysfs_get_active(attr_sd))
- return 0;
-
- ret = 0;
- if (bb->vm_ops->migrate)
- ret = bb->vm_ops->migrate(vma, from, to, flags);
-
- sysfs_put_active(attr_sd);
- return ret;
-}
-#endif
-
-static const struct vm_operations_struct bin_vm_ops = {
- .open = bin_vma_open,
- .fault = bin_fault,
- .page_mkwrite = bin_page_mkwrite,
- .access = bin_access,
-#ifdef CONFIG_NUMA
- .set_policy = bin_set_policy,
- .get_policy = bin_get_policy,
- .migrate = bin_migrate,
-#endif
-};
-
-static int mmap(struct file *file, struct vm_area_struct *vma)
-{
- struct bin_buffer *bb = file->private_data;
- struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
- struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
- struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
- int rc;
-
- mutex_lock(&bb->mutex);
-
- /* need attr_sd for attr, its parent for kobj */
- rc = -ENODEV;
- if (!sysfs_get_active(attr_sd))
- goto out_unlock;
-
- rc = -EINVAL;
- if (!attr->mmap)
- goto out_put;
-
- rc = attr->mmap(file, kobj, attr, vma);
- if (rc)
- goto out_put;
-
- /*
- * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
- * to satisfy versions of X which crash if the mmap fails: that
- * substitutes a new vm_file, and we don't then want bin_vm_ops.
- */
- if (vma->vm_file != file)
- goto out_put;
-
- rc = -EINVAL;
- if (bb->mmapped && bb->vm_ops != vma->vm_ops)
- goto out_put;
-
- /*
- * It is not possible to successfully wrap close.
- * So error if someone is trying to use close.
- */
- rc = -EINVAL;
- if (vma->vm_ops && vma->vm_ops->close)
- goto out_put;
-
- rc = 0;
- bb->mmapped = 1;
- bb->vm_ops = vma->vm_ops;
- vma->vm_ops = &bin_vm_ops;
-out_put:
- sysfs_put_active(attr_sd);
-out_unlock:
- mutex_unlock(&bb->mutex);
-
- return rc;
-}
-
-static int open(struct inode *inode, struct file *file)
-{
- struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
- struct bin_attribute *attr = attr_sd->s_bin_attr.bin_attr;
- struct bin_buffer *bb = NULL;
- int error;
-
- /* binary file operations requires both @sd and its parent */
- if (!sysfs_get_active(attr_sd))
- return -ENODEV;
-
- error = -EACCES;
- if ((file->f_mode & FMODE_WRITE) && !(attr->write || attr->mmap))
- goto err_out;
- if ((file->f_mode & FMODE_READ) && !(attr->read || attr->mmap))
- goto err_out;
-
- error = -ENOMEM;
- bb = kzalloc(sizeof(*bb), GFP_KERNEL);
- if (!bb)
- goto err_out;
-
- bb->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!bb->buffer)
- goto err_out;
-
- mutex_init(&bb->mutex);
- bb->file = file;
- file->private_data = bb;
-
- mutex_lock(&sysfs_bin_lock);
- hlist_add_head(&bb->list, &attr_sd->s_bin_attr.buffers);
- mutex_unlock(&sysfs_bin_lock);
-
- /* open succeeded, put active references */
- sysfs_put_active(attr_sd);
- return 0;
-
- err_out:
- sysfs_put_active(attr_sd);
- kfree(bb);
- return error;
-}
-
-static int release(struct inode *inode, struct file *file)
-{
- struct bin_buffer *bb = file->private_data;
-
- mutex_lock(&sysfs_bin_lock);
- hlist_del(&bb->list);
- mutex_unlock(&sysfs_bin_lock);
-
- kfree(bb->buffer);
- kfree(bb);
- return 0;
-}
-
-const struct file_operations bin_fops = {
- .read = read,
- .write = write,
- .mmap = mmap,
- .llseek = generic_file_llseek,
- .open = open,
- .release = release,
-};
-
-
-void unmap_bin_file(struct sysfs_dirent *attr_sd)
-{
- struct bin_buffer *bb;
-
- if (sysfs_type(attr_sd) != SYSFS_KOBJ_BIN_ATTR)
- return;
-
- mutex_lock(&sysfs_bin_lock);
-
- hlist_for_each_entry(bb, &attr_sd->s_bin_attr.buffers, list) {
- struct inode *inode = file_inode(bb->file);
-
- unmap_mapping_range(inode->i_mapping, 0, 0, 1);
- }
-
- mutex_unlock(&sysfs_bin_lock);
-}
-
-/**
- * sysfs_create_bin_file - create binary file for object.
- * @kobj: object.
- * @attr: attribute descriptor.
- */
-int sysfs_create_bin_file(struct kobject *kobj,
- const struct bin_attribute *attr)
-{
- BUG_ON(!kobj || !kobj->sd || !attr);
-
- return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR);
-}
-EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
-
-/**
- * sysfs_remove_bin_file - remove binary file for object.
- * @kobj: object.
- * @attr: attribute descriptor.
- */
-void sysfs_remove_bin_file(struct kobject *kobj,
- const struct bin_attribute *attr)
-{
- sysfs_hash_and_remove(kobj->sd, NULL, attr->attr.name);
-}
-EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 4d83cedb9fc..0b45ff42f37 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -13,835 +13,55 @@
#undef DEBUG
#include <linux/fs.h>
-#include <linux/mount.h>
-#include <linux/module.h>
#include <linux/kobject.h>
-#include <linux/namei.h>
-#include <linux/idr.h>
-#include <linux/completion.h>
-#include <linux/mutex.h>
#include <linux/slab.h>
-#include <linux/security.h>
-#include <linux/hash.h>
#include "sysfs.h"
-DEFINE_MUTEX(sysfs_mutex);
-DEFINE_SPINLOCK(sysfs_assoc_lock);
+DEFINE_SPINLOCK(sysfs_symlink_target_lock);
-#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb);
-
-static DEFINE_SPINLOCK(sysfs_ino_lock);
-static DEFINE_IDA(sysfs_ino_ida);
-
-/**
- * sysfs_name_hash
- * @ns: Namespace tag to hash
- * @name: Null terminated string to hash
- *
- * Returns 31 bit hash of ns + name (so it fits in an off_t )
- */
-static unsigned int sysfs_name_hash(const void *ns, const char *name)
-{
- unsigned long hash = init_name_hash();
- unsigned int len = strlen(name);
- while (len--)
- hash = partial_name_hash(*name++, hash);
- hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
- hash &= 0x7fffffffU;
- /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
- if (hash < 1)
- hash += 2;
- if (hash >= INT_MAX)
- hash = INT_MAX - 1;
- return hash;
-}
-
-static int sysfs_name_compare(unsigned int hash, const void *ns,
- const char *name, const struct sysfs_dirent *sd)
-{
- if (hash != sd->s_hash)
- return hash - sd->s_hash;
- if (ns != sd->s_ns)
- return ns - sd->s_ns;
- return strcmp(name, sd->s_name);
-}
-
-static int sysfs_sd_compare(const struct sysfs_dirent *left,
- const struct sysfs_dirent *right)
-{
- return sysfs_name_compare(left->s_hash, left->s_ns, left->s_name,
- right);
-}
-
-/**
- * sysfs_link_sibling - link sysfs_dirent into sibling rbtree
- * @sd: sysfs_dirent of interest
- *
- * Link @sd into its sibling rbtree which starts from
- * sd->s_parent->s_dir.children.
- *
- * Locking:
- * mutex_lock(sysfs_mutex)
- *
- * RETURNS:
- * 0 on susccess -EEXIST on failure.
- */
-static int sysfs_link_sibling(struct sysfs_dirent *sd)
-{
- struct rb_node **node = &sd->s_parent->s_dir.children.rb_node;
- struct rb_node *parent = NULL;
-
- if (sysfs_type(sd) == SYSFS_DIR)
- sd->s_parent->s_dir.subdirs++;
-
- while (*node) {
- struct sysfs_dirent *pos;
- int result;
-
- pos = to_sysfs_dirent(*node);
- parent = *node;
- result = sysfs_sd_compare(sd, pos);
- if (result < 0)
- node = &pos->s_rb.rb_left;
- else if (result > 0)
- node = &pos->s_rb.rb_right;
- else
- return -EEXIST;
- }
- /* add new node and rebalance the tree */
- rb_link_node(&sd->s_rb, parent, node);
- rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children);
- return 0;
-}
-
-/**
- * sysfs_unlink_sibling - unlink sysfs_dirent from sibling rbtree
- * @sd: sysfs_dirent of interest
- *
- * Unlink @sd from its sibling rbtree which starts from
- * sd->s_parent->s_dir.children.
- *
- * Locking:
- * mutex_lock(sysfs_mutex)
- */
-static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
-{
- if (sysfs_type(sd) == SYSFS_DIR)
- sd->s_parent->s_dir.subdirs--;
-
- rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children);
-}
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-
-/* Test for attributes that want to ignore lockdep for read-locking */
-static bool ignore_lockdep(struct sysfs_dirent *sd)
-{
- return sysfs_type(sd) == SYSFS_KOBJ_ATTR &&
- sd->s_attr.attr->ignore_lockdep;
-}
-
-#else
-
-static inline bool ignore_lockdep(struct sysfs_dirent *sd)
+void sysfs_warn_dup(struct kernfs_node *parent, const char *name)
{
- return true;
-}
-
-#endif
-
-/**
- * sysfs_get_active - get an active reference to sysfs_dirent
- * @sd: sysfs_dirent to get an active reference to
- *
- * Get an active reference of @sd. This function is noop if @sd
- * is NULL.
- *
- * RETURNS:
- * Pointer to @sd on success, NULL on failure.
- */
-struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
-{
- if (unlikely(!sd))
- return NULL;
-
- if (!atomic_inc_unless_negative(&sd->s_active))
- return NULL;
-
- if (likely(!ignore_lockdep(sd)))
- rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_);
- return sd;
-}
-
-/**
- * sysfs_put_active - put an active reference to sysfs_dirent
- * @sd: sysfs_dirent to put an active reference to
- *
- * Put an active reference to @sd. This function is noop if @sd
- * is NULL.
- */
-void sysfs_put_active(struct sysfs_dirent *sd)
-{
- int v;
+ char *buf, *path = NULL;
- if (unlikely(!sd))
- return;
+ buf = kzalloc(PATH_MAX, GFP_KERNEL);
+ if (buf)
+ path = kernfs_path(parent, buf, PATH_MAX);
- if (likely(!ignore_lockdep(sd)))
- rwsem_release(&sd->dep_map, 1, _RET_IP_);
- v = atomic_dec_return(&sd->s_active);
- if (likely(v != SD_DEACTIVATED_BIAS))
- return;
+ WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s/%s'\n",
+ path, name);
- /* atomic_dec_return() is a mb(), we'll always see the updated
- * sd->u.completion.
- */
- complete(sd->u.completion);
+ kfree(buf);
}
/**
- * sysfs_deactivate - deactivate sysfs_dirent
- * @sd: sysfs_dirent to deactivate
- *
- * Deny new active references and drain existing ones.
+ * sysfs_create_dir_ns - create a directory for an object with a namespace tag
+ * @kobj: object we're creating directory for
+ * @ns: the namespace tag to use
*/
-static void sysfs_deactivate(struct sysfs_dirent *sd)
-{
- DECLARE_COMPLETION_ONSTACK(wait);
- int v;
-
- BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED));
-
- if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
- return;
-
- sd->u.completion = (void *)&wait;
-
- rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
- /* atomic_add_return() is a mb(), put_active() will always see
- * the updated sd->u.completion.
- */
- v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active);
-
- if (v != SD_DEACTIVATED_BIAS) {
- lock_contended(&sd->dep_map, _RET_IP_);
- wait_for_completion(&wait);
- }
-
- lock_acquired(&sd->dep_map, _RET_IP_);
- rwsem_release(&sd->dep_map, 1, _RET_IP_);
-}
-
-static int sysfs_alloc_ino(unsigned int *pino)
-{
- int ino, rc;
-
- retry:
- spin_lock(&sysfs_ino_lock);
- rc = ida_get_new_above(&sysfs_ino_ida, 2, &ino);
- spin_unlock(&sysfs_ino_lock);
-
- if (rc == -EAGAIN) {
- if (ida_pre_get(&sysfs_ino_ida, GFP_KERNEL))
- goto retry;
- rc = -ENOMEM;
- }
-
- *pino = ino;
- return rc;
-}
-
-static void sysfs_free_ino(unsigned int ino)
+int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
{
- spin_lock(&sysfs_ino_lock);
- ida_remove(&sysfs_ino_ida, ino);
- spin_unlock(&sysfs_ino_lock);
-}
-
-void release_sysfs_dirent(struct sysfs_dirent *sd)
-{
- struct sysfs_dirent *parent_sd;
-
- repeat:
- /* Moving/renaming is always done while holding reference.
- * sd->s_parent won't change beneath us.
- */
- parent_sd = sd->s_parent;
-
- WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED),
- "sysfs: free using entry: %s/%s\n",
- parent_sd ? parent_sd->s_name : "", sd->s_name);
-
- if (sysfs_type(sd) == SYSFS_KOBJ_LINK)
- sysfs_put(sd->s_symlink.target_sd);
- if (sysfs_type(sd) & SYSFS_COPY_NAME)
- kfree(sd->s_name);
- if (sd->s_iattr && sd->s_iattr->ia_secdata)
- security_release_secctx(sd->s_iattr->ia_secdata,
- sd->s_iattr->ia_secdata_len);
- kfree(sd->s_iattr);
- sysfs_free_ino(sd->s_ino);
- kmem_cache_free(sysfs_dir_cachep, sd);
-
- sd = parent_sd;
- if (sd && atomic_dec_and_test(&sd->s_count))
- goto repeat;
-}
-
-static int sysfs_dentry_delete(const struct dentry *dentry)
-{
- struct sysfs_dirent *sd = dentry->d_fsdata;
- return !(sd && !(sd->s_flags & SYSFS_FLAG_REMOVED));
-}
-
-static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags)
-{
- struct sysfs_dirent *sd;
- int type;
-
- if (flags & LOOKUP_RCU)
- return -ECHILD;
-
- sd = dentry->d_fsdata;
- mutex_lock(&sysfs_mutex);
-
- /* The sysfs dirent has been deleted */
- if (sd->s_flags & SYSFS_FLAG_REMOVED)
- goto out_bad;
-
- /* The sysfs dirent has been moved? */
- if (dentry->d_parent->d_fsdata != sd->s_parent)
- goto out_bad;
-
- /* The sysfs dirent has been renamed */
- if (strcmp(dentry->d_name.name, sd->s_name) != 0)
- goto out_bad;
-
- /* The sysfs dirent has been moved to a different namespace */
- type = KOBJ_NS_TYPE_NONE;
- if (sd->s_parent) {
- type = sysfs_ns_type(sd->s_parent);
- if (type != KOBJ_NS_TYPE_NONE &&
- sysfs_info(dentry->d_sb)->ns[type] != sd->s_ns)
- goto out_bad;
- }
-
- mutex_unlock(&sysfs_mutex);
-out_valid:
- return 1;
-out_bad:
- /* Remove the dentry from the dcache hashes.
- * If this is a deleted dentry we use d_drop instead of d_delete
- * so sysfs doesn't need to cope with negative dentries.
- *
- * If this is a dentry that has simply been renamed we
- * use d_drop to remove it from the dcache lookup on its
- * old parent. If this dentry persists later when a lookup
- * is performed at its new name the dentry will be readded
- * to the dcache hashes.
- */
- mutex_unlock(&sysfs_mutex);
-
- /* If we have submounts we must allow the vfs caches
- * to lie about the state of the filesystem to prevent
- * leaks and other nasty things.
- */
- if (check_submounts_and_drop(dentry) != 0)
- goto out_valid;
-
- return 0;
-}
-
-static void sysfs_dentry_release(struct dentry *dentry)
-{
- sysfs_put(dentry->d_fsdata);
-}
-
-const struct dentry_operations sysfs_dentry_ops = {
- .d_revalidate = sysfs_dentry_revalidate,
- .d_delete = sysfs_dentry_delete,
- .d_release = sysfs_dentry_release,
-};
-
-struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
-{
- char *dup_name = NULL;
- struct sysfs_dirent *sd;
-
- if (type & SYSFS_COPY_NAME) {
- name = dup_name = kstrdup(name, GFP_KERNEL);
- if (!name)
- return NULL;
- }
-
- sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL);
- if (!sd)
- goto err_out1;
-
- if (sysfs_alloc_ino(&sd->s_ino))
- goto err_out2;
-
- atomic_set(&sd->s_count, 1);
- atomic_set(&sd->s_active, 0);
-
- sd->s_name = name;
- sd->s_mode = mode;
- sd->s_flags = type | SYSFS_FLAG_REMOVED;
-
- return sd;
-
- err_out2:
- kmem_cache_free(sysfs_dir_cachep, sd);
- err_out1:
- kfree(dup_name);
- return NULL;
-}
-
-/**
- * sysfs_addrm_start - prepare for sysfs_dirent add/remove
- * @acxt: pointer to sysfs_addrm_cxt to be used
- * @parent_sd: parent sysfs_dirent
- *
- * This function is called when the caller is about to add or
- * remove sysfs_dirent under @parent_sd. This function acquires
- * sysfs_mutex. @acxt is used to keep and pass context to
- * other addrm functions.
- *
- * LOCKING:
- * Kernel thread context (may sleep). sysfs_mutex is locked on
- * return.
- */
-void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
- struct sysfs_dirent *parent_sd)
-{
- memset(acxt, 0, sizeof(*acxt));
- acxt->parent_sd = parent_sd;
-
- mutex_lock(&sysfs_mutex);
-}
-
-/**
- * __sysfs_add_one - add sysfs_dirent to parent without warning
- * @acxt: addrm context to use
- * @sd: sysfs_dirent to be added
- *
- * Get @acxt->parent_sd and set sd->s_parent to it and increment
- * nlink of parent inode if @sd is a directory and link into the
- * children list of the parent.
- *
- * This function should be called between calls to
- * sysfs_addrm_start() and sysfs_addrm_finish() and should be
- * passed the same @acxt as passed to sysfs_addrm_start().
- *
- * LOCKING:
- * Determined by sysfs_addrm_start().
- *
- * RETURNS:
- * 0 on success, -EEXIST if entry with the given name already
- * exists.
- */
-int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
-{
- struct sysfs_inode_attrs *ps_iattr;
- int ret;
-
- if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) {
- WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
- sysfs_ns_type(acxt->parent_sd) ? "required" : "invalid",
- acxt->parent_sd->s_name, sd->s_name);
- return -EINVAL;
- }
-
- sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name);
- sd->s_parent = sysfs_get(acxt->parent_sd);
-
- ret = sysfs_link_sibling(sd);
- if (ret)
- return ret;
-
- /* Update timestamps on the parent */
- ps_iattr = acxt->parent_sd->s_iattr;
- if (ps_iattr) {
- struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
- ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
- }
-
- /* Mark the entry added into directory tree */
- sd->s_flags &= ~SYSFS_FLAG_REMOVED;
-
- return 0;
-}
-
-/**
- * sysfs_pathname - return full path to sysfs dirent
- * @sd: sysfs_dirent whose path we want
- * @path: caller allocated buffer of size PATH_MAX
- *
- * Gives the name "/" to the sysfs_root entry; any path returned
- * is relative to wherever sysfs is mounted.
- */
-static char *sysfs_pathname(struct sysfs_dirent *sd, char *path)
-{
- if (sd->s_parent) {
- sysfs_pathname(sd->s_parent, path);
- strlcat(path, "/", PATH_MAX);
- }
- strlcat(path, sd->s_name, PATH_MAX);
- return path;
-}
-
-/**
- * sysfs_add_one - add sysfs_dirent to parent
- * @acxt: addrm context to use
- * @sd: sysfs_dirent to be added
- *
- * Get @acxt->parent_sd and set sd->s_parent to it and increment
- * nlink of parent inode if @sd is a directory and link into the
- * children list of the parent.
- *
- * This function should be called between calls to
- * sysfs_addrm_start() and sysfs_addrm_finish() and should be
- * passed the same @acxt as passed to sysfs_addrm_start().
- *
- * LOCKING:
- * Determined by sysfs_addrm_start().
- *
- * RETURNS:
- * 0 on success, -EEXIST if entry with the given name already
- * exists.
- */
-int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
-{
- int ret;
-
- ret = __sysfs_add_one(acxt, sd);
- if (ret == -EEXIST) {
- char *path = kzalloc(PATH_MAX, GFP_KERNEL);
- WARN(1, KERN_WARNING
- "sysfs: cannot create duplicate filename '%s'\n",
- (path == NULL) ? sd->s_name
- : (sysfs_pathname(acxt->parent_sd, path),
- strlcat(path, "/", PATH_MAX),
- strlcat(path, sd->s_name, PATH_MAX),
- path));
- kfree(path);
- }
-
- return ret;
-}
-
-/**
- * sysfs_remove_one - remove sysfs_dirent from parent
- * @acxt: addrm context to use
- * @sd: sysfs_dirent to be removed
- *
- * Mark @sd removed and drop nlink of parent inode if @sd is a
- * directory. @sd is unlinked from the children list.
- *
- * This function should be called between calls to
- * sysfs_addrm_start() and sysfs_addrm_finish() and should be
- * passed the same @acxt as passed to sysfs_addrm_start().
- *
- * LOCKING:
- * Determined by sysfs_addrm_start().
- */
-void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
-{
- struct sysfs_inode_attrs *ps_iattr;
-
- BUG_ON(sd->s_flags & SYSFS_FLAG_REMOVED);
-
- sysfs_unlink_sibling(sd);
-
- /* Update timestamps on the parent */
- ps_iattr = acxt->parent_sd->s_iattr;
- if (ps_iattr) {
- struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
- ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
- }
-
- sd->s_flags |= SYSFS_FLAG_REMOVED;
- sd->u.removed_list = acxt->removed;
- acxt->removed = sd;
-}
-
-/**
- * sysfs_addrm_finish - finish up sysfs_dirent add/remove
- * @acxt: addrm context to finish up
- *
- * Finish up sysfs_dirent add/remove. Resources acquired by
- * sysfs_addrm_start() are released and removed sysfs_dirents are
- * cleaned up.
- *
- * LOCKING:
- * sysfs_mutex is released.
- */
-void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
-{
- /* release resources acquired by sysfs_addrm_start() */
- mutex_unlock(&sysfs_mutex);
-
- /* kill removed sysfs_dirents */
- while (acxt->removed) {
- struct sysfs_dirent *sd = acxt->removed;
-
- acxt->removed = sd->u.removed_list;
-
- sysfs_deactivate(sd);
- unmap_bin_file(sd);
- sysfs_put(sd);
- }
-}
-
-/**
- * sysfs_find_dirent - find sysfs_dirent with the given name
- * @parent_sd: sysfs_dirent to search under
- * @name: name to look for
- *
- * Look for sysfs_dirent with name @name under @parent_sd.
- *
- * LOCKING:
- * mutex_lock(sysfs_mutex)
- *
- * RETURNS:
- * Pointer to sysfs_dirent if found, NULL if not.
- */
-struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
- const void *ns,
- const unsigned char *name)
-{
- struct rb_node *node = parent_sd->s_dir.children.rb_node;
- unsigned int hash;
-
- if (!!sysfs_ns_type(parent_sd) != !!ns) {
- WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
- sysfs_ns_type(parent_sd) ? "required" : "invalid",
- parent_sd->s_name, name);
- return NULL;
- }
-
- hash = sysfs_name_hash(ns, name);
- while (node) {
- struct sysfs_dirent *sd;
- int result;
-
- sd = to_sysfs_dirent(node);
- result = sysfs_name_compare(hash, ns, name, sd);
- if (result < 0)
- node = node->rb_left;
- else if (result > 0)
- node = node->rb_right;
- else
- return sd;
- }
- return NULL;
-}
-
-/**
- * sysfs_get_dirent - find and get sysfs_dirent with the given name
- * @parent_sd: sysfs_dirent to search under
- * @name: name to look for
- *
- * Look for sysfs_dirent with name @name under @parent_sd and get
- * it if found.
- *
- * LOCKING:
- * Kernel thread context (may sleep). Grabs sysfs_mutex.
- *
- * RETURNS:
- * Pointer to sysfs_dirent if found, NULL if not.
- */
-struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
- const void *ns,
- const unsigned char *name)
-{
- struct sysfs_dirent *sd;
-
- mutex_lock(&sysfs_mutex);
- sd = sysfs_find_dirent(parent_sd, ns, name);
- sysfs_get(sd);
- mutex_unlock(&sysfs_mutex);
-
- return sd;
-}
-EXPORT_SYMBOL_GPL(sysfs_get_dirent);
-
-static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
- enum kobj_ns_type type, const void *ns, const char *name,
- struct sysfs_dirent **p_sd)
-{
- umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
- struct sysfs_addrm_cxt acxt;
- struct sysfs_dirent *sd;
- int rc;
-
- /* allocate */
- sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
- if (!sd)
- return -ENOMEM;
-
- sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT);
- sd->s_ns = ns;
- sd->s_dir.kobj = kobj;
-
- /* link in */
- sysfs_addrm_start(&acxt, parent_sd);
- rc = sysfs_add_one(&acxt, sd);
- sysfs_addrm_finish(&acxt);
-
- if (rc == 0)
- *p_sd = sd;
- else
- sysfs_put(sd);
-
- return rc;
-}
-
-int sysfs_create_subdir(struct kobject *kobj, const char *name,
- struct sysfs_dirent **p_sd)
-{
- return create_dir(kobj, kobj->sd,
- KOBJ_NS_TYPE_NONE, NULL, name, p_sd);
-}
-
-/**
- * sysfs_read_ns_type: return associated ns_type
- * @kobj: the kobject being queried
- *
- * Each kobject can be tagged with exactly one namespace type
- * (i.e. network or user). Return the ns_type associated with
- * this object if any
- */
-static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
-{
- const struct kobj_ns_type_operations *ops;
- enum kobj_ns_type type;
-
- ops = kobj_child_ns_ops(kobj);
- if (!ops)
- return KOBJ_NS_TYPE_NONE;
-
- type = ops->type;
- BUG_ON(type <= KOBJ_NS_TYPE_NONE);
- BUG_ON(type >= KOBJ_NS_TYPES);
- BUG_ON(!kobj_ns_type_registered(type));
-
- return type;
-}
-
-/**
- * sysfs_create_dir - create a directory for an object.
- * @kobj: object we're creating directory for.
- */
-int sysfs_create_dir(struct kobject *kobj)
-{
- enum kobj_ns_type type;
- struct sysfs_dirent *parent_sd, *sd;
- const void *ns = NULL;
- int error = 0;
+ struct kernfs_node *parent, *kn;
BUG_ON(!kobj);
if (kobj->parent)
- parent_sd = kobj->parent->sd;
+ parent = kobj->parent->sd;
else
- parent_sd = &sysfs_root;
+ parent = sysfs_root_kn;
- if (!parent_sd)
+ if (!parent)
return -ENOENT;
- if (sysfs_ns_type(parent_sd))
- ns = kobj->ktype->namespace(kobj);
- type = sysfs_read_ns_type(kobj);
-
- error = create_dir(kobj, parent_sd, type, ns, kobject_name(kobj), &sd);
- if (!error)
- kobj->sd = sd;
- return error;
-}
-
-static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry,
- unsigned int flags)
-{
- struct dentry *ret = NULL;
- struct dentry *parent = dentry->d_parent;
- struct sysfs_dirent *parent_sd = parent->d_fsdata;
- struct sysfs_dirent *sd;
- struct inode *inode;
- enum kobj_ns_type type;
- const void *ns;
-
- mutex_lock(&sysfs_mutex);
-
- type = sysfs_ns_type(parent_sd);
- ns = sysfs_info(dir->i_sb)->ns[type];
-
- sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name);
-
- /* no such entry */
- if (!sd) {
- ret = ERR_PTR(-ENOENT);
- goto out_unlock;
- }
- dentry->d_fsdata = sysfs_get(sd);
-
- /* attach dentry and inode */
- inode = sysfs_get_inode(dir->i_sb, sd);
- if (!inode) {
- ret = ERR_PTR(-ENOMEM);
- goto out_unlock;
- }
-
- /* instantiate and hash dentry */
- ret = d_materialise_unique(dentry, inode);
- out_unlock:
- mutex_unlock(&sysfs_mutex);
- return ret;
-}
-
-const struct inode_operations sysfs_dir_inode_operations = {
- .lookup = sysfs_lookup,
- .permission = sysfs_permission,
- .setattr = sysfs_setattr,
- .getattr = sysfs_getattr,
- .setxattr = sysfs_setxattr,
-};
-
-static void remove_dir(struct sysfs_dirent *sd)
-{
- struct sysfs_addrm_cxt acxt;
-
- sysfs_addrm_start(&acxt, sd->s_parent);
- sysfs_remove_one(&acxt, sd);
- sysfs_addrm_finish(&acxt);
-}
-
-void sysfs_remove_subdir(struct sysfs_dirent *sd)
-{
- remove_dir(sd);
-}
-
-
-static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
-{
- struct sysfs_addrm_cxt acxt;
- struct rb_node *pos;
-
- if (!dir_sd)
- return;
-
- pr_debug("sysfs %s: removing dir\n", dir_sd->s_name);
- sysfs_addrm_start(&acxt, dir_sd);
- pos = rb_first(&dir_sd->s_dir.children);
- while (pos) {
- struct sysfs_dirent *sd = to_sysfs_dirent(pos);
- pos = rb_next(pos);
- if (sysfs_type(sd) != SYSFS_DIR)
- sysfs_remove_one(&acxt, sd);
+ kn = kernfs_create_dir_ns(parent, kobject_name(kobj),
+ S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns);
+ if (IS_ERR(kn)) {
+ if (PTR_ERR(kn) == -EEXIST)
+ sysfs_warn_dup(parent, kobject_name(kobj));
+ return PTR_ERR(kn);
}
- sysfs_addrm_finish(&acxt);
- remove_dir(dir_sd);
+ kobj->sd = kn;
+ return 0;
}
/**
@@ -852,201 +72,52 @@ static void __sysfs_remove_dir(struct sysfs_dirent *dir_sd)
* the directory before we remove the directory, and we've inlined
* what used to be sysfs_rmdir() below, instead of calling separately.
*/
-
void sysfs_remove_dir(struct kobject *kobj)
{
- struct sysfs_dirent *sd = kobj->sd;
-
- spin_lock(&sysfs_assoc_lock);
- kobj->sd = NULL;
- spin_unlock(&sysfs_assoc_lock);
-
- __sysfs_remove_dir(sd);
-}
-
-int sysfs_rename(struct sysfs_dirent *sd,
- struct sysfs_dirent *new_parent_sd, const void *new_ns,
- const char *new_name)
-{
- int error;
-
- mutex_lock(&sysfs_mutex);
-
- error = 0;
- if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) &&
- (strcmp(sd->s_name, new_name) == 0))
- goto out; /* nothing to rename */
-
- error = -EEXIST;
- if (sysfs_find_dirent(new_parent_sd, new_ns, new_name))
- goto out;
-
- /* rename sysfs_dirent */
- if (strcmp(sd->s_name, new_name) != 0) {
- error = -ENOMEM;
- new_name = kstrdup(new_name, GFP_KERNEL);
- if (!new_name)
- goto out;
-
- kfree(sd->s_name);
- sd->s_name = new_name;
- }
+ struct kernfs_node *kn = kobj->sd;
/*
- * Move to the appropriate place in the appropriate directories rbtree.
+ * In general, kboject owner is responsible for ensuring removal
+ * doesn't race with other operations and sysfs doesn't provide any
+ * protection; however, when @kobj is used as a symlink target, the
+ * symlinking entity usually doesn't own @kobj and thus has no
+ * control over removal. @kobj->sd may be removed anytime
+ * and symlink code may end up dereferencing an already freed node.
+ *
+ * sysfs_symlink_target_lock synchronizes @kobj->sd
+ * disassociation against symlink operations so that symlink code
+ * can safely dereference @kobj->sd.
*/
- sysfs_unlink_sibling(sd);
- sysfs_get(new_parent_sd);
- sysfs_put(sd->s_parent);
- sd->s_ns = new_ns;
- sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name);
- sd->s_parent = new_parent_sd;
- sysfs_link_sibling(sd);
-
- error = 0;
- out:
- mutex_unlock(&sysfs_mutex);
- return error;
-}
-
-int sysfs_rename_dir(struct kobject *kobj, const char *new_name)
-{
- struct sysfs_dirent *parent_sd = kobj->sd->s_parent;
- const void *new_ns = NULL;
-
- if (sysfs_ns_type(parent_sd))
- new_ns = kobj->ktype->namespace(kobj);
-
- return sysfs_rename(kobj->sd, parent_sd, new_ns, new_name);
-}
-
-int sysfs_move_dir(struct kobject *kobj, struct kobject *new_parent_kobj)
-{
- struct sysfs_dirent *sd = kobj->sd;
- struct sysfs_dirent *new_parent_sd;
- const void *new_ns = NULL;
-
- BUG_ON(!sd->s_parent);
- if (sysfs_ns_type(sd->s_parent))
- new_ns = kobj->ktype->namespace(kobj);
- new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
- new_parent_kobj->sd : &sysfs_root;
-
- return sysfs_rename(sd, new_parent_sd, new_ns, sd->s_name);
-}
-
-/* Relationship between s_mode and the DT_xxx types */
-static inline unsigned char dt_type(struct sysfs_dirent *sd)
-{
- return (sd->s_mode >> 12) & 15;
-}
-
-static int sysfs_dir_release(struct inode *inode, struct file *filp)
-{
- sysfs_put(filp->private_data);
- return 0;
-}
-
-static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
- struct sysfs_dirent *parent_sd, loff_t hash, struct sysfs_dirent *pos)
-{
- if (pos) {
- int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
- pos->s_parent == parent_sd &&
- hash == pos->s_hash;
- sysfs_put(pos);
- if (!valid)
- pos = NULL;
- }
- if (!pos && (hash > 1) && (hash < INT_MAX)) {
- struct rb_node *node = parent_sd->s_dir.children.rb_node;
- while (node) {
- pos = to_sysfs_dirent(node);
+ spin_lock(&sysfs_symlink_target_lock);
+ kobj->sd = NULL;
+ spin_unlock(&sysfs_symlink_target_lock);
- if (hash < pos->s_hash)
- node = node->rb_left;
- else if (hash > pos->s_hash)
- node = node->rb_right;
- else
- break;
- }
- }
- /* Skip over entries in the wrong namespace */
- while (pos && pos->s_ns != ns) {
- struct rb_node *node = rb_next(&pos->s_rb);
- if (!node)
- pos = NULL;
- else
- pos = to_sysfs_dirent(node);
+ if (kn) {
+ WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR);
+ kernfs_remove(kn);
}
- return pos;
}
-static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
- struct sysfs_dirent *parent_sd, ino_t ino, struct sysfs_dirent *pos)
+int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
+ const void *new_ns)
{
- pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
- if (pos)
- do {
- struct rb_node *node = rb_next(&pos->s_rb);
- if (!node)
- pos = NULL;
- else
- pos = to_sysfs_dirent(node);
- } while (pos && pos->s_ns != ns);
- return pos;
-}
-
-static int sysfs_readdir(struct file *file, struct dir_context *ctx)
-{
- struct dentry *dentry = file->f_path.dentry;
- struct sysfs_dirent *parent_sd = dentry->d_fsdata;
- struct sysfs_dirent *pos = file->private_data;
- enum kobj_ns_type type;
- const void *ns;
-
- type = sysfs_ns_type(parent_sd);
- ns = sysfs_info(dentry->d_sb)->ns[type];
-
- if (!dir_emit_dots(file, ctx))
- return 0;
- mutex_lock(&sysfs_mutex);
- for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos);
- pos;
- pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) {
- const char *name = pos->s_name;
- unsigned int type = dt_type(pos);
- int len = strlen(name);
- ino_t ino = pos->s_ino;
- ctx->pos = pos->s_hash;
- file->private_data = sysfs_get(pos);
+ struct kernfs_node *parent;
+ int ret;
- mutex_unlock(&sysfs_mutex);
- if (!dir_emit(ctx, name, len, ino, type))
- return 0;
- mutex_lock(&sysfs_mutex);
- }
- mutex_unlock(&sysfs_mutex);
- file->private_data = NULL;
- ctx->pos = INT_MAX;
- return 0;
+ parent = kernfs_get_parent(kobj->sd);
+ ret = kernfs_rename_ns(kobj->sd, parent, new_name, new_ns);
+ kernfs_put(parent);
+ return ret;
}
-static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence)
+int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
+ const void *new_ns)
{
- struct inode *inode = file_inode(file);
- loff_t ret;
+ struct kernfs_node *kn = kobj->sd;
+ struct kernfs_node *new_parent;
- mutex_lock(&inode->i_mutex);
- ret = generic_file_llseek(file, offset, whence);
- mutex_unlock(&inode->i_mutex);
+ new_parent = new_parent_kobj && new_parent_kobj->sd ?
+ new_parent_kobj->sd : sysfs_root_kn;
- return ret;
+ return kernfs_rename_ns(kn, new_parent, kn->name, new_ns);
}
-
-const struct file_operations sysfs_dir_operations = {
- .read = generic_read_dir,
- .iterate = sysfs_readdir,
- .release = sysfs_dir_release,
- .llseek = sysfs_dir_llseek,
-};
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 15ef5eb1366..e9ef59b3abb 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -14,77 +14,56 @@
#include <linux/kobject.h>
#include <linux/kallsyms.h>
#include <linux/slab.h>
-#include <linux/fsnotify.h>
-#include <linux/namei.h>
-#include <linux/poll.h>
#include <linux/list.h>
#include <linux/mutex.h>
-#include <linux/limits.h>
-#include <linux/uaccess.h>
+#include <linux/seq_file.h>
#include "sysfs.h"
+#include "../kernfs/kernfs-internal.h"
/*
- * There's one sysfs_buffer for each open file and one
- * sysfs_open_dirent for each sysfs_dirent with one or more open
- * files.
- *
- * filp->private_data points to sysfs_buffer and
- * sysfs_dirent->s_attr.open points to sysfs_open_dirent. s_attr.open
- * is protected by sysfs_open_dirent_lock.
+ * Determine ktype->sysfs_ops for the given kernfs_node. This function
+ * must be called while holding an active reference.
*/
-static DEFINE_SPINLOCK(sysfs_open_dirent_lock);
-
-struct sysfs_open_dirent {
- atomic_t refcnt;
- atomic_t event;
- wait_queue_head_t poll;
- struct list_head buffers; /* goes through sysfs_buffer.list */
-};
+static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn)
+{
+ struct kobject *kobj = kn->parent->priv;
-struct sysfs_buffer {
- size_t count;
- loff_t pos;
- char *page;
- const struct sysfs_ops *ops;
- struct mutex mutex;
- int needs_read_fill;
- int event;
- struct list_head list;
-};
+ if (kn->flags & KERNFS_LOCKDEP)
+ lockdep_assert_held(kn);
+ return kobj->ktype ? kobj->ktype->sysfs_ops : NULL;
+}
-/**
- * fill_read_buffer - allocate and fill buffer from object.
- * @dentry: dentry pointer.
- * @buffer: data buffer for file.
- *
- * Allocate @buffer->page, if it hasn't been already, then call the
- * kobject's show() method to fill the buffer with this attribute's
- * data.
- * This is called only once, on the file's first read unless an error
- * is returned.
+/*
+ * Reads on sysfs are handled through seq_file, which takes care of hairy
+ * details like buffering and seeking. The following function pipes
+ * sysfs_ops->show() result through seq_file.
*/
-static int fill_read_buffer(struct dentry *dentry, struct sysfs_buffer *buffer)
+static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
{
- struct sysfs_dirent *attr_sd = dentry->d_fsdata;
- struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
- const struct sysfs_ops *ops = buffer->ops;
- int ret = 0;
+ struct kernfs_open_file *of = sf->private;
+ struct kobject *kobj = of->kn->parent->priv;
+ const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
ssize_t count;
+ char *buf;
- if (!buffer->page)
- buffer->page = (char *) get_zeroed_page(GFP_KERNEL);
- if (!buffer->page)
- return -ENOMEM;
-
- /* need attr_sd for attr and ops, its parent for kobj */
- if (!sysfs_get_active(attr_sd))
- return -ENODEV;
-
- buffer->event = atomic_read(&attr_sd->s_attr.open->event);
- count = ops->show(kobj, attr_sd->s_attr.attr, buffer->page);
+ /* acquire buffer and ensure that it's >= PAGE_SIZE and clear */
+ count = seq_get_buf(sf, &buf);
+ if (count < PAGE_SIZE) {
+ seq_commit(sf, -1);
+ return 0;
+ }
+ memset(buf, 0, PAGE_SIZE);
- sysfs_put_active(attr_sd);
+ /*
+ * Invoke show(). Control may reach here via seq file lseek even
+ * if @ops->show() isn't implemented.
+ */
+ if (ops->show) {
+ count = ops->show(kobj, of->kn->priv, buf);
+ if (count < 0)
+ return count;
+ }
/*
* The code works fine with PAGE_SIZE return but it's likely to
@@ -96,486 +75,215 @@ static int fill_read_buffer(struct dentry *dentry, struct sysfs_buffer *buffer)
/* Try to struggle along */
count = PAGE_SIZE - 1;
}
- if (count >= 0) {
- buffer->needs_read_fill = 0;
- buffer->count = count;
- } else {
- ret = count;
- }
- return ret;
-}
-
-/**
- * sysfs_read_file - read an attribute.
- * @file: file pointer.
- * @buf: buffer to fill.
- * @count: number of bytes to read.
- * @ppos: starting offset in file.
- *
- * Userspace wants to read an attribute file. The attribute descriptor
- * is in the file's ->d_fsdata. The target object is in the directory's
- * ->d_fsdata.
- *
- * We call fill_read_buffer() to allocate and fill the buffer from the
- * object's show() method exactly once (if the read is happening from
- * the beginning of the file). That should fill the entire buffer with
- * all the data the object has to offer for that attribute.
- * We then call flush_read_buffer() to copy the buffer to userspace
- * in the increments specified.
- */
-
-static ssize_t
-sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
-{
- struct sysfs_buffer *buffer = file->private_data;
- ssize_t retval = 0;
-
- mutex_lock(&buffer->mutex);
- if (buffer->needs_read_fill || *ppos == 0) {
- retval = fill_read_buffer(file->f_path.dentry, buffer);
- if (retval)
- goto out;
- }
- pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n",
- __func__, count, *ppos, buffer->page);
- retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
- buffer->count);
-out:
- mutex_unlock(&buffer->mutex);
- return retval;
-}
-
-/**
- * fill_write_buffer - copy buffer from userspace.
- * @buffer: data buffer for file.
- * @buf: data from user.
- * @count: number of bytes in @userbuf.
- *
- * Allocate @buffer->page if it hasn't been already, then
- * copy the user-supplied buffer into it.
- */
-static int fill_write_buffer(struct sysfs_buffer *buffer,
- const char __user *buf, size_t count)
-{
- int error;
-
- if (!buffer->page)
- buffer->page = (char *)get_zeroed_page(GFP_KERNEL);
- if (!buffer->page)
- return -ENOMEM;
-
- if (count >= PAGE_SIZE)
- count = PAGE_SIZE - 1;
- error = copy_from_user(buffer->page, buf, count);
- buffer->needs_read_fill = 1;
- /* if buf is assumed to contain a string, terminate it by \0,
- so e.g. sscanf() can scan the string easily */
- buffer->page[count] = 0;
- return error ? -EFAULT : count;
-}
-
-
-/**
- * flush_write_buffer - push buffer to kobject.
- * @dentry: dentry to the attribute
- * @buffer: data buffer for file.
- * @count: number of bytes
- *
- * Get the correct pointers for the kobject and the attribute we're
- * dealing with, then call the store() method for the attribute,
- * passing the buffer that we acquired in fill_write_buffer().
- */
-static int flush_write_buffer(struct dentry *dentry,
- struct sysfs_buffer *buffer, size_t count)
-{
- struct sysfs_dirent *attr_sd = dentry->d_fsdata;
- struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
- const struct sysfs_ops *ops = buffer->ops;
- int rc;
-
- /* need attr_sd for attr and ops, its parent for kobj */
- if (!sysfs_get_active(attr_sd))
- return -ENODEV;
-
- rc = ops->store(kobj, attr_sd->s_attr.attr, buffer->page, count);
-
- sysfs_put_active(attr_sd);
-
- return rc;
-}
-
-
-/**
- * sysfs_write_file - write an attribute.
- * @file: file pointer
- * @buf: data to write
- * @count: number of bytes
- * @ppos: starting offset
- *
- * Similar to sysfs_read_file(), though working in the opposite direction.
- * We allocate and fill the data from the user in fill_write_buffer(),
- * then push it to the kobject in flush_write_buffer().
- * There is no easy way for us to know if userspace is only doing a partial
- * write, so we don't support them. We expect the entire buffer to come
- * on the first write.
- * Hint: if you're writing a value, first read the file, modify only the
- * the value you're changing, then write entire buffer back.
- */
-static ssize_t sysfs_write_file(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct sysfs_buffer *buffer = file->private_data;
- ssize_t len;
-
- mutex_lock(&buffer->mutex);
- len = fill_write_buffer(buffer, buf, count);
- if (len > 0)
- len = flush_write_buffer(file->f_path.dentry, buffer, len);
- if (len > 0)
- *ppos += len;
- mutex_unlock(&buffer->mutex);
- return len;
+ seq_commit(sf, count);
+ return 0;
}
-/**
- * sysfs_get_open_dirent - get or create sysfs_open_dirent
- * @sd: target sysfs_dirent
- * @buffer: sysfs_buffer for this instance of open
- *
- * If @sd->s_attr.open exists, increment its reference count;
- * otherwise, create one. @buffer is chained to the buffers
- * list.
- *
- * LOCKING:
- * Kernel thread context (may sleep).
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
- struct sysfs_buffer *buffer)
+static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf,
+ size_t count, loff_t pos)
{
- struct sysfs_open_dirent *od, *new_od = NULL;
+ struct bin_attribute *battr = of->kn->priv;
+ struct kobject *kobj = of->kn->parent->priv;
+ loff_t size = file_inode(of->file)->i_size;
- retry:
- spin_lock_irq(&sysfs_open_dirent_lock);
-
- if (!sd->s_attr.open && new_od) {
- sd->s_attr.open = new_od;
- new_od = NULL;
- }
-
- od = sd->s_attr.open;
- if (od) {
- atomic_inc(&od->refcnt);
- list_add_tail(&buffer->list, &od->buffers);
- }
-
- spin_unlock_irq(&sysfs_open_dirent_lock);
-
- if (od) {
- kfree(new_od);
+ if (!count)
return 0;
+
+ if (size) {
+ if (pos > size)
+ return 0;
+ if (pos + count > size)
+ count = size - pos;
}
- /* not there, initialize a new one and retry */
- new_od = kmalloc(sizeof(*new_od), GFP_KERNEL);
- if (!new_od)
- return -ENOMEM;
+ if (!battr->read)
+ return -EIO;
- atomic_set(&new_od->refcnt, 0);
- atomic_set(&new_od->event, 1);
- init_waitqueue_head(&new_od->poll);
- INIT_LIST_HEAD(&new_od->buffers);
- goto retry;
+ return battr->read(of->file, kobj, battr, buf, pos, count);
}
-/**
- * sysfs_put_open_dirent - put sysfs_open_dirent
- * @sd: target sysfs_dirent
- * @buffer: associated sysfs_buffer
- *
- * Put @sd->s_attr.open and unlink @buffer from the buffers list.
- * If reference count reaches zero, disassociate and free it.
- *
- * LOCKING:
- * None.
- */
-static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
- struct sysfs_buffer *buffer)
+/* kernfs write callback for regular sysfs files */
+static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf,
+ size_t count, loff_t pos)
{
- struct sysfs_open_dirent *od = sd->s_attr.open;
- unsigned long flags;
+ const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
+ struct kobject *kobj = of->kn->parent->priv;
- spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
-
- list_del(&buffer->list);
- if (atomic_dec_and_test(&od->refcnt))
- sd->s_attr.open = NULL;
- else
- od = NULL;
-
- spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
+ if (!count)
+ return 0;
- kfree(od);
+ return ops->store(kobj, of->kn->priv, buf, count);
}
-static int sysfs_open_file(struct inode *inode, struct file *file)
+/* kernfs write callback for bin sysfs files */
+static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf,
+ size_t count, loff_t pos)
{
- struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
- struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
- struct sysfs_buffer *buffer;
- const struct sysfs_ops *ops;
- int error = -EACCES;
-
- /* need attr_sd for attr and ops, its parent for kobj */
- if (!sysfs_get_active(attr_sd))
- return -ENODEV;
-
- /* every kobject with an attribute needs a ktype assigned */
- if (kobj->ktype && kobj->ktype->sysfs_ops)
- ops = kobj->ktype->sysfs_ops;
- else {
- WARN(1, KERN_ERR
- "missing sysfs attribute operations for kobject: %s\n",
- kobject_name(kobj));
- goto err_out;
- }
-
- /* File needs write support.
- * The inode's perms must say it's ok,
- * and we must have a store method.
- */
- if (file->f_mode & FMODE_WRITE) {
- if (!(inode->i_mode & S_IWUGO) || !ops->store)
- goto err_out;
- }
-
- /* File needs read support.
- * The inode's perms must say it's ok, and we there
- * must be a show method for it.
- */
- if (file->f_mode & FMODE_READ) {
- if (!(inode->i_mode & S_IRUGO) || !ops->show)
- goto err_out;
+ struct bin_attribute *battr = of->kn->priv;
+ struct kobject *kobj = of->kn->parent->priv;
+ loff_t size = file_inode(of->file)->i_size;
+
+ if (size) {
+ if (size <= pos)
+ return 0;
+ count = min_t(ssize_t, count, size - pos);
}
+ if (!count)
+ return 0;
- /* No error? Great, allocate a buffer for the file, and store it
- * it in file->private_data for easy access.
- */
- error = -ENOMEM;
- buffer = kzalloc(sizeof(struct sysfs_buffer), GFP_KERNEL);
- if (!buffer)
- goto err_out;
-
- mutex_init(&buffer->mutex);
- buffer->needs_read_fill = 1;
- buffer->ops = ops;
- file->private_data = buffer;
-
- /* make sure we have open dirent struct */
- error = sysfs_get_open_dirent(attr_sd, buffer);
- if (error)
- goto err_free;
-
- /* open succeeded, put active references */
- sysfs_put_active(attr_sd);
- return 0;
+ if (!battr->write)
+ return -EIO;
- err_free:
- kfree(buffer);
- err_out:
- sysfs_put_active(attr_sd);
- return error;
+ return battr->write(of->file, kobj, battr, buf, pos, count);
}
-static int sysfs_release(struct inode *inode, struct file *filp)
+static int sysfs_kf_bin_mmap(struct kernfs_open_file *of,
+ struct vm_area_struct *vma)
{
- struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata;
- struct sysfs_buffer *buffer = filp->private_data;
-
- sysfs_put_open_dirent(sd, buffer);
-
- if (buffer->page)
- free_page((unsigned long)buffer->page);
- kfree(buffer);
+ struct bin_attribute *battr = of->kn->priv;
+ struct kobject *kobj = of->kn->parent->priv;
- return 0;
+ return battr->mmap(of->file, kobj, battr, vma);
}
-/* Sysfs attribute files are pollable. The idea is that you read
- * the content and then you use 'poll' or 'select' to wait for
- * the content to change. When the content changes (assuming the
- * manager for the kobject supports notification), poll will
- * return POLLERR|POLLPRI, and select will return the fd whether
- * it is waiting for read, write, or exceptions.
- * Once poll/select indicates that the value has changed, you
- * need to close and re-open the file, or seek to 0 and read again.
- * Reminder: this only works for attributes which actively support
- * it, and it is not possible to test an attribute from userspace
- * to see if it supports poll (Neither 'poll' nor 'select' return
- * an appropriate error code). When in doubt, set a suitable timeout value.
- */
-static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
+void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr)
{
- struct sysfs_buffer *buffer = filp->private_data;
- struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata;
- struct sysfs_open_dirent *od = attr_sd->s_attr.open;
-
- /* need parent for the kobj, grab both */
- if (!sysfs_get_active(attr_sd))
- goto trigger;
-
- poll_wait(filp, &od->poll, wait);
-
- sysfs_put_active(attr_sd);
+ struct kernfs_node *kn = kobj->sd, *tmp;
- if (buffer->event != atomic_read(&od->event))
- goto trigger;
+ if (kn && dir)
+ kn = kernfs_find_and_get(kn, dir);
+ else
+ kernfs_get(kn);
- return DEFAULT_POLLMASK;
+ if (kn && attr) {
+ tmp = kernfs_find_and_get(kn, attr);
+ kernfs_put(kn);
+ kn = tmp;
+ }
- trigger:
- buffer->needs_read_fill = 1;
- return DEFAULT_POLLMASK|POLLERR|POLLPRI;
+ if (kn) {
+ kernfs_notify(kn);
+ kernfs_put(kn);
+ }
}
+EXPORT_SYMBOL_GPL(sysfs_notify);
-void sysfs_notify_dirent(struct sysfs_dirent *sd)
-{
- struct sysfs_open_dirent *od;
- unsigned long flags;
-
- spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
+static const struct kernfs_ops sysfs_file_kfops_empty = {
+};
- if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) {
- od = sd->s_attr.open;
- if (od) {
- atomic_inc(&od->event);
- wake_up_interruptible(&od->poll);
- }
- }
+static const struct kernfs_ops sysfs_file_kfops_ro = {
+ .seq_show = sysfs_kf_seq_show,
+};
- spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
-}
-EXPORT_SYMBOL_GPL(sysfs_notify_dirent);
+static const struct kernfs_ops sysfs_file_kfops_wo = {
+ .write = sysfs_kf_write,
+};
-void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
-{
- struct sysfs_dirent *sd = k->sd;
+static const struct kernfs_ops sysfs_file_kfops_rw = {
+ .seq_show = sysfs_kf_seq_show,
+ .write = sysfs_kf_write,
+};
- mutex_lock(&sysfs_mutex);
+static const struct kernfs_ops sysfs_bin_kfops_ro = {
+ .read = sysfs_kf_bin_read,
+};
- if (sd && dir)
- sd = sysfs_find_dirent(sd, NULL, dir);
- if (sd && attr)
- sd = sysfs_find_dirent(sd, NULL, attr);
- if (sd)
- sysfs_notify_dirent(sd);
+static const struct kernfs_ops sysfs_bin_kfops_wo = {
+ .write = sysfs_kf_bin_write,
+};
- mutex_unlock(&sysfs_mutex);
-}
-EXPORT_SYMBOL_GPL(sysfs_notify);
+static const struct kernfs_ops sysfs_bin_kfops_rw = {
+ .read = sysfs_kf_bin_read,
+ .write = sysfs_kf_bin_write,
+};
-const struct file_operations sysfs_file_operations = {
- .read = sysfs_read_file,
- .write = sysfs_write_file,
- .llseek = generic_file_llseek,
- .open = sysfs_open_file,
- .release = sysfs_release,
- .poll = sysfs_poll,
+static const struct kernfs_ops sysfs_bin_kfops_mmap = {
+ .read = sysfs_kf_bin_read,
+ .write = sysfs_kf_bin_write,
+ .mmap = sysfs_kf_bin_mmap,
};
-static int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
- const void **pns)
+int sysfs_add_file_mode_ns(struct kernfs_node *parent,
+ const struct attribute *attr, bool is_bin,
+ umode_t mode, const void *ns)
{
- struct sysfs_dirent *dir_sd = kobj->sd;
- const struct sysfs_ops *ops;
- const void *ns = NULL;
- int err;
-
- if (!dir_sd) {
- WARN(1, KERN_ERR "sysfs: kobject %s without dirent\n",
- kobject_name(kobj));
- return -ENOENT;
+ struct lock_class_key *key = NULL;
+ const struct kernfs_ops *ops;
+ struct kernfs_node *kn;
+ loff_t size;
+
+ if (!is_bin) {
+ struct kobject *kobj = parent->priv;
+ const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops;
+
+ /* every kobject with an attribute needs a ktype assigned */
+ if (WARN(!sysfs_ops, KERN_ERR
+ "missing sysfs attribute operations for kobject: %s\n",
+ kobject_name(kobj)))
+ return -EINVAL;
+
+ if (sysfs_ops->show && sysfs_ops->store)
+ ops = &sysfs_file_kfops_rw;
+ else if (sysfs_ops->show)
+ ops = &sysfs_file_kfops_ro;
+ else if (sysfs_ops->store)
+ ops = &sysfs_file_kfops_wo;
+ else
+ ops = &sysfs_file_kfops_empty;
+
+ size = PAGE_SIZE;
+ } else {
+ struct bin_attribute *battr = (void *)attr;
+
+ if (battr->mmap)
+ ops = &sysfs_bin_kfops_mmap;
+ else if (battr->read && battr->write)
+ ops = &sysfs_bin_kfops_rw;
+ else if (battr->read)
+ ops = &sysfs_bin_kfops_ro;
+ else if (battr->write)
+ ops = &sysfs_bin_kfops_wo;
+ else
+ ops = &sysfs_file_kfops_empty;
+
+ size = battr->size;
}
- err = 0;
- if (!sysfs_ns_type(dir_sd))
- goto out;
-
- err = -EINVAL;
- if (!kobj->ktype)
- goto out;
- ops = kobj->ktype->sysfs_ops;
- if (!ops)
- goto out;
- if (!ops->namespace)
- goto out;
-
- err = 0;
- ns = ops->namespace(kobj, attr);
-out:
- if (err) {
- WARN(1, KERN_ERR
- "missing sysfs namespace attribute operation for kobject: %s\n",
- kobject_name(kobj));
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ if (!attr->ignore_lockdep)
+ key = attr->key ?: (struct lock_class_key *)&attr->skey;
+#endif
+ kn = __kernfs_create_file(parent, attr->name, mode, size, ops,
+ (void *)attr, ns, true, key);
+ if (IS_ERR(kn)) {
+ if (PTR_ERR(kn) == -EEXIST)
+ sysfs_warn_dup(parent, attr->name);
+ return PTR_ERR(kn);
}
- *pns = ns;
- return err;
-}
-
-int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
- const struct attribute *attr, int type, umode_t amode)
-{
- umode_t mode = (amode & S_IALLUGO) | S_IFREG;
- struct sysfs_addrm_cxt acxt;
- struct sysfs_dirent *sd;
- const void *ns;
- int rc;
-
- rc = sysfs_attr_ns(dir_sd->s_dir.kobj, attr, &ns);
- if (rc)
- return rc;
-
- sd = sysfs_new_dirent(attr->name, mode, type);
- if (!sd)
- return -ENOMEM;
-
- sd->s_ns = ns;
- sd->s_attr.attr = (void *)attr;
- sysfs_dirent_init_lockdep(sd);
-
- sysfs_addrm_start(&acxt, dir_sd);
- rc = sysfs_add_one(&acxt, sd);
- sysfs_addrm_finish(&acxt);
-
- if (rc)
- sysfs_put(sd);
-
- return rc;
+ return 0;
}
-
-int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr,
- int type)
+int sysfs_add_file(struct kernfs_node *parent, const struct attribute *attr,
+ bool is_bin)
{
- return sysfs_add_file_mode(dir_sd, attr, type, attr->mode);
+ return sysfs_add_file_mode_ns(parent, attr, is_bin, attr->mode, NULL);
}
-
/**
- * sysfs_create_file - create an attribute file for an object.
- * @kobj: object we're creating for.
- * @attr: attribute descriptor.
+ * sysfs_create_file_ns - create an attribute file for an object with custom ns
+ * @kobj: object we're creating for
+ * @attr: attribute descriptor
+ * @ns: namespace the new file should belong to
*/
-int sysfs_create_file(struct kobject *kobj, const struct attribute *attr)
+int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
+ const void *ns)
{
BUG_ON(!kobj || !kobj->sd || !attr);
- return sysfs_add_file(kobj->sd, attr, SYSFS_KOBJ_ATTR);
+ return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns);
}
-EXPORT_SYMBOL_GPL(sysfs_create_file);
+EXPORT_SYMBOL_GPL(sysfs_create_file_ns);
int sysfs_create_files(struct kobject *kobj, const struct attribute **ptr)
{
@@ -600,19 +308,21 @@ EXPORT_SYMBOL_GPL(sysfs_create_files);
int sysfs_add_file_to_group(struct kobject *kobj,
const struct attribute *attr, const char *group)
{
- struct sysfs_dirent *dir_sd;
+ struct kernfs_node *parent;
int error;
- if (group)
- dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
- else
- dir_sd = sysfs_get(kobj->sd);
+ if (group) {
+ parent = kernfs_find_and_get(kobj->sd, group);
+ } else {
+ parent = kobj->sd;
+ kernfs_get(parent);
+ }
- if (!dir_sd)
+ if (!parent)
return -ENOENT;
- error = sysfs_add_file(dir_sd, attr, SYSFS_KOBJ_ATTR);
- sysfs_put(dir_sd);
+ error = sysfs_add_file(parent, attr, false);
+ kernfs_put(parent);
return error;
}
@@ -628,49 +338,63 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
umode_t mode)
{
- struct sysfs_dirent *sd;
+ struct kernfs_node *kn;
struct iattr newattrs;
- const void *ns;
int rc;
- rc = sysfs_attr_ns(kobj, attr, &ns);
- if (rc)
- return rc;
-
- mutex_lock(&sysfs_mutex);
-
- rc = -ENOENT;
- sd = sysfs_find_dirent(kobj->sd, ns, attr->name);
- if (!sd)
- goto out;
+ kn = kernfs_find_and_get(kobj->sd, attr->name);
+ if (!kn)
+ return -ENOENT;
- newattrs.ia_mode = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO);
+ newattrs.ia_mode = (mode & S_IALLUGO) | (kn->mode & ~S_IALLUGO);
newattrs.ia_valid = ATTR_MODE;
- rc = sysfs_sd_setattr(sd, &newattrs);
- out:
- mutex_unlock(&sysfs_mutex);
+ rc = kernfs_setattr(kn, &newattrs);
+
+ kernfs_put(kn);
return rc;
}
EXPORT_SYMBOL_GPL(sysfs_chmod_file);
/**
- * sysfs_remove_file - remove an object attribute.
- * @kobj: object we're acting for.
- * @attr: attribute descriptor.
+ * sysfs_remove_file_ns - remove an object attribute with a custom ns tag
+ * @kobj: object we're acting for
+ * @attr: attribute descriptor
+ * @ns: namespace tag of the file to remove
+ *
+ * Hash the attribute name and namespace tag and kill the victim.
+ */
+void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr,
+ const void *ns)
+{
+ struct kernfs_node *parent = kobj->sd;
+
+ kernfs_remove_by_name_ns(parent, attr->name, ns);
+}
+EXPORT_SYMBOL_GPL(sysfs_remove_file_ns);
+
+/**
+ * sysfs_remove_file_self - remove an object attribute from its own method
+ * @kobj: object we're acting for
+ * @attr: attribute descriptor
*
- * Hash the attribute name and kill the victim.
+ * See kernfs_remove_self() for details.
*/
-void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr)
+bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr)
{
- const void *ns;
+ struct kernfs_node *parent = kobj->sd;
+ struct kernfs_node *kn;
+ bool ret;
+
+ kn = kernfs_find_and_get(parent, attr->name);
+ if (WARN_ON_ONCE(!kn))
+ return false;
- if (sysfs_attr_ns(kobj, attr, &ns))
- return;
+ ret = kernfs_remove_self(kn);
- sysfs_hash_and_remove(kobj->sd, ns, attr->name);
+ kernfs_put(kn);
+ return ret;
}
-EXPORT_SYMBOL_GPL(sysfs_remove_file);
void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr)
{
@@ -689,107 +413,44 @@ EXPORT_SYMBOL_GPL(sysfs_remove_files);
void sysfs_remove_file_from_group(struct kobject *kobj,
const struct attribute *attr, const char *group)
{
- struct sysfs_dirent *dir_sd;
+ struct kernfs_node *parent;
- if (group)
- dir_sd = sysfs_get_dirent(kobj->sd, NULL, group);
- else
- dir_sd = sysfs_get(kobj->sd);
- if (dir_sd) {
- sysfs_hash_and_remove(dir_sd, NULL, attr->name);
- sysfs_put(dir_sd);
+ if (group) {
+ parent = kernfs_find_and_get(kobj->sd, group);
+ } else {
+ parent = kobj->sd;
+ kernfs_get(parent);
+ }
+
+ if (parent) {
+ kernfs_remove_by_name(parent, attr->name);
+ kernfs_put(parent);
}
}
EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group);
-struct sysfs_schedule_callback_struct {
- struct list_head workq_list;
- struct kobject *kobj;
- void (*func)(void *);
- void *data;
- struct module *owner;
- struct work_struct work;
-};
-
-static struct workqueue_struct *sysfs_workqueue;
-static DEFINE_MUTEX(sysfs_workq_mutex);
-static LIST_HEAD(sysfs_workq);
-static void sysfs_schedule_callback_work(struct work_struct *work)
+/**
+ * sysfs_create_bin_file - create binary file for object.
+ * @kobj: object.
+ * @attr: attribute descriptor.
+ */
+int sysfs_create_bin_file(struct kobject *kobj,
+ const struct bin_attribute *attr)
{
- struct sysfs_schedule_callback_struct *ss = container_of(work,
- struct sysfs_schedule_callback_struct, work);
-
- (ss->func)(ss->data);
- kobject_put(ss->kobj);
- module_put(ss->owner);
- mutex_lock(&sysfs_workq_mutex);
- list_del(&ss->workq_list);
- mutex_unlock(&sysfs_workq_mutex);
- kfree(ss);
+ BUG_ON(!kobj || !kobj->sd || !attr);
+
+ return sysfs_add_file(kobj->sd, &attr->attr, true);
}
+EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
/**
- * sysfs_schedule_callback - helper to schedule a callback for a kobject
- * @kobj: object we're acting for.
- * @func: callback function to invoke later.
- * @data: argument to pass to @func.
- * @owner: module owning the callback code
- *
- * sysfs attribute methods must not unregister themselves or their parent
- * kobject (which would amount to the same thing). Attempts to do so will
- * deadlock, since unregistration is mutually exclusive with driver
- * callbacks.
- *
- * Instead methods can call this routine, which will attempt to allocate
- * and schedule a workqueue request to call back @func with @data as its
- * argument in the workqueue's process context. @kobj will be pinned
- * until @func returns.
- *
- * Returns 0 if the request was submitted, -ENOMEM if storage could not
- * be allocated, -ENODEV if a reference to @owner isn't available,
- * -EAGAIN if a callback has already been scheduled for @kobj.
+ * sysfs_remove_bin_file - remove binary file for object.
+ * @kobj: object.
+ * @attr: attribute descriptor.
*/
-int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
- void *data, struct module *owner)
+void sysfs_remove_bin_file(struct kobject *kobj,
+ const struct bin_attribute *attr)
{
- struct sysfs_schedule_callback_struct *ss, *tmp;
-
- if (!try_module_get(owner))
- return -ENODEV;
-
- mutex_lock(&sysfs_workq_mutex);
- list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list)
- if (ss->kobj == kobj) {
- module_put(owner);
- mutex_unlock(&sysfs_workq_mutex);
- return -EAGAIN;
- }
- mutex_unlock(&sysfs_workq_mutex);
-
- if (sysfs_workqueue == NULL) {
- sysfs_workqueue = create_singlethread_workqueue("sysfsd");
- if (sysfs_workqueue == NULL) {
- module_put(owner);
- return -ENOMEM;
- }
- }
-
- ss = kmalloc(sizeof(*ss), GFP_KERNEL);
- if (!ss) {
- module_put(owner);
- return -ENOMEM;
- }
- kobject_get(kobj);
- ss->kobj = kobj;
- ss->func = func;
- ss->data = data;
- ss->owner = owner;
- INIT_WORK(&ss->work, sysfs_schedule_callback_work);
- INIT_LIST_HEAD(&ss->workq_list);
- mutex_lock(&sysfs_workq_mutex);
- list_add_tail(&ss->workq_list, &sysfs_workq);
- mutex_unlock(&sysfs_workq_mutex);
- queue_work(sysfs_workqueue, &ss->work);
- return 0;
+ kernfs_remove_by_name(kobj->sd, attr->attr.name);
}
-EXPORT_SYMBOL_GPL(sysfs_schedule_callback);
+EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 5f92cd2f61c..7d2a860ba78 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -18,7 +18,7 @@
#include "sysfs.h"
-static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
+static void remove_files(struct kernfs_node *parent,
const struct attribute_group *grp)
{
struct attribute *const *attr;
@@ -26,13 +26,13 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
if (grp->attrs)
for (attr = grp->attrs; *attr; attr++)
- sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
+ kernfs_remove_by_name(parent, (*attr)->name);
if (grp->bin_attrs)
for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++)
- sysfs_remove_bin_file(kobj, *bin_attr);
+ kernfs_remove_by_name(parent, (*bin_attr)->attr.name);
}
-static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
+static int create_files(struct kernfs_node *parent, struct kobject *kobj,
const struct attribute_group *grp, int update)
{
struct attribute *const *attr;
@@ -49,21 +49,20 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
* re-adding (if required) the file.
*/
if (update)
- sysfs_hash_and_remove(dir_sd, NULL,
- (*attr)->name);
+ kernfs_remove_by_name(parent, (*attr)->name);
if (grp->is_visible) {
mode = grp->is_visible(kobj, *attr, i);
if (!mode)
continue;
}
- error = sysfs_add_file_mode(dir_sd, *attr,
- SYSFS_KOBJ_ATTR,
- (*attr)->mode | mode);
+ error = sysfs_add_file_mode_ns(parent, *attr, false,
+ (*attr)->mode | mode,
+ NULL);
if (unlikely(error))
break;
}
if (error) {
- remove_files(dir_sd, kobj, grp);
+ remove_files(parent, grp);
goto exit;
}
}
@@ -71,13 +70,16 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
if (grp->bin_attrs) {
for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
if (update)
- sysfs_remove_bin_file(kobj, *bin_attr);
- error = sysfs_create_bin_file(kobj, *bin_attr);
+ kernfs_remove_by_name(parent,
+ (*bin_attr)->attr.name);
+ error = sysfs_add_file_mode_ns(parent,
+ &(*bin_attr)->attr, true,
+ (*bin_attr)->attr.mode, NULL);
if (error)
break;
}
if (error)
- remove_files(dir_sd, kobj, grp);
+ remove_files(parent, grp);
}
exit:
return error;
@@ -87,7 +89,7 @@ exit:
static int internal_create_group(struct kobject *kobj, int update,
const struct attribute_group *grp)
{
- struct sysfs_dirent *sd;
+ struct kernfs_node *kn;
int error;
BUG_ON(!kobj || (!update && !kobj->sd));
@@ -101,18 +103,22 @@ static int internal_create_group(struct kobject *kobj, int update,
return -EINVAL;
}
if (grp->name) {
- error = sysfs_create_subdir(kobj, grp->name, &sd);
- if (error)
- return error;
+ kn = kernfs_create_dir(kobj->sd, grp->name,
+ S_IRWXU | S_IRUGO | S_IXUGO, kobj);
+ if (IS_ERR(kn)) {
+ if (PTR_ERR(kn) == -EEXIST)
+ sysfs_warn_dup(kobj->sd, grp->name);
+ return PTR_ERR(kn);
+ }
} else
- sd = kobj->sd;
- sysfs_get(sd);
- error = create_files(sd, kobj, grp, update);
+ kn = kobj->sd;
+ kernfs_get(kn);
+ error = create_files(kn, kobj, grp, update);
if (error) {
if (grp->name)
- sysfs_remove_subdir(sd);
+ kernfs_remove(kn);
}
- sysfs_put(sd);
+ kernfs_put(kn);
return error;
}
@@ -202,25 +208,27 @@ EXPORT_SYMBOL_GPL(sysfs_update_group);
void sysfs_remove_group(struct kobject *kobj,
const struct attribute_group *grp)
{
- struct sysfs_dirent *dir_sd = kobj->sd;
- struct sysfs_dirent *sd;
+ struct kernfs_node *parent = kobj->sd;
+ struct kernfs_node *kn;
if (grp->name) {
- sd = sysfs_get_dirent(dir_sd, NULL, grp->name);
- if (!sd) {
- WARN(!sd, KERN_WARNING
+ kn = kernfs_find_and_get(parent, grp->name);
+ if (!kn) {
+ WARN(!kn, KERN_WARNING
"sysfs group %p not found for kobject '%s'\n",
grp, kobject_name(kobj));
return;
}
- } else
- sd = sysfs_get(dir_sd);
+ } else {
+ kn = parent;
+ kernfs_get(kn);
+ }
- remove_files(sd, kobj, grp);
+ remove_files(kn, grp);
if (grp->name)
- sysfs_remove_subdir(sd);
+ kernfs_remove(kn);
- sysfs_put(sd);
+ kernfs_put(kn);
}
EXPORT_SYMBOL_GPL(sysfs_remove_group);
@@ -256,22 +264,22 @@ EXPORT_SYMBOL_GPL(sysfs_remove_groups);
int sysfs_merge_group(struct kobject *kobj,
const struct attribute_group *grp)
{
- struct sysfs_dirent *dir_sd;
+ struct kernfs_node *parent;
int error = 0;
struct attribute *const *attr;
int i;
- dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
- if (!dir_sd)
+ parent = kernfs_find_and_get(kobj->sd, grp->name);
+ if (!parent)
return -ENOENT;
for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr))
- error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR);
+ error = sysfs_add_file(parent, *attr, false);
if (error) {
while (--i >= 0)
- sysfs_hash_and_remove(dir_sd, NULL, (*--attr)->name);
+ kernfs_remove_by_name(parent, (*--attr)->name);
}
- sysfs_put(dir_sd);
+ kernfs_put(parent);
return error;
}
@@ -285,14 +293,14 @@ EXPORT_SYMBOL_GPL(sysfs_merge_group);
void sysfs_unmerge_group(struct kobject *kobj,
const struct attribute_group *grp)
{
- struct sysfs_dirent *dir_sd;
+ struct kernfs_node *parent;
struct attribute *const *attr;
- dir_sd = sysfs_get_dirent(kobj->sd, NULL, grp->name);
- if (dir_sd) {
+ parent = kernfs_find_and_get(kobj->sd, grp->name);
+ if (parent) {
for (attr = grp->attrs; *attr; ++attr)
- sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
- sysfs_put(dir_sd);
+ kernfs_remove_by_name(parent, (*attr)->name);
+ kernfs_put(parent);
}
}
EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
@@ -307,15 +315,15 @@ EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name,
struct kobject *target, const char *link_name)
{
- struct sysfs_dirent *dir_sd;
+ struct kernfs_node *parent;
int error = 0;
- dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name);
- if (!dir_sd)
+ parent = kernfs_find_and_get(kobj->sd, group_name);
+ if (!parent)
return -ENOENT;
- error = sysfs_create_link_sd(dir_sd, target, link_name);
- sysfs_put(dir_sd);
+ error = sysfs_create_link_sd(parent, target, link_name);
+ kernfs_put(parent);
return error;
}
@@ -330,12 +338,12 @@ EXPORT_SYMBOL_GPL(sysfs_add_link_to_group);
void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
const char *link_name)
{
- struct sysfs_dirent *dir_sd;
+ struct kernfs_node *parent;
- dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name);
- if (dir_sd) {
- sysfs_hash_and_remove(dir_sd, NULL, link_name);
- sysfs_put(dir_sd);
+ parent = kernfs_find_and_get(kobj->sd, group_name);
+ if (parent) {
+ kernfs_remove_by_name(parent, link_name);
+ kernfs_put(parent);
}
}
EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
deleted file mode 100644
index 963f910c803..00000000000
--- a/fs/sysfs/inode.c
+++ /dev/null
@@ -1,357 +0,0 @@
-/*
- * fs/sysfs/inode.c - basic sysfs inode and dentry operations
- *
- * Copyright (c) 2001-3 Patrick Mochel
- * Copyright (c) 2007 SUSE Linux Products GmbH
- * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
- *
- * This file is released under the GPLv2.
- *
- * Please see Documentation/filesystems/sysfs.txt for more information.
- */
-
-#undef DEBUG
-
-#include <linux/pagemap.h>
-#include <linux/namei.h>
-#include <linux/backing-dev.h>
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/sysfs.h>
-#include <linux/xattr.h>
-#include <linux/security.h>
-#include "sysfs.h"
-
-static const struct address_space_operations sysfs_aops = {
- .readpage = simple_readpage,
- .write_begin = simple_write_begin,
- .write_end = simple_write_end,
-};
-
-static struct backing_dev_info sysfs_backing_dev_info = {
- .name = "sysfs",
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
-
-static const struct inode_operations sysfs_inode_operations = {
- .permission = sysfs_permission,
- .setattr = sysfs_setattr,
- .getattr = sysfs_getattr,
- .setxattr = sysfs_setxattr,
-};
-
-int __init sysfs_inode_init(void)
-{
- return bdi_init(&sysfs_backing_dev_info);
-}
-
-static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
-{
- struct sysfs_inode_attrs *attrs;
- struct iattr *iattrs;
-
- attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL);
- if (!attrs)
- return NULL;
- iattrs = &attrs->ia_iattr;
-
- /* assign default attributes */
- iattrs->ia_mode = sd->s_mode;
- iattrs->ia_uid = GLOBAL_ROOT_UID;
- iattrs->ia_gid = GLOBAL_ROOT_GID;
- iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
-
- return attrs;
-}
-
-int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr)
-{
- struct sysfs_inode_attrs *sd_attrs;
- struct iattr *iattrs;
- unsigned int ia_valid = iattr->ia_valid;
-
- sd_attrs = sd->s_iattr;
-
- if (!sd_attrs) {
- /* setting attributes for the first time, allocate now */
- sd_attrs = sysfs_init_inode_attrs(sd);
- if (!sd_attrs)
- return -ENOMEM;
- sd->s_iattr = sd_attrs;
- }
- /* attributes were changed at least once in past */
- iattrs = &sd_attrs->ia_iattr;
-
- if (ia_valid & ATTR_UID)
- iattrs->ia_uid = iattr->ia_uid;
- if (ia_valid & ATTR_GID)
- iattrs->ia_gid = iattr->ia_gid;
- if (ia_valid & ATTR_ATIME)
- iattrs->ia_atime = iattr->ia_atime;
- if (ia_valid & ATTR_MTIME)
- iattrs->ia_mtime = iattr->ia_mtime;
- if (ia_valid & ATTR_CTIME)
- iattrs->ia_ctime = iattr->ia_ctime;
- if (ia_valid & ATTR_MODE) {
- umode_t mode = iattr->ia_mode;
- iattrs->ia_mode = sd->s_mode = mode;
- }
- return 0;
-}
-
-int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
-{
- struct inode *inode = dentry->d_inode;
- struct sysfs_dirent *sd = dentry->d_fsdata;
- int error;
-
- if (!sd)
- return -EINVAL;
-
- mutex_lock(&sysfs_mutex);
- error = inode_change_ok(inode, iattr);
- if (error)
- goto out;
-
- error = sysfs_sd_setattr(sd, iattr);
- if (error)
- goto out;
-
- /* this ignores size changes */
- setattr_copy(inode, iattr);
-
-out:
- mutex_unlock(&sysfs_mutex);
- return error;
-}
-
-static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata,
- u32 *secdata_len)
-{
- struct sysfs_inode_attrs *iattrs;
- void *old_secdata;
- size_t old_secdata_len;
-
- if (!sd->s_iattr) {
- sd->s_iattr = sysfs_init_inode_attrs(sd);
- if (!sd->s_iattr)
- return -ENOMEM;
- }
-
- iattrs = sd->s_iattr;
- old_secdata = iattrs->ia_secdata;
- old_secdata_len = iattrs->ia_secdata_len;
-
- iattrs->ia_secdata = *secdata;
- iattrs->ia_secdata_len = *secdata_len;
-
- *secdata = old_secdata;
- *secdata_len = old_secdata_len;
- return 0;
-}
-
-int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags)
-{
- struct sysfs_dirent *sd = dentry->d_fsdata;
- void *secdata;
- int error;
- u32 secdata_len = 0;
-
- if (!sd)
- return -EINVAL;
-
- if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
- const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
- error = security_inode_setsecurity(dentry->d_inode, suffix,
- value, size, flags);
- if (error)
- goto out;
- error = security_inode_getsecctx(dentry->d_inode,
- &secdata, &secdata_len);
- if (error)
- goto out;
-
- mutex_lock(&sysfs_mutex);
- error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len);
- mutex_unlock(&sysfs_mutex);
-
- if (secdata)
- security_release_secctx(secdata, secdata_len);
- } else
- return -EINVAL;
-out:
- return error;
-}
-
-static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
-{
- inode->i_mode = mode;
- inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-}
-
-static inline void set_inode_attr(struct inode *inode, struct iattr *iattr)
-{
- inode->i_uid = iattr->ia_uid;
- inode->i_gid = iattr->ia_gid;
- inode->i_atime = iattr->ia_atime;
- inode->i_mtime = iattr->ia_mtime;
- inode->i_ctime = iattr->ia_ctime;
-}
-
-static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
-{
- struct sysfs_inode_attrs *iattrs = sd->s_iattr;
-
- inode->i_mode = sd->s_mode;
- if (iattrs) {
- /* sysfs_dirent has non-default attributes
- * get them from persistent copy in sysfs_dirent
- */
- set_inode_attr(inode, &iattrs->ia_iattr);
- security_inode_notifysecctx(inode,
- iattrs->ia_secdata,
- iattrs->ia_secdata_len);
- }
-
- if (sysfs_type(sd) == SYSFS_DIR)
- set_nlink(inode, sd->s_dir.subdirs + 2);
-}
-
-int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
- struct kstat *stat)
-{
- struct sysfs_dirent *sd = dentry->d_fsdata;
- struct inode *inode = dentry->d_inode;
-
- mutex_lock(&sysfs_mutex);
- sysfs_refresh_inode(sd, inode);
- mutex_unlock(&sysfs_mutex);
-
- generic_fillattr(inode, stat);
- return 0;
-}
-
-static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
-{
- struct bin_attribute *bin_attr;
-
- inode->i_private = sysfs_get(sd);
- inode->i_mapping->a_ops = &sysfs_aops;
- inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
- inode->i_op = &sysfs_inode_operations;
-
- set_default_inode_attr(inode, sd->s_mode);
- sysfs_refresh_inode(sd, inode);
-
- /* initialize inode according to type */
- switch (sysfs_type(sd)) {
- case SYSFS_DIR:
- inode->i_op = &sysfs_dir_inode_operations;
- inode->i_fop = &sysfs_dir_operations;
- break;
- case SYSFS_KOBJ_ATTR:
- inode->i_size = PAGE_SIZE;
- inode->i_fop = &sysfs_file_operations;
- break;
- case SYSFS_KOBJ_BIN_ATTR:
- bin_attr = sd->s_bin_attr.bin_attr;
- inode->i_size = bin_attr->size;
- inode->i_fop = &bin_fops;
- break;
- case SYSFS_KOBJ_LINK:
- inode->i_op = &sysfs_symlink_inode_operations;
- break;
- default:
- BUG();
- }
-
- unlock_new_inode(inode);
-}
-
-/**
- * sysfs_get_inode - get inode for sysfs_dirent
- * @sb: super block
- * @sd: sysfs_dirent to allocate inode for
- *
- * Get inode for @sd. If such inode doesn't exist, a new inode
- * is allocated and basics are initialized. New inode is
- * returned locked.
- *
- * LOCKING:
- * Kernel thread context (may sleep).
- *
- * RETURNS:
- * Pointer to allocated inode on success, NULL on failure.
- */
-struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
-{
- struct inode *inode;
-
- inode = iget_locked(sb, sd->s_ino);
- if (inode && (inode->i_state & I_NEW))
- sysfs_init_inode(sd, inode);
-
- return inode;
-}
-
-/*
- * The sysfs_dirent serves as both an inode and a directory entry for sysfs.
- * To prevent the sysfs inode numbers from being freed prematurely we take a
- * reference to sysfs_dirent from the sysfs inode. A
- * super_operations.evict_inode() implementation is needed to drop that
- * reference upon inode destruction.
- */
-void sysfs_evict_inode(struct inode *inode)
-{
- struct sysfs_dirent *sd = inode->i_private;
-
- truncate_inode_pages(&inode->i_data, 0);
- clear_inode(inode);
- sysfs_put(sd);
-}
-
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns,
- const char *name)
-{
- struct sysfs_addrm_cxt acxt;
- struct sysfs_dirent *sd;
-
- if (!dir_sd) {
- WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n",
- name);
- return -ENOENT;
- }
-
- sysfs_addrm_start(&acxt, dir_sd);
-
- sd = sysfs_find_dirent(dir_sd, ns, name);
- if (sd)
- sysfs_remove_one(&acxt, sd);
-
- sysfs_addrm_finish(&acxt);
-
- if (sd)
- return 0;
- else
- return -ENOENT;
-}
-
-int sysfs_permission(struct inode *inode, int mask)
-{
- struct sysfs_dirent *sd;
-
- if (mask & MAY_NOT_BLOCK)
- return -ECHILD;
-
- sd = inode->i_private;
-
- mutex_lock(&sysfs_mutex);
- sysfs_refresh_inode(sd, inode);
- mutex_unlock(&sysfs_mutex);
-
- return generic_permission(inode, mask);
-}
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 834ec2cdb7a..8a49486bf30 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -13,147 +13,45 @@
#define DEBUG
#include <linux/fs.h>
+#include <linux/magic.h>
#include <linux/mount.h>
-#include <linux/pagemap.h>
#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/magic.h>
-#include <linux/slab.h>
#include <linux/user_namespace.h>
#include "sysfs.h"
-
-static struct vfsmount *sysfs_mnt;
-struct kmem_cache *sysfs_dir_cachep;
-
-static const struct super_operations sysfs_ops = {
- .statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
- .evict_inode = sysfs_evict_inode,
-};
-
-struct sysfs_dirent sysfs_root = {
- .s_name = "",
- .s_count = ATOMIC_INIT(1),
- .s_flags = SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT),
- .s_mode = S_IFDIR | S_IRUGO | S_IXUGO,
- .s_ino = 1,
-};
-
-static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
-{
- struct inode *inode;
- struct dentry *root;
-
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
- sb->s_magic = SYSFS_MAGIC;
- sb->s_op = &sysfs_ops;
- sb->s_time_gran = 1;
-
- /* get root inode, initialize and unlock it */
- mutex_lock(&sysfs_mutex);
- inode = sysfs_get_inode(sb, &sysfs_root);
- mutex_unlock(&sysfs_mutex);
- if (!inode) {
- pr_debug("sysfs: could not get root inode\n");
- return -ENOMEM;
- }
-
- /* instantiate and link root dentry */
- root = d_make_root(inode);
- if (!root) {
- pr_debug("%s: could not get root dentry!\n", __func__);
- return -ENOMEM;
- }
- root->d_fsdata = &sysfs_root;
- sb->s_root = root;
- sb->s_d_op = &sysfs_dentry_ops;
- return 0;
-}
-
-static int sysfs_test_super(struct super_block *sb, void *data)
-{
- struct sysfs_super_info *sb_info = sysfs_info(sb);
- struct sysfs_super_info *info = data;
- enum kobj_ns_type type;
- int found = 1;
-
- for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
- if (sb_info->ns[type] != info->ns[type])
- found = 0;
- }
- return found;
-}
-
-static int sysfs_set_super(struct super_block *sb, void *data)
-{
- int error;
- error = set_anon_super(sb, data);
- if (!error)
- sb->s_fs_info = data;
- return error;
-}
-
-static void free_sysfs_super_info(struct sysfs_super_info *info)
-{
- int type;
- for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
- kobj_ns_drop(type, info->ns[type]);
- kfree(info);
-}
+static struct kernfs_root *sysfs_root;
+struct kernfs_node *sysfs_root_kn;
static struct dentry *sysfs_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- struct sysfs_super_info *info;
- enum kobj_ns_type type;
- struct super_block *sb;
- int error;
+ struct dentry *root;
+ void *ns;
+ bool new_sb;
if (!(flags & MS_KERNMOUNT)) {
if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
return ERR_PTR(-EPERM);
- for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
- if (!kobj_ns_current_may_mount(type))
- return ERR_PTR(-EPERM);
- }
- }
-
- info = kzalloc(sizeof(*info), GFP_KERNEL);
- if (!info)
- return ERR_PTR(-ENOMEM);
-
- for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
- info->ns[type] = kobj_ns_grab_current(type);
-
- sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info);
- if (IS_ERR(sb) || sb->s_fs_info != info)
- free_sysfs_super_info(info);
- if (IS_ERR(sb))
- return ERR_CAST(sb);
- if (!sb->s_root) {
- error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
- if (error) {
- deactivate_locked_super(sb);
- return ERR_PTR(error);
- }
- sb->s_flags |= MS_ACTIVE;
+ if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
+ return ERR_PTR(-EPERM);
}
- return dget(sb->s_root);
+ ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
+ root = kernfs_mount_ns(fs_type, flags, sysfs_root,
+ SYSFS_MAGIC, &new_sb, ns);
+ if (IS_ERR(root) || !new_sb)
+ kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
+ return root;
}
static void sysfs_kill_sb(struct super_block *sb)
{
- struct sysfs_super_info *info = sysfs_info(sb);
- /* Remove the superblock from fs_supers/s_instances
- * so we can't find it, before freeing sysfs_super_info.
- */
- kill_anon_super(sb);
- free_sysfs_super_info(info);
+ void *ns = (void *)kernfs_super_ns(sb);
+
+ kernfs_kill_sb(sb);
+ kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
}
static struct file_system_type sysfs_fs_type = {
@@ -165,48 +63,20 @@ static struct file_system_type sysfs_fs_type = {
int __init sysfs_init(void)
{
- int err = -ENOMEM;
+ int err;
- sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache",
- sizeof(struct sysfs_dirent),
- 0, 0, NULL);
- if (!sysfs_dir_cachep)
- goto out;
+ sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
+ NULL);
+ if (IS_ERR(sysfs_root))
+ return PTR_ERR(sysfs_root);
- err = sysfs_inode_init();
- if (err)
- goto out_err;
+ sysfs_root_kn = sysfs_root->kn;
err = register_filesystem(&sysfs_fs_type);
- if (!err) {
- sysfs_mnt = kern_mount(&sysfs_fs_type);
- if (IS_ERR(sysfs_mnt)) {
- printk(KERN_ERR "sysfs: could not mount!\n");
- err = PTR_ERR(sysfs_mnt);
- sysfs_mnt = NULL;
- unregister_filesystem(&sysfs_fs_type);
- goto out_err;
- }
- } else
- goto out_err;
-out:
- return err;
-out_err:
- kmem_cache_destroy(sysfs_dir_cachep);
- sysfs_dir_cachep = NULL;
- goto out;
-}
-
-#undef sysfs_get
-struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
-{
- return __sysfs_get(sd);
-}
-EXPORT_SYMBOL_GPL(sysfs_get);
+ if (err) {
+ kernfs_destroy_root(sysfs_root);
+ return err;
+ }
-#undef sysfs_put
-void sysfs_put(struct sysfs_dirent *sd)
-{
- __sysfs_put(sd);
+ return 0;
}
-EXPORT_SYMBOL_GPL(sysfs_put);
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 2dd4507d9ed..aecb15f8455 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -11,107 +11,73 @@
*/
#include <linux/fs.h>
-#include <linux/gfp.h>
-#include <linux/mount.h>
#include <linux/module.h>
#include <linux/kobject.h>
-#include <linux/namei.h>
#include <linux/mutex.h>
#include <linux/security.h>
#include "sysfs.h"
-static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd,
- struct kobject *target,
+static int sysfs_do_create_link_sd(struct kernfs_node *parent,
+ struct kobject *target_kobj,
const char *name, int warn)
{
- struct sysfs_dirent *target_sd = NULL;
- struct sysfs_dirent *sd = NULL;
- struct sysfs_addrm_cxt acxt;
- enum kobj_ns_type ns_type;
- int error;
+ struct kernfs_node *kn, *target = NULL;
- BUG_ON(!name || !parent_sd);
+ BUG_ON(!name || !parent);
- /* target->sd can go away beneath us but is protected with
- * sysfs_assoc_lock. Fetch target_sd from it.
+ /*
+ * We don't own @target_kobj and it may be removed at any time.
+ * Synchronize using sysfs_symlink_target_lock. See
+ * sysfs_remove_dir() for details.
*/
- spin_lock(&sysfs_assoc_lock);
- if (target->sd)
- target_sd = sysfs_get(target->sd);
- spin_unlock(&sysfs_assoc_lock);
-
- error = -ENOENT;
- if (!target_sd)
- goto out_put;
-
- error = -ENOMEM;
- sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK);
- if (!sd)
- goto out_put;
-
- ns_type = sysfs_ns_type(parent_sd);
- if (ns_type)
- sd->s_ns = target->ktype->namespace(target);
- sd->s_symlink.target_sd = target_sd;
- target_sd = NULL; /* reference is now owned by the symlink */
-
- sysfs_addrm_start(&acxt, parent_sd);
- /* Symlinks must be between directories with the same ns_type */
- if (!ns_type ||
- (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) {
- if (warn)
- error = sysfs_add_one(&acxt, sd);
- else
- error = __sysfs_add_one(&acxt, sd);
- } else {
- error = -EINVAL;
- WARN(1, KERN_WARNING
- "sysfs: symlink across ns_types %s/%s -> %s/%s\n",
- parent_sd->s_name,
- sd->s_name,
- sd->s_symlink.target_sd->s_parent->s_name,
- sd->s_symlink.target_sd->s_name);
+ spin_lock(&sysfs_symlink_target_lock);
+ if (target_kobj->sd) {
+ target = target_kobj->sd;
+ kernfs_get(target);
}
- sysfs_addrm_finish(&acxt);
+ spin_unlock(&sysfs_symlink_target_lock);
+
+ if (!target)
+ return -ENOENT;
- if (error)
- goto out_put;
+ kn = kernfs_create_link(parent, name, target);
+ kernfs_put(target);
- return 0;
+ if (!IS_ERR(kn))
+ return 0;
- out_put:
- sysfs_put(target_sd);
- sysfs_put(sd);
- return error;
+ if (warn && PTR_ERR(kn) == -EEXIST)
+ sysfs_warn_dup(parent, name);
+ return PTR_ERR(kn);
}
/**
* sysfs_create_link_sd - create symlink to a given object.
- * @sd: directory we're creating the link in.
+ * @kn: directory we're creating the link in.
* @target: object we're pointing to.
* @name: name of the symlink.
*/
-int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target,
+int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target,
const char *name)
{
- return sysfs_do_create_link_sd(sd, target, name, 1);
+ return sysfs_do_create_link_sd(kn, target, name, 1);
}
static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
const char *name, int warn)
{
- struct sysfs_dirent *parent_sd = NULL;
+ struct kernfs_node *parent = NULL;
if (!kobj)
- parent_sd = &sysfs_root;
+ parent = sysfs_root_kn;
else
- parent_sd = kobj->sd;
+ parent = kobj->sd;
- if (!parent_sd)
+ if (!parent)
return -EFAULT;
- return sysfs_do_create_link_sd(parent_sd, target, name, warn);
+ return sysfs_do_create_link_sd(parent, target, name, warn);
}
/**
@@ -155,11 +121,17 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
const char *name)
{
const void *ns = NULL;
- spin_lock(&sysfs_assoc_lock);
- if (targ->sd && sysfs_ns_type(kobj->sd))
- ns = targ->sd->s_ns;
- spin_unlock(&sysfs_assoc_lock);
- sysfs_hash_and_remove(kobj->sd, ns, name);
+
+ /*
+ * We don't own @target and it may be removed at any time.
+ * Synchronize using sysfs_symlink_target_lock. See
+ * sysfs_remove_dir() for details.
+ */
+ spin_lock(&sysfs_symlink_target_lock);
+ if (targ->sd && kernfs_ns_enabled(kobj->sd))
+ ns = targ->sd->ns;
+ spin_unlock(&sysfs_symlink_target_lock);
+ kernfs_remove_by_name_ns(kobj->sd, name, ns);
}
/**
@@ -169,156 +141,57 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
*/
void sysfs_remove_link(struct kobject *kobj, const char *name)
{
- struct sysfs_dirent *parent_sd = NULL;
+ struct kernfs_node *parent = NULL;
if (!kobj)
- parent_sd = &sysfs_root;
+ parent = sysfs_root_kn;
else
- parent_sd = kobj->sd;
+ parent = kobj->sd;
- sysfs_hash_and_remove(parent_sd, NULL, name);
+ kernfs_remove_by_name(parent, name);
}
EXPORT_SYMBOL_GPL(sysfs_remove_link);
/**
- * sysfs_rename_link - rename symlink in object's directory.
+ * sysfs_rename_link_ns - rename symlink in object's directory.
* @kobj: object we're acting for.
* @targ: object we're pointing to.
* @old: previous name of the symlink.
* @new: new name of the symlink.
+ * @new_ns: new namespace of the symlink.
*
* A helper function for the common rename symlink idiom.
*/
-int sysfs_rename_link(struct kobject *kobj, struct kobject *targ,
- const char *old, const char *new)
+int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ,
+ const char *old, const char *new, const void *new_ns)
{
- struct sysfs_dirent *parent_sd, *sd = NULL;
- const void *old_ns = NULL, *new_ns = NULL;
+ struct kernfs_node *parent, *kn = NULL;
+ const void *old_ns = NULL;
int result;
if (!kobj)
- parent_sd = &sysfs_root;
+ parent = sysfs_root_kn;
else
- parent_sd = kobj->sd;
+ parent = kobj->sd;
if (targ->sd)
- old_ns = targ->sd->s_ns;
+ old_ns = targ->sd->ns;
result = -ENOENT;
- sd = sysfs_get_dirent(parent_sd, old_ns, old);
- if (!sd)
+ kn = kernfs_find_and_get_ns(parent, old, old_ns);
+ if (!kn)
goto out;
result = -EINVAL;
- if (sysfs_type(sd) != SYSFS_KOBJ_LINK)
+ if (kernfs_type(kn) != KERNFS_LINK)
goto out;
- if (sd->s_symlink.target_sd->s_dir.kobj != targ)
+ if (kn->symlink.target_kn->priv != targ)
goto out;
- if (sysfs_ns_type(parent_sd))
- new_ns = targ->ktype->namespace(targ);
-
- result = sysfs_rename(sd, parent_sd, new_ns, new);
+ result = kernfs_rename_ns(kn, parent, new, new_ns);
out:
- sysfs_put(sd);
+ kernfs_put(kn);
return result;
}
-EXPORT_SYMBOL_GPL(sysfs_rename_link);
-
-static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
- struct sysfs_dirent *target_sd, char *path)
-{
- struct sysfs_dirent *base, *sd;
- char *s = path;
- int len = 0;
-
- /* go up to the root, stop at the base */
- base = parent_sd;
- while (base->s_parent) {
- sd = target_sd->s_parent;
- while (sd->s_parent && base != sd)
- sd = sd->s_parent;
-
- if (base == sd)
- break;
-
- strcpy(s, "../");
- s += 3;
- base = base->s_parent;
- }
-
- /* determine end of target string for reverse fillup */
- sd = target_sd;
- while (sd->s_parent && sd != base) {
- len += strlen(sd->s_name) + 1;
- sd = sd->s_parent;
- }
-
- /* check limits */
- if (len < 2)
- return -EINVAL;
- len--;
- if ((s - path) + len > PATH_MAX)
- return -ENAMETOOLONG;
-
- /* reverse fillup of target string from target to base */
- sd = target_sd;
- while (sd->s_parent && sd != base) {
- int slen = strlen(sd->s_name);
-
- len -= slen;
- strncpy(s + len, sd->s_name, slen);
- if (len)
- s[--len] = '/';
-
- sd = sd->s_parent;
- }
-
- return 0;
-}
-
-static int sysfs_getlink(struct dentry *dentry, char *path)
-{
- struct sysfs_dirent *sd = dentry->d_fsdata;
- struct sysfs_dirent *parent_sd = sd->s_parent;
- struct sysfs_dirent *target_sd = sd->s_symlink.target_sd;
- int error;
-
- mutex_lock(&sysfs_mutex);
- error = sysfs_get_target_path(parent_sd, target_sd, path);
- mutex_unlock(&sysfs_mutex);
-
- return error;
-}
-
-static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
- int error = -ENOMEM;
- unsigned long page = get_zeroed_page(GFP_KERNEL);
- if (page) {
- error = sysfs_getlink(dentry, (char *) page);
- if (error < 0)
- free_page((unsigned long)page);
- }
- nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
- return NULL;
-}
-
-static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd,
- void *cookie)
-{
- char *page = nd_get_link(nd);
- if (!IS_ERR(page))
- free_page((unsigned long)page);
-}
-
-const struct inode_operations sysfs_symlink_inode_operations = {
- .setxattr = sysfs_setxattr,
- .readlink = generic_readlink,
- .follow_link = sysfs_follow_link,
- .put_link = sysfs_put_link,
- .setattr = sysfs_setattr,
- .getattr = sysfs_getattr,
- .permission = sysfs_permission,
-};
+EXPORT_SYMBOL_GPL(sysfs_rename_link_ns);
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index b6deca3e301..0e2f1cccb81 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -8,239 +8,36 @@
* This file is released under the GPLv2.
*/
-#include <linux/lockdep.h>
-#include <linux/kobject_ns.h>
-#include <linux/fs.h>
-#include <linux/rbtree.h>
+#ifndef __SYSFS_INTERNAL_H
+#define __SYSFS_INTERNAL_H
-struct sysfs_open_dirent;
-
-/* type-specific structures for sysfs_dirent->s_* union members */
-struct sysfs_elem_dir {
- struct kobject *kobj;
-
- unsigned long subdirs;
- /* children rbtree starts here and goes through sd->s_rb */
- struct rb_root children;
-};
-
-struct sysfs_elem_symlink {
- struct sysfs_dirent *target_sd;
-};
-
-struct sysfs_elem_attr {
- struct attribute *attr;
- struct sysfs_open_dirent *open;
-};
-
-struct sysfs_elem_bin_attr {
- struct bin_attribute *bin_attr;
- struct hlist_head buffers;
-};
-
-struct sysfs_inode_attrs {
- struct iattr ia_iattr;
- void *ia_secdata;
- u32 ia_secdata_len;
-};
-
-/*
- * sysfs_dirent - the building block of sysfs hierarchy. Each and
- * every sysfs node is represented by single sysfs_dirent.
- *
- * As long as s_count reference is held, the sysfs_dirent itself is
- * accessible. Dereferencing s_elem or any other outer entity
- * requires s_active reference.
- */
-struct sysfs_dirent {
- atomic_t s_count;
- atomic_t s_active;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- struct lockdep_map dep_map;
-#endif
- struct sysfs_dirent *s_parent;
- const char *s_name;
-
- struct rb_node s_rb;
-
- union {
- struct completion *completion;
- struct sysfs_dirent *removed_list;
- } u;
-
- const void *s_ns; /* namespace tag */
- unsigned int s_hash; /* ns + name hash */
- union {
- struct sysfs_elem_dir s_dir;
- struct sysfs_elem_symlink s_symlink;
- struct sysfs_elem_attr s_attr;
- struct sysfs_elem_bin_attr s_bin_attr;
- };
-
- unsigned short s_flags;
- umode_t s_mode;
- unsigned int s_ino;
- struct sysfs_inode_attrs *s_iattr;
-};
-
-#define SD_DEACTIVATED_BIAS INT_MIN
-
-#define SYSFS_TYPE_MASK 0x00ff
-#define SYSFS_DIR 0x0001
-#define SYSFS_KOBJ_ATTR 0x0002
-#define SYSFS_KOBJ_BIN_ATTR 0x0004
-#define SYSFS_KOBJ_LINK 0x0008
-#define SYSFS_COPY_NAME (SYSFS_DIR | SYSFS_KOBJ_LINK)
-#define SYSFS_ACTIVE_REF (SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
-
-/* identify any namespace tag on sysfs_dirents */
-#define SYSFS_NS_TYPE_MASK 0xf00
-#define SYSFS_NS_TYPE_SHIFT 8
-
-#define SYSFS_FLAG_MASK ~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK)
-#define SYSFS_FLAG_REMOVED 0x02000
-
-static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
-{
- return sd->s_flags & SYSFS_TYPE_MASK;
-}
-
-/*
- * Return any namespace tags on this dirent.
- * enum kobj_ns_type is defined in linux/kobject.h
- */
-static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd)
-{
- return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
-}
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-#define sysfs_dirent_init_lockdep(sd) \
-do { \
- struct attribute *attr = sd->s_attr.attr; \
- struct lock_class_key *key = attr->key; \
- if (!key) \
- key = &attr->skey; \
- \
- lockdep_init_map(&sd->dep_map, "s_active", key, 0); \
-} while (0)
-#else
-#define sysfs_dirent_init_lockdep(sd) do {} while (0)
-#endif
-
-/*
- * Context structure to be used while adding/removing nodes.
- */
-struct sysfs_addrm_cxt {
- struct sysfs_dirent *parent_sd;
- struct sysfs_dirent *removed;
-};
+#include <linux/sysfs.h>
/*
* mount.c
*/
-
-/*
- * Each sb is associated with a set of namespace tags (i.e.
- * the network namespace of the task which mounted this sysfs
- * instance).
- */
-struct sysfs_super_info {
- void *ns[KOBJ_NS_TYPES];
-};
-#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
-extern struct sysfs_dirent sysfs_root;
-extern struct kmem_cache *sysfs_dir_cachep;
+extern struct kernfs_node *sysfs_root_kn;
/*
* dir.c
*/
-extern struct mutex sysfs_mutex;
-extern spinlock_t sysfs_assoc_lock;
-extern const struct dentry_operations sysfs_dentry_ops;
-
-extern const struct file_operations sysfs_dir_operations;
-extern const struct inode_operations sysfs_dir_inode_operations;
-
-struct dentry *sysfs_get_dentry(struct sysfs_dirent *sd);
-struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
-void sysfs_put_active(struct sysfs_dirent *sd);
-void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
- struct sysfs_dirent *parent_sd);
-int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
-int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
-void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
-void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
-
-struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
- const void *ns,
- const unsigned char *name);
-struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd,
- const void *ns,
- const unsigned char *name);
-struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
-
-void release_sysfs_dirent(struct sysfs_dirent *sd);
+extern spinlock_t sysfs_symlink_target_lock;
-int sysfs_create_subdir(struct kobject *kobj, const char *name,
- struct sysfs_dirent **p_sd);
-void sysfs_remove_subdir(struct sysfs_dirent *sd);
-
-int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd,
- const void *ns, const char *new_name);
-
-static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
-{
- if (sd) {
- WARN_ON(!atomic_read(&sd->s_count));
- atomic_inc(&sd->s_count);
- }
- return sd;
-}
-#define sysfs_get(sd) __sysfs_get(sd)
-
-static inline void __sysfs_put(struct sysfs_dirent *sd)
-{
- if (sd && atomic_dec_and_test(&sd->s_count))
- release_sysfs_dirent(sd);
-}
-#define sysfs_put(sd) __sysfs_put(sd)
-
-/*
- * inode.c
- */
-struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
-void sysfs_evict_inode(struct inode *inode);
-int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
-int sysfs_permission(struct inode *inode, int mask);
-int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
-int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
- struct kstat *stat);
-int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags);
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns,
- const char *name);
-int sysfs_inode_init(void);
+void sysfs_warn_dup(struct kernfs_node *parent, const char *name);
/*
* file.c
*/
-extern const struct file_operations sysfs_file_operations;
-
-int sysfs_add_file(struct sysfs_dirent *dir_sd,
- const struct attribute *attr, int type);
-
-int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
- const struct attribute *attr, int type, umode_t amode);
-/*
- * bin.c
- */
-extern const struct file_operations bin_fops;
-void unmap_bin_file(struct sysfs_dirent *attr_sd);
+int sysfs_add_file(struct kernfs_node *parent,
+ const struct attribute *attr, bool is_bin);
+int sysfs_add_file_mode_ns(struct kernfs_node *parent,
+ const struct attribute *attr, bool is_bin,
+ umode_t amode, const void *ns);
/*
* symlink.c
*/
-extern const struct inode_operations sysfs_symlink_inode_operations;
-int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target,
+int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target,
const char *name);
+
+#endif /* __SYSFS_INTERNAL_H */
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 9d4dc683179..b00811c75b2 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -21,10 +21,10 @@
*/
const struct file_operations sysv_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.fsync = generic_file_fsync,
.splice_read = generic_file_splice_read,
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index c327d4ee123..88956309cc8 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -60,6 +60,7 @@ static int sysv_remount(struct super_block *sb, int *flags, char *data)
{
struct sysv_sb_info *sbi = SYSV_SB(sb);
+ sync_filesystem(sb);
if (sbi->s_forced_ro)
*flags |= MS_RDONLY;
return 0;
@@ -295,7 +296,7 @@ int sysv_sync_inode(struct inode *inode)
static void sysv_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
if (!inode->i_nlink) {
inode->i_size = 0;
sysv_truncate(inode);
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 929312180dd..0013142c047 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -317,6 +317,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
(clockid != CLOCK_MONOTONIC &&
clockid != CLOCK_REALTIME &&
clockid != CLOCK_REALTIME_ALARM &&
+ clockid != CLOCK_BOOTTIME &&
clockid != CLOCK_BOOTTIME_ALARM))
return -EINVAL;
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index e8e01d74dc0..eb997e9c4ab 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -437,7 +437,6 @@ static int calc_dd_growth(const struct ubifs_info *c,
*/
int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
{
- int uninitialized_var(cmt_retries), uninitialized_var(wb_retries);
int err, idx_growth, data_growth, dd_growth, retried = 0;
ubifs_assert(req->new_page <= 1);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 6e025e02ffd..177b0152fef 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -745,8 +745,10 @@ void ubifs_dump_lprops(struct ubifs_info *c)
for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) {
err = ubifs_read_one_lp(c, lnum, &lp);
- if (err)
+ if (err) {
ubifs_err("cannot read lprops for LEB %d", lnum);
+ continue;
+ }
ubifs_dump_lprop(c, &lp);
}
@@ -2118,26 +2120,10 @@ out_free:
*/
static void free_inodes(struct fsck_data *fsckd)
{
- struct rb_node *this = fsckd->inodes.rb_node;
- struct fsck_inode *fscki;
+ struct fsck_inode *fscki, *n;
- while (this) {
- if (this->rb_left)
- this = this->rb_left;
- else if (this->rb_right)
- this = this->rb_right;
- else {
- fscki = rb_entry(this, struct fsck_inode, rb);
- this = rb_parent(this);
- if (this) {
- if (this->rb_left == &fscki->rb)
- this->rb_left = NULL;
- else
- this->rb_right = NULL;
- }
- kfree(fscki);
- }
- }
+ rbtree_postorder_for_each_entry_safe(fscki, n, &fsckd->inodes, rb)
+ kfree(fscki);
}
/**
@@ -2563,9 +2549,9 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf,
unsigned int from, to, ffs = chance(1, 2);
unsigned char *p = (void *)buf;
- from = prandom_u32() % (len + 1);
- /* Corruption may only span one max. write unit */
- to = min(len, ALIGN(from, c->max_write_size));
+ from = prandom_u32() % len;
+ /* Corruption span max to end of write unit */
+ to = min(len, ALIGN(from + 1, c->max_write_size));
ubifs_warn("filled bytes %u-%u with %s", from, to - 1,
ffs ? "0xFFs" : "random data");
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 123c79b7261..b5b593c4527 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -903,8 +903,9 @@ static int do_writepage(struct page *page, int len)
struct ubifs_info *c = inode->i_sb->s_fs_info;
#ifdef UBIFS_DEBUG
+ struct ubifs_inode *ui = ubifs_inode(inode);
spin_lock(&ui->ui_lock);
- ubifs_assert(page->index <= ui->synced_i_size << PAGE_CACHE_SIZE);
+ ubifs_assert(page->index <= ui->synced_i_size >> PAGE_CACHE_SHIFT);
spin_unlock(&ui->ui_lock);
#endif
@@ -1363,17 +1364,17 @@ static inline int mctime_update_needed(const struct inode *inode,
/**
* update_ctime - update mtime and ctime of an inode.
- * @c: UBIFS file-system description object
* @inode: inode to update
*
* This function updates mtime and ctime of the inode if it is not equivalent to
* current time. Returns zero in case of success and a negative error code in
* case of failure.
*/
-static int update_mctime(struct ubifs_info *c, struct inode *inode)
+static int update_mctime(struct inode *inode)
{
struct timespec now = ubifs_current_time(inode);
struct ubifs_inode *ui = ubifs_inode(inode);
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
if (mctime_update_needed(inode, &now)) {
int err, release;
@@ -1396,18 +1397,13 @@ static int update_mctime(struct ubifs_info *c, struct inode *inode)
return 0;
}
-static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+static ssize_t ubifs_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
- int err;
- struct inode *inode = iocb->ki_filp->f_mapping->host;
- struct ubifs_info *c = inode->i_sb->s_fs_info;
-
- err = update_mctime(c, inode);
+ int err = update_mctime(file_inode(iocb->ki_filp));
if (err)
return err;
- return generic_file_aio_write(iocb, iov, nr_segs, pos);
+ return generic_file_write_iter(iocb, from);
}
static int ubifs_set_page_dirty(struct page *page)
@@ -1525,8 +1521,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma,
}
wait_for_stable_page(page);
- unlock_page(page);
- return 0;
+ return VM_FAULT_LOCKED;
out_unlock:
unlock_page(page);
@@ -1538,6 +1533,7 @@ out_unlock:
static const struct vm_operations_struct ubifs_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = ubifs_vm_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
@@ -1581,15 +1577,15 @@ const struct inode_operations ubifs_symlink_inode_operations = {
const struct file_operations ubifs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = generic_file_aio_read,
- .aio_write = ubifs_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = ubifs_write_iter,
.mmap = ubifs_file_mmap,
.fsync = ubifs_fsync,
.unlocked_ioctl = ubifs_ioctl,
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = iter_file_splice_write,
#ifdef CONFIG_COMPAT
.compat_ioctl = ubifs_compat_ioctl,
#endif
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 76ca53cd3ee..9718da86ad0 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -668,8 +668,7 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway)
ubifs_assert(!wbuf->used);
for (i = 0; ; i++) {
- int space_before = c->leb_size - wbuf->offs - wbuf->used;
- int space_after;
+ int space_before, space_after;
cond_resched();
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index e18b9889a51..2290d586672 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -988,30 +988,32 @@ int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
return err;
if (type != ch->node_type) {
- ubifs_err("bad node type (%d but expected %d)",
- ch->node_type, type);
+ ubifs_errc(c, "bad node type (%d but expected %d)",
+ ch->node_type, type);
goto out;
}
err = ubifs_check_node(c, buf, lnum, offs, 0, 0);
if (err) {
- ubifs_err("expected node type %d", type);
+ ubifs_errc(c, "expected node type %d", type);
return err;
}
l = le32_to_cpu(ch->len);
if (l != len) {
- ubifs_err("bad node length %d, expected %d", l, len);
+ ubifs_errc(c, "bad node length %d, expected %d", l, len);
goto out;
}
return 0;
out:
- ubifs_err("bad node at LEB %d:%d, LEB mapping status %d", lnum, offs,
- ubi_is_mapped(c->ubi, lnum));
- ubifs_dump_node(c, buf);
- dump_stack();
+ ubifs_errc(c, "bad node at LEB %d:%d, LEB mapping status %d", lnum,
+ offs, ubi_is_mapped(c->ubi, lnum));
+ if (!c->probing) {
+ ubifs_dump_node(c, buf);
+ dump_stack();
+ }
return -EINVAL;
}
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 36bd4efd081..a902c5919e4 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -574,27 +574,10 @@ static int done_already(struct rb_root *done_tree, int lnum)
*/
static void destroy_done_tree(struct rb_root *done_tree)
{
- struct rb_node *this = done_tree->rb_node;
- struct done_ref *dr;
+ struct done_ref *dr, *n;
- while (this) {
- if (this->rb_left) {
- this = this->rb_left;
- continue;
- } else if (this->rb_right) {
- this = this->rb_right;
- continue;
- }
- dr = rb_entry(this, struct done_ref, rb);
- this = rb_parent(this);
- if (this) {
- if (this->rb_left == &dr->rb)
- this->rb_left = NULL;
- else
- this->rb_right = NULL;
- }
+ rbtree_postorder_for_each_entry_safe(dr, n, done_tree, rb)
kfree(dr);
- }
}
/**
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 4b826abb152..45d4e96a6ba 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -460,9 +460,9 @@ static int write_cnodes(struct ubifs_info *c)
* important.
*/
clear_bit(DIRTY_CNODE, &cnode->flags);
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(COW_CNODE, &cnode->flags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
offs += len;
dbg_chk_lpt_sz(c, 1, len);
cnode = cnode->cnext;
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index ba32da3fe08..f1c3e5a1b31 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -815,27 +815,10 @@ static int dbg_find_check_orphan(struct rb_root *root, ino_t inum)
static void dbg_free_check_tree(struct rb_root *root)
{
- struct rb_node *this = root->rb_node;
- struct check_orphan *o;
+ struct check_orphan *o, *n;
- while (this) {
- if (this->rb_left) {
- this = this->rb_left;
- continue;
- } else if (this->rb_right) {
- this = this->rb_right;
- continue;
- }
- o = rb_entry(this, struct check_orphan, rb);
- this = rb_parent(this);
- if (this) {
- if (this->rb_left == &o->rb)
- this->rb_left = NULL;
- else
- this->rb_right = NULL;
- }
+ rbtree_postorder_for_each_entry_safe(o, n, root, rb)
kfree(o);
- }
}
static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr,
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 065096e36ed..c14adb2f420 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -1335,29 +1335,14 @@ static void remove_ino(struct ubifs_info *c, ino_t inum)
*/
void ubifs_destroy_size_tree(struct ubifs_info *c)
{
- struct rb_node *this = c->size_tree.rb_node;
- struct size_entry *e;
+ struct size_entry *e, *n;
- while (this) {
- if (this->rb_left) {
- this = this->rb_left;
- continue;
- } else if (this->rb_right) {
- this = this->rb_right;
- continue;
- }
- e = rb_entry(this, struct size_entry, rb);
+ rbtree_postorder_for_each_entry_safe(e, n, &c->size_tree, rb) {
if (e->inode)
iput(e->inode);
- this = rb_parent(this);
- if (this) {
- if (this->rb_left == &e->rb)
- this->rb_left = NULL;
- else
- this->rb_right = NULL;
- }
kfree(e);
}
+
c->size_tree = RB_ROOT;
}
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index f35135e28e9..9a9fb94a41c 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -128,7 +128,6 @@ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention)
freed = ubifs_destroy_tnc_subtree(znode);
atomic_long_sub(freed, &ubifs_clean_zn_cnt);
atomic_long_sub(freed, &c->clean_zn_cnt);
- ubifs_assert(atomic_long_read(&c->clean_zn_cnt) >= 0);
total_freed += freed;
znode = zprev;
}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 3e4aa7281e0..3904c8574ef 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -351,7 +351,7 @@ static void ubifs_evict_inode(struct inode *inode)
dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode);
ubifs_assert(!atomic_read(&inode->i_count));
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
if (inode->i_nlink)
goto done;
@@ -873,26 +873,10 @@ static void free_orphans(struct ubifs_info *c)
*/
static void free_buds(struct ubifs_info *c)
{
- struct rb_node *this = c->buds.rb_node;
- struct ubifs_bud *bud;
-
- while (this) {
- if (this->rb_left)
- this = this->rb_left;
- else if (this->rb_right)
- this = this->rb_right;
- else {
- bud = rb_entry(this, struct ubifs_bud, rb);
- this = rb_parent(this);
- if (this) {
- if (this->rb_left == &bud->rb)
- this->rb_left = NULL;
- else
- this->rb_right = NULL;
- }
- kfree(bud);
- }
- }
+ struct ubifs_bud *bud, *n;
+
+ rbtree_postorder_for_each_entry_safe(bud, n, &c->buds, rb)
+ kfree(bud);
}
/**
@@ -1165,6 +1149,9 @@ static int mount_ubifs(struct ubifs_info *c)
size_t sz;
c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY);
+ /* Suppress error messages while probing if MS_SILENT is set */
+ c->probing = !!(c->vfs_sb->s_flags & MS_SILENT);
+
err = init_constants_early(c);
if (err)
return err;
@@ -1230,6 +1217,8 @@ static int mount_ubifs(struct ubifs_info *c)
if (err)
goto out_free;
+ c->probing = 0;
+
/*
* Make sure the compressor which is set as default in the superblock
* or overridden by mount options is actually compiled in.
@@ -1572,7 +1561,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
if (c->space_fixup) {
err = ubifs_fixup_free_space(c);
if (err)
- return err;
+ goto out;
}
err = check_free_space(c);
@@ -1630,8 +1619,10 @@ static int ubifs_remount_rw(struct ubifs_info *c)
}
c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL);
- if (!c->write_reserve_buf)
+ if (!c->write_reserve_buf) {
+ err = -ENOMEM;
goto out;
+ }
err = ubifs_lpt_init(c, 0, 1);
if (err)
@@ -1841,6 +1832,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
int err;
struct ubifs_info *c = sb->s_fs_info;
+ sync_filesystem(sb);
dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags);
err = ubifs_parse_options(c, data, 1);
@@ -2064,8 +2056,10 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
}
sb->s_root = d_make_root(root);
- if (!sb->s_root)
+ if (!sb->s_root) {
+ err = -ENOMEM;
goto out_umount;
+ }
mutex_unlock(&c->umount_mutex);
return 0;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 349f31a30f4..8a40cf9c02d 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -178,27 +178,11 @@ static int ins_clr_old_idx_znode(struct ubifs_info *c,
*/
void destroy_old_idx(struct ubifs_info *c)
{
- struct rb_node *this = c->old_idx.rb_node;
- struct ubifs_old_idx *old_idx;
+ struct ubifs_old_idx *old_idx, *n;
- while (this) {
- if (this->rb_left) {
- this = this->rb_left;
- continue;
- } else if (this->rb_right) {
- this = this->rb_right;
- continue;
- }
- old_idx = rb_entry(this, struct ubifs_old_idx, rb);
- this = rb_parent(this);
- if (this) {
- if (this->rb_left == &old_idx->rb)
- this->rb_left = NULL;
- else
- this->rb_right = NULL;
- }
+ rbtree_postorder_for_each_entry_safe(old_idx, n, &c->old_idx, rb)
kfree(old_idx);
- }
+
c->old_idx = RB_ROOT;
}
@@ -2875,10 +2859,11 @@ void ubifs_tnc_close(struct ubifs_info *c)
{
tnc_destroy_cnext(c);
if (c->zroot.znode) {
- long n;
+ long n, freed;
- ubifs_destroy_tnc_subtree(c->zroot.znode);
n = atomic_long_read(&c->clean_zn_cnt);
+ freed = ubifs_destroy_tnc_subtree(c->zroot.znode);
+ ubifs_assert(freed == n);
atomic_long_sub(n, &ubifs_clean_zn_cnt);
}
kfree(c->gap_lebs);
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 52a6559275c..3600994f841 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -895,9 +895,9 @@ static int write_index(struct ubifs_info *c)
* the reason for the second barrier.
*/
clear_bit(DIRTY_ZNODE, &znode->flags);
- smp_mb__before_clear_bit();
+ smp_mb__before_atomic();
clear_bit(COW_ZNODE, &znode->flags);
- smp_mb__after_clear_bit();
+ smp_mb__after_atomic();
/*
* We have marked the znode as clean but have not updated the
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index e8c8cfe1435..c1f71fe17cc 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -51,6 +51,15 @@
#define ubifs_warn(fmt, ...) \
pr_warn("UBIFS warning (pid %d): %s: " fmt "\n", \
current->pid, __func__, ##__VA_ARGS__)
+/*
+ * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description
+ * object as an argument.
+ */
+#define ubifs_errc(c, fmt, ...) \
+ do { \
+ if (!(c)->probing) \
+ ubifs_err(fmt, ##__VA_ARGS__); \
+ } while (0)
/* UBIFS file system VFS magic number */
#define UBIFS_SUPER_MAGIC 0x24051905
@@ -1209,6 +1218,7 @@ struct ubifs_debug_info;
* @need_recovery: %1 if the file-system needs recovery
* @replaying: %1 during journal replay
* @mounting: %1 while mounting
+ * @probing: %1 while attempting to mount if MS_SILENT mount flag is set
* @remounting_rw: %1 while re-mounting from R/O mode to R/W mode
* @replay_list: temporary list used during journal replay
* @replay_buds: list of buds to replay
@@ -1441,6 +1451,7 @@ struct ubifs_info {
unsigned int replaying:1;
unsigned int mounting:1;
unsigned int remounting_rw:1;
+ unsigned int probing:1;
struct list_head replay_list;
struct list_head replay_buds;
unsigned long long cs_sqnum;
diff --git a/fs/udf/file.c b/fs/udf/file.c
index c02a27a19c6..d80738fdf42 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -119,8 +119,8 @@ static int udf_adinicb_write_end(struct file *file,
}
static ssize_t udf_adinicb_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+ struct iov_iter *iter,
+ loff_t offset)
{
/* Fallback to buffered I/O. */
return 0;
@@ -134,8 +134,7 @@ const struct address_space_operations udf_adinicb_aops = {
.direct_IO = udf_adinicb_direct_IO,
};
-static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t ppos)
+static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
ssize_t retval;
struct file *file = iocb->ki_filp;
@@ -144,18 +143,20 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
size_t count = iocb->ki_nbytes;
struct udf_inode_info *iinfo = UDF_I(inode);
+ mutex_lock(&inode->i_mutex);
down_write(&iinfo->i_data_sem);
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
if (file->f_flags & O_APPEND)
pos = inode->i_size;
else
- pos = ppos;
+ pos = iocb->ki_pos;
if (inode->i_sb->s_blocksize <
(udf_file_entry_alloc_offset(inode) +
pos + count)) {
err = udf_expand_file_adinicb(inode);
if (err) {
+ mutex_unlock(&inode->i_mutex);
udf_debug("udf_expand_adinicb: err=%d\n", err);
return err;
}
@@ -169,9 +170,17 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
} else
up_write(&iinfo->i_data_sem);
- retval = generic_file_aio_write(iocb, iov, nr_segs, ppos);
- if (retval > 0)
+ retval = __generic_file_write_iter(iocb, from);
+ mutex_unlock(&inode->i_mutex);
+
+ if (retval > 0) {
+ ssize_t err;
+
mark_inode_dirty(inode);
+ err = generic_write_sync(file, iocb->ki_pos - retval, retval);
+ if (err < 0)
+ retval = err;
+ }
return retval;
}
@@ -242,13 +251,13 @@ static int udf_release_file(struct inode *inode, struct file *filp)
}
const struct file_operations udf_file_operations = {
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
.unlocked_ioctl = udf_ioctl,
.open = generic_file_open,
.mmap = generic_file_mmap,
- .write = do_sync_write,
- .aio_write = udf_file_aio_write,
+ .write = new_sync_write,
+ .write_iter = udf_file_write_iter,
.release = udf_release_file,
.fsync = generic_file_fsync,
.splice_read = generic_file_splice_read,
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 062b7925bca..236cd48184c 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -146,8 +146,8 @@ void udf_evict_inode(struct inode *inode)
want_delete = 1;
udf_setsize(inode, 0);
udf_update_inode(inode, IS_SYNC(inode));
- } else
- truncate_inode_pages(&inode->i_data, 0);
+ }
+ truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode);
clear_inode(inode);
if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
@@ -217,18 +217,18 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
}
static ssize_t udf_direct_IO(int rw, struct kiocb *iocb,
- const struct iovec *iov,
- loff_t offset, unsigned long nr_segs)
+ struct iov_iter *iter,
+ loff_t offset)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
+ size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
- udf_get_block);
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, udf_get_block);
if (unlikely(ret < 0 && (rw & WRITE)))
- udf_write_failed(mapping, offset + iov_length(iov, nr_segs));
+ udf_write_failed(mapping, offset + count);
return ret;
}
@@ -265,6 +265,7 @@ int udf_expand_file_adinicb(struct inode *inode)
.nr_to_write = 1,
};
+ WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
if (!iinfo->i_lenAlloc) {
if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 5f6fc17d6bc..9737cba1357 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1010,6 +1010,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
else
udf_truncate_tail_extent(inode);
mark_inode_dirty(inode);
+ up_write(&iinfo->i_data_sem);
fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
if (!fi)
@@ -1023,7 +1024,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
mark_inode_dirty(dir);
- up_write(&iinfo->i_data_sem);
if (fibh.sbh != fibh.ebh)
brelse(fibh.ebh);
brelse(fibh.sbh);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 91219385691..3286db047a4 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -76,6 +76,9 @@
#define UDF_DEFAULT_BLOCKSIZE 2048
+#define VSD_FIRST_SECTOR_OFFSET 32768
+#define VSD_MAX_SECTOR_OFFSET 0x800000
+
enum { UDF_MAX_LINKS = 0xffff };
/* These are the "meat" - everything else is stuffing */
@@ -172,7 +175,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
udf_inode_cachep = kmem_cache_create("udf_inode_cache",
sizeof(struct udf_inode_info),
@@ -502,6 +505,7 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
while ((p = strsep(&options, ",")) != NULL) {
substring_t args[MAX_OPT_ARGS];
int token;
+ unsigned n;
if (!*p)
continue;
@@ -513,7 +517,10 @@ static int udf_parse_options(char *options, struct udf_options *uopt,
case Opt_bs:
if (match_int(&args[0], &option))
return 0;
- uopt->blocksize = option;
+ n = option;
+ if (n != 512 && n != 1024 && n != 2048 && n != 4096)
+ return 0;
+ uopt->blocksize = n;
uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
break;
case Opt_unhide:
@@ -643,6 +650,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
int error = 0;
struct logicalVolIntegrityDescImpUse *lvidiu = udf_sb_lvidiu(sb);
+ sync_filesystem(sb);
if (lvidiu) {
int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev);
if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY))
@@ -685,7 +693,7 @@ out_unlock:
static loff_t udf_check_vsd(struct super_block *sb)
{
struct volStructDesc *vsd = NULL;
- loff_t sector = 32768;
+ loff_t sector = VSD_FIRST_SECTOR_OFFSET;
int sectorsize;
struct buffer_head *bh = NULL;
int nsr02 = 0;
@@ -703,8 +711,18 @@ static loff_t udf_check_vsd(struct super_block *sb)
udf_debug("Starting at sector %u (%ld byte sectors)\n",
(unsigned int)(sector >> sb->s_blocksize_bits),
sb->s_blocksize);
- /* Process the sequence (if applicable) */
- for (; !nsr02 && !nsr03; sector += sectorsize) {
+ /* Process the sequence (if applicable). The hard limit on the sector
+ * offset is arbitrary, hopefully large enough so that all valid UDF
+ * filesystems will be recognised. There is no mention of an upper
+ * bound to the size of the volume recognition area in the standard.
+ * The limit will prevent the code to read all the sectors of a
+ * specially crafted image (like a bluray disc full of CD001 sectors),
+ * potentially causing minutes or even hours of uninterruptible I/O
+ * activity. This actually happened with uninitialised SSD partitions
+ * (all 0xFF) before the check for the limit and all valid IDs were
+ * added */
+ for (; !nsr02 && !nsr03 && sector < VSD_MAX_SECTOR_OFFSET;
+ sector += sectorsize) {
/* Read a block */
bh = udf_tread(sb, sector >> sb->s_blocksize_bits);
if (!bh)
@@ -714,10 +732,7 @@ static loff_t udf_check_vsd(struct super_block *sb)
vsd = (struct volStructDesc *)(bh->b_data +
(sector & (sb->s_blocksize - 1)));
- if (vsd->stdIdent[0] == 0) {
- brelse(bh);
- break;
- } else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001,
+ if (!strncmp(vsd->stdIdent, VSD_STD_ID_CD001,
VSD_STD_ID_LEN)) {
switch (vsd->structType) {
case 0:
@@ -753,6 +768,17 @@ static loff_t udf_check_vsd(struct super_block *sb)
else if (!strncmp(vsd->stdIdent, VSD_STD_ID_NSR03,
VSD_STD_ID_LEN))
nsr03 = sector;
+ else if (!strncmp(vsd->stdIdent, VSD_STD_ID_BOOT2,
+ VSD_STD_ID_LEN))
+ ; /* nothing */
+ else if (!strncmp(vsd->stdIdent, VSD_STD_ID_CDW02,
+ VSD_STD_ID_LEN))
+ ; /* nothing */
+ else {
+ /* invalid id : end of volume recognition area */
+ brelse(bh);
+ break;
+ }
brelse(bh);
}
@@ -760,7 +786,8 @@ static loff_t udf_check_vsd(struct super_block *sb)
return nsr03;
else if (nsr02)
return nsr02;
- else if (sector - (sbi->s_session << sb->s_blocksize_bits) == 32768)
+ else if (!bh && sector - (sbi->s_session << sb->s_blocksize_bits) ==
+ VSD_FIRST_SECTOR_OFFSET)
return -1;
else
return 0;
@@ -1270,6 +1297,9 @@ static int udf_load_partdesc(struct super_block *sb, sector_t block)
* PHYSICAL partitions are already set up
*/
type1_idx = i;
+#ifdef UDFFS_DEBUG
+ map = NULL; /* supress 'maybe used uninitialized' warning */
+#endif
for (i = 0; i < sbi->s_partitions; i++) {
map = &sbi->s_partmaps[i];
@@ -1891,7 +1921,9 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
return 0;
}
if (nsr_off == -1)
- udf_debug("Failed to read byte 32768. Assuming open disc. Skipping validity check\n");
+ udf_debug("Failed to read sector at offset %d. "
+ "Assuming open disc. Skipping validity "
+ "check\n", VSD_FIRST_SECTOR_OFFSET);
if (!sbi->s_last_block)
sbi->s_last_block = udf_get_last_block(sb);
} else {
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index a7ea492ae66..7bc20809c99 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -24,7 +24,7 @@
#define INVBLOCK ((u64)-1L)
-static u64 ufs_add_fragments(struct inode *, u64, unsigned, unsigned, int *);
+static u64 ufs_add_fragments(struct inode *, u64, unsigned, unsigned);
static u64 ufs_alloc_fragments(struct inode *, unsigned, u64, unsigned, int *);
static u64 ufs_alloccg_block(struct inode *, struct ufs_cg_private_info *, u64, int *);
static u64 ufs_bitmap_search (struct super_block *, struct ufs_cg_private_info *, u64, unsigned);
@@ -38,7 +38,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
{
struct super_block * sb;
struct ufs_sb_private_info * uspi;
- struct ufs_super_block_first * usb1;
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
unsigned cgno, bit, end_bit, bbase, blkmap, i;
@@ -46,7 +45,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
sb = inode->i_sb;
uspi = UFS_SB(sb)->s_uspi;
- usb1 = ubh_get_usb_first(uspi);
UFSD("ENTER, fragment %llu, count %u\n",
(unsigned long long)fragment, count);
@@ -54,7 +52,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
if (ufs_fragnum(fragment) + count > uspi->s_fpg)
ufs_error (sb, "ufs_free_fragments", "internal error");
- mutex_lock(&UFS_SB(sb)->s_lock);
+ lock_ufs(sb);
cgno = ufs_dtog(uspi, fragment);
bit = ufs_dtogd(uspi, fragment);
@@ -118,12 +116,12 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
ubh_sync_block(UCPI_UBH(ucpi));
ufs_mark_sb_dirty(sb);
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
UFSD("EXIT\n");
return;
failed:
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
UFSD("EXIT (FAILED)\n");
return;
}
@@ -135,7 +133,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
{
struct super_block * sb;
struct ufs_sb_private_info * uspi;
- struct ufs_super_block_first * usb1;
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
unsigned overflow, cgno, bit, end_bit, i;
@@ -143,7 +140,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
sb = inode->i_sb;
uspi = UFS_SB(sb)->s_uspi;
- usb1 = ubh_get_usb_first(uspi);
UFSD("ENTER, fragment %llu, count %u\n",
(unsigned long long)fragment, count);
@@ -155,7 +151,7 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
goto failed;
}
- mutex_lock(&UFS_SB(sb)->s_lock);
+ lock_ufs(sb);
do_more:
overflow = 0;
@@ -215,12 +211,12 @@ do_more:
}
ufs_mark_sb_dirty(sb);
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
UFSD("EXIT\n");
return;
failed_unlock:
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
failed:
UFSD("EXIT (FAILED)\n");
return;
@@ -361,7 +357,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
usb1 = ubh_get_usb_first(uspi);
*err = -ENOSPC;
- mutex_lock(&UFS_SB(sb)->s_lock);
+ lock_ufs(sb);
tmp = ufs_data_ptr_to_cpu(sb, p);
if (count + ufs_fragnum(fragment) > uspi->s_fpb) {
@@ -382,19 +378,19 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
"fragment %llu, tmp %llu\n",
(unsigned long long)fragment,
(unsigned long long)tmp);
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
return INVBLOCK;
}
if (fragment < UFS_I(inode)->i_lastfrag) {
UFSD("EXIT (ALREADY ALLOCATED)\n");
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
return 0;
}
}
else {
if (tmp) {
UFSD("EXIT (ALREADY ALLOCATED)\n");
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
return 0;
}
}
@@ -403,7 +399,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
* There is not enough space for user on the device
*/
if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) {
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
UFSD("EXIT (FAILED)\n");
return 0;
}
@@ -428,7 +424,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
ufs_clear_frags(inode, result + oldcount,
newcount - oldcount, locked_page != NULL);
}
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
UFSD("EXIT, result %llu\n", (unsigned long long)result);
return result;
}
@@ -436,14 +432,14 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
/*
* resize block
*/
- result = ufs_add_fragments (inode, tmp, oldcount, newcount, err);
+ result = ufs_add_fragments(inode, tmp, oldcount, newcount);
if (result) {
*err = 0;
UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag,
fragment + count);
ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
locked_page != NULL);
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
UFSD("EXIT, result %llu\n", (unsigned long long)result);
return result;
}
@@ -481,7 +477,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
*err = 0;
UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag,
fragment + count);
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
if (newcount < request)
ufs_free_fragments (inode, result + newcount, request - newcount);
ufs_free_fragments (inode, tmp, oldcount);
@@ -489,17 +485,16 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
return result;
}
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
UFSD("EXIT (FAILED)\n");
return 0;
}
static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
- unsigned oldcount, unsigned newcount, int *err)
+ unsigned oldcount, unsigned newcount)
{
struct super_block * sb;
struct ufs_sb_private_info * uspi;
- struct ufs_super_block_first * usb1;
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
unsigned cgno, fragno, fragoff, count, fragsize, i;
@@ -509,7 +504,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
sb = inode->i_sb;
uspi = UFS_SB(sb)->s_uspi;
- usb1 = ubh_get_usb_first (uspi);
count = newcount - oldcount;
cgno = ufs_dtog(uspi, fragment);
@@ -577,7 +571,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
{
struct super_block * sb;
struct ufs_sb_private_info * uspi;
- struct ufs_super_block_first * usb1;
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
unsigned oldcg, i, j, k, allocsize;
@@ -588,7 +581,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
sb = inode->i_sb;
uspi = UFS_SB(sb)->s_uspi;
- usb1 = ubh_get_usb_first(uspi);
oldcg = cgno;
/*
@@ -690,7 +682,6 @@ static u64 ufs_alloccg_block(struct inode *inode,
{
struct super_block * sb;
struct ufs_sb_private_info * uspi;
- struct ufs_super_block_first * usb1;
struct ufs_cylinder_group * ucg;
u64 result, blkno;
@@ -698,7 +689,6 @@ static u64 ufs_alloccg_block(struct inode *inode,
sb = inode->i_sb;
uspi = UFS_SB(sb)->s_uspi;
- usb1 = ubh_get_usb_first(uspi);
ucg = ubh_get_ucg(UCPI_UBH(ucpi));
if (goal == 0) {
@@ -794,7 +784,6 @@ static u64 ufs_bitmap_search(struct super_block *sb,
0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe
};
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
- struct ufs_super_block_first *usb1;
struct ufs_cylinder_group *ucg;
unsigned start, length, loc;
unsigned pos, want, blockmap, mask, end;
@@ -803,7 +792,6 @@ static u64 ufs_bitmap_search(struct super_block *sb,
UFSD("ENTER, cg %u, goal %llu, count %u\n", ucpi->c_cgx,
(unsigned long long)goal, count);
- usb1 = ubh_get_usb_first (uspi);
ucg = ubh_get_ucg(UCPI_UBH(ucpi));
if (goal)
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 33afa20d450..c84ec010a67 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -35,10 +35,10 @@
const struct file_operations ufs_file_operations = {
.llseek = generic_file_llseek,
- .read = do_sync_read,
- .aio_read = generic_file_aio_read,
- .write = do_sync_write,
- .aio_write = generic_file_aio_write,
+ .read = new_sync_read,
+ .read_iter = generic_file_read_iter,
+ .write = new_sync_write,
+ .write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.open = generic_file_open,
.fsync = generic_file_fsync,
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index d0426d74817..a9cc75ffa92 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -57,7 +57,6 @@ void ufs_free_inode (struct inode * inode)
{
struct super_block * sb;
struct ufs_sb_private_info * uspi;
- struct ufs_super_block_first * usb1;
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
int is_directory;
@@ -67,15 +66,14 @@ void ufs_free_inode (struct inode * inode)
sb = inode->i_sb;
uspi = UFS_SB(sb)->s_uspi;
- usb1 = ubh_get_usb_first(uspi);
ino = inode->i_ino;
- mutex_lock(&UFS_SB(sb)->s_lock);
+ lock_ufs(sb);
if (!((ino > 1) && (ino < (uspi->s_ncg * uspi->s_ipg )))) {
ufs_warning(sb, "ufs_free_inode", "reserved inode or nonexistent inode %u\n", ino);
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
return;
}
@@ -83,7 +81,7 @@ void ufs_free_inode (struct inode * inode)
bit = ufs_inotocgoff (ino);
ucpi = ufs_load_cylinder (sb, cg);
if (!ucpi) {
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
return;
}
ucg = ubh_get_ucg(UCPI_UBH(ucpi));
@@ -117,7 +115,7 @@ void ufs_free_inode (struct inode * inode)
ubh_sync_block(UCPI_UBH(ucpi));
ufs_mark_sb_dirty(sb);
- mutex_unlock(&UFS_SB(sb)->s_lock);
+ unlock_ufs(sb);
UFSD("EXIT\n");
}
@@ -175,7 +173,6 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode)
struct super_block * sb;
struct ufs_sb_info * sbi;
struct ufs_sb_private_info * uspi;
- struct ufs_super_block_first * usb1;
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
struct inode * inode;
@@ -195,9 +192,8 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode)
ufsi = UFS_I(inode);
sbi = UFS_SB(sb);
uspi = sbi->s_uspi;
- usb1 = ubh_get_usb_first(uspi);
- mutex_lock(&sbi->s_lock);
+ lock_ufs(sb);
/*
* Try to place the inode in its parent directory
@@ -332,21 +328,20 @@ cg_found:
sync_dirty_buffer(bh);
brelse(bh);
}
-
- mutex_unlock(&sbi->s_lock);
+ unlock_ufs(sb);
UFSD("allocating inode %lu\n", inode->i_ino);
UFSD("EXIT\n");
return inode;
fail_remove_inode:
- mutex_unlock(&sbi->s_lock);
+ unlock_ufs(sb);
clear_nlink(inode);
iput(inode);
UFSD("EXIT (FAILED): err %d\n", err);
return ERR_PTR(err);
failed:
- mutex_unlock(&sbi->s_lock);
+ unlock_ufs(sb);
make_bad_inode(inode);
iput (inode);
UFSD("EXIT (FAILED): err %d\n", err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index c8ca9608678..61e8a9b021d 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -885,7 +885,7 @@ void ufs_evict_inode(struct inode * inode)
if (!inode->i_nlink && !is_bad_inode(inode))
want_delete = 1;
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
if (want_delete) {
loff_t old_i_size;
/*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 329f2f53b7e..b879f1ba343 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -524,11 +524,9 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
struct ufs_buffer_head * ubh;
unsigned char * base, * space;
unsigned size, blks, i;
- struct ufs_super_block_third *usb3;
UFSD("ENTER\n");
- usb3 = ubh_get_usb_third(uspi);
/*
* Read cs structures from (usually) first data block
* on the device.
@@ -699,7 +697,6 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
unsigned flags;
lock_ufs(sb);
- mutex_lock(&UFS_SB(sb)->s_lock);
UFSD("ENTER\n");
@@ -717,7 +714,6 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
ufs_put_cstotal(sb);
UFSD("EXIT\n");
- mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return 0;
@@ -762,6 +758,7 @@ static void ufs_put_super(struct super_block *sb)
ubh_brelse_uspi (sbi->s_uspi);
kfree (sbi->s_uspi);
+ mutex_destroy(&sbi->mutex);
kfree (sbi);
sb->s_fs_info = NULL;
UFSD("EXIT\n");
@@ -788,6 +785,14 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
flags = 0;
UFSD("ENTER\n");
+
+#ifndef CONFIG_UFS_FS_WRITE
+ if (!(sb->s_flags & MS_RDONLY)) {
+ printk("ufs was compiled with read-only support, "
+ "can't be mounted as read-write\n");
+ return -EROFS;
+ }
+#endif
sbi = kzalloc(sizeof(struct ufs_sb_info), GFP_KERNEL);
if (!sbi)
@@ -797,15 +802,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY));
-#ifndef CONFIG_UFS_FS_WRITE
- if (!(sb->s_flags & MS_RDONLY)) {
- printk("ufs was compiled with read-only support, "
- "can't be mounted as read-write\n");
- goto failed;
- }
-#endif
mutex_init(&sbi->mutex);
- mutex_init(&sbi->s_lock);
spin_lock_init(&sbi->work_lock);
INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs);
/*
@@ -1259,6 +1256,7 @@ magic_found:
return 0;
failed:
+ mutex_destroy(&sbi->mutex);
if (ubh)
ubh_brelse_uspi (uspi);
kfree (uspi);
@@ -1280,8 +1278,8 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
unsigned new_mount_opt, ufstype;
unsigned flags;
+ sync_filesystem(sb);
lock_ufs(sb);
- mutex_lock(&UFS_SB(sb)->s_lock);
uspi = UFS_SB(sb)->s_uspi;
flags = UFS_SB(sb)->s_flags;
usb1 = ubh_get_usb_first(uspi);
@@ -1295,7 +1293,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
new_mount_opt = 0;
ufs_set_opt (new_mount_opt, ONERROR_LOCK);
if (!ufs_parse_options (data, &new_mount_opt)) {
- mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EINVAL;
}
@@ -1303,14 +1300,12 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
new_mount_opt |= ufstype;
} else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
printk("ufstype can't be changed during remount\n");
- mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EINVAL;
}
if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
UFS_SB(sb)->s_mount_opt = new_mount_opt;
- mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return 0;
}
@@ -1335,7 +1330,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
#ifndef CONFIG_UFS_FS_WRITE
printk("ufs was compiled with read-only support, "
"can't be mounted as read-write\n");
- mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EINVAL;
#else
@@ -1345,13 +1339,11 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
ufstype != UFS_MOUNT_UFSTYPE_SUNx86 &&
ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
printk("this ufstype is read-only supported\n");
- mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EINVAL;
}
if (!ufs_read_cylinder_structures(sb)) {
printk("failed during remounting\n");
- mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return -EPERM;
}
@@ -1359,7 +1351,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
#endif
}
UFS_SB(sb)->s_mount_opt = new_mount_opt;
- mutex_unlock(&UFS_SB(sb)->s_lock);
unlock_ufs(sb);
return 0;
}
@@ -1389,15 +1380,11 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
struct super_block *sb = dentry->d_sb;
struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi;
unsigned flags = UFS_SB(sb)->s_flags;
- struct ufs_super_block_first *usb1;
- struct ufs_super_block_second *usb2;
struct ufs_super_block_third *usb3;
u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
lock_ufs(sb);
- usb1 = ubh_get_usb_first(uspi);
- usb2 = ubh_get_usb_second(uspi);
usb3 = ubh_get_usb_third(uspi);
if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) {
@@ -1453,7 +1440,7 @@ static void init_once(void *foo)
inode_init_once(&ei->vfs_inode);
}
-static int init_inodecache(void)
+static int __init init_inodecache(void)
{
ufs_inode_cachep = kmem_cache_create("ufs_inode_cache",
sizeof(struct ufs_inode_info),
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index ff2c15ab81a..343e6fc571e 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -24,7 +24,6 @@ struct ufs_sb_info {
int work_queued; /* non-zero if the delayed work is queued */
struct delayed_work sync_work; /* FS sync delayed work */
spinlock_t work_lock; /* protects sync_work and work_queued */
- struct mutex s_lock;
};
struct ufs_inode_info {
diff --git a/fs/utimes.c b/fs/utimes.c
index f4fb7eca10e..aa138d64560 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -53,6 +53,7 @@ static int utimes_common(struct path *path, struct timespec *times)
int error;
struct iattr newattrs;
struct inode *inode = path->dentry->d_inode;
+ struct inode *delegated_inode = NULL;
error = mnt_want_write(path->mnt);
if (error)
@@ -101,9 +102,15 @@ static int utimes_common(struct path *path, struct timespec *times)
goto mnt_drop_write_and_out;
}
}
+retry_deleg:
mutex_lock(&inode->i_mutex);
- error = notify_change(path->dentry, &newattrs);
+ error = notify_change(path->dentry, &newattrs, &delegated_inode);
mutex_unlock(&inode->i_mutex);
+ if (delegated_inode) {
+ error = break_deleg_wait(&delegated_inode);
+ if (!error)
+ goto retry_deleg;
+ }
mnt_drop_write_and_out:
mnt_drop_write(path->mnt);
diff --git a/fs/xattr.c b/fs/xattr.c
index 3377dff1840..c69e6d43a0d 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -843,7 +843,7 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size)
/* wrap around? */
len = sizeof(*new_xattr) + size;
- if (len <= sizeof(*new_xattr))
+ if (len < sizeof(*new_xattr))
return NULL;
new_xattr = kmalloc(len, GFP_KERNEL);
diff --git a/fs/xattr_acl.c b/fs/xattr_acl.c
deleted file mode 100644
index 9fbea87fdb6..00000000000
--- a/fs/xattr_acl.c
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * linux/fs/xattr_acl.c
- *
- * Almost all from linux/fs/ext2/acl.c:
- * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
- */
-
-#include <linux/export.h>
-#include <linux/fs.h>
-#include <linux/posix_acl_xattr.h>
-#include <linux/gfp.h>
-#include <linux/user_namespace.h>
-
-/*
- * Fix up the uids and gids in posix acl extended attributes in place.
- */
-static void posix_acl_fix_xattr_userns(
- struct user_namespace *to, struct user_namespace *from,
- void *value, size_t size)
-{
- posix_acl_xattr_header *header = (posix_acl_xattr_header *)value;
- posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end;
- int count;
- kuid_t uid;
- kgid_t gid;
-
- if (!value)
- return;
- if (size < sizeof(posix_acl_xattr_header))
- return;
- if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
- return;
-
- count = posix_acl_xattr_count(size);
- if (count < 0)
- return;
- if (count == 0)
- return;
-
- for (end = entry + count; entry != end; entry++) {
- switch(le16_to_cpu(entry->e_tag)) {
- case ACL_USER:
- uid = make_kuid(from, le32_to_cpu(entry->e_id));
- entry->e_id = cpu_to_le32(from_kuid(to, uid));
- break;
- case ACL_GROUP:
- gid = make_kgid(from, le32_to_cpu(entry->e_id));
- entry->e_id = cpu_to_le32(from_kgid(to, gid));
- break;
- default:
- break;
- }
- }
-}
-
-void posix_acl_fix_xattr_from_user(void *value, size_t size)
-{
- struct user_namespace *user_ns = current_user_ns();
- if (user_ns == &init_user_ns)
- return;
- posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size);
-}
-
-void posix_acl_fix_xattr_to_user(void *value, size_t size)
-{
- struct user_namespace *user_ns = current_user_ns();
- if (user_ns == &init_user_ns)
- return;
- posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size);
-}
-
-/*
- * Convert from extended attribute to in-memory representation.
- */
-struct posix_acl *
-posix_acl_from_xattr(struct user_namespace *user_ns,
- const void *value, size_t size)
-{
- posix_acl_xattr_header *header = (posix_acl_xattr_header *)value;
- posix_acl_xattr_entry *entry = (posix_acl_xattr_entry *)(header+1), *end;
- int count;
- struct posix_acl *acl;
- struct posix_acl_entry *acl_e;
-
- if (!value)
- return NULL;
- if (size < sizeof(posix_acl_xattr_header))
- return ERR_PTR(-EINVAL);
- if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
- return ERR_PTR(-EOPNOTSUPP);
-
- count = posix_acl_xattr_count(size);
- if (count < 0)
- return ERR_PTR(-EINVAL);
- if (count == 0)
- return NULL;
-
- acl = posix_acl_alloc(count, GFP_NOFS);
- if (!acl)
- return ERR_PTR(-ENOMEM);
- acl_e = acl->a_entries;
-
- for (end = entry + count; entry != end; acl_e++, entry++) {
- acl_e->e_tag = le16_to_cpu(entry->e_tag);
- acl_e->e_perm = le16_to_cpu(entry->e_perm);
-
- switch(acl_e->e_tag) {
- case ACL_USER_OBJ:
- case ACL_GROUP_OBJ:
- case ACL_MASK:
- case ACL_OTHER:
- break;
-
- case ACL_USER:
- acl_e->e_uid =
- make_kuid(user_ns,
- le32_to_cpu(entry->e_id));
- if (!uid_valid(acl_e->e_uid))
- goto fail;
- break;
- case ACL_GROUP:
- acl_e->e_gid =
- make_kgid(user_ns,
- le32_to_cpu(entry->e_id));
- if (!gid_valid(acl_e->e_gid))
- goto fail;
- break;
-
- default:
- goto fail;
- }
- }
- return acl;
-
-fail:
- posix_acl_release(acl);
- return ERR_PTR(-EINVAL);
-}
-EXPORT_SYMBOL (posix_acl_from_xattr);
-
-/*
- * Convert from in-memory to extended attribute representation.
- */
-int
-posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
- void *buffer, size_t size)
-{
- posix_acl_xattr_header *ext_acl = (posix_acl_xattr_header *)buffer;
- posix_acl_xattr_entry *ext_entry = ext_acl->a_entries;
- int real_size, n;
-
- real_size = posix_acl_xattr_size(acl->a_count);
- if (!buffer)
- return real_size;
- if (real_size > size)
- return -ERANGE;
-
- ext_acl->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
-
- for (n=0; n < acl->a_count; n++, ext_entry++) {
- const struct posix_acl_entry *acl_e = &acl->a_entries[n];
- ext_entry->e_tag = cpu_to_le16(acl_e->e_tag);
- ext_entry->e_perm = cpu_to_le16(acl_e->e_perm);
- switch(acl_e->e_tag) {
- case ACL_USER:
- ext_entry->e_id =
- cpu_to_le32(from_kuid(user_ns, acl_e->e_uid));
- break;
- case ACL_GROUP:
- ext_entry->e_id =
- cpu_to_le32(from_kgid(user_ns, acl_e->e_gid));
- break;
- default:
- ext_entry->e_id = cpu_to_le32(ACL_UNDEFINED_ID);
- break;
- }
- }
- return real_size;
-}
-EXPORT_SYMBOL (posix_acl_to_xattr);
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 0719e4db93f..c21f4350666 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -66,12 +66,14 @@ xfs-y += xfs_alloc.o \
xfs_bmap_btree.o \
xfs_btree.o \
xfs_da_btree.o \
+ xfs_da_format.o \
xfs_dir2.o \
xfs_dir2_block.o \
xfs_dir2_data.o \
xfs_dir2_leaf.o \
xfs_dir2_node.o \
xfs_dir2_sf.o \
+ xfs_dquot_buf.o \
xfs_ialloc.o \
xfs_ialloc_btree.o \
xfs_icreate_item.o \
@@ -103,7 +105,11 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
xfs_qm_bhv.o \
xfs_qm.o \
xfs_quotaops.o
-xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
+
+# xfs_rtbitmap is shared with libxfs
+xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \
+ xfs_rtbitmap.o
+
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_PROC_FS) += xfs_stats.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index a02cfb9e3bc..844e288b957 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -63,25 +63,33 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
}
void *
-kmem_zalloc(size_t size, xfs_km_flags_t flags)
-{
- void *ptr;
-
- ptr = kmem_alloc(size, flags);
- if (ptr)
- memset((char *)ptr, 0, (int)size);
- return ptr;
-}
-
-void *
kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
{
+ unsigned noio_flag = 0;
void *ptr;
+ gfp_t lflags;
ptr = kmem_zalloc(size, flags | KM_MAYFAIL);
if (ptr)
return ptr;
- return vzalloc(size);
+
+ /*
+ * __vmalloc() will allocate data pages and auxillary structures (e.g.
+ * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
+ * here. Hence we need to tell memory reclaim that we are in such a
+ * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
+ * the filesystem here and potentially deadlocking.
+ */
+ if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+ noio_flag = memalloc_noio_save();
+
+ lflags = kmem_flags_convert(flags);
+ ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
+
+ if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+ memalloc_noio_restore(noio_flag);
+
+ return ptr;
}
void
@@ -128,14 +136,3 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
congestion_wait(BLK_RW_ASYNC, HZ/50);
} while (1);
}
-
-void *
-kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags)
-{
- void *ptr;
-
- ptr = kmem_zone_alloc(zone, flags);
- if (ptr)
- memset((char *)ptr, 0, kmem_cache_size(zone));
- return ptr;
-}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 3a7371cab50..64db0e53ede 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -32,6 +32,7 @@ typedef unsigned __bitwise xfs_km_flags_t;
#define KM_NOSLEEP ((__force xfs_km_flags_t)0x0002u)
#define KM_NOFS ((__force xfs_km_flags_t)0x0004u)
#define KM_MAYFAIL ((__force xfs_km_flags_t)0x0008u)
+#define KM_ZERO ((__force xfs_km_flags_t)0x0010u)
/*
* We use a special process flag to avoid recursive callbacks into
@@ -43,7 +44,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
{
gfp_t lflags;
- BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL));
+ BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL|KM_ZERO));
if (flags & KM_NOSLEEP) {
lflags = GFP_ATOMIC | __GFP_NOWARN;
@@ -52,11 +53,14 @@ kmem_flags_convert(xfs_km_flags_t flags)
if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
lflags &= ~__GFP_FS;
}
+
+ if (flags & KM_ZERO)
+ lflags |= __GFP_ZERO;
+
return lflags;
}
extern void *kmem_alloc(size_t, xfs_km_flags_t);
-extern void *kmem_zalloc(size_t, xfs_km_flags_t);
extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t);
extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
extern void kmem_free(const void *);
@@ -64,6 +68,12 @@ extern void kmem_free(const void *);
extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
+static inline void *
+kmem_zalloc(size_t size, xfs_km_flags_t flags)
+{
+ return kmem_alloc(size, flags | KM_ZERO);
+}
+
/*
* Zone interfaces
*/
@@ -102,6 +112,11 @@ kmem_zone_destroy(kmem_zone_t *zone)
}
extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t);
-extern void *kmem_zone_zalloc(kmem_zone_t *, xfs_km_flags_t);
+
+static inline void *
+kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags)
+{
+ return kmem_zone_alloc(zone, flags | KM_ZERO);
+}
#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 0e2f37efedd..6888ad886ff 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -16,15 +16,15 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
+#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_acl.h"
-#include "xfs_attr.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
#include "xfs_ag.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_acl.h"
+#include "xfs_attr.h"
#include "xfs_trace.h"
#include <linux/slab.h>
#include <linux/xattr.h>
@@ -124,16 +124,12 @@ struct posix_acl *
xfs_get_acl(struct inode *inode, int type)
{
struct xfs_inode *ip = XFS_I(inode);
- struct posix_acl *acl;
+ struct posix_acl *acl = NULL;
struct xfs_acl *xfs_acl;
unsigned char *ea_name;
int error;
int len;
- acl = get_cached_acl(inode, type);
- if (acl != ACL_NOT_CACHED)
- return acl;
-
trace_xfs_get_acl(ip);
switch (type) {
@@ -164,10 +160,8 @@ xfs_get_acl(struct inode *inode, int type)
* cache entry, for any other error assume it is transient and
* leave the cache entry as ACL_NOT_CACHED.
*/
- if (error == -ENOATTR) {
- acl = NULL;
+ if (error == -ENOATTR)
goto out_update_cache;
- }
goto out;
}
@@ -183,15 +177,12 @@ out:
}
STATIC int
-xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+__xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
{
struct xfs_inode *ip = XFS_I(inode);
unsigned char *ea_name;
int error;
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
switch (type) {
case ACL_TYPE_ACCESS:
ea_name = SGI_ACL_FILE;
@@ -282,131 +273,23 @@ posix_acl_default_exists(struct inode *inode)
return xfs_acl_exists(inode, SGI_ACL_DEFAULT);
}
-/*
- * No need for i_mutex because the inode is not yet exposed to the VFS.
- */
-int
-xfs_inherit_acl(struct inode *inode, struct posix_acl *acl)
-{
- umode_t mode = inode->i_mode;
- int error = 0, inherit = 0;
-
- if (S_ISDIR(inode->i_mode)) {
- error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, acl);
- if (error)
- goto out;
- }
-
- error = posix_acl_create(&acl, GFP_KERNEL, &mode);
- if (error < 0)
- return error;
-
- /*
- * If posix_acl_create returns a positive value we need to
- * inherit a permission that can't be represented using the Unix
- * mode bits and we actually need to set an ACL.
- */
- if (error > 0)
- inherit = 1;
-
- error = xfs_set_mode(inode, mode);
- if (error)
- goto out;
-
- if (inherit)
- error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl);
-
-out:
- posix_acl_release(acl);
- return error;
-}
-
int
-xfs_acl_chmod(struct inode *inode)
+xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- struct posix_acl *acl;
- int error;
-
- if (S_ISLNK(inode->i_mode))
- return -EOPNOTSUPP;
-
- acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
-
- error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
- if (error)
- return error;
-
- error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl);
- posix_acl_release(acl);
- return error;
-}
-
-static int
-xfs_xattr_acl_get(struct dentry *dentry, const char *name,
- void *value, size_t size, int type)
-{
- struct posix_acl *acl;
- int error;
-
- acl = xfs_get_acl(dentry->d_inode, type);
- if (IS_ERR(acl))
- return PTR_ERR(acl);
- if (acl == NULL)
- return -ENODATA;
-
- error = posix_acl_to_xattr(&init_user_ns, acl, value, size);
- posix_acl_release(acl);
-
- return error;
-}
-
-static int
-xfs_xattr_acl_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
-{
- struct inode *inode = dentry->d_inode;
- struct posix_acl *acl = NULL;
int error = 0;
- if (flags & XATTR_CREATE)
- return -EINVAL;
- if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
- return value ? -EACCES : 0;
- if (!inode_owner_or_capable(inode))
- return -EPERM;
-
- if (!value)
+ if (!acl)
goto set_acl;
- acl = posix_acl_from_xattr(&init_user_ns, value, size);
- if (!acl) {
- /*
- * acl_set_file(3) may request that we set default ACLs with
- * zero length -- defend (gracefully) against that here.
- */
- goto out;
- }
- if (IS_ERR(acl)) {
- error = PTR_ERR(acl);
- goto out;
- }
-
- error = posix_acl_valid(acl);
- if (error)
- goto out_release;
-
- error = -EINVAL;
+ error = -E2BIG;
if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb)))
- goto out_release;
+ return error;
if (type == ACL_TYPE_ACCESS) {
umode_t mode = inode->i_mode;
error = posix_acl_equiv_mode(acl, &mode);
if (error <= 0) {
- posix_acl_release(acl);
acl = NULL;
if (error < 0)
@@ -415,27 +298,9 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
error = xfs_set_mode(inode, mode);
if (error)
- goto out_release;
+ return error;
}
set_acl:
- error = xfs_set_acl(inode, type, acl);
- out_release:
- posix_acl_release(acl);
- out:
- return error;
+ return __xfs_set_acl(inode, type, acl);
}
-
-const struct xattr_handler xfs_xattr_acl_access_handler = {
- .prefix = POSIX_ACL_XATTR_ACCESS,
- .flags = ACL_TYPE_ACCESS,
- .get = xfs_xattr_acl_get,
- .set = xfs_xattr_acl_set,
-};
-
-const struct xattr_handler xfs_xattr_acl_default_handler = {
- .prefix = POSIX_ACL_XATTR_DEFAULT,
- .flags = ACL_TYPE_DEFAULT,
- .get = xfs_xattr_acl_get,
- .set = xfs_xattr_acl_set,
-};
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 4016a567b83..5dc16374451 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -60,20 +60,15 @@ struct xfs_acl {
#ifdef CONFIG_XFS_POSIX_ACL
extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
-extern int xfs_inherit_acl(struct inode *inode, struct posix_acl *default_acl);
-extern int xfs_acl_chmod(struct inode *inode);
+extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
extern int posix_acl_access_exists(struct inode *inode);
extern int posix_acl_default_exists(struct inode *inode);
-
-extern const struct xattr_handler xfs_xattr_acl_access_handler;
-extern const struct xattr_handler xfs_xattr_acl_default_handler;
#else
static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
{
return NULL;
}
-# define xfs_inherit_acl(inode, default_acl) 0
-# define xfs_acl_chmod(inode) 0
+# define xfs_set_acl NULL
# define posix_acl_access_exists(inode) 0
# define posix_acl_default_exists(inode) 0
#endif /* CONFIG_XFS_POSIX_ACL */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 1cb740afd67..6e247a99f5d 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -89,6 +89,8 @@ typedef struct xfs_agf {
/* structure must be padded to 64 bit alignment */
} xfs_agf_t;
+#define XFS_AGF_CRC_OFF offsetof(struct xfs_agf, agf_crc)
+
#define XFS_AGF_MAGICNUM 0x00000001
#define XFS_AGF_VERSIONNUM 0x00000002
#define XFS_AGF_SEQNO 0x00000004
@@ -128,8 +130,6 @@ typedef struct xfs_agf {
extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
-extern const struct xfs_buf_ops xfs_agf_buf_ops;
-
/*
* Size of the unlinked inode hash table in the agi.
*/
@@ -160,28 +160,38 @@ typedef struct xfs_agi {
* still being referenced.
*/
__be32 agi_unlinked[XFS_AGI_UNLINKED_BUCKETS];
-
+ /*
+ * This marks the end of logging region 1 and start of logging region 2.
+ */
uuid_t agi_uuid; /* uuid of filesystem */
__be32 agi_crc; /* crc of agi sector */
__be32 agi_pad32;
__be64 agi_lsn; /* last write sequence */
+ __be32 agi_free_root; /* root of the free inode btree */
+ __be32 agi_free_level;/* levels in free inode btree */
+
/* structure must be padded to 64 bit alignment */
} xfs_agi_t;
-#define XFS_AGI_MAGICNUM 0x00000001
-#define XFS_AGI_VERSIONNUM 0x00000002
-#define XFS_AGI_SEQNO 0x00000004
-#define XFS_AGI_LENGTH 0x00000008
-#define XFS_AGI_COUNT 0x00000010
-#define XFS_AGI_ROOT 0x00000020
-#define XFS_AGI_LEVEL 0x00000040
-#define XFS_AGI_FREECOUNT 0x00000080
-#define XFS_AGI_NEWINO 0x00000100
-#define XFS_AGI_DIRINO 0x00000200
-#define XFS_AGI_UNLINKED 0x00000400
-#define XFS_AGI_NUM_BITS 11
-#define XFS_AGI_ALL_BITS ((1 << XFS_AGI_NUM_BITS) - 1)
+#define XFS_AGI_CRC_OFF offsetof(struct xfs_agi, agi_crc)
+
+#define XFS_AGI_MAGICNUM (1 << 0)
+#define XFS_AGI_VERSIONNUM (1 << 1)
+#define XFS_AGI_SEQNO (1 << 2)
+#define XFS_AGI_LENGTH (1 << 3)
+#define XFS_AGI_COUNT (1 << 4)
+#define XFS_AGI_ROOT (1 << 5)
+#define XFS_AGI_LEVEL (1 << 6)
+#define XFS_AGI_FREECOUNT (1 << 7)
+#define XFS_AGI_NEWINO (1 << 8)
+#define XFS_AGI_DIRINO (1 << 9)
+#define XFS_AGI_UNLINKED (1 << 10)
+#define XFS_AGI_NUM_BITS_R1 11 /* end of the 1st agi logging region */
+#define XFS_AGI_ALL_BITS_R1 ((1 << XFS_AGI_NUM_BITS_R1) - 1)
+#define XFS_AGI_FREE_ROOT (1 << 11)
+#define XFS_AGI_FREE_LEVEL (1 << 12)
+#define XFS_AGI_NUM_BITS_R2 13
/* disk block (xfs_daddr_t) in the AG */
#define XFS_AGI_DADDR(mp) ((xfs_daddr_t)(2 << (mp)->m_sectbb_log))
@@ -191,8 +201,6 @@ typedef struct xfs_agi {
extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, struct xfs_buf **bpp);
-extern const struct xfs_buf_ops xfs_agi_buf_ops;
-
/*
* The third a.g. block contains the a.g. freelist, an array
* of block pointers to blocks owned by the allocation btree code.
@@ -226,6 +234,8 @@ typedef struct xfs_agfl {
__be32 agfl_bno[]; /* actually XFS_AGFL_SIZE(mp) */
} xfs_agfl_t;
+#define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc)
+
/*
* tags for inode radix tree
*/
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 5a1393f5e02..d43813267a8 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -17,25 +17,25 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_error.h"
#include "xfs_cksum.h"
#include "xfs_trace.h"
+#include "xfs_trans.h"
#include "xfs_buf_item.h"
+#include "xfs_log.h"
struct workqueue_struct *xfs_alloc_wq;
@@ -257,16 +257,14 @@ xfs_alloc_fix_len(
k = rlen % args->prod;
if (k == args->mod)
return;
- if (k > args->mod) {
- if ((int)(rlen = rlen - k - args->mod) < (int)args->minlen)
- return;
- } else {
- if ((int)(rlen = rlen - args->prod - (args->mod - k)) <
- (int)args->minlen)
- return;
- }
- ASSERT(rlen >= args->minlen);
- ASSERT(rlen <= args->maxlen);
+ if (k > args->mod)
+ rlen = rlen - (k - args->mod);
+ else
+ rlen = rlen - args->prod + (args->mod - k);
+ if ((int)rlen < (int)args->minlen)
+ return;
+ ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
+ ASSERT(rlen % args->prod == args->mod);
args->len = rlen;
}
@@ -474,7 +472,6 @@ xfs_agfl_read_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- int agfl_ok = 1;
/*
* There is no verification of non-crc AGFLs because mkfs does not
@@ -485,15 +482,13 @@ xfs_agfl_read_verify(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return;
- agfl_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
- offsetof(struct xfs_agfl, agfl_crc));
-
- agfl_ok = agfl_ok && xfs_agfl_verify(bp);
-
- if (!agfl_ok) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_agfl_verify(bp))
xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
}
static void
@@ -508,16 +503,15 @@ xfs_agfl_write_verify(
return;
if (!xfs_agfl_verify(bp)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
return;
}
if (bip)
XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
- xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
- offsetof(struct xfs_agfl, agfl_crc));
+ xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
}
const struct xfs_buf_ops xfs_agfl_buf_ops = {
@@ -545,7 +539,6 @@ xfs_alloc_read_agfl(
XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
if (error)
return error;
- ASSERT(!xfs_buf_geterror(bp));
xfs_buf_set_ref(bp, XFS_AGFL_REF);
*bpp = bp;
return 0;
@@ -2238,19 +2231,17 @@ xfs_agf_read_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- int agf_ok = 1;
-
- if (xfs_sb_version_hascrc(&mp->m_sb))
- agf_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
- offsetof(struct xfs_agf, agf_crc));
-
- agf_ok = agf_ok && xfs_agf_verify(mp, bp);
- if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
- XFS_RANDOM_ALLOC_READ_AGF))) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
+ XFS_ERRTAG_ALLOC_READ_AGF,
+ XFS_RANDOM_ALLOC_READ_AGF))
xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
}
static void
@@ -2261,8 +2252,8 @@ xfs_agf_write_verify(
struct xfs_buf_log_item *bip = bp->b_fspriv;
if (!xfs_agf_verify(mp, bp)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
return;
}
@@ -2272,8 +2263,7 @@ xfs_agf_write_verify(
if (bip)
XFS_BUF_TO_AGF(bp)->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
- xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
- offsetof(struct xfs_agf, agf_crc));
+ xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
}
const struct xfs_buf_ops xfs_agf_buf_ops = {
@@ -2294,6 +2284,8 @@ xfs_read_agf(
{
int error;
+ trace_xfs_read_agf(mp, agno);
+
ASSERT(agno != NULLAGNUMBER);
error = xfs_trans_read_buf(
mp, tp, mp->m_ddev_targp,
@@ -2324,8 +2316,9 @@ xfs_alloc_read_agf(
struct xfs_perag *pag; /* per allocation group data */
int error;
- ASSERT(agno != NULLAGNUMBER);
+ trace_xfs_alloc_read_agf(mp, agno);
+ ASSERT(agno != NULLAGNUMBER);
error = xfs_read_agf(mp, tp, agno,
(flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
bpp);
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 99d0a610155..feacb061bab 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -231,7 +231,4 @@ xfs_alloc_get_rec(
xfs_extlen_t *len, /* output: length of extent */
int *stat); /* output: success/failure */
-extern const struct xfs_buf_ops xfs_agf_buf_ops;
-extern const struct xfs_buf_ops xfs_agfl_buf_ops;
-
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index cafc90251d1..8358f1ded94 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -17,23 +17,21 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
+#include "xfs_trans.h"
STATIC struct xfs_btree_cur *
@@ -72,7 +70,6 @@ xfs_allocbt_alloc_block(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *start,
union xfs_btree_ptr *new,
- int length,
int *stat)
{
int error;
@@ -357,12 +354,14 @@ static void
xfs_allocbt_read_verify(
struct xfs_buf *bp)
{
- if (!(xfs_btree_sblock_verify_crc(bp) &&
- xfs_allocbt_verify(bp))) {
- trace_xfs_btree_corrupt(bp, _RET_IP_);
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
- bp->b_target->bt_mount, bp->b_addr);
+ if (!xfs_btree_sblock_verify_crc(bp))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_allocbt_verify(bp))
xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp);
}
}
@@ -372,9 +371,9 @@ xfs_allocbt_write_verify(
{
if (!xfs_allocbt_verify(bp)) {
trace_xfs_btree_corrupt(bp, _RET_IP_);
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
- bp->b_target->bt_mount, bp->b_addr);
xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
}
xfs_btree_sblock_calc_crc(bp);
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index e3a3f742419..45e189e7e81 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -27,39 +27,6 @@ struct xfs_btree_cur;
struct xfs_mount;
/*
- * There are two on-disk btrees, one sorted by blockno and one sorted
- * by blockcount and blockno. All blocks look the same to make the code
- * simpler; if we have time later, we'll make the optimizations.
- */
-#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */
-#define XFS_ABTB_CRC_MAGIC 0x41423342 /* 'AB3B' */
-#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */
-#define XFS_ABTC_CRC_MAGIC 0x41423343 /* 'AB3C' */
-
-/*
- * Data record/key structure
- */
-typedef struct xfs_alloc_rec {
- __be32 ar_startblock; /* starting block number */
- __be32 ar_blockcount; /* count of free blocks */
-} xfs_alloc_rec_t, xfs_alloc_key_t;
-
-typedef struct xfs_alloc_rec_incore {
- xfs_agblock_t ar_startblock; /* starting block number */
- xfs_extlen_t ar_blockcount; /* count of free blocks */
-} xfs_alloc_rec_incore_t;
-
-/* btree pointer type */
-typedef __be32 xfs_alloc_ptr_t;
-
-/*
- * Block numbers in the AG:
- * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3.
- */
-#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
-#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
-
-/*
* Btree block header size depends on a superblock flag.
*/
#define XFS_ALLOC_BLOCK_LEN(mp) \
@@ -95,6 +62,4 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
xfs_agnumber_t, xfs_btnum_t);
extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
-extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
-
#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e51e581454e..faaf716e208 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -16,14 +16,15 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
-#include "xfs_log.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_trans.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
@@ -31,6 +32,8 @@
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
#include <linux/aio.h>
#include <linux/gfp.h>
#include <linux/mpage.h>
@@ -333,7 +336,7 @@ xfs_map_blocks(
if (type == XFS_IO_DELALLOC &&
(!nimaps || isnullstartblock(imap->br_startblock))) {
- error = xfs_iomap_write_allocate(ip, offset, count, imap);
+ error = xfs_iomap_write_allocate(ip, offset, imap);
if (!error)
trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
return -XFS_ERROR(error);
@@ -404,7 +407,7 @@ xfs_alloc_ioend_bio(
struct bio *bio = bio_alloc(GFP_NOIO, nvecs);
ASSERT(bio->bi_private == NULL);
- bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
return bio;
}
@@ -629,38 +632,46 @@ xfs_map_at_offset(
}
/*
- * Test if a given page is suitable for writing as part of an unwritten
- * or delayed allocate extent.
+ * Test if a given page contains at least one buffer of a given @type.
+ * If @check_all_buffers is true, then we walk all the buffers in the page to
+ * try to find one of the type passed in. If it is not set, then the caller only
+ * needs to check the first buffer on the page for a match.
*/
-STATIC int
+STATIC bool
xfs_check_page_type(
struct page *page,
- unsigned int type)
+ unsigned int type,
+ bool check_all_buffers)
{
- if (PageWriteback(page))
- return 0;
+ struct buffer_head *bh;
+ struct buffer_head *head;
- if (page->mapping && page_has_buffers(page)) {
- struct buffer_head *bh, *head;
- int acceptable = 0;
+ if (PageWriteback(page))
+ return false;
+ if (!page->mapping)
+ return false;
+ if (!page_has_buffers(page))
+ return false;
- bh = head = page_buffers(page);
- do {
- if (buffer_unwritten(bh))
- acceptable += (type == XFS_IO_UNWRITTEN);
- else if (buffer_delay(bh))
- acceptable += (type == XFS_IO_DELALLOC);
- else if (buffer_dirty(bh) && buffer_mapped(bh))
- acceptable += (type == XFS_IO_OVERWRITE);
- else
- break;
- } while ((bh = bh->b_this_page) != head);
+ bh = head = page_buffers(page);
+ do {
+ if (buffer_unwritten(bh)) {
+ if (type == XFS_IO_UNWRITTEN)
+ return true;
+ } else if (buffer_delay(bh)) {
+ if (type == XFS_IO_DELALLOC)
+ return true;
+ } else if (buffer_dirty(bh) && buffer_mapped(bh)) {
+ if (type == XFS_IO_OVERWRITE)
+ return true;
+ }
- if (acceptable)
- return 1;
- }
+ /* If we are only checking the first buffer, we are done now. */
+ if (!check_all_buffers)
+ break;
+ } while ((bh = bh->b_this_page) != head);
- return 0;
+ return false;
}
/*
@@ -694,7 +705,7 @@ xfs_convert_page(
goto fail_unlock_page;
if (page->mapping != inode->i_mapping)
goto fail_unlock_page;
- if (!xfs_check_page_type(page, (*ioendp)->io_type))
+ if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
goto fail_unlock_page;
/*
@@ -739,6 +750,15 @@ xfs_convert_page(
p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
page_dirty = p_offset / len;
+ /*
+ * The moment we find a buffer that doesn't match our current type
+ * specification or can't be written, abort the loop and start
+ * writeback. As per the above xfs_imap_valid() check, only
+ * xfs_vm_writepage() can handle partial page writeback fully - we are
+ * limited here to the buffers that are contiguous with the current
+ * ioend, and hence a buffer we can't write breaks that contiguity and
+ * we have to defer the rest of the IO to xfs_vm_writepage().
+ */
bh = head = page_buffers(page);
do {
if (offset >= end_offset)
@@ -747,7 +767,7 @@ xfs_convert_page(
uptodate = 0;
if (!(PageUptodate(page) || buffer_uptodate(bh))) {
done = 1;
- continue;
+ break;
}
if (buffer_unwritten(bh) || buffer_delay(bh) ||
@@ -759,10 +779,11 @@ xfs_convert_page(
else
type = XFS_IO_OVERWRITE;
- if (!xfs_imap_valid(inode, imap, offset)) {
- done = 1;
- continue;
- }
+ /*
+ * imap should always be valid because of the above
+ * partial page end_offset check on the imap.
+ */
+ ASSERT(xfs_imap_valid(inode, imap, offset));
lock_buffer(bh);
if (type != XFS_IO_OVERWRITE)
@@ -774,6 +795,7 @@ xfs_convert_page(
count++;
} else {
done = 1;
+ break;
}
} while (offset += len, (bh = bh->b_this_page) != head);
@@ -865,7 +887,7 @@ xfs_aops_discard_page(
struct buffer_head *bh, *head;
loff_t offset = page_offset(page);
- if (!xfs_check_page_type(page, XFS_IO_DELALLOC))
+ if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
goto out_invalidate;
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
@@ -953,14 +975,39 @@ xfs_vm_writepage(
* Given that we do not allow direct reclaim to call us, we should
* never be called while in a filesystem transaction.
*/
- if (WARN_ON(current->flags & PF_FSTRANS))
+ if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
goto redirty;
/* Is this page beyond the end of the file? */
offset = i_size_read(inode);
end_index = offset >> PAGE_CACHE_SHIFT;
last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
- if (page->index >= end_index) {
+
+ /*
+ * The page index is less than the end_index, adjust the end_offset
+ * to the highest offset that this page should represent.
+ * -----------------------------------------------------
+ * | file mapping | <EOF> |
+ * -----------------------------------------------------
+ * | Page ... | Page N-2 | Page N-1 | Page N | |
+ * ^--------------------------------^----------|--------
+ * | desired writeback range | see else |
+ * ---------------------------------^------------------|
+ */
+ if (page->index < end_index)
+ end_offset = (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT;
+ else {
+ /*
+ * Check whether the page to write out is beyond or straddles
+ * i_size or not.
+ * -------------------------------------------------------
+ * | file mapping | <EOF> |
+ * -------------------------------------------------------
+ * | Page ... | Page N-2 | Page N-1 | Page N | Beyond |
+ * ^--------------------------------^-----------|---------
+ * | | Straddles |
+ * ---------------------------------^-----------|--------|
+ */
unsigned offset_into_page = offset & (PAGE_CACHE_SIZE - 1);
/*
@@ -968,24 +1015,36 @@ xfs_vm_writepage(
* truncate operation that is in progress. We must redirty the
* page so that reclaim stops reclaiming it. Otherwise
* xfs_vm_releasepage() is called on it and gets confused.
+ *
+ * Note that the end_index is unsigned long, it would overflow
+ * if the given offset is greater than 16TB on 32-bit system
+ * and if we do check the page is fully outside i_size or not
+ * via "if (page->index >= end_index + 1)" as "end_index + 1"
+ * will be evaluated to 0. Hence this page will be redirtied
+ * and be written out repeatedly which would result in an
+ * infinite loop, the user program that perform this operation
+ * will hang. Instead, we can verify this situation by checking
+ * if the page to write is totally beyond the i_size or if it's
+ * offset is just equal to the EOF.
*/
- if (page->index >= end_index + 1 || offset_into_page == 0)
+ if (page->index > end_index ||
+ (page->index == end_index && offset_into_page == 0))
goto redirty;
/*
* The page straddles i_size. It must be zeroed out on each
* and every writepage invocation because it may be mmapped.
* "A file is mapped in multiples of the page size. For a file
- * that is not a multiple of the page size, the remaining
+ * that is not a multiple of the page size, the remaining
* memory is zeroed when mapped, and writes to that region are
* not written out to the file."
*/
zero_user_segment(page, offset_into_page, PAGE_CACHE_SIZE);
+
+ /* Adjust the end_offset to the end of file */
+ end_offset = offset;
}
- end_offset = min_t(unsigned long long,
- (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
- offset);
len = 1 << inode->i_blkbits;
bh = head = page_buffers(page);
@@ -1166,9 +1225,9 @@ xfs_vm_releasepage(
xfs_count_page_state(page, &delalloc, &unwritten);
- if (WARN_ON(delalloc))
+ if (WARN_ON_ONCE(delalloc))
return 0;
- if (WARN_ON(unwritten))
+ if (WARN_ON_ONCE(unwritten))
return 0;
return try_to_free_buffers(page);
@@ -1214,7 +1273,7 @@ __xfs_get_blocks(
lockmode = XFS_ILOCK_EXCL;
xfs_ilock(ip, lockmode);
} else {
- lockmode = xfs_ilock_map_shared(ip);
+ lockmode = xfs_ilock_data_map_shared(ip);
}
ASSERT(offset <= mp->m_super->s_maxbytes);
@@ -1322,6 +1381,14 @@ __xfs_get_blocks(
/*
* If this is O_DIRECT or the mpage code calling tell them how large
* the mapping is, so that we can avoid repeated get_blocks calls.
+ *
+ * If the mapping spans EOF, then we have to break the mapping up as the
+ * mapping for blocks beyond EOF must be marked new so that sub block
+ * regions can be correctly zeroed. We can't do this for mappings within
+ * EOF unless the mapping was just allocated or is unwritten, otherwise
+ * the callers would overwrite existing data with zeros. Hence we have
+ * to split the mapping into a range up to and including EOF, and a
+ * second mapping for beyond EOF.
*/
if (direct || size > (1 << inode->i_blkbits)) {
xfs_off_t mapping_size;
@@ -1332,6 +1399,12 @@ __xfs_get_blocks(
ASSERT(mapping_size > 0);
if (mapping_size > size)
mapping_size = size;
+ if (offset < i_size_read(inode) &&
+ offset + mapping_size >= i_size_read(inode)) {
+ /* limit mapping to block that spans EOF */
+ mapping_size = roundup_64(i_size_read(inode) - offset,
+ 1 << inode->i_blkbits);
+ }
if (mapping_size > LONG_MAX)
mapping_size = LONG_MAX;
@@ -1413,9 +1486,8 @@ STATIC ssize_t
xfs_vm_direct_IO(
int rw,
struct kiocb *iocb,
- const struct iovec *iov,
- loff_t offset,
- unsigned long nr_segs)
+ struct iov_iter *iter,
+ loff_t offset)
{
struct inode *inode = iocb->ki_filp->f_mapping->host;
struct block_device *bdev = xfs_find_bdev_for_inode(inode);
@@ -1423,7 +1495,7 @@ xfs_vm_direct_IO(
ssize_t ret;
if (rw & WRITE) {
- size_t size = iov_length(iov, nr_segs);
+ size_t size = iov_iter_count(iter);
/*
* We cannot preallocate a size update transaction here as we
@@ -1435,16 +1507,15 @@ xfs_vm_direct_IO(
if (offset + size > XFS_I(inode)->i_d.di_size)
ioend->io_isdirect = 1;
- ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
- offset, nr_segs,
- xfs_get_blocks_direct,
- xfs_end_io_direct_write, NULL, 0);
+ ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
+ offset, xfs_get_blocks_direct,
+ xfs_end_io_direct_write, NULL,
+ DIO_ASYNC_EXTEND);
if (ret != -EIOCBQUEUED && iocb->private)
goto out_destroy_ioend;
} else {
- ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
- offset, nr_segs,
- xfs_get_blocks_direct,
+ ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
+ offset, xfs_get_blocks_direct,
NULL, NULL, 0);
}
@@ -1543,6 +1614,16 @@ xfs_vm_write_failed(
xfs_vm_kill_delalloc_range(inode, block_offset,
block_offset + bh->b_size);
+
+ /*
+ * This buffer does not contain data anymore. make sure anyone
+ * who finds it knows that for certain.
+ */
+ clear_buffer_delay(bh);
+ clear_buffer_uptodate(bh);
+ clear_buffer_mapped(bh);
+ clear_buffer_new(bh);
+ clear_buffer_dirty(bh);
}
}
@@ -1569,20 +1650,28 @@ xfs_vm_write_begin(
ASSERT(len <= PAGE_CACHE_SIZE);
- page = grab_cache_page_write_begin(mapping, index,
- flags | AOP_FLAG_NOFS);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
status = __block_write_begin(page, pos, len, xfs_get_blocks);
if (unlikely(status)) {
struct inode *inode = mapping->host;
+ size_t isize = i_size_read(inode);
xfs_vm_write_failed(inode, page, pos, len);
unlock_page(page);
- if (pos + len > i_size_read(inode))
- truncate_pagecache(inode, i_size_read(inode));
+ /*
+ * If the write is beyond EOF, we only want to kill blocks
+ * allocated in this write, not blocks that were previously
+ * written successfully.
+ */
+ if (pos + len > isize) {
+ ssize_t start = max_t(ssize_t, pos, isize);
+
+ truncate_pagecache_range(inode, start, pos + len);
+ }
page_cache_release(page);
page = NULL;
@@ -1593,9 +1682,12 @@ xfs_vm_write_begin(
}
/*
- * On failure, we only need to kill delalloc blocks beyond EOF because they
- * will never be written. For blocks within EOF, generic_write_end() zeros them
- * so they are safe to leave alone and be written with all the other valid data.
+ * On failure, we only need to kill delalloc blocks beyond EOF in the range of
+ * this specific write because they will never be written. Previous writes
+ * beyond EOF where block allocation succeeded do not need to be trashed, so
+ * only new blocks from this write should be trashed. For blocks within
+ * EOF, generic_write_end() zeros them so they are safe to leave alone and be
+ * written with all the other valid data.
*/
STATIC int
xfs_vm_write_end(
@@ -1618,8 +1710,11 @@ xfs_vm_write_end(
loff_t to = pos + len;
if (to > isize) {
- truncate_pagecache(inode, isize);
+ /* only kill blocks in this write beyond EOF */
+ if (pos > isize)
+ isize = pos;
xfs_vm_kill_delalloc_range(inode, isize, to);
+ truncate_pagecache_range(inode, isize, to);
}
}
return ret;
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index ddcf2267ffa..bfe36fc2cdc 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -17,23 +17,24 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
#include "xfs_attr_remote.h"
@@ -41,6 +42,7 @@
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
+#include "xfs_dinode.h"
/*
* xfs_attr.c
@@ -75,17 +77,27 @@ STATIC int xfs_attr_refillstate(xfs_da_state_t *state);
STATIC int
-xfs_attr_name_to_xname(
- struct xfs_name *xname,
- const unsigned char *aname)
+xfs_attr_args_init(
+ struct xfs_da_args *args,
+ struct xfs_inode *dp,
+ const unsigned char *name,
+ int flags)
{
- if (!aname)
+
+ if (!name)
return EINVAL;
- xname->name = aname;
- xname->len = strlen((char *)aname);
- if (xname->len >= MAXNAMELEN)
+
+ memset(args, 0, sizeof(*args));
+ args->geo = dp->i_mount->m_attr_geo;
+ args->whichfork = XFS_ATTR_FORK;
+ args->dp = dp;
+ args->flags = flags;
+ args->name = name;
+ args->namelen = strlen((const char *)name);
+ if (args->namelen >= MAXNAMELEN)
return EFAULT; /* match IRIX behaviour */
+ args->hashval = xfs_da_hashname(args->name, args->namelen);
return 0;
}
@@ -104,78 +116,46 @@ xfs_inode_hasattr(
* Overall external interface routines.
*========================================================================*/
-STATIC int
-xfs_attr_get_int(
+int
+xfs_attr_get(
struct xfs_inode *ip,
- struct xfs_name *name,
+ const unsigned char *name,
unsigned char *value,
int *valuelenp,
int flags)
{
- xfs_da_args_t args;
- int error;
+ struct xfs_da_args args;
+ uint lock_mode;
+ int error;
+
+ XFS_STATS_INC(xs_attr_get);
+
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return EIO;
if (!xfs_inode_hasattr(ip))
return ENOATTR;
- /*
- * Fill in the arg structure for this request.
- */
- memset((char *)&args, 0, sizeof(args));
- args.name = name->name;
- args.namelen = name->len;
+ error = xfs_attr_args_init(&args, ip, name, flags);
+ if (error)
+ return error;
+
args.value = value;
args.valuelen = *valuelenp;
- args.flags = flags;
- args.hashval = xfs_da_hashname(args.name, args.namelen);
- args.dp = ip;
- args.whichfork = XFS_ATTR_FORK;
- /*
- * Decide on what work routines to call based on the inode size.
- */
- if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ lock_mode = xfs_ilock_attr_map_shared(ip);
+ if (!xfs_inode_hasattr(ip))
+ error = ENOATTR;
+ else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
error = xfs_attr_shortform_getvalue(&args);
- } else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) {
+ else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK))
error = xfs_attr_leaf_get(&args);
- } else {
+ else
error = xfs_attr_node_get(&args);
- }
+ xfs_iunlock(ip, lock_mode);
- /*
- * Return the number of bytes in the value to the caller.
- */
*valuelenp = args.valuelen;
-
- if (error == EEXIST)
- error = 0;
- return(error);
-}
-
-int
-xfs_attr_get(
- xfs_inode_t *ip,
- const unsigned char *name,
- unsigned char *value,
- int *valuelenp,
- int flags)
-{
- int error;
- struct xfs_name xname;
-
- XFS_STATS_INC(xs_attr_get);
-
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return(EIO);
-
- error = xfs_attr_name_to_xname(&xname, name);
- if (error)
- return error;
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- error = xfs_attr_get_int(ip, &xname, value, valuelenp, flags);
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- return(error);
+ return error == EEXIST ? 0 : error;
}
/*
@@ -183,12 +163,10 @@ xfs_attr_get(
*/
STATIC int
xfs_attr_calc_size(
- struct xfs_inode *ip,
- int namelen,
- int valuelen,
+ struct xfs_da_args *args,
int *local)
{
- struct xfs_mount *mp = ip->i_mount;
+ struct xfs_mount *mp = args->dp->i_mount;
int size;
int nblks;
@@ -196,12 +174,10 @@ xfs_attr_calc_size(
* Determine space new attribute will use, and if it would be
* "local" or "remote" (note: local != inline).
*/
- size = xfs_attr_leaf_newentsize(namelen, valuelen,
- mp->m_sb.sb_blocksize, local);
-
+ size = xfs_attr_leaf_newentsize(args, local);
nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
if (*local) {
- if (size > (mp->m_sb.sb_blocksize >> 1)) {
+ if (size > (args->geo->blksize / 2)) {
/* Double split possible */
nblks *= 2;
}
@@ -210,7 +186,7 @@ xfs_attr_calc_size(
* Out of line attribute, cannot double split, but
* make room for the attribute value itself.
*/
- uint dblocks = XFS_B_TO_FSB(mp, valuelen);
+ uint dblocks = xfs_attr3_rmt_blocks(mp, args->valuelen);
nblks += dblocks;
nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
}
@@ -218,26 +194,38 @@ xfs_attr_calc_size(
return nblks;
}
-STATIC int
-xfs_attr_set_int(
- struct xfs_inode *dp,
- struct xfs_name *name,
- unsigned char *value,
- int valuelen,
- int flags)
+int
+xfs_attr_set(
+ struct xfs_inode *dp,
+ const unsigned char *name,
+ unsigned char *value,
+ int valuelen,
+ int flags)
{
- xfs_da_args_t args;
- xfs_fsblock_t firstblock;
- xfs_bmap_free_t flist;
- int error, err2, committed;
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_args args;
+ struct xfs_bmap_free flist;
struct xfs_trans_res tres;
+ xfs_fsblock_t firstblock;
int rsvd = (flags & ATTR_ROOT) != 0;
- int local;
+ int error, err2, committed, local;
+
+ XFS_STATS_INC(xs_attr_set);
+
+ if (XFS_FORCED_SHUTDOWN(dp->i_mount))
+ return EIO;
+
+ error = xfs_attr_args_init(&args, dp, name, flags);
+ if (error)
+ return error;
+
+ args.value = value;
+ args.valuelen = valuelen;
+ args.firstblock = &firstblock;
+ args.flist = &flist;
+ args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+ args.total = xfs_attr_calc_size(&args, &local);
- /*
- * Attach the dquots to the inode.
- */
error = xfs_qm_dqattach(dp, 0);
if (error)
return error;
@@ -248,32 +236,14 @@ xfs_attr_set_int(
*/
if (XFS_IFORK_Q(dp) == 0) {
int sf_size = sizeof(xfs_attr_sf_hdr_t) +
- XFS_ATTR_SF_ENTSIZE_BYNAME(name->len, valuelen);
+ XFS_ATTR_SF_ENTSIZE_BYNAME(args.namelen, valuelen);
- if ((error = xfs_bmap_add_attrfork(dp, sf_size, rsvd)))
- return(error);
+ error = xfs_bmap_add_attrfork(dp, sf_size, rsvd);
+ if (error)
+ return error;
}
/*
- * Fill in the arg structure for this request.
- */
- memset((char *)&args, 0, sizeof(args));
- args.name = name->name;
- args.namelen = name->len;
- args.value = value;
- args.valuelen = valuelen;
- args.flags = flags;
- args.hashval = xfs_da_hashname(args.name, args.namelen);
- args.dp = dp;
- args.firstblock = &firstblock;
- args.flist = &flist;
- args.whichfork = XFS_ATTR_FORK;
- args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
-
- /* Size is now blocks for attribute data */
- args.total = xfs_attr_calc_size(dp, name->len, valuelen, &local);
-
- /*
* Start our first transaction of the day.
*
* All future transactions during this code must be "chained" off
@@ -300,7 +270,7 @@ xfs_attr_set_int(
error = xfs_trans_reserve(args.trans, &tres, args.total, 0);
if (error) {
xfs_trans_cancel(args.trans, 0);
- return(error);
+ return error;
}
xfs_ilock(dp, XFS_ILOCK_EXCL);
@@ -310,7 +280,7 @@ xfs_attr_set_int(
if (error) {
xfs_iunlock(dp, XFS_ILOCK_EXCL);
xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES);
- return (error);
+ return error;
}
xfs_trans_ijoin(args.trans, dp, 0);
@@ -319,9 +289,9 @@ xfs_attr_set_int(
* If the attribute list is non-existent or a shortform list,
* upgrade it to a single-leaf-block attribute list.
*/
- if ((dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) ||
- ((dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS) &&
- (dp->i_d.di_anextents == 0))) {
+ if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
+ (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+ dp->i_d.di_anextents == 0)) {
/*
* Build initial attribute list (if required).
@@ -346,9 +316,8 @@ xfs_attr_set_int(
* the transaction goes to disk before returning
* to the user.
*/
- if (mp->m_flags & XFS_MOUNT_WSYNC) {
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(args.trans);
- }
if (!error && (flags & ATTR_KERNOTIME) == 0) {
xfs_trans_ichgtime(args.trans, dp,
@@ -358,7 +327,7 @@ xfs_attr_set_int(
XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return(error == 0 ? err2 : error);
+ return error ? error : err2;
}
/*
@@ -396,22 +365,19 @@ xfs_attr_set_int(
}
- if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+ if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
error = xfs_attr_leaf_addname(&args);
- } else {
+ else
error = xfs_attr_node_addname(&args);
- }
- if (error) {
+ if (error)
goto out;
- }
/*
* If this is a synchronous mount, make sure that the
* transaction goes to disk before returning to the user.
*/
- if (mp->m_flags & XFS_MOUNT_WSYNC) {
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(args.trans);
- }
if ((flags & ATTR_KERNOTIME) == 0)
xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
@@ -423,65 +389,47 @@ xfs_attr_set_int(
error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return(error);
+ return error;
out:
- if (args.trans)
+ if (args.trans) {
xfs_trans_cancel(args.trans,
XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ }
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return(error);
+ return error;
}
+/*
+ * Generic handler routine to remove a name from an attribute list.
+ * Transitions attribute list from Btree to shortform as necessary.
+ */
int
-xfs_attr_set(
- xfs_inode_t *dp,
- const unsigned char *name,
- unsigned char *value,
- int valuelen,
- int flags)
+xfs_attr_remove(
+ struct xfs_inode *dp,
+ const unsigned char *name,
+ int flags)
{
- int error;
- struct xfs_name xname;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_da_args args;
+ struct xfs_bmap_free flist;
+ xfs_fsblock_t firstblock;
+ int error;
- XFS_STATS_INC(xs_attr_set);
+ XFS_STATS_INC(xs_attr_remove);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
- return (EIO);
+ return EIO;
+
+ if (!xfs_inode_hasattr(dp))
+ return ENOATTR;
- error = xfs_attr_name_to_xname(&xname, name);
+ error = xfs_attr_args_init(&args, dp, name, flags);
if (error)
return error;
- return xfs_attr_set_int(dp, &xname, value, valuelen, flags);
-}
-
-/*
- * Generic handler routine to remove a name from an attribute list.
- * Transitions attribute list from Btree to shortform as necessary.
- */
-STATIC int
-xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
-{
- xfs_da_args_t args;
- xfs_fsblock_t firstblock;
- xfs_bmap_free_t flist;
- int error;
- xfs_mount_t *mp = dp->i_mount;
-
- /*
- * Fill in the arg structure for this request.
- */
- memset((char *)&args, 0, sizeof(args));
- args.name = name->name;
- args.namelen = name->len;
- args.flags = flags;
- args.hashval = xfs_da_hashname(args.name, args.namelen);
- args.dp = dp;
args.firstblock = &firstblock;
args.flist = &flist;
- args.total = 0;
- args.whichfork = XFS_ATTR_FORK;
/*
* we have no control over the attribute names that userspace passes us
@@ -490,9 +438,6 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
*/
args.op_flags = XFS_DA_OP_OKNOENT;
- /*
- * Attach the dquots to the inode.
- */
error = xfs_qm_dqattach(dp, 0);
if (error)
return error;
@@ -521,7 +466,7 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
XFS_ATTRRM_SPACE_RES(mp), 0);
if (error) {
xfs_trans_cancel(args.trans, 0);
- return(error);
+ return error;
}
xfs_ilock(dp, XFS_ILOCK_EXCL);
@@ -531,35 +476,26 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
*/
xfs_trans_ijoin(args.trans, dp, 0);
- /*
- * Decide on what work routines to call based on the inode size.
- */
if (!xfs_inode_hasattr(dp)) {
error = XFS_ERROR(ENOATTR);
- goto out;
- }
- if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
error = xfs_attr_shortform_remove(&args);
- if (error) {
- goto out;
- }
} else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
error = xfs_attr_leaf_removename(&args);
} else {
error = xfs_attr_node_removename(&args);
}
- if (error) {
+
+ if (error)
goto out;
- }
/*
* If this is a synchronous mount, make sure that the
* transaction goes to disk before returning to the user.
*/
- if (mp->m_flags & XFS_MOUNT_WSYNC) {
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(args.trans);
- }
if ((flags & ATTR_KERNOTIME) == 0)
xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG);
@@ -571,45 +507,17 @@ xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags)
error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES);
xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return(error);
+ return error;
out:
- if (args.trans)
+ if (args.trans) {
xfs_trans_cancel(args.trans,
XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return(error);
-}
-
-int
-xfs_attr_remove(
- xfs_inode_t *dp,
- const unsigned char *name,
- int flags)
-{
- int error;
- struct xfs_name xname;
-
- XFS_STATS_INC(xs_attr_remove);
-
- if (XFS_FORCED_SHUTDOWN(dp->i_mount))
- return (EIO);
-
- error = xfs_attr_name_to_xname(&xname, name);
- if (error)
- return error;
-
- xfs_ilock(dp, XFS_ILOCK_SHARED);
- if (!xfs_inode_hasattr(dp)) {
- xfs_iunlock(dp, XFS_ILOCK_SHARED);
- return XFS_ERROR(ENOATTR);
}
- xfs_iunlock(dp, XFS_ILOCK_SHARED);
-
- return xfs_attr_remove_int(dp, &xname, flags);
+ xfs_iunlock(dp, XFS_ILOCK_EXCL);
+ return error;
}
-
/*========================================================================
* External routines when attribute list is inside the inode
*========================================================================*/
@@ -695,11 +603,22 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
trace_xfs_attr_leaf_replace(args);
+ /* save the attribute state for later removal*/
args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */
args->blkno2 = args->blkno; /* set 2nd entry info*/
args->index2 = args->index;
args->rmtblkno2 = args->rmtblkno;
args->rmtblkcnt2 = args->rmtblkcnt;
+ args->rmtvaluelen2 = args->rmtvaluelen;
+
+ /*
+ * clear the remote attr state now that it is saved so that the
+ * values reflect the state of the attribute we are about to
+ * add, not the attribute we just found and will remove later.
+ */
+ args->rmtblkno = 0;
+ args->rmtblkcnt = 0;
+ args->rmtvaluelen = 0;
}
/*
@@ -791,6 +710,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
args->blkno = args->blkno2;
args->rmtblkno = args->rmtblkno2;
args->rmtblkcnt = args->rmtblkcnt2;
+ args->rmtvaluelen = args->rmtvaluelen2;
if (args->rmtblkno) {
error = xfs_attr_rmtval_remove(args);
if (error)
@@ -943,7 +863,7 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
}
/*========================================================================
- * External routines when attribute list size > XFS_LBSIZE(mp).
+ * External routines when attribute list size > geo->blksize
*========================================================================*/
/*
@@ -976,8 +896,6 @@ restart:
state = xfs_da_state_alloc();
state->args = args;
state->mp = mp;
- state->blocksize = state->mp->m_sb.sb_blocksize;
- state->node_ents = state->mp->m_attr_node_ents;
/*
* Search to see if name already exists, and get back a pointer
@@ -996,13 +914,22 @@ restart:
trace_xfs_attr_node_replace(args);
+ /* save the attribute state for later removal*/
args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */
args->blkno2 = args->blkno; /* set 2nd entry info*/
args->index2 = args->index;
args->rmtblkno2 = args->rmtblkno;
args->rmtblkcnt2 = args->rmtblkcnt;
+ args->rmtvaluelen2 = args->rmtvaluelen;
+
+ /*
+ * clear the remote attr state now that it is saved so that the
+ * values reflect the state of the attribute we are about to
+ * add, not the attribute we just found and will remove later.
+ */
args->rmtblkno = 0;
args->rmtblkcnt = 0;
+ args->rmtvaluelen = 0;
}
retval = xfs_attr3_leaf_add(blk->bp, state->args);
@@ -1130,6 +1057,7 @@ restart:
args->blkno = args->blkno2;
args->rmtblkno = args->rmtblkno2;
args->rmtblkcnt = args->rmtblkcnt2;
+ args->rmtvaluelen = args->rmtvaluelen2;
if (args->rmtblkno) {
error = xfs_attr_rmtval_remove(args);
if (error)
@@ -1145,8 +1073,6 @@ restart:
state = xfs_da_state_alloc();
state->args = args;
state->mp = mp;
- state->blocksize = state->mp->m_sb.sb_blocksize;
- state->node_ents = state->mp->m_attr_node_ents;
state->inleaf = 0;
error = xfs_da3_node_lookup_int(state, &retval);
if (error)
@@ -1237,8 +1163,6 @@ xfs_attr_node_removename(xfs_da_args_t *args)
state = xfs_da_state_alloc();
state->args = args;
state->mp = dp->i_mount;
- state->blocksize = state->mp->m_sb.sb_blocksize;
- state->node_ents = state->mp->m_attr_node_ents;
/*
* Search to see if name exists, and get back a pointer to it.
@@ -1500,8 +1424,6 @@ xfs_attr_node_get(xfs_da_args_t *args)
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_sb.sb_blocksize;
- state->node_ents = state->mp->m_attr_node_ents;
/*
* Search to see if name exists, and get back a pointer to it.
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index bb24b07cbed..09480c57f06 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -18,22 +18,20 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
+#include "xfs_inode.h"
#include "xfs_alloc.h"
-#include "xfs_btree.h"
#include "xfs_attr_remote.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_attr.h"
@@ -41,7 +39,8 @@
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
-#include "xfs_trans_priv.h"
+#include "xfs_dinode.h"
+#include "xfs_dir2.h"
/*
* Look at all the extents for this logical region,
@@ -232,13 +231,13 @@ xfs_attr3_node_inactive(
}
node = bp->b_addr;
- xfs_da3_node_hdr_from_disk(&ichdr, node);
+ dp->d_ops->node_hdr_from_disk(&ichdr, node);
parent_blkno = bp->b_bn;
if (!ichdr.count) {
xfs_trans_brelse(*trans, bp);
return 0;
}
- btree = xfs_da3_node_tree_p(node);
+ btree = dp->d_ops->node_tree_p(node);
child_fsb = be32_to_cpu(btree[0].before);
xfs_trans_brelse(*trans, bp); /* no locks for later trans */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 86db20a9cc0..28712d29e43 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -18,32 +18,31 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_attr_remote.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
+#include "xfs_bmap_btree.h"
#include "xfs_bmap.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
+#include "xfs_dinode.h"
+#include "xfs_dir2.h"
/*
@@ -81,11 +80,12 @@ STATIC int xfs_attr3_leaf_figure_balance(xfs_da_state_t *state,
/*
* Utility routines.
*/
-STATIC void xfs_attr3_leaf_moveents(struct xfs_attr_leafblock *src_leaf,
+STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args,
+ struct xfs_attr_leafblock *src_leaf,
struct xfs_attr3_icleaf_hdr *src_ichdr, int src_start,
struct xfs_attr_leafblock *dst_leaf,
struct xfs_attr3_icleaf_hdr *dst_ichdr, int dst_start,
- int move_count, struct xfs_mount *mp);
+ int move_count);
STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
void
@@ -214,8 +214,8 @@ xfs_attr3_leaf_write_verify(
struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
if (!xfs_attr3_leaf_verify(bp)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
return;
}
@@ -225,7 +225,7 @@ xfs_attr3_leaf_write_verify(
if (bip)
hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
- xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_ATTR3_LEAF_CRC_OFF);
+ xfs_buf_update_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF);
}
/*
@@ -240,13 +240,14 @@ xfs_attr3_leaf_read_verify(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- if ((xfs_sb_version_hascrc(&mp->m_sb) &&
- !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
- XFS_ATTR3_LEAF_CRC_OFF)) ||
- !xfs_attr3_leaf_verify(bp)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_attr3_leaf_verify(bp))
xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
}
const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
@@ -711,6 +712,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
memset((char *)&nargs, 0, sizeof(nargs));
nargs.dp = dp;
+ nargs.geo = args->geo;
nargs.firstblock = args->firstblock;
nargs.flist = args->flist;
nargs.total = args->total;
@@ -805,18 +807,18 @@ xfs_attr3_leaf_to_shortform(
trace_xfs_attr_leaf_to_sf(args);
- tmpbuffer = kmem_alloc(XFS_LBSIZE(dp->i_mount), KM_SLEEP);
+ tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
if (!tmpbuffer)
return ENOMEM;
- memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(dp->i_mount));
+ memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
leaf = (xfs_attr_leafblock_t *)tmpbuffer;
xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
entry = xfs_attr3_leaf_entryp(leaf);
/* XXX (dgc): buffer is about to be marked stale - why zero it? */
- memset(bp->b_addr, 0, XFS_LBSIZE(dp->i_mount));
+ memset(bp->b_addr, 0, args->geo->blksize);
/*
* Clean out the prior contents of the attribute list.
@@ -838,6 +840,7 @@ xfs_attr3_leaf_to_shortform(
* Copy the attributes
*/
memset((char *)&nargs, 0, sizeof(nargs));
+ nargs.geo = args->geo;
nargs.dp = dp;
nargs.firstblock = args->firstblock;
nargs.flist = args->flist;
@@ -904,12 +907,12 @@ xfs_attr3_leaf_to_node(
/* copy leaf to new buffer, update identifiers */
xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF);
bp2->b_ops = bp1->b_ops;
- memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(mp));
+ memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize);
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_da3_blkinfo *hdr3 = bp2->b_addr;
hdr3->blkno = cpu_to_be64(bp2->b_bn);
}
- xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(mp) - 1);
+ xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1);
/*
* Set up the new root node.
@@ -918,8 +921,8 @@ xfs_attr3_leaf_to_node(
if (error)
goto out;
node = bp1->b_addr;
- xfs_da3_node_hdr_from_disk(&icnodehdr, node);
- btree = xfs_da3_node_tree_p(node);
+ dp->d_ops->node_hdr_from_disk(&icnodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
leaf = bp2->b_addr;
xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf);
@@ -929,8 +932,8 @@ xfs_attr3_leaf_to_node(
btree[0].hashval = entries[icleafhdr.count - 1].hashval;
btree[0].before = cpu_to_be32(blkno);
icnodehdr.count = 1;
- xfs_da3_node_hdr_to_disk(node, &icnodehdr);
- xfs_trans_log_buf(args->trans, bp1, 0, XFS_LBSIZE(mp) - 1);
+ dp->d_ops->node_hdr_to_disk(node, &icnodehdr);
+ xfs_trans_log_buf(args->trans, bp1, 0, args->geo->blksize - 1);
error = 0;
out:
return error;
@@ -966,10 +969,10 @@ xfs_attr3_leaf_create(
bp->b_ops = &xfs_attr3_leaf_buf_ops;
xfs_trans_buf_set_type(args->trans, bp, XFS_BLFT_ATTR_LEAF_BUF);
leaf = bp->b_addr;
- memset(leaf, 0, XFS_LBSIZE(mp));
+ memset(leaf, 0, args->geo->blksize);
memset(&ichdr, 0, sizeof(ichdr));
- ichdr.firstused = XFS_LBSIZE(mp);
+ ichdr.firstused = args->geo->blksize;
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_da3_blkinfo *hdr3 = bp->b_addr;
@@ -988,7 +991,7 @@ xfs_attr3_leaf_create(
ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base;
xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr);
- xfs_trans_log_buf(args->trans, bp, 0, XFS_LBSIZE(mp) - 1);
+ xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1);
*bpp = bp;
return 0;
@@ -1074,8 +1077,7 @@ xfs_attr3_leaf_add(
leaf = bp->b_addr;
xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
ASSERT(args->index >= 0 && args->index <= ichdr.count);
- entsize = xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
- args->trans->t_mountp->m_sb.sb_blocksize, NULL);
+ entsize = xfs_attr_leaf_newentsize(args, NULL);
/*
* Search through freemap for first-fit on new name length.
@@ -1174,17 +1176,14 @@ xfs_attr3_leaf_add_work(
* Allocate space for the new string (at the end of the run).
*/
mp = args->trans->t_mountp;
- ASSERT(ichdr->freemap[mapindex].base < XFS_LBSIZE(mp));
+ ASSERT(ichdr->freemap[mapindex].base < args->geo->blksize);
ASSERT((ichdr->freemap[mapindex].base & 0x3) == 0);
ASSERT(ichdr->freemap[mapindex].size >=
- xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
- mp->m_sb.sb_blocksize, NULL));
- ASSERT(ichdr->freemap[mapindex].size < XFS_LBSIZE(mp));
+ xfs_attr_leaf_newentsize(args, NULL));
+ ASSERT(ichdr->freemap[mapindex].size < args->geo->blksize);
ASSERT((ichdr->freemap[mapindex].size & 0x3) == 0);
- ichdr->freemap[mapindex].size -=
- xfs_attr_leaf_newentsize(args->namelen, args->valuelen,
- mp->m_sb.sb_blocksize, &tmp);
+ ichdr->freemap[mapindex].size -= xfs_attr_leaf_newentsize(args, &tmp);
entry->nameidx = cpu_to_be16(ichdr->freemap[mapindex].base +
ichdr->freemap[mapindex].size);
@@ -1229,6 +1228,7 @@ xfs_attr3_leaf_add_work(
name_rmt->valueblk = 0;
args->rmtblkno = 1;
args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
+ args->rmtvaluelen = args->valuelen;
}
xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
@@ -1268,14 +1268,13 @@ xfs_attr3_leaf_compact(
struct xfs_attr_leafblock *leaf_dst;
struct xfs_attr3_icleaf_hdr ichdr_src;
struct xfs_trans *trans = args->trans;
- struct xfs_mount *mp = trans->t_mountp;
char *tmpbuffer;
trace_xfs_attr_leaf_compact(args);
- tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
- memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp));
- memset(bp->b_addr, 0, XFS_LBSIZE(mp));
+ tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
+ memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
+ memset(bp->b_addr, 0, args->geo->blksize);
leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
leaf_dst = bp->b_addr;
@@ -1288,7 +1287,7 @@ xfs_attr3_leaf_compact(
/* Initialise the incore headers */
ichdr_src = *ichdr_dst; /* struct copy */
- ichdr_dst->firstused = XFS_LBSIZE(mp);
+ ichdr_dst->firstused = args->geo->blksize;
ichdr_dst->usedbytes = 0;
ichdr_dst->count = 0;
ichdr_dst->holes = 0;
@@ -1303,13 +1302,13 @@ xfs_attr3_leaf_compact(
* Copy all entry's in the same (sorted) order,
* but allocate name/value pairs packed and in sequence.
*/
- xfs_attr3_leaf_moveents(leaf_src, &ichdr_src, 0, leaf_dst, ichdr_dst, 0,
- ichdr_src.count, mp);
+ xfs_attr3_leaf_moveents(args, leaf_src, &ichdr_src, 0,
+ leaf_dst, ichdr_dst, 0, ichdr_src.count);
/*
* this logs the entire buffer, but the caller must write the header
* back to the buffer when it is finished modifying it.
*/
- xfs_trans_log_buf(trans, bp, 0, XFS_LBSIZE(mp) - 1);
+ xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1);
kmem_free(tmpbuffer);
}
@@ -1460,8 +1459,8 @@ xfs_attr3_leaf_rebalance(
/*
* Move high entries from leaf1 to low end of leaf2.
*/
- xfs_attr3_leaf_moveents(leaf1, &ichdr1, ichdr1.count - count,
- leaf2, &ichdr2, 0, count, state->mp);
+ xfs_attr3_leaf_moveents(args, leaf1, &ichdr1,
+ ichdr1.count - count, leaf2, &ichdr2, 0, count);
} else if (count > ichdr1.count) {
/*
@@ -1489,14 +1488,14 @@ xfs_attr3_leaf_rebalance(
/*
* Move low entries from leaf2 to high end of leaf1.
*/
- xfs_attr3_leaf_moveents(leaf2, &ichdr2, 0, leaf1, &ichdr1,
- ichdr1.count, count, state->mp);
+ xfs_attr3_leaf_moveents(args, leaf2, &ichdr2, 0, leaf1, &ichdr1,
+ ichdr1.count, count);
}
xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1);
xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2);
- xfs_trans_log_buf(args->trans, blk1->bp, 0, state->blocksize-1);
- xfs_trans_log_buf(args->trans, blk2->bp, 0, state->blocksize-1);
+ xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1);
+ xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1);
/*
* Copy out last hashval in each block for B-tree code.
@@ -1591,11 +1590,9 @@ xfs_attr3_leaf_figure_balance(
max = ichdr1->count + ichdr2->count;
half = (max + 1) * sizeof(*entry);
half += ichdr1->usedbytes + ichdr2->usedbytes +
- xfs_attr_leaf_newentsize(state->args->namelen,
- state->args->valuelen,
- state->blocksize, NULL);
+ xfs_attr_leaf_newentsize(state->args, NULL);
half /= 2;
- lastdelta = state->blocksize;
+ lastdelta = state->args->geo->blksize;
entry = xfs_attr3_leaf_entryp(leaf1);
for (count = index = 0; count < max; entry++, index++, count++) {
@@ -1605,10 +1602,7 @@ xfs_attr3_leaf_figure_balance(
*/
if (count == blk1->index) {
tmp = totallen + sizeof(*entry) +
- xfs_attr_leaf_newentsize(
- state->args->namelen,
- state->args->valuelen,
- state->blocksize, NULL);
+ xfs_attr_leaf_newentsize(state->args, NULL);
if (XFS_ATTR_ABS(half - tmp) > lastdelta)
break;
lastdelta = XFS_ATTR_ABS(half - tmp);
@@ -1644,10 +1638,7 @@ xfs_attr3_leaf_figure_balance(
totallen -= count * sizeof(*entry);
if (foundit) {
totallen -= sizeof(*entry) +
- xfs_attr_leaf_newentsize(
- state->args->namelen,
- state->args->valuelen,
- state->blocksize, NULL);
+ xfs_attr_leaf_newentsize(state->args, NULL);
}
*countarg = count;
@@ -1699,7 +1690,7 @@ xfs_attr3_leaf_toosmall(
bytes = xfs_attr3_leaf_hdr_size(leaf) +
ichdr.count * sizeof(xfs_attr_leaf_entry_t) +
ichdr.usedbytes;
- if (bytes > (state->blocksize >> 1)) {
+ if (bytes > (state->args->geo->blksize >> 1)) {
*action = 0; /* blk over 50%, don't try to join */
return(0);
}
@@ -1753,7 +1744,8 @@ xfs_attr3_leaf_toosmall(
xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr);
- bytes = state->blocksize - (state->blocksize >> 2) -
+ bytes = state->args->geo->blksize -
+ (state->args->geo->blksize >> 2) -
ichdr.usedbytes - ichdr2.usedbytes -
((ichdr.count + ichdr2.count) *
sizeof(xfs_attr_leaf_entry_t)) -
@@ -1804,7 +1796,6 @@ xfs_attr3_leaf_remove(
struct xfs_attr_leafblock *leaf;
struct xfs_attr3_icleaf_hdr ichdr;
struct xfs_attr_leaf_entry *entry;
- struct xfs_mount *mp = args->trans->t_mountp;
int before;
int after;
int smallest;
@@ -1818,7 +1809,7 @@ xfs_attr3_leaf_remove(
leaf = bp->b_addr;
xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
- ASSERT(ichdr.count > 0 && ichdr.count < XFS_LBSIZE(mp) / 8);
+ ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8);
ASSERT(args->index >= 0 && args->index < ichdr.count);
ASSERT(ichdr.firstused >= ichdr.count * sizeof(*entry) +
xfs_attr3_leaf_hdr_size(leaf));
@@ -1826,7 +1817,7 @@ xfs_attr3_leaf_remove(
entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
- ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp));
+ ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
/*
* Scan through free region table:
@@ -1841,8 +1832,8 @@ xfs_attr3_leaf_remove(
smallest = XFS_ATTR_LEAF_MAPSIZE - 1;
entsize = xfs_attr_leaf_entsize(leaf, args->index);
for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
- ASSERT(ichdr.freemap[i].base < XFS_LBSIZE(mp));
- ASSERT(ichdr.freemap[i].size < XFS_LBSIZE(mp));
+ ASSERT(ichdr.freemap[i].base < args->geo->blksize);
+ ASSERT(ichdr.freemap[i].size < args->geo->blksize);
if (ichdr.freemap[i].base == tablesize) {
ichdr.freemap[i].base -= sizeof(xfs_attr_leaf_entry_t);
ichdr.freemap[i].size += sizeof(xfs_attr_leaf_entry_t);
@@ -1919,11 +1910,11 @@ xfs_attr3_leaf_remove(
* removing the name.
*/
if (smallest) {
- tmp = XFS_LBSIZE(mp);
+ tmp = args->geo->blksize;
entry = xfs_attr3_leaf_entryp(leaf);
for (i = ichdr.count - 1; i >= 0; entry++, i--) {
ASSERT(be16_to_cpu(entry->nameidx) >= ichdr.firstused);
- ASSERT(be16_to_cpu(entry->nameidx) < XFS_LBSIZE(mp));
+ ASSERT(be16_to_cpu(entry->nameidx) < args->geo->blksize);
if (be16_to_cpu(entry->nameidx) < tmp)
tmp = be16_to_cpu(entry->nameidx);
@@ -1946,7 +1937,7 @@ xfs_attr3_leaf_remove(
tmp = ichdr.usedbytes + xfs_attr3_leaf_hdr_size(leaf) +
ichdr.count * sizeof(xfs_attr_leaf_entry_t);
- return tmp < mp->m_attr_magicpct; /* leaf is < 37% full */
+ return tmp < args->geo->magicpct; /* leaf is < 37% full */
}
/*
@@ -1963,7 +1954,6 @@ xfs_attr3_leaf_unbalance(
struct xfs_attr3_icleaf_hdr drophdr;
struct xfs_attr3_icleaf_hdr savehdr;
struct xfs_attr_leaf_entry *entry;
- struct xfs_mount *mp = state->mp;
trace_xfs_attr_leaf_unbalance(state->args);
@@ -1990,13 +1980,15 @@ xfs_attr3_leaf_unbalance(
*/
if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
drop_blk->bp, &drophdr)) {
- xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0,
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
save_leaf, &savehdr, 0,
- drophdr.count, mp);
+ drophdr.count);
} else {
- xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0,
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
save_leaf, &savehdr,
- savehdr.count, drophdr.count, mp);
+ savehdr.count, drophdr.count);
}
} else {
/*
@@ -2006,7 +1998,7 @@ xfs_attr3_leaf_unbalance(
struct xfs_attr_leafblock *tmp_leaf;
struct xfs_attr3_icleaf_hdr tmphdr;
- tmp_leaf = kmem_zalloc(state->blocksize, KM_SLEEP);
+ tmp_leaf = kmem_zalloc(state->args->geo->blksize, KM_SLEEP);
/*
* Copy the header into the temp leaf so that all the stuff
@@ -2019,35 +2011,39 @@ xfs_attr3_leaf_unbalance(
tmphdr.magic = savehdr.magic;
tmphdr.forw = savehdr.forw;
tmphdr.back = savehdr.back;
- tmphdr.firstused = state->blocksize;
+ tmphdr.firstused = state->args->geo->blksize;
/* write the header to the temp buffer to initialise it */
xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr);
if (xfs_attr3_leaf_order(save_blk->bp, &savehdr,
drop_blk->bp, &drophdr)) {
- xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0,
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
tmp_leaf, &tmphdr, 0,
- drophdr.count, mp);
- xfs_attr3_leaf_moveents(save_leaf, &savehdr, 0,
+ drophdr.count);
+ xfs_attr3_leaf_moveents(state->args,
+ save_leaf, &savehdr, 0,
tmp_leaf, &tmphdr, tmphdr.count,
- savehdr.count, mp);
+ savehdr.count);
} else {
- xfs_attr3_leaf_moveents(save_leaf, &savehdr, 0,
+ xfs_attr3_leaf_moveents(state->args,
+ save_leaf, &savehdr, 0,
tmp_leaf, &tmphdr, 0,
- savehdr.count, mp);
- xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0,
+ savehdr.count);
+ xfs_attr3_leaf_moveents(state->args,
+ drop_leaf, &drophdr, 0,
tmp_leaf, &tmphdr, tmphdr.count,
- drophdr.count, mp);
+ drophdr.count);
}
- memcpy(save_leaf, tmp_leaf, state->blocksize);
+ memcpy(save_leaf, tmp_leaf, state->args->geo->blksize);
savehdr = tmphdr; /* struct copy */
kmem_free(tmp_leaf);
}
xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr);
xfs_trans_log_buf(state->args->trans, save_blk->bp, 0,
- state->blocksize - 1);
+ state->args->geo->blksize - 1);
/*
* Copy out last hashval in each block for B-tree code.
@@ -2093,7 +2089,7 @@ xfs_attr3_leaf_lookup_int(
leaf = bp->b_addr;
xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
entries = xfs_attr3_leaf_entryp(leaf);
- ASSERT(ichdr.count < XFS_LBSIZE(args->dp->i_mount) / 8);
+ ASSERT(ichdr.count < args->geo->blksize / 8);
/*
* Binary search. (note: small blocks will skip this loop)
@@ -2167,11 +2163,11 @@ xfs_attr3_leaf_lookup_int(
if (!xfs_attr_namesp_match(args->flags, entry->flags))
continue;
args->index = probe;
- args->valuelen = be32_to_cpu(name_rmt->valuelen);
+ args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
args->rmtblkcnt = xfs_attr3_rmt_blocks(
args->dp->i_mount,
- args->valuelen);
+ args->rmtvaluelen);
return XFS_ERROR(EEXIST);
}
}
@@ -2197,7 +2193,7 @@ xfs_attr3_leaf_getvalue(
leaf = bp->b_addr;
xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf);
- ASSERT(ichdr.count < XFS_LBSIZE(args->dp->i_mount) / 8);
+ ASSERT(ichdr.count < args->geo->blksize / 8);
ASSERT(args->index < ichdr.count);
entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
@@ -2220,19 +2216,19 @@ xfs_attr3_leaf_getvalue(
name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
ASSERT(name_rmt->namelen == args->namelen);
ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
- valuelen = be32_to_cpu(name_rmt->valuelen);
+ args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
- valuelen);
+ args->rmtvaluelen);
if (args->flags & ATTR_KERNOVAL) {
- args->valuelen = valuelen;
+ args->valuelen = args->rmtvaluelen;
return 0;
}
- if (args->valuelen < valuelen) {
- args->valuelen = valuelen;
+ if (args->valuelen < args->rmtvaluelen) {
+ args->valuelen = args->rmtvaluelen;
return XFS_ERROR(ERANGE);
}
- args->valuelen = valuelen;
+ args->valuelen = args->rmtvaluelen;
}
return 0;
}
@@ -2248,14 +2244,14 @@ xfs_attr3_leaf_getvalue(
/*ARGSUSED*/
STATIC void
xfs_attr3_leaf_moveents(
+ struct xfs_da_args *args,
struct xfs_attr_leafblock *leaf_s,
struct xfs_attr3_icleaf_hdr *ichdr_s,
int start_s,
struct xfs_attr_leafblock *leaf_d,
struct xfs_attr3_icleaf_hdr *ichdr_d,
int start_d,
- int count,
- struct xfs_mount *mp)
+ int count)
{
struct xfs_attr_leaf_entry *entry_s;
struct xfs_attr_leaf_entry *entry_d;
@@ -2275,10 +2271,10 @@ xfs_attr3_leaf_moveents(
ASSERT(ichdr_s->magic == XFS_ATTR_LEAF_MAGIC ||
ichdr_s->magic == XFS_ATTR3_LEAF_MAGIC);
ASSERT(ichdr_s->magic == ichdr_d->magic);
- ASSERT(ichdr_s->count > 0 && ichdr_s->count < XFS_LBSIZE(mp) / 8);
+ ASSERT(ichdr_s->count > 0 && ichdr_s->count < args->geo->blksize / 8);
ASSERT(ichdr_s->firstused >= (ichdr_s->count * sizeof(*entry_s))
+ xfs_attr3_leaf_hdr_size(leaf_s));
- ASSERT(ichdr_d->count < XFS_LBSIZE(mp) / 8);
+ ASSERT(ichdr_d->count < args->geo->blksize / 8);
ASSERT(ichdr_d->firstused >= (ichdr_d->count * sizeof(*entry_d))
+ xfs_attr3_leaf_hdr_size(leaf_d));
@@ -2330,11 +2326,11 @@ xfs_attr3_leaf_moveents(
entry_d->nameidx = cpu_to_be16(ichdr_d->firstused);
entry_d->flags = entry_s->flags;
ASSERT(be16_to_cpu(entry_d->nameidx) + tmp
- <= XFS_LBSIZE(mp));
+ <= args->geo->blksize);
memmove(xfs_attr3_leaf_name(leaf_d, desti),
xfs_attr3_leaf_name(leaf_s, start_s + i), tmp);
ASSERT(be16_to_cpu(entry_s->nameidx) + tmp
- <= XFS_LBSIZE(mp));
+ <= args->geo->blksize);
memset(xfs_attr3_leaf_name(leaf_s, start_s + i), 0, tmp);
ichdr_s->usedbytes -= tmp;
ichdr_d->usedbytes += tmp;
@@ -2355,7 +2351,7 @@ xfs_attr3_leaf_moveents(
tmp = count * sizeof(xfs_attr_leaf_entry_t);
entry_s = &xfs_attr3_leaf_entryp(leaf_s)[start_s];
ASSERT(((char *)entry_s + tmp) <=
- ((char *)leaf_s + XFS_LBSIZE(mp)));
+ ((char *)leaf_s + args->geo->blksize));
memset(entry_s, 0, tmp);
} else {
/*
@@ -2370,7 +2366,7 @@ xfs_attr3_leaf_moveents(
tmp = count * sizeof(xfs_attr_leaf_entry_t);
entry_s = &xfs_attr3_leaf_entryp(leaf_s)[ichdr_s->count];
ASSERT(((char *)entry_s + tmp) <=
- ((char *)leaf_s + XFS_LBSIZE(mp)));
+ ((char *)leaf_s + args->geo->blksize));
memset(entry_s, 0, tmp);
}
@@ -2438,22 +2434,21 @@ xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index)
* a "local" or a "remote" attribute.
*/
int
-xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, int *local)
+xfs_attr_leaf_newentsize(
+ struct xfs_da_args *args,
+ int *local)
{
- int size;
+ int size;
- size = xfs_attr_leaf_entsize_local(namelen, valuelen);
- if (size < xfs_attr_leaf_entsize_local_max(blocksize)) {
- if (local) {
+ size = xfs_attr_leaf_entsize_local(args->namelen, args->valuelen);
+ if (size < xfs_attr_leaf_entsize_local_max(args->geo->blksize)) {
+ if (local)
*local = 1;
- }
- } else {
- size = xfs_attr_leaf_entsize_remote(namelen);
- if (local) {
- *local = 0;
- }
+ return size;
}
- return size;
+ if (local)
+ *local = 0;
+ return xfs_attr_leaf_entsize_remote(args->namelen);
}
@@ -2519,7 +2514,7 @@ xfs_attr3_leaf_clearflag(
ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
- name_rmt->valuelen = cpu_to_be32(args->valuelen);
+ name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
xfs_trans_log_buf(args->trans, bp,
XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
}
@@ -2677,7 +2672,7 @@ xfs_attr3_leaf_flipflags(
ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
- name_rmt->valuelen = cpu_to_be32(args->valuelen);
+ name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
xfs_trans_log_buf(args->trans, bp1,
XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
}
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index c1022138c7e..e2929da7c3b 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -19,16 +19,6 @@
#ifndef __XFS_ATTR_LEAF_H__
#define __XFS_ATTR_LEAF_H__
-/*
- * Attribute storage layout, internal structure, access macros, etc.
- *
- * Attribute lists are structured around Btrees where all the data
- * elements are in the leaf nodes. Attribute names are hashed into an int,
- * then that int is used as the index into the Btree. Since the hashval
- * of an attribute name may not be unique, we may have duplicate keys. The
- * internal links in the Btree are logical block offsets into the file.
- */
-
struct attrlist;
struct attrlist_cursor_kern;
struct xfs_attr_list_context;
@@ -38,226 +28,6 @@ struct xfs_da_state_blk;
struct xfs_inode;
struct xfs_trans;
-/*========================================================================
- * Attribute structure when equal to XFS_LBSIZE(mp) bytes.
- *========================================================================*/
-
-/*
- * This is the structure of the leaf nodes in the Btree.
- *
- * Struct leaf_entry's are packed from the top. Name/values grow from the
- * bottom but are not packed. The freemap contains run-length-encoded entries
- * for the free bytes after the leaf_entry's, but only the N largest such,
- * smaller runs are dropped. When the freemap doesn't show enough space
- * for an allocation, we compact the name/value area and try again. If we
- * still don't have enough space, then we have to split the block. The
- * name/value structs (both local and remote versions) must be 32bit aligned.
- *
- * Since we have duplicate hash keys, for each key that matches, compare
- * the actual name string. The root and intermediate node search always
- * takes the first-in-the-block key match found, so we should only have
- * to work "forw"ard. If none matches, continue with the "forw"ard leaf
- * nodes until the hash key changes or the attribute name is found.
- *
- * We store the fact that an attribute is a ROOT/USER/SECURE attribute in
- * the leaf_entry. The namespaces are independent only because we also look
- * at the namespace bit when we are looking for a matching attribute name.
- *
- * We also store an "incomplete" bit in the leaf_entry. It shows that an
- * attribute is in the middle of being created and should not be shown to
- * the user if we crash during the time that the bit is set. We clear the
- * bit when we have finished setting up the attribute. We do this because
- * we cannot create some large attributes inside a single transaction, and we
- * need some indication that we weren't finished if we crash in the middle.
- */
-#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */
-
-typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
- __be16 base; /* base of free region */
- __be16 size; /* length of free region */
-} xfs_attr_leaf_map_t;
-
-typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */
- xfs_da_blkinfo_t info; /* block type, links, etc. */
- __be16 count; /* count of active leaf_entry's */
- __be16 usedbytes; /* num bytes of names/values stored */
- __be16 firstused; /* first used byte in name area */
- __u8 holes; /* != 0 if blk needs compaction */
- __u8 pad1;
- xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE];
- /* N largest free regions */
-} xfs_attr_leaf_hdr_t;
-
-typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */
- __be32 hashval; /* hash value of name */
- __be16 nameidx; /* index into buffer of name/value */
- __u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */
- __u8 pad2; /* unused pad byte */
-} xfs_attr_leaf_entry_t;
-
-typedef struct xfs_attr_leaf_name_local {
- __be16 valuelen; /* number of bytes in value */
- __u8 namelen; /* length of name bytes */
- __u8 nameval[1]; /* name/value bytes */
-} xfs_attr_leaf_name_local_t;
-
-typedef struct xfs_attr_leaf_name_remote {
- __be32 valueblk; /* block number of value bytes */
- __be32 valuelen; /* number of bytes in value */
- __u8 namelen; /* length of name bytes */
- __u8 name[1]; /* name bytes */
-} xfs_attr_leaf_name_remote_t;
-
-typedef struct xfs_attr_leafblock {
- xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */
- xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */
- xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */
- xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */
-} xfs_attr_leafblock_t;
-
-/*
- * CRC enabled leaf structures. Called "version 3" structures to match the
- * version number of the directory and dablk structures for this feature, and
- * attr2 is already taken by the variable inode attribute fork size feature.
- */
-struct xfs_attr3_leaf_hdr {
- struct xfs_da3_blkinfo info;
- __be16 count;
- __be16 usedbytes;
- __be16 firstused;
- __u8 holes;
- __u8 pad1;
- struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE];
- __be32 pad2; /* 64 bit alignment */
-};
-
-#define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc))
-
-struct xfs_attr3_leafblock {
- struct xfs_attr3_leaf_hdr hdr;
- struct xfs_attr_leaf_entry entries[1];
-
- /*
- * The rest of the block contains the following structures after the
- * leaf entries, growing from the bottom up. The variables are never
- * referenced, the locations accessed purely from helper functions.
- *
- * struct xfs_attr_leaf_name_local
- * struct xfs_attr_leaf_name_remote
- */
-};
-
-/*
- * incore, neutral version of the attribute leaf header
- */
-struct xfs_attr3_icleaf_hdr {
- __uint32_t forw;
- __uint32_t back;
- __uint16_t magic;
- __uint16_t count;
- __uint16_t usedbytes;
- __uint16_t firstused;
- __u8 holes;
- struct {
- __uint16_t base;
- __uint16_t size;
- } freemap[XFS_ATTR_LEAF_MAPSIZE];
-};
-
-/*
- * Flags used in the leaf_entry[i].flags field.
- * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
- * on the system call, they are "or"ed together for various operations.
- */
-#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */
-#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
-#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
-#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
-#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT)
-#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT)
-#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT)
-#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT)
-
-/*
- * Conversion macros for converting namespace bits from argument flags
- * to ondisk flags.
- */
-#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE)
-#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
-#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK)
-#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK)
-#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\
- ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0))
-#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\
- ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0))
-
-/*
- * Alignment for namelist and valuelist entries (since they are mixed
- * there can be only one alignment value)
- */
-#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t))
-
-static inline int
-xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp)
-{
- if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
- return sizeof(struct xfs_attr3_leaf_hdr);
- return sizeof(struct xfs_attr_leaf_hdr);
-}
-
-static inline struct xfs_attr_leaf_entry *
-xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp)
-{
- if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
- return &((struct xfs_attr3_leafblock *)leafp)->entries[0];
- return &leafp->entries[0];
-}
-
-/*
- * Cast typed pointers for "local" and "remote" name/value structs.
- */
-static inline char *
-xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
-{
- struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp);
-
- return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)];
-}
-
-static inline xfs_attr_leaf_name_remote_t *
-xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
-{
- return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx);
-}
-
-static inline xfs_attr_leaf_name_local_t *
-xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
-{
- return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx);
-}
-
-/*
- * Calculate total bytes used (including trailing pad for alignment) for
- * a "local" name/value structure, a "remote" name/value structure, and
- * a pointer which might be either.
- */
-static inline int xfs_attr_leaf_entsize_remote(int nlen)
-{
- return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
- XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
-}
-
-static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
-{
- return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
- XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
-}
-
-static inline int xfs_attr_leaf_entsize_local_max(int bsize)
-{
- return (((bsize) >> 1) + ((bsize) >> 2));
-}
-
/*
* Used to keep a list of "remote value" extents when unlinking an inode.
*/
@@ -326,8 +96,7 @@ int xfs_attr3_root_inactive(struct xfs_trans **trans, struct xfs_inode *dp);
xfs_dahash_t xfs_attr_leaf_lasthash(struct xfs_buf *bp, int *count);
int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
struct xfs_buf *leaf2_bp);
-int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
- int *local);
+int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local);
int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
xfs_dablk_t bno, xfs_daddr_t mappedbno,
struct xfs_buf **bpp);
@@ -336,6 +105,4 @@ void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to,
void xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to,
struct xfs_attr3_icleaf_hdr *from);
-extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
-
#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index cbc80d48517..90e2eeb2120 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -18,31 +18,29 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_attr_remote.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_attr.h"
+#include "xfs_attr_sf.h"
+#include "xfs_attr_remote.h"
#include "xfs_attr_leaf.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
+#include "xfs_dinode.h"
+#include "xfs_dir2.h"
STATIC int
xfs_attr_shortform_compare(const void *a, const void *b)
@@ -229,6 +227,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
struct xfs_da_node_entry *btree;
int error, i;
struct xfs_buf *bp;
+ struct xfs_inode *dp = context->dp;
trace_xfs_attr_node_list(context);
@@ -242,7 +241,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
*/
bp = NULL;
if (cursor->blkno > 0) {
- error = xfs_da3_node_read(NULL, context->dp, cursor->blkno, -1,
+ error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1,
&bp, XFS_ATTR_FORK);
if ((error != 0) && (error != EFSCORRUPTED))
return(error);
@@ -292,7 +291,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
for (;;) {
__uint16_t magic;
- error = xfs_da3_node_read(NULL, context->dp,
+ error = xfs_da3_node_read(NULL, dp,
cursor->blkno, -1, &bp,
XFS_ATTR_FORK);
if (error)
@@ -312,8 +311,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
return XFS_ERROR(EFSCORRUPTED);
}
- xfs_da3_node_hdr_from_disk(&nodehdr, node);
- btree = xfs_da3_node_tree_p(node);
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
for (i = 0; i < nodehdr.count; btree++, i++) {
if (cursor->hashval
<= be32_to_cpu(btree->hashval)) {
@@ -349,8 +348,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
break;
cursor->blkno = leafhdr.forw;
xfs_trans_brelse(NULL, bp);
- error = xfs_attr3_leaf_read(NULL, context->dp, cursor->blkno, -1,
- &bp);
+ error = xfs_attr3_leaf_read(NULL, dp, cursor->blkno, -1, &bp);
if (error)
return error;
}
@@ -446,9 +444,11 @@ xfs_attr3_leaf_list_int(
xfs_da_args_t args;
memset((char *)&args, 0, sizeof(args));
+ args.geo = context->dp->i_mount->m_attr_geo;
args.dp = context->dp;
args.whichfork = XFS_ATTR_FORK;
args.valuelen = valuelen;
+ args.rmtvaluelen = valuelen;
args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
args.rmtblkcnt = xfs_attr3_rmt_blocks(
@@ -509,17 +509,17 @@ xfs_attr_list_int(
{
int error;
xfs_inode_t *dp = context->dp;
+ uint lock_mode;
XFS_STATS_INC(xs_attr_list);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return EIO;
- xfs_ilock(dp, XFS_ILOCK_SHARED);
-
/*
* Decide on what work routines to call based on the inode size.
*/
+ lock_mode = xfs_ilock_attr_map_shared(dp);
if (!xfs_inode_hasattr(dp)) {
error = 0;
} else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
@@ -529,9 +529,7 @@ xfs_attr_list_int(
} else {
error = xfs_attr_node_list(context);
}
-
- xfs_iunlock(dp, XFS_ILOCK_SHARED);
-
+ xfs_iunlock(dp, lock_mode);
return error;
}
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index 712a502de61..b5adfecbb8e 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -18,20 +18,19 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_error.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
@@ -42,6 +41,7 @@
#include "xfs_trace.h"
#include "xfs_cksum.h"
#include "xfs_buf_item.h"
+#include "xfs_error.h"
#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
@@ -68,7 +68,6 @@ xfs_attr3_rmt_blocks(
*/
static bool
xfs_attr3_rmt_hdr_ok(
- struct xfs_mount *mp,
void *ptr,
xfs_ino_t ino,
uint32_t offset,
@@ -110,7 +109,7 @@ xfs_attr3_rmt_verify(
if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
return false;
if (be32_to_cpu(rmt->rm_offset) +
- be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX)
+ be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
return false;
if (rmt->rm_owner == 0)
return false;
@@ -125,8 +124,8 @@ xfs_attr3_rmt_read_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
char *ptr;
int len;
- bool corrupt = false;
xfs_daddr_t bno;
+ int blksize = mp->m_attr_geo->blksize;
/* no verification of non-crc buffers */
if (!xfs_sb_version_hascrc(&mp->m_sb))
@@ -135,27 +134,25 @@ xfs_attr3_rmt_read_verify(
ptr = bp->b_addr;
bno = bp->b_bn;
len = BBTOB(bp->b_length);
- ASSERT(len >= XFS_LBSIZE(mp));
+ ASSERT(len >= blksize);
while (len > 0) {
- if (!xfs_verify_cksum(ptr, XFS_LBSIZE(mp),
- XFS_ATTR3_RMT_CRC_OFF)) {
- corrupt = true;
+ if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) {
+ xfs_buf_ioerror(bp, EFSBADCRC);
break;
}
- if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) {
- corrupt = true;
+ if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
break;
}
- len -= XFS_LBSIZE(mp);
- ptr += XFS_LBSIZE(mp);
- bno += mp->m_bsize;
+ len -= blksize;
+ ptr += blksize;
+ bno += BTOBB(blksize);
}
- if (corrupt) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
- } else
+ if (bp->b_error)
+ xfs_verifier_error(bp);
+ else
ASSERT(len == 0);
}
@@ -168,6 +165,7 @@ xfs_attr3_rmt_write_verify(
char *ptr;
int len;
xfs_daddr_t bno;
+ int blksize = mp->m_attr_geo->blksize;
/* no verification of non-crc buffers */
if (!xfs_sb_version_hascrc(&mp->m_sb))
@@ -176,13 +174,12 @@ xfs_attr3_rmt_write_verify(
ptr = bp->b_addr;
bno = bp->b_bn;
len = BBTOB(bp->b_length);
- ASSERT(len >= XFS_LBSIZE(mp));
+ ASSERT(len >= blksize);
while (len > 0) {
- if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) {
- XFS_CORRUPTION_ERROR(__func__,
- XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
return;
}
if (bip) {
@@ -191,11 +188,11 @@ xfs_attr3_rmt_write_verify(
rmt = (struct xfs_attr3_rmt_hdr *)ptr;
rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn);
}
- xfs_update_cksum(ptr, XFS_LBSIZE(mp), XFS_ATTR3_RMT_CRC_OFF);
+ xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF);
- len -= XFS_LBSIZE(mp);
- ptr += XFS_LBSIZE(mp);
- bno += mp->m_bsize;
+ len -= blksize;
+ ptr += blksize;
+ bno += BTOBB(blksize);
}
ASSERT(len == 0);
}
@@ -244,17 +241,18 @@ xfs_attr_rmtval_copyout(
char *src = bp->b_addr;
xfs_daddr_t bno = bp->b_bn;
int len = BBTOB(bp->b_length);
+ int blksize = mp->m_attr_geo->blksize;
- ASSERT(len >= XFS_LBSIZE(mp));
+ ASSERT(len >= blksize);
while (len > 0 && *valuelen > 0) {
int hdr_size = 0;
- int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp));
+ int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
byte_cnt = min(*valuelen, byte_cnt);
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset,
+ if (!xfs_attr3_rmt_hdr_ok(src, ino, *offset,
byte_cnt, bno)) {
xfs_alert(mp,
"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
@@ -267,9 +265,9 @@ xfs_attr_rmtval_copyout(
memcpy(*dst, src + hdr_size, byte_cnt);
/* roll buffer forwards */
- len -= XFS_LBSIZE(mp);
- src += XFS_LBSIZE(mp);
- bno += mp->m_bsize;
+ len -= blksize;
+ src += blksize;
+ bno += BTOBB(blksize);
/* roll attribute data forwards */
*valuelen -= byte_cnt;
@@ -291,12 +289,13 @@ xfs_attr_rmtval_copyin(
char *dst = bp->b_addr;
xfs_daddr_t bno = bp->b_bn;
int len = BBTOB(bp->b_length);
+ int blksize = mp->m_attr_geo->blksize;
- ASSERT(len >= XFS_LBSIZE(mp));
+ ASSERT(len >= blksize);
while (len > 0 && *valuelen > 0) {
int hdr_size;
- int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp));
+ int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, blksize);
byte_cnt = min(*valuelen, byte_cnt);
hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset,
@@ -308,17 +307,17 @@ xfs_attr_rmtval_copyin(
* If this is the last block, zero the remainder of it.
* Check that we are actually the last block, too.
*/
- if (byte_cnt + hdr_size < XFS_LBSIZE(mp)) {
+ if (byte_cnt + hdr_size < blksize) {
ASSERT(*valuelen - byte_cnt == 0);
- ASSERT(len == XFS_LBSIZE(mp));
+ ASSERT(len == blksize);
memset(dst + hdr_size + byte_cnt, 0,
- XFS_LBSIZE(mp) - hdr_size - byte_cnt);
+ blksize - hdr_size - byte_cnt);
}
/* roll buffer forwards */
- len -= XFS_LBSIZE(mp);
- dst += XFS_LBSIZE(mp);
- bno += mp->m_bsize;
+ len -= blksize;
+ dst += blksize;
+ bno += BTOBB(blksize);
/* roll attribute data forwards */
*valuelen -= byte_cnt;
@@ -340,7 +339,7 @@ xfs_attr_rmtval_get(
struct xfs_buf *bp;
xfs_dablk_t lblkno = args->rmtblkno;
__uint8_t *dst = args->value;
- int valuelen = args->valuelen;
+ int valuelen;
int nmap;
int error;
int blkcnt = args->rmtblkcnt;
@@ -350,7 +349,9 @@ xfs_attr_rmtval_get(
trace_xfs_attr_rmtval_get(args);
ASSERT(!(args->flags & ATTR_KERNOVAL));
+ ASSERT(args->rmtvaluelen == args->valuelen);
+ valuelen = args->rmtvaluelen;
while (valuelen > 0) {
nmap = ATTR_RMTVALUE_MAPSIZE;
error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
@@ -418,7 +419,7 @@ xfs_attr_rmtval_set(
* attributes have headers, we can't just do a straight byte to FSB
* conversion and have to take the header space into account.
*/
- blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
+ blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen);
error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
XFS_ATTR_FORK);
if (error)
@@ -483,7 +484,7 @@ xfs_attr_rmtval_set(
*/
lblkno = args->rmtblkno;
blkcnt = args->rmtblkcnt;
- valuelen = args->valuelen;
+ valuelen = args->rmtvaluelen;
while (valuelen > 0) {
struct xfs_buf *bp;
xfs_daddr_t dblkno;
diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/xfs_attr_remote.h
index 92a8fd7977c..5a9acfa156d 100644
--- a/fs/xfs/xfs_attr_remote.h
+++ b/fs/xfs/xfs_attr_remote.h
@@ -18,35 +18,6 @@
#ifndef __XFS_ATTR_REMOTE_H__
#define __XFS_ATTR_REMOTE_H__
-#define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */
-
-/*
- * There is one of these headers per filesystem block in a remote attribute.
- * This is done to ensure there is a 1:1 mapping between the attribute value
- * length and the number of blocks needed to store the attribute. This makes the
- * verification of a buffer a little more complex, but greatly simplifies the
- * allocation, reading and writing of these attributes as we don't have to guess
- * the number of blocks needed to store the attribute data.
- */
-struct xfs_attr3_rmt_hdr {
- __be32 rm_magic;
- __be32 rm_offset;
- __be32 rm_bytes;
- __be32 rm_crc;
- uuid_t rm_uuid;
- __be64 rm_owner;
- __be64 rm_blkno;
- __be64 rm_lsn;
-};
-
-#define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc)
-
-#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \
- ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
- sizeof(struct xfs_attr3_rmt_hdr) : 0))
-
-extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
-
int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
int xfs_attr_rmtval_get(struct xfs_da_args *args);
diff --git a/fs/xfs/xfs_bit.c b/fs/xfs/xfs_bit.c
index 48228848f5a..0e8885a5964 100644
--- a/fs/xfs/xfs_bit.c
+++ b/fs/xfs/xfs_bit.c
@@ -16,10 +16,8 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
+#include "xfs_log_format.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
/*
* XFS bit manipulation routines, used in non-realtime code.
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
index f1e3c907044..e1649c0d3e0 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/xfs_bit.h
@@ -66,8 +66,11 @@ static inline int xfs_lowbit64(__uint64_t v)
n = ffs(w);
} else { /* upper bits */
w = (__uint32_t)(v >> 32);
- if (w && (n = ffs(w)))
- n += 32;
+ if (w) {
+ n = ffs(w);
+ if (n)
+ n += 32;
+ }
}
return n - 1;
}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index f47e65c30be..75c3fe5f3d9 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -17,39 +17,37 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
#include "xfs_dir2.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
-#include "xfs_mount.h"
-#include "xfs_itable.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_extfree_item.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
-#include "xfs_attr_leaf.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_buf_item.h"
-#include "xfs_filestream.h"
#include "xfs_trace.h"
#include "xfs_symlink.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_dinode.h"
+#include "xfs_filestream.h"
kmem_zone_t *xfs_bmap_free_item_zone;
@@ -96,7 +94,7 @@ xfs_bmap_compute_maxlevels(
maxleafents = MAXAEXTNUM;
sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
}
- maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0);
+ maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
minleafrecs = mp->m_bmap_dmnr[0];
minnoderecs = mp->m_bmap_dmnr[1];
maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
@@ -235,7 +233,6 @@ xfs_default_attroffset(
*/
STATIC void
xfs_bmap_forkoff_reset(
- xfs_mount_t *mp,
xfs_inode_t *ip,
int whichfork)
{
@@ -907,7 +904,7 @@ xfs_bmap_local_to_extents_empty(
ASSERT(ifp->if_bytes == 0);
ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
- xfs_bmap_forkoff_reset(ip->i_mount, ip, whichfork);
+ xfs_bmap_forkoff_reset(ip, whichfork);
ifp->if_flags &= ~XFS_IFINLINE;
ifp->if_flags |= XFS_IFEXTENTS;
XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
@@ -1101,10 +1098,11 @@ xfs_bmap_add_attrfork_local(
if (S_ISDIR(ip->i_d.di_mode)) {
memset(&dargs, 0, sizeof(dargs));
+ dargs.geo = ip->i_mount->m_dir_geo;
dargs.dp = ip;
dargs.firstblock = firstblock;
dargs.flist = flist;
- dargs.total = ip->i_mount->m_dirblkfsbs;
+ dargs.total = dargs.geo->fsbcount;
dargs.whichfork = XFS_DATA_FORK;
dargs.trans = tp;
return xfs_dir2_sf_to_block(&dargs);
@@ -1139,6 +1137,7 @@ xfs_bmap_add_attrfork(
int committed; /* xaction was committed */
int logflags; /* logging flags */
int error; /* error return value */
+ int cancel_flags = 0;
ASSERT(XFS_IFORK_Q(ip) == 0);
@@ -1149,19 +1148,20 @@ xfs_bmap_add_attrfork(
if (rsvd)
tp->t_flags |= XFS_TRANS_RESERVE;
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
- if (error)
- goto error0;
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+ cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
xfs_ilock(ip, XFS_ILOCK_EXCL);
error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
XFS_QMOPT_RES_REGBLKS);
- if (error) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
- return error;
- }
+ if (error)
+ goto trans_cancel;
+ cancel_flags |= XFS_TRANS_ABORT;
if (XFS_IFORK_Q(ip))
- goto error1;
+ goto trans_cancel;
if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
/*
* For inodes coming from pre-6.2 filesystems.
@@ -1171,7 +1171,7 @@ xfs_bmap_add_attrfork(
}
ASSERT(ip->i_d.di_anextents == 0);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
switch (ip->i_d.di_format) {
@@ -1193,7 +1193,7 @@ xfs_bmap_add_attrfork(
default:
ASSERT(0);
error = XFS_ERROR(EINVAL);
- goto error1;
+ goto trans_cancel;
}
ASSERT(ip->i_afp == NULL);
@@ -1221,7 +1221,7 @@ xfs_bmap_add_attrfork(
if (logflags)
xfs_trans_log_inode(tp, ip, logflags);
if (error)
- goto error2;
+ goto bmap_cancel;
if (!xfs_sb_version_hasattr(&mp->m_sb) ||
(!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
__int64_t sbfields = 0;
@@ -1244,14 +1244,16 @@ xfs_bmap_add_attrfork(
error = xfs_bmap_finish(&tp, &flist, &committed);
if (error)
- goto error2;
- return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-error2:
+ goto bmap_cancel;
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+
+bmap_cancel:
xfs_bmap_cancel(&flist);
-error1:
+trans_cancel:
+ xfs_trans_cancel(tp, cancel_flags);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
-error0:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
return error;
}
@@ -1482,7 +1484,7 @@ xfs_bmap_search_extents(
xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
"Access to block zero in inode %llu "
"start_block: %llx start_off: %llx "
- "blkcnt: %llx extent-state: %x lastx: %x\n",
+ "blkcnt: %llx extent-state: %x lastx: %x",
(unsigned long long)ip->i_ino,
(unsigned long long)gotp->br_startblock,
(unsigned long long)gotp->br_startoff,
@@ -1633,7 +1635,7 @@ xfs_bmap_last_extent(
* blocks at the end of the file which do not start at the previous data block,
* we will try to align the new blocks at stripe unit boundaries.
*
- * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be
+ * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be
* at, or past the EOF.
*/
STATIC int
@@ -1648,9 +1650,14 @@ xfs_bmap_isaeof(
bma->aeof = 0;
error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
&is_empty);
- if (error || is_empty)
+ if (error)
return error;
+ if (is_empty) {
+ bma->aeof = 1;
+ return 0;
+ }
+
/*
* Check if we are allocation or past the last extent, or at least into
* the last delayed allocated extent.
@@ -1668,7 +1675,6 @@ xfs_bmap_isaeof(
*/
int
xfs_bmap_last_offset(
- struct xfs_trans *tp,
struct xfs_inode *ip,
xfs_fileoff_t *last_block,
int whichfork)
@@ -3510,6 +3516,67 @@ xfs_bmap_adjacent(
#undef ISVALID
}
+static int
+xfs_bmap_longest_free_extent(
+ struct xfs_trans *tp,
+ xfs_agnumber_t ag,
+ xfs_extlen_t *blen,
+ int *notinit)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_perag *pag;
+ xfs_extlen_t longest;
+ int error = 0;
+
+ pag = xfs_perag_get(mp, ag);
+ if (!pag->pagf_init) {
+ error = xfs_alloc_pagf_init(mp, tp, ag, XFS_ALLOC_FLAG_TRYLOCK);
+ if (error)
+ goto out;
+
+ if (!pag->pagf_init) {
+ *notinit = 1;
+ goto out;
+ }
+ }
+
+ longest = xfs_alloc_longest_free_extent(mp, pag);
+ if (*blen < longest)
+ *blen = longest;
+
+out:
+ xfs_perag_put(pag);
+ return error;
+}
+
+static void
+xfs_bmap_select_minlen(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t *blen,
+ int notinit)
+{
+ if (notinit || *blen < ap->minlen) {
+ /*
+ * Since we did a BUF_TRYLOCK above, it is possible that
+ * there is space for this request.
+ */
+ args->minlen = ap->minlen;
+ } else if (*blen < args->maxlen) {
+ /*
+ * If the best seen length is less than the request length,
+ * use the best as the minimum.
+ */
+ args->minlen = *blen;
+ } else {
+ /*
+ * Otherwise we've seen an extent as big as maxlen, use that
+ * as the minimum.
+ */
+ args->minlen = args->maxlen;
+ }
+}
+
STATIC int
xfs_bmap_btalloc_nullfb(
struct xfs_bmalloca *ap,
@@ -3517,111 +3584,74 @@ xfs_bmap_btalloc_nullfb(
xfs_extlen_t *blen)
{
struct xfs_mount *mp = ap->ip->i_mount;
- struct xfs_perag *pag;
xfs_agnumber_t ag, startag;
int notinit = 0;
int error;
- if (ap->userdata && xfs_inode_is_filestream(ap->ip))
- args->type = XFS_ALLOCTYPE_NEAR_BNO;
- else
- args->type = XFS_ALLOCTYPE_START_BNO;
+ args->type = XFS_ALLOCTYPE_START_BNO;
args->total = ap->total;
- /*
- * Search for an allocation group with a single extent large enough
- * for the request. If one isn't found, then adjust the minimum
- * allocation size to the largest space found.
- */
startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
if (startag == NULLAGNUMBER)
startag = ag = 0;
- pag = xfs_perag_get(mp, ag);
while (*blen < args->maxlen) {
- if (!pag->pagf_init) {
- error = xfs_alloc_pagf_init(mp, args->tp, ag,
- XFS_ALLOC_FLAG_TRYLOCK);
- if (error) {
- xfs_perag_put(pag);
- return error;
- }
- }
-
- /*
- * See xfs_alloc_fix_freelist...
- */
- if (pag->pagf_init) {
- xfs_extlen_t longest;
- longest = xfs_alloc_longest_free_extent(mp, pag);
- if (*blen < longest)
- *blen = longest;
- } else
- notinit = 1;
-
- if (xfs_inode_is_filestream(ap->ip)) {
- if (*blen >= args->maxlen)
- break;
-
- if (ap->userdata) {
- /*
- * If startag is an invalid AG, we've
- * come here once before and
- * xfs_filestream_new_ag picked the
- * best currently available.
- *
- * Don't continue looping, since we
- * could loop forever.
- */
- if (startag == NULLAGNUMBER)
- break;
-
- error = xfs_filestream_new_ag(ap, &ag);
- xfs_perag_put(pag);
- if (error)
- return error;
+ error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
+ &notinit);
+ if (error)
+ return error;
- /* loop again to set 'blen'*/
- startag = NULLAGNUMBER;
- pag = xfs_perag_get(mp, ag);
- continue;
- }
- }
if (++ag == mp->m_sb.sb_agcount)
ag = 0;
if (ag == startag)
break;
- xfs_perag_put(pag);
- pag = xfs_perag_get(mp, ag);
}
- xfs_perag_put(pag);
- /*
- * Since the above loop did a BUF_TRYLOCK, it is
- * possible that there is space for this request.
- */
- if (notinit || *blen < ap->minlen)
- args->minlen = ap->minlen;
- /*
- * If the best seen length is less than the request
- * length, use the best as the minimum.
- */
- else if (*blen < args->maxlen)
- args->minlen = *blen;
- /*
- * Otherwise we've seen an extent as big as maxlen,
- * use that as the minimum.
- */
- else
- args->minlen = args->maxlen;
+ xfs_bmap_select_minlen(ap, args, blen, notinit);
+ return 0;
+}
+
+STATIC int
+xfs_bmap_btalloc_filestreams(
+ struct xfs_bmalloca *ap,
+ struct xfs_alloc_arg *args,
+ xfs_extlen_t *blen)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ xfs_agnumber_t ag;
+ int notinit = 0;
+ int error;
+
+ args->type = XFS_ALLOCTYPE_NEAR_BNO;
+ args->total = ap->total;
+
+ ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ if (ag == NULLAGNUMBER)
+ ag = 0;
+
+ error = xfs_bmap_longest_free_extent(args->tp, ag, blen, &notinit);
+ if (error)
+ return error;
+
+ if (*blen < args->maxlen) {
+ error = xfs_filestream_new_ag(ap, &ag);
+ if (error)
+ return error;
+
+ error = xfs_bmap_longest_free_extent(args->tp, ag, blen,
+ &notinit);
+ if (error)
+ return error;
+
+ }
+
+ xfs_bmap_select_minlen(ap, args, blen, notinit);
/*
- * set the failure fallback case to look in the selected
- * AG as the stream may have moved.
+ * Set the failure fallback case to look in the selected AG as stream
+ * may have moved.
*/
- if (xfs_inode_is_filestream(ap->ip))
- ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
-
+ ap->blkno = args->fsbno = XFS_AGB_TO_FSB(mp, ag, 0);
return 0;
}
@@ -3641,10 +3671,19 @@ xfs_bmap_btalloc(
int isaligned;
int tryagain;
int error;
+ int stripe_align;
ASSERT(ap->length);
mp = ap->ip->i_mount;
+
+ /* stripe alignment for allocation is determined by mount parameters */
+ stripe_align = 0;
+ if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+ stripe_align = mp->m_swidth;
+ else if (mp->m_dalign)
+ stripe_align = mp->m_dalign;
+
align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
if (unlikely(align)) {
error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
@@ -3653,6 +3692,8 @@ xfs_bmap_btalloc(
ASSERT(!error);
ASSERT(ap->length);
}
+
+
nullfb = *ap->firstblock == NULLFSBLOCK;
fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
if (nullfb) {
@@ -3690,7 +3731,15 @@ xfs_bmap_btalloc(
args.firstblock = *ap->firstblock;
blen = 0;
if (nullfb) {
- error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
+ /*
+ * Search for an allocation group with a single extent large
+ * enough for the request. If one isn't found, then adjust
+ * the minimum allocation size to the largest space found.
+ */
+ if (ap->userdata && xfs_inode_is_filestream(ap->ip))
+ error = xfs_bmap_btalloc_filestreams(ap, &args, &blen);
+ else
+ error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
if (error)
return error;
} else if (ap->flist->xbf_low) {
@@ -3728,7 +3777,7 @@ xfs_bmap_btalloc(
*/
if (!ap->flist->xbf_low && ap->aeof) {
if (!ap->offset) {
- args.alignment = mp->m_dalign;
+ args.alignment = stripe_align;
atype = args.type;
isaligned = 1;
/*
@@ -3753,13 +3802,13 @@ xfs_bmap_btalloc(
* of minlen+alignment+slop doesn't go up
* between the calls.
*/
- if (blen > mp->m_dalign && blen <= args.maxlen)
- nextminlen = blen - mp->m_dalign;
+ if (blen > stripe_align && blen <= args.maxlen)
+ nextminlen = blen - stripe_align;
else
nextminlen = args.minlen;
- if (nextminlen + mp->m_dalign > args.minlen + 1)
+ if (nextminlen + stripe_align > args.minlen + 1)
args.minalignslop =
- nextminlen + mp->m_dalign -
+ nextminlen + stripe_align -
args.minlen - 1;
else
args.minalignslop = 0;
@@ -3781,7 +3830,7 @@ xfs_bmap_btalloc(
*/
args.type = atype;
args.fsbno = ap->blkno;
- args.alignment = mp->m_dalign;
+ args.alignment = stripe_align;
args.minlen = nextminlen;
args.minalignslop = 0;
isaligned = 1;
@@ -3995,6 +4044,7 @@ xfs_bmapi_read(
ASSERT(*nmap >= 1);
ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
XFS_BMAPI_IGSTATE)));
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -4189,6 +4239,7 @@ xfs_bmapi_delay(
ASSERT(*nmap >= 1);
ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
ASSERT(!(flags & ~XFS_BMAPI_ENTIRE));
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
@@ -4247,8 +4298,8 @@ xfs_bmapi_delay(
}
-int
-__xfs_bmapi_allocate(
+static int
+xfs_bmapi_allocate(
struct xfs_bmalloca *bma)
{
struct xfs_mount *mp = bma->ip->i_mount;
@@ -4482,6 +4533,7 @@ xfs_bmapi_write(
ASSERT(tp != NULL);
ASSERT(len > 0);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -4526,9 +4578,6 @@ xfs_bmapi_write(
bma.flist = flist;
bma.firstblock = firstblock;
- if (flags & XFS_BMAPI_STACK_SWITCH)
- bma.stack_switch = 1;
-
while (bno < end && n < *nmap) {
inhole = eof || bma.got.br_startoff > bno;
wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
@@ -5033,6 +5082,7 @@ xfs_bunmapi(
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(len > 0);
ASSERT(nexts >= 0);
@@ -5356,3 +5406,201 @@ error0:
}
return error;
}
+
+/*
+ * Shift extent records to the left to cover a hole.
+ *
+ * The maximum number of extents to be shifted in a single operation
+ * is @num_exts, and @current_ext keeps track of the current extent
+ * index we have shifted. @offset_shift_fsb is the length by which each
+ * extent is shifted. If there is no hole to shift the extents
+ * into, this will be considered invalid operation and we abort immediately.
+ */
+int
+xfs_bmap_shift_extents(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int *done,
+ xfs_fileoff_t start_fsb,
+ xfs_fileoff_t offset_shift_fsb,
+ xfs_extnum_t *current_ext,
+ xfs_fsblock_t *firstblock,
+ struct xfs_bmap_free *flist,
+ int num_exts)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_bmbt_rec_host *gotp;
+ struct xfs_bmbt_irec got;
+ struct xfs_bmbt_irec left;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp;
+ xfs_extnum_t nexts = 0;
+ xfs_fileoff_t startoff;
+ int error = 0;
+ int i;
+ int whichfork = XFS_DATA_FORK;
+ int logflags;
+ xfs_filblks_t blockcount = 0;
+ int total_extents;
+
+ if (unlikely(XFS_TEST_ERROR(
+ (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
+ mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+ XFS_ERROR_REPORT("xfs_bmap_shift_extents",
+ XFS_ERRLEVEL_LOW, mp);
+ return XFS_ERROR(EFSCORRUPTED);
+ }
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ ASSERT(current_ext != NULL);
+
+ ifp = XFS_IFORK_PTR(ip, whichfork);
+ if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+ /* Read in all the extents */
+ error = xfs_iread_extents(tp, ip, whichfork);
+ if (error)
+ return error;
+ }
+
+ /*
+ * If *current_ext is 0, we would need to lookup the extent
+ * from where we would start shifting and store it in gotp.
+ */
+ if (!*current_ext) {
+ gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
+ /*
+ * gotp can be null in 2 cases: 1) if there are no extents
+ * or 2) start_fsb lies in a hole beyond which there are
+ * no extents. Either way, we are done.
+ */
+ if (!gotp) {
+ *done = 1;
+ return 0;
+ }
+ }
+
+ /* We are going to change core inode */
+ logflags = XFS_ILOG_CORE;
+ if (ifp->if_flags & XFS_IFBROOT) {
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
+ cur->bc_private.b.firstblock = *firstblock;
+ cur->bc_private.b.flist = flist;
+ cur->bc_private.b.flags = 0;
+ } else {
+ cur = NULL;
+ logflags |= XFS_ILOG_DEXT;
+ }
+
+ /*
+ * There may be delalloc extents in the data fork before the range we
+ * are collapsing out, so we cannot
+ * use the count of real extents here. Instead we have to calculate it
+ * from the incore fork.
+ */
+ total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+ while (nexts++ < num_exts && *current_ext < total_extents) {
+
+ gotp = xfs_iext_get_ext(ifp, *current_ext);
+ xfs_bmbt_get_all(gotp, &got);
+ startoff = got.br_startoff - offset_shift_fsb;
+
+ /*
+ * Before shifting extent into hole, make sure that the hole
+ * is large enough to accomodate the shift.
+ */
+ if (*current_ext) {
+ xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
+ *current_ext - 1), &left);
+
+ if (startoff < left.br_startoff + left.br_blockcount)
+ error = XFS_ERROR(EINVAL);
+ } else if (offset_shift_fsb > got.br_startoff) {
+ /*
+ * When first extent is shifted, offset_shift_fsb
+ * should be less than the stating offset of
+ * the first extent.
+ */
+ error = XFS_ERROR(EINVAL);
+ }
+
+ if (error)
+ goto del_cursor;
+
+ if (cur) {
+ error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+ got.br_startblock,
+ got.br_blockcount,
+ &i);
+ if (error)
+ goto del_cursor;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+ }
+
+ /* Check if we can merge 2 adjacent extents */
+ if (*current_ext &&
+ left.br_startoff + left.br_blockcount == startoff &&
+ left.br_startblock + left.br_blockcount ==
+ got.br_startblock &&
+ left.br_state == got.br_state &&
+ left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
+ blockcount = left.br_blockcount +
+ got.br_blockcount;
+ xfs_iext_remove(ip, *current_ext, 1, 0);
+ if (cur) {
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto del_cursor;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+ }
+ XFS_IFORK_NEXT_SET(ip, whichfork,
+ XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+ gotp = xfs_iext_get_ext(ifp, --*current_ext);
+ xfs_bmbt_get_all(gotp, &got);
+
+ /* Make cursor point to the extent we will update */
+ if (cur) {
+ error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
+ got.br_startblock,
+ got.br_blockcount,
+ &i);
+ if (error)
+ goto del_cursor;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
+ }
+
+ xfs_bmbt_set_blockcount(gotp, blockcount);
+ got.br_blockcount = blockcount;
+ } else {
+ /* We have to update the startoff */
+ xfs_bmbt_set_startoff(gotp, startoff);
+ got.br_startoff = startoff;
+ }
+
+ if (cur) {
+ error = xfs_bmbt_update(cur, got.br_startoff,
+ got.br_startblock,
+ got.br_blockcount,
+ got.br_state);
+ if (error)
+ goto del_cursor;
+ }
+
+ (*current_ext)++;
+ total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
+ }
+
+ /* Check if we are done */
+ if (*current_ext == total_extents)
+ *done = 1;
+
+del_cursor:
+ if (cur)
+ xfs_btree_del_cursor(cur,
+ error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+
+ xfs_trans_log_inode(tp, ip, logflags);
+ return error;
+}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 33b41f35122..b879ca56a64 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -77,7 +77,6 @@ typedef struct xfs_bmap_free
* from written to unwritten, otherwise convert from unwritten to written.
*/
#define XFS_BMAPI_CONVERT 0x040
-#define XFS_BMAPI_STACK_SWITCH 0x080
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
@@ -86,8 +85,7 @@ typedef struct xfs_bmap_free
{ XFS_BMAPI_PREALLOC, "PREALLOC" }, \
{ XFS_BMAPI_IGSTATE, "IGSTATE" }, \
{ XFS_BMAPI_CONTIG, "CONTIG" }, \
- { XFS_BMAPI_CONVERT, "CONVERT" }, \
- { XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" }
+ { XFS_BMAPI_CONVERT, "CONVERT" }
static inline int xfs_bmapi_aflag(int w)
@@ -127,6 +125,16 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp)
{ BMAP_RIGHT_FILLING, "RF" }, \
{ BMAP_ATTRFORK, "ATTR" }
+
+/*
+ * This macro is used to determine how many extents will be shifted
+ * in one write transaction. We could require two splits,
+ * an extent move on the first and an extent merge on the second,
+ * So it is proper that one extent is shifted inside write transaction
+ * at a time.
+ */
+#define XFS_BMAP_MAX_SHIFT_EXTENTS 1
+
#ifdef DEBUG
void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt,
int whichfork, unsigned long caller_ip);
@@ -146,8 +154,8 @@ int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t *last_block, int whichfork);
-int xfs_bmap_last_offset(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t *unused, int whichfork);
+int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused,
+ int whichfork);
int xfs_bmap_one_block(struct xfs_inode *ip, int whichfork);
int xfs_bmap_read_extents(struct xfs_trans *tp, struct xfs_inode *ip,
int whichfork);
@@ -169,5 +177,10 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
xfs_extnum_t num);
uint xfs_default_attroffset(struct xfs_inode *ip);
+int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
+ int *done, xfs_fileoff_t start_fsb,
+ xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext,
+ xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist,
+ int num_exts);
#endif /* __XFS_BMAP_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index bb8de8e399c..948836c4fd9 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -17,27 +17,26 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
-#include "xfs_itable.h"
+#include "xfs_bmap_btree.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
+#include "xfs_dinode.h"
/*
* Determine the extent state.
@@ -85,7 +84,7 @@ xfs_bmdr_to_bmbt(
rblock->bb_level = dblock->bb_level;
ASSERT(be16_to_cpu(rblock->bb_level) > 0);
rblock->bb_numrecs = dblock->bb_numrecs;
- dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
+ dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
@@ -444,7 +443,7 @@ xfs_bmbt_to_bmdr(
ASSERT(rblock->bb_level != 0);
dblock->bb_level = rblock->bb_level;
dblock->bb_numrecs = rblock->bb_numrecs;
- dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
+ dmxr = xfs_bmdr_maxrecs(dblocklen, 0);
fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
@@ -520,7 +519,6 @@ xfs_bmbt_alloc_block(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *start,
union xfs_btree_ptr *new,
- int length,
int *stat)
{
xfs_alloc_arg_t args; /* block allocation args */
@@ -673,8 +671,7 @@ xfs_bmbt_get_dmaxrecs(
{
if (level != cur->bc_nlevels - 1)
return cur->bc_mp->m_bmap_dmxr[level != 0];
- return xfs_bmdr_maxrecs(cur->bc_mp, cur->bc_private.b.forksize,
- level == 0);
+ return xfs_bmdr_maxrecs(cur->bc_private.b.forksize, level == 0);
}
STATIC void
@@ -781,12 +778,14 @@ static void
xfs_bmbt_read_verify(
struct xfs_buf *bp)
{
- if (!(xfs_btree_lblock_verify_crc(bp) &&
- xfs_bmbt_verify(bp))) {
- trace_xfs_btree_corrupt(bp, _RET_IP_);
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
- bp->b_target->bt_mount, bp->b_addr);
+ if (!xfs_btree_lblock_verify_crc(bp))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_bmbt_verify(bp))
xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp);
}
}
@@ -795,11 +794,9 @@ xfs_bmbt_write_verify(
struct xfs_buf *bp)
{
if (!xfs_bmbt_verify(bp)) {
- xfs_warn(bp->b_target->bt_mount, "bmbt daddr 0x%llx failed", bp->b_bn);
trace_xfs_btree_corrupt(bp, _RET_IP_);
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
- bp->b_target->bt_mount, bp->b_addr);
xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
return;
}
xfs_btree_lblock_calc_crc(bp);
@@ -915,7 +912,6 @@ xfs_bmbt_maxrecs(
*/
int
xfs_bmdr_maxrecs(
- struct xfs_mount *mp,
int blocklen,
int leaf)
{
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index e367461a638..819a8a4dee9 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -18,9 +18,6 @@
#ifndef __XFS_BMAP_BTREE_H__
#define __XFS_BMAP_BTREE_H__
-#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */
-#define XFS_BMAP_CRC_MAGIC 0x424d4133 /* 'BMA3' */
-
struct xfs_btree_cur;
struct xfs_btree_block;
struct xfs_mount;
@@ -28,85 +25,6 @@ struct xfs_inode;
struct xfs_trans;
/*
- * Bmap root header, on-disk form only.
- */
-typedef struct xfs_bmdr_block {
- __be16 bb_level; /* 0 is a leaf */
- __be16 bb_numrecs; /* current # of data records */
-} xfs_bmdr_block_t;
-
-/*
- * Bmap btree record and extent descriptor.
- * l0:63 is an extent flag (value 1 indicates non-normal).
- * l0:9-62 are startoff.
- * l0:0-8 and l1:21-63 are startblock.
- * l1:0-20 are blockcount.
- */
-#define BMBT_EXNTFLAG_BITLEN 1
-#define BMBT_STARTOFF_BITLEN 54
-#define BMBT_STARTBLOCK_BITLEN 52
-#define BMBT_BLOCKCOUNT_BITLEN 21
-
-typedef struct xfs_bmbt_rec {
- __be64 l0, l1;
-} xfs_bmbt_rec_t;
-
-typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */
-typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
-
-typedef struct xfs_bmbt_rec_host {
- __uint64_t l0, l1;
-} xfs_bmbt_rec_host_t;
-
-/*
- * Values and macros for delayed-allocation startblock fields.
- */
-#define STARTBLOCKVALBITS 17
-#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20)
-#define DSTARTBLOCKMASKBITS (15 + 20)
-#define STARTBLOCKMASK \
- (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
-#define DSTARTBLOCKMASK \
- (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
-
-static inline int isnullstartblock(xfs_fsblock_t x)
-{
- return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
-}
-
-static inline int isnulldstartblock(xfs_dfsbno_t x)
-{
- return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK;
-}
-
-static inline xfs_fsblock_t nullstartblock(int k)
-{
- ASSERT(k < (1 << STARTBLOCKVALBITS));
- return STARTBLOCKMASK | (k);
-}
-
-static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
-{
- return (xfs_filblks_t)((x) & ~STARTBLOCKMASK);
-}
-
-/*
- * Possible extent formats.
- */
-typedef enum {
- XFS_EXTFMT_NOSTATE = 0,
- XFS_EXTFMT_HASSTATE
-} xfs_exntfmt_t;
-
-/*
- * Possible extent states.
- */
-typedef enum {
- XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
- XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID
-} xfs_exntst_t;
-
-/*
* Extent state and extent format macros.
*/
#define XFS_EXTFMT_INODE(x) \
@@ -115,27 +33,6 @@ typedef enum {
#define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN)
/*
- * Incore version of above.
- */
-typedef struct xfs_bmbt_irec
-{
- xfs_fileoff_t br_startoff; /* starting file offset */
- xfs_fsblock_t br_startblock; /* starting block number */
- xfs_filblks_t br_blockcount; /* number of blocks */
- xfs_exntst_t br_state; /* extent state */
-} xfs_bmbt_irec_t;
-
-/*
- * Key structure for non-leaf levels of the tree.
- */
-typedef struct xfs_bmbt_key {
- __be64 br_startoff; /* starting file offset */
-} xfs_bmbt_key_t, xfs_bmdr_key_t;
-
-/* btree pointer type */
-typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
-
-/*
* Btree block header size depends on a superblock flag.
*/
#define XFS_BMBT_BLOCK_LEN(mp) \
@@ -233,7 +130,7 @@ extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
xfs_bmdr_block_t *, int);
extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
-extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+extern int xfs_bmdr_maxrecs(int blocklen, int leaf);
extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
@@ -243,6 +140,4 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_inode *, int);
-extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
-
#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 97f952caea7..64731ef3324 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -18,31 +18,31 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
+#include "xfs_da_format.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
+#include "xfs_trans.h"
#include "xfs_extfree_item.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_log.h"
+#include "xfs_dinode.h"
/* Kernel only BMAP related definitions and functions */
@@ -249,48 +249,6 @@ xfs_bmap_rtalloc(
}
/*
- * Stack switching interfaces for allocation
- */
-static void
-xfs_bmapi_allocate_worker(
- struct work_struct *work)
-{
- struct xfs_bmalloca *args = container_of(work,
- struct xfs_bmalloca, work);
- unsigned long pflags;
-
- /* we are in a transaction context here */
- current_set_flags_nested(&pflags, PF_FSTRANS);
-
- args->result = __xfs_bmapi_allocate(args);
- complete(args->done);
-
- current_restore_flags_nested(&pflags, PF_FSTRANS);
-}
-
-/*
- * Some allocation requests often come in with little stack to work on. Push
- * them off to a worker thread so there is lots of stack to use. Otherwise just
- * call directly to avoid the context switch overhead here.
- */
-int
-xfs_bmapi_allocate(
- struct xfs_bmalloca *args)
-{
- DECLARE_COMPLETION_ONSTACK(done);
-
- if (!args->stack_switch)
- return __xfs_bmapi_allocate(args);
-
-
- args->done = &done;
- INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
- queue_work(xfs_alloc_wq, &args->work);
- wait_for_completion(&done);
- return args->result;
-}
-
-/*
* Check if the endoff is outside the last extent. If so the caller will grow
* the allocation to a stripe unit boundary. All offsets are considered outside
* the end of file for an empty fork, so 1 is returned in *eof in that case.
@@ -617,22 +575,27 @@ xfs_getbmap(
return XFS_ERROR(ENOMEM);
xfs_ilock(ip, XFS_IOLOCK_SHARED);
- if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
- if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
+ if (whichfork == XFS_DATA_FORK) {
+ if (!(iflags & BMV_IF_DELALLOC) &&
+ (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
if (error)
goto out_unlock_iolock;
+
+ /*
+ * Even after flushing the inode, there can still be
+ * delalloc blocks on the inode beyond EOF due to
+ * speculative preallocation. These are not removed
+ * until the release function is called or the inode
+ * is inactivated. Hence we cannot assert here that
+ * ip->i_delayed_blks == 0.
+ */
}
- /*
- * even after flushing the inode, there can still be delalloc
- * blocks on the inode beyond EOF due to speculative
- * preallocation. These are not removed until the release
- * function is called or the inode is inactivated. Hence we
- * cannot assert here that ip->i_delayed_blks == 0.
- */
- }
- lock = xfs_ilock_map_shared(ip);
+ lock = xfs_ilock_data_map_shared(ip);
+ } else {
+ lock = xfs_ilock_attr_map_shared(ip);
+ }
/*
* Don't let nex be bigger than the number of extents
@@ -737,7 +700,7 @@ xfs_getbmap(
out_free_map:
kmem_free(map);
out_unlock_ilock:
- xfs_iunlock_map_shared(ip, lock);
+ xfs_iunlock(ip, lock);
out_unlock_iolock:
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -965,32 +928,12 @@ xfs_free_eofblocks(
return error;
}
-/*
- * xfs_alloc_file_space()
- * This routine allocates disk space for the given file.
- *
- * If alloc_type == 0, this request is for an ALLOCSP type
- * request which will change the file size. In this case, no
- * DMAPI event will be generated by the call. A TRUNCATE event
- * will be generated later by xfs_setattr.
- *
- * If alloc_type != 0, this request is for a RESVSP type
- * request, and a DMAPI DM_EVENT_WRITE will be generated if the
- * lower block boundary byte address is less than the file's
- * length.
- *
- * RETURNS:
- * 0 on success
- * errno on error
- *
- */
-STATIC int
+int
xfs_alloc_file_space(
- xfs_inode_t *ip,
+ struct xfs_inode *ip,
xfs_off_t offset,
xfs_off_t len,
- int alloc_type,
- int attr_flags)
+ int alloc_type)
{
xfs_mount_t *mp = ip->i_mount;
xfs_off_t count;
@@ -1188,9 +1131,15 @@ xfs_zero_remaining_bytes(
xfs_buf_unlock(bp);
for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
+ uint lock_mode;
+
offset_fsb = XFS_B_TO_FSBT(mp, offset);
nimap = 1;
+
+ lock_mode = xfs_ilock_data_map_shared(ip);
error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
+ xfs_iunlock(ip, lock_mode);
+
if (error || nimap < 1)
break;
ASSERT(imap.br_blockcount >= 1);
@@ -1207,7 +1156,12 @@ xfs_zero_remaining_bytes(
XFS_BUF_UNWRITE(bp);
XFS_BUF_READ(bp);
XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
- xfsbdstrat(mp, bp);
+
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ error = XFS_ERROR(EIO);
+ break;
+ }
+ xfs_buf_iorequest(bp);
error = xfs_buf_iowait(bp);
if (error) {
xfs_buf_ioerror_alert(bp,
@@ -1220,7 +1174,12 @@ xfs_zero_remaining_bytes(
XFS_BUF_UNDONE(bp);
XFS_BUF_UNREAD(bp);
XFS_BUF_WRITE(bp);
- xfsbdstrat(mp, bp);
+
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ error = XFS_ERROR(EIO);
+ break;
+ }
+ xfs_buf_iorequest(bp);
error = xfs_buf_iowait(bp);
if (error) {
xfs_buf_ioerror_alert(bp,
@@ -1232,24 +1191,11 @@ xfs_zero_remaining_bytes(
return error;
}
-/*
- * xfs_free_file_space()
- * This routine frees disk space for the given file.
- *
- * This routine is only called by xfs_change_file_space
- * for an UNRESVSP type call.
- *
- * RETURNS:
- * 0 on success
- * errno on error
- *
- */
-STATIC int
+int
xfs_free_file_space(
- xfs_inode_t *ip,
+ struct xfs_inode *ip,
xfs_off_t offset,
- xfs_off_t len,
- int attr_flags)
+ xfs_off_t len)
{
int committed;
int done;
@@ -1267,7 +1213,6 @@ xfs_free_file_space(
int rt;
xfs_fileoff_t startoffset_fsb;
xfs_trans_t *tp;
- int need_iolock = 1;
mp = ip->i_mount;
@@ -1284,20 +1229,15 @@ xfs_free_file_space(
startoffset_fsb = XFS_B_TO_FSB(mp, offset);
endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
- if (attr_flags & XFS_ATTR_NOLOCK)
- need_iolock = 0;
- if (need_iolock) {
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
- /* wait for the completion of any pending DIOs */
- inode_dio_wait(VFS_I(ip));
- }
+ /* wait for the completion of any pending DIOs */
+ inode_dio_wait(VFS_I(ip));
rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
ioffset = offset & ~(rounding - 1);
error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
ioffset, -1);
if (error)
- goto out_unlock_iolock;
+ goto out;
truncate_pagecache_range(VFS_I(ip), ioffset, -1);
/*
@@ -1311,7 +1251,7 @@ xfs_free_file_space(
error = xfs_bmapi_read(ip, startoffset_fsb, 1,
&imap, &nimap, 0);
if (error)
- goto out_unlock_iolock;
+ goto out;
ASSERT(nimap == 0 || nimap == 1);
if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
xfs_daddr_t block;
@@ -1326,7 +1266,7 @@ xfs_free_file_space(
error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
&imap, &nimap, 0);
if (error)
- goto out_unlock_iolock;
+ goto out;
ASSERT(nimap == 0 || nimap == 1);
if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
@@ -1366,7 +1306,6 @@ xfs_free_file_space(
* the freeing of the space succeeds at ENOSPC.
*/
tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
- tp->t_flags |= XFS_TRANS_RESERVE;
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
/*
@@ -1412,27 +1351,23 @@ xfs_free_file_space(
xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
- out_unlock_iolock:
- if (need_iolock)
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ out:
return error;
error0:
xfs_bmap_cancel(&free_list);
error1:
xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
- xfs_iunlock(ip, need_iolock ? (XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL) :
- XFS_ILOCK_EXCL);
- return error;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ goto out;
}
-STATIC int
+int
xfs_zero_file_space(
struct xfs_inode *ip,
xfs_off_t offset,
- xfs_off_t len,
- int attr_flags)
+ xfs_off_t len)
{
struct xfs_mount *mp = ip->i_mount;
uint granularity;
@@ -1440,6 +1375,8 @@ xfs_zero_file_space(
xfs_off_t end_boundary;
int error;
+ trace_xfs_zero_file_space(ip);
+
granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
/*
@@ -1453,26 +1390,32 @@ xfs_zero_file_space(
ASSERT(start_boundary >= offset);
ASSERT(end_boundary <= offset + len);
- if (!(attr_flags & XFS_ATTR_NOLOCK))
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
-
if (start_boundary < end_boundary - 1) {
- /* punch out the page cache over the conversion range */
+ /*
+ * punch out delayed allocation blocks and the page cache over
+ * the conversion range
+ */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_bmap_punch_delalloc_range(ip,
+ XFS_B_TO_FSBT(mp, start_boundary),
+ XFS_B_TO_FSB(mp, end_boundary - start_boundary));
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
truncate_pagecache_range(VFS_I(ip), start_boundary,
end_boundary - 1);
+
/* convert the blocks */
error = xfs_alloc_file_space(ip, start_boundary,
end_boundary - start_boundary - 1,
- XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT,
- attr_flags);
+ XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT);
if (error)
- goto out_unlock;
+ goto out;
/* We've handled the interior of the range, now for the edges */
- if (start_boundary != offset)
+ if (start_boundary != offset) {
error = xfs_iozero(ip, offset, start_boundary - offset);
- if (error)
- goto out_unlock;
+ if (error)
+ goto out;
+ }
if (end_boundary != offset + len)
error = xfs_iozero(ip, end_boundary,
@@ -1486,194 +1429,103 @@ xfs_zero_file_space(
error = xfs_iozero(ip, offset, len);
}
-out_unlock:
- if (!(attr_flags & XFS_ATTR_NOLOCK))
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+out:
return error;
}
/*
- * xfs_change_file_space()
- * This routine allocates or frees disk space for the given file.
- * The user specified parameters are checked for alignment and size
- * limitations.
- *
+ * xfs_collapse_file_space()
+ * This routine frees disk space and shift extent for the given file.
+ * The first thing we do is to free data blocks in the specified range
+ * by calling xfs_free_file_space(). It would also sync dirty data
+ * and invalidate page cache over the region on which collapse range
+ * is working. And Shift extent records to the left to cover a hole.
* RETURNS:
- * 0 on success
- * errno on error
+ * 0 on success
+ * errno on error
*
*/
int
-xfs_change_file_space(
- xfs_inode_t *ip,
- int cmd,
- xfs_flock64_t *bf,
- xfs_off_t offset,
- int attr_flags)
+xfs_collapse_file_space(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len)
{
- xfs_mount_t *mp = ip->i_mount;
- int clrprealloc;
- int error;
- xfs_fsize_t fsize;
- int setprealloc;
- xfs_off_t startoffset;
- xfs_trans_t *tp;
- struct iattr iattr;
-
- if (!S_ISREG(ip->i_d.di_mode))
- return XFS_ERROR(EINVAL);
-
- switch (bf->l_whence) {
- case 0: /*SEEK_SET*/
- break;
- case 1: /*SEEK_CUR*/
- bf->l_start += offset;
- break;
- case 2: /*SEEK_END*/
- bf->l_start += XFS_ISIZE(ip);
- break;
- default:
- return XFS_ERROR(EINVAL);
- }
-
- /*
- * length of <= 0 for resv/unresv/zero is invalid. length for
- * alloc/free is ignored completely and we have no idea what userspace
- * might have set it to, so set it to zero to allow range
- * checks to pass.
- */
- switch (cmd) {
- case XFS_IOC_ZERO_RANGE:
- case XFS_IOC_RESVSP:
- case XFS_IOC_RESVSP64:
- case XFS_IOC_UNRESVSP:
- case XFS_IOC_UNRESVSP64:
- if (bf->l_len <= 0)
- return XFS_ERROR(EINVAL);
- break;
- default:
- bf->l_len = 0;
- break;
- }
-
- if (bf->l_start < 0 ||
- bf->l_start > mp->m_super->s_maxbytes ||
- bf->l_start + bf->l_len < 0 ||
- bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
- return XFS_ERROR(EINVAL);
-
- bf->l_whence = 0;
+ int done = 0;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+ xfs_extnum_t current_ext = 0;
+ struct xfs_bmap_free free_list;
+ xfs_fsblock_t first_block;
+ int committed;
+ xfs_fileoff_t start_fsb;
+ xfs_fileoff_t shift_fsb;
- startoffset = bf->l_start;
- fsize = XFS_ISIZE(ip);
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
- setprealloc = clrprealloc = 0;
- switch (cmd) {
- case XFS_IOC_ZERO_RANGE:
- error = xfs_zero_file_space(ip, startoffset, bf->l_len,
- attr_flags);
- if (error)
- return error;
- setprealloc = 1;
- break;
+ trace_xfs_collapse_file_space(ip);
- case XFS_IOC_RESVSP:
- case XFS_IOC_RESVSP64:
- error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
- XFS_BMAPI_PREALLOC, attr_flags);
- if (error)
- return error;
- setprealloc = 1;
- break;
+ start_fsb = XFS_B_TO_FSB(mp, offset + len);
+ shift_fsb = XFS_B_TO_FSB(mp, len);
- case XFS_IOC_UNRESVSP:
- case XFS_IOC_UNRESVSP64:
- if ((error = xfs_free_file_space(ip, startoffset, bf->l_len,
- attr_flags)))
- return error;
- break;
+ error = xfs_free_file_space(ip, offset, len);
+ if (error)
+ return error;
- case XFS_IOC_ALLOCSP:
- case XFS_IOC_ALLOCSP64:
- case XFS_IOC_FREESP:
- case XFS_IOC_FREESP64:
+ while (!error && !done) {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
/*
- * These operations actually do IO when extending the file, but
- * the allocation is done seperately to the zeroing that is
- * done. This set of operations need to be serialised against
- * other IO operations, such as truncate and buffered IO. We
- * need to take the IOLOCK here to serialise the allocation and
- * zeroing IO to prevent other IOLOCK holders (e.g. getbmap,
- * truncate, direct IO) from racing against the transient
- * allocated but not written state we can have here.
+ * We would need to reserve permanent block for transaction.
+ * This will come into picture when after shifting extent into
+ * hole we found that adjacent extents can be merged which
+ * may lead to freeing of a block during record update.
*/
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
- if (startoffset > fsize) {
- error = xfs_alloc_file_space(ip, fsize,
- startoffset - fsize, 0,
- attr_flags | XFS_ATTR_NOLOCK);
- if (error) {
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- break;
- }
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
+ XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ break;
}
- iattr.ia_valid = ATTR_SIZE;
- iattr.ia_size = startoffset;
-
- error = xfs_setattr_size(ip, &iattr,
- attr_flags | XFS_ATTR_NOLOCK);
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
+ ip->i_gdquot, ip->i_pdquot,
+ XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
+ XFS_QMOPT_RES_REGBLKS);
if (error)
- return error;
-
- clrprealloc = 1;
- break;
-
- default:
- ASSERT(0);
- return XFS_ERROR(EINVAL);
- }
-
- /*
- * update the inode timestamp, mode, and prealloc flag bits
- */
- tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0);
- if (error) {
- xfs_trans_cancel(tp, 0);
- return error;
- }
+ goto out;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
- if ((attr_flags & XFS_ATTR_DMI) == 0) {
- ip->i_d.di_mode &= ~S_ISUID;
+ xfs_bmap_init(&free_list, &first_block);
/*
- * Note that we don't have to worry about mandatory
- * file locking being disabled here because we only
- * clear the S_ISGID bit if the Group execute bit is
- * on, but if it was on then mandatory locking wouldn't
- * have been enabled.
+ * We are using the write transaction in which max 2 bmbt
+ * updates are allowed
*/
- if (ip->i_d.di_mode & S_IXGRP)
- ip->i_d.di_mode &= ~S_ISGID;
+ error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb,
+ shift_fsb, &current_ext,
+ &first_block, &free_list,
+ XFS_BMAP_MAX_SHIFT_EXTENTS);
+ if (error)
+ goto out;
+
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ goto out;
- xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
- if (setprealloc)
- ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
- else if (clrprealloc)
- ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- if (attr_flags & XFS_ATTR_SYNC)
- xfs_trans_set_sync(tp);
- return xfs_trans_commit(tp, 0);
+ return error;
+
+out:
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
}
/*
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 061260946f7..2fdb72d2c90 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -50,12 +50,11 @@ struct xfs_bmalloca {
xfs_extlen_t total; /* total blocks needed for xaction */
xfs_extlen_t minlen; /* minimum allocation size (blocks) */
xfs_extlen_t minleft; /* amount must be left after alloc */
- char eof; /* set if allocating past last extent */
- char wasdel; /* replacing a delayed allocation */
- char userdata;/* set if is user data */
- char aeof; /* allocated space at eof */
- char conv; /* overwriting unwritten extents */
- char stack_switch;
+ bool eof; /* set if allocating past last extent */
+ bool wasdel; /* replacing a delayed allocation */
+ bool userdata;/* set if is user data */
+ bool aeof; /* allocated space at eof */
+ bool conv; /* overwriting unwritten extents */
int flags;
struct completion *done;
struct work_struct work;
@@ -65,8 +64,6 @@ struct xfs_bmalloca {
int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
int *committed);
int xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
-int xfs_bmapi_allocate(struct xfs_bmalloca *args);
-int __xfs_bmapi_allocate(struct xfs_bmalloca *args);
int xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
int whichfork, int *eof);
int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
@@ -93,9 +90,14 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
int *is_empty);
/* preallocation and hole punch interface */
-int xfs_change_file_space(struct xfs_inode *ip, int cmd,
- xfs_flock64_t *bf, xfs_off_t offset,
- int attr_flags);
+int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t len, int alloc_type);
+int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t len);
+int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t len);
+int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
+ xfs_off_t len);
/* EOF block manipulation functions */
bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 5690e102243..cf893bc1e37 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -17,24 +17,23 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_buf_item.h"
#include "xfs_btree.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
+#include "xfs_alloc.h"
/*
* Cursor allocation zone.
@@ -45,9 +44,10 @@ kmem_zone_t *xfs_btree_cur_zone;
* Btree magic numbers.
*/
static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
- { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC },
+ { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
+ XFS_FIBT_MAGIC },
{ XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
- XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC }
+ XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
};
#define xfs_btree_magic(cur) \
xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
@@ -236,8 +236,7 @@ xfs_btree_lblock_calc_crc(
return;
if (bip)
block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
- xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
- XFS_BTREE_LBLOCK_CRC_OFF);
+ xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
}
bool
@@ -245,8 +244,8 @@ xfs_btree_lblock_verify_crc(
struct xfs_buf *bp)
{
if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
- return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
- XFS_BTREE_LBLOCK_CRC_OFF);
+ return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+
return true;
}
@@ -269,8 +268,7 @@ xfs_btree_sblock_calc_crc(
return;
if (bip)
block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
- xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
- XFS_BTREE_SBLOCK_CRC_OFF);
+ xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
}
bool
@@ -278,8 +276,8 @@ xfs_btree_sblock_verify_crc(
struct xfs_buf *bp)
{
if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
- return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
- XFS_BTREE_SBLOCK_CRC_OFF);
+ return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+
return true;
}
@@ -556,14 +554,11 @@ xfs_btree_get_bufl(
xfs_fsblock_t fsbno, /* file system block number */
uint lock) /* lock flags for get_buf */
{
- xfs_buf_t *bp; /* buffer pointer (return value) */
xfs_daddr_t d; /* real disk block address */
ASSERT(fsbno != NULLFSBLOCK);
d = XFS_FSB_TO_DADDR(mp, fsbno);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
- ASSERT(!xfs_buf_geterror(bp));
- return bp;
+ return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
}
/*
@@ -578,15 +573,12 @@ xfs_btree_get_bufs(
xfs_agblock_t agbno, /* allocation group block number */
uint lock) /* lock flags for get_buf */
{
- xfs_buf_t *bp; /* buffer pointer (return value) */
xfs_daddr_t d; /* real disk block address */
ASSERT(agno != NULLAGNUMBER);
ASSERT(agbno != NULLAGBLOCK);
d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
- ASSERT(!xfs_buf_geterror(bp));
- return bp;
+ return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
}
/*
@@ -726,7 +718,6 @@ xfs_btree_read_bufl(
mp->m_bsize, lock, &bp, ops);
if (error)
return error;
- ASSERT(!xfs_buf_geterror(bp));
if (bp)
xfs_buf_set_ref(bp, refval);
*bpp = bp;
@@ -1119,6 +1110,7 @@ xfs_btree_set_refs(
xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
break;
case XFS_BTNUM_INO:
+ case XFS_BTNUM_FINO:
xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
break;
case XFS_BTNUM_BMAP:
@@ -1163,7 +1155,6 @@ STATIC int
xfs_btree_read_buf_block(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr,
- int level,
int flags,
struct xfs_btree_block **block,
struct xfs_buf **bpp)
@@ -1182,7 +1173,6 @@ xfs_btree_read_buf_block(
if (error)
return error;
- ASSERT(!xfs_buf_geterror(*bpp));
xfs_btree_set_refs(cur, *bpp);
*block = XFS_BUF_TO_BLOCK(*bpp);
return 0;
@@ -1521,8 +1511,8 @@ xfs_btree_increment(
union xfs_btree_ptr *ptrp;
ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
- error = xfs_btree_read_buf_block(cur, ptrp, --lev,
- 0, &block, &bp);
+ --lev;
+ error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
if (error)
goto error0;
@@ -1620,8 +1610,8 @@ xfs_btree_decrement(
union xfs_btree_ptr *ptrp;
ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
- error = xfs_btree_read_buf_block(cur, ptrp, --lev,
- 0, &block, &bp);
+ --lev;
+ error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
if (error)
goto error0;
xfs_btree_setbuf(cur, lev, bp);
@@ -1671,7 +1661,7 @@ xfs_btree_lookup_get_block(
return 0;
}
- error = xfs_btree_read_buf_block(cur, pp, level, 0, blkp, &bp);
+ error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
if (error)
return error;
@@ -2022,7 +2012,7 @@ xfs_btree_lshift(
goto out0;
/* Set up the left neighbor as "left". */
- error = xfs_btree_read_buf_block(cur, &lptr, level, 0, &left, &lbp);
+ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
if (error)
goto error0;
@@ -2206,7 +2196,7 @@ xfs_btree_rshift(
goto out0;
/* Set up the right neighbor as "right". */
- error = xfs_btree_read_buf_block(cur, &rptr, level, 0, &right, &rbp);
+ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
if (error)
goto error0;
@@ -2334,7 +2324,7 @@ error1:
* record (to be inserted into parent).
*/
STATIC int /* error */
-xfs_btree_split(
+__xfs_btree_split(
struct xfs_btree_cur *cur,
int level,
union xfs_btree_ptr *ptrp,
@@ -2376,7 +2366,7 @@ xfs_btree_split(
xfs_btree_buf_to_ptr(cur, lbp, &lptr);
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, 1, stat);
+ error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
if (error)
goto error0;
if (*stat == 0)
@@ -2474,7 +2464,7 @@ xfs_btree_split(
* point back to right instead of to left.
*/
if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
- error = xfs_btree_read_buf_block(cur, &rrptr, level,
+ error = xfs_btree_read_buf_block(cur, &rrptr,
0, &rrblock, &rrbp);
if (error)
goto error0;
@@ -2514,6 +2504,85 @@ error0:
return error;
}
+struct xfs_btree_split_args {
+ struct xfs_btree_cur *cur;
+ int level;
+ union xfs_btree_ptr *ptrp;
+ union xfs_btree_key *key;
+ struct xfs_btree_cur **curp;
+ int *stat; /* success/failure */
+ int result;
+ bool kswapd; /* allocation in kswapd context */
+ struct completion *done;
+ struct work_struct work;
+};
+
+/*
+ * Stack switching interfaces for allocation
+ */
+static void
+xfs_btree_split_worker(
+ struct work_struct *work)
+{
+ struct xfs_btree_split_args *args = container_of(work,
+ struct xfs_btree_split_args, work);
+ unsigned long pflags;
+ unsigned long new_pflags = PF_FSTRANS;
+
+ /*
+ * we are in a transaction context here, but may also be doing work
+ * in kswapd context, and hence we may need to inherit that state
+ * temporarily to ensure that we don't block waiting for memory reclaim
+ * in any way.
+ */
+ if (args->kswapd)
+ new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+
+ current_set_flags_nested(&pflags, new_pflags);
+
+ args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
+ args->key, args->curp, args->stat);
+ complete(args->done);
+
+ current_restore_flags_nested(&pflags, new_pflags);
+}
+
+/*
+ * BMBT split requests often come in with little stack to work on. Push
+ * them off to a worker thread so there is lots of stack to use. For the other
+ * btree types, just call directly to avoid the context switch overhead here.
+ */
+STATIC int /* error */
+xfs_btree_split(
+ struct xfs_btree_cur *cur,
+ int level,
+ union xfs_btree_ptr *ptrp,
+ union xfs_btree_key *key,
+ struct xfs_btree_cur **curp,
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_split_args args;
+ DECLARE_COMPLETION_ONSTACK(done);
+
+ if (cur->bc_btnum != XFS_BTNUM_BMAP)
+ return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
+
+ args.cur = cur;
+ args.level = level;
+ args.ptrp = ptrp;
+ args.key = key;
+ args.curp = curp;
+ args.stat = stat;
+ args.done = &done;
+ args.kswapd = current_is_kswapd();
+ INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
+ queue_work(xfs_alloc_wq, &args.work);
+ wait_for_completion(&done);
+ destroy_work_on_stack(&args.work);
+ return args.result;
+}
+
+
/*
* Copy the old inode root contents into a real block and make the
* broot point to it.
@@ -2549,7 +2618,7 @@ xfs_btree_new_iroot(
pp = xfs_btree_ptr_addr(cur, 1, block);
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = cur->bc_ops->alloc_block(cur, pp, &nptr, 1, stat);
+ error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
if (error)
goto error0;
if (*stat == 0) {
@@ -2653,7 +2722,7 @@ xfs_btree_new_root(
cur->bc_ops->init_ptr_from_cur(cur, &rptr);
/* Allocate the new block. If we can't do it, we're toast. Give up. */
- error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, 1, stat);
+ error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
if (error)
goto error0;
if (*stat == 0)
@@ -2688,8 +2757,7 @@ xfs_btree_new_root(
lbp = bp;
xfs_btree_buf_to_ptr(cur, lbp, &lptr);
left = block;
- error = xfs_btree_read_buf_block(cur, &rptr,
- cur->bc_nlevels - 1, 0, &right, &rbp);
+ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
if (error)
goto error0;
bp = rbp;
@@ -2700,8 +2768,7 @@ xfs_btree_new_root(
xfs_btree_buf_to_ptr(cur, rbp, &rptr);
right = block;
xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
- error = xfs_btree_read_buf_block(cur, &lptr,
- cur->bc_nlevels - 1, 0, &left, &lbp);
+ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
if (error)
goto error0;
bp = lbp;
@@ -3653,8 +3720,7 @@ xfs_btree_delrec(
rptr = cptr;
right = block;
rbp = bp;
- error = xfs_btree_read_buf_block(cur, &lptr, level,
- 0, &left, &lbp);
+ error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
if (error)
goto error0;
@@ -3671,8 +3737,7 @@ xfs_btree_delrec(
lptr = cptr;
left = block;
lbp = bp;
- error = xfs_btree_read_buf_block(cur, &rptr, level,
- 0, &right, &rbp);
+ error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
if (error)
goto error0;
@@ -3744,8 +3809,7 @@ xfs_btree_delrec(
/* If there is a right sibling, point it to the remaining block. */
xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
if (!xfs_btree_ptr_is_null(cur, &cptr)) {
- error = xfs_btree_read_buf_block(cur, &cptr, level,
- 0, &rrblock, &rrbp);
+ error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
if (error)
goto error0;
xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 06729b67ad5..a04b69422f6 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -27,73 +27,6 @@ struct xfs_trans;
extern kmem_zone_t *xfs_btree_cur_zone;
/*
- * This nonsense is to make -wlint happy.
- */
-#define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi)
-#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi)
-#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi)
-
-#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi)
-#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi)
-#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi)
-#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
-
-/*
- * Generic btree header.
- *
- * This is a combination of the actual format used on disk for short and long
- * format btrees. The first three fields are shared by both format, but the
- * pointers are different and should be used with care.
- *
- * To get the size of the actual short or long form headers please use the size
- * macros below. Never use sizeof(xfs_btree_block).
- *
- * The blkno, crc, lsn, owner and uuid fields are only available in filesystems
- * with the crc feature bit, and all accesses to them must be conditional on
- * that flag.
- */
-struct xfs_btree_block {
- __be32 bb_magic; /* magic number for block type */
- __be16 bb_level; /* 0 is a leaf */
- __be16 bb_numrecs; /* current # of data records */
- union {
- struct {
- __be32 bb_leftsib;
- __be32 bb_rightsib;
-
- __be64 bb_blkno;
- __be64 bb_lsn;
- uuid_t bb_uuid;
- __be32 bb_owner;
- __le32 bb_crc;
- } s; /* short form pointers */
- struct {
- __be64 bb_leftsib;
- __be64 bb_rightsib;
-
- __be64 bb_blkno;
- __be64 bb_lsn;
- uuid_t bb_uuid;
- __be64 bb_owner;
- __le32 bb_crc;
- __be32 bb_pad; /* padding for alignment */
- } l; /* long form pointers */
- } bb_u; /* rest */
-};
-
-#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */
-#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */
-
-/* sizes of CRC enabled btree blocks */
-#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40)
-#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48)
-
-#define XFS_BTREE_SBLOCK_CRC_OFF \
- offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
-#define XFS_BTREE_LBLOCK_CRC_OFF \
- offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
-
-/*
* Generic key, ptr and record wrapper structures.
*
* These are disk format structures, and are converted where necessary
@@ -119,6 +52,19 @@ union xfs_btree_rec {
};
/*
+ * This nonsense is to make -wlint happy.
+ */
+#define XFS_LOOKUP_EQ ((xfs_lookup_t)XFS_LOOKUP_EQi)
+#define XFS_LOOKUP_LE ((xfs_lookup_t)XFS_LOOKUP_LEi)
+#define XFS_LOOKUP_GE ((xfs_lookup_t)XFS_LOOKUP_GEi)
+
+#define XFS_BTNUM_BNO ((xfs_btnum_t)XFS_BTNUM_BNOi)
+#define XFS_BTNUM_CNT ((xfs_btnum_t)XFS_BTNUM_CNTi)
+#define XFS_BTNUM_BMAP ((xfs_btnum_t)XFS_BTNUM_BMAPi)
+#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
+#define XFS_BTNUM_FINO ((xfs_btnum_t)XFS_BTNUM_FINOi)
+
+/*
* For logging record fields.
*/
#define XFS_BB_MAGIC (1 << 0)
@@ -147,6 +93,7 @@ do { \
case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \
case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \
case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \
+ case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break; \
case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
@@ -160,6 +107,7 @@ do { \
case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
+ case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \
case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
@@ -184,7 +132,7 @@ struct xfs_btree_ops {
int (*alloc_block)(struct xfs_btree_cur *cur,
union xfs_btree_ptr *start_bno,
union xfs_btree_ptr *new_bno,
- int length, int *stat);
+ int *stat);
int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
/* update last record information */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 263470075ea..7a34a1ae655 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -34,12 +34,13 @@
#include <linux/backing-dev.h>
#include <linux/freezer.h>
-#include "xfs_sb.h"
+#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_log.h"
+#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_trace.h"
+#include "xfs_log.h"
static kmem_zone_t *xfs_buf_zone;
@@ -215,8 +216,7 @@ _xfs_buf_alloc(
STATIC int
_xfs_buf_get_pages(
xfs_buf_t *bp,
- int page_count,
- xfs_buf_flags_t flags)
+ int page_count)
{
/* Make sure that we have a page list */
if (bp->b_pages == NULL) {
@@ -329,7 +329,7 @@ use_alloc_page:
end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
>> PAGE_SHIFT;
page_count = end - start;
- error = _xfs_buf_get_pages(bp, page_count, flags);
+ error = _xfs_buf_get_pages(bp, page_count);
if (unlikely(error))
return error;
@@ -395,7 +395,17 @@ _xfs_buf_map_pages(
bp->b_addr = NULL;
} else {
int retried = 0;
+ unsigned noio_flag;
+ /*
+ * vm_map_ram() will allocate auxillary structures (e.g.
+ * pagetables) with GFP_KERNEL, yet we are likely to be under
+ * GFP_NOFS context here. Hence we need to tell memory reclaim
+ * that we are in such a context via PF_MEMALLOC_NOIO to prevent
+ * memory reclaim re-entering the filesystem here and
+ * potentially deadlocking.
+ */
+ noio_flag = memalloc_noio_save();
do {
bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-1, PAGE_KERNEL);
@@ -403,6 +413,7 @@ _xfs_buf_map_pages(
break;
vm_unmap_aliases();
} while (retried++ <= 1);
+ memalloc_noio_restore(noio_flag);
if (!bp->b_addr)
return -ENOMEM;
@@ -444,8 +455,8 @@ _xfs_buf_find(
numbytes = BBTOB(numblks);
/* Check for IOs smaller than the sector size / not sector aligned */
- ASSERT(!(numbytes < (1 << btp->bt_sshift)));
- ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask));
+ ASSERT(!(numbytes < btp->bt_meta_sectorsize));
+ ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));
/*
* Corrupted block numbers can get through to here, unfortunately, so we
@@ -590,7 +601,7 @@ found:
error = _xfs_buf_map_pages(bp, flags);
if (unlikely(error)) {
xfs_warn(target->bt_mount,
- "%s: failed to map pages\n", __func__);
+ "%s: failed to map pagesn", __func__);
xfs_buf_relse(bp);
return NULL;
}
@@ -697,7 +708,11 @@ xfs_buf_read_uncached(
bp->b_flags |= XBF_READ;
bp->b_ops = ops;
- xfsbdstrat(target->bt_mount, bp);
+ if (XFS_FORCED_SHUTDOWN(target->bt_mount)) {
+ xfs_buf_relse(bp);
+ return NULL;
+ }
+ xfs_buf_iorequest(bp);
xfs_buf_iowait(bp);
return bp;
}
@@ -762,7 +777,7 @@ xfs_buf_associate_memory(
bp->b_pages = NULL;
bp->b_addr = mem;
- rval = _xfs_buf_get_pages(bp, page_count, 0);
+ rval = _xfs_buf_get_pages(bp, page_count);
if (rval)
return rval;
@@ -795,7 +810,7 @@ xfs_buf_get_uncached(
goto fail;
page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
- error = _xfs_buf_get_pages(bp, page_count, 0);
+ error = _xfs_buf_get_pages(bp, page_count);
if (error)
goto fail_free_buf;
@@ -809,7 +824,7 @@ xfs_buf_get_uncached(
error = _xfs_buf_map_pages(bp, 0);
if (unlikely(error)) {
xfs_warn(target->bt_mount,
- "%s: failed to map pages\n", __func__);
+ "%s: failed to map pages", __func__);
goto fail_free_mem;
}
@@ -1088,7 +1103,7 @@ xfs_bioerror(
* This is meant for userdata errors; metadata bufs come with
* iodone functions attached, so that we can track down errors.
*/
-STATIC int
+int
xfs_bioerror_relse(
struct xfs_buf *bp)
{
@@ -1151,7 +1166,7 @@ xfs_bwrite(
ASSERT(xfs_buf_islocked(bp));
bp->b_flags |= XBF_WRITE;
- bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q);
+ bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL);
xfs_bdstrat_cb(bp);
@@ -1163,25 +1178,6 @@ xfs_bwrite(
return error;
}
-/*
- * Wrapper around bdstrat so that we can stop data from going to disk in case
- * we are shutting down the filesystem. Typically user data goes thru this
- * path; one of the exceptions is the superblock.
- */
-void
-xfsbdstrat(
- struct xfs_mount *mp,
- struct xfs_buf *bp)
-{
- if (XFS_FORCED_SHUTDOWN(mp)) {
- trace_xfs_bdstrat_shut(bp, _RET_IP_);
- xfs_bioerror_relse(bp);
- return;
- }
-
- xfs_buf_iorequest(bp);
-}
-
STATIC void
_xfs_buf_ioend(
xfs_buf_t *bp,
@@ -1254,7 +1250,7 @@ next_chunk:
bio = bio_alloc(GFP_NOIO, nr_pages);
bio->bi_bdev = bp->b_target->bt_bdev;
- bio->bi_sector = sector;
+ bio->bi_iter.bi_sector = sector;
bio->bi_end_io = xfs_buf_bio_end_io;
bio->bi_private = bp;
@@ -1276,7 +1272,7 @@ next_chunk:
total_nr_pages--;
}
- if (likely(bio->bi_size)) {
+ if (likely(bio->bi_iter.bi_size)) {
if (xfs_buf_is_vmapped(bp)) {
flush_kernel_vmap_range(bp->b_addr,
xfs_buf_vmap_len(bp));
@@ -1375,21 +1371,29 @@ xfs_buf_iorequest(
xfs_buf_wait_unpin(bp);
xfs_buf_hold(bp);
- /* Set the count to 1 initially, this will stop an I/O
+ /*
+ * Set the count to 1 initially, this will stop an I/O
* completion callout which happens before we have started
* all the I/O from calling xfs_buf_ioend too early.
*/
atomic_set(&bp->b_io_remaining, 1);
_xfs_buf_ioapply(bp);
- _xfs_buf_ioend(bp, 1);
+ /*
+ * If _xfs_buf_ioapply failed, we'll get back here with
+ * only the reference we took above. _xfs_buf_ioend will
+ * drop it to zero, so we'd better not queue it for later,
+ * or we'll free it before it's done.
+ */
+ _xfs_buf_ioend(bp, bp->b_error ? 0 : 1);
xfs_buf_rele(bp);
}
/*
* Waits for I/O to complete on the buffer supplied. It returns immediately if
- * no I/O is pending or there is already a pending error on the buffer. It
- * returns the I/O error code, if any, or 0 if there was no error.
+ * no I/O is pending or there is already a pending error on the buffer, in which
+ * case nothing will ever complete. It returns the I/O error code, if any, or
+ * 0 if there was no error.
*/
int
xfs_buf_iowait(
@@ -1515,6 +1519,12 @@ xfs_wait_buftarg(
struct xfs_buf *bp;
bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
list_del_init(&bp->b_lru);
+ if (bp->b_flags & XBF_WRITE_FAIL) {
+ xfs_alert(btp->bt_mount,
+"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n"
+"Please run xfs_repair to determine the extent of the problem.",
+ (long long)bp->b_bn);
+ }
xfs_buf_rele(bp);
}
if (loop++ != 0)
@@ -1601,16 +1611,14 @@ xfs_free_buftarg(
kmem_free(btp);
}
-STATIC int
-xfs_setsize_buftarg_flags(
+int
+xfs_setsize_buftarg(
xfs_buftarg_t *btp,
- unsigned int blocksize,
- unsigned int sectorsize,
- int verbose)
+ unsigned int sectorsize)
{
- btp->bt_bsize = blocksize;
- btp->bt_sshift = ffs(sectorsize) - 1;
- btp->bt_smask = sectorsize - 1;
+ /* Set up metadata sector size info */
+ btp->bt_meta_sectorsize = sectorsize;
+ btp->bt_meta_sectormask = sectorsize - 1;
if (set_blocksize(btp->bt_bdev, sectorsize)) {
char name[BDEVNAME_SIZE];
@@ -1618,43 +1626,35 @@ xfs_setsize_buftarg_flags(
bdevname(btp->bt_bdev, name);
xfs_warn(btp->bt_mount,
- "Cannot set_blocksize to %u on device %s\n",
+ "Cannot set_blocksize to %u on device %s",
sectorsize, name);
return EINVAL;
}
+ /* Set up device logical sector size mask */
+ btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
+ btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
+
return 0;
}
/*
- * When allocating the initial buffer target we have not yet
- * read in the superblock, so don't know what sized sectors
- * are being used at this early stage. Play safe.
+ * When allocating the initial buffer target we have not yet
+ * read in the superblock, so don't know what sized sectors
+ * are being used at this early stage. Play safe.
*/
STATIC int
xfs_setsize_buftarg_early(
xfs_buftarg_t *btp,
struct block_device *bdev)
{
- return xfs_setsize_buftarg_flags(btp,
- PAGE_SIZE, bdev_logical_block_size(bdev), 0);
-}
-
-int
-xfs_setsize_buftarg(
- xfs_buftarg_t *btp,
- unsigned int blocksize,
- unsigned int sectorsize)
-{
- return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
+ return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
}
xfs_buftarg_t *
xfs_alloc_buftarg(
struct xfs_mount *mp,
- struct block_device *bdev,
- int external,
- const char *fsname)
+ struct block_device *bdev)
{
xfs_buftarg_t *btp;
@@ -1798,7 +1798,7 @@ __xfs_buf_delwri_submit(
blk_start_plug(&plug);
list_for_each_entry_safe(bp, n, io_list, b_list) {
- bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
+ bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
bp->b_flags |= XBF_WRITE;
if (!wait) {
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index e6568336101..3a7a5523d3d 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -45,6 +45,7 @@ typedef enum {
#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
+#define XBF_WRITE_FAIL (1 << 24)/* async writes have failed on this buffer */
/* I/O hints for the BIO layer */
#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */
@@ -70,6 +71,7 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_ASYNC, "ASYNC" }, \
{ XBF_DONE, "DONE" }, \
{ XBF_STALE, "STALE" }, \
+ { XBF_WRITE_FAIL, "WRITE_FAIL" }, \
{ XBF_SYNCIO, "SYNCIO" }, \
{ XBF_FUA, "FUA" }, \
{ XBF_FLUSH, "FLUSH" }, \
@@ -80,19 +82,34 @@ typedef unsigned int xfs_buf_flags_t;
{ _XBF_DELWRI_Q, "DELWRI_Q" }, \
{ _XBF_COMPOUND, "COMPOUND" }
+
/*
* Internal state flags.
*/
#define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */
+/*
+ * The xfs_buftarg contains 2 notions of "sector size" -
+ *
+ * 1) The metadata sector size, which is the minimum unit and
+ * alignment of IO which will be performed by metadata operations.
+ * 2) The device logical sector size
+ *
+ * The first is specified at mkfs time, and is stored on-disk in the
+ * superblock's sb_sectsize.
+ *
+ * The latter is derived from the underlying device, and controls direct IO
+ * alignment constraints.
+ */
typedef struct xfs_buftarg {
dev_t bt_dev;
struct block_device *bt_bdev;
struct backing_dev_info *bt_bdi;
struct xfs_mount *bt_mount;
- unsigned int bt_bsize;
- unsigned int bt_sshift;
- size_t bt_smask;
+ unsigned int bt_meta_sectorsize;
+ size_t bt_meta_sectormask;
+ size_t bt_logical_sectorsize;
+ size_t bt_logical_sectormask;
/* LRU control structures */
struct shrinker bt_shrinker;
@@ -269,9 +286,6 @@ extern void xfs_buf_unlock(xfs_buf_t *);
/* Buffer Read and Write Routines */
extern int xfs_bwrite(struct xfs_buf *bp);
-
-extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
-
extern void xfs_buf_ioend(xfs_buf_t *, int);
extern void xfs_buf_ioerror(xfs_buf_t *, int);
extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
@@ -282,10 +296,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
#define xfs_buf_zero(bp, off, len) \
xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
-static inline int xfs_buf_geterror(xfs_buf_t *bp)
-{
- return bp ? bp->b_error : ENOMEM;
-}
+extern int xfs_bioerror_relse(struct xfs_buf *);
/* Buffer Utility Routines */
extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
@@ -301,7 +312,8 @@ extern void xfs_buf_terminate(void);
#define XFS_BUF_ZEROFLAGS(bp) \
((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \
- XBF_SYNCIO|XBF_FUA|XBF_FLUSH))
+ XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \
+ XBF_WRITE_FAIL))
void xfs_buf_stale(struct xfs_buf *bp);
#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
@@ -352,14 +364,28 @@ static inline void xfs_buf_relse(xfs_buf_t *bp)
xfs_buf_rele(bp);
}
+static inline int
+xfs_buf_verify_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
+{
+ return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
+ cksum_offset);
+}
+
+static inline void
+xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
+{
+ xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
+ cksum_offset);
+}
+
/*
* Handling of buftargs.
*/
extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
- struct block_device *, int, const char *);
+ struct block_device *);
extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
extern void xfs_wait_buftarg(xfs_buftarg_t *);
-extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
+extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index f1d85cfc0a5..4654338b03f 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -17,17 +17,18 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_trans_priv.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_log.h"
kmem_zone_t *xfs_buf_item_zone;
@@ -181,21 +182,47 @@ xfs_buf_item_size(
trace_xfs_buf_item_size(bip);
}
-static struct xfs_log_iovec *
+static inline void
+xfs_buf_item_copy_iovec(
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp,
+ struct xfs_buf *bp,
+ uint offset,
+ int first_bit,
+ uint nbits)
+{
+ offset += first_bit * XFS_BLF_CHUNK;
+ xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BCHUNK,
+ xfs_buf_offset(bp, offset),
+ nbits * XFS_BLF_CHUNK);
+}
+
+static inline bool
+xfs_buf_item_straddle(
+ struct xfs_buf *bp,
+ uint offset,
+ int next_bit,
+ int last_bit)
+{
+ return xfs_buf_offset(bp, offset + (next_bit << XFS_BLF_SHIFT)) !=
+ (xfs_buf_offset(bp, offset + (last_bit << XFS_BLF_SHIFT)) +
+ XFS_BLF_CHUNK);
+}
+
+static void
xfs_buf_item_format_segment(
struct xfs_buf_log_item *bip,
- struct xfs_log_iovec *vecp,
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp,
uint offset,
struct xfs_buf_log_format *blfp)
{
struct xfs_buf *bp = bip->bli_buf;
uint base_size;
- uint nvecs;
int first_bit;
int last_bit;
int next_bit;
uint nbits;
- uint buffer_offset;
/* copy the flags across from the base format item */
blfp->blf_flags = bip->__bli_format.blf_flags;
@@ -207,21 +234,17 @@ xfs_buf_item_format_segment(
*/
base_size = xfs_buf_log_format_size(blfp);
- nvecs = 0;
first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0);
if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) {
/*
* If the map is not be dirty in the transaction, mark
* the size as zero and do not advance the vector pointer.
*/
- goto out;
+ return;
}
- vecp->i_addr = blfp;
- vecp->i_len = base_size;
- vecp->i_type = XLOG_REG_TYPE_BFORMAT;
- vecp++;
- nvecs = 1;
+ blfp = xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_BFORMAT, blfp, base_size);
+ blfp->blf_size = 1;
if (bip->bli_flags & XFS_BLI_STALE) {
/*
@@ -231,14 +254,13 @@ xfs_buf_item_format_segment(
*/
trace_xfs_buf_item_format_stale(bip);
ASSERT(blfp->blf_flags & XFS_BLF_CANCEL);
- goto out;
+ return;
}
/*
* Fill in an iovec for each set of contiguous chunks.
*/
-
last_bit = first_bit;
nbits = 1;
for (;;) {
@@ -251,42 +273,22 @@ xfs_buf_item_format_segment(
next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size,
(uint)last_bit + 1);
/*
- * If we run out of bits fill in the last iovec and get
- * out of the loop.
- * Else if we start a new set of bits then fill in the
- * iovec for the series we were looking at and start
- * counting the bits in the new one.
- * Else we're still in the same set of bits so just
- * keep counting and scanning.
+ * If we run out of bits fill in the last iovec and get out of
+ * the loop. Else if we start a new set of bits then fill in
+ * the iovec for the series we were looking at and start
+ * counting the bits in the new one. Else we're still in the
+ * same set of bits so just keep counting and scanning.
*/
if (next_bit == -1) {
- buffer_offset = offset + first_bit * XFS_BLF_CHUNK;
- vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
- vecp->i_len = nbits * XFS_BLF_CHUNK;
- vecp->i_type = XLOG_REG_TYPE_BCHUNK;
- nvecs++;
+ xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
+ first_bit, nbits);
+ blfp->blf_size++;
break;
- } else if (next_bit != last_bit + 1) {
- buffer_offset = offset + first_bit * XFS_BLF_CHUNK;
- vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
- vecp->i_len = nbits * XFS_BLF_CHUNK;
- vecp->i_type = XLOG_REG_TYPE_BCHUNK;
- nvecs++;
- vecp++;
- first_bit = next_bit;
- last_bit = next_bit;
- nbits = 1;
- } else if (xfs_buf_offset(bp, offset +
- (next_bit << XFS_BLF_SHIFT)) !=
- (xfs_buf_offset(bp, offset +
- (last_bit << XFS_BLF_SHIFT)) +
- XFS_BLF_CHUNK)) {
- buffer_offset = offset + first_bit * XFS_BLF_CHUNK;
- vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
- vecp->i_len = nbits * XFS_BLF_CHUNK;
- vecp->i_type = XLOG_REG_TYPE_BCHUNK;
- nvecs++;
- vecp++;
+ } else if (next_bit != last_bit + 1 ||
+ xfs_buf_item_straddle(bp, offset, next_bit, last_bit)) {
+ xfs_buf_item_copy_iovec(lv, vecp, bp, offset,
+ first_bit, nbits);
+ blfp->blf_size++;
first_bit = next_bit;
last_bit = next_bit;
nbits = 1;
@@ -295,9 +297,6 @@ xfs_buf_item_format_segment(
nbits++;
}
}
-out:
- blfp->blf_size = nvecs;
- return vecp;
}
/*
@@ -309,10 +308,11 @@ out:
STATIC void
xfs_buf_item_format(
struct xfs_log_item *lip,
- struct xfs_log_iovec *vecp)
+ struct xfs_log_vec *lv)
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
+ struct xfs_log_iovec *vecp = NULL;
uint offset = 0;
int i;
@@ -353,8 +353,8 @@ xfs_buf_item_format(
}
for (i = 0; i < bip->bli_format_count; i++) {
- vecp = xfs_buf_item_format_segment(bip, vecp, offset,
- &bip->bli_formats[i]);
+ xfs_buf_item_format_segment(bip, lv, &vecp, offset,
+ &bip->bli_formats[i]);
offset += bp->b_maps[i].bm_len;
}
@@ -495,6 +495,14 @@ xfs_buf_item_unpin(
}
}
+/*
+ * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30
+ * seconds so as to not spam logs too much on repeated detection of the same
+ * buffer being bad..
+ */
+
+DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10);
+
STATIC uint
xfs_buf_item_push(
struct xfs_log_item *lip,
@@ -523,6 +531,14 @@ xfs_buf_item_push(
trace_xfs_buf_item_push(bip);
+ /* has a previous flush failed due to IO errors? */
+ if ((bp->b_flags & XBF_WRITE_FAIL) &&
+ ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) {
+ xfs_warn(bp->b_target->bt_mount,
+"Detected failing async write on buffer block 0x%llx. Retrying async write.\n",
+ (long long)bp->b_bn);
+ }
+
if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_unlock(bp);
@@ -780,20 +796,6 @@ xfs_buf_item_init(
bip->bli_formats[i].blf_map_size = map_size;
}
-#ifdef XFS_TRANS_DEBUG
- /*
- * Allocate the arrays for tracking what needs to be logged
- * and what our callers request to be logged. bli_orig
- * holds a copy of the original, clean buffer for comparison
- * against, and bli_logged keeps a 1 bit flag per byte in
- * the buffer to indicate which bytes the callers have asked
- * to have logged.
- */
- bip->bli_orig = kmem_alloc(BBTOB(bp->b_length), KM_SLEEP);
- memcpy(bip->bli_orig, bp->b_addr, BBTOB(bp->b_length));
- bip->bli_logged = kmem_zalloc(BBTOB(bp->b_length) / NBBY, KM_SLEEP);
-#endif
-
/*
* Put the buf item into the list of items attached to the
* buffer at the front.
@@ -808,9 +810,8 @@ xfs_buf_item_init(
* Mark bytes first through last inclusive as dirty in the buf
* item's bitmap.
*/
-void
+static void
xfs_buf_item_log_segment(
- struct xfs_buf_log_item *bip,
uint first,
uint last,
uint *map)
@@ -918,7 +919,7 @@ xfs_buf_item_log(
if (end > last)
end = last;
- xfs_buf_item_log_segment(bip, first, end,
+ xfs_buf_item_log_segment(first, end,
&bip->bli_formats[i].blf_data_map[0]);
start += bp->b_maps[i].bm_len;
@@ -941,11 +942,6 @@ STATIC void
xfs_buf_item_free(
xfs_buf_log_item_t *bip)
{
-#ifdef XFS_TRANS_DEBUG
- kmem_free(bip->bli_orig);
- kmem_free(bip->bli_logged);
-#endif /* XFS_TRANS_DEBUG */
-
xfs_buf_item_free_format(bip);
kmem_zone_free(xfs_buf_item_zone, bip);
}
@@ -1056,7 +1052,7 @@ xfs_buf_iodone_callbacks(
static ulong lasttime;
static xfs_buftarg_t *lasttarg;
- if (likely(!xfs_buf_geterror(bp)))
+ if (likely(!bp->b_error))
goto do_callbacks;
/*
@@ -1095,8 +1091,9 @@ xfs_buf_iodone_callbacks(
xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
- if (!XFS_BUF_ISSTALE(bp)) {
- bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE;
+ if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) {
+ bp->b_flags |= XBF_WRITE | XBF_ASYNC |
+ XBF_DONE | XBF_WRITE_FAIL;
xfs_buf_iorequest(bp);
} else {
xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index db6371087fe..3f3455a4151 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -71,10 +71,6 @@ void xfs_buf_attach_iodone(struct xfs_buf *,
void xfs_buf_iodone_callbacks(struct xfs_buf *);
void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
-void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
- enum xfs_blft);
-void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, struct xfs_buf *src_bp);
-
extern kmem_zone_t *xfs_buf_item_zone;
#endif /* __XFS_BUF_ITEM_H__ */
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 20bf8e8002d..a514ab61665 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -18,20 +18,20 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dir2_format.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
@@ -129,56 +129,6 @@ xfs_da_state_free(xfs_da_state_t *state)
kmem_zone_free(xfs_da_state_zone, state);
}
-void
-xfs_da3_node_hdr_from_disk(
- struct xfs_da3_icnode_hdr *to,
- struct xfs_da_intnode *from)
-{
- ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
- from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
-
- if (from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
- struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from;
-
- to->forw = be32_to_cpu(hdr3->info.hdr.forw);
- to->back = be32_to_cpu(hdr3->info.hdr.back);
- to->magic = be16_to_cpu(hdr3->info.hdr.magic);
- to->count = be16_to_cpu(hdr3->__count);
- to->level = be16_to_cpu(hdr3->__level);
- return;
- }
- to->forw = be32_to_cpu(from->hdr.info.forw);
- to->back = be32_to_cpu(from->hdr.info.back);
- to->magic = be16_to_cpu(from->hdr.info.magic);
- to->count = be16_to_cpu(from->hdr.__count);
- to->level = be16_to_cpu(from->hdr.__level);
-}
-
-void
-xfs_da3_node_hdr_to_disk(
- struct xfs_da_intnode *to,
- struct xfs_da3_icnode_hdr *from)
-{
- ASSERT(from->magic == XFS_DA_NODE_MAGIC ||
- from->magic == XFS_DA3_NODE_MAGIC);
-
- if (from->magic == XFS_DA3_NODE_MAGIC) {
- struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to;
-
- hdr3->info.hdr.forw = cpu_to_be32(from->forw);
- hdr3->info.hdr.back = cpu_to_be32(from->back);
- hdr3->info.hdr.magic = cpu_to_be16(from->magic);
- hdr3->__count = cpu_to_be16(from->count);
- hdr3->__level = cpu_to_be16(from->level);
- return;
- }
- to->hdr.info.forw = cpu_to_be32(from->forw);
- to->hdr.info.back = cpu_to_be32(from->back);
- to->hdr.info.magic = cpu_to_be16(from->magic);
- to->hdr.__count = cpu_to_be16(from->count);
- to->hdr.__level = cpu_to_be16(from->level);
-}
-
static bool
xfs_da3_node_verify(
struct xfs_buf *bp)
@@ -186,8 +136,11 @@ xfs_da3_node_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_da_intnode *hdr = bp->b_addr;
struct xfs_da3_icnode_hdr ichdr;
+ const struct xfs_dir_ops *ops;
- xfs_da3_node_hdr_from_disk(&ichdr, hdr);
+ ops = xfs_dir_get_ops(mp, NULL);
+
+ ops->node_hdr_from_disk(&ichdr, hdr);
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
@@ -214,8 +167,8 @@ xfs_da3_node_verify(
* we don't know if the node is for and attribute or directory tree,
* so only fail if the count is outside both bounds
*/
- if (ichdr.count > mp->m_dir_node_ents &&
- ichdr.count > mp->m_attr_node_ents)
+ if (ichdr.count > mp->m_dir_geo->node_ents &&
+ ichdr.count > mp->m_attr_geo->node_ents)
return false;
/* XXX: hash order check? */
@@ -232,8 +185,8 @@ xfs_da3_node_write_verify(
struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
if (!xfs_da3_node_verify(bp)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
return;
}
@@ -243,7 +196,7 @@ xfs_da3_node_write_verify(
if (bip)
hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
- xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DA3_NODE_CRC_OFF);
+ xfs_buf_update_cksum(bp, XFS_DA3_NODE_CRC_OFF);
}
/*
@@ -256,18 +209,20 @@ static void
xfs_da3_node_read_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_da_blkinfo *info = bp->b_addr;
switch (be16_to_cpu(info->magic)) {
case XFS_DA3_NODE_MAGIC:
- if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
- XFS_DA3_NODE_CRC_OFF))
+ if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
+ xfs_buf_ioerror(bp, EFSBADCRC);
break;
+ }
/* fall through */
case XFS_DA_NODE_MAGIC:
- if (!xfs_da3_node_verify(bp))
+ if (!xfs_da3_node_verify(bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
break;
+ }
return;
case XFS_ATTR_LEAF_MAGIC:
case XFS_ATTR3_LEAF_MAGIC:
@@ -284,8 +239,7 @@ xfs_da3_node_read_verify(
}
/* corrupt block */
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
}
const struct xfs_buf_ops xfs_da3_node_buf_ops = {
@@ -354,11 +308,12 @@ xfs_da3_node_create(
struct xfs_da3_icnode_hdr ichdr = {0};
struct xfs_buf *bp;
int error;
+ struct xfs_inode *dp = args->dp;
trace_xfs_da_node_create(args);
ASSERT(level <= XFS_DA_NODE_MAXDEPTH);
- error = xfs_da_get_buf(tp, args->dp, blkno, -1, &bp, whichfork);
+ error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork);
if (error)
return(error);
bp->b_ops = &xfs_da3_node_buf_ops;
@@ -377,9 +332,9 @@ xfs_da3_node_create(
}
ichdr.level = level;
- xfs_da3_node_hdr_to_disk(node, &ichdr);
+ dp->d_ops->node_hdr_to_disk(node, &ichdr);
xfs_trans_log_buf(tp, bp,
- XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node)));
+ XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
*bpp = bp;
return(0);
@@ -589,8 +544,8 @@ xfs_da3_root_split(
oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
struct xfs_da3_icnode_hdr nodehdr;
- xfs_da3_node_hdr_from_disk(&nodehdr, oldroot);
- btree = xfs_da3_node_tree_p(oldroot);
+ dp->d_ops->node_hdr_from_disk(&nodehdr, oldroot);
+ btree = dp->d_ops->node_tree_p(oldroot);
size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot);
level = nodehdr.level;
@@ -604,8 +559,8 @@ xfs_da3_root_split(
struct xfs_dir2_leaf_entry *ents;
leaf = (xfs_dir2_leaf_t *)oldroot;
- xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
- ents = xfs_dir3_leaf_ents_p(leaf);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
@@ -643,28 +598,28 @@ xfs_da3_root_split(
* Set up the new root node.
*/
error = xfs_da3_node_create(args,
- (args->whichfork == XFS_DATA_FORK) ? mp->m_dirleafblk : 0,
+ (args->whichfork == XFS_DATA_FORK) ? args->geo->leafblk : 0,
level + 1, &bp, args->whichfork);
if (error)
return error;
node = bp->b_addr;
- xfs_da3_node_hdr_from_disk(&nodehdr, node);
- btree = xfs_da3_node_tree_p(node);
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
btree[0].hashval = cpu_to_be32(blk1->hashval);
btree[0].before = cpu_to_be32(blk1->blkno);
btree[1].hashval = cpu_to_be32(blk2->hashval);
btree[1].before = cpu_to_be32(blk2->blkno);
nodehdr.count = 2;
- xfs_da3_node_hdr_to_disk(node, &nodehdr);
+ dp->d_ops->node_hdr_to_disk(node, &nodehdr);
#ifdef DEBUG
if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
- ASSERT(blk1->blkno >= mp->m_dirleafblk &&
- blk1->blkno < mp->m_dirfreeblk);
- ASSERT(blk2->blkno >= mp->m_dirleafblk &&
- blk2->blkno < mp->m_dirfreeblk);
+ ASSERT(blk1->blkno >= args->geo->leafblk &&
+ blk1->blkno < args->geo->freeblk);
+ ASSERT(blk2->blkno >= args->geo->leafblk &&
+ blk2->blkno < args->geo->freeblk);
}
#endif
@@ -693,11 +648,12 @@ xfs_da3_node_split(
int newcount;
int error;
int useextra;
+ struct xfs_inode *dp = state->args->dp;
trace_xfs_da_node_split(state->args);
node = oldblk->bp->b_addr;
- xfs_da3_node_hdr_from_disk(&nodehdr, node);
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
/*
* With V2 dirs the extra block is data or freespace.
@@ -707,7 +663,7 @@ xfs_da3_node_split(
/*
* Do we have to split the node?
*/
- if (nodehdr.count + newcount > state->node_ents) {
+ if (nodehdr.count + newcount > state->args->geo->node_ents) {
/*
* Allocate a new node, add to the doubly linked chain of
* nodes, then move some of our excess entries into it.
@@ -744,7 +700,7 @@ xfs_da3_node_split(
* If we had double-split op below us, then add the extra block too.
*/
node = oldblk->bp->b_addr;
- xfs_da3_node_hdr_from_disk(&nodehdr, node);
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
if (oldblk->index <= nodehdr.count) {
oldblk->index++;
xfs_da3_node_add(state, oldblk, addblk);
@@ -793,15 +749,16 @@ xfs_da3_node_rebalance(
int count;
int tmp;
int swap = 0;
+ struct xfs_inode *dp = state->args->dp;
trace_xfs_da_node_rebalance(state->args);
node1 = blk1->bp->b_addr;
node2 = blk2->bp->b_addr;
- xfs_da3_node_hdr_from_disk(&nodehdr1, node1);
- xfs_da3_node_hdr_from_disk(&nodehdr2, node2);
- btree1 = xfs_da3_node_tree_p(node1);
- btree2 = xfs_da3_node_tree_p(node2);
+ dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+ dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
+ btree1 = dp->d_ops->node_tree_p(node1);
+ btree2 = dp->d_ops->node_tree_p(node2);
/*
* Figure out how many entries need to move, and in which direction.
@@ -814,10 +771,10 @@ xfs_da3_node_rebalance(
tmpnode = node1;
node1 = node2;
node2 = tmpnode;
- xfs_da3_node_hdr_from_disk(&nodehdr1, node1);
- xfs_da3_node_hdr_from_disk(&nodehdr2, node2);
- btree1 = xfs_da3_node_tree_p(node1);
- btree2 = xfs_da3_node_tree_p(node2);
+ dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+ dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
+ btree1 = dp->d_ops->node_tree_p(node1);
+ btree2 = dp->d_ops->node_tree_p(node2);
swap = 1;
}
@@ -879,15 +836,14 @@ xfs_da3_node_rebalance(
/*
* Log header of node 1 and all current bits of node 2.
*/
- xfs_da3_node_hdr_to_disk(node1, &nodehdr1);
+ dp->d_ops->node_hdr_to_disk(node1, &nodehdr1);
xfs_trans_log_buf(tp, blk1->bp,
- XFS_DA_LOGRANGE(node1, &node1->hdr,
- xfs_da3_node_hdr_size(node1)));
+ XFS_DA_LOGRANGE(node1, &node1->hdr, dp->d_ops->node_hdr_size));
- xfs_da3_node_hdr_to_disk(node2, &nodehdr2);
+ dp->d_ops->node_hdr_to_disk(node2, &nodehdr2);
xfs_trans_log_buf(tp, blk2->bp,
XFS_DA_LOGRANGE(node2, &node2->hdr,
- xfs_da3_node_hdr_size(node2) +
+ dp->d_ops->node_hdr_size +
(sizeof(btree2[0]) * nodehdr2.count)));
/*
@@ -897,10 +853,10 @@ xfs_da3_node_rebalance(
if (swap) {
node1 = blk1->bp->b_addr;
node2 = blk2->bp->b_addr;
- xfs_da3_node_hdr_from_disk(&nodehdr1, node1);
- xfs_da3_node_hdr_from_disk(&nodehdr2, node2);
- btree1 = xfs_da3_node_tree_p(node1);
- btree2 = xfs_da3_node_tree_p(node2);
+ dp->d_ops->node_hdr_from_disk(&nodehdr1, node1);
+ dp->d_ops->node_hdr_from_disk(&nodehdr2, node2);
+ btree1 = dp->d_ops->node_tree_p(node1);
+ btree2 = dp->d_ops->node_tree_p(node2);
}
blk1->hashval = be32_to_cpu(btree1[nodehdr1.count - 1].hashval);
blk2->hashval = be32_to_cpu(btree2[nodehdr2.count - 1].hashval);
@@ -927,18 +883,19 @@ xfs_da3_node_add(
struct xfs_da3_icnode_hdr nodehdr;
struct xfs_da_node_entry *btree;
int tmp;
+ struct xfs_inode *dp = state->args->dp;
trace_xfs_da_node_add(state->args);
node = oldblk->bp->b_addr;
- xfs_da3_node_hdr_from_disk(&nodehdr, node);
- btree = xfs_da3_node_tree_p(node);
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
ASSERT(oldblk->index >= 0 && oldblk->index <= nodehdr.count);
ASSERT(newblk->blkno != 0);
if (state->args->whichfork == XFS_DATA_FORK)
- ASSERT(newblk->blkno >= state->mp->m_dirleafblk &&
- newblk->blkno < state->mp->m_dirfreeblk);
+ ASSERT(newblk->blkno >= state->args->geo->leafblk &&
+ newblk->blkno < state->args->geo->freeblk);
/*
* We may need to make some room before we insert the new node.
@@ -955,9 +912,9 @@ xfs_da3_node_add(
tmp + sizeof(*btree)));
nodehdr.count += 1;
- xfs_da3_node_hdr_to_disk(node, &nodehdr);
+ dp->d_ops->node_hdr_to_disk(node, &nodehdr);
xfs_trans_log_buf(state->args->trans, oldblk->bp,
- XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node)));
+ XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
/*
* Copy the last hash value from the oldblk to propagate upwards.
@@ -1094,6 +1051,7 @@ xfs_da3_root_join(
struct xfs_da3_icnode_hdr oldroothdr;
struct xfs_da_node_entry *btree;
int error;
+ struct xfs_inode *dp = state->args->dp;
trace_xfs_da_root_join(state->args);
@@ -1101,7 +1059,7 @@ xfs_da3_root_join(
args = state->args;
oldroot = root_blk->bp->b_addr;
- xfs_da3_node_hdr_from_disk(&oldroothdr, oldroot);
+ dp->d_ops->node_hdr_from_disk(&oldroothdr, oldroot);
ASSERT(oldroothdr.forw == 0);
ASSERT(oldroothdr.back == 0);
@@ -1115,10 +1073,10 @@ xfs_da3_root_join(
* Read in the (only) child block, then copy those bytes into
* the root block's buffer and free the original child block.
*/
- btree = xfs_da3_node_tree_p(oldroot);
+ btree = dp->d_ops->node_tree_p(oldroot);
child = be32_to_cpu(btree[0].before);
ASSERT(child != 0);
- error = xfs_da3_node_read(args->trans, args->dp, child, -1, &bp,
+ error = xfs_da3_node_read(args->trans, dp, child, -1, &bp,
args->whichfork);
if (error)
return error;
@@ -1131,14 +1089,15 @@ xfs_da3_root_join(
* that could occur. For dir3 blocks we also need to update the block
* number in the buffer header.
*/
- memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize);
+ memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize);
root_blk->bp->b_ops = bp->b_ops;
xfs_trans_buf_copy_type(root_blk->bp, bp);
if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
da3->blkno = cpu_to_be64(root_blk->bp->b_bn);
}
- xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
+ xfs_trans_log_buf(args->trans, root_blk->bp, 0,
+ args->geo->blksize - 1);
error = xfs_da_shrink_inode(args, child, bp);
return(error);
}
@@ -1168,6 +1127,7 @@ xfs_da3_node_toosmall(
int error;
int retval;
int i;
+ struct xfs_inode *dp = state->args->dp;
trace_xfs_da_node_toosmall(state->args);
@@ -1179,8 +1139,8 @@ xfs_da3_node_toosmall(
blk = &state->path.blk[ state->path.active-1 ];
info = blk->bp->b_addr;
node = (xfs_da_intnode_t *)info;
- xfs_da3_node_hdr_from_disk(&nodehdr, node);
- if (nodehdr.count > (state->node_ents >> 1)) {
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ if (nodehdr.count > (state->args->geo->node_ents >> 1)) {
*action = 0; /* blk over 50%, don't try to join */
return(0); /* blk over 50%, don't try to join */
}
@@ -1217,8 +1177,8 @@ xfs_da3_node_toosmall(
* We prefer coalescing with the lower numbered sibling so as
* to shrink a directory over time.
*/
- count = state->node_ents;
- count -= state->node_ents >> 2;
+ count = state->args->geo->node_ents;
+ count -= state->args->geo->node_ents >> 2;
count -= nodehdr.count;
/* start with smaller blk num */
@@ -1231,13 +1191,13 @@ xfs_da3_node_toosmall(
blkno = nodehdr.back;
if (blkno == 0)
continue;
- error = xfs_da3_node_read(state->args->trans, state->args->dp,
+ error = xfs_da3_node_read(state->args->trans, dp,
blkno, -1, &bp, state->args->whichfork);
if (error)
return(error);
node = bp->b_addr;
- xfs_da3_node_hdr_from_disk(&thdr, node);
+ dp->d_ops->node_hdr_from_disk(&thdr, node);
xfs_trans_brelse(state->args->trans, bp);
if (count - thdr.count >= 0)
@@ -1275,6 +1235,7 @@ xfs_da3_node_toosmall(
*/
STATIC uint
xfs_da3_node_lasthash(
+ struct xfs_inode *dp,
struct xfs_buf *bp,
int *count)
{
@@ -1283,12 +1244,12 @@ xfs_da3_node_lasthash(
struct xfs_da3_icnode_hdr nodehdr;
node = bp->b_addr;
- xfs_da3_node_hdr_from_disk(&nodehdr, node);
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
if (count)
*count = nodehdr.count;
if (!nodehdr.count)
return 0;
- btree = xfs_da3_node_tree_p(node);
+ btree = dp->d_ops->node_tree_p(node);
return be32_to_cpu(btree[nodehdr.count - 1].hashval);
}
@@ -1307,6 +1268,7 @@ xfs_da3_fixhashpath(
xfs_dahash_t lasthash=0;
int level;
int count;
+ struct xfs_inode *dp = state->args->dp;
trace_xfs_da_fixhashpath(state->args);
@@ -1319,12 +1281,12 @@ xfs_da3_fixhashpath(
return;
break;
case XFS_DIR2_LEAFN_MAGIC:
- lasthash = xfs_dir2_leafn_lasthash(blk->bp, &count);
+ lasthash = xfs_dir2_leafn_lasthash(dp, blk->bp, &count);
if (count == 0)
return;
break;
case XFS_DA_NODE_MAGIC:
- lasthash = xfs_da3_node_lasthash(blk->bp, &count);
+ lasthash = xfs_da3_node_lasthash(dp, blk->bp, &count);
if (count == 0)
return;
break;
@@ -1333,9 +1295,9 @@ xfs_da3_fixhashpath(
struct xfs_da3_icnode_hdr nodehdr;
node = blk->bp->b_addr;
- xfs_da3_node_hdr_from_disk(&nodehdr, node);
- btree = xfs_da3_node_tree_p(node);
- if (be32_to_cpu(btree->hashval) == lasthash)
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
+ if (be32_to_cpu(btree[blk->index].hashval) == lasthash)
break;
blk->hashval = lasthash;
btree[blk->index].hashval = cpu_to_be32(lasthash);
@@ -1360,11 +1322,12 @@ xfs_da3_node_remove(
struct xfs_da_node_entry *btree;
int index;
int tmp;
+ struct xfs_inode *dp = state->args->dp;
trace_xfs_da_node_remove(state->args);
node = drop_blk->bp->b_addr;
- xfs_da3_node_hdr_from_disk(&nodehdr, node);
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
ASSERT(drop_blk->index < nodehdr.count);
ASSERT(drop_blk->index >= 0);
@@ -1372,7 +1335,7 @@ xfs_da3_node_remove(
* Copy over the offending entry, or just zero it out.
*/
index = drop_blk->index;
- btree = xfs_da3_node_tree_p(node);
+ btree = dp->d_ops->node_tree_p(node);
if (index < nodehdr.count - 1) {
tmp = nodehdr.count - index - 1;
tmp *= (uint)sizeof(xfs_da_node_entry_t);
@@ -1385,9 +1348,9 @@ xfs_da3_node_remove(
xfs_trans_log_buf(state->args->trans, drop_blk->bp,
XFS_DA_LOGRANGE(node, &btree[index], sizeof(btree[index])));
nodehdr.count -= 1;
- xfs_da3_node_hdr_to_disk(node, &nodehdr);
+ dp->d_ops->node_hdr_to_disk(node, &nodehdr);
xfs_trans_log_buf(state->args->trans, drop_blk->bp,
- XFS_DA_LOGRANGE(node, &node->hdr, xfs_da3_node_hdr_size(node)));
+ XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
/*
* Copy the last hash value from the block to propagate upwards.
@@ -1414,15 +1377,16 @@ xfs_da3_node_unbalance(
struct xfs_trans *tp;
int sindex;
int tmp;
+ struct xfs_inode *dp = state->args->dp;
trace_xfs_da_node_unbalance(state->args);
drop_node = drop_blk->bp->b_addr;
save_node = save_blk->bp->b_addr;
- xfs_da3_node_hdr_from_disk(&drop_hdr, drop_node);
- xfs_da3_node_hdr_from_disk(&save_hdr, save_node);
- drop_btree = xfs_da3_node_tree_p(drop_node);
- save_btree = xfs_da3_node_tree_p(save_node);
+ dp->d_ops->node_hdr_from_disk(&drop_hdr, drop_node);
+ dp->d_ops->node_hdr_from_disk(&save_hdr, save_node);
+ drop_btree = dp->d_ops->node_tree_p(drop_node);
+ save_btree = dp->d_ops->node_tree_p(save_node);
tp = state->args->trans;
/*
@@ -1456,10 +1420,10 @@ xfs_da3_node_unbalance(
memcpy(&save_btree[sindex], &drop_btree[0], tmp);
save_hdr.count += drop_hdr.count;
- xfs_da3_node_hdr_to_disk(save_node, &save_hdr);
+ dp->d_ops->node_hdr_to_disk(save_node, &save_hdr);
xfs_trans_log_buf(tp, save_blk->bp,
XFS_DA_LOGRANGE(save_node, &save_node->hdr,
- xfs_da3_node_hdr_size(save_node)));
+ dp->d_ops->node_hdr_size));
/*
* Save the last hashval in the remaining block for upward propagation.
@@ -1501,6 +1465,7 @@ xfs_da3_node_lookup_int(
int max;
int error;
int retval;
+ struct xfs_inode *dp = state->args->dp;
args = state->args;
@@ -1508,7 +1473,7 @@ xfs_da3_node_lookup_int(
* Descend thru the B-tree searching each level for the right
* node to use, until the right hashval is found.
*/
- blkno = (args->whichfork == XFS_DATA_FORK)? state->mp->m_dirleafblk : 0;
+ blkno = (args->whichfork == XFS_DATA_FORK)? args->geo->leafblk : 0;
for (blk = &state->path.blk[0], state->path.active = 1;
state->path.active <= XFS_DA_NODE_MAXDEPTH;
blk++, state->path.active++) {
@@ -1536,7 +1501,8 @@ xfs_da3_node_lookup_int(
if (blk->magic == XFS_DIR2_LEAFN_MAGIC ||
blk->magic == XFS_DIR3_LEAFN_MAGIC) {
blk->magic = XFS_DIR2_LEAFN_MAGIC;
- blk->hashval = xfs_dir2_leafn_lasthash(blk->bp, NULL);
+ blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
+ blk->bp, NULL);
break;
}
@@ -1547,8 +1513,8 @@ xfs_da3_node_lookup_int(
* Search an intermediate node for a match.
*/
node = blk->bp->b_addr;
- xfs_da3_node_hdr_from_disk(&nodehdr, node);
- btree = xfs_da3_node_tree_p(node);
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
max = nodehdr.count;
blk->hashval = be32_to_cpu(btree[max - 1].hashval);
@@ -1643,6 +1609,7 @@ xfs_da3_node_lookup_int(
*/
STATIC int
xfs_da3_node_order(
+ struct xfs_inode *dp,
struct xfs_buf *node1_bp,
struct xfs_buf *node2_bp)
{
@@ -1655,10 +1622,10 @@ xfs_da3_node_order(
node1 = node1_bp->b_addr;
node2 = node2_bp->b_addr;
- xfs_da3_node_hdr_from_disk(&node1hdr, node1);
- xfs_da3_node_hdr_from_disk(&node2hdr, node2);
- btree1 = xfs_da3_node_tree_p(node1);
- btree2 = xfs_da3_node_tree_p(node2);
+ dp->d_ops->node_hdr_from_disk(&node1hdr, node1);
+ dp->d_ops->node_hdr_from_disk(&node2hdr, node2);
+ btree1 = dp->d_ops->node_tree_p(node1);
+ btree2 = dp->d_ops->node_tree_p(node2);
if (node1hdr.count > 0 && node2hdr.count > 0 &&
((be32_to_cpu(btree2[0].hashval) < be32_to_cpu(btree1[0].hashval)) ||
@@ -1685,6 +1652,7 @@ xfs_da3_blk_link(
struct xfs_buf *bp;
int before = 0;
int error;
+ struct xfs_inode *dp = state->args->dp;
/*
* Set up environment.
@@ -1702,10 +1670,10 @@ xfs_da3_blk_link(
before = xfs_attr_leaf_order(old_blk->bp, new_blk->bp);
break;
case XFS_DIR2_LEAFN_MAGIC:
- before = xfs_dir2_leafn_order(old_blk->bp, new_blk->bp);
+ before = xfs_dir2_leafn_order(dp, old_blk->bp, new_blk->bp);
break;
case XFS_DA_NODE_MAGIC:
- before = xfs_da3_node_order(old_blk->bp, new_blk->bp);
+ before = xfs_da3_node_order(dp, old_blk->bp, new_blk->bp);
break;
}
@@ -1720,7 +1688,7 @@ xfs_da3_blk_link(
new_info->forw = cpu_to_be32(old_blk->blkno);
new_info->back = old_info->back;
if (old_info->back) {
- error = xfs_da3_node_read(args->trans, args->dp,
+ error = xfs_da3_node_read(args->trans, dp,
be32_to_cpu(old_info->back),
-1, &bp, args->whichfork);
if (error)
@@ -1741,7 +1709,7 @@ xfs_da3_blk_link(
new_info->forw = old_info->forw;
new_info->back = cpu_to_be32(old_blk->blkno);
if (old_info->forw) {
- error = xfs_da3_node_read(args->trans, args->dp,
+ error = xfs_da3_node_read(args->trans, dp,
be32_to_cpu(old_info->forw),
-1, &bp, args->whichfork);
if (error)
@@ -1861,6 +1829,7 @@ xfs_da3_path_shift(
xfs_dablk_t blkno = 0;
int level;
int error;
+ struct xfs_inode *dp = state->args->dp;
trace_xfs_da_path_shift(state->args);
@@ -1876,8 +1845,8 @@ xfs_da3_path_shift(
level = (path->active-1) - 1; /* skip bottom layer in path */
for (blk = &path->blk[level]; level >= 0; blk--, level--) {
node = blk->bp->b_addr;
- xfs_da3_node_hdr_from_disk(&nodehdr, node);
- btree = xfs_da3_node_tree_p(node);
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
if (forward && (blk->index < nodehdr.count - 1)) {
blk->index++;
@@ -1911,7 +1880,7 @@ xfs_da3_path_shift(
* Read the next child block.
*/
blk->blkno = blkno;
- error = xfs_da3_node_read(args->trans, args->dp, blkno, -1,
+ error = xfs_da3_node_read(args->trans, dp, blkno, -1,
&blk->bp, args->whichfork);
if (error)
return(error);
@@ -1933,8 +1902,8 @@ xfs_da3_path_shift(
case XFS_DA3_NODE_MAGIC:
blk->magic = XFS_DA_NODE_MAGIC;
node = (xfs_da_intnode_t *)info;
- xfs_da3_node_hdr_from_disk(&nodehdr, node);
- btree = xfs_da3_node_tree_p(node);
+ dp->d_ops->node_hdr_from_disk(&nodehdr, node);
+ btree = dp->d_ops->node_tree_p(node);
blk->hashval = be32_to_cpu(btree[nodehdr.count - 1].hashval);
if (forward)
blk->index = 0;
@@ -1947,16 +1916,15 @@ xfs_da3_path_shift(
blk->magic = XFS_ATTR_LEAF_MAGIC;
ASSERT(level == path->active-1);
blk->index = 0;
- blk->hashval = xfs_attr_leaf_lasthash(blk->bp,
- NULL);
+ blk->hashval = xfs_attr_leaf_lasthash(blk->bp, NULL);
break;
case XFS_DIR2_LEAFN_MAGIC:
case XFS_DIR3_LEAFN_MAGIC:
blk->magic = XFS_DIR2_LEAFN_MAGIC;
ASSERT(level == path->active-1);
blk->index = 0;
- blk->hashval = xfs_dir2_leafn_lasthash(blk->bp,
- NULL);
+ blk->hashval = xfs_dir2_leafn_lasthash(args->dp,
+ blk->bp, NULL);
break;
default:
ASSERT(0);
@@ -2123,20 +2091,12 @@ xfs_da_grow_inode(
xfs_dablk_t *new_blkno)
{
xfs_fileoff_t bno;
- int count;
int error;
trace_xfs_da_grow_inode(args);
- if (args->whichfork == XFS_DATA_FORK) {
- bno = args->dp->i_mount->m_dirleafblk;
- count = args->dp->i_mount->m_dirblkfsbs;
- } else {
- bno = 0;
- count = 1;
- }
-
- error = xfs_da_grow_inode_int(args, &bno, count);
+ bno = args->geo->leafblk;
+ error = xfs_da_grow_inode_int(args, &bno, args->geo->fsbcount);
if (!error)
*new_blkno = (xfs_dablk_t)bno;
return error;
@@ -2163,7 +2123,7 @@ xfs_da3_swap_lastblock(
struct xfs_dir2_leaf *dead_leaf2;
struct xfs_da_node_entry *btree;
struct xfs_da3_icnode_hdr par_hdr;
- struct xfs_inode *ip;
+ struct xfs_inode *dp;
struct xfs_trans *tp;
struct xfs_mount *mp;
struct xfs_buf *dead_buf;
@@ -2187,12 +2147,12 @@ xfs_da3_swap_lastblock(
dead_buf = *dead_bufp;
dead_blkno = *dead_blknop;
tp = args->trans;
- ip = args->dp;
+ dp = args->dp;
w = args->whichfork;
ASSERT(w == XFS_DATA_FORK);
- mp = ip->i_mount;
- lastoff = mp->m_dirfreeblk;
- error = xfs_bmap_last_before(tp, ip, &lastoff, w);
+ mp = dp->i_mount;
+ lastoff = args->geo->freeblk;
+ error = xfs_bmap_last_before(tp, dp, &lastoff, w);
if (error)
return error;
if (unlikely(lastoff == 0)) {
@@ -2203,15 +2163,15 @@ xfs_da3_swap_lastblock(
/*
* Read the last block in the btree space.
*/
- last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
- error = xfs_da3_node_read(tp, ip, last_blkno, -1, &last_buf, w);
+ last_blkno = (xfs_dablk_t)lastoff - args->geo->fsbcount;
+ error = xfs_da3_node_read(tp, dp, last_blkno, -1, &last_buf, w);
if (error)
return error;
/*
* Copy the last block into the dead buffer and log it.
*/
- memcpy(dead_buf->b_addr, last_buf->b_addr, mp->m_dirblksize);
- xfs_trans_log_buf(tp, dead_buf, 0, mp->m_dirblksize - 1);
+ memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize);
+ xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1);
dead_info = dead_buf->b_addr;
/*
* Get values from the moved block.
@@ -2222,16 +2182,16 @@ xfs_da3_swap_lastblock(
struct xfs_dir2_leaf_entry *ents;
dead_leaf2 = (xfs_dir2_leaf_t *)dead_info;
- xfs_dir3_leaf_hdr_from_disk(&leafhdr, dead_leaf2);
- ents = xfs_dir3_leaf_ents_p(dead_leaf2);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, dead_leaf2);
+ ents = dp->d_ops->leaf_ents_p(dead_leaf2);
dead_level = 0;
dead_hash = be32_to_cpu(ents[leafhdr.count - 1].hashval);
} else {
struct xfs_da3_icnode_hdr deadhdr;
dead_node = (xfs_da_intnode_t *)dead_info;
- xfs_da3_node_hdr_from_disk(&deadhdr, dead_node);
- btree = xfs_da3_node_tree_p(dead_node);
+ dp->d_ops->node_hdr_from_disk(&deadhdr, dead_node);
+ btree = dp->d_ops->node_tree_p(dead_node);
dead_level = deadhdr.level;
dead_hash = be32_to_cpu(btree[deadhdr.count - 1].hashval);
}
@@ -2240,7 +2200,7 @@ xfs_da3_swap_lastblock(
* If the moved block has a left sibling, fix up the pointers.
*/
if ((sib_blkno = be32_to_cpu(dead_info->back))) {
- error = xfs_da3_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
+ error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
if (error)
goto done;
sib_info = sib_buf->b_addr;
@@ -2262,7 +2222,7 @@ xfs_da3_swap_lastblock(
* If the moved block has a right sibling, fix up the pointers.
*/
if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
- error = xfs_da3_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
+ error = xfs_da3_node_read(tp, dp, sib_blkno, -1, &sib_buf, w);
if (error)
goto done;
sib_info = sib_buf->b_addr;
@@ -2280,17 +2240,17 @@ xfs_da3_swap_lastblock(
sizeof(sib_info->back)));
sib_buf = NULL;
}
- par_blkno = mp->m_dirleafblk;
+ par_blkno = args->geo->leafblk;
level = -1;
/*
* Walk down the tree looking for the parent of the moved block.
*/
for (;;) {
- error = xfs_da3_node_read(tp, ip, par_blkno, -1, &par_buf, w);
+ error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
if (error)
goto done;
par_node = par_buf->b_addr;
- xfs_da3_node_hdr_from_disk(&par_hdr, par_node);
+ dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
if (level >= 0 && level != par_hdr.level + 1) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
XFS_ERRLEVEL_LOW, mp);
@@ -2298,7 +2258,7 @@ xfs_da3_swap_lastblock(
goto done;
}
level = par_hdr.level;
- btree = xfs_da3_node_tree_p(par_node);
+ btree = dp->d_ops->node_tree_p(par_node);
for (entno = 0;
entno < par_hdr.count &&
be32_to_cpu(btree[entno].hashval) < dead_hash;
@@ -2337,18 +2297,18 @@ xfs_da3_swap_lastblock(
error = XFS_ERROR(EFSCORRUPTED);
goto done;
}
- error = xfs_da3_node_read(tp, ip, par_blkno, -1, &par_buf, w);
+ error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
if (error)
goto done;
par_node = par_buf->b_addr;
- xfs_da3_node_hdr_from_disk(&par_hdr, par_node);
+ dp->d_ops->node_hdr_from_disk(&par_hdr, par_node);
if (par_hdr.level != level) {
XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
XFS_ERRLEVEL_LOW, mp);
error = XFS_ERROR(EFSCORRUPTED);
goto done;
}
- btree = xfs_da3_node_tree_p(par_node);
+ btree = dp->d_ops->node_tree_p(par_node);
entno = 0;
}
/*
@@ -2390,10 +2350,7 @@ xfs_da_shrink_inode(
w = args->whichfork;
tp = args->trans;
mp = dp->i_mount;
- if (w == XFS_DATA_FORK)
- count = mp->m_dirblkfsbs;
- else
- count = 1;
+ count = args->geo->fsbcount;
for (;;) {
/*
* Remove extents. If we get ENOSPC for a dir we have to move
@@ -2495,7 +2452,6 @@ xfs_buf_map_from_irec(
*/
static int
xfs_dabuf_map(
- struct xfs_trans *trans,
struct xfs_inode *dp,
xfs_dablk_t bno,
xfs_daddr_t mappedbno,
@@ -2513,7 +2469,10 @@ xfs_dabuf_map(
ASSERT(map && *map);
ASSERT(*nmaps == 1);
- nfsb = (whichfork == XFS_DATA_FORK) ? mp->m_dirblkfsbs : 1;
+ if (whichfork == XFS_DATA_FORK)
+ nfsb = mp->m_dir_geo->fsbcount;
+ else
+ nfsb = mp->m_attr_geo->fsbcount;
/*
* Caller doesn't have a mapping. -2 means don't complain
@@ -2591,7 +2550,7 @@ xfs_da_get_buf(
*bpp = NULL;
mapp = &map;
nmap = 1;
- error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
+ error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
&mapp, &nmap);
if (error) {
/* mapping a hole is not an error, but we don't continue */
@@ -2639,7 +2598,7 @@ xfs_da_read_buf(
*bpp = NULL;
mapp = &map;
nmap = 1;
- error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
+ error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
&mapp, &nmap);
if (error) {
/* mapping a hole is not an error, but we don't continue */
@@ -2658,47 +2617,6 @@ xfs_da_read_buf(
xfs_buf_set_ref(bp, XFS_ATTR_BTREE_REF);
else
xfs_buf_set_ref(bp, XFS_DIR_BTREE_REF);
-
- /*
- * This verification code will be moved to a CRC verification callback
- * function so just leave it here unchanged until then.
- */
- {
- xfs_dir2_data_hdr_t *hdr = bp->b_addr;
- xfs_dir2_free_t *free = bp->b_addr;
- xfs_da_blkinfo_t *info = bp->b_addr;
- uint magic, magic1;
- struct xfs_mount *mp = dp->i_mount;
-
- magic = be16_to_cpu(info->magic);
- magic1 = be32_to_cpu(hdr->magic);
- if (unlikely(
- XFS_TEST_ERROR((magic != XFS_DA_NODE_MAGIC) &&
- (magic != XFS_DA3_NODE_MAGIC) &&
- (magic != XFS_ATTR_LEAF_MAGIC) &&
- (magic != XFS_ATTR3_LEAF_MAGIC) &&
- (magic != XFS_DIR2_LEAF1_MAGIC) &&
- (magic != XFS_DIR3_LEAF1_MAGIC) &&
- (magic != XFS_DIR2_LEAFN_MAGIC) &&
- (magic != XFS_DIR3_LEAFN_MAGIC) &&
- (magic1 != XFS_DIR2_BLOCK_MAGIC) &&
- (magic1 != XFS_DIR3_BLOCK_MAGIC) &&
- (magic1 != XFS_DIR2_DATA_MAGIC) &&
- (magic1 != XFS_DIR3_DATA_MAGIC) &&
- (free->hdr.magic !=
- cpu_to_be32(XFS_DIR2_FREE_MAGIC)) &&
- (free->hdr.magic !=
- cpu_to_be32(XFS_DIR3_FREE_MAGIC)),
- mp, XFS_ERRTAG_DA_READ_BUF,
- XFS_RANDOM_DA_READ_BUF))) {
- trace_xfs_da_btree_corrupt(bp, _RET_IP_);
- XFS_CORRUPTION_ERROR("xfs_da_do_buf(2)",
- XFS_ERRLEVEL_LOW, mp, info);
- error = XFS_ERROR(EFSCORRUPTED);
- xfs_trans_brelse(trans, bp);
- goto out_free;
- }
- }
*bpp = bp;
out_free:
if (mapp != &map)
@@ -2712,7 +2630,6 @@ out_free:
*/
xfs_daddr_t
xfs_da_reada_buf(
- struct xfs_trans *trans,
struct xfs_inode *dp,
xfs_dablk_t bno,
xfs_daddr_t mappedbno,
@@ -2726,7 +2643,7 @@ xfs_da_reada_buf(
mapp = &map;
nmap = 1;
- error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
+ error = xfs_dabuf_map(dp, bno, mappedbno, whichfork,
&mapp, &nmap);
if (error) {
/* mapping a hole is not an error, but we don't continue */
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index b1f267995de..6e153e399a7 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -23,147 +23,25 @@ struct xfs_bmap_free;
struct xfs_inode;
struct xfs_trans;
struct zone;
-
-/*========================================================================
- * Directory Structure when greater than XFS_LBSIZE(mp) bytes.
- *========================================================================*/
-
-/*
- * This structure is common to both leaf nodes and non-leaf nodes in the Btree.
- *
- * It is used to manage a doubly linked list of all blocks at the same
- * level in the Btree, and to identify which type of block this is.
- */
-#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */
-#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */
-#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */
-#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */
-
-typedef struct xfs_da_blkinfo {
- __be32 forw; /* previous block in list */
- __be32 back; /* following block in list */
- __be16 magic; /* validity check on block */
- __be16 pad; /* unused */
-} xfs_da_blkinfo_t;
-
-/*
- * CRC enabled directory structure types
- *
- * The headers change size for the additional verification information, but
- * otherwise the tree layouts and contents are unchanged. Hence the da btree
- * code can use the struct xfs_da_blkinfo for manipulating the tree links and
- * magic numbers without modification for both v2 and v3 nodes.
- */
-#define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */
-#define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */
-#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v2 dirlf single blks */
-#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v2 dirlf multi blks */
-
-struct xfs_da3_blkinfo {
- /*
- * the node link manipulation code relies on the fact that the first
- * element of this structure is the struct xfs_da_blkinfo so it can
- * ignore the differences in the rest of the structures.
- */
- struct xfs_da_blkinfo hdr;
- __be32 crc; /* CRC of block */
- __be64 blkno; /* first block of the buffer */
- __be64 lsn; /* sequence number of last write */
- uuid_t uuid; /* filesystem we belong to */
- __be64 owner; /* inode that owns the block */
+struct xfs_dir_ops;
+
+/*
+ * Directory/attribute geometry information. There will be one of these for each
+ * data fork type, and it will be passed around via the xfs_da_args. Global
+ * structures will be attached to the xfs_mount.
+ */
+struct xfs_da_geometry {
+ int blksize; /* da block size in bytes */
+ int fsbcount; /* da block size in filesystem blocks */
+ uint8_t fsblog; /* log2 of _filesystem_ block size */
+ uint8_t blklog; /* log2 of da block size */
+ uint node_ents; /* # of entries in a danode */
+ int magicpct; /* 37% of block size in bytes */
+ xfs_dablk_t datablk; /* blockno of dir data v2 */
+ xfs_dablk_t leafblk; /* blockno of leaf data v2 */
+ xfs_dablk_t freeblk; /* blockno of free data v2 */
};
-/*
- * This is the structure of the root and intermediate nodes in the Btree.
- * The leaf nodes are defined above.
- *
- * Entries are not packed.
- *
- * Since we have duplicate keys, use a binary search but always follow
- * all match in the block, not just the first match found.
- */
-#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */
-
-typedef struct xfs_da_node_hdr {
- struct xfs_da_blkinfo info; /* block type, links, etc. */
- __be16 __count; /* count of active entries */
- __be16 __level; /* level above leaves (leaf == 0) */
-} xfs_da_node_hdr_t;
-
-struct xfs_da3_node_hdr {
- struct xfs_da3_blkinfo info; /* block type, links, etc. */
- __be16 __count; /* count of active entries */
- __be16 __level; /* level above leaves (leaf == 0) */
- __be32 __pad32;
-};
-
-#define XFS_DA3_NODE_CRC_OFF (offsetof(struct xfs_da3_node_hdr, info.crc))
-
-typedef struct xfs_da_node_entry {
- __be32 hashval; /* hash value for this descendant */
- __be32 before; /* Btree block before this key */
-} xfs_da_node_entry_t;
-
-typedef struct xfs_da_intnode {
- struct xfs_da_node_hdr hdr;
- struct xfs_da_node_entry __btree[];
-} xfs_da_intnode_t;
-
-struct xfs_da3_intnode {
- struct xfs_da3_node_hdr hdr;
- struct xfs_da_node_entry __btree[];
-};
-
-/*
- * In-core version of the node header to abstract the differences in the v2 and
- * v3 disk format of the headers. Callers need to convert to/from disk format as
- * appropriate.
- */
-struct xfs_da3_icnode_hdr {
- __uint32_t forw;
- __uint32_t back;
- __uint16_t magic;
- __uint16_t count;
- __uint16_t level;
-};
-
-extern void xfs_da3_node_hdr_from_disk(struct xfs_da3_icnode_hdr *to,
- struct xfs_da_intnode *from);
-extern void xfs_da3_node_hdr_to_disk(struct xfs_da_intnode *to,
- struct xfs_da3_icnode_hdr *from);
-
-static inline int
-__xfs_da3_node_hdr_size(bool v3)
-{
- if (v3)
- return sizeof(struct xfs_da3_node_hdr);
- return sizeof(struct xfs_da_node_hdr);
-}
-static inline int
-xfs_da3_node_hdr_size(struct xfs_da_intnode *dap)
-{
- bool v3 = dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC);
-
- return __xfs_da3_node_hdr_size(v3);
-}
-
-static inline struct xfs_da_node_entry *
-xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
-{
- if (dap->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) {
- struct xfs_da3_intnode *dap3 = (struct xfs_da3_intnode *)dap;
- return dap3->__btree;
- }
- return dap->__btree;
-}
-
-extern void xfs_da3_intnode_from_disk(struct xfs_da3_icnode_hdr *to,
- struct xfs_da_intnode *from);
-extern void xfs_da3_intnode_to_disk(struct xfs_da_intnode *to,
- struct xfs_da3_icnode_hdr *from);
-
-#define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize
-
/*========================================================================
* Btree searching and modification structure definitions.
*========================================================================*/
@@ -181,6 +59,7 @@ enum xfs_dacmp {
* Structure to ease passing around component names.
*/
typedef struct xfs_da_args {
+ struct xfs_da_geometry *geo; /* da block geometry */
const __uint8_t *name; /* string (maybe not NULL terminated) */
int namelen; /* length of string (maybe no NULL) */
__uint8_t filetype; /* filetype of inode for directories */
@@ -199,10 +78,12 @@ typedef struct xfs_da_args {
int index; /* index of attr of interest in blk */
xfs_dablk_t rmtblkno; /* remote attr value starting blkno */
int rmtblkcnt; /* remote attr value block count */
+ int rmtvaluelen; /* remote attr value length in bytes */
xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */
int index2; /* index of 2nd attr in blk */
xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */
int rmtblkcnt2; /* remote attr value block count */
+ int rmtvaluelen2; /* remote attr value length in bytes */
int op_flags; /* operation flags */
enum xfs_dacmp cmpresult; /* name compare result for lookups */
} xfs_da_args_t;
@@ -247,8 +128,6 @@ typedef struct xfs_da_state_path {
typedef struct xfs_da_state {
xfs_da_args_t *args; /* filename arguments */
struct xfs_mount *mp; /* filesystem mount point */
- unsigned int blocksize; /* logical block size */
- unsigned int node_ents; /* how many entries in danode */
xfs_da_state_path_t path; /* search/split paths */
xfs_da_state_path_t altpath; /* alternate path for join */
unsigned char inleaf; /* insert into 1->lf, 0->splf */
@@ -309,8 +188,6 @@ int xfs_da3_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
xfs_dablk_t bno, xfs_daddr_t mappedbno,
struct xfs_buf **bpp, int which_fork);
-extern const struct xfs_buf_ops xfs_da3_node_buf_ops;
-
/*
* Utility routines.
*/
@@ -324,9 +201,9 @@ int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
xfs_dablk_t bno, xfs_daddr_t mappedbno,
struct xfs_buf **bpp, int whichfork,
const struct xfs_buf_ops *ops);
-xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
- xfs_dablk_t bno, xfs_daddr_t mapped_bno,
- int whichfork, const struct xfs_buf_ops *ops);
+xfs_daddr_t xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
+ xfs_daddr_t mapped_bno, int whichfork,
+ const struct xfs_buf_ops *ops);
int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
struct xfs_buf *dead_buf);
diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/xfs_da_format.c
new file mode 100644
index 00000000000..c9aee52a37e
--- /dev/null
+++ b/fs/xfs/xfs_da_format.c
@@ -0,0 +1,911 @@
+/*
+ * Copyright (c) 2000,2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+
+/*
+ * Shortform directory ops
+ */
+static int
+xfs_dir2_sf_entsize(
+ struct xfs_dir2_sf_hdr *hdr,
+ int len)
+{
+ int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */
+
+ count += len; /* name */
+ count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
+ sizeof(xfs_dir2_ino4_t); /* ino # */
+ return count;
+}
+
+static int
+xfs_dir3_sf_entsize(
+ struct xfs_dir2_sf_hdr *hdr,
+ int len)
+{
+ return xfs_dir2_sf_entsize(hdr, len) + sizeof(__uint8_t);
+}
+
+static struct xfs_dir2_sf_entry *
+xfs_dir2_sf_nextentry(
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ return (struct xfs_dir2_sf_entry *)
+ ((char *)sfep + xfs_dir2_sf_entsize(hdr, sfep->namelen));
+}
+
+static struct xfs_dir2_sf_entry *
+xfs_dir3_sf_nextentry(
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ return (struct xfs_dir2_sf_entry *)
+ ((char *)sfep + xfs_dir3_sf_entsize(hdr, sfep->namelen));
+}
+
+
+/*
+ * For filetype enabled shortform directories, the file type field is stored at
+ * the end of the name. Because it's only a single byte, endian conversion is
+ * not necessary. For non-filetype enable directories, the type is always
+ * unknown and we never store the value.
+ */
+static __uint8_t
+xfs_dir2_sfe_get_ftype(
+ struct xfs_dir2_sf_entry *sfep)
+{
+ return XFS_DIR3_FT_UNKNOWN;
+}
+
+static void
+xfs_dir2_sfe_put_ftype(
+ struct xfs_dir2_sf_entry *sfep,
+ __uint8_t ftype)
+{
+ ASSERT(ftype < XFS_DIR3_FT_MAX);
+}
+
+static __uint8_t
+xfs_dir3_sfe_get_ftype(
+ struct xfs_dir2_sf_entry *sfep)
+{
+ __uint8_t ftype;
+
+ ftype = sfep->name[sfep->namelen];
+ if (ftype >= XFS_DIR3_FT_MAX)
+ return XFS_DIR3_FT_UNKNOWN;
+ return ftype;
+}
+
+static void
+xfs_dir3_sfe_put_ftype(
+ struct xfs_dir2_sf_entry *sfep,
+ __uint8_t ftype)
+{
+ ASSERT(ftype < XFS_DIR3_FT_MAX);
+
+ sfep->name[sfep->namelen] = ftype;
+}
+
+/*
+ * Inode numbers in short-form directories can come in two versions,
+ * either 4 bytes or 8 bytes wide. These helpers deal with the
+ * two forms transparently by looking at the headers i8count field.
+ *
+ * For 64-bit inode number the most significant byte must be zero.
+ */
+static xfs_ino_t
+xfs_dir2_sf_get_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ xfs_dir2_inou_t *from)
+{
+ if (hdr->i8count)
+ return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL;
+ else
+ return get_unaligned_be32(&from->i4.i);
+}
+
+static void
+xfs_dir2_sf_put_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ xfs_dir2_inou_t *to,
+ xfs_ino_t ino)
+{
+ ASSERT((ino & 0xff00000000000000ULL) == 0);
+
+ if (hdr->i8count)
+ put_unaligned_be64(ino, &to->i8.i);
+ else
+ put_unaligned_be32(ino, &to->i4.i);
+}
+
+static xfs_ino_t
+xfs_dir2_sf_get_parent_ino(
+ struct xfs_dir2_sf_hdr *hdr)
+{
+ return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
+}
+
+static void
+xfs_dir2_sf_put_parent_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ xfs_ino_t ino)
+{
+ xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino);
+}
+
+/*
+ * In short-form directory entries the inode numbers are stored at variable
+ * offset behind the entry name. If the entry stores a filetype value, then it
+ * sits between the name and the inode number. Hence the inode numbers may only
+ * be accessed through the helpers below.
+ */
+static xfs_ino_t
+xfs_dir2_sfe_get_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ return xfs_dir2_sf_get_ino(hdr,
+ (xfs_dir2_inou_t *)&sfep->name[sfep->namelen]);
+}
+
+static void
+xfs_dir2_sfe_put_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep,
+ xfs_ino_t ino)
+{
+ xfs_dir2_sf_put_ino(hdr,
+ (xfs_dir2_inou_t *)&sfep->name[sfep->namelen], ino);
+}
+
+static xfs_ino_t
+xfs_dir3_sfe_get_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep)
+{
+ return xfs_dir2_sf_get_ino(hdr,
+ (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1]);
+}
+
+static void
+xfs_dir3_sfe_put_ino(
+ struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep,
+ xfs_ino_t ino)
+{
+ xfs_dir2_sf_put_ino(hdr,
+ (xfs_dir2_inou_t *)&sfep->name[sfep->namelen + 1], ino);
+}
+
+
+/*
+ * Directory data block operations
+ */
+
+/*
+ * For special situations, the dirent size ends up fixed because we always know
+ * what the size of the entry is. That's true for the "." and "..", and
+ * therefore we know that they are a fixed size and hence their offsets are
+ * constant, as is the first entry.
+ *
+ * Hence, this calculation is written as a macro to be able to be calculated at
+ * compile time and so certain offsets can be calculated directly in the
+ * structure initaliser via the macro. There are two macros - one for dirents
+ * with ftype and without so there are no unresolvable conditionals in the
+ * calculations. We also use round_up() as XFS_DIR2_DATA_ALIGN is always a power
+ * of 2 and the compiler doesn't reject it (unlike roundup()).
+ */
+#define XFS_DIR2_DATA_ENTSIZE(n) \
+ round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \
+ sizeof(xfs_dir2_data_off_t)), XFS_DIR2_DATA_ALIGN)
+
+#define XFS_DIR3_DATA_ENTSIZE(n) \
+ round_up((offsetof(struct xfs_dir2_data_entry, name[0]) + (n) + \
+ sizeof(xfs_dir2_data_off_t) + sizeof(__uint8_t)), \
+ XFS_DIR2_DATA_ALIGN)
+
+static int
+xfs_dir2_data_entsize(
+ int n)
+{
+ return XFS_DIR2_DATA_ENTSIZE(n);
+}
+
+static int
+xfs_dir3_data_entsize(
+ int n)
+{
+ return XFS_DIR3_DATA_ENTSIZE(n);
+}
+
+static __uint8_t
+xfs_dir2_data_get_ftype(
+ struct xfs_dir2_data_entry *dep)
+{
+ return XFS_DIR3_FT_UNKNOWN;
+}
+
+static void
+xfs_dir2_data_put_ftype(
+ struct xfs_dir2_data_entry *dep,
+ __uint8_t ftype)
+{
+ ASSERT(ftype < XFS_DIR3_FT_MAX);
+}
+
+static __uint8_t
+xfs_dir3_data_get_ftype(
+ struct xfs_dir2_data_entry *dep)
+{
+ __uint8_t ftype = dep->name[dep->namelen];
+
+ ASSERT(ftype < XFS_DIR3_FT_MAX);
+ if (ftype >= XFS_DIR3_FT_MAX)
+ return XFS_DIR3_FT_UNKNOWN;
+ return ftype;
+}
+
+static void
+xfs_dir3_data_put_ftype(
+ struct xfs_dir2_data_entry *dep,
+ __uint8_t type)
+{
+ ASSERT(type < XFS_DIR3_FT_MAX);
+ ASSERT(dep->namelen != 0);
+
+ dep->name[dep->namelen] = type;
+}
+
+/*
+ * Pointer to an entry's tag word.
+ */
+static __be16 *
+xfs_dir2_data_entry_tag_p(
+ struct xfs_dir2_data_entry *dep)
+{
+ return (__be16 *)((char *)dep +
+ xfs_dir2_data_entsize(dep->namelen) - sizeof(__be16));
+}
+
+static __be16 *
+xfs_dir3_data_entry_tag_p(
+ struct xfs_dir2_data_entry *dep)
+{
+ return (__be16 *)((char *)dep +
+ xfs_dir3_data_entsize(dep->namelen) - sizeof(__be16));
+}
+
+/*
+ * location of . and .. in data space (always block 0)
+ */
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_dot_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_dotdot_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR2_DATA_ENTSIZE(1));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_first_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR2_DATA_ENTSIZE(1) +
+ XFS_DIR2_DATA_ENTSIZE(2));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_ftype_data_dotdot_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_ftype_data_first_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1) +
+ XFS_DIR3_DATA_ENTSIZE(2));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_dot_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_dotdot_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_first_entry_p(
+ struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir3_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1) +
+ XFS_DIR3_DATA_ENTSIZE(2));
+}
+
+static struct xfs_dir2_data_free *
+xfs_dir2_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return hdr->bestfree;
+}
+
+static struct xfs_dir2_data_free *
+xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return ((struct xfs_dir3_data_hdr *)hdr)->best_free;
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir2_data_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
+}
+
+static struct xfs_dir2_data_unused *
+xfs_dir2_data_unused_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_unused *)
+ ((char *)hdr + sizeof(struct xfs_dir2_data_hdr));
+}
+
+static struct xfs_dir2_data_entry *
+xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_entry *)
+ ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
+}
+
+static struct xfs_dir2_data_unused *
+xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
+{
+ return (struct xfs_dir2_data_unused *)
+ ((char *)hdr + sizeof(struct xfs_dir3_data_hdr));
+}
+
+
+/*
+ * Directory Leaf block operations
+ */
+static int
+xfs_dir2_max_leaf_ents(struct xfs_da_geometry *geo)
+{
+ return (geo->blksize - sizeof(struct xfs_dir2_leaf_hdr)) /
+ (uint)sizeof(struct xfs_dir2_leaf_entry);
+}
+
+static struct xfs_dir2_leaf_entry *
+xfs_dir2_leaf_ents_p(struct xfs_dir2_leaf *lp)
+{
+ return lp->__ents;
+}
+
+static int
+xfs_dir3_max_leaf_ents(struct xfs_da_geometry *geo)
+{
+ return (geo->blksize - sizeof(struct xfs_dir3_leaf_hdr)) /
+ (uint)sizeof(struct xfs_dir2_leaf_entry);
+}
+
+static struct xfs_dir2_leaf_entry *
+xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp)
+{
+ return ((struct xfs_dir3_leaf *)lp)->__ents;
+}
+
+static void
+xfs_dir2_leaf_hdr_from_disk(
+ struct xfs_dir3_icleaf_hdr *to,
+ struct xfs_dir2_leaf *from)
+{
+ to->forw = be32_to_cpu(from->hdr.info.forw);
+ to->back = be32_to_cpu(from->hdr.info.back);
+ to->magic = be16_to_cpu(from->hdr.info.magic);
+ to->count = be16_to_cpu(from->hdr.count);
+ to->stale = be16_to_cpu(from->hdr.stale);
+
+ ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC ||
+ to->magic == XFS_DIR2_LEAFN_MAGIC);
+}
+
+static void
+xfs_dir2_leaf_hdr_to_disk(
+ struct xfs_dir2_leaf *to,
+ struct xfs_dir3_icleaf_hdr *from)
+{
+ ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC ||
+ from->magic == XFS_DIR2_LEAFN_MAGIC);
+
+ to->hdr.info.forw = cpu_to_be32(from->forw);
+ to->hdr.info.back = cpu_to_be32(from->back);
+ to->hdr.info.magic = cpu_to_be16(from->magic);
+ to->hdr.count = cpu_to_be16(from->count);
+ to->hdr.stale = cpu_to_be16(from->stale);
+}
+
+static void
+xfs_dir3_leaf_hdr_from_disk(
+ struct xfs_dir3_icleaf_hdr *to,
+ struct xfs_dir2_leaf *from)
+{
+ struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from;
+
+ to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+ to->back = be32_to_cpu(hdr3->info.hdr.back);
+ to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+ to->count = be16_to_cpu(hdr3->count);
+ to->stale = be16_to_cpu(hdr3->stale);
+
+ ASSERT(to->magic == XFS_DIR3_LEAF1_MAGIC ||
+ to->magic == XFS_DIR3_LEAFN_MAGIC);
+}
+
+static void
+xfs_dir3_leaf_hdr_to_disk(
+ struct xfs_dir2_leaf *to,
+ struct xfs_dir3_icleaf_hdr *from)
+{
+ struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to;
+
+ ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC ||
+ from->magic == XFS_DIR3_LEAFN_MAGIC);
+
+ hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+ hdr3->info.hdr.back = cpu_to_be32(from->back);
+ hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+ hdr3->count = cpu_to_be16(from->count);
+ hdr3->stale = cpu_to_be16(from->stale);
+}
+
+
+/*
+ * Directory/Attribute Node block operations
+ */
+static struct xfs_da_node_entry *
+xfs_da2_node_tree_p(struct xfs_da_intnode *dap)
+{
+ return dap->__btree;
+}
+
+static struct xfs_da_node_entry *
+xfs_da3_node_tree_p(struct xfs_da_intnode *dap)
+{
+ return ((struct xfs_da3_intnode *)dap)->__btree;
+}
+
+static void
+xfs_da2_node_hdr_from_disk(
+ struct xfs_da3_icnode_hdr *to,
+ struct xfs_da_intnode *from)
+{
+ ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC));
+ to->forw = be32_to_cpu(from->hdr.info.forw);
+ to->back = be32_to_cpu(from->hdr.info.back);
+ to->magic = be16_to_cpu(from->hdr.info.magic);
+ to->count = be16_to_cpu(from->hdr.__count);
+ to->level = be16_to_cpu(from->hdr.__level);
+}
+
+static void
+xfs_da2_node_hdr_to_disk(
+ struct xfs_da_intnode *to,
+ struct xfs_da3_icnode_hdr *from)
+{
+ ASSERT(from->magic == XFS_DA_NODE_MAGIC);
+ to->hdr.info.forw = cpu_to_be32(from->forw);
+ to->hdr.info.back = cpu_to_be32(from->back);
+ to->hdr.info.magic = cpu_to_be16(from->magic);
+ to->hdr.__count = cpu_to_be16(from->count);
+ to->hdr.__level = cpu_to_be16(from->level);
+}
+
+static void
+xfs_da3_node_hdr_from_disk(
+ struct xfs_da3_icnode_hdr *to,
+ struct xfs_da_intnode *from)
+{
+ struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)from;
+
+ ASSERT(from->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC));
+ to->forw = be32_to_cpu(hdr3->info.hdr.forw);
+ to->back = be32_to_cpu(hdr3->info.hdr.back);
+ to->magic = be16_to_cpu(hdr3->info.hdr.magic);
+ to->count = be16_to_cpu(hdr3->__count);
+ to->level = be16_to_cpu(hdr3->__level);
+}
+
+static void
+xfs_da3_node_hdr_to_disk(
+ struct xfs_da_intnode *to,
+ struct xfs_da3_icnode_hdr *from)
+{
+ struct xfs_da3_node_hdr *hdr3 = (struct xfs_da3_node_hdr *)to;
+
+ ASSERT(from->magic == XFS_DA3_NODE_MAGIC);
+ hdr3->info.hdr.forw = cpu_to_be32(from->forw);
+ hdr3->info.hdr.back = cpu_to_be32(from->back);
+ hdr3->info.hdr.magic = cpu_to_be16(from->magic);
+ hdr3->__count = cpu_to_be16(from->count);
+ hdr3->__level = cpu_to_be16(from->level);
+}
+
+
+/*
+ * Directory free space block operations
+ */
+static int
+xfs_dir2_free_max_bests(struct xfs_da_geometry *geo)
+{
+ return (geo->blksize - sizeof(struct xfs_dir2_free_hdr)) /
+ sizeof(xfs_dir2_data_off_t);
+}
+
+static __be16 *
+xfs_dir2_free_bests_p(struct xfs_dir2_free *free)
+{
+ return (__be16 *)((char *)free + sizeof(struct xfs_dir2_free_hdr));
+}
+
+/*
+ * Convert data space db to the corresponding free db.
+ */
+static xfs_dir2_db_t
+xfs_dir2_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
+ (db / xfs_dir2_free_max_bests(geo));
+}
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+static int
+xfs_dir2_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return db % xfs_dir2_free_max_bests(geo);
+}
+
+static int
+xfs_dir3_free_max_bests(struct xfs_da_geometry *geo)
+{
+ return (geo->blksize - sizeof(struct xfs_dir3_free_hdr)) /
+ sizeof(xfs_dir2_data_off_t);
+}
+
+static __be16 *
+xfs_dir3_free_bests_p(struct xfs_dir2_free *free)
+{
+ return (__be16 *)((char *)free + sizeof(struct xfs_dir3_free_hdr));
+}
+
+/*
+ * Convert data space db to the corresponding free db.
+ */
+static xfs_dir2_db_t
+xfs_dir3_db_to_fdb(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return xfs_dir2_byte_to_db(geo, XFS_DIR2_FREE_OFFSET) +
+ (db / xfs_dir3_free_max_bests(geo));
+}
+
+/*
+ * Convert data space db to the corresponding index in a free db.
+ */
+static int
+xfs_dir3_db_to_fdindex(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return db % xfs_dir3_free_max_bests(geo);
+}
+
+static void
+xfs_dir2_free_hdr_from_disk(
+ struct xfs_dir3_icfree_hdr *to,
+ struct xfs_dir2_free *from)
+{
+ to->magic = be32_to_cpu(from->hdr.magic);
+ to->firstdb = be32_to_cpu(from->hdr.firstdb);
+ to->nvalid = be32_to_cpu(from->hdr.nvalid);
+ to->nused = be32_to_cpu(from->hdr.nused);
+ ASSERT(to->magic == XFS_DIR2_FREE_MAGIC);
+}
+
+static void
+xfs_dir2_free_hdr_to_disk(
+ struct xfs_dir2_free *to,
+ struct xfs_dir3_icfree_hdr *from)
+{
+ ASSERT(from->magic == XFS_DIR2_FREE_MAGIC);
+
+ to->hdr.magic = cpu_to_be32(from->magic);
+ to->hdr.firstdb = cpu_to_be32(from->firstdb);
+ to->hdr.nvalid = cpu_to_be32(from->nvalid);
+ to->hdr.nused = cpu_to_be32(from->nused);
+}
+
+static void
+xfs_dir3_free_hdr_from_disk(
+ struct xfs_dir3_icfree_hdr *to,
+ struct xfs_dir2_free *from)
+{
+ struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from;
+
+ to->magic = be32_to_cpu(hdr3->hdr.magic);
+ to->firstdb = be32_to_cpu(hdr3->firstdb);
+ to->nvalid = be32_to_cpu(hdr3->nvalid);
+ to->nused = be32_to_cpu(hdr3->nused);
+
+ ASSERT(to->magic == XFS_DIR3_FREE_MAGIC);
+}
+
+static void
+xfs_dir3_free_hdr_to_disk(
+ struct xfs_dir2_free *to,
+ struct xfs_dir3_icfree_hdr *from)
+{
+ struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to;
+
+ ASSERT(from->magic == XFS_DIR3_FREE_MAGIC);
+
+ hdr3->hdr.magic = cpu_to_be32(from->magic);
+ hdr3->firstdb = cpu_to_be32(from->firstdb);
+ hdr3->nvalid = cpu_to_be32(from->nvalid);
+ hdr3->nused = cpu_to_be32(from->nused);
+}
+
+static const struct xfs_dir_ops xfs_dir2_ops = {
+ .sf_entsize = xfs_dir2_sf_entsize,
+ .sf_nextentry = xfs_dir2_sf_nextentry,
+ .sf_get_ftype = xfs_dir2_sfe_get_ftype,
+ .sf_put_ftype = xfs_dir2_sfe_put_ftype,
+ .sf_get_ino = xfs_dir2_sfe_get_ino,
+ .sf_put_ino = xfs_dir2_sfe_put_ino,
+ .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+ .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+
+ .data_entsize = xfs_dir2_data_entsize,
+ .data_get_ftype = xfs_dir2_data_get_ftype,
+ .data_put_ftype = xfs_dir2_data_put_ftype,
+ .data_entry_tag_p = xfs_dir2_data_entry_tag_p,
+ .data_bestfree_p = xfs_dir2_data_bestfree_p,
+
+ .data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
+ .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR2_DATA_ENTSIZE(1),
+ .data_first_offset = sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR2_DATA_ENTSIZE(1) +
+ XFS_DIR2_DATA_ENTSIZE(2),
+ .data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
+
+ .data_dot_entry_p = xfs_dir2_data_dot_entry_p,
+ .data_dotdot_entry_p = xfs_dir2_data_dotdot_entry_p,
+ .data_first_entry_p = xfs_dir2_data_first_entry_p,
+ .data_entry_p = xfs_dir2_data_entry_p,
+ .data_unused_p = xfs_dir2_data_unused_p,
+
+ .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
+ .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
+ .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
+ .leaf_max_ents = xfs_dir2_max_leaf_ents,
+ .leaf_ents_p = xfs_dir2_leaf_ents_p,
+
+ .node_hdr_size = sizeof(struct xfs_da_node_hdr),
+ .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+ .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
+ .node_tree_p = xfs_da2_node_tree_p,
+
+ .free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
+ .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
+ .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
+ .free_max_bests = xfs_dir2_free_max_bests,
+ .free_bests_p = xfs_dir2_free_bests_p,
+ .db_to_fdb = xfs_dir2_db_to_fdb,
+ .db_to_fdindex = xfs_dir2_db_to_fdindex,
+};
+
+static const struct xfs_dir_ops xfs_dir2_ftype_ops = {
+ .sf_entsize = xfs_dir3_sf_entsize,
+ .sf_nextentry = xfs_dir3_sf_nextentry,
+ .sf_get_ftype = xfs_dir3_sfe_get_ftype,
+ .sf_put_ftype = xfs_dir3_sfe_put_ftype,
+ .sf_get_ino = xfs_dir3_sfe_get_ino,
+ .sf_put_ino = xfs_dir3_sfe_put_ino,
+ .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+ .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+
+ .data_entsize = xfs_dir3_data_entsize,
+ .data_get_ftype = xfs_dir3_data_get_ftype,
+ .data_put_ftype = xfs_dir3_data_put_ftype,
+ .data_entry_tag_p = xfs_dir3_data_entry_tag_p,
+ .data_bestfree_p = xfs_dir2_data_bestfree_p,
+
+ .data_dot_offset = sizeof(struct xfs_dir2_data_hdr),
+ .data_dotdot_offset = sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1),
+ .data_first_offset = sizeof(struct xfs_dir2_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1) +
+ XFS_DIR3_DATA_ENTSIZE(2),
+ .data_entry_offset = sizeof(struct xfs_dir2_data_hdr),
+
+ .data_dot_entry_p = xfs_dir2_data_dot_entry_p,
+ .data_dotdot_entry_p = xfs_dir2_ftype_data_dotdot_entry_p,
+ .data_first_entry_p = xfs_dir2_ftype_data_first_entry_p,
+ .data_entry_p = xfs_dir2_data_entry_p,
+ .data_unused_p = xfs_dir2_data_unused_p,
+
+ .leaf_hdr_size = sizeof(struct xfs_dir2_leaf_hdr),
+ .leaf_hdr_to_disk = xfs_dir2_leaf_hdr_to_disk,
+ .leaf_hdr_from_disk = xfs_dir2_leaf_hdr_from_disk,
+ .leaf_max_ents = xfs_dir2_max_leaf_ents,
+ .leaf_ents_p = xfs_dir2_leaf_ents_p,
+
+ .node_hdr_size = sizeof(struct xfs_da_node_hdr),
+ .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+ .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
+ .node_tree_p = xfs_da2_node_tree_p,
+
+ .free_hdr_size = sizeof(struct xfs_dir2_free_hdr),
+ .free_hdr_to_disk = xfs_dir2_free_hdr_to_disk,
+ .free_hdr_from_disk = xfs_dir2_free_hdr_from_disk,
+ .free_max_bests = xfs_dir2_free_max_bests,
+ .free_bests_p = xfs_dir2_free_bests_p,
+ .db_to_fdb = xfs_dir2_db_to_fdb,
+ .db_to_fdindex = xfs_dir2_db_to_fdindex,
+};
+
+static const struct xfs_dir_ops xfs_dir3_ops = {
+ .sf_entsize = xfs_dir3_sf_entsize,
+ .sf_nextentry = xfs_dir3_sf_nextentry,
+ .sf_get_ftype = xfs_dir3_sfe_get_ftype,
+ .sf_put_ftype = xfs_dir3_sfe_put_ftype,
+ .sf_get_ino = xfs_dir3_sfe_get_ino,
+ .sf_put_ino = xfs_dir3_sfe_put_ino,
+ .sf_get_parent_ino = xfs_dir2_sf_get_parent_ino,
+ .sf_put_parent_ino = xfs_dir2_sf_put_parent_ino,
+
+ .data_entsize = xfs_dir3_data_entsize,
+ .data_get_ftype = xfs_dir3_data_get_ftype,
+ .data_put_ftype = xfs_dir3_data_put_ftype,
+ .data_entry_tag_p = xfs_dir3_data_entry_tag_p,
+ .data_bestfree_p = xfs_dir3_data_bestfree_p,
+
+ .data_dot_offset = sizeof(struct xfs_dir3_data_hdr),
+ .data_dotdot_offset = sizeof(struct xfs_dir3_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1),
+ .data_first_offset = sizeof(struct xfs_dir3_data_hdr) +
+ XFS_DIR3_DATA_ENTSIZE(1) +
+ XFS_DIR3_DATA_ENTSIZE(2),
+ .data_entry_offset = sizeof(struct xfs_dir3_data_hdr),
+
+ .data_dot_entry_p = xfs_dir3_data_dot_entry_p,
+ .data_dotdot_entry_p = xfs_dir3_data_dotdot_entry_p,
+ .data_first_entry_p = xfs_dir3_data_first_entry_p,
+ .data_entry_p = xfs_dir3_data_entry_p,
+ .data_unused_p = xfs_dir3_data_unused_p,
+
+ .leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr),
+ .leaf_hdr_to_disk = xfs_dir3_leaf_hdr_to_disk,
+ .leaf_hdr_from_disk = xfs_dir3_leaf_hdr_from_disk,
+ .leaf_max_ents = xfs_dir3_max_leaf_ents,
+ .leaf_ents_p = xfs_dir3_leaf_ents_p,
+
+ .node_hdr_size = sizeof(struct xfs_da3_node_hdr),
+ .node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
+ .node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
+ .node_tree_p = xfs_da3_node_tree_p,
+
+ .free_hdr_size = sizeof(struct xfs_dir3_free_hdr),
+ .free_hdr_to_disk = xfs_dir3_free_hdr_to_disk,
+ .free_hdr_from_disk = xfs_dir3_free_hdr_from_disk,
+ .free_max_bests = xfs_dir3_free_max_bests,
+ .free_bests_p = xfs_dir3_free_bests_p,
+ .db_to_fdb = xfs_dir3_db_to_fdb,
+ .db_to_fdindex = xfs_dir3_db_to_fdindex,
+};
+
+static const struct xfs_dir_ops xfs_dir2_nondir_ops = {
+ .node_hdr_size = sizeof(struct xfs_da_node_hdr),
+ .node_hdr_to_disk = xfs_da2_node_hdr_to_disk,
+ .node_hdr_from_disk = xfs_da2_node_hdr_from_disk,
+ .node_tree_p = xfs_da2_node_tree_p,
+};
+
+static const struct xfs_dir_ops xfs_dir3_nondir_ops = {
+ .node_hdr_size = sizeof(struct xfs_da3_node_hdr),
+ .node_hdr_to_disk = xfs_da3_node_hdr_to_disk,
+ .node_hdr_from_disk = xfs_da3_node_hdr_from_disk,
+ .node_tree_p = xfs_da3_node_tree_p,
+};
+
+/*
+ * Return the ops structure according to the current config. If we are passed
+ * an inode, then that overrides the default config we use which is based on
+ * feature bits.
+ */
+const struct xfs_dir_ops *
+xfs_dir_get_ops(
+ struct xfs_mount *mp,
+ struct xfs_inode *dp)
+{
+ if (dp)
+ return dp->d_ops;
+ if (mp->m_dir_inode_ops)
+ return mp->m_dir_inode_ops;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ return &xfs_dir3_ops;
+ if (xfs_sb_version_hasftype(&mp->m_sb))
+ return &xfs_dir2_ftype_ops;
+ return &xfs_dir2_ops;
+}
+
+const struct xfs_dir_ops *
+xfs_nondir_get_ops(
+ struct xfs_mount *mp,
+ struct xfs_inode *dp)
+{
+ if (dp)
+ return dp->d_ops;
+ if (mp->m_nondir_inode_ops)
+ return mp->m_nondir_inode_ops;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ return &xfs_dir3_nondir_ops;
+ return &xfs_dir2_nondir_ops;
+}
diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_da_format.h
index 9cf67381adf..0a49b028637 100644
--- a/fs/xfs/xfs_dir2_format.h
+++ b/fs/xfs/xfs_da_format.h
@@ -16,8 +16,107 @@
* along with this program; if not, write the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
-#ifndef __XFS_DIR2_FORMAT_H__
-#define __XFS_DIR2_FORMAT_H__
+#ifndef __XFS_DA_FORMAT_H__
+#define __XFS_DA_FORMAT_H__
+
+/*
+ * This structure is common to both leaf nodes and non-leaf nodes in the Btree.
+ *
+ * It is used to manage a doubly linked list of all blocks at the same
+ * level in the Btree, and to identify which type of block this is.
+ */
+#define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */
+#define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */
+#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */
+#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */
+
+typedef struct xfs_da_blkinfo {
+ __be32 forw; /* previous block in list */
+ __be32 back; /* following block in list */
+ __be16 magic; /* validity check on block */
+ __be16 pad; /* unused */
+} xfs_da_blkinfo_t;
+
+/*
+ * CRC enabled directory structure types
+ *
+ * The headers change size for the additional verification information, but
+ * otherwise the tree layouts and contents are unchanged. Hence the da btree
+ * code can use the struct xfs_da_blkinfo for manipulating the tree links and
+ * magic numbers without modification for both v2 and v3 nodes.
+ */
+#define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */
+#define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */
+#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v2 dirlf single blks */
+#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v2 dirlf multi blks */
+
+struct xfs_da3_blkinfo {
+ /*
+ * the node link manipulation code relies on the fact that the first
+ * element of this structure is the struct xfs_da_blkinfo so it can
+ * ignore the differences in the rest of the structures.
+ */
+ struct xfs_da_blkinfo hdr;
+ __be32 crc; /* CRC of block */
+ __be64 blkno; /* first block of the buffer */
+ __be64 lsn; /* sequence number of last write */
+ uuid_t uuid; /* filesystem we belong to */
+ __be64 owner; /* inode that owns the block */
+};
+
+/*
+ * This is the structure of the root and intermediate nodes in the Btree.
+ * The leaf nodes are defined above.
+ *
+ * Entries are not packed.
+ *
+ * Since we have duplicate keys, use a binary search but always follow
+ * all match in the block, not just the first match found.
+ */
+#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */
+
+typedef struct xfs_da_node_hdr {
+ struct xfs_da_blkinfo info; /* block type, links, etc. */
+ __be16 __count; /* count of active entries */
+ __be16 __level; /* level above leaves (leaf == 0) */
+} xfs_da_node_hdr_t;
+
+struct xfs_da3_node_hdr {
+ struct xfs_da3_blkinfo info; /* block type, links, etc. */
+ __be16 __count; /* count of active entries */
+ __be16 __level; /* level above leaves (leaf == 0) */
+ __be32 __pad32;
+};
+
+#define XFS_DA3_NODE_CRC_OFF (offsetof(struct xfs_da3_node_hdr, info.crc))
+
+typedef struct xfs_da_node_entry {
+ __be32 hashval; /* hash value for this descendant */
+ __be32 before; /* Btree block before this key */
+} xfs_da_node_entry_t;
+
+typedef struct xfs_da_intnode {
+ struct xfs_da_node_hdr hdr;
+ struct xfs_da_node_entry __btree[];
+} xfs_da_intnode_t;
+
+struct xfs_da3_intnode {
+ struct xfs_da3_node_hdr hdr;
+ struct xfs_da_node_entry __btree[];
+};
+
+/*
+ * In-core version of the node header to abstract the differences in the v2 and
+ * v3 disk format of the headers. Callers need to convert to/from disk format as
+ * appropriate.
+ */
+struct xfs_da3_icnode_hdr {
+ __uint32_t forw;
+ __uint32_t back;
+ __uint16_t magic;
+ __uint16_t count;
+ __uint16_t level;
+};
/*
* Directory version 2.
@@ -189,79 +288,6 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
((char *)hdr + xfs_dir2_sf_hdr_size(hdr->i8count));
}
-static inline int
-xfs_dir3_sf_entsize(
- struct xfs_mount *mp,
- struct xfs_dir2_sf_hdr *hdr,
- int len)
-{
- int count = sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */
-
- count += len; /* name */
- count += hdr->i8count ? sizeof(xfs_dir2_ino8_t) :
- sizeof(xfs_dir2_ino4_t); /* ino # */
- if (xfs_sb_version_hasftype(&mp->m_sb))
- count += sizeof(__uint8_t); /* file type */
- return count;
-}
-
-static inline struct xfs_dir2_sf_entry *
-xfs_dir3_sf_nextentry(
- struct xfs_mount *mp,
- struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep)
-{
- return (struct xfs_dir2_sf_entry *)
- ((char *)sfep + xfs_dir3_sf_entsize(mp, hdr, sfep->namelen));
-}
-
-/*
- * in dir3 shortform directories, the file type field is stored at a variable
- * offset after the inode number. Because it's only a single byte, endian
- * conversion is not necessary.
- */
-static inline __uint8_t *
-xfs_dir3_sfe_ftypep(
- struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep)
-{
- return (__uint8_t *)&sfep->name[sfep->namelen];
-}
-
-static inline __uint8_t
-xfs_dir3_sfe_get_ftype(
- struct xfs_mount *mp,
- struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep)
-{
- __uint8_t *ftp;
-
- if (!xfs_sb_version_hasftype(&mp->m_sb))
- return XFS_DIR3_FT_UNKNOWN;
-
- ftp = xfs_dir3_sfe_ftypep(hdr, sfep);
- if (*ftp >= XFS_DIR3_FT_MAX)
- return XFS_DIR3_FT_UNKNOWN;
- return *ftp;
-}
-
-static inline void
-xfs_dir3_sfe_put_ftype(
- struct xfs_mount *mp,
- struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep,
- __uint8_t ftype)
-{
- __uint8_t *ftp;
-
- ASSERT(ftype < XFS_DIR3_FT_MAX);
-
- if (!xfs_sb_version_hasftype(&mp->m_sb))
- return;
- ftp = xfs_dir3_sfe_ftypep(hdr, sfep);
- *ftp = ftype;
-}
-
/*
* Data block structures.
*
@@ -298,8 +324,6 @@ xfs_dir3_sfe_put_ftype(
#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
#define XFS_DIR2_DATA_SPACE 0
#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)
-#define XFS_DIR2_DATA_FIRSTDB(mp) \
- xfs_dir2_byte_to_db(mp, XFS_DIR2_DATA_OFFSET)
/*
* Describe a free area in the data block.
@@ -345,17 +369,6 @@ struct xfs_dir3_data_hdr {
#define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc)
-static inline struct xfs_dir2_data_free *
-xfs_dir3_data_bestfree_p(struct xfs_dir2_data_hdr *hdr)
-{
- if (hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
- struct xfs_dir3_data_hdr *hdr3 = (struct xfs_dir3_data_hdr *)hdr;
- return hdr3->best_free;
- }
- return hdr->bestfree;
-}
-
/*
* Active entry in a data block.
*
@@ -389,72 +402,6 @@ typedef struct xfs_dir2_data_unused {
} xfs_dir2_data_unused_t;
/*
- * Size of a data entry.
- */
-static inline int
-__xfs_dir3_data_entsize(
- bool ftype,
- int n)
-{
- int size = offsetof(struct xfs_dir2_data_entry, name[0]);
-
- size += n;
- size += sizeof(xfs_dir2_data_off_t);
- if (ftype)
- size += sizeof(__uint8_t);
- return roundup(size, XFS_DIR2_DATA_ALIGN);
-}
-static inline int
-xfs_dir3_data_entsize(
- struct xfs_mount *mp,
- int n)
-{
- bool ftype = xfs_sb_version_hasftype(&mp->m_sb) ? true : false;
- return __xfs_dir3_data_entsize(ftype, n);
-}
-
-static inline __uint8_t
-xfs_dir3_dirent_get_ftype(
- struct xfs_mount *mp,
- struct xfs_dir2_data_entry *dep)
-{
- if (xfs_sb_version_hasftype(&mp->m_sb)) {
- __uint8_t type = dep->name[dep->namelen];
-
- ASSERT(type < XFS_DIR3_FT_MAX);
- if (type < XFS_DIR3_FT_MAX)
- return type;
-
- }
- return XFS_DIR3_FT_UNKNOWN;
-}
-
-static inline void
-xfs_dir3_dirent_put_ftype(
- struct xfs_mount *mp,
- struct xfs_dir2_data_entry *dep,
- __uint8_t type)
-{
- ASSERT(type < XFS_DIR3_FT_MAX);
- ASSERT(dep->namelen != 0);
-
- if (xfs_sb_version_hasftype(&mp->m_sb))
- dep->name[dep->namelen] = type;
-}
-
-/*
- * Pointer to an entry's tag word.
- */
-static inline __be16 *
-xfs_dir3_data_entry_tag_p(
- struct xfs_mount *mp,
- struct xfs_dir2_data_entry *dep)
-{
- return (__be16 *)((char *)dep +
- xfs_dir3_data_entsize(mp, dep->namelen) - sizeof(__be16));
-}
-
-/*
* Pointer to a freespace's tag word.
*/
static inline __be16 *
@@ -464,93 +411,6 @@ xfs_dir2_data_unused_tag_p(struct xfs_dir2_data_unused *dup)
be16_to_cpu(dup->length) - sizeof(__be16));
}
-static inline size_t
-xfs_dir3_data_hdr_size(bool dir3)
-{
- if (dir3)
- return sizeof(struct xfs_dir3_data_hdr);
- return sizeof(struct xfs_dir2_data_hdr);
-}
-
-static inline size_t
-xfs_dir3_data_entry_offset(struct xfs_dir2_data_hdr *hdr)
-{
- bool dir3 = hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
- hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
- return xfs_dir3_data_hdr_size(dir3);
-}
-
-static inline struct xfs_dir2_data_entry *
-xfs_dir3_data_entry_p(struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + xfs_dir3_data_entry_offset(hdr));
-}
-
-static inline struct xfs_dir2_data_unused *
-xfs_dir3_data_unused_p(struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_unused *)
- ((char *)hdr + xfs_dir3_data_entry_offset(hdr));
-}
-
-/*
- * Offsets of . and .. in data space (always block 0)
- *
- * XXX: there is scope for significant optimisation of the logic here. Right
- * now we are checking for "dir3 format" over and over again. Ideally we should
- * only do it once for each operation.
- */
-static inline xfs_dir2_data_aoff_t
-xfs_dir3_data_dot_offset(struct xfs_mount *mp)
-{
- return xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&mp->m_sb));
-}
-
-static inline xfs_dir2_data_aoff_t
-xfs_dir3_data_dotdot_offset(struct xfs_mount *mp)
-{
- return xfs_dir3_data_dot_offset(mp) +
- xfs_dir3_data_entsize(mp, 1);
-}
-
-static inline xfs_dir2_data_aoff_t
-xfs_dir3_data_first_offset(struct xfs_mount *mp)
-{
- return xfs_dir3_data_dotdot_offset(mp) +
- xfs_dir3_data_entsize(mp, 2);
-}
-
-/*
- * location of . and .. in data space (always block 0)
- */
-static inline struct xfs_dir2_data_entry *
-xfs_dir3_data_dot_entry_p(
- struct xfs_mount *mp,
- struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + xfs_dir3_data_dot_offset(mp));
-}
-
-static inline struct xfs_dir2_data_entry *
-xfs_dir3_data_dotdot_entry_p(
- struct xfs_mount *mp,
- struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + xfs_dir3_data_dotdot_offset(mp));
-}
-
-static inline struct xfs_dir2_data_entry *
-xfs_dir3_data_first_entry_p(
- struct xfs_mount *mp,
- struct xfs_dir2_data_hdr *hdr)
-{
- return (struct xfs_dir2_data_entry *)
- ((char *)hdr + xfs_dir3_data_first_offset(mp));
-}
-
/*
* Leaf block structures.
*
@@ -588,8 +448,6 @@ xfs_dir3_data_first_entry_p(
*/
#define XFS_DIR2_LEAF_SPACE 1
#define XFS_DIR2_LEAF_OFFSET (XFS_DIR2_LEAF_SPACE * XFS_DIR2_SPACE_SIZE)
-#define XFS_DIR2_LEAF_FIRSTDB(mp) \
- xfs_dir2_byte_to_db(mp, XFS_DIR2_LEAF_OFFSET)
/*
* Leaf block header.
@@ -645,50 +503,6 @@ struct xfs_dir3_leaf {
#define XFS_DIR3_LEAF_CRC_OFF offsetof(struct xfs_dir3_leaf_hdr, info.crc)
-extern void xfs_dir3_leaf_hdr_from_disk(struct xfs_dir3_icleaf_hdr *to,
- struct xfs_dir2_leaf *from);
-
-static inline int
-xfs_dir3_leaf_hdr_size(struct xfs_dir2_leaf *lp)
-{
- if (lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
- lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC))
- return sizeof(struct xfs_dir3_leaf_hdr);
- return sizeof(struct xfs_dir2_leaf_hdr);
-}
-
-static inline int
-xfs_dir3_max_leaf_ents(struct xfs_mount *mp, struct xfs_dir2_leaf *lp)
-{
- return (mp->m_dirblksize - xfs_dir3_leaf_hdr_size(lp)) /
- (uint)sizeof(struct xfs_dir2_leaf_entry);
-}
-
-/*
- * Get address of the bestcount field in the single-leaf block.
- */
-static inline struct xfs_dir2_leaf_entry *
-xfs_dir3_leaf_ents_p(struct xfs_dir2_leaf *lp)
-{
- if (lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
- lp->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
- struct xfs_dir3_leaf *lp3 = (struct xfs_dir3_leaf *)lp;
- return lp3->__ents;
- }
- return lp->__ents;
-}
-
-/*
- * Get address of the bestcount field in the single-leaf block.
- */
-static inline struct xfs_dir2_leaf_tail *
-xfs_dir2_leaf_tail_p(struct xfs_mount *mp, struct xfs_dir2_leaf *lp)
-{
- return (struct xfs_dir2_leaf_tail *)
- ((char *)lp + mp->m_dirblksize -
- sizeof(struct xfs_dir2_leaf_tail));
-}
-
/*
* Get address of the bests array in the single-leaf block.
*/
@@ -699,123 +513,6 @@ xfs_dir2_leaf_bests_p(struct xfs_dir2_leaf_tail *ltp)
}
/*
- * DB blocks here are logical directory block numbers, not filesystem blocks.
- */
-
-/*
- * Convert dataptr to byte in file space
- */
-static inline xfs_dir2_off_t
-xfs_dir2_dataptr_to_byte(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
-{
- return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
-}
-
-/*
- * Convert byte in file space to dataptr. It had better be aligned.
- */
-static inline xfs_dir2_dataptr_t
-xfs_dir2_byte_to_dataptr(struct xfs_mount *mp, xfs_dir2_off_t by)
-{
- return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
-}
-
-/*
- * Convert byte in space to (DB) block
- */
-static inline xfs_dir2_db_t
-xfs_dir2_byte_to_db(struct xfs_mount *mp, xfs_dir2_off_t by)
-{
- return (xfs_dir2_db_t)
- (by >> (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog));
-}
-
-/*
- * Convert dataptr to a block number
- */
-static inline xfs_dir2_db_t
-xfs_dir2_dataptr_to_db(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
-{
- return xfs_dir2_byte_to_db(mp, xfs_dir2_dataptr_to_byte(mp, dp));
-}
-
-/*
- * Convert byte in space to offset in a block
- */
-static inline xfs_dir2_data_aoff_t
-xfs_dir2_byte_to_off(struct xfs_mount *mp, xfs_dir2_off_t by)
-{
- return (xfs_dir2_data_aoff_t)(by &
- ((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) - 1));
-}
-
-/*
- * Convert dataptr to a byte offset in a block
- */
-static inline xfs_dir2_data_aoff_t
-xfs_dir2_dataptr_to_off(struct xfs_mount *mp, xfs_dir2_dataptr_t dp)
-{
- return xfs_dir2_byte_to_off(mp, xfs_dir2_dataptr_to_byte(mp, dp));
-}
-
-/*
- * Convert block and offset to byte in space
- */
-static inline xfs_dir2_off_t
-xfs_dir2_db_off_to_byte(struct xfs_mount *mp, xfs_dir2_db_t db,
- xfs_dir2_data_aoff_t o)
-{
- return ((xfs_dir2_off_t)db <<
- (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) + o;
-}
-
-/*
- * Convert block (DB) to block (dablk)
- */
-static inline xfs_dablk_t
-xfs_dir2_db_to_da(struct xfs_mount *mp, xfs_dir2_db_t db)
-{
- return (xfs_dablk_t)(db << mp->m_sb.sb_dirblklog);
-}
-
-/*
- * Convert byte in space to (DA) block
- */
-static inline xfs_dablk_t
-xfs_dir2_byte_to_da(struct xfs_mount *mp, xfs_dir2_off_t by)
-{
- return xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, by));
-}
-
-/*
- * Convert block and offset to dataptr
- */
-static inline xfs_dir2_dataptr_t
-xfs_dir2_db_off_to_dataptr(struct xfs_mount *mp, xfs_dir2_db_t db,
- xfs_dir2_data_aoff_t o)
-{
- return xfs_dir2_byte_to_dataptr(mp, xfs_dir2_db_off_to_byte(mp, db, o));
-}
-
-/*
- * Convert block (dablk) to block (DB)
- */
-static inline xfs_dir2_db_t
-xfs_dir2_da_to_db(struct xfs_mount *mp, xfs_dablk_t da)
-{
- return (xfs_dir2_db_t)(da >> mp->m_sb.sb_dirblklog);
-}
-
-/*
- * Convert block (dablk) to byte offset in space
- */
-static inline xfs_dir2_off_t
-xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da)
-{
- return xfs_dir2_db_off_to_byte(mp, xfs_dir2_da_to_db(mp, da), 0);
-}
-
-/*
* Free space block defintions for the node format.
*/
@@ -824,8 +521,6 @@ xfs_dir2_da_to_byte(struct xfs_mount *mp, xfs_dablk_t da)
*/
#define XFS_DIR2_FREE_SPACE 2
#define XFS_DIR2_FREE_OFFSET (XFS_DIR2_FREE_SPACE * XFS_DIR2_SPACE_SIZE)
-#define XFS_DIR2_FREE_FIRSTDB(mp) \
- xfs_dir2_byte_to_db(mp, XFS_DIR2_FREE_OFFSET)
typedef struct xfs_dir2_free_hdr {
__be32 magic; /* XFS_DIR2_FREE_MAGIC */
@@ -869,48 +564,6 @@ struct xfs_dir3_icfree_hdr {
};
-void xfs_dir3_free_hdr_from_disk(struct xfs_dir3_icfree_hdr *to,
- struct xfs_dir2_free *from);
-
-static inline int
-xfs_dir3_free_hdr_size(struct xfs_mount *mp)
-{
- if (xfs_sb_version_hascrc(&mp->m_sb))
- return sizeof(struct xfs_dir3_free_hdr);
- return sizeof(struct xfs_dir2_free_hdr);
-}
-
-static inline int
-xfs_dir3_free_max_bests(struct xfs_mount *mp)
-{
- return (mp->m_dirblksize - xfs_dir3_free_hdr_size(mp)) /
- sizeof(xfs_dir2_data_off_t);
-}
-
-static inline __be16 *
-xfs_dir3_free_bests_p(struct xfs_mount *mp, struct xfs_dir2_free *free)
-{
- return (__be16 *)((char *)free + xfs_dir3_free_hdr_size(mp));
-}
-
-/*
- * Convert data space db to the corresponding free db.
- */
-static inline xfs_dir2_db_t
-xfs_dir2_db_to_fdb(struct xfs_mount *mp, xfs_dir2_db_t db)
-{
- return XFS_DIR2_FREE_FIRSTDB(mp) + db / xfs_dir3_free_max_bests(mp);
-}
-
-/*
- * Convert data space db to the corresponding index in a free db.
- */
-static inline int
-xfs_dir2_db_to_fdindex(struct xfs_mount *mp, xfs_dir2_db_t db)
-{
- return db % xfs_dir3_free_max_bests(mp);
-}
-
/*
* Single block format.
*
@@ -943,22 +596,266 @@ typedef struct xfs_dir2_block_tail {
} xfs_dir2_block_tail_t;
/*
- * Pointer to the leaf header embedded in a data block (1-block format)
+ * Pointer to the leaf entries embedded in a data block (1-block format)
*/
-static inline struct xfs_dir2_block_tail *
-xfs_dir2_block_tail_p(struct xfs_mount *mp, struct xfs_dir2_data_hdr *hdr)
+static inline struct xfs_dir2_leaf_entry *
+xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
{
- return ((struct xfs_dir2_block_tail *)
- ((char *)hdr + mp->m_dirblksize)) - 1;
+ return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count);
}
+
/*
- * Pointer to the leaf entries embedded in a data block (1-block format)
+ * Attribute storage layout
+ *
+ * Attribute lists are structured around Btrees where all the data
+ * elements are in the leaf nodes. Attribute names are hashed into an int,
+ * then that int is used as the index into the Btree. Since the hashval
+ * of an attribute name may not be unique, we may have duplicate keys. The
+ * internal links in the Btree are logical block offsets into the file.
+ *
+ * Struct leaf_entry's are packed from the top. Name/values grow from the
+ * bottom but are not packed. The freemap contains run-length-encoded entries
+ * for the free bytes after the leaf_entry's, but only the N largest such,
+ * smaller runs are dropped. When the freemap doesn't show enough space
+ * for an allocation, we compact the name/value area and try again. If we
+ * still don't have enough space, then we have to split the block. The
+ * name/value structs (both local and remote versions) must be 32bit aligned.
+ *
+ * Since we have duplicate hash keys, for each key that matches, compare
+ * the actual name string. The root and intermediate node search always
+ * takes the first-in-the-block key match found, so we should only have
+ * to work "forw"ard. If none matches, continue with the "forw"ard leaf
+ * nodes until the hash key changes or the attribute name is found.
+ *
+ * We store the fact that an attribute is a ROOT/USER/SECURE attribute in
+ * the leaf_entry. The namespaces are independent only because we also look
+ * at the namespace bit when we are looking for a matching attribute name.
+ *
+ * We also store an "incomplete" bit in the leaf_entry. It shows that an
+ * attribute is in the middle of being created and should not be shown to
+ * the user if we crash during the time that the bit is set. We clear the
+ * bit when we have finished setting up the attribute. We do this because
+ * we cannot create some large attributes inside a single transaction, and we
+ * need some indication that we weren't finished if we crash in the middle.
+ */
+#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */
+
+typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
+ __be16 base; /* base of free region */
+ __be16 size; /* length of free region */
+} xfs_attr_leaf_map_t;
+
+typedef struct xfs_attr_leaf_hdr { /* constant-structure header block */
+ xfs_da_blkinfo_t info; /* block type, links, etc. */
+ __be16 count; /* count of active leaf_entry's */
+ __be16 usedbytes; /* num bytes of names/values stored */
+ __be16 firstused; /* first used byte in name area */
+ __u8 holes; /* != 0 if blk needs compaction */
+ __u8 pad1;
+ xfs_attr_leaf_map_t freemap[XFS_ATTR_LEAF_MAPSIZE];
+ /* N largest free regions */
+} xfs_attr_leaf_hdr_t;
+
+typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */
+ __be32 hashval; /* hash value of name */
+ __be16 nameidx; /* index into buffer of name/value */
+ __u8 flags; /* LOCAL/ROOT/SECURE/INCOMPLETE flag */
+ __u8 pad2; /* unused pad byte */
+} xfs_attr_leaf_entry_t;
+
+typedef struct xfs_attr_leaf_name_local {
+ __be16 valuelen; /* number of bytes in value */
+ __u8 namelen; /* length of name bytes */
+ __u8 nameval[1]; /* name/value bytes */
+} xfs_attr_leaf_name_local_t;
+
+typedef struct xfs_attr_leaf_name_remote {
+ __be32 valueblk; /* block number of value bytes */
+ __be32 valuelen; /* number of bytes in value */
+ __u8 namelen; /* length of name bytes */
+ __u8 name[1]; /* name bytes */
+} xfs_attr_leaf_name_remote_t;
+
+typedef struct xfs_attr_leafblock {
+ xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */
+ xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */
+ xfs_attr_leaf_name_local_t namelist; /* grows from bottom of buf */
+ xfs_attr_leaf_name_remote_t valuelist; /* grows from bottom of buf */
+} xfs_attr_leafblock_t;
+
+/*
+ * CRC enabled leaf structures. Called "version 3" structures to match the
+ * version number of the directory and dablk structures for this feature, and
+ * attr2 is already taken by the variable inode attribute fork size feature.
+ */
+struct xfs_attr3_leaf_hdr {
+ struct xfs_da3_blkinfo info;
+ __be16 count;
+ __be16 usedbytes;
+ __be16 firstused;
+ __u8 holes;
+ __u8 pad1;
+ struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE];
+ __be32 pad2; /* 64 bit alignment */
+};
+
+#define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc))
+
+struct xfs_attr3_leafblock {
+ struct xfs_attr3_leaf_hdr hdr;
+ struct xfs_attr_leaf_entry entries[1];
+
+ /*
+ * The rest of the block contains the following structures after the
+ * leaf entries, growing from the bottom up. The variables are never
+ * referenced, the locations accessed purely from helper functions.
+ *
+ * struct xfs_attr_leaf_name_local
+ * struct xfs_attr_leaf_name_remote
+ */
+};
+
+/*
+ * incore, neutral version of the attribute leaf header
+ */
+struct xfs_attr3_icleaf_hdr {
+ __uint32_t forw;
+ __uint32_t back;
+ __uint16_t magic;
+ __uint16_t count;
+ __uint16_t usedbytes;
+ __uint16_t firstused;
+ __u8 holes;
+ struct {
+ __uint16_t base;
+ __uint16_t size;
+ } freemap[XFS_ATTR_LEAF_MAPSIZE];
+};
+
+/*
+ * Flags used in the leaf_entry[i].flags field.
+ * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
+ * on the system call, they are "or"ed together for various operations.
*/
-static inline struct xfs_dir2_leaf_entry *
-xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
+#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */
+#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
+#define XFS_ATTR_SECURE_BIT 2 /* limit access to secure attrs */
+#define XFS_ATTR_INCOMPLETE_BIT 7 /* attr in middle of create/delete */
+#define XFS_ATTR_LOCAL (1 << XFS_ATTR_LOCAL_BIT)
+#define XFS_ATTR_ROOT (1 << XFS_ATTR_ROOT_BIT)
+#define XFS_ATTR_SECURE (1 << XFS_ATTR_SECURE_BIT)
+#define XFS_ATTR_INCOMPLETE (1 << XFS_ATTR_INCOMPLETE_BIT)
+
+/*
+ * Conversion macros for converting namespace bits from argument flags
+ * to ondisk flags.
+ */
+#define XFS_ATTR_NSP_ARGS_MASK (ATTR_ROOT | ATTR_SECURE)
+#define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE)
+#define XFS_ATTR_NSP_ONDISK(flags) ((flags) & XFS_ATTR_NSP_ONDISK_MASK)
+#define XFS_ATTR_NSP_ARGS(flags) ((flags) & XFS_ATTR_NSP_ARGS_MASK)
+#define XFS_ATTR_NSP_ARGS_TO_ONDISK(x) (((x) & ATTR_ROOT ? XFS_ATTR_ROOT : 0) |\
+ ((x) & ATTR_SECURE ? XFS_ATTR_SECURE : 0))
+#define XFS_ATTR_NSP_ONDISK_TO_ARGS(x) (((x) & XFS_ATTR_ROOT ? ATTR_ROOT : 0) |\
+ ((x) & XFS_ATTR_SECURE ? ATTR_SECURE : 0))
+
+/*
+ * Alignment for namelist and valuelist entries (since they are mixed
+ * there can be only one alignment value)
+ */
+#define XFS_ATTR_LEAF_NAME_ALIGN ((uint)sizeof(xfs_dablk_t))
+
+static inline int
+xfs_attr3_leaf_hdr_size(struct xfs_attr_leafblock *leafp)
{
- return ((struct xfs_dir2_leaf_entry *)btp) - be32_to_cpu(btp->count);
+ if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+ return sizeof(struct xfs_attr3_leaf_hdr);
+ return sizeof(struct xfs_attr_leaf_hdr);
+}
+
+static inline struct xfs_attr_leaf_entry *
+xfs_attr3_leaf_entryp(xfs_attr_leafblock_t *leafp)
+{
+ if (leafp->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC))
+ return &((struct xfs_attr3_leafblock *)leafp)->entries[0];
+ return &leafp->entries[0];
}
-#endif /* __XFS_DIR2_FORMAT_H__ */
+/*
+ * Cast typed pointers for "local" and "remote" name/value structs.
+ */
+static inline char *
+xfs_attr3_leaf_name(xfs_attr_leafblock_t *leafp, int idx)
+{
+ struct xfs_attr_leaf_entry *entries = xfs_attr3_leaf_entryp(leafp);
+
+ return &((char *)leafp)[be16_to_cpu(entries[idx].nameidx)];
+}
+
+static inline xfs_attr_leaf_name_remote_t *
+xfs_attr3_leaf_name_remote(xfs_attr_leafblock_t *leafp, int idx)
+{
+ return (xfs_attr_leaf_name_remote_t *)xfs_attr3_leaf_name(leafp, idx);
+}
+
+static inline xfs_attr_leaf_name_local_t *
+xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx)
+{
+ return (xfs_attr_leaf_name_local_t *)xfs_attr3_leaf_name(leafp, idx);
+}
+
+/*
+ * Calculate total bytes used (including trailing pad for alignment) for
+ * a "local" name/value structure, a "remote" name/value structure, and
+ * a pointer which might be either.
+ */
+static inline int xfs_attr_leaf_entsize_remote(int nlen)
+{
+ return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \
+ XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+}
+
+static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen)
+{
+ return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) +
+ XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1);
+}
+
+static inline int xfs_attr_leaf_entsize_local_max(int bsize)
+{
+ return (((bsize) >> 1) + ((bsize) >> 2));
+}
+
+
+
+/*
+ * Remote attribute block format definition
+ *
+ * There is one of these headers per filesystem block in a remote attribute.
+ * This is done to ensure there is a 1:1 mapping between the attribute value
+ * length and the number of blocks needed to store the attribute. This makes the
+ * verification of a buffer a little more complex, but greatly simplifies the
+ * allocation, reading and writing of these attributes as we don't have to guess
+ * the number of blocks needed to store the attribute data.
+ */
+#define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */
+
+struct xfs_attr3_rmt_hdr {
+ __be32 rm_magic;
+ __be32 rm_offset;
+ __be32 rm_bytes;
+ __be32 rm_crc;
+ uuid_t rm_uuid;
+ __be64 rm_owner;
+ __be64 rm_blkno;
+ __be64 rm_lsn;
+};
+
+#define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc)
+
+#define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \
+ ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
+ sizeof(struct xfs_attr3_rmt_hdr) : 0))
+
+#endif /* __XFS_DA_FORMAT_H__ */
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index e5869b50dc4..623bbe8fd92 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -89,6 +89,8 @@ typedef struct xfs_dinode {
/* structure must be padded to 64 bit alignment */
} xfs_dinode_t;
+#define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc)
+
#define DI_MAX_FLUSH 0xffff
/*
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index edf203ab50a..79670cda48a 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -17,25 +17,24 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
-#include "xfs_dir2_format.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_dinode.h"
struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
@@ -86,33 +85,74 @@ static struct xfs_nameops xfs_ascii_ci_nameops = {
.compname = xfs_ascii_ci_compname,
};
-void
-xfs_dir_mount(
- xfs_mount_t *mp)
+int
+xfs_da_mount(
+ struct xfs_mount *mp)
{
- int nodehdr_size;
+ struct xfs_da_geometry *dageo;
+ int nodehdr_size;
- ASSERT(xfs_sb_version_hasdirv2(&mp->m_sb));
+ ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
ASSERT((1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog)) <=
XFS_MAX_BLOCKSIZE);
- mp->m_dirblksize = 1 << (mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog);
- mp->m_dirblkfsbs = 1 << mp->m_sb.sb_dirblklog;
- mp->m_dirdatablk = xfs_dir2_db_to_da(mp, XFS_DIR2_DATA_FIRSTDB(mp));
- mp->m_dirleafblk = xfs_dir2_db_to_da(mp, XFS_DIR2_LEAF_FIRSTDB(mp));
- mp->m_dirfreeblk = xfs_dir2_db_to_da(mp, XFS_DIR2_FREE_FIRSTDB(mp));
-
- nodehdr_size = __xfs_da3_node_hdr_size(xfs_sb_version_hascrc(&mp->m_sb));
- mp->m_attr_node_ents = (mp->m_sb.sb_blocksize - nodehdr_size) /
+
+ mp->m_dir_inode_ops = xfs_dir_get_ops(mp, NULL);
+ mp->m_nondir_inode_ops = xfs_nondir_get_ops(mp, NULL);
+
+ nodehdr_size = mp->m_dir_inode_ops->node_hdr_size;
+ mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
+ KM_SLEEP | KM_MAYFAIL);
+ mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
+ KM_SLEEP | KM_MAYFAIL);
+ if (!mp->m_dir_geo || !mp->m_attr_geo) {
+ kmem_free(mp->m_dir_geo);
+ kmem_free(mp->m_attr_geo);
+ return ENOMEM;
+ }
+
+ /* set up directory geometry */
+ dageo = mp->m_dir_geo;
+ dageo->blklog = mp->m_sb.sb_blocklog + mp->m_sb.sb_dirblklog;
+ dageo->fsblog = mp->m_sb.sb_blocklog;
+ dageo->blksize = 1 << dageo->blklog;
+ dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog;
+
+ /*
+ * Now we've set up the block conversion variables, we can calculate the
+ * segment block constants using the geometry structure.
+ */
+ dageo->datablk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_DATA_OFFSET);
+ dageo->leafblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_LEAF_OFFSET);
+ dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
+ dageo->node_ents = (dageo->blksize - nodehdr_size) /
(uint)sizeof(xfs_da_node_entry_t);
- mp->m_dir_node_ents = (mp->m_dirblksize - nodehdr_size) /
+ dageo->magicpct = (dageo->blksize * 37) / 100;
+
+ /* set up attribute geometry - single fsb only */
+ dageo = mp->m_attr_geo;
+ dageo->blklog = mp->m_sb.sb_blocklog;
+ dageo->fsblog = mp->m_sb.sb_blocklog;
+ dageo->blksize = 1 << dageo->blklog;
+ dageo->fsbcount = 1;
+ dageo->node_ents = (dageo->blksize - nodehdr_size) /
(uint)sizeof(xfs_da_node_entry_t);
+ dageo->magicpct = (dageo->blksize * 37) / 100;
- mp->m_dir_magicpct = (mp->m_dirblksize * 37) / 100;
if (xfs_sb_version_hasasciici(&mp->m_sb))
mp->m_dirnameops = &xfs_ascii_ci_nameops;
else
mp->m_dirnameops = &xfs_default_nameops;
+
+ return 0;
+}
+
+void
+xfs_da_unmount(
+ struct xfs_mount *mp)
+{
+ kmem_free(mp->m_dir_geo);
+ kmem_free(mp->m_attr_geo);
}
/*
@@ -176,16 +216,24 @@ xfs_dir_init(
xfs_inode_t *dp,
xfs_inode_t *pdp)
{
- xfs_da_args_t args;
+ struct xfs_da_args *args;
int error;
- memset((char *)&args, 0, sizeof(args));
- args.dp = dp;
- args.trans = tp;
ASSERT(S_ISDIR(dp->i_d.di_mode));
- if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino)))
+ error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino);
+ if (error)
return error;
- return xfs_dir2_sf_create(&args, pdp->i_ino);
+
+ args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ if (!args)
+ return ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->dp = dp;
+ args->trans = tp;
+ error = xfs_dir2_sf_create(args, pdp->i_ino);
+ kmem_free(args);
+ return error;
}
/*
@@ -201,41 +249,57 @@ xfs_dir_createname(
xfs_bmap_free_t *flist, /* bmap's freeblock list */
xfs_extlen_t total) /* bmap's total block count */
{
- xfs_da_args_t args;
+ struct xfs_da_args *args;
int rval;
int v; /* type-checking value */
ASSERT(S_ISDIR(dp->i_d.di_mode));
- if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
+ rval = xfs_dir_ino_validate(tp->t_mountp, inum);
+ if (rval)
return rval;
XFS_STATS_INC(xs_dir_create);
- memset(&args, 0, sizeof(xfs_da_args_t));
- args.name = name->name;
- args.namelen = name->len;
- args.filetype = name->type;
- args.hashval = dp->i_mount->m_dirnameops->hashname(name);
- args.inumber = inum;
- args.dp = dp;
- args.firstblock = first;
- args.flist = flist;
- args.total = total;
- args.whichfork = XFS_DATA_FORK;
- args.trans = tp;
- args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
-
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_addname(&args);
- else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
- return rval;
- else if (v)
- rval = xfs_dir2_block_addname(&args);
- else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
- return rval;
- else if (v)
- rval = xfs_dir2_leaf_addname(&args);
+ args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ if (!args)
+ return ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->inumber = inum;
+ args->dp = dp;
+ args->firstblock = first;
+ args->flist = flist;
+ args->total = total;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+ args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
+
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_addname(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_addname(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_addname(args);
else
- rval = xfs_dir2_node_addname(&args);
+ rval = xfs_dir2_node_addname(args);
+
+out_free:
+ kmem_free(args);
return rval;
}
@@ -278,46 +342,67 @@ xfs_dir_lookup(
xfs_ino_t *inum, /* out: inode number */
struct xfs_name *ci_name) /* out: actual name if CI match */
{
- xfs_da_args_t args;
+ struct xfs_da_args *args;
int rval;
int v; /* type-checking value */
ASSERT(S_ISDIR(dp->i_d.di_mode));
XFS_STATS_INC(xs_dir_lookup);
- memset(&args, 0, sizeof(xfs_da_args_t));
- args.name = name->name;
- args.namelen = name->len;
- args.filetype = name->type;
- args.hashval = dp->i_mount->m_dirnameops->hashname(name);
- args.dp = dp;
- args.whichfork = XFS_DATA_FORK;
- args.trans = tp;
- args.op_flags = XFS_DA_OP_OKNOENT;
+ /*
+ * We need to use KM_NOFS here so that lockdep will not throw false
+ * positive deadlock warnings on a non-transactional lookup path. It is
+ * safe to recurse into inode recalim in that case, but lockdep can't
+ * easily be taught about it. Hence KM_NOFS avoids having to add more
+ * lockdep Doing this avoids having to add a bunch of lockdep class
+ * annotations into the reclaim path for the ilock.
+ */
+ args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->dp = dp;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+ args->op_flags = XFS_DA_OP_OKNOENT;
if (ci_name)
- args.op_flags |= XFS_DA_OP_CILOOKUP;
+ args->op_flags |= XFS_DA_OP_CILOOKUP;
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_lookup(&args);
- else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
- return rval;
- else if (v)
- rval = xfs_dir2_block_lookup(&args);
- else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
- return rval;
- else if (v)
- rval = xfs_dir2_leaf_lookup(&args);
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_lookup(args);
+ goto out_check_rval;
+ }
+
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_lookup(args);
+ goto out_check_rval;
+ }
+
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_lookup(args);
else
- rval = xfs_dir2_node_lookup(&args);
+ rval = xfs_dir2_node_lookup(args);
+
+out_check_rval:
if (rval == EEXIST)
rval = 0;
if (!rval) {
- *inum = args.inumber;
+ *inum = args->inumber;
if (ci_name) {
- ci_name->name = args.value;
- ci_name->len = args.valuelen;
+ ci_name->name = args->value;
+ ci_name->len = args->valuelen;
}
}
+out_free:
+ kmem_free(args);
return rval;
}
@@ -334,38 +419,52 @@ xfs_dir_removename(
xfs_bmap_free_t *flist, /* bmap's freeblock list */
xfs_extlen_t total) /* bmap's total block count */
{
- xfs_da_args_t args;
+ struct xfs_da_args *args;
int rval;
int v; /* type-checking value */
ASSERT(S_ISDIR(dp->i_d.di_mode));
XFS_STATS_INC(xs_dir_remove);
- memset(&args, 0, sizeof(xfs_da_args_t));
- args.name = name->name;
- args.namelen = name->len;
- args.filetype = name->type;
- args.hashval = dp->i_mount->m_dirnameops->hashname(name);
- args.inumber = ino;
- args.dp = dp;
- args.firstblock = first;
- args.flist = flist;
- args.total = total;
- args.whichfork = XFS_DATA_FORK;
- args.trans = tp;
-
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_removename(&args);
- else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
- return rval;
- else if (v)
- rval = xfs_dir2_block_removename(&args);
- else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
- return rval;
- else if (v)
- rval = xfs_dir2_leaf_removename(&args);
+ args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ if (!args)
+ return ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->inumber = ino;
+ args->dp = dp;
+ args->firstblock = first;
+ args->flist = flist;
+ args->total = total;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_removename(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_removename(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_removename(args);
else
- rval = xfs_dir2_node_removename(&args);
+ rval = xfs_dir2_node_removename(args);
+out_free:
+ kmem_free(args);
return rval;
}
@@ -382,40 +481,55 @@ xfs_dir_replace(
xfs_bmap_free_t *flist, /* bmap's freeblock list */
xfs_extlen_t total) /* bmap's total block count */
{
- xfs_da_args_t args;
+ struct xfs_da_args *args;
int rval;
int v; /* type-checking value */
ASSERT(S_ISDIR(dp->i_d.di_mode));
- if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
+ rval = xfs_dir_ino_validate(tp->t_mountp, inum);
+ if (rval)
return rval;
- memset(&args, 0, sizeof(xfs_da_args_t));
- args.name = name->name;
- args.namelen = name->len;
- args.filetype = name->type;
- args.hashval = dp->i_mount->m_dirnameops->hashname(name);
- args.inumber = inum;
- args.dp = dp;
- args.firstblock = first;
- args.flist = flist;
- args.total = total;
- args.whichfork = XFS_DATA_FORK;
- args.trans = tp;
-
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_replace(&args);
- else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
- return rval;
- else if (v)
- rval = xfs_dir2_block_replace(&args);
- else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
- return rval;
- else if (v)
- rval = xfs_dir2_leaf_replace(&args);
+ args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ if (!args)
+ return ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->inumber = inum;
+ args->dp = dp;
+ args->firstblock = first;
+ args->flist = flist;
+ args->total = total;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_replace(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_replace(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_replace(args);
else
- rval = xfs_dir2_node_replace(&args);
+ rval = xfs_dir2_node_replace(args);
+out_free:
+ kmem_free(args);
return rval;
}
@@ -430,7 +544,7 @@ xfs_dir_canenter(
struct xfs_name *name, /* name of entry to add */
uint resblks)
{
- xfs_da_args_t args;
+ struct xfs_da_args *args;
int rval;
int v; /* type-checking value */
@@ -439,29 +553,43 @@ xfs_dir_canenter(
ASSERT(S_ISDIR(dp->i_d.di_mode));
- memset(&args, 0, sizeof(xfs_da_args_t));
- args.name = name->name;
- args.namelen = name->len;
- args.filetype = name->type;
- args.hashval = dp->i_mount->m_dirnameops->hashname(name);
- args.dp = dp;
- args.whichfork = XFS_DATA_FORK;
- args.trans = tp;
- args.op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
+ args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
+ if (!args)
+ return ENOMEM;
+
+ args->geo = dp->i_mount->m_dir_geo;
+ args->name = name->name;
+ args->namelen = name->len;
+ args->filetype = name->type;
+ args->hashval = dp->i_mount->m_dirnameops->hashname(name);
+ args->dp = dp;
+ args->whichfork = XFS_DATA_FORK;
+ args->trans = tp;
+ args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
XFS_DA_OP_OKNOENT;
- if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_addname(&args);
- else if ((rval = xfs_dir2_isblock(tp, dp, &v)))
- return rval;
- else if (v)
- rval = xfs_dir2_block_addname(&args);
- else if ((rval = xfs_dir2_isleaf(tp, dp, &v)))
- return rval;
- else if (v)
- rval = xfs_dir2_leaf_addname(&args);
+ if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
+ rval = xfs_dir2_sf_addname(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isblock(args, &v);
+ if (rval)
+ goto out_free;
+ if (v) {
+ rval = xfs_dir2_block_addname(args);
+ goto out_free;
+ }
+
+ rval = xfs_dir2_isleaf(args, &v);
+ if (rval)
+ goto out_free;
+ if (v)
+ rval = xfs_dir2_leaf_addname(args);
else
- rval = xfs_dir2_node_addname(&args);
+ rval = xfs_dir2_node_addname(args);
+out_free:
+ kmem_free(args);
return rval;
}
@@ -493,13 +621,13 @@ xfs_dir2_grow_inode(
* Set lowest possible block in the space requested.
*/
bno = XFS_B_TO_FSBT(mp, space * XFS_DIR2_SPACE_SIZE);
- count = mp->m_dirblkfsbs;
+ count = args->geo->fsbcount;
error = xfs_da_grow_inode_int(args, &bno, count);
if (error)
return error;
- *dbp = xfs_dir2_da_to_db(mp, (xfs_dablk_t)bno);
+ *dbp = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)bno);
/*
* Update file's size if this is the data space and it grew.
@@ -521,19 +649,16 @@ xfs_dir2_grow_inode(
*/
int
xfs_dir2_isblock(
- xfs_trans_t *tp,
- xfs_inode_t *dp,
- int *vp) /* out: 1 is block, 0 is not block */
+ struct xfs_da_args *args,
+ int *vp) /* out: 1 is block, 0 is not block */
{
- xfs_fileoff_t last; /* last file offset */
- xfs_mount_t *mp;
- int rval;
+ xfs_fileoff_t last; /* last file offset */
+ int rval;
- mp = dp->i_mount;
- if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK)))
+ if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
return rval;
- rval = XFS_FSB_TO_B(mp, last) == mp->m_dirblksize;
- ASSERT(rval == 0 || dp->i_d.di_size == mp->m_dirblksize);
+ rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize;
+ ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize);
*vp = rval;
return 0;
}
@@ -543,18 +668,15 @@ xfs_dir2_isblock(
*/
int
xfs_dir2_isleaf(
- xfs_trans_t *tp,
- xfs_inode_t *dp,
- int *vp) /* out: 1 is leaf, 0 is not leaf */
+ struct xfs_da_args *args,
+ int *vp) /* out: 1 is block, 0 is not block */
{
- xfs_fileoff_t last; /* last file offset */
- xfs_mount_t *mp;
- int rval;
+ xfs_fileoff_t last; /* last file offset */
+ int rval;
- mp = dp->i_mount;
- if ((rval = xfs_bmap_last_offset(tp, dp, &last, XFS_DATA_FORK)))
+ if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
return rval;
- *vp = last == mp->m_dirleafblk + (1 << mp->m_sb.sb_dirblklog);
+ *vp = last == args->geo->leafblk + args->geo->fsbcount;
return 0;
}
@@ -582,11 +704,11 @@ xfs_dir2_shrink_inode(
dp = args->dp;
mp = dp->i_mount;
tp = args->trans;
- da = xfs_dir2_db_to_da(mp, db);
+ da = xfs_dir2_db_to_da(args->geo, db);
/*
* Unmap the fsblock(s).
*/
- if ((error = xfs_bunmapi(tp, dp, da, mp->m_dirblkfsbs,
+ if ((error = xfs_bunmapi(tp, dp, da, args->geo->fsbcount,
XFS_BMAPI_METADATA, 0, args->firstblock, args->flist,
&done))) {
/*
@@ -613,12 +735,12 @@ xfs_dir2_shrink_inode(
/*
* If it's not a data block, we're done.
*/
- if (db >= XFS_DIR2_LEAF_FIRSTDB(mp))
+ if (db >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET))
return 0;
/*
* If the block isn't the last one in the directory, we're done.
*/
- if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(mp, db + 1, 0))
+ if (dp->i_d.di_size > xfs_dir2_db_off_to_byte(args->geo, db + 1, 0))
return 0;
bno = da;
if ((error = xfs_bmap_last_before(tp, dp, &bno, XFS_DATA_FORK))) {
@@ -627,7 +749,7 @@ xfs_dir2_shrink_inode(
*/
return error;
}
- if (db == mp->m_dirdatablk)
+ if (db == args->geo->datablk)
ASSERT(bno == 0);
else
ASSERT(bno > 0);
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index 9910401327d..c8e86b0b5e9 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -32,10 +32,91 @@ struct xfs_dir2_data_unused;
extern struct xfs_name xfs_name_dotdot;
/*
+ * directory operations vector for encode/decode routines
+ */
+struct xfs_dir_ops {
+ int (*sf_entsize)(struct xfs_dir2_sf_hdr *hdr, int len);
+ struct xfs_dir2_sf_entry *
+ (*sf_nextentry)(struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep);
+ __uint8_t (*sf_get_ftype)(struct xfs_dir2_sf_entry *sfep);
+ void (*sf_put_ftype)(struct xfs_dir2_sf_entry *sfep,
+ __uint8_t ftype);
+ xfs_ino_t (*sf_get_ino)(struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep);
+ void (*sf_put_ino)(struct xfs_dir2_sf_hdr *hdr,
+ struct xfs_dir2_sf_entry *sfep,
+ xfs_ino_t ino);
+ xfs_ino_t (*sf_get_parent_ino)(struct xfs_dir2_sf_hdr *hdr);
+ void (*sf_put_parent_ino)(struct xfs_dir2_sf_hdr *hdr,
+ xfs_ino_t ino);
+
+ int (*data_entsize)(int len);
+ __uint8_t (*data_get_ftype)(struct xfs_dir2_data_entry *dep);
+ void (*data_put_ftype)(struct xfs_dir2_data_entry *dep,
+ __uint8_t ftype);
+ __be16 * (*data_entry_tag_p)(struct xfs_dir2_data_entry *dep);
+ struct xfs_dir2_data_free *
+ (*data_bestfree_p)(struct xfs_dir2_data_hdr *hdr);
+
+ xfs_dir2_data_aoff_t data_dot_offset;
+ xfs_dir2_data_aoff_t data_dotdot_offset;
+ xfs_dir2_data_aoff_t data_first_offset;
+ size_t data_entry_offset;
+
+ struct xfs_dir2_data_entry *
+ (*data_dot_entry_p)(struct xfs_dir2_data_hdr *hdr);
+ struct xfs_dir2_data_entry *
+ (*data_dotdot_entry_p)(struct xfs_dir2_data_hdr *hdr);
+ struct xfs_dir2_data_entry *
+ (*data_first_entry_p)(struct xfs_dir2_data_hdr *hdr);
+ struct xfs_dir2_data_entry *
+ (*data_entry_p)(struct xfs_dir2_data_hdr *hdr);
+ struct xfs_dir2_data_unused *
+ (*data_unused_p)(struct xfs_dir2_data_hdr *hdr);
+
+ int leaf_hdr_size;
+ void (*leaf_hdr_to_disk)(struct xfs_dir2_leaf *to,
+ struct xfs_dir3_icleaf_hdr *from);
+ void (*leaf_hdr_from_disk)(struct xfs_dir3_icleaf_hdr *to,
+ struct xfs_dir2_leaf *from);
+ int (*leaf_max_ents)(struct xfs_da_geometry *geo);
+ struct xfs_dir2_leaf_entry *
+ (*leaf_ents_p)(struct xfs_dir2_leaf *lp);
+
+ int node_hdr_size;
+ void (*node_hdr_to_disk)(struct xfs_da_intnode *to,
+ struct xfs_da3_icnode_hdr *from);
+ void (*node_hdr_from_disk)(struct xfs_da3_icnode_hdr *to,
+ struct xfs_da_intnode *from);
+ struct xfs_da_node_entry *
+ (*node_tree_p)(struct xfs_da_intnode *dap);
+
+ int free_hdr_size;
+ void (*free_hdr_to_disk)(struct xfs_dir2_free *to,
+ struct xfs_dir3_icfree_hdr *from);
+ void (*free_hdr_from_disk)(struct xfs_dir3_icfree_hdr *to,
+ struct xfs_dir2_free *from);
+ int (*free_max_bests)(struct xfs_da_geometry *geo);
+ __be16 * (*free_bests_p)(struct xfs_dir2_free *free);
+ xfs_dir2_db_t (*db_to_fdb)(struct xfs_da_geometry *geo,
+ xfs_dir2_db_t db);
+ int (*db_to_fdindex)(struct xfs_da_geometry *geo,
+ xfs_dir2_db_t db);
+};
+
+extern const struct xfs_dir_ops *
+ xfs_dir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
+extern const struct xfs_dir_ops *
+ xfs_nondir_get_ops(struct xfs_mount *mp, struct xfs_inode *dp);
+
+/*
* Generic directory interface routines
*/
extern void xfs_dir_startup(void);
-extern void xfs_dir_mount(struct xfs_mount *mp);
+extern int xfs_da_mount(struct xfs_mount *mp);
+extern void xfs_da_unmount(struct xfs_mount *mp);
+
extern int xfs_dir_isempty(struct xfs_inode *dp);
extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_inode *pdp);
@@ -65,37 +146,30 @@ extern int xfs_dir2_sf_to_block(struct xfs_da_args *args);
/*
* Interface routines used by userspace utilities
*/
-extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp);
-extern void xfs_dir2_sf_put_parent_ino(struct xfs_dir2_sf_hdr *sfp,
- xfs_ino_t ino);
-extern xfs_ino_t xfs_dir3_sfe_get_ino(struct xfs_mount *mp,
- struct xfs_dir2_sf_hdr *sfp, struct xfs_dir2_sf_entry *sfep);
-extern void xfs_dir3_sfe_put_ino(struct xfs_mount *mp,
- struct xfs_dir2_sf_hdr *hdr, struct xfs_dir2_sf_entry *sfep,
- xfs_ino_t ino);
-
-extern int xfs_dir2_isblock(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
-extern int xfs_dir2_isleaf(struct xfs_trans *tp, struct xfs_inode *dp, int *r);
+extern int xfs_dir2_isblock(struct xfs_da_args *args, int *r);
+extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r);
extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db,
struct xfs_buf *bp);
-extern void xfs_dir2_data_freescan(struct xfs_mount *mp,
+extern void xfs_dir2_data_freescan(struct xfs_inode *dp,
struct xfs_dir2_data_hdr *hdr, int *loghead);
-extern void xfs_dir2_data_log_entry(struct xfs_trans *tp, struct xfs_buf *bp,
- struct xfs_dir2_data_entry *dep);
-extern void xfs_dir2_data_log_header(struct xfs_trans *tp,
+extern void xfs_dir2_data_log_entry(struct xfs_da_args *args,
+ struct xfs_buf *bp, struct xfs_dir2_data_entry *dep);
+extern void xfs_dir2_data_log_header(struct xfs_da_args *args,
struct xfs_buf *bp);
-extern void xfs_dir2_data_log_unused(struct xfs_trans *tp, struct xfs_buf *bp,
- struct xfs_dir2_data_unused *dup);
-extern void xfs_dir2_data_make_free(struct xfs_trans *tp, struct xfs_buf *bp,
+extern void xfs_dir2_data_log_unused(struct xfs_da_args *args,
+ struct xfs_buf *bp, struct xfs_dir2_data_unused *dup);
+extern void xfs_dir2_data_make_free(struct xfs_da_args *args,
+ struct xfs_buf *bp, xfs_dir2_data_aoff_t offset,
+ xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
+extern void xfs_dir2_data_use_free(struct xfs_da_args *args,
+ struct xfs_buf *bp, struct xfs_dir2_data_unused *dup,
xfs_dir2_data_aoff_t offset, xfs_dir2_data_aoff_t len,
int *needlogp, int *needscanp);
-extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
- struct xfs_dir2_data_unused *dup, xfs_dir2_data_aoff_t offset,
- xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
extern struct xfs_dir2_data_free *xfs_dir2_data_freefind(
- struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_unused *dup);
+ struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf,
+ struct xfs_dir2_data_unused *dup);
extern const struct xfs_buf_ops xfs_dir3_block_buf_ops;
extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops;
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 12dad188939..c7cd3154026 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -18,25 +18,25 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_buf_item.h"
-#include "xfs_dir2_format.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
+#include "xfs_dinode.h"
/*
* Local function prototypes.
@@ -89,13 +89,14 @@ xfs_dir3_block_read_verify(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- if ((xfs_sb_version_hascrc(&mp->m_sb) &&
- !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
- XFS_DIR3_DATA_CRC_OFF)) ||
- !xfs_dir3_block_verify(bp)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_dir3_block_verify(bp))
xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
}
static void
@@ -107,8 +108,8 @@ xfs_dir3_block_write_verify(
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
if (!xfs_dir3_block_verify(bp)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
return;
}
@@ -118,7 +119,7 @@ xfs_dir3_block_write_verify(
if (bip)
hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
- xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF);
+ xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
}
const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
@@ -135,7 +136,7 @@ xfs_dir3_block_read(
struct xfs_mount *mp = dp->i_mount;
int err;
- err = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp,
+ err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp,
XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
if (!err && tp)
xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
@@ -168,6 +169,7 @@ xfs_dir3_block_init(
static void
xfs_dir2_block_need_space(
+ struct xfs_inode *dp,
struct xfs_dir2_data_hdr *hdr,
struct xfs_dir2_block_tail *btp,
struct xfs_dir2_leaf_entry *blp,
@@ -183,7 +185,7 @@ xfs_dir2_block_need_space(
struct xfs_dir2_data_unused *enddup = NULL;
*compact = 0;
- bf = xfs_dir3_data_bestfree_p(hdr);
+ bf = dp->d_ops->data_bestfree_p(hdr);
/*
* If there are stale entries we'll use one for the leaf.
@@ -279,7 +281,7 @@ out:
*/
static void
xfs_dir2_block_compact(
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp,
struct xfs_dir2_data_hdr *hdr,
struct xfs_dir2_block_tail *btp,
@@ -312,18 +314,17 @@ xfs_dir2_block_compact(
*lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
*lfloghigh -= be32_to_cpu(btp->stale) - 1;
be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
- xfs_dir2_data_make_free(tp, bp,
+ xfs_dir2_data_make_free(args, bp,
(xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
(xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
needlog, &needscan);
- blp += be32_to_cpu(btp->stale) - 1;
btp->stale = cpu_to_be32(1);
/*
* If we now need to rebuild the bestfree map, do so.
* This needs to happen before the next call to use_free.
*/
if (needscan)
- xfs_dir2_data_freescan(tp->t_mountp, hdr, needlog);
+ xfs_dir2_data_freescan(args->dp, hdr, needlog);
}
/*
@@ -369,20 +370,20 @@ xfs_dir2_block_addname(
if (error)
return error;
- len = xfs_dir3_data_entsize(mp, args->namelen);
+ len = dp->d_ops->data_entsize(args->namelen);
/*
* Set up pointers to parts of the block.
*/
hdr = bp->b_addr;
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
* Find out if we can reuse stale entries or whether we need extra
* space for entry and new leaf.
*/
- xfs_dir2_block_need_space(hdr, btp, blp, &tagp, &dup,
+ xfs_dir2_block_need_space(dp, hdr, btp, blp, &tagp, &dup,
&enddup, &compact, len);
/*
@@ -418,7 +419,7 @@ xfs_dir2_block_addname(
* If need to compact the leaf entries, do it now.
*/
if (compact) {
- xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog,
+ xfs_dir2_block_compact(args, bp, hdr, btp, blp, &needlog,
&lfloghigh, &lfloglow);
/* recalculate blp post-compaction */
blp = xfs_dir2_block_leaf_p(btp);
@@ -453,7 +454,7 @@ xfs_dir2_block_addname(
/*
* Mark the space needed for the new leaf entry, now in use.
*/
- xfs_dir2_data_use_free(tp, bp, enddup,
+ xfs_dir2_data_use_free(args, bp, enddup,
(xfs_dir2_data_aoff_t)
((char *)enddup - (char *)hdr + be16_to_cpu(enddup->length) -
sizeof(*blp)),
@@ -468,7 +469,7 @@ xfs_dir2_block_addname(
* This needs to happen before the next call to use_free.
*/
if (needscan) {
- xfs_dir2_data_freescan(mp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
needscan = 0;
}
/*
@@ -534,13 +535,13 @@ xfs_dir2_block_addname(
* Fill in the leaf entry.
*/
blp[mid].hashval = cpu_to_be32(args->hashval);
- blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
+ blp[mid].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
(char *)dep - (char *)hdr));
xfs_dir2_block_log_leaf(tp, bp, lfloglow, lfloghigh);
/*
* Mark space for the data entry used.
*/
- xfs_dir2_data_use_free(tp, bp, dup,
+ xfs_dir2_data_use_free(args, bp, dup,
(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
(xfs_dir2_data_aoff_t)len, &needlog, &needscan);
/*
@@ -549,18 +550,18 @@ xfs_dir2_block_addname(
dep->inumber = cpu_to_be64(args->inumber);
dep->namelen = args->namelen;
memcpy(dep->name, args->name, args->namelen);
- xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
- tagp = xfs_dir3_data_entry_tag_p(mp, dep);
+ dp->d_ops->data_put_ftype(dep, args->filetype);
+ tagp = dp->d_ops->data_entry_tag_p(dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
/*
* Clean up the bestfree array and log the header, tail, and entry.
*/
if (needscan)
- xfs_dir2_data_freescan(mp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
if (needlog)
- xfs_dir2_data_log_header(tp, bp);
+ xfs_dir2_data_log_header(args, bp);
xfs_dir2_block_log_tail(tp, bp);
- xfs_dir2_data_log_entry(tp, bp, dep);
+ xfs_dir2_data_log_entry(args, bp, dep);
xfs_dir3_data_check(dp, bp);
return 0;
}
@@ -579,7 +580,7 @@ xfs_dir2_block_log_leaf(
xfs_dir2_leaf_entry_t *blp;
xfs_dir2_block_tail_t *btp;
- btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr);
+ btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
xfs_trans_log_buf(tp, bp, (uint)((char *)&blp[first] - (char *)hdr),
(uint)((char *)&blp[last + 1] - (char *)hdr - 1));
@@ -596,7 +597,7 @@ xfs_dir2_block_log_tail(
xfs_dir2_data_hdr_t *hdr = bp->b_addr;
xfs_dir2_block_tail_t *btp;
- btp = xfs_dir2_block_tail_p(tp->t_mountp, hdr);
+ btp = xfs_dir2_block_tail_p(tp->t_mountp->m_dir_geo, hdr);
xfs_trans_log_buf(tp, bp, (uint)((char *)btp - (char *)hdr),
(uint)((char *)(btp + 1) - (char *)hdr - 1));
}
@@ -631,18 +632,19 @@ xfs_dir2_block_lookup(
mp = dp->i_mount;
hdr = bp->b_addr;
xfs_dir3_data_check(dp, bp);
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
* Get the offset from the leaf entry, to point to the data.
*/
dep = (xfs_dir2_data_entry_t *)((char *)hdr +
- xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address)));
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(blp[ent].address)));
/*
* Fill in inode number, CI name if appropriate, release the block.
*/
args->inumber = be64_to_cpu(dep->inumber);
- args->filetype = xfs_dir3_dirent_get_ftype(mp, dep);
+ args->filetype = dp->d_ops->data_get_ftype(dep);
error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
xfs_trans_brelse(args->trans, bp);
return XFS_ERROR(error);
@@ -683,7 +685,7 @@ xfs_dir2_block_lookup_int(
hdr = bp->b_addr;
xfs_dir3_data_check(dp, bp);
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
* Loop doing a binary search for our hash value.
@@ -721,7 +723,7 @@ xfs_dir2_block_lookup_int(
* Get pointer to the entry from the leaf.
*/
dep = (xfs_dir2_data_entry_t *)
- ((char *)hdr + xfs_dir2_dataptr_to_off(mp, addr));
+ ((char *)hdr + xfs_dir2_dataptr_to_off(args->geo, addr));
/*
* Compare name and if it's an exact match, return the index
* and buffer. If it's the first case-insensitive match, store
@@ -788,20 +790,21 @@ xfs_dir2_block_removename(
tp = args->trans;
mp = dp->i_mount;
hdr = bp->b_addr;
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
* Point to the data entry using the leaf entry.
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address)));
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(blp[ent].address)));
/*
* Mark the data entry's space free.
*/
needlog = needscan = 0;
- xfs_dir2_data_make_free(tp, bp,
+ xfs_dir2_data_make_free(args, bp,
(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
- xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan);
+ dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
/*
* Fix up the block tail.
*/
@@ -816,9 +819,9 @@ xfs_dir2_block_removename(
* Fix up bestfree, log the header if necessary.
*/
if (needscan)
- xfs_dir2_data_freescan(mp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
if (needlog)
- xfs_dir2_data_log_header(tp, bp);
+ xfs_dir2_data_log_header(args, bp);
xfs_dir3_data_check(dp, bp);
/*
* See if the size as a shortform is good enough.
@@ -863,20 +866,21 @@ xfs_dir2_block_replace(
dp = args->dp;
mp = dp->i_mount;
hdr = bp->b_addr;
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
* Point to the data entry we need to change.
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(blp[ent].address)));
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(blp[ent].address)));
ASSERT(be64_to_cpu(dep->inumber) != args->inumber);
/*
* Change the inode number to the new value.
*/
dep->inumber = cpu_to_be64(args->inumber);
- xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
- xfs_dir2_data_log_entry(args->trans, bp, dep);
+ dp->d_ops->data_put_ftype(dep, args->filetype);
+ xfs_dir2_data_log_entry(args, bp, dep);
xfs_dir3_data_check(dp, bp);
return 0;
}
@@ -934,9 +938,9 @@ xfs_dir2_leaf_to_block(
tp = args->trans;
mp = dp->i_mount;
leaf = lbp->b_addr;
- xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
- ents = xfs_dir3_leaf_ents_p(leaf);
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
ASSERT(leafhdr.magic == XFS_DIR2_LEAF1_MAGIC ||
leafhdr.magic == XFS_DIR3_LEAF1_MAGIC);
@@ -946,13 +950,13 @@ xfs_dir2_leaf_to_block(
* been left behind during no-space-reservation operations.
* These will show up in the leaf bests table.
*/
- while (dp->i_d.di_size > mp->m_dirblksize) {
+ while (dp->i_d.di_size > args->geo->blksize) {
int hdrsz;
- hdrsz = xfs_dir3_data_hdr_size(xfs_sb_version_hascrc(&mp->m_sb));
+ hdrsz = dp->d_ops->data_entry_offset;
bestsp = xfs_dir2_leaf_bests_p(ltp);
if (be16_to_cpu(bestsp[be32_to_cpu(ltp->bestcount) - 1]) ==
- mp->m_dirblksize - hdrsz) {
+ args->geo->blksize - hdrsz) {
if ((error =
xfs_dir2_leaf_trim_data(args, lbp,
(xfs_dir2_db_t)(be32_to_cpu(ltp->bestcount) - 1))))
@@ -964,7 +968,7 @@ xfs_dir2_leaf_to_block(
* Read the data block if we don't already have it, give up if it fails.
*/
if (!dbp) {
- error = xfs_dir3_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp);
+ error = xfs_dir3_data_read(tp, dp, args->geo->datablk, -1, &dbp);
if (error)
return error;
}
@@ -980,7 +984,7 @@ xfs_dir2_leaf_to_block(
/*
* Look at the last data entry.
*/
- tagp = (__be16 *)((char *)hdr + mp->m_dirblksize) - 1;
+ tagp = (__be16 *)((char *)hdr + args->geo->blksize) - 1;
dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
/*
* If it's not free or is too short we can't do it.
@@ -999,12 +1003,12 @@ xfs_dir2_leaf_to_block(
/*
* Use up the space at the end of the block (blp/btp).
*/
- xfs_dir2_data_use_free(tp, dbp, dup, mp->m_dirblksize - size, size,
+ xfs_dir2_data_use_free(args, dbp, dup, args->geo->blksize - size, size,
&needlog, &needscan);
/*
* Initialize the block tail.
*/
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
btp->count = cpu_to_be32(leafhdr.count - leafhdr.stale);
btp->stale = 0;
xfs_dir2_block_log_tail(tp, dbp);
@@ -1023,13 +1027,13 @@ xfs_dir2_leaf_to_block(
* Scan the bestfree if we need it and log the data block header.
*/
if (needscan)
- xfs_dir2_data_freescan(mp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
if (needlog)
- xfs_dir2_data_log_header(tp, dbp);
+ xfs_dir2_data_log_header(args, dbp);
/*
* Pitch the old leaf block.
*/
- error = xfs_da_shrink_inode(args, mp->m_dirleafblk, lbp);
+ error = xfs_da_shrink_inode(args, args->geo->leafblk, lbp);
if (error)
return error;
@@ -1136,15 +1140,15 @@ xfs_dir2_sf_to_block(
* The whole thing is initialized to free by the init routine.
* Say we're using the leaf and tail area.
*/
- dup = xfs_dir3_data_unused_p(hdr);
+ dup = dp->d_ops->data_unused_p(hdr);
needlog = needscan = 0;
- xfs_dir2_data_use_free(tp, bp, dup, mp->m_dirblksize - i, i, &needlog,
- &needscan);
+ xfs_dir2_data_use_free(args, bp, dup, args->geo->blksize - i,
+ i, &needlog, &needscan);
ASSERT(needscan == 0);
/*
* Fill in the tail.
*/
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
btp->count = cpu_to_be32(sfp->count + 2); /* ., .. */
btp->stale = 0;
blp = xfs_dir2_block_leaf_p(btp);
@@ -1152,38 +1156,38 @@ xfs_dir2_sf_to_block(
/*
* Remove the freespace, we'll manage it.
*/
- xfs_dir2_data_use_free(tp, bp, dup,
+ xfs_dir2_data_use_free(args, bp, dup,
(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr),
be16_to_cpu(dup->length), &needlog, &needscan);
/*
* Create entry for .
*/
- dep = xfs_dir3_data_dot_entry_p(mp, hdr);
+ dep = dp->d_ops->data_dot_entry_p(hdr);
dep->inumber = cpu_to_be64(dp->i_ino);
dep->namelen = 1;
dep->name[0] = '.';
- xfs_dir3_dirent_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
- tagp = xfs_dir3_data_entry_tag_p(mp, dep);
+ dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
+ tagp = dp->d_ops->data_entry_tag_p(dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
- xfs_dir2_data_log_entry(tp, bp, dep);
+ xfs_dir2_data_log_entry(args, bp, dep);
blp[0].hashval = cpu_to_be32(xfs_dir_hash_dot);
- blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
+ blp[0].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
(char *)dep - (char *)hdr));
/*
* Create entry for ..
*/
- dep = xfs_dir3_data_dotdot_entry_p(mp, hdr);
- dep->inumber = cpu_to_be64(xfs_dir2_sf_get_parent_ino(sfp));
+ dep = dp->d_ops->data_dotdot_entry_p(hdr);
+ dep->inumber = cpu_to_be64(dp->d_ops->sf_get_parent_ino(sfp));
dep->namelen = 2;
dep->name[0] = dep->name[1] = '.';
- xfs_dir3_dirent_put_ftype(mp, dep, XFS_DIR3_FT_DIR);
- tagp = xfs_dir3_data_entry_tag_p(mp, dep);
+ dp->d_ops->data_put_ftype(dep, XFS_DIR3_FT_DIR);
+ tagp = dp->d_ops->data_entry_tag_p(dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
- xfs_dir2_data_log_entry(tp, bp, dep);
+ xfs_dir2_data_log_entry(args, bp, dep);
blp[1].hashval = cpu_to_be32(xfs_dir_hash_dotdot);
- blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
+ blp[1].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
(char *)dep - (char *)hdr));
- offset = xfs_dir3_data_first_offset(mp);
+ offset = dp->d_ops->data_first_offset;
/*
* Loop over existing entries, stuff them in.
*/
@@ -1213,8 +1217,10 @@ xfs_dir2_sf_to_block(
dup->length = cpu_to_be16(newoffset - offset);
*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16(
((char *)dup - (char *)hdr));
- xfs_dir2_data_log_unused(tp, bp, dup);
- xfs_dir2_data_freeinsert(hdr, dup, &dummy);
+ xfs_dir2_data_log_unused(args, bp, dup);
+ xfs_dir2_data_freeinsert(hdr,
+ dp->d_ops->data_bestfree_p(hdr),
+ dup, &dummy);
offset += be16_to_cpu(dup->length);
continue;
}
@@ -1222,25 +1228,24 @@ xfs_dir2_sf_to_block(
* Copy a real entry.
*/
dep = (xfs_dir2_data_entry_t *)((char *)hdr + newoffset);
- dep->inumber = cpu_to_be64(xfs_dir3_sfe_get_ino(mp, sfp, sfep));
+ dep->inumber = cpu_to_be64(dp->d_ops->sf_get_ino(sfp, sfep));
dep->namelen = sfep->namelen;
- xfs_dir3_dirent_put_ftype(mp, dep,
- xfs_dir3_sfe_get_ftype(mp, sfp, sfep));
+ dp->d_ops->data_put_ftype(dep, dp->d_ops->sf_get_ftype(sfep));
memcpy(dep->name, sfep->name, dep->namelen);
- tagp = xfs_dir3_data_entry_tag_p(mp, dep);
+ tagp = dp->d_ops->data_entry_tag_p(dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
- xfs_dir2_data_log_entry(tp, bp, dep);
+ xfs_dir2_data_log_entry(args, bp, dep);
name.name = sfep->name;
name.len = sfep->namelen;
blp[2 + i].hashval = cpu_to_be32(mp->m_dirnameops->
hashname(&name));
- blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(mp,
+ blp[2 + i].address = cpu_to_be32(xfs_dir2_byte_to_dataptr(
(char *)dep - (char *)hdr));
offset = (int)((char *)(tagp + 1) - (char *)hdr);
if (++i == sfp->count)
sfep = NULL;
else
- sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
+ sfep = dp->d_ops->sf_nextentry(sfp, sfep);
}
/* Done with the temporary buffer */
kmem_free(sfp);
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 47e1326c169..8c2f6422648 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -18,20 +18,19 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_dir2_format.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_error.h"
+#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
@@ -63,32 +62,52 @@ __xfs_dir3_data_check(
char *p; /* current data position */
int stale; /* count of stale leaves */
struct xfs_name name;
+ const struct xfs_dir_ops *ops;
+ struct xfs_da_geometry *geo;
mp = bp->b_target->bt_mount;
+ geo = mp->m_dir_geo;
+
+ /*
+ * We can be passed a null dp here from a verifier, so we need to go the
+ * hard way to get them.
+ */
+ ops = xfs_dir_get_ops(mp, dp);
+
hdr = bp->b_addr;
- bf = xfs_dir3_data_bestfree_p(hdr);
- p = (char *)xfs_dir3_data_entry_p(hdr);
+ p = (char *)ops->data_entry_p(hdr);
switch (hdr->magic) {
case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(geo, hdr);
lep = xfs_dir2_block_leaf_p(btp);
endp = (char *)lep;
+
+ /*
+ * The number of leaf entries is limited by the size of the
+ * block and the amount of space used by the data entries.
+ * We don't know how much space is used by the data entries yet,
+ * so just ensure that the count falls somewhere inside the
+ * block right now.
+ */
+ XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) <
+ ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry));
break;
case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
- endp = (char *)hdr + mp->m_dirblksize;
+ endp = (char *)hdr + geo->blksize;
break;
default:
XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
return EFSCORRUPTED;
}
- count = lastfree = freeseen = 0;
/*
* Account for zero bestfree entries.
*/
+ bf = ops->data_bestfree_p(hdr);
+ count = lastfree = freeseen = 0;
if (!bf[0].length) {
XFS_WANT_CORRUPTED_RETURN(!bf[0].offset);
freeseen |= 1 << 0;
@@ -121,7 +140,7 @@ __xfs_dir3_data_check(
XFS_WANT_CORRUPTED_RETURN(
be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
(char *)dup - (char *)hdr);
- dfp = xfs_dir2_data_freefind(hdr, dup);
+ dfp = xfs_dir2_data_freefind(hdr, bf, dup);
if (dfp) {
i = (int)(dfp - bf);
XFS_WANT_CORRUPTED_RETURN(
@@ -147,17 +166,17 @@ __xfs_dir3_data_check(
XFS_WANT_CORRUPTED_RETURN(
!xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
XFS_WANT_CORRUPTED_RETURN(
- be16_to_cpu(*xfs_dir3_data_entry_tag_p(mp, dep)) ==
+ be16_to_cpu(*ops->data_entry_tag_p(dep)) ==
(char *)dep - (char *)hdr);
XFS_WANT_CORRUPTED_RETURN(
- xfs_dir3_dirent_get_ftype(mp, dep) < XFS_DIR3_FT_MAX);
+ ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX);
count++;
lastfree = 0;
if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
- addr = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
- (xfs_dir2_data_aoff_t)
- ((char *)dep - (char *)hdr));
+ addr = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ (xfs_dir2_data_aoff_t)
+ ((char *)dep - (char *)hdr));
name.name = dep->name;
name.len = dep->namelen;
hash = mp->m_dirnameops->hashname(&name);
@@ -168,7 +187,7 @@ __xfs_dir3_data_check(
}
XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
}
- p += xfs_dir3_data_entsize(mp, dep->namelen);
+ p += ops->data_entsize(dep->namelen);
}
/*
* Need to have seen all the entries and all the bestfree slots.
@@ -224,7 +243,6 @@ static void
xfs_dir3_data_reada_verify(
struct xfs_buf *bp)
{
- struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir2_data_hdr *hdr = bp->b_addr;
switch (hdr->magic) {
@@ -238,8 +256,8 @@ xfs_dir3_data_reada_verify(
xfs_dir3_data_verify(bp);
return;
default:
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
break;
}
}
@@ -250,13 +268,14 @@ xfs_dir3_data_read_verify(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- if ((xfs_sb_version_hascrc(&mp->m_sb) &&
- !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
- XFS_DIR3_DATA_CRC_OFF)) ||
- !xfs_dir3_data_verify(bp)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_dir3_data_verify(bp))
xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
}
static void
@@ -268,8 +287,8 @@ xfs_dir3_data_write_verify(
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
if (!xfs_dir3_data_verify(bp)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
return;
}
@@ -279,7 +298,7 @@ xfs_dir3_data_write_verify(
if (bip)
hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
- xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_DATA_CRC_OFF);
+ xfs_buf_update_cksum(bp, XFS_DIR3_DATA_CRC_OFF);
}
const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
@@ -312,12 +331,11 @@ xfs_dir3_data_read(
int
xfs_dir3_data_readahead(
- struct xfs_trans *tp,
struct xfs_inode *dp,
xfs_dablk_t bno,
xfs_daddr_t mapped_bno)
{
- return xfs_da_reada_buf(tp, dp, bno, mapped_bno,
+ return xfs_da_reada_buf(dp, bno, mapped_bno,
XFS_DATA_FORK, &xfs_dir3_data_reada_buf_ops);
}
@@ -327,19 +345,18 @@ xfs_dir3_data_readahead(
*/
xfs_dir2_data_free_t *
xfs_dir2_data_freefind(
- xfs_dir2_data_hdr_t *hdr, /* data block */
- xfs_dir2_data_unused_t *dup) /* data unused entry */
+ struct xfs_dir2_data_hdr *hdr, /* data block header */
+ struct xfs_dir2_data_free *bf, /* bestfree table pointer */
+ struct xfs_dir2_data_unused *dup) /* unused space */
{
xfs_dir2_data_free_t *dfp; /* bestfree entry */
xfs_dir2_data_aoff_t off; /* offset value needed */
- struct xfs_dir2_data_free *bf;
#ifdef DEBUG
int matched; /* matched the value */
int seenzero; /* saw a 0 bestfree entry */
#endif
off = (xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr);
- bf = xfs_dir3_data_bestfree_p(hdr);
#ifdef DEBUG
/*
@@ -399,11 +416,11 @@ xfs_dir2_data_freefind(
*/
xfs_dir2_data_free_t * /* entry inserted */
xfs_dir2_data_freeinsert(
- xfs_dir2_data_hdr_t *hdr, /* data block pointer */
- xfs_dir2_data_unused_t *dup, /* unused space */
+ struct xfs_dir2_data_hdr *hdr, /* data block pointer */
+ struct xfs_dir2_data_free *dfp, /* bestfree table pointer */
+ struct xfs_dir2_data_unused *dup, /* unused space */
int *loghead) /* log the data header (out) */
{
- xfs_dir2_data_free_t *dfp; /* bestfree table pointer */
xfs_dir2_data_free_t new; /* new bestfree entry */
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
@@ -411,7 +428,6 @@ xfs_dir2_data_freeinsert(
hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
- dfp = xfs_dir3_data_bestfree_p(hdr);
new.length = dup->length;
new.offset = cpu_to_be16((char *)dup - (char *)hdr);
@@ -444,11 +460,11 @@ xfs_dir2_data_freeinsert(
*/
STATIC void
xfs_dir2_data_freeremove(
- xfs_dir2_data_hdr_t *hdr, /* data block header */
- xfs_dir2_data_free_t *dfp, /* bestfree entry pointer */
+ struct xfs_dir2_data_hdr *hdr, /* data block header */
+ struct xfs_dir2_data_free *bf, /* bestfree table pointer */
+ struct xfs_dir2_data_free *dfp, /* bestfree entry pointer */
int *loghead) /* out: log data header */
{
- struct xfs_dir2_data_free *bf;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
@@ -458,7 +474,6 @@ xfs_dir2_data_freeremove(
/*
* It's the first entry, slide the next 2 up.
*/
- bf = xfs_dir3_data_bestfree_p(hdr);
if (dfp == &bf[0]) {
bf[0] = bf[1];
bf[1] = bf[2];
@@ -486,9 +501,9 @@ xfs_dir2_data_freeremove(
*/
void
xfs_dir2_data_freescan(
- xfs_mount_t *mp, /* filesystem mount point */
- xfs_dir2_data_hdr_t *hdr, /* data block header */
- int *loghead) /* out: log data header */
+ struct xfs_inode *dp,
+ struct xfs_dir2_data_hdr *hdr,
+ int *loghead)
{
xfs_dir2_block_tail_t *btp; /* block tail */
xfs_dir2_data_entry_t *dep; /* active data entry */
@@ -496,6 +511,7 @@ xfs_dir2_data_freescan(
struct xfs_dir2_data_free *bf;
char *endp; /* end of block's data */
char *p; /* current entry pointer */
+ struct xfs_da_geometry *geo = dp->i_mount->m_dir_geo;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
@@ -505,19 +521,19 @@ xfs_dir2_data_freescan(
/*
* Start by clearing the table.
*/
- bf = xfs_dir3_data_bestfree_p(hdr);
+ bf = dp->d_ops->data_bestfree_p(hdr);
memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT);
*loghead = 1;
/*
* Set up pointers.
*/
- p = (char *)xfs_dir3_data_entry_p(hdr);
+ p = (char *)dp->d_ops->data_entry_p(hdr);
if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) {
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(geo, hdr);
endp = (char *)xfs_dir2_block_leaf_p(btp);
} else
- endp = (char *)hdr + mp->m_dirblksize;
+ endp = (char *)hdr + geo->blksize;
/*
* Loop over the block's entries.
*/
@@ -529,7 +545,7 @@ xfs_dir2_data_freescan(
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
ASSERT((char *)dup - (char *)hdr ==
be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)));
- xfs_dir2_data_freeinsert(hdr, dup, loghead);
+ xfs_dir2_data_freeinsert(hdr, bf, dup, loghead);
p += be16_to_cpu(dup->length);
}
/*
@@ -538,8 +554,8 @@ xfs_dir2_data_freescan(
else {
dep = (xfs_dir2_data_entry_t *)p;
ASSERT((char *)dep - (char *)hdr ==
- be16_to_cpu(*xfs_dir3_data_entry_tag_p(mp, dep)));
- p += xfs_dir3_data_entsize(mp, dep->namelen);
+ be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep)));
+ p += dp->d_ops->data_entsize(dep->namelen);
}
}
}
@@ -571,8 +587,8 @@ xfs_dir3_data_init(
/*
* Get the buffer set up for the block.
*/
- error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp,
- XFS_DATA_FORK);
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, blkno),
+ -1, &bp, XFS_DATA_FORK);
if (error)
return error;
bp->b_ops = &xfs_dir3_data_buf_ops;
@@ -594,8 +610,8 @@ xfs_dir3_data_init(
} else
hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
- bf = xfs_dir3_data_bestfree_p(hdr);
- bf[0].offset = cpu_to_be16(xfs_dir3_data_entry_offset(hdr));
+ bf = dp->d_ops->data_bestfree_p(hdr);
+ bf[0].offset = cpu_to_be16(dp->d_ops->data_entry_offset);
for (i = 1; i < XFS_DIR2_DATA_FD_COUNT; i++) {
bf[i].length = 0;
bf[i].offset = 0;
@@ -604,18 +620,18 @@ xfs_dir3_data_init(
/*
* Set up an unused entry for the block's body.
*/
- dup = xfs_dir3_data_unused_p(hdr);
+ dup = dp->d_ops->data_unused_p(hdr);
dup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
- t = mp->m_dirblksize - (uint)xfs_dir3_data_entry_offset(hdr);
+ t = args->geo->blksize - (uint)dp->d_ops->data_entry_offset;
bf[0].length = cpu_to_be16(t);
dup->length = cpu_to_be16(t);
*xfs_dir2_data_unused_tag_p(dup) = cpu_to_be16((char *)dup - (char *)hdr);
/*
* Log it and return it.
*/
- xfs_dir2_data_log_header(tp, bp);
- xfs_dir2_data_log_unused(tp, bp, dup);
+ xfs_dir2_data_log_header(args, bp);
+ xfs_dir2_data_log_unused(args, bp, dup);
*bpp = bp;
return 0;
}
@@ -625,20 +641,19 @@ xfs_dir3_data_init(
*/
void
xfs_dir2_data_log_entry(
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp,
xfs_dir2_data_entry_t *dep) /* data entry pointer */
{
struct xfs_dir2_data_hdr *hdr = bp->b_addr;
- struct xfs_mount *mp = tp->t_mountp;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
- xfs_trans_log_buf(tp, bp, (uint)((char *)dep - (char *)hdr),
- (uint)((char *)(xfs_dir3_data_entry_tag_p(mp, dep) + 1) -
+ xfs_trans_log_buf(args->trans, bp, (uint)((char *)dep - (char *)hdr),
+ (uint)((char *)(args->dp->d_ops->data_entry_tag_p(dep) + 1) -
(char *)hdr - 1));
}
@@ -647,17 +662,20 @@ xfs_dir2_data_log_entry(
*/
void
xfs_dir2_data_log_header(
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp)
{
- xfs_dir2_data_hdr_t *hdr = bp->b_addr;
+#ifdef DEBUG
+ struct xfs_dir2_data_hdr *hdr = bp->b_addr;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
+#endif
- xfs_trans_log_buf(tp, bp, 0, xfs_dir3_data_entry_offset(hdr) - 1);
+ xfs_trans_log_buf(args->trans, bp, 0,
+ args->dp->d_ops->data_entry_offset - 1);
}
/*
@@ -665,7 +683,7 @@ xfs_dir2_data_log_header(
*/
void
xfs_dir2_data_log_unused(
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp,
xfs_dir2_data_unused_t *dup) /* data unused pointer */
{
@@ -679,13 +697,13 @@ xfs_dir2_data_log_unused(
/*
* Log the first part of the unused entry.
*/
- xfs_trans_log_buf(tp, bp, (uint)((char *)dup - (char *)hdr),
+ xfs_trans_log_buf(args->trans, bp, (uint)((char *)dup - (char *)hdr),
(uint)((char *)&dup->length + sizeof(dup->length) -
1 - (char *)hdr));
/*
* Log the end (tag) of the unused entry.
*/
- xfs_trans_log_buf(tp, bp,
+ xfs_trans_log_buf(args->trans, bp,
(uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr),
(uint)((char *)xfs_dir2_data_unused_tag_p(dup) - (char *)hdr +
sizeof(xfs_dir2_data_off_t) - 1));
@@ -697,7 +715,7 @@ xfs_dir2_data_log_unused(
*/
void
xfs_dir2_data_make_free(
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp,
xfs_dir2_data_aoff_t offset, /* starting byte offset */
xfs_dir2_data_aoff_t len, /* length in bytes */
@@ -707,14 +725,12 @@ xfs_dir2_data_make_free(
xfs_dir2_data_hdr_t *hdr; /* data block pointer */
xfs_dir2_data_free_t *dfp; /* bestfree pointer */
char *endptr; /* end of data area */
- xfs_mount_t *mp; /* filesystem mount point */
int needscan; /* need to regen bestfree */
xfs_dir2_data_unused_t *newdup; /* new unused entry */
xfs_dir2_data_unused_t *postdup; /* unused entry after us */
xfs_dir2_data_unused_t *prevdup; /* unused entry before us */
struct xfs_dir2_data_free *bf;
- mp = tp->t_mountp;
hdr = bp->b_addr;
/*
@@ -722,20 +738,20 @@ xfs_dir2_data_make_free(
*/
if (hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC))
- endptr = (char *)hdr + mp->m_dirblksize;
+ endptr = (char *)hdr + args->geo->blksize;
else {
xfs_dir2_block_tail_t *btp; /* block tail */
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC));
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
endptr = (char *)xfs_dir2_block_leaf_p(btp);
}
/*
* If this isn't the start of the block, then back up to
* the previous entry and see if it's free.
*/
- if (offset > xfs_dir3_data_entry_offset(hdr)) {
+ if (offset > args->dp->d_ops->data_entry_offset) {
__be16 *tagp; /* tag just before us */
tagp = (__be16 *)((char *)hdr + offset) - 1;
@@ -761,15 +777,15 @@ xfs_dir2_data_make_free(
* Previous and following entries are both free,
* merge everything into a single free entry.
*/
- bf = xfs_dir3_data_bestfree_p(hdr);
+ bf = args->dp->d_ops->data_bestfree_p(hdr);
if (prevdup && postdup) {
xfs_dir2_data_free_t *dfp2; /* another bestfree pointer */
/*
* See if prevdup and/or postdup are in bestfree table.
*/
- dfp = xfs_dir2_data_freefind(hdr, prevdup);
- dfp2 = xfs_dir2_data_freefind(hdr, postdup);
+ dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
+ dfp2 = xfs_dir2_data_freefind(hdr, bf, postdup);
/*
* We need a rescan unless there are exactly 2 free entries
* namely our two. Then we know what's happening, otherwise
@@ -783,7 +799,7 @@ xfs_dir2_data_make_free(
be16_add_cpu(&prevdup->length, len + be16_to_cpu(postdup->length));
*xfs_dir2_data_unused_tag_p(prevdup) =
cpu_to_be16((char *)prevdup - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, prevdup);
+ xfs_dir2_data_log_unused(args, bp, prevdup);
if (!needscan) {
/*
* Has to be the case that entries 0 and 1 are
@@ -797,12 +813,13 @@ xfs_dir2_data_make_free(
ASSERT(dfp2 == dfp);
dfp2 = &bf[1];
}
- xfs_dir2_data_freeremove(hdr, dfp2, needlogp);
- xfs_dir2_data_freeremove(hdr, dfp, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp2, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
/*
* Now insert the new entry.
*/
- dfp = xfs_dir2_data_freeinsert(hdr, prevdup, needlogp);
+ dfp = xfs_dir2_data_freeinsert(hdr, bf, prevdup,
+ needlogp);
ASSERT(dfp == &bf[0]);
ASSERT(dfp->length == prevdup->length);
ASSERT(!dfp[1].length);
@@ -813,19 +830,19 @@ xfs_dir2_data_make_free(
* The entry before us is free, merge with it.
*/
else if (prevdup) {
- dfp = xfs_dir2_data_freefind(hdr, prevdup);
+ dfp = xfs_dir2_data_freefind(hdr, bf, prevdup);
be16_add_cpu(&prevdup->length, len);
*xfs_dir2_data_unused_tag_p(prevdup) =
cpu_to_be16((char *)prevdup - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, prevdup);
+ xfs_dir2_data_log_unused(args, bp, prevdup);
/*
* If the previous entry was in the table, the new entry
* is longer, so it will be in the table too. Remove
* the old one and add the new one.
*/
if (dfp) {
- xfs_dir2_data_freeremove(hdr, dfp, needlogp);
- xfs_dir2_data_freeinsert(hdr, prevdup, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ xfs_dir2_data_freeinsert(hdr, bf, prevdup, needlogp);
}
/*
* Otherwise we need a scan if the new entry is big enough.
@@ -839,21 +856,21 @@ xfs_dir2_data_make_free(
* The following entry is free, merge with it.
*/
else if (postdup) {
- dfp = xfs_dir2_data_freefind(hdr, postdup);
+ dfp = xfs_dir2_data_freefind(hdr, bf, postdup);
newdup = (xfs_dir2_data_unused_t *)((char *)hdr + offset);
newdup->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
newdup->length = cpu_to_be16(len + be16_to_cpu(postdup->length));
*xfs_dir2_data_unused_tag_p(newdup) =
cpu_to_be16((char *)newdup - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, newdup);
+ xfs_dir2_data_log_unused(args, bp, newdup);
/*
* If the following entry was in the table, the new entry
* is longer, so it will be in the table too. Remove
* the old one and add the new one.
*/
if (dfp) {
- xfs_dir2_data_freeremove(hdr, dfp, needlogp);
- xfs_dir2_data_freeinsert(hdr, newdup, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
}
/*
* Otherwise we need a scan if the new entry is big enough.
@@ -872,8 +889,8 @@ xfs_dir2_data_make_free(
newdup->length = cpu_to_be16(len);
*xfs_dir2_data_unused_tag_p(newdup) =
cpu_to_be16((char *)newdup - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, newdup);
- xfs_dir2_data_freeinsert(hdr, newdup, needlogp);
+ xfs_dir2_data_log_unused(args, bp, newdup);
+ xfs_dir2_data_freeinsert(hdr, bf, newdup, needlogp);
}
*needscanp = needscan;
}
@@ -883,7 +900,7 @@ xfs_dir2_data_make_free(
*/
void
xfs_dir2_data_use_free(
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp,
xfs_dir2_data_unused_t *dup, /* unused entry */
xfs_dir2_data_aoff_t offset, /* starting offset to use */
@@ -913,9 +930,9 @@ xfs_dir2_data_use_free(
/*
* Look up the entry in the bestfree table.
*/
- dfp = xfs_dir2_data_freefind(hdr, dup);
oldlen = be16_to_cpu(dup->length);
- bf = xfs_dir3_data_bestfree_p(hdr);
+ bf = args->dp->d_ops->data_bestfree_p(hdr);
+ dfp = xfs_dir2_data_freefind(hdr, bf, dup);
ASSERT(dfp || oldlen <= be16_to_cpu(bf[2].length));
/*
* Check for alignment with front and back of the entry.
@@ -932,7 +949,8 @@ xfs_dir2_data_use_free(
if (dfp) {
needscan = (bf[2].offset != 0);
if (!needscan)
- xfs_dir2_data_freeremove(hdr, dfp, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp,
+ needlogp);
}
}
/*
@@ -945,13 +963,14 @@ xfs_dir2_data_use_free(
newdup->length = cpu_to_be16(oldlen - len);
*xfs_dir2_data_unused_tag_p(newdup) =
cpu_to_be16((char *)newdup - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, newdup);
+ xfs_dir2_data_log_unused(args, bp, newdup);
/*
* If it was in the table, remove it and add the new one.
*/
if (dfp) {
- xfs_dir2_data_freeremove(hdr, dfp, needlogp);
- dfp = xfs_dir2_data_freeinsert(hdr, newdup, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+ needlogp);
ASSERT(dfp != NULL);
ASSERT(dfp->length == newdup->length);
ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
@@ -972,13 +991,14 @@ xfs_dir2_data_use_free(
newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
*xfs_dir2_data_unused_tag_p(newdup) =
cpu_to_be16((char *)newdup - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, newdup);
+ xfs_dir2_data_log_unused(args, bp, newdup);
/*
* If it was in the table, remove it and add the new one.
*/
if (dfp) {
- xfs_dir2_data_freeremove(hdr, dfp, needlogp);
- dfp = xfs_dir2_data_freeinsert(hdr, newdup, needlogp);
+ xfs_dir2_data_freeremove(hdr, bf, dfp, needlogp);
+ dfp = xfs_dir2_data_freeinsert(hdr, bf, newdup,
+ needlogp);
ASSERT(dfp != NULL);
ASSERT(dfp->length == newdup->length);
ASSERT(be16_to_cpu(dfp->offset) == (char *)newdup - (char *)hdr);
@@ -999,13 +1019,13 @@ xfs_dir2_data_use_free(
newdup->length = cpu_to_be16(((char *)hdr + offset) - (char *)newdup);
*xfs_dir2_data_unused_tag_p(newdup) =
cpu_to_be16((char *)newdup - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, newdup);
+ xfs_dir2_data_log_unused(args, bp, newdup);
newdup2 = (xfs_dir2_data_unused_t *)((char *)hdr + offset + len);
newdup2->freetag = cpu_to_be16(XFS_DIR2_DATA_FREE_TAG);
newdup2->length = cpu_to_be16(oldlen - len - be16_to_cpu(newdup->length));
*xfs_dir2_data_unused_tag_p(newdup2) =
cpu_to_be16((char *)newdup2 - (char *)hdr);
- xfs_dir2_data_log_unused(tp, bp, newdup2);
+ xfs_dir2_data_log_unused(args, bp, newdup2);
/*
* If the old entry was in the table, we need to scan
* if the 3rd entry was valid, since these entries
@@ -1017,9 +1037,11 @@ xfs_dir2_data_use_free(
if (dfp) {
needscan = (bf[2].length != 0);
if (!needscan) {
- xfs_dir2_data_freeremove(hdr, dfp, needlogp);
- xfs_dir2_data_freeinsert(hdr, newdup, needlogp);
- xfs_dir2_data_freeinsert(hdr, newdup2,
+ xfs_dir2_data_freeremove(hdr, bf, dfp,
+ needlogp);
+ xfs_dir2_data_freeinsert(hdr, bf, newdup,
+ needlogp);
+ xfs_dir2_data_freeinsert(hdr, bf, newdup2,
needlogp);
}
}
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 1021c8356d0..fb0aad4440c 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -18,23 +18,21 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
-#include "xfs_dir2_format.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
@@ -43,30 +41,31 @@
*/
static int xfs_dir2_leaf_lookup_int(xfs_da_args_t *args, struct xfs_buf **lbpp,
int *indexp, struct xfs_buf **dbpp);
-static void xfs_dir3_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp,
- int first, int last);
-static void xfs_dir3_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp);
+static void xfs_dir3_leaf_log_bests(struct xfs_da_args *args,
+ struct xfs_buf *bp, int first, int last);
+static void xfs_dir3_leaf_log_tail(struct xfs_da_args *args,
+ struct xfs_buf *bp);
/*
* Check the internal consistency of a leaf1 block.
* Pop an assert if something is wrong.
*/
#ifdef DEBUG
-#define xfs_dir3_leaf_check(mp, bp) \
+#define xfs_dir3_leaf_check(dp, bp) \
do { \
- if (!xfs_dir3_leaf1_check((mp), (bp))) \
+ if (!xfs_dir3_leaf1_check((dp), (bp))) \
ASSERT(0); \
} while (0);
STATIC bool
xfs_dir3_leaf1_check(
- struct xfs_mount *mp,
+ struct xfs_inode *dp,
struct xfs_buf *bp)
{
struct xfs_dir2_leaf *leaf = bp->b_addr;
struct xfs_dir3_icleaf_hdr leafhdr;
- xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) {
struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
@@ -75,71 +74,16 @@ xfs_dir3_leaf1_check(
} else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC)
return false;
- return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf);
+ return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
}
#else
-#define xfs_dir3_leaf_check(mp, bp)
+#define xfs_dir3_leaf_check(dp, bp)
#endif
-void
-xfs_dir3_leaf_hdr_from_disk(
- struct xfs_dir3_icleaf_hdr *to,
- struct xfs_dir2_leaf *from)
-{
- if (from->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
- from->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)) {
- to->forw = be32_to_cpu(from->hdr.info.forw);
- to->back = be32_to_cpu(from->hdr.info.back);
- to->magic = be16_to_cpu(from->hdr.info.magic);
- to->count = be16_to_cpu(from->hdr.count);
- to->stale = be16_to_cpu(from->hdr.stale);
- } else {
- struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)from;
-
- to->forw = be32_to_cpu(hdr3->info.hdr.forw);
- to->back = be32_to_cpu(hdr3->info.hdr.back);
- to->magic = be16_to_cpu(hdr3->info.hdr.magic);
- to->count = be16_to_cpu(hdr3->count);
- to->stale = be16_to_cpu(hdr3->stale);
- }
-
- ASSERT(to->magic == XFS_DIR2_LEAF1_MAGIC ||
- to->magic == XFS_DIR3_LEAF1_MAGIC ||
- to->magic == XFS_DIR2_LEAFN_MAGIC ||
- to->magic == XFS_DIR3_LEAFN_MAGIC);
-}
-
-void
-xfs_dir3_leaf_hdr_to_disk(
- struct xfs_dir2_leaf *to,
- struct xfs_dir3_icleaf_hdr *from)
-{
- ASSERT(from->magic == XFS_DIR2_LEAF1_MAGIC ||
- from->magic == XFS_DIR3_LEAF1_MAGIC ||
- from->magic == XFS_DIR2_LEAFN_MAGIC ||
- from->magic == XFS_DIR3_LEAFN_MAGIC);
-
- if (from->magic == XFS_DIR2_LEAF1_MAGIC ||
- from->magic == XFS_DIR2_LEAFN_MAGIC) {
- to->hdr.info.forw = cpu_to_be32(from->forw);
- to->hdr.info.back = cpu_to_be32(from->back);
- to->hdr.info.magic = cpu_to_be16(from->magic);
- to->hdr.count = cpu_to_be16(from->count);
- to->hdr.stale = cpu_to_be16(from->stale);
- } else {
- struct xfs_dir3_leaf_hdr *hdr3 = (struct xfs_dir3_leaf_hdr *)to;
-
- hdr3->info.hdr.forw = cpu_to_be32(from->forw);
- hdr3->info.hdr.back = cpu_to_be32(from->back);
- hdr3->info.hdr.magic = cpu_to_be16(from->magic);
- hdr3->count = cpu_to_be16(from->count);
- hdr3->stale = cpu_to_be16(from->stale);
- }
-}
-
bool
xfs_dir3_leaf_check_int(
struct xfs_mount *mp,
+ struct xfs_inode *dp,
struct xfs_dir3_icleaf_hdr *hdr,
struct xfs_dir2_leaf *leaf)
{
@@ -147,16 +91,30 @@ xfs_dir3_leaf_check_int(
xfs_dir2_leaf_tail_t *ltp;
int stale;
int i;
+ const struct xfs_dir_ops *ops;
+ struct xfs_dir3_icleaf_hdr leafhdr;
+ struct xfs_da_geometry *geo = mp->m_dir_geo;
+
+ /*
+ * we can be passed a null dp here from a verifier, so we need to go the
+ * hard way to get them.
+ */
+ ops = xfs_dir_get_ops(mp, dp);
+
+ if (!hdr) {
+ ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ hdr = &leafhdr;
+ }
- ents = xfs_dir3_leaf_ents_p(leaf);
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ents = ops->leaf_ents_p(leaf);
+ ltp = xfs_dir2_leaf_tail_p(geo, leaf);
/*
* XXX (dgc): This value is not restrictive enough.
* Should factor in the size of the bests table as well.
* We can deduce a value for that from di_size.
*/
- if (hdr->count > xfs_dir3_max_leaf_ents(mp, leaf))
+ if (hdr->count > ops->leaf_max_ents(geo))
return false;
/* Leaves and bests don't overlap in leaf format. */
@@ -192,7 +150,6 @@ xfs_dir3_leaf_verify(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_dir2_leaf *leaf = bp->b_addr;
- struct xfs_dir3_icleaf_hdr leafhdr;
ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
@@ -214,8 +171,7 @@ xfs_dir3_leaf_verify(
return false;
}
- xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
- return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf);
+ return xfs_dir3_leaf_check_int(mp, NULL, NULL, leaf);
}
static void
@@ -225,13 +181,14 @@ __read_verify(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- if ((xfs_sb_version_hascrc(&mp->m_sb) &&
- !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
- XFS_DIR3_LEAF_CRC_OFF)) ||
- !xfs_dir3_leaf_verify(bp, magic)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_dir3_leaf_verify(bp, magic))
xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
}
static void
@@ -244,8 +201,8 @@ __write_verify(
struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
if (!xfs_dir3_leaf_verify(bp, magic)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
return;
}
@@ -255,7 +212,7 @@ __write_verify(
if (bip)
hdr3->info.lsn = cpu_to_be64(bip->bli_item.li_lsn);
- xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_LEAF_CRC_OFF);
+ xfs_buf_update_cksum(bp, XFS_DIR3_LEAF_CRC_OFF);
}
static void
@@ -368,7 +325,7 @@ xfs_dir3_leaf_init(
if (type == XFS_DIR2_LEAF1_MAGIC) {
struct xfs_dir2_leaf_tail *ltp;
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp = xfs_dir2_leaf_tail_p(mp->m_dir_geo, leaf);
ltp->bestcount = 0;
bp->b_ops = &xfs_dir3_leaf1_buf_ops;
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAF1_BUF);
@@ -392,18 +349,18 @@ xfs_dir3_leaf_get_buf(
int error;
ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
- ASSERT(bno >= XFS_DIR2_LEAF_FIRSTDB(mp) &&
- bno < XFS_DIR2_FREE_FIRSTDB(mp));
+ ASSERT(bno >= xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET) &&
+ bno < xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
- error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp,
- XFS_DATA_FORK);
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, bno),
+ -1, &bp, XFS_DATA_FORK);
if (error)
return error;
xfs_dir3_leaf_init(mp, tp, bp, dp->i_ino, magic);
- xfs_dir3_leaf_log_header(tp, bp);
+ xfs_dir3_leaf_log_header(args, bp);
if (magic == XFS_DIR2_LEAF1_MAGIC)
- xfs_dir3_leaf_log_tail(tp, bp);
+ xfs_dir3_leaf_log_tail(args, bp);
*bpp = bp;
return 0;
}
@@ -448,8 +405,8 @@ xfs_dir2_block_to_leaf(
if ((error = xfs_da_grow_inode(args, &blkno))) {
return error;
}
- ldb = xfs_dir2_da_to_db(mp, blkno);
- ASSERT(ldb == XFS_DIR2_LEAF_FIRSTDB(mp));
+ ldb = xfs_dir2_da_to_db(args->geo, blkno);
+ ASSERT(ldb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_LEAF_OFFSET));
/*
* Initialize the leaf block, get a buffer for it.
*/
@@ -460,35 +417,35 @@ xfs_dir2_block_to_leaf(
leaf = lbp->b_addr;
hdr = dbp->b_addr;
xfs_dir3_data_check(dp, dbp);
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
- bf = xfs_dir3_data_bestfree_p(hdr);
- ents = xfs_dir3_leaf_ents_p(leaf);
+ bf = dp->d_ops->data_bestfree_p(hdr);
+ ents = dp->d_ops->leaf_ents_p(leaf);
/*
* Set the counts in the leaf header.
*/
- xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
leafhdr.count = be32_to_cpu(btp->count);
leafhdr.stale = be32_to_cpu(btp->stale);
- xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
- xfs_dir3_leaf_log_header(tp, lbp);
+ dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, lbp);
/*
* Could compact these but I think we always do the conversion
* after squeezing out stale entries.
*/
memcpy(ents, blp, be32_to_cpu(btp->count) * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir3_leaf_log_ents(tp, lbp, 0, leafhdr.count - 1);
+ xfs_dir3_leaf_log_ents(args, lbp, 0, leafhdr.count - 1);
needscan = 0;
needlog = 1;
/*
* Make the space formerly occupied by the leaf entries and block
* tail be free.
*/
- xfs_dir2_data_make_free(tp, dbp,
+ xfs_dir2_data_make_free(args, dbp,
(xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
- (xfs_dir2_data_aoff_t)((char *)hdr + mp->m_dirblksize -
+ (xfs_dir2_data_aoff_t)((char *)hdr + args->geo->blksize -
(char *)blp),
&needlog, &needscan);
/*
@@ -502,11 +459,11 @@ xfs_dir2_block_to_leaf(
hdr->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
if (needscan)
- xfs_dir2_data_freescan(mp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
/*
* Set up leaf tail and bests table.
*/
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
ltp->bestcount = cpu_to_be32(1);
bestsp = xfs_dir2_leaf_bests_p(ltp);
bestsp[0] = bf[0].length;
@@ -514,10 +471,10 @@ xfs_dir2_block_to_leaf(
* Log the data header and leaf bests table.
*/
if (needlog)
- xfs_dir2_data_log_header(tp, dbp);
- xfs_dir3_leaf_check(mp, lbp);
+ xfs_dir2_data_log_header(args, dbp);
+ xfs_dir3_leaf_check(dp, lbp);
xfs_dir3_data_check(dp, dbp);
- xfs_dir3_leaf_log_bests(tp, lbp, 0, 0);
+ xfs_dir3_leaf_log_bests(args, lbp, 0, 0);
return 0;
}
@@ -686,7 +643,7 @@ xfs_dir2_leaf_addname(
tp = args->trans;
mp = dp->i_mount;
- error = xfs_dir3_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
+ error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
if (error)
return error;
@@ -698,11 +655,11 @@ xfs_dir2_leaf_addname(
*/
index = xfs_dir2_leaf_search_hash(args, lbp);
leaf = lbp->b_addr;
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
- ents = xfs_dir3_leaf_ents_p(leaf);
- xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
bestsp = xfs_dir2_leaf_bests_p(ltp);
- length = xfs_dir3_data_entsize(mp, args->namelen);
+ length = dp->d_ops->data_entsize(args->namelen);
/*
* See if there are any entries with the same hash value
@@ -715,7 +672,7 @@ xfs_dir2_leaf_addname(
index++, lep++) {
if (be32_to_cpu(lep->address) == XFS_DIR2_NULL_DATAPTR)
continue;
- i = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ i = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
ASSERT(i < be32_to_cpu(ltp->bestcount));
ASSERT(bestsp[i] != cpu_to_be16(NULLDATAOFF));
if (be16_to_cpu(bestsp[i]) >= length) {
@@ -855,16 +812,17 @@ xfs_dir2_leaf_addname(
memmove(&bestsp[0], &bestsp[1],
be32_to_cpu(ltp->bestcount) * sizeof(bestsp[0]));
be32_add_cpu(&ltp->bestcount, 1);
- xfs_dir3_leaf_log_tail(tp, lbp);
- xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0,
+ be32_to_cpu(ltp->bestcount) - 1);
}
/*
* If we're filling in a previously empty block just log it.
*/
else
- xfs_dir3_leaf_log_bests(tp, lbp, use_block, use_block);
+ xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
hdr = dbp->b_addr;
- bf = xfs_dir3_data_bestfree_p(hdr);
+ bf = dp->d_ops->data_bestfree_p(hdr);
bestsp[use_block] = bf[0].length;
grown = 1;
} else {
@@ -873,14 +831,14 @@ xfs_dir2_leaf_addname(
* Just read that one in.
*/
error = xfs_dir3_data_read(tp, dp,
- xfs_dir2_db_to_da(mp, use_block),
- -1, &dbp);
+ xfs_dir2_db_to_da(args->geo, use_block),
+ -1, &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
}
hdr = dbp->b_addr;
- bf = xfs_dir3_data_bestfree_p(hdr);
+ bf = dp->d_ops->data_bestfree_p(hdr);
grown = 0;
}
/*
@@ -893,7 +851,7 @@ xfs_dir2_leaf_addname(
/*
* Mark the initial part of our freespace in use for the new entry.
*/
- xfs_dir2_data_use_free(tp, dbp, dup,
+ xfs_dir2_data_use_free(args, dbp, dup,
(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
&needlog, &needscan);
/*
@@ -903,20 +861,20 @@ xfs_dir2_leaf_addname(
dep->inumber = cpu_to_be64(args->inumber);
dep->namelen = args->namelen;
memcpy(dep->name, args->name, dep->namelen);
- xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
- tagp = xfs_dir3_data_entry_tag_p(mp, dep);
+ dp->d_ops->data_put_ftype(dep, args->filetype);
+ tagp = dp->d_ops->data_entry_tag_p(dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
/*
* Need to scan fix up the bestfree table.
*/
if (needscan)
- xfs_dir2_data_freescan(mp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
/*
* Need to log the data block's header.
*/
if (needlog)
- xfs_dir2_data_log_header(tp, dbp);
- xfs_dir2_data_log_entry(tp, dbp, dep);
+ xfs_dir2_data_log_header(args, dbp);
+ xfs_dir2_data_log_entry(args, dbp, dep);
/*
* If the bests table needs to be changed, do it.
* Log the change unless we've already done that.
@@ -924,7 +882,7 @@ xfs_dir2_leaf_addname(
if (be16_to_cpu(bestsp[use_block]) != be16_to_cpu(bf[0].length)) {
bestsp[use_block] = bf[0].length;
if (!grown)
- xfs_dir3_leaf_log_bests(tp, lbp, use_block, use_block);
+ xfs_dir3_leaf_log_bests(args, lbp, use_block, use_block);
}
lep = xfs_dir3_leaf_find_entry(&leafhdr, ents, index, compact, lowstale,
@@ -934,15 +892,16 @@ xfs_dir2_leaf_addname(
* Fill in the new leaf entry.
*/
lep->hashval = cpu_to_be32(args->hashval);
- lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp, use_block,
+ lep->address = cpu_to_be32(
+ xfs_dir2_db_off_to_dataptr(args->geo, use_block,
be16_to_cpu(*tagp)));
/*
* Log the leaf fields and give up the buffers.
*/
- xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
- xfs_dir3_leaf_log_header(tp, lbp);
- xfs_dir3_leaf_log_ents(tp, lbp, lfloglow, lfloghigh);
- xfs_dir3_leaf_check(mp, lbp);
+ dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, lbp);
+ xfs_dir3_leaf_log_ents(args, lbp, lfloglow, lfloghigh);
+ xfs_dir3_leaf_check(dp, lbp);
xfs_dir3_data_check(dp, dbp);
return 0;
}
@@ -962,6 +921,7 @@ xfs_dir3_leaf_compact(
int loglow; /* first leaf entry to log */
int to; /* target leaf index */
struct xfs_dir2_leaf_entry *ents;
+ struct xfs_inode *dp = args->dp;
leaf = bp->b_addr;
if (!leafhdr->stale)
@@ -970,7 +930,7 @@ xfs_dir3_leaf_compact(
/*
* Compress out the stale entries in place.
*/
- ents = xfs_dir3_leaf_ents_p(leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
for (from = to = 0, loglow = -1; from < leafhdr->count; from++) {
if (ents[from].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
continue;
@@ -991,10 +951,10 @@ xfs_dir3_leaf_compact(
leafhdr->count -= leafhdr->stale;
leafhdr->stale = 0;
- xfs_dir3_leaf_hdr_to_disk(leaf, leafhdr);
- xfs_dir3_leaf_log_header(args->trans, bp);
+ dp->d_ops->leaf_hdr_to_disk(leaf, leafhdr);
+ xfs_dir3_leaf_log_header(args, bp);
if (loglow != -1)
- xfs_dir3_leaf_log_ents(args->trans, bp, loglow, to - 1);
+ xfs_dir3_leaf_log_ents(args, bp, loglow, to - 1);
}
/*
@@ -1096,7 +1056,7 @@ xfs_dir3_leaf_compact_x1(
*/
static void
xfs_dir3_leaf_log_bests(
- xfs_trans_t *tp, /* transaction pointer */
+ struct xfs_da_args *args,
struct xfs_buf *bp, /* leaf buffer */
int first, /* first entry to log */
int last) /* last entry to log */
@@ -1109,10 +1069,11 @@ xfs_dir3_leaf_log_bests(
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC));
- ltp = xfs_dir2_leaf_tail_p(tp->t_mountp, leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
firstb = xfs_dir2_leaf_bests_p(ltp) + first;
lastb = xfs_dir2_leaf_bests_p(ltp) + last;
- xfs_trans_log_buf(tp, bp, (uint)((char *)firstb - (char *)leaf),
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)firstb - (char *)leaf),
(uint)((char *)lastb - (char *)leaf + sizeof(*lastb) - 1));
}
@@ -1121,10 +1082,10 @@ xfs_dir3_leaf_log_bests(
*/
void
xfs_dir3_leaf_log_ents(
- xfs_trans_t *tp, /* transaction pointer */
- struct xfs_buf *bp, /* leaf buffer */
- int first, /* first entry to log */
- int last) /* last entry to log */
+ struct xfs_da_args *args,
+ struct xfs_buf *bp,
+ int first,
+ int last)
{
xfs_dir2_leaf_entry_t *firstlep; /* pointer to first entry */
xfs_dir2_leaf_entry_t *lastlep; /* pointer to last entry */
@@ -1136,10 +1097,11 @@ xfs_dir3_leaf_log_ents(
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
- ents = xfs_dir3_leaf_ents_p(leaf);
+ ents = args->dp->d_ops->leaf_ents_p(leaf);
firstlep = &ents[first];
lastlep = &ents[last];
- xfs_trans_log_buf(tp, bp, (uint)((char *)firstlep - (char *)leaf),
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)firstlep - (char *)leaf),
(uint)((char *)lastlep - (char *)leaf + sizeof(*lastlep) - 1));
}
@@ -1148,7 +1110,7 @@ xfs_dir3_leaf_log_ents(
*/
void
xfs_dir3_leaf_log_header(
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp)
{
struct xfs_dir2_leaf *leaf = bp->b_addr;
@@ -1158,8 +1120,9 @@ xfs_dir3_leaf_log_header(
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
- xfs_trans_log_buf(tp, bp, (uint)((char *)&leaf->hdr - (char *)leaf),
- xfs_dir3_leaf_hdr_size(leaf) - 1);
+ xfs_trans_log_buf(args->trans, bp,
+ (uint)((char *)&leaf->hdr - (char *)leaf),
+ args->dp->d_ops->leaf_hdr_size - 1);
}
/*
@@ -1167,21 +1130,20 @@ xfs_dir3_leaf_log_header(
*/
STATIC void
xfs_dir3_leaf_log_tail(
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp)
{
struct xfs_dir2_leaf *leaf = bp->b_addr;
xfs_dir2_leaf_tail_t *ltp; /* leaf tail structure */
- struct xfs_mount *mp = tp->t_mountp;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAF1_MAGIC) ||
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) ||
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR2_LEAFN_MAGIC) ||
leaf->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC));
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
- xfs_trans_log_buf(tp, bp, (uint)((char *)ltp - (char *)leaf),
- (uint)(mp->m_dirblksize - 1));
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
+ xfs_trans_log_buf(args->trans, bp, (uint)((char *)ltp - (char *)leaf),
+ (uint)(args->geo->blksize - 1));
}
/*
@@ -1214,9 +1176,9 @@ xfs_dir2_leaf_lookup(
}
tp = args->trans;
dp = args->dp;
- xfs_dir3_leaf_check(dp->i_mount, lbp);
+ xfs_dir3_leaf_check(dp, lbp);
leaf = lbp->b_addr;
- ents = xfs_dir3_leaf_ents_p(leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
/*
* Get to the leaf entry and contained data entry address.
*/
@@ -1227,12 +1189,12 @@ xfs_dir2_leaf_lookup(
*/
dep = (xfs_dir2_data_entry_t *)
((char *)dbp->b_addr +
- xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address)));
+ xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
/*
* Return the found inode number & CI name if appropriate
*/
args->inumber = be64_to_cpu(dep->inumber);
- args->filetype = xfs_dir3_dirent_get_ftype(dp->i_mount, dep);
+ args->filetype = dp->d_ops->data_get_ftype(dep);
error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
xfs_trans_brelse(tp, dbp);
xfs_trans_brelse(tp, lbp);
@@ -1273,15 +1235,15 @@ xfs_dir2_leaf_lookup_int(
tp = args->trans;
mp = dp->i_mount;
- error = xfs_dir3_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
+ error = xfs_dir3_leaf_read(tp, dp, args->geo->leafblk, -1, &lbp);
if (error)
return error;
*lbpp = lbp;
leaf = lbp->b_addr;
- xfs_dir3_leaf_check(mp, lbp);
- ents = xfs_dir3_leaf_ents_p(leaf);
- xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+ xfs_dir3_leaf_check(dp, lbp);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
/*
* Look for the first leaf entry with our hash value.
@@ -1302,7 +1264,8 @@ xfs_dir2_leaf_lookup_int(
/*
* Get the new data block number.
*/
- newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ newdb = xfs_dir2_dataptr_to_db(args->geo,
+ be32_to_cpu(lep->address));
/*
* If it's not the same as the old data block number,
* need to pitch the old one and read the new one.
@@ -1311,8 +1274,8 @@ xfs_dir2_leaf_lookup_int(
if (dbp)
xfs_trans_brelse(tp, dbp);
error = xfs_dir3_data_read(tp, dp,
- xfs_dir2_db_to_da(mp, newdb),
- -1, &dbp);
+ xfs_dir2_db_to_da(args->geo, newdb),
+ -1, &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
@@ -1323,7 +1286,8 @@ xfs_dir2_leaf_lookup_int(
* Point to the data entry.
*/
dep = (xfs_dir2_data_entry_t *)((char *)dbp->b_addr +
- xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(lep->address)));
/*
* Compare name and if it's an exact match, return the index
* and buffer. If it's the first case-insensitive match, store
@@ -1352,8 +1316,8 @@ xfs_dir2_leaf_lookup_int(
if (cidb != curdb) {
xfs_trans_brelse(tp, dbp);
error = xfs_dir3_data_read(tp, dp,
- xfs_dir2_db_to_da(mp, cidb),
- -1, &dbp);
+ xfs_dir2_db_to_da(args->geo, cidb),
+ -1, &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
@@ -1415,60 +1379,60 @@ xfs_dir2_leaf_removename(
leaf = lbp->b_addr;
hdr = dbp->b_addr;
xfs_dir3_data_check(dp, dbp);
- bf = xfs_dir3_data_bestfree_p(hdr);
- xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
- ents = xfs_dir3_leaf_ents_p(leaf);
+ bf = dp->d_ops->data_bestfree_p(hdr);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
/*
* Point to the leaf entry, use that to point to the data entry.
*/
lep = &ents[index];
- db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
- dep = (xfs_dir2_data_entry_t *)
- ((char *)hdr + xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
+ db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
needscan = needlog = 0;
oldbest = be16_to_cpu(bf[0].length);
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
bestsp = xfs_dir2_leaf_bests_p(ltp);
ASSERT(be16_to_cpu(bestsp[db]) == oldbest);
/*
* Mark the former data entry unused.
*/
- xfs_dir2_data_make_free(tp, dbp,
+ xfs_dir2_data_make_free(args, dbp,
(xfs_dir2_data_aoff_t)((char *)dep - (char *)hdr),
- xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan);
+ dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
/*
* We just mark the leaf entry stale by putting a null in it.
*/
leafhdr.stale++;
- xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
- xfs_dir3_leaf_log_header(tp, lbp);
+ dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, lbp);
lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
- xfs_dir3_leaf_log_ents(tp, lbp, index, index);
+ xfs_dir3_leaf_log_ents(args, lbp, index, index);
/*
* Scan the freespace in the data block again if necessary,
* log the data block header if necessary.
*/
if (needscan)
- xfs_dir2_data_freescan(mp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
if (needlog)
- xfs_dir2_data_log_header(tp, dbp);
+ xfs_dir2_data_log_header(args, dbp);
/*
* If the longest freespace in the data block has changed,
* put the new value in the bests table and log that.
*/
if (be16_to_cpu(bf[0].length) != oldbest) {
bestsp[db] = bf[0].length;
- xfs_dir3_leaf_log_bests(tp, lbp, db, db);
+ xfs_dir3_leaf_log_bests(args, lbp, db, db);
}
xfs_dir3_data_check(dp, dbp);
/*
* If the data block is now empty then get rid of the data block.
*/
if (be16_to_cpu(bf[0].length) ==
- mp->m_dirblksize - xfs_dir3_data_entry_offset(hdr)) {
- ASSERT(db != mp->m_dirdatablk);
+ args->geo->blksize - dp->d_ops->data_entry_offset) {
+ ASSERT(db != args->geo->datablk);
if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
/*
* Nope, can't get rid of it because it caused
@@ -1478,7 +1442,7 @@ xfs_dir2_leaf_removename(
*/
if (error == ENOSPC && args->total == 0)
error = 0;
- xfs_dir3_leaf_check(mp, lbp);
+ xfs_dir3_leaf_check(dp, lbp);
return error;
}
dbp = NULL;
@@ -1501,18 +1465,19 @@ xfs_dir2_leaf_removename(
memmove(&bestsp[db - i], bestsp,
(be32_to_cpu(ltp->bestcount) - (db - i)) * sizeof(*bestsp));
be32_add_cpu(&ltp->bestcount, -(db - i));
- xfs_dir3_leaf_log_tail(tp, lbp);
- xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0,
+ be32_to_cpu(ltp->bestcount) - 1);
} else
bestsp[db] = cpu_to_be16(NULLDATAOFF);
}
/*
* If the data block was not the first one, drop it.
*/
- else if (db != mp->m_dirdatablk)
+ else if (db != args->geo->datablk)
dbp = NULL;
- xfs_dir3_leaf_check(mp, lbp);
+ xfs_dir3_leaf_check(dp, lbp);
/*
* See if we can convert to block form.
*/
@@ -1547,7 +1512,7 @@ xfs_dir2_leaf_replace(
}
dp = args->dp;
leaf = lbp->b_addr;
- ents = xfs_dir3_leaf_ents_p(leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
/*
* Point to the leaf entry, get data address from it.
*/
@@ -1557,16 +1522,16 @@ xfs_dir2_leaf_replace(
*/
dep = (xfs_dir2_data_entry_t *)
((char *)dbp->b_addr +
- xfs_dir2_dataptr_to_off(dp->i_mount, be32_to_cpu(lep->address)));
+ xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address)));
ASSERT(args->inumber != be64_to_cpu(dep->inumber));
/*
* Put the new inode number in, log it.
*/
dep->inumber = cpu_to_be64(args->inumber);
- xfs_dir3_dirent_put_ftype(dp->i_mount, dep, args->filetype);
+ dp->d_ops->data_put_ftype(dep, args->filetype);
tp = args->trans;
- xfs_dir2_data_log_entry(tp, dbp, dep);
- xfs_dir3_leaf_check(dp->i_mount, lbp);
+ xfs_dir2_data_log_entry(args, dbp, dep);
+ xfs_dir3_leaf_check(dp, lbp);
xfs_trans_brelse(tp, lbp);
return 0;
}
@@ -1592,8 +1557,8 @@ xfs_dir2_leaf_search_hash(
struct xfs_dir3_icleaf_hdr leafhdr;
leaf = lbp->b_addr;
- ents = xfs_dir3_leaf_ents_p(leaf);
- xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = args->dp->d_ops->leaf_ents_p(leaf);
+ args->dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
/*
* Note, the table cannot be empty, so we have to go through the loop.
@@ -1651,22 +1616,23 @@ xfs_dir2_leaf_trim_data(
/*
* Read the offending data block. We need its buffer.
*/
- error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp);
+ error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(args->geo, db),
+ -1, &dbp);
if (error)
return error;
leaf = lbp->b_addr;
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
#ifdef DEBUG
{
struct xfs_dir2_data_hdr *hdr = dbp->b_addr;
- struct xfs_dir2_data_free *bf = xfs_dir3_data_bestfree_p(hdr);
+ struct xfs_dir2_data_free *bf = dp->d_ops->data_bestfree_p(hdr);
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) ||
hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
ASSERT(be16_to_cpu(bf[0].length) ==
- mp->m_dirblksize - xfs_dir3_data_entry_offset(hdr));
+ args->geo->blksize - dp->d_ops->data_entry_offset);
ASSERT(db == be32_to_cpu(ltp->bestcount) - 1);
}
#endif
@@ -1685,8 +1651,8 @@ xfs_dir2_leaf_trim_data(
bestsp = xfs_dir2_leaf_bests_p(ltp);
be32_add_cpu(&ltp->bestcount, -1);
memmove(&bestsp[1], &bestsp[0], be32_to_cpu(ltp->bestcount) * sizeof(*bestsp));
- xfs_dir3_leaf_log_tail(tp, lbp);
- xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
return 0;
}
@@ -1750,22 +1716,22 @@ xfs_dir2_node_to_leaf(
/*
* Get the last offset in the file.
*/
- if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK))) {
+ if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK))) {
return error;
}
- fo -= mp->m_dirblkfsbs;
+ fo -= args->geo->fsbcount;
/*
* If there are freespace blocks other than the first one,
* take this opportunity to remove trailing empty freespace blocks
* that may have been left behind during no-space-reservation
* operations.
*/
- while (fo > mp->m_dirfreeblk) {
+ while (fo > args->geo->freeblk) {
if ((error = xfs_dir2_node_trim_free(args, fo, &rval))) {
return error;
}
if (rval)
- fo -= mp->m_dirblkfsbs;
+ fo -= args->geo->fsbcount;
else
return 0;
}
@@ -1778,11 +1744,11 @@ xfs_dir2_node_to_leaf(
/*
* If it's not the single leaf block, give up.
*/
- if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + mp->m_dirblksize)
+ if (XFS_FSB_TO_B(mp, fo) > XFS_DIR2_LEAF_OFFSET + args->geo->blksize)
return 0;
lbp = state->path.blk[0].bp;
leaf = lbp->b_addr;
- xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
@@ -1790,11 +1756,11 @@ xfs_dir2_node_to_leaf(
/*
* Read the freespace block.
*/
- error = xfs_dir2_free_read(tp, dp, mp->m_dirfreeblk, &fbp);
+ error = xfs_dir2_free_read(tp, dp, args->geo->freeblk, &fbp);
if (error)
return error;
free = fbp->b_addr;
- xfs_dir3_free_hdr_from_disk(&freehdr, free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
ASSERT(!freehdr.firstdb);
@@ -1802,7 +1768,7 @@ xfs_dir2_node_to_leaf(
* Now see if the leafn and free data will fit in a leaf1.
* If not, release the buffer and give up.
*/
- if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > mp->m_dirblksize) {
+ if (xfs_dir3_leaf_size(&leafhdr, freehdr.nvalid) > args->geo->blksize) {
xfs_trans_brelse(tp, fbp);
return 0;
}
@@ -1822,25 +1788,27 @@ xfs_dir2_node_to_leaf(
/*
* Set up the leaf tail from the freespace block.
*/
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
ltp->bestcount = cpu_to_be32(freehdr.nvalid);
/*
* Set up the leaf bests table.
*/
- memcpy(xfs_dir2_leaf_bests_p(ltp), xfs_dir3_free_bests_p(mp, free),
+ memcpy(xfs_dir2_leaf_bests_p(ltp), dp->d_ops->free_bests_p(free),
freehdr.nvalid * sizeof(xfs_dir2_data_off_t));
- xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
- xfs_dir3_leaf_log_header(tp, lbp);
- xfs_dir3_leaf_log_bests(tp, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
- xfs_dir3_leaf_log_tail(tp, lbp);
- xfs_dir3_leaf_check(mp, lbp);
+ dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, lbp);
+ xfs_dir3_leaf_log_bests(args, lbp, 0, be32_to_cpu(ltp->bestcount) - 1);
+ xfs_dir3_leaf_log_tail(args, lbp);
+ xfs_dir3_leaf_check(dp, lbp);
/*
* Get rid of the freespace block.
*/
- error = xfs_dir2_shrink_inode(args, XFS_DIR2_FREE_FIRSTDB(mp), fbp);
+ error = xfs_dir2_shrink_inode(args,
+ xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET),
+ fbp);
if (error) {
/*
* This can't fail here because it can only happen when
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 4c3dba7ffb7..da43d304fca 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -18,22 +18,21 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
-#include "xfs_dir2_format.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
@@ -55,21 +54,21 @@ static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
* Check internal consistency of a leafn block.
*/
#ifdef DEBUG
-#define xfs_dir3_leaf_check(mp, bp) \
+#define xfs_dir3_leaf_check(dp, bp) \
do { \
- if (!xfs_dir3_leafn_check((mp), (bp))) \
+ if (!xfs_dir3_leafn_check((dp), (bp))) \
ASSERT(0); \
} while (0);
static bool
xfs_dir3_leafn_check(
- struct xfs_mount *mp,
+ struct xfs_inode *dp,
struct xfs_buf *bp)
{
struct xfs_dir2_leaf *leaf = bp->b_addr;
struct xfs_dir3_icleaf_hdr leafhdr;
- xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) {
struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
@@ -78,10 +77,10 @@ xfs_dir3_leafn_check(
} else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC)
return false;
- return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf);
+ return xfs_dir3_leaf_check_int(dp->i_mount, dp, &leafhdr, leaf);
}
#else
-#define xfs_dir3_leaf_check(mp, bp)
+#define xfs_dir3_leaf_check(dp, bp)
#endif
static bool
@@ -116,13 +115,14 @@ xfs_dir3_free_read_verify(
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- if ((xfs_sb_version_hascrc(&mp->m_sb) &&
- !xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
- XFS_DIR3_FREE_CRC_OFF)) ||
- !xfs_dir3_free_verify(bp)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_dir3_free_verify(bp))
xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
}
static void
@@ -134,8 +134,8 @@ xfs_dir3_free_write_verify(
struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
if (!xfs_dir3_free_verify(bp)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
return;
}
@@ -145,7 +145,7 @@ xfs_dir3_free_write_verify(
if (bip)
hdr3->lsn = cpu_to_be64(bip->bli_item.li_lsn);
- xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), XFS_DIR3_FREE_CRC_OFF);
+ xfs_buf_update_cksum(bp, XFS_DIR3_FREE_CRC_OFF);
}
const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
@@ -193,66 +193,20 @@ xfs_dir2_free_try_read(
return __xfs_dir3_free_read(tp, dp, fbno, -2, bpp);
}
-
-void
-xfs_dir3_free_hdr_from_disk(
- struct xfs_dir3_icfree_hdr *to,
- struct xfs_dir2_free *from)
-{
- if (from->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)) {
- to->magic = be32_to_cpu(from->hdr.magic);
- to->firstdb = be32_to_cpu(from->hdr.firstdb);
- to->nvalid = be32_to_cpu(from->hdr.nvalid);
- to->nused = be32_to_cpu(from->hdr.nused);
- } else {
- struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)from;
-
- to->magic = be32_to_cpu(hdr3->hdr.magic);
- to->firstdb = be32_to_cpu(hdr3->firstdb);
- to->nvalid = be32_to_cpu(hdr3->nvalid);
- to->nused = be32_to_cpu(hdr3->nused);
- }
-
- ASSERT(to->magic == XFS_DIR2_FREE_MAGIC ||
- to->magic == XFS_DIR3_FREE_MAGIC);
-}
-
-static void
-xfs_dir3_free_hdr_to_disk(
- struct xfs_dir2_free *to,
- struct xfs_dir3_icfree_hdr *from)
-{
- ASSERT(from->magic == XFS_DIR2_FREE_MAGIC ||
- from->magic == XFS_DIR3_FREE_MAGIC);
-
- if (from->magic == XFS_DIR2_FREE_MAGIC) {
- to->hdr.magic = cpu_to_be32(from->magic);
- to->hdr.firstdb = cpu_to_be32(from->firstdb);
- to->hdr.nvalid = cpu_to_be32(from->nvalid);
- to->hdr.nused = cpu_to_be32(from->nused);
- } else {
- struct xfs_dir3_free_hdr *hdr3 = (struct xfs_dir3_free_hdr *)to;
-
- hdr3->hdr.magic = cpu_to_be32(from->magic);
- hdr3->firstdb = cpu_to_be32(from->firstdb);
- hdr3->nvalid = cpu_to_be32(from->nvalid);
- hdr3->nused = cpu_to_be32(from->nused);
- }
-}
-
static int
xfs_dir3_free_get_buf(
- struct xfs_trans *tp,
- struct xfs_inode *dp,
+ xfs_da_args_t *args,
xfs_dir2_db_t fbno,
struct xfs_buf **bpp)
{
+ struct xfs_trans *tp = args->trans;
+ struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
struct xfs_buf *bp;
int error;
struct xfs_dir3_icfree_hdr hdr;
- error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fbno),
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(args->geo, fbno),
-1, &bp, XFS_DATA_FORK);
if (error)
return error;
@@ -277,7 +231,7 @@ xfs_dir3_free_get_buf(
uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid);
} else
hdr.magic = XFS_DIR2_FREE_MAGIC;
- xfs_dir3_free_hdr_to_disk(bp->b_addr, &hdr);
+ dp->d_ops->free_hdr_to_disk(bp->b_addr, &hdr);
*bpp = bp;
return 0;
}
@@ -287,7 +241,7 @@ xfs_dir3_free_get_buf(
*/
STATIC void
xfs_dir2_free_log_bests(
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp,
int first, /* first entry to log */
int last) /* last entry to log */
@@ -296,10 +250,10 @@ xfs_dir2_free_log_bests(
__be16 *bests;
free = bp->b_addr;
- bests = xfs_dir3_free_bests_p(tp->t_mountp, free);
+ bests = args->dp->d_ops->free_bests_p(free);
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
- xfs_trans_log_buf(tp, bp,
+ xfs_trans_log_buf(args->trans, bp,
(uint)((char *)&bests[first] - (char *)free),
(uint)((char *)&bests[last] - (char *)free +
sizeof(bests[0]) - 1));
@@ -310,7 +264,7 @@ xfs_dir2_free_log_bests(
*/
static void
xfs_dir2_free_log_header(
- struct xfs_trans *tp,
+ struct xfs_da_args *args,
struct xfs_buf *bp)
{
#ifdef DEBUG
@@ -320,7 +274,8 @@ xfs_dir2_free_log_header(
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
#endif
- xfs_trans_log_buf(tp, bp, 0, xfs_dir3_free_hdr_size(tp->t_mountp) - 1);
+ xfs_trans_log_buf(args->trans, bp, 0,
+ args->dp->d_ops->free_hdr_size - 1);
}
/*
@@ -360,27 +315,27 @@ xfs_dir2_leaf_to_node(
if ((error = xfs_dir2_grow_inode(args, XFS_DIR2_FREE_SPACE, &fdb))) {
return error;
}
- ASSERT(fdb == XFS_DIR2_FREE_FIRSTDB(mp));
+ ASSERT(fdb == xfs_dir2_byte_to_db(args->geo, XFS_DIR2_FREE_OFFSET));
/*
* Get the buffer for the new freespace block.
*/
- error = xfs_dir3_free_get_buf(tp, dp, fdb, &fbp);
+ error = xfs_dir3_free_get_buf(args, fdb, &fbp);
if (error)
return error;
free = fbp->b_addr;
- xfs_dir3_free_hdr_from_disk(&freehdr, free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
leaf = lbp->b_addr;
- ltp = xfs_dir2_leaf_tail_p(mp, leaf);
+ ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
ASSERT(be32_to_cpu(ltp->bestcount) <=
- (uint)dp->i_d.di_size / mp->m_dirblksize);
+ (uint)dp->i_d.di_size / args->geo->blksize);
/*
* Copy freespace entries from the leaf block to the new block.
* Count active entries.
*/
from = xfs_dir2_leaf_bests_p(ltp);
- to = xfs_dir3_free_bests_p(mp, free);
+ to = dp->d_ops->free_bests_p(free);
for (i = n = 0; i < be32_to_cpu(ltp->bestcount); i++, from++, to++) {
if ((off = be16_to_cpu(*from)) != NULLDATAOFF)
n++;
@@ -393,9 +348,9 @@ xfs_dir2_leaf_to_node(
freehdr.nused = n;
freehdr.nvalid = be32_to_cpu(ltp->bestcount);
- xfs_dir3_free_hdr_to_disk(fbp->b_addr, &freehdr);
- xfs_dir2_free_log_bests(tp, fbp, 0, freehdr.nvalid - 1);
- xfs_dir2_free_log_header(tp, fbp);
+ dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
+ xfs_dir2_free_log_bests(args, fbp, 0, freehdr.nvalid - 1);
+ xfs_dir2_free_log_header(args, fbp);
/*
* Converting the leaf to a leafnode is just a matter of changing the
@@ -409,8 +364,8 @@ xfs_dir2_leaf_to_node(
leaf->hdr.info.magic = cpu_to_be16(XFS_DIR3_LEAFN_MAGIC);
lbp->b_ops = &xfs_dir3_leafn_buf_ops;
xfs_trans_buf_set_type(tp, lbp, XFS_BLFT_DIR_LEAFN_BUF);
- xfs_dir3_leaf_log_header(tp, lbp);
- xfs_dir3_leaf_check(mp, lbp);
+ xfs_dir3_leaf_log_header(args, lbp);
+ xfs_dir3_leaf_check(dp, lbp);
return 0;
}
@@ -443,8 +398,8 @@ xfs_dir2_leafn_add(
mp = dp->i_mount;
tp = args->trans;
leaf = bp->b_addr;
- xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
- ents = xfs_dir3_leaf_ents_p(leaf);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
/*
* Quick check just to make sure we are not going to index
@@ -460,7 +415,7 @@ xfs_dir2_leafn_add(
* a compact.
*/
- if (leafhdr.count == xfs_dir3_max_leaf_ents(mp, leaf)) {
+ if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) {
if (!leafhdr.stale)
return XFS_ERROR(ENOSPC);
compact = leafhdr.stale > 1;
@@ -495,33 +450,34 @@ xfs_dir2_leafn_add(
highstale, &lfloglow, &lfloghigh);
lep->hashval = cpu_to_be32(args->hashval);
- lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(mp,
+ lep->address = cpu_to_be32(xfs_dir2_db_off_to_dataptr(args->geo,
args->blkno, args->index));
- xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
- xfs_dir3_leaf_log_header(tp, bp);
- xfs_dir3_leaf_log_ents(tp, bp, lfloglow, lfloghigh);
- xfs_dir3_leaf_check(mp, bp);
+ dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, bp);
+ xfs_dir3_leaf_log_ents(args, bp, lfloglow, lfloghigh);
+ xfs_dir3_leaf_check(dp, bp);
return 0;
}
#ifdef DEBUG
static void
xfs_dir2_free_hdr_check(
- struct xfs_mount *mp,
+ struct xfs_inode *dp,
struct xfs_buf *bp,
xfs_dir2_db_t db)
{
struct xfs_dir3_icfree_hdr hdr;
- xfs_dir3_free_hdr_from_disk(&hdr, bp->b_addr);
+ dp->d_ops->free_hdr_from_disk(&hdr, bp->b_addr);
- ASSERT((hdr.firstdb % xfs_dir3_free_max_bests(mp)) == 0);
+ ASSERT((hdr.firstdb %
+ dp->d_ops->free_max_bests(dp->i_mount->m_dir_geo)) == 0);
ASSERT(hdr.firstdb <= db);
ASSERT(db < hdr.firstdb + hdr.nvalid);
}
#else
-#define xfs_dir2_free_hdr_check(mp, dp, db)
+#define xfs_dir2_free_hdr_check(dp, bp, db)
#endif /* DEBUG */
/*
@@ -530,6 +486,7 @@ xfs_dir2_free_hdr_check(
*/
xfs_dahash_t /* hash value */
xfs_dir2_leafn_lasthash(
+ struct xfs_inode *dp,
struct xfs_buf *bp, /* leaf buffer */
int *count) /* count of entries in leaf */
{
@@ -537,7 +494,7 @@ xfs_dir2_leafn_lasthash(
struct xfs_dir2_leaf_entry *ents;
struct xfs_dir3_icleaf_hdr leafhdr;
- xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
ASSERT(leafhdr.magic == XFS_DIR2_LEAFN_MAGIC ||
leafhdr.magic == XFS_DIR3_LEAFN_MAGIC);
@@ -547,7 +504,7 @@ xfs_dir2_leafn_lasthash(
if (!leafhdr.count)
return 0;
- ents = xfs_dir3_leaf_ents_p(leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
return be32_to_cpu(ents[leafhdr.count - 1].hashval);
}
@@ -584,10 +541,10 @@ xfs_dir2_leafn_lookup_for_addname(
tp = args->trans;
mp = dp->i_mount;
leaf = bp->b_addr;
- xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
- ents = xfs_dir3_leaf_ents_p(leaf);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
- xfs_dir3_leaf_check(mp, bp);
+ xfs_dir3_leaf_check(dp, bp);
ASSERT(leafhdr.count > 0);
/*
@@ -605,7 +562,7 @@ xfs_dir2_leafn_lookup_for_addname(
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC) ||
free->hdr.magic == cpu_to_be32(XFS_DIR3_FREE_MAGIC));
}
- length = xfs_dir3_data_entsize(mp, args->namelen);
+ length = dp->d_ops->data_entsize(args->namelen);
/*
* Loop over leaf entries with the right hash value.
*/
@@ -620,7 +577,8 @@ xfs_dir2_leafn_lookup_for_addname(
/*
* Pull the data block number from the entry.
*/
- newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ newdb = xfs_dir2_dataptr_to_db(args->geo,
+ be32_to_cpu(lep->address));
/*
* For addname, we're looking for a place to put the new entry.
* We want to use a data block with an entry of equal
@@ -637,7 +595,7 @@ xfs_dir2_leafn_lookup_for_addname(
* Convert the data block to the free block
* holding its freespace information.
*/
- newfdb = xfs_dir2_db_to_fdb(mp, newdb);
+ newfdb = dp->d_ops->db_to_fdb(args->geo, newdb);
/*
* If it's not the one we have in hand, read it in.
*/
@@ -649,22 +607,23 @@ xfs_dir2_leafn_lookup_for_addname(
xfs_trans_brelse(tp, curbp);
error = xfs_dir2_free_read(tp, dp,
- xfs_dir2_db_to_da(mp, newfdb),
+ xfs_dir2_db_to_da(args->geo,
+ newfdb),
&curbp);
if (error)
return error;
free = curbp->b_addr;
- xfs_dir2_free_hdr_check(mp, curbp, curdb);
+ xfs_dir2_free_hdr_check(dp, curbp, curdb);
}
/*
* Get the index for our entry.
*/
- fi = xfs_dir2_db_to_fdindex(mp, curdb);
+ fi = dp->d_ops->db_to_fdindex(args->geo, curdb);
/*
* If it has room, return it.
*/
- bests = xfs_dir3_free_bests_p(mp, free);
+ bests = dp->d_ops->free_bests_p(free);
if (unlikely(bests[fi] == cpu_to_be16(NULLDATAOFF))) {
XFS_ERROR_REPORT("xfs_dir2_leafn_lookup_int",
XFS_ERRLEVEL_LOW, mp);
@@ -734,10 +693,10 @@ xfs_dir2_leafn_lookup_for_entry(
tp = args->trans;
mp = dp->i_mount;
leaf = bp->b_addr;
- xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
- ents = xfs_dir3_leaf_ents_p(leaf);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
- xfs_dir3_leaf_check(mp, bp);
+ xfs_dir3_leaf_check(dp, bp);
ASSERT(leafhdr.count > 0);
/*
@@ -765,7 +724,8 @@ xfs_dir2_leafn_lookup_for_entry(
/*
* Pull the data block number from the entry.
*/
- newdb = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ newdb = xfs_dir2_dataptr_to_db(args->geo,
+ be32_to_cpu(lep->address));
/*
* Not adding a new entry, so we really want to find
* the name given to us.
@@ -790,7 +750,8 @@ xfs_dir2_leafn_lookup_for_entry(
curbp = state->extrablk.bp;
} else {
error = xfs_dir3_data_read(tp, dp,
- xfs_dir2_db_to_da(mp, newdb),
+ xfs_dir2_db_to_da(args->geo,
+ newdb),
-1, &curbp);
if (error)
return error;
@@ -802,7 +763,8 @@ xfs_dir2_leafn_lookup_for_entry(
* Point to the data entry.
*/
dep = (xfs_dir2_data_entry_t *)((char *)curbp->b_addr +
- xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address)));
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(lep->address)));
/*
* Compare the entry and if it's an exact match, return
* EEXIST immediately. If it's the first case-insensitive
@@ -816,7 +778,7 @@ xfs_dir2_leafn_lookup_for_entry(
xfs_trans_brelse(tp, state->extrablk.bp);
args->cmpresult = cmp;
args->inumber = be64_to_cpu(dep->inumber);
- args->filetype = xfs_dir3_dirent_get_ftype(mp, dep);
+ args->filetype = dp->d_ops->data_get_ftype(dep);
*indexp = index;
state->extravalid = 1;
state->extrablk.bp = curbp;
@@ -888,7 +850,6 @@ xfs_dir3_leafn_moveents(
int start_d,/* destination leaf index */
int count) /* count of leaves to copy */
{
- struct xfs_trans *tp = args->trans;
int stale; /* count stale leaves copied */
trace_xfs_dir2_leafn_moveents(args, start_s, start_d, count);
@@ -907,7 +868,7 @@ xfs_dir3_leafn_moveents(
if (start_d < dhdr->count) {
memmove(&dents[start_d + count], &dents[start_d],
(dhdr->count - start_d) * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir3_leaf_log_ents(tp, bp_d, start_d + count,
+ xfs_dir3_leaf_log_ents(args, bp_d, start_d + count,
count + dhdr->count - 1);
}
/*
@@ -929,7 +890,7 @@ xfs_dir3_leafn_moveents(
*/
memcpy(&dents[start_d], &sents[start_s],
count * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir3_leaf_log_ents(tp, bp_d, start_d, start_d + count - 1);
+ xfs_dir3_leaf_log_ents(args, bp_d, start_d, start_d + count - 1);
/*
* If there are source entries after the ones we copied,
@@ -938,7 +899,7 @@ xfs_dir3_leafn_moveents(
if (start_s + count < shdr->count) {
memmove(&sents[start_s], &sents[start_s + count],
count * sizeof(xfs_dir2_leaf_entry_t));
- xfs_dir3_leaf_log_ents(tp, bp_s, start_s, start_s + count - 1);
+ xfs_dir3_leaf_log_ents(args, bp_s, start_s, start_s + count - 1);
}
/*
@@ -956,6 +917,7 @@ xfs_dir3_leafn_moveents(
*/
int /* sort order */
xfs_dir2_leafn_order(
+ struct xfs_inode *dp,
struct xfs_buf *leaf1_bp, /* leaf1 buffer */
struct xfs_buf *leaf2_bp) /* leaf2 buffer */
{
@@ -966,10 +928,10 @@ xfs_dir2_leafn_order(
struct xfs_dir3_icleaf_hdr hdr1;
struct xfs_dir3_icleaf_hdr hdr2;
- xfs_dir3_leaf_hdr_from_disk(&hdr1, leaf1);
- xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf2);
- ents1 = xfs_dir3_leaf_ents_p(leaf1);
- ents2 = xfs_dir3_leaf_ents_p(leaf2);
+ dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
+ dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
+ ents1 = dp->d_ops->leaf_ents_p(leaf1);
+ ents2 = dp->d_ops->leaf_ents_p(leaf2);
if (hdr1.count > 0 && hdr2.count > 0 &&
(be32_to_cpu(ents2[0].hashval) < be32_to_cpu(ents1[0].hashval) ||
@@ -1007,12 +969,13 @@ xfs_dir2_leafn_rebalance(
struct xfs_dir2_leaf_entry *ents2;
struct xfs_dir3_icleaf_hdr hdr1;
struct xfs_dir3_icleaf_hdr hdr2;
+ struct xfs_inode *dp = state->args->dp;
args = state->args;
/*
* If the block order is wrong, swap the arguments.
*/
- if ((swap = xfs_dir2_leafn_order(blk1->bp, blk2->bp))) {
+ if ((swap = xfs_dir2_leafn_order(dp, blk1->bp, blk2->bp))) {
xfs_da_state_blk_t *tmp; /* temp for block swap */
tmp = blk1;
@@ -1021,10 +984,10 @@ xfs_dir2_leafn_rebalance(
}
leaf1 = blk1->bp->b_addr;
leaf2 = blk2->bp->b_addr;
- xfs_dir3_leaf_hdr_from_disk(&hdr1, leaf1);
- xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf2);
- ents1 = xfs_dir3_leaf_ents_p(leaf1);
- ents2 = xfs_dir3_leaf_ents_p(leaf2);
+ dp->d_ops->leaf_hdr_from_disk(&hdr1, leaf1);
+ dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf2);
+ ents1 = dp->d_ops->leaf_ents_p(leaf1);
+ ents2 = dp->d_ops->leaf_ents_p(leaf2);
oldsum = hdr1.count + hdr2.count;
#if defined(DEBUG) || defined(XFS_WARN)
@@ -1070,13 +1033,13 @@ xfs_dir2_leafn_rebalance(
ASSERT(hdr1.stale + hdr2.stale == oldstale);
/* log the changes made when moving the entries */
- xfs_dir3_leaf_hdr_to_disk(leaf1, &hdr1);
- xfs_dir3_leaf_hdr_to_disk(leaf2, &hdr2);
- xfs_dir3_leaf_log_header(args->trans, blk1->bp);
- xfs_dir3_leaf_log_header(args->trans, blk2->bp);
+ dp->d_ops->leaf_hdr_to_disk(leaf1, &hdr1);
+ dp->d_ops->leaf_hdr_to_disk(leaf2, &hdr2);
+ xfs_dir3_leaf_log_header(args, blk1->bp);
+ xfs_dir3_leaf_log_header(args, blk2->bp);
- xfs_dir3_leaf_check(args->dp->i_mount, blk1->bp);
- xfs_dir3_leaf_check(args->dp->i_mount, blk2->bp);
+ xfs_dir3_leaf_check(dp, blk1->bp);
+ xfs_dir3_leaf_check(dp, blk2->bp);
/*
* Mark whether we're inserting into the old or new leaf.
@@ -1097,11 +1060,11 @@ xfs_dir2_leafn_rebalance(
* Finally sanity check just to make sure we are not returning a
* negative index
*/
- if(blk2->index < 0) {
+ if (blk2->index < 0) {
state->inleaf = 1;
blk2->index = 0;
- xfs_alert(args->dp->i_mount,
- "%s: picked the wrong leaf? reverting original leaf: blk1->index %d\n",
+ xfs_alert(dp->i_mount,
+ "%s: picked the wrong leaf? reverting original leaf: blk1->index %d",
__func__, blk1->index);
}
}
@@ -1116,21 +1079,20 @@ xfs_dir3_data_block_free(
struct xfs_buf *fbp,
int longest)
{
- struct xfs_trans *tp = args->trans;
int logfree = 0;
__be16 *bests;
struct xfs_dir3_icfree_hdr freehdr;
+ struct xfs_inode *dp = args->dp;
- xfs_dir3_free_hdr_from_disk(&freehdr, free);
-
- bests = xfs_dir3_free_bests_p(tp->t_mountp, free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ bests = dp->d_ops->free_bests_p(free);
if (hdr) {
/*
* Data block is not empty, just set the free entry to the new
* value.
*/
bests[findex] = cpu_to_be16(longest);
- xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+ xfs_dir2_free_log_bests(args, fbp, findex, findex);
return 0;
}
@@ -1157,8 +1119,8 @@ xfs_dir3_data_block_free(
logfree = 1;
}
- xfs_dir3_free_hdr_to_disk(free, &freehdr);
- xfs_dir2_free_log_header(tp, fbp);
+ dp->d_ops->free_hdr_to_disk(free, &freehdr);
+ xfs_dir2_free_log_header(args, fbp);
/*
* If there are no useful entries left in the block, get rid of the
@@ -1182,7 +1144,7 @@ xfs_dir3_data_block_free(
/* Log the free entry that changed, unless we got rid of it. */
if (logfree)
- xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+ xfs_dir2_free_log_bests(args, fbp, findex, findex);
return 0;
}
@@ -1222,8 +1184,8 @@ xfs_dir2_leafn_remove(
tp = args->trans;
mp = dp->i_mount;
leaf = bp->b_addr;
- xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
- ents = xfs_dir3_leaf_ents_p(leaf);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
/*
* Point to the entry we're removing.
@@ -1233,9 +1195,9 @@ xfs_dir2_leafn_remove(
/*
* Extract the data block and offset from the entry.
*/
- db = xfs_dir2_dataptr_to_db(mp, be32_to_cpu(lep->address));
+ db = xfs_dir2_dataptr_to_db(args->geo, be32_to_cpu(lep->address));
ASSERT(dblk->blkno == db);
- off = xfs_dir2_dataptr_to_off(mp, be32_to_cpu(lep->address));
+ off = xfs_dir2_dataptr_to_off(args->geo, be32_to_cpu(lep->address));
ASSERT(dblk->index == off);
/*
@@ -1243,11 +1205,11 @@ xfs_dir2_leafn_remove(
* Log the leaf block changes.
*/
leafhdr.stale++;
- xfs_dir3_leaf_hdr_to_disk(leaf, &leafhdr);
- xfs_dir3_leaf_log_header(tp, bp);
+ dp->d_ops->leaf_hdr_to_disk(leaf, &leafhdr);
+ xfs_dir3_leaf_log_header(args, bp);
lep->address = cpu_to_be32(XFS_DIR2_NULL_DATAPTR);
- xfs_dir3_leaf_log_ents(tp, bp, index, index);
+ xfs_dir3_leaf_log_ents(args, bp, index, index);
/*
* Make the data entry free. Keep track of the longest freespace
@@ -1256,19 +1218,19 @@ xfs_dir2_leafn_remove(
dbp = dblk->bp;
hdr = dbp->b_addr;
dep = (xfs_dir2_data_entry_t *)((char *)hdr + off);
- bf = xfs_dir3_data_bestfree_p(hdr);
+ bf = dp->d_ops->data_bestfree_p(hdr);
longest = be16_to_cpu(bf[0].length);
needlog = needscan = 0;
- xfs_dir2_data_make_free(tp, dbp, off,
- xfs_dir3_data_entsize(mp, dep->namelen), &needlog, &needscan);
+ xfs_dir2_data_make_free(args, dbp, off,
+ dp->d_ops->data_entsize(dep->namelen), &needlog, &needscan);
/*
* Rescan the data block freespaces for bestfree.
* Log the data block header if needed.
*/
if (needscan)
- xfs_dir2_data_freescan(mp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
if (needlog)
- xfs_dir2_data_log_header(tp, dbp);
+ xfs_dir2_data_log_header(args, dbp);
xfs_dir3_data_check(dp, dbp);
/*
* If the longest data block freespace changes, need to update
@@ -1285,8 +1247,9 @@ xfs_dir2_leafn_remove(
* Convert the data block number to a free block,
* read in the free block.
*/
- fdb = xfs_dir2_db_to_fdb(mp, db);
- error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb),
+ fdb = dp->d_ops->db_to_fdb(args->geo, db);
+ error = xfs_dir2_free_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, fdb),
&fbp);
if (error)
return error;
@@ -1294,22 +1257,23 @@ xfs_dir2_leafn_remove(
#ifdef DEBUG
{
struct xfs_dir3_icfree_hdr freehdr;
- xfs_dir3_free_hdr_from_disk(&freehdr, free);
- ASSERT(freehdr.firstdb == xfs_dir3_free_max_bests(mp) *
- (fdb - XFS_DIR2_FREE_FIRSTDB(mp)));
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
+ ASSERT(freehdr.firstdb == dp->d_ops->free_max_bests(args->geo) *
+ (fdb - xfs_dir2_byte_to_db(args->geo,
+ XFS_DIR2_FREE_OFFSET)));
}
#endif
/*
* Calculate which entry we need to fix.
*/
- findex = xfs_dir2_db_to_fdindex(mp, db);
+ findex = dp->d_ops->db_to_fdindex(args->geo, db);
longest = be16_to_cpu(bf[0].length);
/*
* If the data block is now empty we can get rid of it
* (usually).
*/
- if (longest == mp->m_dirblksize -
- xfs_dir3_data_entry_offset(hdr)) {
+ if (longest == args->geo->blksize -
+ dp->d_ops->data_entry_offset) {
/*
* Try to punch out the data block.
*/
@@ -1336,14 +1300,14 @@ xfs_dir2_leafn_remove(
return error;
}
- xfs_dir3_leaf_check(mp, bp);
+ xfs_dir3_leaf_check(dp, bp);
/*
* Return indication of whether this leaf block is empty enough
* to justify trying to join it with a neighbor.
*/
- *rval = (xfs_dir3_leaf_hdr_size(leaf) +
+ *rval = (dp->d_ops->leaf_hdr_size +
(uint)sizeof(ents[0]) * (leafhdr.count - leafhdr.stale)) <
- mp->m_dir_magicpct;
+ args->geo->magicpct;
return 0;
}
@@ -1360,13 +1324,14 @@ xfs_dir2_leafn_split(
xfs_dablk_t blkno; /* new leaf block number */
int error; /* error return value */
xfs_mount_t *mp; /* filesystem mount point */
+ struct xfs_inode *dp;
/*
* Allocate space for a new leaf node.
*/
args = state->args;
- mp = args->dp->i_mount;
- ASSERT(args != NULL);
+ dp = args->dp;
+ mp = dp->i_mount;
ASSERT(oldblk->magic == XFS_DIR2_LEAFN_MAGIC);
error = xfs_da_grow_inode(args, &blkno);
if (error) {
@@ -1375,7 +1340,7 @@ xfs_dir2_leafn_split(
/*
* Initialize the new leaf block.
*/
- error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(mp, blkno),
+ error = xfs_dir3_leaf_get_buf(args, xfs_dir2_da_to_db(args->geo, blkno),
&newblk->bp, XFS_DIR2_LEAFN_MAGIC);
if (error)
return error;
@@ -1401,10 +1366,10 @@ xfs_dir2_leafn_split(
/*
* Update last hashval in each block since we added the name.
*/
- oldblk->hashval = xfs_dir2_leafn_lasthash(oldblk->bp, NULL);
- newblk->hashval = xfs_dir2_leafn_lasthash(newblk->bp, NULL);
- xfs_dir3_leaf_check(mp, oldblk->bp);
- xfs_dir3_leaf_check(mp, newblk->bp);
+ oldblk->hashval = xfs_dir2_leafn_lasthash(dp, oldblk->bp, NULL);
+ newblk->hashval = xfs_dir2_leafn_lasthash(dp, newblk->bp, NULL);
+ xfs_dir3_leaf_check(dp, oldblk->bp);
+ xfs_dir3_leaf_check(dp, newblk->bp);
return error;
}
@@ -1434,6 +1399,7 @@ xfs_dir2_leafn_toosmall(
int rval; /* result from path_shift */
struct xfs_dir3_icleaf_hdr leafhdr;
struct xfs_dir2_leaf_entry *ents;
+ struct xfs_inode *dp = state->args->dp;
/*
* Check for the degenerate case of the block being over 50% full.
@@ -1442,13 +1408,13 @@ xfs_dir2_leafn_toosmall(
*/
blk = &state->path.blk[state->path.active - 1];
leaf = blk->bp->b_addr;
- xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
- ents = xfs_dir3_leaf_ents_p(leaf);
- xfs_dir3_leaf_check(state->args->dp->i_mount, blk->bp);
+ dp->d_ops->leaf_hdr_from_disk(&leafhdr, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
+ xfs_dir3_leaf_check(dp, blk->bp);
count = leafhdr.count - leafhdr.stale;
- bytes = xfs_dir3_leaf_hdr_size(leaf) + count * sizeof(ents[0]);
- if (bytes > (state->blocksize >> 1)) {
+ bytes = dp->d_ops->leaf_hdr_size + count * sizeof(ents[0]);
+ if (bytes > (state->args->geo->blksize >> 1)) {
/*
* Blk over 50%, don't try to join.
*/
@@ -1492,7 +1458,7 @@ xfs_dir2_leafn_toosmall(
/*
* Read the sibling leaf block.
*/
- error = xfs_dir3_leafn_read(state->args->trans, state->args->dp,
+ error = xfs_dir3_leafn_read(state->args->trans, dp,
blkno, -1, &bp);
if (error)
return error;
@@ -1501,11 +1467,12 @@ xfs_dir2_leafn_toosmall(
* Count bytes in the two blocks combined.
*/
count = leafhdr.count - leafhdr.stale;
- bytes = state->blocksize - (state->blocksize >> 2);
+ bytes = state->args->geo->blksize -
+ (state->args->geo->blksize >> 2);
leaf = bp->b_addr;
- xfs_dir3_leaf_hdr_from_disk(&hdr2, leaf);
- ents = xfs_dir3_leaf_ents_p(leaf);
+ dp->d_ops->leaf_hdr_from_disk(&hdr2, leaf);
+ ents = dp->d_ops->leaf_ents_p(leaf);
count += hdr2.count - hdr2.stale;
bytes -= count * sizeof(ents[0]);
@@ -1559,6 +1526,7 @@ xfs_dir2_leafn_unbalance(
struct xfs_dir3_icleaf_hdr drophdr;
struct xfs_dir2_leaf_entry *sents;
struct xfs_dir2_leaf_entry *dents;
+ struct xfs_inode *dp = state->args->dp;
args = state->args;
ASSERT(drop_blk->magic == XFS_DIR2_LEAFN_MAGIC);
@@ -1566,10 +1534,10 @@ xfs_dir2_leafn_unbalance(
drop_leaf = drop_blk->bp->b_addr;
save_leaf = save_blk->bp->b_addr;
- xfs_dir3_leaf_hdr_from_disk(&savehdr, save_leaf);
- xfs_dir3_leaf_hdr_from_disk(&drophdr, drop_leaf);
- sents = xfs_dir3_leaf_ents_p(save_leaf);
- dents = xfs_dir3_leaf_ents_p(drop_leaf);
+ dp->d_ops->leaf_hdr_from_disk(&savehdr, save_leaf);
+ dp->d_ops->leaf_hdr_from_disk(&drophdr, drop_leaf);
+ sents = dp->d_ops->leaf_ents_p(save_leaf);
+ dents = dp->d_ops->leaf_ents_p(drop_leaf);
/*
* If there are any stale leaf entries, take this opportunity
@@ -1584,7 +1552,7 @@ xfs_dir2_leafn_unbalance(
* Move the entries from drop to the appropriate end of save.
*/
drop_blk->hashval = be32_to_cpu(dents[drophdr.count - 1].hashval);
- if (xfs_dir2_leafn_order(save_blk->bp, drop_blk->bp))
+ if (xfs_dir2_leafn_order(dp, save_blk->bp, drop_blk->bp))
xfs_dir3_leafn_moveents(args, drop_blk->bp, &drophdr, dents, 0,
save_blk->bp, &savehdr, sents, 0,
drophdr.count);
@@ -1595,13 +1563,13 @@ xfs_dir2_leafn_unbalance(
save_blk->hashval = be32_to_cpu(sents[savehdr.count - 1].hashval);
/* log the changes made when moving the entries */
- xfs_dir3_leaf_hdr_to_disk(save_leaf, &savehdr);
- xfs_dir3_leaf_hdr_to_disk(drop_leaf, &drophdr);
- xfs_dir3_leaf_log_header(args->trans, save_blk->bp);
- xfs_dir3_leaf_log_header(args->trans, drop_blk->bp);
+ dp->d_ops->leaf_hdr_to_disk(save_leaf, &savehdr);
+ dp->d_ops->leaf_hdr_to_disk(drop_leaf, &drophdr);
+ xfs_dir3_leaf_log_header(args, save_blk->bp);
+ xfs_dir3_leaf_log_header(args, drop_blk->bp);
- xfs_dir3_leaf_check(args->dp->i_mount, save_blk->bp);
- xfs_dir3_leaf_check(args->dp->i_mount, drop_blk->bp);
+ xfs_dir3_leaf_check(dp, save_blk->bp);
+ xfs_dir3_leaf_check(dp, drop_blk->bp);
}
/*
@@ -1624,8 +1592,6 @@ xfs_dir2_node_addname(
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_dirblksize;
- state->node_ents = state->mp->m_dir_node_ents;
/*
* Look up the name. We're not supposed to find it, but
* this gives us the insertion point.
@@ -1712,7 +1678,7 @@ xfs_dir2_node_addname_int(
dp = args->dp;
mp = dp->i_mount;
tp = args->trans;
- length = xfs_dir3_data_entsize(mp, args->namelen);
+ length = dp->d_ops->data_entsize(args->namelen);
/*
* If we came in with a freespace block that means that lookup
* found an entry with our hash value. This is the freespace
@@ -1726,8 +1692,8 @@ xfs_dir2_node_addname_int(
ifbno = fblk->blkno;
free = fbp->b_addr;
findex = fblk->index;
- bests = xfs_dir3_free_bests_p(mp, free);
- xfs_dir3_free_hdr_from_disk(&freehdr, free);
+ bests = dp->d_ops->free_bests_p(free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
/*
* This means the free entry showed that the data block had
@@ -1764,9 +1730,9 @@ xfs_dir2_node_addname_int(
if (dbno == -1) {
xfs_fileoff_t fo; /* freespace block number */
- if ((error = xfs_bmap_last_offset(tp, dp, &fo, XFS_DATA_FORK)))
+ if ((error = xfs_bmap_last_offset(dp, &fo, XFS_DATA_FORK)))
return error;
- lastfbno = xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo);
+ lastfbno = xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo);
fbno = ifbno;
}
/*
@@ -1784,7 +1750,8 @@ xfs_dir2_node_addname_int(
* us a freespace block to start with.
*/
if (++fbno == 0)
- fbno = XFS_DIR2_FREE_FIRSTDB(mp);
+ fbno = xfs_dir2_byte_to_db(args->geo,
+ XFS_DIR2_FREE_OFFSET);
/*
* If it's ifbno we already looked at it.
*/
@@ -1802,8 +1769,8 @@ xfs_dir2_node_addname_int(
* to avoid it.
*/
error = xfs_dir2_free_try_read(tp, dp,
- xfs_dir2_db_to_da(mp, fbno),
- &fbp);
+ xfs_dir2_db_to_da(args->geo, fbno),
+ &fbp);
if (error)
return error;
if (!fbp)
@@ -1819,8 +1786,8 @@ xfs_dir2_node_addname_int(
* and the freehdr are actually initialised if they are placed
* there, so we have to do it here to avoid warnings. Blech.
*/
- bests = xfs_dir3_free_bests_p(mp, free);
- xfs_dir3_free_hdr_from_disk(&freehdr, free);
+ bests = dp->d_ops->free_bests_p(free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
if (be16_to_cpu(bests[findex]) != NULLDATAOFF &&
be16_to_cpu(bests[findex]) >= length)
dbno = freehdr.firstdb + findex;
@@ -1871,10 +1838,10 @@ xfs_dir2_node_addname_int(
* Get the freespace block corresponding to the data block
* that was just allocated.
*/
- fbno = xfs_dir2_db_to_fdb(mp, dbno);
+ fbno = dp->d_ops->db_to_fdb(args->geo, dbno);
error = xfs_dir2_free_try_read(tp, dp,
- xfs_dir2_db_to_da(mp, fbno),
- &fbp);
+ xfs_dir2_db_to_da(args->geo, fbno),
+ &fbp);
if (error)
return error;
@@ -1888,12 +1855,13 @@ xfs_dir2_node_addname_int(
if (error)
return error;
- if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) {
+ if (dp->d_ops->db_to_fdb(args->geo, dbno) != fbno) {
xfs_alert(mp,
"%s: dir ino %llu needed freesp block %lld for\n"
" data block %lld, got %lld ifbno %llu lastfbno %d",
__func__, (unsigned long long)dp->i_ino,
- (long long)xfs_dir2_db_to_fdb(mp, dbno),
+ (long long)dp->d_ops->db_to_fdb(
+ args->geo, dbno),
(long long)dbno, (long long)fbno,
(unsigned long long)ifbno, lastfbno);
if (fblk) {
@@ -1914,34 +1882,36 @@ xfs_dir2_node_addname_int(
/*
* Get a buffer for the new block.
*/
- error = xfs_dir3_free_get_buf(tp, dp, fbno, &fbp);
+ error = xfs_dir3_free_get_buf(args, fbno, &fbp);
if (error)
return error;
free = fbp->b_addr;
- bests = xfs_dir3_free_bests_p(mp, free);
- xfs_dir3_free_hdr_from_disk(&freehdr, free);
+ bests = dp->d_ops->free_bests_p(free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
/*
* Remember the first slot as our empty slot.
*/
- freehdr.firstdb = (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) *
- xfs_dir3_free_max_bests(mp);
+ freehdr.firstdb =
+ (fbno - xfs_dir2_byte_to_db(args->geo,
+ XFS_DIR2_FREE_OFFSET)) *
+ dp->d_ops->free_max_bests(args->geo);
} else {
free = fbp->b_addr;
- bests = xfs_dir3_free_bests_p(mp, free);
- xfs_dir3_free_hdr_from_disk(&freehdr, free);
+ bests = dp->d_ops->free_bests_p(free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
}
/*
* Set the freespace block index from the data block number.
*/
- findex = xfs_dir2_db_to_fdindex(mp, dbno);
+ findex = dp->d_ops->db_to_fdindex(args->geo, dbno);
/*
* If it's after the end of the current entries in the
* freespace block, extend that table.
*/
if (findex >= freehdr.nvalid) {
- ASSERT(findex < xfs_dir3_free_max_bests(mp));
+ ASSERT(findex < dp->d_ops->free_max_bests(args->geo));
freehdr.nvalid = findex + 1;
/*
* Tag new entry so nused will go up.
@@ -1954,8 +1924,8 @@ xfs_dir2_node_addname_int(
*/
if (bests[findex] == cpu_to_be16(NULLDATAOFF)) {
freehdr.nused++;
- xfs_dir3_free_hdr_to_disk(fbp->b_addr, &freehdr);
- xfs_dir2_free_log_header(tp, fbp);
+ dp->d_ops->free_hdr_to_disk(fbp->b_addr, &freehdr);
+ xfs_dir2_free_log_header(args, fbp);
}
/*
* Update the real value in the table.
@@ -1963,7 +1933,7 @@ xfs_dir2_node_addname_int(
* change again.
*/
hdr = dbp->b_addr;
- bf = xfs_dir3_data_bestfree_p(hdr);
+ bf = dp->d_ops->data_bestfree_p(hdr);
bests[findex] = bf[0].length;
logfree = 1;
}
@@ -1980,12 +1950,13 @@ xfs_dir2_node_addname_int(
/*
* Read the data block in.
*/
- error = xfs_dir3_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno),
+ error = xfs_dir3_data_read(tp, dp,
+ xfs_dir2_db_to_da(args->geo, dbno),
-1, &dbp);
if (error)
return error;
hdr = dbp->b_addr;
- bf = xfs_dir3_data_bestfree_p(hdr);
+ bf = dp->d_ops->data_bestfree_p(hdr);
logfree = 0;
}
ASSERT(be16_to_cpu(bf[0].length) >= length);
@@ -1998,7 +1969,7 @@ xfs_dir2_node_addname_int(
/*
* Mark the first part of the unused space, inuse for us.
*/
- xfs_dir2_data_use_free(tp, dbp, dup,
+ xfs_dir2_data_use_free(args, dbp, dup,
(xfs_dir2_data_aoff_t)((char *)dup - (char *)hdr), length,
&needlog, &needscan);
/*
@@ -2008,24 +1979,24 @@ xfs_dir2_node_addname_int(
dep->inumber = cpu_to_be64(args->inumber);
dep->namelen = args->namelen;
memcpy(dep->name, args->name, dep->namelen);
- xfs_dir3_dirent_put_ftype(mp, dep, args->filetype);
- tagp = xfs_dir3_data_entry_tag_p(mp, dep);
+ dp->d_ops->data_put_ftype(dep, args->filetype);
+ tagp = dp->d_ops->data_entry_tag_p(dep);
*tagp = cpu_to_be16((char *)dep - (char *)hdr);
- xfs_dir2_data_log_entry(tp, dbp, dep);
+ xfs_dir2_data_log_entry(args, dbp, dep);
/*
* Rescan the block for bestfree if needed.
*/
if (needscan)
- xfs_dir2_data_freescan(mp, hdr, &needlog);
+ xfs_dir2_data_freescan(dp, hdr, &needlog);
/*
* Log the data block header if needed.
*/
if (needlog)
- xfs_dir2_data_log_header(tp, dbp);
+ xfs_dir2_data_log_header(args, dbp);
/*
* If the freespace entry is now wrong, update it.
*/
- bests = xfs_dir3_free_bests_p(mp, free); /* gcc is so stupid */
+ bests = dp->d_ops->free_bests_p(free); /* gcc is so stupid */
if (be16_to_cpu(bests[findex]) != be16_to_cpu(bf[0].length)) {
bests[findex] = bf[0].length;
logfree = 1;
@@ -2034,7 +2005,7 @@ xfs_dir2_node_addname_int(
* Log the freespace entry if needed.
*/
if (logfree)
- xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+ xfs_dir2_free_log_bests(args, fbp, findex, findex);
/*
* Return the data block and offset in args, then drop the data block.
*/
@@ -2065,8 +2036,6 @@ xfs_dir2_node_lookup(
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_dirblksize;
- state->node_ents = state->mp->m_dir_node_ents;
/*
* Fill in the path to the entry in the cursor.
*/
@@ -2105,12 +2074,12 @@ xfs_dir2_node_lookup(
*/
int /* error */
xfs_dir2_node_removename(
- xfs_da_args_t *args) /* operation arguments */
+ struct xfs_da_args *args) /* operation arguments */
{
- xfs_da_state_blk_t *blk; /* leaf block */
+ struct xfs_da_state_blk *blk; /* leaf block */
int error; /* error return value */
int rval; /* operation return value */
- xfs_da_state_t *state; /* btree cursor */
+ struct xfs_da_state *state; /* btree cursor */
trace_xfs_dir2_node_removename(args);
@@ -2120,21 +2089,18 @@ xfs_dir2_node_removename(
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_dirblksize;
- state->node_ents = state->mp->m_dir_node_ents;
- /*
- * Look up the entry we're deleting, set up the cursor.
- */
+
+ /* Look up the entry we're deleting, set up the cursor. */
error = xfs_da3_node_lookup_int(state, &rval);
if (error)
- rval = error;
- /*
- * Didn't find it, upper layer screwed up.
- */
+ goto out_free;
+
+ /* Didn't find it, upper layer screwed up. */
if (rval != EEXIST) {
- xfs_da_state_free(state);
- return rval;
+ error = rval;
+ goto out_free;
}
+
blk = &state->path.blk[state->path.active - 1];
ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
ASSERT(state->extravalid);
@@ -2145,7 +2111,7 @@ xfs_dir2_node_removename(
error = xfs_dir2_leafn_remove(args, blk->bp, blk->index,
&state->extrablk, &rval);
if (error)
- return error;
+ goto out_free;
/*
* Fix the hash values up the btree.
*/
@@ -2160,6 +2126,7 @@ xfs_dir2_node_removename(
*/
if (!error)
error = xfs_dir2_node_to_leaf(state);
+out_free:
xfs_da_state_free(state);
return error;
}
@@ -2190,8 +2157,6 @@ xfs_dir2_node_replace(
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
- state->blocksize = state->mp->m_dirblksize;
- state->node_ents = state->mp->m_dir_node_ents;
inum = args->inumber;
/*
* Lookup the entry to change in the btree.
@@ -2212,7 +2177,7 @@ xfs_dir2_node_replace(
blk = &state->path.blk[state->path.active - 1];
ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
leaf = blk->bp->b_addr;
- ents = xfs_dir3_leaf_ents_p(leaf);
+ ents = args->dp->d_ops->leaf_ents_p(leaf);
lep = &ents[blk->index];
ASSERT(state->extravalid);
/*
@@ -2223,14 +2188,15 @@ xfs_dir2_node_replace(
hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC));
dep = (xfs_dir2_data_entry_t *)
((char *)hdr +
- xfs_dir2_dataptr_to_off(state->mp, be32_to_cpu(lep->address)));
+ xfs_dir2_dataptr_to_off(args->geo,
+ be32_to_cpu(lep->address)));
ASSERT(inum != be64_to_cpu(dep->inumber));
/*
* Fill in the new inode number and log the entry.
*/
dep->inumber = cpu_to_be64(inum);
- xfs_dir3_dirent_put_ftype(state->mp, dep, args->filetype);
- xfs_dir2_data_log_entry(args->trans, state->extrablk.bp, dep);
+ args->dp->d_ops->data_put_ftype(dep, args->filetype);
+ xfs_dir2_data_log_entry(args, state->extrablk.bp, dep);
rval = 0;
}
/*
@@ -2285,7 +2251,7 @@ xfs_dir2_node_trim_free(
if (!bp)
return 0;
free = bp->b_addr;
- xfs_dir3_free_hdr_from_disk(&freehdr, free);
+ dp->d_ops->free_hdr_from_disk(&freehdr, free);
/*
* If there are used entries, there's nothing to do.
@@ -2298,9 +2264,9 @@ xfs_dir2_node_trim_free(
/*
* Blow the block away.
*/
- if ((error =
- xfs_dir2_shrink_inode(args, xfs_dir2_da_to_db(mp, (xfs_dablk_t)fo),
- bp))) {
+ error = xfs_dir2_shrink_inode(args,
+ xfs_dir2_da_to_db(args->geo, (xfs_dablk_t)fo), bp);
+ if (error) {
/*
* Can't fail with ENOSPC since that only happens with no
* space reservation, when breaking up an extent into two
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 1bad84c4082..27ce0794d19 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -20,6 +20,140 @@
struct dir_context;
+/*
+ * Directory offset/block conversion functions.
+ *
+ * DB blocks here are logical directory block numbers, not filesystem blocks.
+ */
+
+/*
+ * Convert dataptr to byte in file space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_dataptr_to_byte(xfs_dir2_dataptr_t dp)
+{
+ return (xfs_dir2_off_t)dp << XFS_DIR2_DATA_ALIGN_LOG;
+}
+
+/*
+ * Convert byte in file space to dataptr. It had better be aligned.
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_byte_to_dataptr(xfs_dir2_off_t by)
+{
+ return (xfs_dir2_dataptr_t)(by >> XFS_DIR2_DATA_ALIGN_LOG);
+}
+
+/*
+ * Convert byte in space to (DB) block
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_byte_to_db(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+ return (xfs_dir2_db_t)(by >> geo->blklog);
+}
+
+/*
+ * Convert dataptr to a block number
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_dataptr_to_db(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+ return xfs_dir2_byte_to_db(geo, xfs_dir2_dataptr_to_byte(dp));
+}
+
+/*
+ * Convert byte in space to offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_byte_to_off(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+ return (xfs_dir2_data_aoff_t)(by & (geo->blksize - 1));
+}
+
+/*
+ * Convert dataptr to a byte offset in a block
+ */
+static inline xfs_dir2_data_aoff_t
+xfs_dir2_dataptr_to_off(struct xfs_da_geometry *geo, xfs_dir2_dataptr_t dp)
+{
+ return xfs_dir2_byte_to_off(geo, xfs_dir2_dataptr_to_byte(dp));
+}
+
+/*
+ * Convert block and offset to byte in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_db_off_to_byte(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+ xfs_dir2_data_aoff_t o)
+{
+ return ((xfs_dir2_off_t)db << geo->blklog) + o;
+}
+
+/*
+ * Convert block (DB) to block (dablk)
+ */
+static inline xfs_dablk_t
+xfs_dir2_db_to_da(struct xfs_da_geometry *geo, xfs_dir2_db_t db)
+{
+ return (xfs_dablk_t)(db << (geo->blklog - geo->fsblog));
+}
+
+/*
+ * Convert byte in space to (DA) block
+ */
+static inline xfs_dablk_t
+xfs_dir2_byte_to_da(struct xfs_da_geometry *geo, xfs_dir2_off_t by)
+{
+ return xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, by));
+}
+
+/*
+ * Convert block and offset to dataptr
+ */
+static inline xfs_dir2_dataptr_t
+xfs_dir2_db_off_to_dataptr(struct xfs_da_geometry *geo, xfs_dir2_db_t db,
+ xfs_dir2_data_aoff_t o)
+{
+ return xfs_dir2_byte_to_dataptr(xfs_dir2_db_off_to_byte(geo, db, o));
+}
+
+/*
+ * Convert block (dablk) to block (DB)
+ */
+static inline xfs_dir2_db_t
+xfs_dir2_da_to_db(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+ return (xfs_dir2_db_t)(da >> (geo->blklog - geo->fsblog));
+}
+
+/*
+ * Convert block (dablk) to byte offset in space
+ */
+static inline xfs_dir2_off_t
+xfs_dir2_da_to_byte(struct xfs_da_geometry *geo, xfs_dablk_t da)
+{
+ return xfs_dir2_db_off_to_byte(geo, xfs_dir2_da_to_db(geo, da), 0);
+}
+
+/*
+ * Directory tail pointer accessor functions. Based on block geometry.
+ */
+static inline struct xfs_dir2_block_tail *
+xfs_dir2_block_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_data_hdr *hdr)
+{
+ return ((struct xfs_dir2_block_tail *)
+ ((char *)hdr + geo->blksize)) - 1;
+}
+
+static inline struct xfs_dir2_leaf_tail *
+xfs_dir2_leaf_tail_p(struct xfs_da_geometry *geo, struct xfs_dir2_leaf *lp)
+{
+ return (struct xfs_dir2_leaf_tail *)
+ ((char *)lp + geo->blksize -
+ sizeof(struct xfs_dir2_leaf_tail));
+}
+
/* xfs_dir2.c */
extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
@@ -54,12 +188,13 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
extern int __xfs_dir3_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
extern int xfs_dir3_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
-extern int xfs_dir3_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp,
- xfs_dablk_t bno, xfs_daddr_t mapped_bno);
+extern int xfs_dir3_data_readahead(struct xfs_inode *dp, xfs_dablk_t bno,
+ xfs_daddr_t mapped_bno);
extern struct xfs_dir2_data_free *
xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
- struct xfs_dir2_data_unused *dup, int *loghead);
+ struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup,
+ int *loghead);
extern int xfs_dir3_data_init(struct xfs_da_args *args, xfs_dir2_db_t blkno,
struct xfs_buf **bpp);
@@ -76,9 +211,9 @@ extern void xfs_dir3_leaf_compact_x1(struct xfs_dir3_icleaf_hdr *leafhdr,
int *lowstalep, int *highstalep, int *lowlogp, int *highlogp);
extern int xfs_dir3_leaf_get_buf(struct xfs_da_args *args, xfs_dir2_db_t bno,
struct xfs_buf **bpp, __uint16_t magic);
-extern void xfs_dir3_leaf_log_ents(struct xfs_trans *tp, struct xfs_buf *bp,
- int first, int last);
-extern void xfs_dir3_leaf_log_header(struct xfs_trans *tp,
+extern void xfs_dir3_leaf_log_ents(struct xfs_da_args *args,
+ struct xfs_buf *bp, int first, int last);
+extern void xfs_dir3_leaf_log_header(struct xfs_da_args *args,
struct xfs_buf *bp);
extern int xfs_dir2_leaf_lookup(struct xfs_da_args *args);
extern int xfs_dir2_leaf_removename(struct xfs_da_args *args);
@@ -93,21 +228,18 @@ xfs_dir3_leaf_find_entry(struct xfs_dir3_icleaf_hdr *leafhdr,
int lowstale, int highstale, int *lfloglow, int *lfloghigh);
extern int xfs_dir2_node_to_leaf(struct xfs_da_state *state);
-extern void xfs_dir3_leaf_hdr_from_disk(struct xfs_dir3_icleaf_hdr *to,
- struct xfs_dir2_leaf *from);
-extern void xfs_dir3_leaf_hdr_to_disk(struct xfs_dir2_leaf *to,
- struct xfs_dir3_icleaf_hdr *from);
-extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp,
+extern bool xfs_dir3_leaf_check_int(struct xfs_mount *mp, struct xfs_inode *dp,
struct xfs_dir3_icleaf_hdr *hdr, struct xfs_dir2_leaf *leaf);
/* xfs_dir2_node.c */
extern int xfs_dir2_leaf_to_node(struct xfs_da_args *args,
struct xfs_buf *lbp);
-extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_buf *bp, int *count);
+extern xfs_dahash_t xfs_dir2_leafn_lasthash(struct xfs_inode *dp,
+ struct xfs_buf *bp, int *count);
extern int xfs_dir2_leafn_lookup_int(struct xfs_buf *bp,
struct xfs_da_args *args, int *indexp,
struct xfs_da_state *state);
-extern int xfs_dir2_leafn_order(struct xfs_buf *leaf1_bp,
+extern int xfs_dir2_leafn_order(struct xfs_inode *dp, struct xfs_buf *leaf1_bp,
struct xfs_buf *leaf2_bp);
extern int xfs_dir2_leafn_split(struct xfs_da_state *state,
struct xfs_da_state_blk *oldblk, struct xfs_da_state_blk *newblk);
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 8f84153e98a..48e99afb9cb 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -18,23 +18,23 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_dir2_format.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
+#include "xfs_trans.h"
+#include "xfs_dinode.h"
/*
* Directory file type support functions
@@ -76,26 +76,25 @@ const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = {
STATIC int
xfs_dir2_sf_getdents(
- xfs_inode_t *dp, /* incore directory inode */
+ struct xfs_da_args *args,
struct dir_context *ctx)
{
int i; /* shortform entry number */
- xfs_mount_t *mp; /* filesystem mount point */
+ struct xfs_inode *dp = args->dp; /* incore directory inode */
xfs_dir2_dataptr_t off; /* current entry's offset */
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
xfs_dir2_dataptr_t dot_offset;
xfs_dir2_dataptr_t dotdot_offset;
xfs_ino_t ino;
-
- mp = dp->i_mount;
+ struct xfs_da_geometry *geo = args->geo;
ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
/*
* Give up if the directory is way too short.
*/
if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
return XFS_ERROR(EIO);
}
@@ -109,19 +108,19 @@ xfs_dir2_sf_getdents(
/*
* If the block number in the offset is out of range, we're done.
*/
- if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
+ if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
return 0;
/*
* Precalculate offsets for . and .. as we will always need them.
*
* XXX(hch): the second argument is sometimes 0 and sometimes
- * mp->m_dirdatablk.
+ * geo->datablk
*/
- dot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
- xfs_dir3_data_dot_offset(mp));
- dotdot_offset = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
- xfs_dir3_data_dotdot_offset(mp));
+ dot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ dp->d_ops->data_dot_offset);
+ dotdot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
+ dp->d_ops->data_dotdot_offset);
/*
* Put . entry unless we're starting past it.
@@ -136,7 +135,7 @@ xfs_dir2_sf_getdents(
* Put .. entry unless we're starting past it.
*/
if (ctx->pos <= dotdot_offset) {
- ino = xfs_dir2_sf_get_parent_ino(sfp);
+ ino = dp->d_ops->sf_get_parent_ino(sfp);
ctx->pos = dotdot_offset & 0x7fffffff;
if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
return 0;
@@ -149,25 +148,25 @@ xfs_dir2_sf_getdents(
for (i = 0; i < sfp->count; i++) {
__uint8_t filetype;
- off = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
+ off = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
xfs_dir2_sf_get_offset(sfep));
if (ctx->pos > off) {
- sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
+ sfep = dp->d_ops->sf_nextentry(sfp, sfep);
continue;
}
- ino = xfs_dir3_sfe_get_ino(mp, sfp, sfep);
- filetype = xfs_dir3_sfe_get_ftype(mp, sfp, sfep);
+ ino = dp->d_ops->sf_get_ino(sfp, sfep);
+ filetype = dp->d_ops->sf_get_ftype(sfep);
ctx->pos = off & 0x7fffffff;
if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino,
- xfs_dir3_get_dtype(mp, filetype)))
+ xfs_dir3_get_dtype(dp->i_mount, filetype)))
return 0;
- sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
+ sfep = dp->d_ops->sf_nextentry(sfp, sfep);
}
- ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
- 0x7fffffff;
+ ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) &
+ 0x7fffffff;
return 0;
}
@@ -176,9 +175,10 @@ xfs_dir2_sf_getdents(
*/
STATIC int
xfs_dir2_block_getdents(
- xfs_inode_t *dp, /* incore inode */
+ struct xfs_da_args *args,
struct dir_context *ctx)
{
+ struct xfs_inode *dp = args->dp; /* incore directory inode */
xfs_dir2_data_hdr_t *hdr; /* block header */
struct xfs_buf *bp; /* buffer for block */
xfs_dir2_block_tail_t *btp; /* block tail */
@@ -186,16 +186,15 @@ xfs_dir2_block_getdents(
xfs_dir2_data_unused_t *dup; /* block unused entry */
char *endptr; /* end of the data entries */
int error; /* error return value */
- xfs_mount_t *mp; /* filesystem mount point */
char *ptr; /* current data entry */
int wantoff; /* starting block offset */
xfs_off_t cook;
+ struct xfs_da_geometry *geo = args->geo;
- mp = dp->i_mount;
/*
* If the block number in the offset is out of range, we're done.
*/
- if (xfs_dir2_dataptr_to_db(mp, ctx->pos) > mp->m_dirdatablk)
+ if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
return 0;
error = xfs_dir3_block_read(NULL, dp, &bp);
@@ -206,14 +205,14 @@ xfs_dir2_block_getdents(
* Extract the byte offset we start at from the seek pointer.
* We'll skip entries before this.
*/
- wantoff = xfs_dir2_dataptr_to_off(mp, ctx->pos);
+ wantoff = xfs_dir2_dataptr_to_off(geo, ctx->pos);
hdr = bp->b_addr;
xfs_dir3_data_check(dp, bp);
/*
* Set up values for the loop.
*/
- btp = xfs_dir2_block_tail_p(mp, hdr);
- ptr = (char *)xfs_dir3_data_entry_p(hdr);
+ btp = xfs_dir2_block_tail_p(geo, hdr);
+ ptr = (char *)dp->d_ops->data_entry_p(hdr);
endptr = (char *)xfs_dir2_block_leaf_p(btp);
/*
@@ -237,24 +236,24 @@ xfs_dir2_block_getdents(
/*
* Bump pointer for the next iteration.
*/
- ptr += xfs_dir3_data_entsize(mp, dep->namelen);
+ ptr += dp->d_ops->data_entsize(dep->namelen);
/*
* The entry is before the desired starting point, skip it.
*/
if ((char *)dep - (char *)hdr < wantoff)
continue;
- cook = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk,
+ cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
(char *)dep - (char *)hdr);
ctx->pos = cook & 0x7fffffff;
- filetype = xfs_dir3_dirent_get_ftype(mp, dep);
+ filetype = dp->d_ops->data_get_ftype(dep);
/*
* If it didn't fit, set the final offset to here & return.
*/
if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
be64_to_cpu(dep->inumber),
- xfs_dir3_get_dtype(mp, filetype))) {
+ xfs_dir3_get_dtype(dp->i_mount, filetype))) {
xfs_trans_brelse(NULL, bp);
return 0;
}
@@ -264,8 +263,8 @@ xfs_dir2_block_getdents(
* Reached the end of the block.
* Set the offset to a non-existent block 1 and return.
*/
- ctx->pos = xfs_dir2_db_off_to_dataptr(mp, mp->m_dirdatablk + 1, 0) &
- 0x7fffffff;
+ ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) &
+ 0x7fffffff;
xfs_trans_brelse(NULL, bp);
return 0;
}
@@ -286,13 +285,13 @@ struct xfs_dir2_leaf_map_info {
STATIC int
xfs_dir2_leaf_readbuf(
- struct xfs_inode *dp,
+ struct xfs_da_args *args,
size_t bufsize,
struct xfs_dir2_leaf_map_info *mip,
xfs_dir2_off_t *curoff,
struct xfs_buf **bpp)
{
- struct xfs_mount *mp = dp->i_mount;
+ struct xfs_inode *dp = args->dp;
struct xfs_buf *bp = *bpp;
struct xfs_bmbt_irec *map = mip->map;
struct blk_plug plug;
@@ -300,6 +299,7 @@ xfs_dir2_leaf_readbuf(
int length;
int i;
int j;
+ struct xfs_da_geometry *geo = args->geo;
/*
* If we have a buffer, we need to release it and
@@ -309,12 +309,12 @@ xfs_dir2_leaf_readbuf(
if (bp) {
xfs_trans_brelse(NULL, bp);
bp = NULL;
- mip->map_blocks -= mp->m_dirblkfsbs;
+ mip->map_blocks -= geo->fsbcount;
/*
* Loop to get rid of the extents for the
* directory block.
*/
- for (i = mp->m_dirblkfsbs; i > 0; ) {
+ for (i = geo->fsbcount; i > 0; ) {
j = min_t(int, map->br_blockcount, i);
map->br_blockcount -= j;
map->br_startblock += j;
@@ -333,8 +333,7 @@ xfs_dir2_leaf_readbuf(
/*
* Recalculate the readahead blocks wanted.
*/
- mip->ra_want = howmany(bufsize + mp->m_dirblksize,
- mp->m_sb.sb_blocksize) - 1;
+ mip->ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)) - 1;
ASSERT(mip->ra_want >= 0);
/*
@@ -342,14 +341,14 @@ xfs_dir2_leaf_readbuf(
* run out of data blocks, get some more mappings.
*/
if (1 + mip->ra_want > mip->map_blocks &&
- mip->map_off < xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET)) {
+ mip->map_off < xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET)) {
/*
* Get more bmaps, fill in after the ones
* we already have in the table.
*/
mip->nmap = mip->map_size - mip->map_valid;
error = xfs_bmapi_read(dp, mip->map_off,
- xfs_dir2_byte_to_da(mp, XFS_DIR2_LEAF_OFFSET) -
+ xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET) -
mip->map_off,
&map[mip->map_valid], &mip->nmap, 0);
@@ -370,7 +369,7 @@ xfs_dir2_leaf_readbuf(
i = mip->map_valid + mip->nmap - 1;
mip->map_off = map[i].br_startoff + map[i].br_blockcount;
} else
- mip->map_off = xfs_dir2_byte_to_da(mp,
+ mip->map_off = xfs_dir2_byte_to_da(geo,
XFS_DIR2_LEAF_OFFSET);
/*
@@ -396,18 +395,18 @@ xfs_dir2_leaf_readbuf(
* No valid mappings, so no more data blocks.
*/
if (!mip->map_valid) {
- *curoff = xfs_dir2_da_to_byte(mp, mip->map_off);
+ *curoff = xfs_dir2_da_to_byte(geo, mip->map_off);
goto out;
}
/*
* Read the directory block starting at the first mapping.
*/
- mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
+ mip->curdb = xfs_dir2_da_to_db(geo, map->br_startoff);
error = xfs_dir3_data_read(NULL, dp, map->br_startoff,
- map->br_blockcount >= mp->m_dirblkfsbs ?
- XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);
-
+ map->br_blockcount >= geo->fsbcount ?
+ XFS_FSB_TO_DADDR(dp->i_mount, map->br_startblock) :
+ -1, &bp);
/*
* Should just skip over the data block instead of giving up.
*/
@@ -419,7 +418,7 @@ xfs_dir2_leaf_readbuf(
* was previously ra.
*/
if (mip->ra_current)
- mip->ra_current -= mp->m_dirblkfsbs;
+ mip->ra_current -= geo->fsbcount;
/*
* Do we need more readahead?
@@ -427,16 +426,16 @@ xfs_dir2_leaf_readbuf(
blk_start_plug(&plug);
for (mip->ra_index = mip->ra_offset = i = 0;
mip->ra_want > mip->ra_current && i < mip->map_blocks;
- i += mp->m_dirblkfsbs) {
+ i += geo->fsbcount) {
ASSERT(mip->ra_index < mip->map_valid);
/*
* Read-ahead a contiguous directory block.
*/
if (i > mip->ra_current &&
- map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
- xfs_dir3_data_readahead(NULL, dp,
+ map[mip->ra_index].br_blockcount >= geo->fsbcount) {
+ xfs_dir3_data_readahead(dp,
map[mip->ra_index].br_startoff + mip->ra_offset,
- XFS_FSB_TO_DADDR(mp,
+ XFS_FSB_TO_DADDR(dp->i_mount,
map[mip->ra_index].br_startblock +
mip->ra_offset));
mip->ra_current = i;
@@ -447,7 +446,7 @@ xfs_dir2_leaf_readbuf(
* use our mapping, but this is a very rare case.
*/
else if (i > mip->ra_current) {
- xfs_dir3_data_readahead(NULL, dp,
+ xfs_dir3_data_readahead(dp,
map[mip->ra_index].br_startoff +
mip->ra_offset, -1);
mip->ra_current = i;
@@ -456,15 +455,14 @@ xfs_dir2_leaf_readbuf(
/*
* Advance offset through the mapping table.
*/
- for (j = 0; j < mp->m_dirblkfsbs; j++) {
+ for (j = 0; j < geo->fsbcount; j += length ) {
/*
* The rest of this extent but not more than a dir
* block.
*/
- length = min_t(int, mp->m_dirblkfsbs,
+ length = min_t(int, geo->fsbcount,
map[mip->ra_index].br_blockcount -
mip->ra_offset);
- j += length;
mip->ra_offset += length;
/*
@@ -489,22 +487,23 @@ out:
*/
STATIC int
xfs_dir2_leaf_getdents(
- xfs_inode_t *dp, /* incore directory inode */
+ struct xfs_da_args *args,
struct dir_context *ctx,
size_t bufsize)
{
+ struct xfs_inode *dp = args->dp;
struct xfs_buf *bp = NULL; /* data block buffer */
xfs_dir2_data_hdr_t *hdr; /* data block header */
xfs_dir2_data_entry_t *dep; /* data entry */
xfs_dir2_data_unused_t *dup; /* unused entry */
int error = 0; /* error return value */
int length; /* temporary length value */
- xfs_mount_t *mp; /* filesystem mount point */
int byteoff; /* offset in current block */
xfs_dir2_off_t curoff; /* current overall offset */
xfs_dir2_off_t newoff; /* new curoff after new blk */
char *ptr = NULL; /* pointer to current data */
struct xfs_dir2_leaf_map_info *map_info;
+ struct xfs_da_geometry *geo = args->geo;
/*
* If the offset is at or past the largest allowed value,
@@ -513,15 +512,12 @@ xfs_dir2_leaf_getdents(
if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
return 0;
- mp = dp->i_mount;
-
/*
* Set up to bmap a number of blocks based on the caller's
* buffer size, the directory block size, and the filesystem
* block size.
*/
- length = howmany(bufsize + mp->m_dirblksize,
- mp->m_sb.sb_blocksize);
+ length = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
(length * sizeof(struct xfs_bmbt_irec)),
KM_SLEEP | KM_NOFS);
@@ -531,14 +527,14 @@ xfs_dir2_leaf_getdents(
* Inside the loop we keep the main offset value as a byte offset
* in the directory file.
*/
- curoff = xfs_dir2_dataptr_to_byte(mp, ctx->pos);
+ curoff = xfs_dir2_dataptr_to_byte(ctx->pos);
/*
* Force this conversion through db so we truncate the offset
* down to get the start of the data block.
*/
- map_info->map_off = xfs_dir2_db_to_da(mp,
- xfs_dir2_byte_to_db(mp, curoff));
+ map_info->map_off = xfs_dir2_db_to_da(geo,
+ xfs_dir2_byte_to_db(geo, curoff));
/*
* Loop over directory entries until we reach the end offset.
@@ -551,9 +547,9 @@ xfs_dir2_leaf_getdents(
* If we have no buffer, or we're off the end of the
* current buffer, need to get another one.
*/
- if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) {
+ if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
- error = xfs_dir2_leaf_readbuf(dp, bufsize, map_info,
+ error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
&curoff, &bp);
if (error || !map_info->map_valid)
break;
@@ -561,7 +557,8 @@ xfs_dir2_leaf_getdents(
/*
* Having done a read, we need to set a new offset.
*/
- newoff = xfs_dir2_db_off_to_byte(mp, map_info->curdb, 0);
+ newoff = xfs_dir2_db_off_to_byte(geo,
+ map_info->curdb, 0);
/*
* Start of the current block.
*/
@@ -571,20 +568,20 @@ xfs_dir2_leaf_getdents(
* Make sure we're in the right block.
*/
else if (curoff > newoff)
- ASSERT(xfs_dir2_byte_to_db(mp, curoff) ==
+ ASSERT(xfs_dir2_byte_to_db(geo, curoff) ==
map_info->curdb);
hdr = bp->b_addr;
xfs_dir3_data_check(dp, bp);
/*
* Find our position in the block.
*/
- ptr = (char *)xfs_dir3_data_entry_p(hdr);
- byteoff = xfs_dir2_byte_to_off(mp, curoff);
+ ptr = (char *)dp->d_ops->data_entry_p(hdr);
+ byteoff = xfs_dir2_byte_to_off(geo, curoff);
/*
* Skip past the header.
*/
if (byteoff == 0)
- curoff += xfs_dir3_data_entry_offset(hdr);
+ curoff += dp->d_ops->data_entry_offset;
/*
* Skip past entries until we reach our offset.
*/
@@ -601,17 +598,17 @@ xfs_dir2_leaf_getdents(
}
dep = (xfs_dir2_data_entry_t *)ptr;
length =
- xfs_dir3_data_entsize(mp, dep->namelen);
+ dp->d_ops->data_entsize(dep->namelen);
ptr += length;
}
/*
* Now set our real offset.
*/
curoff =
- xfs_dir2_db_off_to_byte(mp,
- xfs_dir2_byte_to_db(mp, curoff),
+ xfs_dir2_db_off_to_byte(geo,
+ xfs_dir2_byte_to_db(geo, curoff),
(char *)ptr - (char *)hdr);
- if (ptr >= (char *)hdr + mp->m_dirblksize) {
+ if (ptr >= (char *)hdr + geo->blksize) {
continue;
}
}
@@ -632,13 +629,13 @@ xfs_dir2_leaf_getdents(
}
dep = (xfs_dir2_data_entry_t *)ptr;
- length = xfs_dir3_data_entsize(mp, dep->namelen);
- filetype = xfs_dir3_dirent_get_ftype(mp, dep);
+ length = dp->d_ops->data_entsize(dep->namelen);
+ filetype = dp->d_ops->data_get_ftype(dep);
- ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
+ ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
be64_to_cpu(dep->inumber),
- xfs_dir3_get_dtype(mp, filetype)))
+ xfs_dir3_get_dtype(dp->i_mount, filetype)))
break;
/*
@@ -653,10 +650,10 @@ xfs_dir2_leaf_getdents(
/*
* All done. Set output offset value to current offset.
*/
- if (curoff > xfs_dir2_dataptr_to_byte(mp, XFS_DIR2_MAX_DATAPTR))
+ if (curoff > xfs_dir2_dataptr_to_byte(XFS_DIR2_MAX_DATAPTR))
ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
else
- ctx->pos = xfs_dir2_byte_to_dataptr(mp, curoff) & 0x7fffffff;
+ ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
kmem_free(map_info);
if (bp)
xfs_trans_brelse(NULL, bp);
@@ -668,12 +665,14 @@ xfs_dir2_leaf_getdents(
*/
int
xfs_readdir(
- xfs_inode_t *dp,
- struct dir_context *ctx,
- size_t bufsize)
+ struct xfs_inode *dp,
+ struct dir_context *ctx,
+ size_t bufsize)
{
- int rval; /* return value */
- int v; /* type-checking value */
+ struct xfs_da_args args = { NULL };
+ int rval;
+ int v;
+ uint lock_mode;
trace_xfs_readdir(dp);
@@ -683,13 +682,19 @@ xfs_readdir(
ASSERT(S_ISDIR(dp->i_d.di_mode));
XFS_STATS_INC(xs_dir_getdents);
+ args.dp = dp;
+ args.geo = dp->i_mount->m_dir_geo;
+
+ lock_mode = xfs_ilock_data_map_shared(dp);
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
- rval = xfs_dir2_sf_getdents(dp, ctx);
- else if ((rval = xfs_dir2_isblock(NULL, dp, &v)))
+ rval = xfs_dir2_sf_getdents(&args, ctx);
+ else if ((rval = xfs_dir2_isblock(&args, &v)))
;
else if (v)
- rval = xfs_dir2_block_getdents(dp, ctx);
+ rval = xfs_dir2_block_getdents(&args, ctx);
else
- rval = xfs_dir2_leaf_getdents(dp, ctx, bufsize);
+ rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize);
+ xfs_iunlock(dp, lock_mode);
+
return rval;
}
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/xfs_dir2_sf.c
index 3ef6d402084..53c3be619db 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/xfs_dir2_sf.c
@@ -17,22 +17,22 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_error.h"
-#include "xfs_dir2_format.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_trace.h"
+#include "xfs_dinode.h"
/*
* Prototypes for internal functions.
@@ -57,89 +57,6 @@ static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
#endif /* XFS_BIG_INUMS */
/*
- * Inode numbers in short-form directories can come in two versions,
- * either 4 bytes or 8 bytes wide. These helpers deal with the
- * two forms transparently by looking at the headers i8count field.
- *
- * For 64-bit inode number the most significant byte must be zero.
- */
-static xfs_ino_t
-xfs_dir2_sf_get_ino(
- struct xfs_dir2_sf_hdr *hdr,
- xfs_dir2_inou_t *from)
-{
- if (hdr->i8count)
- return get_unaligned_be64(&from->i8.i) & 0x00ffffffffffffffULL;
- else
- return get_unaligned_be32(&from->i4.i);
-}
-
-static void
-xfs_dir2_sf_put_ino(
- struct xfs_dir2_sf_hdr *hdr,
- xfs_dir2_inou_t *to,
- xfs_ino_t ino)
-{
- ASSERT((ino & 0xff00000000000000ULL) == 0);
-
- if (hdr->i8count)
- put_unaligned_be64(ino, &to->i8.i);
- else
- put_unaligned_be32(ino, &to->i4.i);
-}
-
-xfs_ino_t
-xfs_dir2_sf_get_parent_ino(
- struct xfs_dir2_sf_hdr *hdr)
-{
- return xfs_dir2_sf_get_ino(hdr, &hdr->parent);
-}
-
-void
-xfs_dir2_sf_put_parent_ino(
- struct xfs_dir2_sf_hdr *hdr,
- xfs_ino_t ino)
-{
- xfs_dir2_sf_put_ino(hdr, &hdr->parent, ino);
-}
-
-/*
- * In short-form directory entries the inode numbers are stored at variable
- * offset behind the entry name. If the entry stores a filetype value, then it
- * sits between the name and the inode number. Hence the inode numbers may only
- * be accessed through the helpers below.
- */
-static xfs_dir2_inou_t *
-xfs_dir3_sfe_inop(
- struct xfs_mount *mp,
- struct xfs_dir2_sf_entry *sfep)
-{
- __uint8_t *ptr = &sfep->name[sfep->namelen];
- if (xfs_sb_version_hasftype(&mp->m_sb))
- ptr++;
- return (xfs_dir2_inou_t *)ptr;
-}
-
-xfs_ino_t
-xfs_dir3_sfe_get_ino(
- struct xfs_mount *mp,
- struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep)
-{
- return xfs_dir2_sf_get_ino(hdr, xfs_dir3_sfe_inop(mp, sfep));
-}
-
-void
-xfs_dir3_sfe_put_ino(
- struct xfs_mount *mp,
- struct xfs_dir2_sf_hdr *hdr,
- struct xfs_dir2_sf_entry *sfep,
- xfs_ino_t ino)
-{
- xfs_dir2_sf_put_ino(hdr, xfs_dir3_sfe_inop(mp, sfep), ino);
-}
-
-/*
* Given a block directory (dp/block), calculate its size as a shortform (sf)
* directory and a header for the sf directory, if it will fit it the
* space currently present in the inode. If it won't fit, the output
@@ -165,8 +82,10 @@ xfs_dir2_block_sfsize(
xfs_ino_t parent = 0; /* parent inode number */
int size=0; /* total computed size */
int has_ftype;
+ struct xfs_da_geometry *geo;
mp = dp->i_mount;
+ geo = mp->m_dir_geo;
/*
* if there is a filetype field, add the extra byte to the namelen
@@ -175,7 +94,7 @@ xfs_dir2_block_sfsize(
has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0;
count = i8count = namelen = 0;
- btp = xfs_dir2_block_tail_p(mp, hdr);
+ btp = xfs_dir2_block_tail_p(geo, hdr);
blp = xfs_dir2_block_leaf_p(btp);
/*
@@ -187,8 +106,8 @@ xfs_dir2_block_sfsize(
/*
* Calculate the pointer to the entry at hand.
*/
- dep = (xfs_dir2_data_entry_t *)
- ((char *)hdr + xfs_dir2_dataptr_to_off(mp, addr));
+ dep = (xfs_dir2_data_entry_t *)((char *)hdr +
+ xfs_dir2_dataptr_to_off(geo, addr));
/*
* Detect . and .., so we can special-case them.
* . is not included in sf directories.
@@ -226,7 +145,7 @@ xfs_dir2_block_sfsize(
*/
sfhp->count = count;
sfhp->i8count = i8count;
- xfs_dir2_sf_put_parent_ino(sfhp, parent);
+ dp->d_ops->sf_put_parent_ino(sfhp, parent);
return size;
}
@@ -253,6 +172,7 @@ xfs_dir2_block_to_sf(
char *ptr; /* current data pointer */
xfs_dir2_sf_entry_t *sfep; /* shortform entry */
xfs_dir2_sf_hdr_t *sfp; /* shortform directory header */
+ xfs_dir2_sf_hdr_t *dst; /* temporary data buffer */
trace_xfs_dir2_block_to_sf(args);
@@ -260,40 +180,25 @@ xfs_dir2_block_to_sf(
mp = dp->i_mount;
/*
- * Make a copy of the block data, so we can shrink the inode
- * and add local data.
+ * allocate a temporary destination buffer the size of the inode
+ * to format the data into. Once we have formatted the data, we
+ * can free the block and copy the formatted data into the inode literal
+ * area.
*/
- hdr = kmem_alloc(mp->m_dirblksize, KM_SLEEP);
- memcpy(hdr, bp->b_addr, mp->m_dirblksize);
- logflags = XFS_ILOG_CORE;
- if ((error = xfs_dir2_shrink_inode(args, mp->m_dirdatablk, bp))) {
- ASSERT(error != ENOSPC);
- goto out;
- }
+ dst = kmem_alloc(mp->m_sb.sb_inodesize, KM_SLEEP);
+ hdr = bp->b_addr;
/*
- * The buffer is now unconditionally gone, whether
- * xfs_dir2_shrink_inode worked or not.
- *
- * Convert the inode to local format.
- */
- dp->i_df.if_flags &= ~XFS_IFEXTENTS;
- dp->i_df.if_flags |= XFS_IFINLINE;
- dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
- ASSERT(dp->i_df.if_bytes == 0);
- xfs_idata_realloc(dp, size, XFS_DATA_FORK);
- logflags |= XFS_ILOG_DDATA;
- /*
* Copy the header into the newly allocate local space.
*/
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = (xfs_dir2_sf_hdr_t *)dst;
memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
- dp->i_d.di_size = size;
+
/*
* Set up to loop over the block's entries.
*/
- btp = xfs_dir2_block_tail_p(mp, hdr);
- ptr = (char *)xfs_dir3_data_entry_p(hdr);
+ btp = xfs_dir2_block_tail_p(args->geo, hdr);
+ ptr = (char *)dp->d_ops->data_entry_p(hdr);
endptr = (char *)xfs_dir2_block_leaf_p(btp);
sfep = xfs_dir2_sf_firstentry(sfp);
/*
@@ -321,7 +226,7 @@ xfs_dir2_block_to_sf(
else if (dep->namelen == 2 &&
dep->name[0] == '.' && dep->name[1] == '.')
ASSERT(be64_to_cpu(dep->inumber) ==
- xfs_dir2_sf_get_parent_ino(sfp));
+ dp->d_ops->sf_get_parent_ino(sfp));
/*
* Normal entry, copy it into shortform.
*/
@@ -331,20 +236,44 @@ xfs_dir2_block_to_sf(
(xfs_dir2_data_aoff_t)
((char *)dep - (char *)hdr));
memcpy(sfep->name, dep->name, dep->namelen);
- xfs_dir3_sfe_put_ino(mp, sfp, sfep,
- be64_to_cpu(dep->inumber));
- xfs_dir3_sfe_put_ftype(mp, sfp, sfep,
- xfs_dir3_dirent_get_ftype(mp, dep));
+ dp->d_ops->sf_put_ino(sfp, sfep,
+ be64_to_cpu(dep->inumber));
+ dp->d_ops->sf_put_ftype(sfep,
+ dp->d_ops->data_get_ftype(dep));
- sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
+ sfep = dp->d_ops->sf_nextentry(sfp, sfep);
}
- ptr += xfs_dir3_data_entsize(mp, dep->namelen);
+ ptr += dp->d_ops->data_entsize(dep->namelen);
}
ASSERT((char *)sfep - (char *)sfp == size);
+
+ /* now we are done with the block, we can shrink the inode */
+ logflags = XFS_ILOG_CORE;
+ error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp);
+ if (error) {
+ ASSERT(error != ENOSPC);
+ goto out;
+ }
+
+ /*
+ * The buffer is now unconditionally gone, whether
+ * xfs_dir2_shrink_inode worked or not.
+ *
+ * Convert the inode to local format and copy the data in.
+ */
+ dp->i_df.if_flags &= ~XFS_IFEXTENTS;
+ dp->i_df.if_flags |= XFS_IFINLINE;
+ dp->i_d.di_format = XFS_DINODE_FMT_LOCAL;
+ ASSERT(dp->i_df.if_bytes == 0);
+ xfs_idata_realloc(dp, size, XFS_DATA_FORK);
+
+ logflags |= XFS_ILOG_DDATA;
+ memcpy(dp->i_df.if_u1.if_data, dst, size);
+ dp->i_d.di_size = size;
xfs_dir2_sf_check(args);
out:
xfs_trans_log_inode(args->trans, dp, logflags);
- kmem_free(hdr);
+ kmem_free(dst);
return error;
}
@@ -358,14 +287,12 @@ int /* error */
xfs_dir2_sf_addname(
xfs_da_args_t *args) /* operation arguments */
{
- int add_entsize; /* size of the new entry */
xfs_inode_t *dp; /* incore directory inode */
int error; /* error return value */
int incr_isize; /* total change in size */
int new_isize; /* di_size after adding name */
int objchange; /* changing to 8-byte inodes */
xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */
- int old_isize; /* di_size before adding name */
int pick; /* which algorithm to use */
xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */
@@ -389,8 +316,7 @@ xfs_dir2_sf_addname(
/*
* Compute entry (and change in) size.
*/
- add_entsize = xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen);
- incr_isize = add_entsize;
+ incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen);
objchange = 0;
#if XFS_BIG_INUMS
/*
@@ -398,11 +324,8 @@ xfs_dir2_sf_addname(
*/
if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
/*
- * Yes, adjust the entry size and the total size.
+ * Yes, adjust the inode size. old count + (parent + new)
*/
- add_entsize +=
- (uint)sizeof(xfs_dir2_ino8_t) -
- (uint)sizeof(xfs_dir2_ino4_t);
incr_isize +=
(sfp->count + 2) *
((uint)sizeof(xfs_dir2_ino8_t) -
@@ -410,8 +333,7 @@ xfs_dir2_sf_addname(
objchange = 1;
}
#endif
- old_isize = (int)dp->i_d.di_size;
- new_isize = old_isize + incr_isize;
+ new_isize = (int)dp->i_d.di_size + incr_isize;
/*
* Won't fit as shortform any more (due to size),
* or the pick routine says it won't (due to offset values).
@@ -483,8 +405,7 @@ xfs_dir2_sf_addname_easy(
/*
* Grow the in-inode space.
*/
- xfs_idata_realloc(dp,
- xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen),
+ xfs_idata_realloc(dp, dp->d_ops->sf_entsize(sfp, args->namelen),
XFS_DATA_FORK);
/*
* Need to set up again due to realloc of the inode data.
@@ -497,8 +418,8 @@ xfs_dir2_sf_addname_easy(
sfep->namelen = args->namelen;
xfs_dir2_sf_put_offset(sfep, offset);
memcpy(sfep->name, args->name, sfep->namelen);
- xfs_dir3_sfe_put_ino(dp->i_mount, sfp, sfep, args->inumber);
- xfs_dir3_sfe_put_ftype(dp->i_mount, sfp, sfep, args->filetype);
+ dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+ dp->d_ops->sf_put_ftype(sfep, args->filetype);
/*
* Update the header and inode.
@@ -557,13 +478,13 @@ xfs_dir2_sf_addname_hard(
* to insert the new entry.
* If it's going to end up at the end then oldsfep will point there.
*/
- for (offset = xfs_dir3_data_first_offset(mp),
+ for (offset = dp->d_ops->data_first_offset,
oldsfep = xfs_dir2_sf_firstentry(oldsfp),
- add_datasize = xfs_dir3_data_entsize(mp, args->namelen),
+ add_datasize = dp->d_ops->data_entsize(args->namelen),
eof = (char *)oldsfep == &buf[old_isize];
!eof;
- offset = new_offset + xfs_dir3_data_entsize(mp, oldsfep->namelen),
- oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep),
+ offset = new_offset + dp->d_ops->data_entsize(oldsfep->namelen),
+ oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep),
eof = (char *)oldsfep == &buf[old_isize]) {
new_offset = xfs_dir2_sf_get_offset(oldsfep);
if (offset + add_datasize <= new_offset)
@@ -592,8 +513,8 @@ xfs_dir2_sf_addname_hard(
sfep->namelen = args->namelen;
xfs_dir2_sf_put_offset(sfep, offset);
memcpy(sfep->name, args->name, sfep->namelen);
- xfs_dir3_sfe_put_ino(mp, sfp, sfep, args->inumber);
- xfs_dir3_sfe_put_ftype(mp, sfp, sfep, args->filetype);
+ dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+ dp->d_ops->sf_put_ftype(sfep, args->filetype);
sfp->count++;
#if XFS_BIG_INUMS
if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
@@ -603,7 +524,7 @@ xfs_dir2_sf_addname_hard(
* If there's more left to copy, do that.
*/
if (!eof) {
- sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
+ sfep = dp->d_ops->sf_nextentry(sfp, sfep);
memcpy(sfep, oldsfep, old_isize - nbytes);
}
kmem_free(buf);
@@ -639,8 +560,8 @@ xfs_dir2_sf_addname_pick(
mp = dp->i_mount;
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
- size = xfs_dir3_data_entsize(mp, args->namelen);
- offset = xfs_dir3_data_first_offset(mp);
+ size = dp->d_ops->data_entsize(args->namelen);
+ offset = dp->d_ops->data_first_offset;
sfep = xfs_dir2_sf_firstentry(sfp);
holefit = 0;
/*
@@ -652,8 +573,8 @@ xfs_dir2_sf_addname_pick(
if (!holefit)
holefit = offset + size <= xfs_dir2_sf_get_offset(sfep);
offset = xfs_dir2_sf_get_offset(sfep) +
- xfs_dir3_data_entsize(mp, sfep->namelen);
- sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep);
+ dp->d_ops->data_entsize(sfep->namelen);
+ sfep = dp->d_ops->sf_nextentry(sfp, sfep);
}
/*
* Calculate data bytes used excluding the new entry, if this
@@ -667,7 +588,7 @@ xfs_dir2_sf_addname_pick(
* we'll go back, convert to block, then try the insert and convert
* to leaf.
*/
- if (used + (holefit ? 0 : size) > mp->m_dirblksize)
+ if (used + (holefit ? 0 : size) > args->geo->blksize)
return 0;
/*
* If changing the inode number size, do it the hard way.
@@ -682,7 +603,7 @@ xfs_dir2_sf_addname_pick(
/*
* If it won't fit at the end then do it the hard way (use the hole).
*/
- if (used + size > mp->m_dirblksize)
+ if (used + size > args->geo->blksize)
return 2;
/*
* Do it the easy way.
@@ -713,28 +634,27 @@ xfs_dir2_sf_check(
mp = dp->i_mount;
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
- offset = xfs_dir3_data_first_offset(mp);
- ino = xfs_dir2_sf_get_parent_ino(sfp);
+ offset = dp->d_ops->data_first_offset;
+ ino = dp->d_ops->sf_get_parent_ino(sfp);
i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp);
i < sfp->count;
- i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep)) {
+ i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
ASSERT(xfs_dir2_sf_get_offset(sfep) >= offset);
- ino = xfs_dir3_sfe_get_ino(mp, sfp, sfep);
+ ino = dp->d_ops->sf_get_ino(sfp, sfep);
i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
offset =
xfs_dir2_sf_get_offset(sfep) +
- xfs_dir3_data_entsize(mp, sfep->namelen);
- ASSERT(xfs_dir3_sfe_get_ftype(mp, sfp, sfep) <
- XFS_DIR3_FT_MAX);
+ dp->d_ops->data_entsize(sfep->namelen);
+ ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX);
}
ASSERT(i8count == sfp->i8count);
ASSERT(XFS_BIG_INUMS || i8count == 0);
ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
ASSERT(offset +
(sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
- (uint)sizeof(xfs_dir2_block_tail_t) <= mp->m_dirblksize);
+ (uint)sizeof(xfs_dir2_block_tail_t) <= args->geo->blksize);
}
#endif /* DEBUG */
@@ -783,7 +703,7 @@ xfs_dir2_sf_create(
/*
* Now can put in the inode number, since i8count is set.
*/
- xfs_dir2_sf_put_parent_ino(sfp, pino);
+ dp->d_ops->sf_put_parent_ino(sfp, pino);
sfp->count = 0;
dp->i_d.di_size = size;
xfs_dir2_sf_check(args);
@@ -838,7 +758,7 @@ xfs_dir2_sf_lookup(
*/
if (args->namelen == 2 &&
args->name[0] == '.' && args->name[1] == '.') {
- args->inumber = xfs_dir2_sf_get_parent_ino(sfp);
+ args->inumber = dp->d_ops->sf_get_parent_ino(sfp);
args->cmpresult = XFS_CMP_EXACT;
args->filetype = XFS_DIR3_FT_DIR;
return XFS_ERROR(EEXIST);
@@ -848,7 +768,7 @@ xfs_dir2_sf_lookup(
*/
ci_sfep = NULL;
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
- i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) {
+ i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
/*
* Compare name and if it's an exact match, return the inode
* number. If it's the first case-insensitive match, store the
@@ -858,10 +778,8 @@ xfs_dir2_sf_lookup(
sfep->namelen);
if (cmp != XFS_CMP_DIFFERENT && cmp != args->cmpresult) {
args->cmpresult = cmp;
- args->inumber = xfs_dir3_sfe_get_ino(dp->i_mount,
- sfp, sfep);
- args->filetype = xfs_dir3_sfe_get_ftype(dp->i_mount,
- sfp, sfep);
+ args->inumber = dp->d_ops->sf_get_ino(sfp, sfep);
+ args->filetype = dp->d_ops->sf_get_ftype(sfep);
if (cmp == XFS_CMP_EXACT)
return XFS_ERROR(EEXIST);
ci_sfep = sfep;
@@ -917,10 +835,10 @@ xfs_dir2_sf_removename(
* Find the one we're deleting.
*/
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
- i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) {
+ i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
XFS_CMP_EXACT) {
- ASSERT(xfs_dir3_sfe_get_ino(dp->i_mount, sfp, sfep) ==
+ ASSERT(dp->d_ops->sf_get_ino(sfp, sfep) ==
args->inumber);
break;
}
@@ -934,7 +852,7 @@ xfs_dir2_sf_removename(
* Calculate sizes.
*/
byteoff = (int)((char *)sfep - (char *)sfp);
- entsize = xfs_dir3_sf_entsize(dp->i_mount, sfp, args->namelen);
+ entsize = dp->d_ops->sf_entsize(sfp, args->namelen);
newsize = oldsize - entsize;
/*
* Copy the part if any after the removed entry, sliding it down.
@@ -1041,28 +959,25 @@ xfs_dir2_sf_replace(
if (args->namelen == 2 &&
args->name[0] == '.' && args->name[1] == '.') {
#if XFS_BIG_INUMS || defined(DEBUG)
- ino = xfs_dir2_sf_get_parent_ino(sfp);
+ ino = dp->d_ops->sf_get_parent_ino(sfp);
ASSERT(args->inumber != ino);
#endif
- xfs_dir2_sf_put_parent_ino(sfp, args->inumber);
+ dp->d_ops->sf_put_parent_ino(sfp, args->inumber);
}
/*
* Normal entry, look for the name.
*/
else {
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp); i < sfp->count;
- i++, sfep = xfs_dir3_sf_nextentry(dp->i_mount, sfp, sfep)) {
+ i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
XFS_CMP_EXACT) {
#if XFS_BIG_INUMS || defined(DEBUG)
- ino = xfs_dir3_sfe_get_ino(dp->i_mount,
- sfp, sfep);
+ ino = dp->d_ops->sf_get_ino(sfp, sfep);
ASSERT(args->inumber != ino);
#endif
- xfs_dir3_sfe_put_ino(dp->i_mount, sfp, sfep,
- args->inumber);
- xfs_dir3_sfe_put_ftype(dp->i_mount, sfp, sfep,
- args->filetype);
+ dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
+ dp->d_ops->sf_put_ftype(sfep, args->filetype);
break;
}
}
@@ -1165,22 +1080,21 @@ xfs_dir2_sf_toino4(
*/
sfp->count = oldsfp->count;
sfp->i8count = 0;
- xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp));
+ dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
/*
* Copy the entries field by field.
*/
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
oldsfep = xfs_dir2_sf_firstentry(oldsfp);
i < sfp->count;
- i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep),
- oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep)) {
+ i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
+ oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
sfep->namelen = oldsfep->namelen;
sfep->offset = oldsfep->offset;
memcpy(sfep->name, oldsfep->name, sfep->namelen);
- xfs_dir3_sfe_put_ino(mp, sfp, sfep,
- xfs_dir3_sfe_get_ino(mp, oldsfp, oldsfep));
- xfs_dir3_sfe_put_ftype(mp, sfp, sfep,
- xfs_dir3_sfe_get_ftype(mp, oldsfp, oldsfep));
+ dp->d_ops->sf_put_ino(sfp, sfep,
+ dp->d_ops->sf_get_ino(oldsfp, oldsfep));
+ dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
}
/*
* Clean up the inode.
@@ -1191,9 +1105,9 @@ xfs_dir2_sf_toino4(
}
/*
- * Convert from 4-byte inode numbers to 8-byte inode numbers.
- * The new 8-byte inode number is not there yet, we leave with the
- * count 1 but no corresponding entry.
+ * Convert existing entries from 4-byte inode numbers to 8-byte inode numbers.
+ * The new entry w/ an 8-byte inode number is not there yet; we leave with
+ * i8count set to 1, but no corresponding 8-byte entry.
*/
static void
xfs_dir2_sf_toino8(
@@ -1226,7 +1140,7 @@ xfs_dir2_sf_toino8(
ASSERT(oldsfp->i8count == 0);
memcpy(buf, oldsfp, oldsize);
/*
- * Compute the new inode size.
+ * Compute the new inode size (nb: entry count + 1 for parent)
*/
newsize =
oldsize +
@@ -1244,22 +1158,21 @@ xfs_dir2_sf_toino8(
*/
sfp->count = oldsfp->count;
sfp->i8count = 1;
- xfs_dir2_sf_put_parent_ino(sfp, xfs_dir2_sf_get_parent_ino(oldsfp));
+ dp->d_ops->sf_put_parent_ino(sfp, dp->d_ops->sf_get_parent_ino(oldsfp));
/*
* Copy the entries field by field.
*/
for (i = 0, sfep = xfs_dir2_sf_firstentry(sfp),
oldsfep = xfs_dir2_sf_firstentry(oldsfp);
i < sfp->count;
- i++, sfep = xfs_dir3_sf_nextentry(mp, sfp, sfep),
- oldsfep = xfs_dir3_sf_nextentry(mp, oldsfp, oldsfep)) {
+ i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep),
+ oldsfep = dp->d_ops->sf_nextentry(oldsfp, oldsfep)) {
sfep->namelen = oldsfep->namelen;
sfep->offset = oldsfep->offset;
memcpy(sfep->name, oldsfep->name, sfep->namelen);
- xfs_dir3_sfe_put_ino(mp, sfp, sfep,
- xfs_dir3_sfe_get_ino(mp, oldsfp, oldsfep));
- xfs_dir3_sfe_put_ftype(mp, sfp, sfep,
- xfs_dir3_sfe_get_ftype(mp, oldsfp, oldsfep));
+ dp->d_ops->sf_put_ino(sfp, sfep,
+ dp->d_ops->sf_get_ino(oldsfp, oldsfep));
+ dp->d_ops->sf_put_ftype(sfep, dp->d_ops->sf_get_ftype(oldsfep));
}
/*
* Clean up the inode.
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 45560ee1a4b..4f11ef01113 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -17,22 +17,21 @@
*/
#include "xfs.h"
#include "xfs_format.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_quota.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_btree.h"
#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_discard.h"
#include "xfs_trace.h"
+#include "xfs_log.h"
STATIC int
xfs_trim_extents(
@@ -158,7 +157,7 @@ xfs_ioc_trim(
struct xfs_mount *mp,
struct fstrim_range __user *urange)
{
- struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
+ struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev);
unsigned int granularity = q->limits.discard_granularity;
struct fstrim_range range;
xfs_daddr_t start, end, minlen;
@@ -181,7 +180,8 @@ xfs_ioc_trim(
* matter as trimming blocks is an advisory interface.
*/
if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
- range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)))
+ range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) ||
+ range.len < mp->m_sb.sb_blocksize)
return -XFS_ERROR(EINVAL);
start = BTOBB(range.start);
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 1ee776d477c..3ee0cd43edc 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -18,28 +18,28 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
-#include "xfs_rtalloc.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_attr.h"
+#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_trans_space.h"
#include "xfs_trans_priv.h"
#include "xfs_qm.h"
#include "xfs_cksum.h"
#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_bmap_btree.h"
/*
* Lock order:
@@ -292,118 +292,6 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp)
dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5;
}
-STATIC bool
-xfs_dquot_buf_verify_crc(
- struct xfs_mount *mp,
- struct xfs_buf *bp)
-{
- struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
- int ndquots;
- int i;
-
- if (!xfs_sb_version_hascrc(&mp->m_sb))
- return true;
-
- /*
- * if we are in log recovery, the quota subsystem has not been
- * initialised so we have no quotainfo structure. In that case, we need
- * to manually calculate the number of dquots in the buffer.
- */
- if (mp->m_quotainfo)
- ndquots = mp->m_quotainfo->qi_dqperchunk;
- else
- ndquots = xfs_qm_calc_dquots_per_chunk(mp,
- XFS_BB_TO_FSB(mp, bp->b_length));
-
- for (i = 0; i < ndquots; i++, d++) {
- if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
- XFS_DQUOT_CRC_OFF))
- return false;
- if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
- return false;
- }
- return true;
-}
-
-STATIC bool
-xfs_dquot_buf_verify(
- struct xfs_mount *mp,
- struct xfs_buf *bp)
-{
- struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
- xfs_dqid_t id = 0;
- int ndquots;
- int i;
-
- /*
- * if we are in log recovery, the quota subsystem has not been
- * initialised so we have no quotainfo structure. In that case, we need
- * to manually calculate the number of dquots in the buffer.
- */
- if (mp->m_quotainfo)
- ndquots = mp->m_quotainfo->qi_dqperchunk;
- else
- ndquots = xfs_qm_calc_dquots_per_chunk(mp, bp->b_length);
-
- /*
- * On the first read of the buffer, verify that each dquot is valid.
- * We don't know what the id of the dquot is supposed to be, just that
- * they should be increasing monotonically within the buffer. If the
- * first id is corrupt, then it will fail on the second dquot in the
- * buffer so corruptions could point to the wrong dquot in this case.
- */
- for (i = 0; i < ndquots; i++) {
- struct xfs_disk_dquot *ddq;
- int error;
-
- ddq = &d[i].dd_diskdq;
-
- if (i == 0)
- id = be32_to_cpu(ddq->d_id);
-
- error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
- "xfs_dquot_buf_verify");
- if (error)
- return false;
- }
- return true;
-}
-
-static void
-xfs_dquot_buf_read_verify(
- struct xfs_buf *bp)
-{
- struct xfs_mount *mp = bp->b_target->bt_mount;
-
- if (!xfs_dquot_buf_verify_crc(mp, bp) || !xfs_dquot_buf_verify(mp, bp)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
-}
-
-/*
- * we don't calculate the CRC here as that is done when the dquot is flushed to
- * the buffer after the update is done. This ensures that the dquot in the
- * buffer always has an up-to-date CRC value.
- */
-void
-xfs_dquot_buf_write_verify(
- struct xfs_buf *bp)
-{
- struct xfs_mount *mp = bp->b_target->bt_mount;
-
- if (!xfs_dquot_buf_verify(mp, bp)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
- xfs_buf_ioerror(bp, EFSCORRUPTED);
- return;
- }
-}
-
-const struct xfs_buf_ops xfs_dquot_buf_ops = {
- .verify_read = xfs_dquot_buf_read_verify,
- .verify_write = xfs_dquot_buf_write_verify,
-};
-
/*
* Allocate a block and fill it with dquots.
* This is called when the bmapi finds a hole.
@@ -465,10 +353,10 @@ xfs_qm_dqalloc(
dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen,
0);
-
- error = xfs_buf_geterror(bp);
- if (error)
+ if (!bp) {
+ error = ENOMEM;
goto error1;
+ }
bp->b_ops = &xfs_dquot_buf_ops;
/*
@@ -514,6 +402,7 @@ xfs_qm_dqalloc(
return (error);
}
+
STATIC int
xfs_qm_dqrepair(
struct xfs_mount *mp,
@@ -547,7 +436,7 @@ xfs_qm_dqrepair(
/* Do the actual repair of dquots in this buffer */
for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
ddq = &d[i].dd_diskdq;
- error = xfs_qm_dqcheck(mp, ddq, firstid + i,
+ error = xfs_dqcheck(mp, ddq, firstid + i,
dqp->dq_flags & XFS_DQ_ALLTYPES,
XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair");
if (error) {
@@ -580,16 +469,17 @@ xfs_qm_dqtobp(
struct xfs_mount *mp = dqp->q_mount;
xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
struct xfs_trans *tp = (tpp ? *tpp : NULL);
+ uint lock_mode;
dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
- xfs_ilock(quotip, XFS_ILOCK_SHARED);
+ lock_mode = xfs_ilock_data_map_shared(quotip);
if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
/*
* Return if this type of quotas is turned off while we
* didn't have the quota inode lock.
*/
- xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+ xfs_iunlock(quotip, lock_mode);
return ESRCH;
}
@@ -599,7 +489,7 @@ xfs_qm_dqtobp(
error = xfs_bmapi_read(quotip, dqp->q_fileoffset,
XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0);
- xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+ xfs_iunlock(quotip, lock_mode);
if (error)
return error;
@@ -725,7 +615,7 @@ xfs_qm_dqread(
if (flags & XFS_QMOPT_DQALLOC) {
tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_attrsetm,
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_dqalloc,
XFS_QM_DQALLOC_SPACE_RES(mp), 0);
if (error)
goto error1;
@@ -942,47 +832,6 @@ restart:
return (0);
}
-
-STATIC void
-xfs_qm_dqput_final(
- struct xfs_dquot *dqp)
-{
- struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
- struct xfs_dquot *gdqp;
- struct xfs_dquot *pdqp;
-
- trace_xfs_dqput_free(dqp);
-
- if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
- XFS_STATS_INC(xs_qm_dquot_unused);
-
- /*
- * If we just added a udquot to the freelist, then we want to release
- * the gdquot/pdquot reference that it (probably) has. Otherwise it'll
- * keep the gdquot/pdquot from getting reclaimed.
- */
- gdqp = dqp->q_gdquot;
- if (gdqp) {
- xfs_dqlock(gdqp);
- dqp->q_gdquot = NULL;
- }
-
- pdqp = dqp->q_pdquot;
- if (pdqp) {
- xfs_dqlock(pdqp);
- dqp->q_pdquot = NULL;
- }
- xfs_dqunlock(dqp);
-
- /*
- * If we had a group/project quota hint, release it now.
- */
- if (gdqp)
- xfs_qm_dqput(gdqp);
- if (pdqp)
- xfs_qm_dqput(pdqp);
-}
-
/*
* Release a reference to the dquot (decrement ref-count) and unlock it.
*
@@ -998,10 +847,14 @@ xfs_qm_dqput(
trace_xfs_dqput(dqp);
- if (--dqp->q_nrefs > 0)
- xfs_dqunlock(dqp);
- else
- xfs_qm_dqput_final(dqp);
+ if (--dqp->q_nrefs == 0) {
+ struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
+ trace_xfs_dqput_free(dqp);
+
+ if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
+ XFS_STATS_INC(xs_qm_dquot_unused);
+ }
+ xfs_dqunlock(dqp);
}
/*
@@ -1133,7 +986,7 @@ xfs_qm_dqflush(
/*
* A simple sanity check in case we got a corrupted dquot..
*/
- error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
+ error = xfs_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
XFS_QMOPT_DOWARN, "dqflush (incore copy)");
if (error) {
xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 55abbca2883..68a68f70483 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -52,8 +52,6 @@ typedef struct xfs_dquot {
int q_bufoffset; /* off of dq in buffer (# dquots) */
xfs_fileoff_t q_fileoffset; /* offset in quotas file */
- struct xfs_dquot*q_gdquot; /* group dquot, hint only */
- struct xfs_dquot*q_pdquot; /* project dquot, hint only */
xfs_disk_dquot_t q_core; /* actual usage & quotas */
xfs_dq_logitem_t q_logitem; /* dquot log item */
xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */
@@ -172,6 +170,4 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
return dqp;
}
-extern const struct xfs_buf_ops xfs_dquot_buf_ops;
-
#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/xfs_dquot_buf.c
new file mode 100644
index 00000000000..c2ac0c611ad
--- /dev/null
+++ b/fs/xfs/xfs_dquot_buf.c
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_qm.h"
+#include "xfs_error.h"
+#include "xfs_cksum.h"
+#include "xfs_trace.h"
+
+int
+xfs_calc_dquots_per_chunk(
+ unsigned int nbblks) /* basic block units */
+{
+ unsigned int ndquots;
+
+ ASSERT(nbblks > 0);
+ ndquots = BBTOB(nbblks);
+ do_div(ndquots, sizeof(xfs_dqblk_t));
+
+ return ndquots;
+}
+
+/*
+ * Do some primitive error checking on ondisk dquot data structures.
+ */
+int
+xfs_dqcheck(
+ struct xfs_mount *mp,
+ xfs_disk_dquot_t *ddq,
+ xfs_dqid_t id,
+ uint type, /* used only when IO_dorepair is true */
+ uint flags,
+ char *str)
+{
+ xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
+ int errs = 0;
+
+ /*
+ * We can encounter an uninitialized dquot buffer for 2 reasons:
+ * 1. If we crash while deleting the quotainode(s), and those blks got
+ * used for user data. This is because we take the path of regular
+ * file deletion; however, the size field of quotainodes is never
+ * updated, so all the tricks that we play in itruncate_finish
+ * don't quite matter.
+ *
+ * 2. We don't play the quota buffers when there's a quotaoff logitem.
+ * But the allocation will be replayed so we'll end up with an
+ * uninitialized quota block.
+ *
+ * This is all fine; things are still consistent, and we haven't lost
+ * any quota information. Just don't complain about bad dquot blks.
+ */
+ if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
+ str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
+ errs++;
+ }
+ if (ddq->d_version != XFS_DQUOT_VERSION) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
+ str, id, ddq->d_version, XFS_DQUOT_VERSION);
+ errs++;
+ }
+
+ if (ddq->d_flags != XFS_DQ_USER &&
+ ddq->d_flags != XFS_DQ_PROJ &&
+ ddq->d_flags != XFS_DQ_GROUP) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
+ str, id, ddq->d_flags);
+ errs++;
+ }
+
+ if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : ondisk-dquot 0x%p, ID mismatch: "
+ "0x%x expected, found id 0x%x",
+ str, ddq, id, be32_to_cpu(ddq->d_id));
+ errs++;
+ }
+
+ if (!errs && ddq->d_id) {
+ if (ddq->d_blk_softlimit &&
+ be64_to_cpu(ddq->d_bcount) >
+ be64_to_cpu(ddq->d_blk_softlimit)) {
+ if (!ddq->d_btimer) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
+ str, (int)be32_to_cpu(ddq->d_id), ddq);
+ errs++;
+ }
+ }
+ if (ddq->d_ino_softlimit &&
+ be64_to_cpu(ddq->d_icount) >
+ be64_to_cpu(ddq->d_ino_softlimit)) {
+ if (!ddq->d_itimer) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
+ str, (int)be32_to_cpu(ddq->d_id), ddq);
+ errs++;
+ }
+ }
+ if (ddq->d_rtb_softlimit &&
+ be64_to_cpu(ddq->d_rtbcount) >
+ be64_to_cpu(ddq->d_rtb_softlimit)) {
+ if (!ddq->d_rtbtimer) {
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_alert(mp,
+ "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
+ str, (int)be32_to_cpu(ddq->d_id), ddq);
+ errs++;
+ }
+ }
+ }
+
+ if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
+ return errs;
+
+ if (flags & XFS_QMOPT_DOWARN)
+ xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
+
+ /*
+ * Typically, a repair is only requested by quotacheck.
+ */
+ ASSERT(id != -1);
+ ASSERT(flags & XFS_QMOPT_DQREPAIR);
+ memset(d, 0, sizeof(xfs_dqblk_t));
+
+ d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+ d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
+ d->dd_diskdq.d_flags = type;
+ d->dd_diskdq.d_id = cpu_to_be32(id);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
+ xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
+
+ return errs;
+}
+
+STATIC bool
+xfs_dquot_buf_verify_crc(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp)
+{
+ struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
+ int ndquots;
+ int i;
+
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ return true;
+
+ /*
+ * if we are in log recovery, the quota subsystem has not been
+ * initialised so we have no quotainfo structure. In that case, we need
+ * to manually calculate the number of dquots in the buffer.
+ */
+ if (mp->m_quotainfo)
+ ndquots = mp->m_quotainfo->qi_dqperchunk;
+ else
+ ndquots = xfs_calc_dquots_per_chunk(
+ XFS_BB_TO_FSB(mp, bp->b_length));
+
+ for (i = 0; i < ndquots; i++, d++) {
+ if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF))
+ return false;
+ if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid))
+ return false;
+ }
+ return true;
+}
+
+STATIC bool
+xfs_dquot_buf_verify(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp)
+{
+ struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
+ xfs_dqid_t id = 0;
+ int ndquots;
+ int i;
+
+ /*
+ * if we are in log recovery, the quota subsystem has not been
+ * initialised so we have no quotainfo structure. In that case, we need
+ * to manually calculate the number of dquots in the buffer.
+ */
+ if (mp->m_quotainfo)
+ ndquots = mp->m_quotainfo->qi_dqperchunk;
+ else
+ ndquots = xfs_calc_dquots_per_chunk(bp->b_length);
+
+ /*
+ * On the first read of the buffer, verify that each dquot is valid.
+ * We don't know what the id of the dquot is supposed to be, just that
+ * they should be increasing monotonically within the buffer. If the
+ * first id is corrupt, then it will fail on the second dquot in the
+ * buffer so corruptions could point to the wrong dquot in this case.
+ */
+ for (i = 0; i < ndquots; i++) {
+ struct xfs_disk_dquot *ddq;
+ int error;
+
+ ddq = &d[i].dd_diskdq;
+
+ if (i == 0)
+ id = be32_to_cpu(ddq->d_id);
+
+ error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
+ "xfs_dquot_buf_verify");
+ if (error)
+ return false;
+ }
+ return true;
+}
+
+static void
+xfs_dquot_buf_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (!xfs_dquot_buf_verify_crc(mp, bp))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_dquot_buf_verify(mp, bp))
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
+}
+
+/*
+ * we don't calculate the CRC here as that is done when the dquot is flushed to
+ * the buffer after the update is done. This ensures that the dquot in the
+ * buffer always has an up-to-date CRC value.
+ */
+static void
+xfs_dquot_buf_write_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (!xfs_dquot_buf_verify(mp, bp)) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
+ }
+}
+
+const struct xfs_buf_ops xfs_dquot_buf_ops = {
+ .verify_read = xfs_dquot_buf_read_verify,
+ .verify_write = xfs_dquot_buf_write_verify,
+};
+
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index e838d84b4e8..f33fbaaa4d8 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -18,23 +18,19 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
+#include "xfs_quota.h"
#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_attr.h"
+#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_trans_priv.h"
#include "xfs_qm.h"
+#include "xfs_log.h"
static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
{
@@ -61,20 +57,24 @@ xfs_qm_dquot_logitem_size(
STATIC void
xfs_qm_dquot_logitem_format(
struct xfs_log_item *lip,
- struct xfs_log_iovec *logvec)
+ struct xfs_log_vec *lv)
{
struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
-
- logvec->i_addr = &qlip->qli_format;
- logvec->i_len = sizeof(xfs_dq_logformat_t);
- logvec->i_type = XLOG_REG_TYPE_QFORMAT;
- logvec++;
- logvec->i_addr = &qlip->qli_dquot->q_core;
- logvec->i_len = sizeof(xfs_disk_dquot_t);
- logvec->i_type = XLOG_REG_TYPE_DQUOT;
-
- qlip->qli_format.qlf_size = 2;
-
+ struct xfs_log_iovec *vecp = NULL;
+ struct xfs_dq_logformat *qlf;
+
+ qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QFORMAT);
+ qlf->qlf_type = XFS_LI_DQUOT;
+ qlf->qlf_size = 2;
+ qlf->qlf_id = be32_to_cpu(qlip->qli_dquot->q_core.d_id);
+ qlf->qlf_blkno = qlip->qli_dquot->q_blkno;
+ qlf->qlf_len = 1;
+ qlf->qlf_boffset = qlip->qli_dquot->q_bufoffset;
+ xlog_finish_iovec(lv, vecp, sizeof(struct xfs_dq_logformat));
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_DQUOT,
+ &qlip->qli_dquot->q_core,
+ sizeof(struct xfs_disk_dquot));
}
/*
@@ -261,18 +261,6 @@ xfs_qm_dquot_logitem_init(
xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
&xfs_dquot_item_ops);
lp->qli_dquot = dqp;
- lp->qli_format.qlf_type = XFS_LI_DQUOT;
- lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id);
- lp->qli_format.qlf_blkno = dqp->q_blkno;
- lp->qli_format.qlf_len = 1;
- /*
- * This is just the offset of this dquot within its buffer
- * (which is currently 1 FSB and probably won't change).
- * Hence 32 bits for this offset should be just fine.
- * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t))
- * here, and recompute it at recovery time.
- */
- lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset;
}
/*------------------ QUOTAOFF LOG ITEMS -------------------*/
@@ -298,26 +286,20 @@ xfs_qm_qoff_logitem_size(
*nbytes += sizeof(struct xfs_qoff_logitem);
}
-/*
- * This is called to fill in the vector of log iovecs for the
- * given quotaoff log item. We use only 1 iovec, and we point that
- * at the quotaoff_log_format structure embedded in the quotaoff item.
- * It is at this point that we assert that all of the extent
- * slots in the quotaoff item have been filled.
- */
STATIC void
xfs_qm_qoff_logitem_format(
struct xfs_log_item *lip,
- struct xfs_log_iovec *log_vector)
+ struct xfs_log_vec *lv)
{
struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip);
-
- ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF);
-
- log_vector->i_addr = &qflip->qql_format;
- log_vector->i_len = sizeof(xfs_qoff_logitem_t);
- log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF;
- qflip->qql_format.qf_size = 1;
+ struct xfs_log_iovec *vecp = NULL;
+ struct xfs_qoff_logformat *qlf;
+
+ qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF);
+ qlf->qf_type = XFS_LI_QUOTAOFF;
+ qlf->qf_size = 1;
+ qlf->qf_flags = qflip->qql_flags;
+ xlog_finish_iovec(lv, vecp, sizeof(struct xfs_qoff_logitem));
}
/*
@@ -457,8 +439,7 @@ xfs_qm_qoff_logitem_init(
xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
&xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
qf->qql_item.li_mountp = mp;
- qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
- qf->qql_format.qf_flags = flags;
qf->qql_start_lip = start;
+ qf->qql_flags = flags;
return qf;
}
diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
index 5acae2ada70..502e9464634 100644
--- a/fs/xfs/xfs_dquot_item.h
+++ b/fs/xfs/xfs_dquot_item.h
@@ -27,13 +27,12 @@ typedef struct xfs_dq_logitem {
xfs_log_item_t qli_item; /* common portion */
struct xfs_dquot *qli_dquot; /* dquot ptr */
xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
- xfs_dq_logformat_t qli_format; /* logged structure */
} xfs_dq_logitem_t;
typedef struct xfs_qoff_logitem {
xfs_log_item_t qql_item; /* common portion */
struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
- xfs_qoff_logformat_t qql_format; /* logged structure */
+ unsigned int qql_flags;
} xfs_qoff_logitem_t;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 1123d93ff79..edac5b057d2 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -16,16 +16,13 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
+#include "xfs_format.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
#include "xfs_error.h"
#ifdef DEBUG
@@ -159,7 +156,7 @@ xfs_error_report(
{
if (level <= xfs_error_level) {
xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
- "Internal error %s at line %d of file %s. Caller 0x%p\n",
+ "Internal error %s at line %d of file %s. Caller %pF",
tag, linenum, filename, ra);
xfs_stack_trace();
@@ -181,3 +178,28 @@ xfs_corruption_error(
xfs_error_report(tag, level, mp, filename, linenum, ra);
xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
}
+
+/*
+ * Warnings specifically for verifier errors. Differentiate CRC vs. invalid
+ * values, and omit the stack trace unless the error level is tuned high.
+ */
+void
+xfs_verifier_error(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx",
+ bp->b_error == EFSBADCRC ? "CRC error" : "corruption",
+ __return_address, bp->b_bn);
+
+ xfs_alert(mp, "Unmount and run xfs_repair");
+
+ if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
+ xfs_alert(mp, "First 64 bytes of corrupted metadata buffer:");
+ xfs_hex_dump(xfs_buf_offset(bp, 0), 64);
+ }
+
+ if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
+}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 079a367f44e..c1c57d4a4b5 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -34,6 +34,7 @@ extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
extern void xfs_corruption_error(const char *tag, int level,
struct xfs_mount *mp, void *p, const char *filename,
int linenum, inst_t *ra);
+extern void xfs_verifier_error(struct xfs_buf *bp);
#define XFS_ERROR_REPORT(e, lvl, mp) \
xfs_error_report(e, lvl, mp, __FILE__, __LINE__, __return_address)
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 066df425c14..753e467aa1a 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -16,21 +16,21 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
+#include "xfs_da_format.h"
#include "xfs_dir2.h"
#include "xfs_export.h"
-#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_log.h"
/*
* Note that we only accept fileids which are long enough rather than allow
@@ -237,7 +237,7 @@ xfs_fs_nfs_commit_metadata(
if (!lsn)
return 0;
- return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
}
const struct export_operations xfs_export_operations = {
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index e43708e2f08..fd22f69049d 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -19,17 +19,18 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
#include "xfs_alloc.h"
-#include "xfs_inode.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
void
xfs_extent_busy_insert(
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index 985412d65ba..bfff284d2dc 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -20,6 +20,10 @@
#ifndef __XFS_EXTENT_BUSY_H__
#define __XFS_EXTENT_BUSY_H__
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_alloc_arg;
+
/*
* Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
* have been freed but whose transactions aren't committed to disk yet.
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index dc53e8febbb..fb7a4c1ce1c 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -17,15 +17,16 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_buf_item.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_trans.h"
#include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
#include "xfs_extfree_item.h"
+#include "xfs_log.h"
kmem_zone_t *xfs_efi_zone;
@@ -101,9 +102,10 @@ xfs_efi_item_size(
STATIC void
xfs_efi_item_format(
struct xfs_log_item *lip,
- struct xfs_log_iovec *log_vector)
+ struct xfs_log_vec *lv)
{
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
ASSERT(atomic_read(&efip->efi_next_extent) ==
efip->efi_format.efi_nextents);
@@ -111,10 +113,9 @@ xfs_efi_item_format(
efip->efi_format.efi_type = XFS_LI_EFI;
efip->efi_format.efi_size = 1;
- log_vector->i_addr = &efip->efi_format;
- log_vector->i_len = xfs_efi_item_sizeof(efip);
- log_vector->i_type = XLOG_REG_TYPE_EFI_FORMAT;
- ASSERT(log_vector->i_len >= sizeof(xfs_efi_log_format_t));
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFI_FORMAT,
+ &efip->efi_format,
+ xfs_efi_item_sizeof(efip));
}
@@ -368,19 +369,19 @@ xfs_efd_item_size(
STATIC void
xfs_efd_item_format(
struct xfs_log_item *lip,
- struct xfs_log_iovec *log_vector)
+ struct xfs_log_vec *lv)
{
struct xfs_efd_log_item *efdp = EFD_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
ASSERT(efdp->efd_next_extent == efdp->efd_format.efd_nextents);
efdp->efd_format.efd_type = XFS_LI_EFD;
efdp->efd_format.efd_size = 1;
- log_vector->i_addr = &efdp->efd_format;
- log_vector->i_len = xfs_efd_item_sizeof(efdp);
- log_vector->i_type = XLOG_REG_TYPE_EFD_FORMAT;
- ASSERT(log_vector->i_len >= sizeof(xfs_efd_log_format_t));
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_EFD_FORMAT,
+ &efdp->efd_format,
+ xfs_efd_item_sizeof(efdp));
}
/*
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 4c749ab543d..1f66779d7a4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -17,25 +17,27 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_log.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_trans.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_dinode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_error.h"
-#include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_dinode.h"
#include <linux/aio.h>
#include <linux/dcache.h>
@@ -153,7 +155,7 @@ xfs_dir_fsync(
if (!lsn)
return 0;
- return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
}
STATIC int
@@ -227,39 +229,33 @@ xfs_file_fsync(
}
STATIC ssize_t
-xfs_file_aio_read(
+xfs_file_read_iter(
struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos)
+ struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- size_t size = 0;
+ size_t size = iov_iter_count(to);
ssize_t ret = 0;
int ioflags = 0;
xfs_fsize_t n;
+ loff_t pos = iocb->ki_pos;
XFS_STATS_INC(xs_read_calls);
- BUG_ON(iocb->ki_pos != pos);
-
if (unlikely(file->f_flags & O_DIRECT))
ioflags |= IO_ISDIRECT;
if (file->f_mode & FMODE_NOCMTIME)
ioflags |= IO_INVIS;
- ret = generic_segment_checks(iovp, &nr_segs, &size, VERIFY_WRITE);
- if (ret < 0)
- return ret;
-
if (unlikely(ioflags & IO_ISDIRECT)) {
xfs_buftarg_t *target =
XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
- if ((pos & target->bt_smask) || (size & target->bt_smask)) {
+ /* DIO must be aligned to device logical sector size */
+ if ((pos | size) & target->bt_logical_sectormask) {
if (pos == i_size_read(inode))
return 0;
return -XFS_ERROR(EINVAL);
@@ -292,7 +288,7 @@ xfs_file_aio_read(
xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
if (inode->i_mapping->nrpages) {
- ret = -filemap_write_and_wait_range(
+ ret = filemap_write_and_wait_range(
VFS_I(ip)->i_mapping,
pos, -1);
if (ret) {
@@ -306,7 +302,7 @@ xfs_file_aio_read(
trace_xfs_file_read(ip, size, pos, ioflags);
- ret = generic_file_aio_read(iocb, iovp, nr_segs, pos);
+ ret = generic_file_read_iter(iocb, to);
if (ret > 0)
XFS_STATS_ADD(xs_read_bytes, ret);
@@ -347,47 +343,6 @@ xfs_file_splice_read(
}
/*
- * xfs_file_splice_write() does not use xfs_rw_ilock() because
- * generic_file_splice_write() takes the i_mutex itself. This, in theory,
- * couuld cause lock inversions between the aio_write path and the splice path
- * if someone is doing concurrent splice(2) based writes and write(2) based
- * writes to the same inode. The only real way to fix this is to re-implement
- * the generic code here with correct locking orders.
- */
-STATIC ssize_t
-xfs_file_splice_write(
- struct pipe_inode_info *pipe,
- struct file *outfilp,
- loff_t *ppos,
- size_t count,
- unsigned int flags)
-{
- struct inode *inode = outfilp->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- int ioflags = 0;
- ssize_t ret;
-
- XFS_STATS_INC(xs_write_calls);
-
- if (outfilp->f_mode & FMODE_NOCMTIME)
- ioflags |= IO_INVIS;
-
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return -EIO;
-
- xfs_ilock(ip, XFS_IOLOCK_EXCL);
-
- trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
-
- ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
- if (ret > 0)
- XFS_STATS_ADD(xs_write_bytes, ret);
-
- xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- return ret;
-}
-
-/*
* This routine is called to handle zeroing any space in the last block of the
* file that is beyond the EOF. We do this since the size is being increased
* without writing anything to that block and we don't want to read the
@@ -622,10 +577,7 @@ restart:
STATIC ssize_t
xfs_file_dio_aio_write(
struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos,
- size_t ocount)
+ struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -633,15 +585,18 @@ xfs_file_dio_aio_write(
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
ssize_t ret = 0;
- size_t count = ocount;
int unaligned_io = 0;
int iolock;
+ size_t count = iov_iter_count(from);
+ loff_t pos = iocb->ki_pos;
struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
- if ((pos & target->bt_smask) || (count & target->bt_smask))
+ /* DIO must be aligned to device logical sector size */
+ if ((pos | count) & target->bt_logical_sectormask)
return -XFS_ERROR(EINVAL);
+ /* "unaligned" here means not aligned to a filesystem block */
if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
unaligned_io = 1;
@@ -672,9 +627,10 @@ xfs_file_dio_aio_write(
ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock);
if (ret)
goto out;
+ iov_iter_truncate(from, count);
if (mapping->nrpages) {
- ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+ ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
pos, -1);
if (ret)
goto out;
@@ -693,8 +649,7 @@ xfs_file_dio_aio_write(
}
trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
- ret = generic_file_direct_write(iocb, iovp,
- &nr_segs, pos, &iocb->ki_pos, count, ocount);
+ ret = generic_file_direct_write(iocb, from, pos);
out:
xfs_rw_iunlock(ip, iolock);
@@ -707,10 +662,7 @@ out:
STATIC ssize_t
xfs_file_buffered_aio_write(
struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos,
- size_t ocount)
+ struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -719,7 +671,8 @@ xfs_file_buffered_aio_write(
ssize_t ret;
int enospc = 0;
int iolock = XFS_IOLOCK_EXCL;
- size_t count = ocount;
+ loff_t pos = iocb->ki_pos;
+ size_t count = iov_iter_count(from);
xfs_rw_ilock(ip, iolock);
@@ -727,14 +680,15 @@ xfs_file_buffered_aio_write(
if (ret)
goto out;
+ iov_iter_truncate(from, count);
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
write_retry:
trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
- ret = generic_file_buffered_write(iocb, iovp, nr_segs,
- pos, &iocb->ki_pos, count, 0);
-
+ ret = generic_perform_write(file, from, pos);
+ if (likely(ret >= 0))
+ iocb->ki_pos = pos + ret;
/*
* If we just got an ENOSPC, try to write back all dirty inodes to
* convert delalloc space to free up some of the excess reserved
@@ -753,40 +707,29 @@ out:
}
STATIC ssize_t
-xfs_file_aio_write(
+xfs_file_write_iter(
struct kiocb *iocb,
- const struct iovec *iovp,
- unsigned long nr_segs,
- loff_t pos)
+ struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
struct xfs_inode *ip = XFS_I(inode);
ssize_t ret;
- size_t ocount = 0;
+ size_t ocount = iov_iter_count(from);
XFS_STATS_INC(xs_write_calls);
- BUG_ON(iocb->ki_pos != pos);
-
- ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
- if (ret)
- return ret;
-
if (ocount == 0)
return 0;
- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
- ret = -EIO;
- goto out;
- }
+ if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+ return -EIO;
if (unlikely(file->f_flags & O_DIRECT))
- ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos, ocount);
+ ret = xfs_file_dio_aio_write(iocb, from);
else
- ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
- ocount);
+ ret = xfs_file_buffered_aio_write(iocb, from);
if (ret > 0) {
ssize_t err;
@@ -794,55 +737,99 @@ xfs_file_aio_write(
XFS_STATS_ADD(xs_write_bytes, ret);
/* Handle various SYNC-type writes */
- err = generic_write_sync(file, pos, ret);
+ err = generic_write_sync(file, iocb->ki_pos - ret, ret);
if (err < 0)
ret = err;
}
-
-out:
return ret;
}
STATIC long
xfs_file_fallocate(
- struct file *file,
- int mode,
- loff_t offset,
- loff_t len)
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len)
{
- struct inode *inode = file_inode(file);
- long error;
- loff_t new_size = 0;
- xfs_flock64_t bf;
- xfs_inode_t *ip = XFS_I(inode);
- int cmd = XFS_IOC_RESVSP;
- int attr_flags = XFS_ATTR_NOLOCK;
+ struct inode *inode = file_inode(file);
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_trans *tp;
+ long error;
+ loff_t new_size = 0;
- if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
return -EOPNOTSUPP;
- bf.l_whence = 0;
- bf.l_start = offset;
- bf.l_len = len;
-
xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ error = xfs_free_file_space(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+ unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
+
+ if (offset & blksize_mask || len & blksize_mask) {
+ error = EINVAL;
+ goto out_unlock;
+ }
+
+ /*
+ * There is no need to overlap collapse range with EOF,
+ * in which case it is effectively a truncate operation
+ */
+ if (offset + len >= i_size_read(inode)) {
+ error = EINVAL;
+ goto out_unlock;
+ }
+
+ new_size = i_size_read(inode) - len;
- if (mode & FALLOC_FL_PUNCH_HOLE)
- cmd = XFS_IOC_UNRESVSP;
+ error = xfs_collapse_file_space(ip, offset, len);
+ if (error)
+ goto out_unlock;
+ } else {
+ if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+ offset + len > i_size_read(inode)) {
+ new_size = offset + len;
+ error = -inode_newsize_ok(inode, new_size);
+ if (error)
+ goto out_unlock;
+ }
- /* check the new inode size is valid before allocating */
- if (!(mode & FALLOC_FL_KEEP_SIZE) &&
- offset + len > i_size_read(inode)) {
- new_size = offset + len;
- error = inode_newsize_ok(inode, new_size);
+ if (mode & FALLOC_FL_ZERO_RANGE)
+ error = xfs_zero_file_space(ip, offset, len);
+ else
+ error = xfs_alloc_file_space(ip, offset, len,
+ XFS_BMAPI_PREALLOC);
if (error)
goto out_unlock;
}
- if (file->f_flags & O_DSYNC)
- attr_flags |= XFS_ATTR_SYNC;
+ tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
+ error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ goto out_unlock;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ ip->i_d.di_mode &= ~S_ISUID;
+ if (ip->i_d.di_mode & S_IXGRP)
+ ip->i_d.di_mode &= ~S_ISGID;
+
+ if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)))
+ ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
+
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags);
+ if (file->f_flags & O_DSYNC)
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp, 0);
if (error)
goto out_unlock;
@@ -852,12 +839,12 @@ xfs_file_fallocate(
iattr.ia_valid = ATTR_SIZE;
iattr.ia_size = new_size;
- error = -xfs_setattr_size(ip, &iattr, XFS_ATTR_NOLOCK);
+ error = xfs_setattr_size(ip, &iattr);
}
out_unlock:
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
- return error;
+ return -error;
}
@@ -890,9 +877,9 @@ xfs_dir_open(
* If there are any blocks, read-ahead block 0 as we're almost
* certain to have the next operation be a read there.
*/
- mode = xfs_ilock_map_shared(ip);
+ mode = xfs_ilock_data_map_shared(ip);
if (ip->i_d.di_nextents > 0)
- xfs_dir3_data_readahead(NULL, ip, 0, -1);
+ xfs_dir3_data_readahead(ip, 0, -1);
xfs_iunlock(ip, mode);
return 0;
}
@@ -1193,7 +1180,7 @@ xfs_seek_data(
uint lock;
int error;
- lock = xfs_ilock_map_shared(ip);
+ lock = xfs_ilock_data_map_shared(ip);
isize = i_size_read(inode);
if (start >= isize) {
@@ -1272,7 +1259,7 @@ out:
offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out_unlock:
- xfs_iunlock_map_shared(ip, lock);
+ xfs_iunlock(ip, lock);
if (error)
return -error;
@@ -1297,7 +1284,7 @@ xfs_seek_hole(
if (XFS_FORCED_SHUTDOWN(mp))
return -XFS_ERROR(EIO);
- lock = xfs_ilock_map_shared(ip);
+ lock = xfs_ilock_data_map_shared(ip);
isize = i_size_read(inode);
if (start >= isize) {
@@ -1380,7 +1367,7 @@ out:
offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out_unlock:
- xfs_iunlock_map_shared(ip, lock);
+ xfs_iunlock(ip, lock);
if (error)
return -error;
@@ -1409,12 +1396,12 @@ xfs_file_llseek(
const struct file_operations xfs_file_operations = {
.llseek = xfs_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = xfs_file_aio_read,
- .aio_write = xfs_file_aio_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = xfs_file_read_iter,
+ .write_iter = xfs_file_write_iter,
.splice_read = xfs_file_splice_read,
- .splice_write = xfs_file_splice_write,
+ .splice_write = iter_file_splice_write,
.unlocked_ioctl = xfs_file_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = xfs_file_compat_ioctl,
@@ -1440,6 +1427,7 @@ const struct file_operations xfs_dir_file_operations = {
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = filemap_fault,
+ .map_pages = filemap_map_pages,
.page_mkwrite = xfs_vm_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index ce78e654d37..8ec81bed799 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2006-2007 Silicon Graphics, Inc.
+ * Copyright (c) 2014 Christoph Hellwig.
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -16,116 +17,36 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
-#include "xfs_log.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inum.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_ag.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
+#include "xfs_inum.h"
+#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_alloc.h"
#include "xfs_mru_cache.h"
+#include "xfs_dinode.h"
#include "xfs_filestream.h"
#include "xfs_trace.h"
-#ifdef XFS_FILESTREAMS_TRACE
-
-ktrace_t *xfs_filestreams_trace_buf;
-
-STATIC void
-xfs_filestreams_trace(
- xfs_mount_t *mp, /* mount point */
- int type, /* type of trace */
- const char *func, /* source function */
- int line, /* source line number */
- __psunsigned_t arg0,
- __psunsigned_t arg1,
- __psunsigned_t arg2,
- __psunsigned_t arg3,
- __psunsigned_t arg4,
- __psunsigned_t arg5)
-{
- ktrace_enter(xfs_filestreams_trace_buf,
- (void *)(__psint_t)(type | (line << 16)),
- (void *)func,
- (void *)(__psunsigned_t)current_pid(),
- (void *)mp,
- (void *)(__psunsigned_t)arg0,
- (void *)(__psunsigned_t)arg1,
- (void *)(__psunsigned_t)arg2,
- (void *)(__psunsigned_t)arg3,
- (void *)(__psunsigned_t)arg4,
- (void *)(__psunsigned_t)arg5,
- NULL, NULL, NULL, NULL, NULL, NULL);
-}
-
-#define TRACE0(mp,t) TRACE6(mp,t,0,0,0,0,0,0)
-#define TRACE1(mp,t,a0) TRACE6(mp,t,a0,0,0,0,0,0)
-#define TRACE2(mp,t,a0,a1) TRACE6(mp,t,a0,a1,0,0,0,0)
-#define TRACE3(mp,t,a0,a1,a2) TRACE6(mp,t,a0,a1,a2,0,0,0)
-#define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0)
-#define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0)
-#define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \
- xfs_filestreams_trace(mp, t, __func__, __LINE__, \
- (__psunsigned_t)a0, (__psunsigned_t)a1, \
- (__psunsigned_t)a2, (__psunsigned_t)a3, \
- (__psunsigned_t)a4, (__psunsigned_t)a5)
-
-#define TRACE_AG_SCAN(mp, ag, ag2) \
- TRACE2(mp, XFS_FSTRM_KTRACE_AGSCAN, ag, ag2);
-#define TRACE_AG_PICK1(mp, max_ag, maxfree) \
- TRACE2(mp, XFS_FSTRM_KTRACE_AGPICK1, max_ag, maxfree);
-#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag) \
- TRACE6(mp, XFS_FSTRM_KTRACE_AGPICK2, ag, ag2, \
- cnt, free, scan, flag)
-#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2) \
- TRACE5(mp, XFS_FSTRM_KTRACE_UPDATE, ip, ag, cnt, ag2, cnt2)
-#define TRACE_FREE(mp, ip, pip, ag, cnt) \
- TRACE4(mp, XFS_FSTRM_KTRACE_FREE, ip, pip, ag, cnt)
-#define TRACE_LOOKUP(mp, ip, pip, ag, cnt) \
- TRACE4(mp, XFS_FSTRM_KTRACE_ITEM_LOOKUP, ip, pip, ag, cnt)
-#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt) \
- TRACE4(mp, XFS_FSTRM_KTRACE_ASSOCIATE, ip, pip, ag, cnt)
-#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt) \
- TRACE6(mp, XFS_FSTRM_KTRACE_MOVEAG, ip, pip, oag, ocnt, nag, ncnt)
-#define TRACE_ORPHAN(mp, ip, ag) \
- TRACE2(mp, XFS_FSTRM_KTRACE_ORPHAN, ip, ag);
-
-
-#else
-#define TRACE_AG_SCAN(mp, ag, ag2)
-#define TRACE_AG_PICK1(mp, max_ag, maxfree)
-#define TRACE_AG_PICK2(mp, ag, ag2, cnt, free, scan, flag)
-#define TRACE_UPDATE(mp, ip, ag, cnt, ag2, cnt2)
-#define TRACE_FREE(mp, ip, pip, ag, cnt)
-#define TRACE_LOOKUP(mp, ip, pip, ag, cnt)
-#define TRACE_ASSOCIATE(mp, ip, pip, ag, cnt)
-#define TRACE_MOVEAG(mp, ip, pip, oag, ocnt, nag, ncnt)
-#define TRACE_ORPHAN(mp, ip, ag)
-#endif
-
-static kmem_zone_t *item_zone;
+struct xfs_fstrm_item {
+ struct xfs_mru_cache_elem mru;
+ struct xfs_inode *ip;
+ xfs_agnumber_t ag; /* AG in use for this directory */
+};
-/*
- * Structure for associating a file or a directory with an allocation group.
- * The parent directory pointer is only needed for files, but since there will
- * generally be vastly more files than directories in the cache, using the same
- * data structure simplifies the code with very little memory overhead.
- */
-typedef struct fstrm_item
-{
- xfs_agnumber_t ag; /* AG currently in use for the file/directory. */
- xfs_inode_t *ip; /* inode self-pointer. */
- xfs_inode_t *pip; /* Parent directory inode pointer. */
-} fstrm_item_t;
+enum xfs_fstrm_alloc {
+ XFS_PICK_USERDATA = 1,
+ XFS_PICK_LOWSPACE = 2,
+};
/*
* Allocation group filestream associations are tracked with per-ag atomic
- * counters. These counters allow _xfs_filestream_pick_ag() to tell whether a
+ * counters. These counters allow xfs_filestream_pick_ag() to tell whether a
* particular AG already has active filestreams associated with it. The mount
* point's m_peraglock is used to protect these counters from per-ag array
* re-allocation during a growfs operation. When xfs_growfs_data_private() is
@@ -160,7 +81,7 @@ typedef struct fstrm_item
* the cache that reference per-ag array elements that have since been
* reallocated.
*/
-static int
+int
xfs_filestream_peek_ag(
xfs_mount_t *mp,
xfs_agnumber_t agno)
@@ -200,23 +121,40 @@ xfs_filestream_put_ag(
xfs_perag_put(pag);
}
+static void
+xfs_fstrm_free_func(
+ struct xfs_mru_cache_elem *mru)
+{
+ struct xfs_fstrm_item *item =
+ container_of(mru, struct xfs_fstrm_item, mru);
+
+ xfs_filestream_put_ag(item->ip->i_mount, item->ag);
+
+ trace_xfs_filestream_free(item->ip, item->ag);
+
+ kmem_free(item);
+}
+
/*
* Scan the AGs starting at startag looking for an AG that isn't in use and has
* at least minlen blocks free.
*/
static int
-_xfs_filestream_pick_ag(
- xfs_mount_t *mp,
- xfs_agnumber_t startag,
- xfs_agnumber_t *agp,
- int flags,
- xfs_extlen_t minlen)
+xfs_filestream_pick_ag(
+ struct xfs_inode *ip,
+ xfs_agnumber_t startag,
+ xfs_agnumber_t *agp,
+ int flags,
+ xfs_extlen_t minlen)
{
- int streams, max_streams;
- int err, trylock, nscan;
- xfs_extlen_t longest, free, minfree, maxfree = 0;
- xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
- struct xfs_perag *pag;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_fstrm_item *item;
+ struct xfs_perag *pag;
+ xfs_extlen_t longest, free = 0, minfree, maxfree = 0;
+ xfs_agnumber_t ag, max_ag = NULLAGNUMBER;
+ int err, trylock, nscan;
+
+ ASSERT(S_ISDIR(ip->i_d.di_mode));
/* 2% of an AG's blocks must be free for it to be chosen. */
minfree = mp->m_sb.sb_agblocks / 50;
@@ -228,8 +166,9 @@ _xfs_filestream_pick_ag(
trylock = XFS_ALLOC_FLAG_TRYLOCK;
for (nscan = 0; 1; nscan++) {
+ trace_xfs_filestream_scan(ip, ag);
+
pag = xfs_perag_get(mp, ag);
- TRACE_AG_SCAN(mp, ag, atomic_read(&pag->pagf_fstrms));
if (!pag->pagf_init) {
err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
@@ -246,7 +185,6 @@ _xfs_filestream_pick_ag(
/* Keep track of the AG with the most free blocks. */
if (pag->pagf_freeblks > maxfree) {
maxfree = pag->pagf_freeblks;
- max_streams = atomic_read(&pag->pagf_fstrms);
max_ag = ag;
}
@@ -269,7 +207,6 @@ _xfs_filestream_pick_ag(
/* Break out, retaining the reference on the AG. */
free = pag->pagf_freeblks;
- streams = atomic_read(&pag->pagf_fstrms);
xfs_perag_put(pag);
*agp = ag;
break;
@@ -305,317 +242,98 @@ next_ag:
*/
if (max_ag != NULLAGNUMBER) {
xfs_filestream_get_ag(mp, max_ag);
- TRACE_AG_PICK1(mp, max_ag, maxfree);
- streams = max_streams;
free = maxfree;
*agp = max_ag;
break;
}
/* take AG 0 if none matched */
- TRACE_AG_PICK1(mp, max_ag, maxfree);
+ trace_xfs_filestream_pick(ip, *agp, free, nscan);
*agp = 0;
return 0;
}
- TRACE_AG_PICK2(mp, startag, *agp, streams, free, nscan, flags);
-
- return 0;
-}
-
-/*
- * Set the allocation group number for a file or a directory, updating inode
- * references and per-AG references as appropriate.
- */
-static int
-_xfs_filestream_update_ag(
- xfs_inode_t *ip,
- xfs_inode_t *pip,
- xfs_agnumber_t ag)
-{
- int err = 0;
- xfs_mount_t *mp;
- xfs_mru_cache_t *cache;
- fstrm_item_t *item;
- xfs_agnumber_t old_ag;
- xfs_inode_t *old_pip;
+ trace_xfs_filestream_pick(ip, *agp, free, nscan);
- /*
- * Either ip is a regular file and pip is a directory, or ip is a
- * directory and pip is NULL.
- */
- ASSERT(ip && ((S_ISREG(ip->i_d.di_mode) && pip &&
- S_ISDIR(pip->i_d.di_mode)) ||
- (S_ISDIR(ip->i_d.di_mode) && !pip)));
-
- mp = ip->i_mount;
- cache = mp->m_filestream;
-
- item = xfs_mru_cache_lookup(cache, ip->i_ino);
- if (item) {
- ASSERT(item->ip == ip);
- old_ag = item->ag;
- item->ag = ag;
- old_pip = item->pip;
- item->pip = pip;
- xfs_mru_cache_done(cache);
-
- /*
- * If the AG has changed, drop the old ref and take a new one,
- * effectively transferring the reference from old to new AG.
- */
- if (ag != old_ag) {
- xfs_filestream_put_ag(mp, old_ag);
- xfs_filestream_get_ag(mp, ag);
- }
-
- /*
- * If ip is a file and its pip has changed, drop the old ref and
- * take a new one.
- */
- if (pip && pip != old_pip) {
- IRELE(old_pip);
- IHOLD(pip);
- }
-
- TRACE_UPDATE(mp, ip, old_ag, xfs_filestream_peek_ag(mp, old_ag),
- ag, xfs_filestream_peek_ag(mp, ag));
+ if (*agp == NULLAGNUMBER)
return 0;
- }
- item = kmem_zone_zalloc(item_zone, KM_MAYFAIL);
+ err = ENOMEM;
+ item = kmem_alloc(sizeof(*item), KM_MAYFAIL);
if (!item)
- return ENOMEM;
+ goto out_put_ag;
- item->ag = ag;
+ item->ag = *agp;
item->ip = ip;
- item->pip = pip;
- err = xfs_mru_cache_insert(cache, ip->i_ino, item);
+ err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru);
if (err) {
- kmem_zone_free(item_zone, item);
- return err;
+ if (err == EEXIST)
+ err = 0;
+ goto out_free_item;
}
- /* Take a reference on the AG. */
- xfs_filestream_get_ag(mp, ag);
-
- /*
- * Take a reference on the inode itself regardless of whether it's a
- * regular file or a directory.
- */
- IHOLD(ip);
-
- /*
- * In the case of a regular file, take a reference on the parent inode
- * as well to ensure it remains in-core.
- */
- if (pip)
- IHOLD(pip);
-
- TRACE_UPDATE(mp, ip, ag, xfs_filestream_peek_ag(mp, ag),
- ag, xfs_filestream_peek_ag(mp, ag));
-
- return 0;
-}
-
-/* xfs_fstrm_free_func(): callback for freeing cached stream items. */
-STATIC void
-xfs_fstrm_free_func(
- unsigned long ino,
- void *data)
-{
- fstrm_item_t *item = (fstrm_item_t *)data;
- xfs_inode_t *ip = item->ip;
-
- ASSERT(ip->i_ino == ino);
-
- xfs_iflags_clear(ip, XFS_IFILESTREAM);
-
- /* Drop the reference taken on the AG when the item was added. */
- xfs_filestream_put_ag(ip->i_mount, item->ag);
-
- TRACE_FREE(ip->i_mount, ip, item->pip, item->ag,
- xfs_filestream_peek_ag(ip->i_mount, item->ag));
-
- /*
- * _xfs_filestream_update_ag() always takes a reference on the inode
- * itself, whether it's a file or a directory. Release it here.
- * This can result in the inode being freed and so we must
- * not hold any inode locks when freeing filesstreams objects
- * otherwise we can deadlock here.
- */
- IRELE(ip);
-
- /*
- * In the case of a regular file, _xfs_filestream_update_ag() also
- * takes a ref on the parent inode to keep it in-core. Release that
- * too.
- */
- if (item->pip)
- IRELE(item->pip);
-
- /* Finally, free the memory allocated for the item. */
- kmem_zone_free(item_zone, item);
-}
-
-/*
- * xfs_filestream_init() is called at xfs initialisation time to set up the
- * memory zone that will be used for filestream data structure allocation.
- */
-int
-xfs_filestream_init(void)
-{
- item_zone = kmem_zone_init(sizeof(fstrm_item_t), "fstrm_item");
- if (!item_zone)
- return -ENOMEM;
-
return 0;
-}
-
-/*
- * xfs_filestream_uninit() is called at xfs termination time to destroy the
- * memory zone that was used for filestream data structure allocation.
- */
-void
-xfs_filestream_uninit(void)
-{
- kmem_zone_destroy(item_zone);
-}
-
-/*
- * xfs_filestream_mount() is called when a file system is mounted with the
- * filestream option. It is responsible for allocating the data structures
- * needed to track the new file system's file streams.
- */
-int
-xfs_filestream_mount(
- xfs_mount_t *mp)
-{
- int err;
- unsigned int lifetime, grp_count;
-
- /*
- * The filestream timer tunable is currently fixed within the range of
- * one second to four minutes, with five seconds being the default. The
- * group count is somewhat arbitrary, but it'd be nice to adhere to the
- * timer tunable to within about 10 percent. This requires at least 10
- * groups.
- */
- lifetime = xfs_fstrm_centisecs * 10;
- grp_count = 10;
-
- err = xfs_mru_cache_create(&mp->m_filestream, lifetime, grp_count,
- xfs_fstrm_free_func);
+out_free_item:
+ kmem_free(item);
+out_put_ag:
+ xfs_filestream_put_ag(mp, *agp);
return err;
}
-/*
- * xfs_filestream_unmount() is called when a file system that was mounted with
- * the filestream option is unmounted. It drains the data structures created
- * to track the file system's file streams and frees all the memory that was
- * allocated.
- */
-void
-xfs_filestream_unmount(
- xfs_mount_t *mp)
+static struct xfs_inode *
+xfs_filestream_get_parent(
+ struct xfs_inode *ip)
{
- xfs_mru_cache_destroy(mp->m_filestream);
-}
+ struct inode *inode = VFS_I(ip), *dir = NULL;
+ struct dentry *dentry, *parent;
-/*
- * Return the AG of the filestream the file or directory belongs to, or
- * NULLAGNUMBER otherwise.
- */
-xfs_agnumber_t
-xfs_filestream_lookup_ag(
- xfs_inode_t *ip)
-{
- xfs_mru_cache_t *cache;
- fstrm_item_t *item;
- xfs_agnumber_t ag;
- int ref;
-
- if (!S_ISREG(ip->i_d.di_mode) && !S_ISDIR(ip->i_d.di_mode)) {
- ASSERT(0);
- return NULLAGNUMBER;
- }
+ dentry = d_find_alias(inode);
+ if (!dentry)
+ goto out;
- cache = ip->i_mount->m_filestream;
- item = xfs_mru_cache_lookup(cache, ip->i_ino);
- if (!item) {
- TRACE_LOOKUP(ip->i_mount, ip, NULL, NULLAGNUMBER, 0);
- return NULLAGNUMBER;
- }
+ parent = dget_parent(dentry);
+ if (!parent)
+ goto out_dput;
- ASSERT(ip == item->ip);
- ag = item->ag;
- ref = xfs_filestream_peek_ag(ip->i_mount, ag);
- xfs_mru_cache_done(cache);
+ dir = igrab(parent->d_inode);
+ dput(parent);
- TRACE_LOOKUP(ip->i_mount, ip, item->pip, ag, ref);
- return ag;
+out_dput:
+ dput(dentry);
+out:
+ return dir ? XFS_I(dir) : NULL;
}
/*
- * xfs_filestream_associate() should only be called to associate a regular file
- * with its parent directory. Calling it with a child directory isn't
- * appropriate because filestreams don't apply to entire directory hierarchies.
- * Creating a file in a child directory of an existing filestream directory
- * starts a new filestream with its own allocation group association.
+ * Find the right allocation group for a file, either by finding an
+ * existing file stream or creating a new one.
*
- * Returns < 0 on error, 0 if successful association occurred, > 0 if
- * we failed to get an association because of locking issues.
+ * Returns NULLAGNUMBER in case of an error.
*/
-int
-xfs_filestream_associate(
- xfs_inode_t *pip,
- xfs_inode_t *ip)
+xfs_agnumber_t
+xfs_filestream_lookup_ag(
+ struct xfs_inode *ip)
{
- xfs_mount_t *mp;
- xfs_mru_cache_t *cache;
- fstrm_item_t *item;
- xfs_agnumber_t ag, rotorstep, startag;
- int err = 0;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_inode *pip = NULL;
+ xfs_agnumber_t startag, ag = NULLAGNUMBER;
+ struct xfs_mru_cache_elem *mru;
- ASSERT(S_ISDIR(pip->i_d.di_mode));
ASSERT(S_ISREG(ip->i_d.di_mode));
- if (!S_ISDIR(pip->i_d.di_mode) || !S_ISREG(ip->i_d.di_mode))
- return -EINVAL;
-
- mp = pip->i_mount;
- cache = mp->m_filestream;
-
- /*
- * We have a problem, Houston.
- *
- * Taking the iolock here violates inode locking order - we already
- * hold the ilock. Hence if we block getting this lock we may never
- * wake. Unfortunately, that means if we can't get the lock, we're
- * screwed in terms of getting a stream association - we can't spin
- * waiting for the lock because someone else is waiting on the lock we
- * hold and we cannot drop that as we are in a transaction here.
- *
- * Lucky for us, this inversion is not a problem because it's a
- * directory inode that we are trying to lock here.
- *
- * So, if we can't get the iolock without sleeping then just give up
- */
- if (!xfs_ilock_nowait(pip, XFS_IOLOCK_EXCL))
- return 1;
- /* If the parent directory is already in the cache, use its AG. */
- item = xfs_mru_cache_lookup(cache, pip->i_ino);
- if (item) {
- ASSERT(item->ip == pip);
- ag = item->ag;
- xfs_mru_cache_done(cache);
+ pip = xfs_filestream_get_parent(ip);
+ if (!pip)
+ goto out;
- TRACE_LOOKUP(mp, pip, pip, ag, xfs_filestream_peek_ag(mp, ag));
- err = _xfs_filestream_update_ag(ip, pip, ag);
+ mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino);
+ if (mru) {
+ ag = container_of(mru, struct xfs_fstrm_item, mru)->ag;
+ xfs_mru_cache_done(mp->m_filestream);
- goto exit;
+ trace_xfs_filestream_lookup(ip, ag);
+ goto out;
}
/*
@@ -623,202 +341,94 @@ xfs_filestream_associate(
* use the directory inode's AG.
*/
if (mp->m_flags & XFS_MOUNT_32BITINODES) {
- rotorstep = xfs_rotorstep;
+ xfs_agnumber_t rotorstep = xfs_rotorstep;
startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount;
mp->m_agfrotor = (mp->m_agfrotor + 1) %
(mp->m_sb.sb_agcount * rotorstep);
} else
startag = XFS_INO_TO_AGNO(mp, pip->i_ino);
- /* Pick a new AG for the parent inode starting at startag. */
- err = _xfs_filestream_pick_ag(mp, startag, &ag, 0, 0);
- if (err || ag == NULLAGNUMBER)
- goto exit_did_pick;
-
- /* Associate the parent inode with the AG. */
- err = _xfs_filestream_update_ag(pip, NULL, ag);
- if (err)
- goto exit_did_pick;
-
- /* Associate the file inode with the AG. */
- err = _xfs_filestream_update_ag(ip, pip, ag);
- if (err)
- goto exit_did_pick;
-
- TRACE_ASSOCIATE(mp, ip, pip, ag, xfs_filestream_peek_ag(mp, ag));
-
-exit_did_pick:
- /*
- * If _xfs_filestream_pick_ag() returned a valid AG, remove the
- * reference it took on it, since the file and directory will have taken
- * their own now if they were successfully cached.
- */
- if (ag != NULLAGNUMBER)
- xfs_filestream_put_ag(mp, ag);
-
-exit:
- xfs_iunlock(pip, XFS_IOLOCK_EXCL);
- return -err;
+ if (xfs_filestream_pick_ag(pip, startag, &ag, 0, 0))
+ ag = NULLAGNUMBER;
+out:
+ IRELE(pip);
+ return ag;
}
/*
- * Pick a new allocation group for the current file and its file stream. This
- * function is called by xfs_bmap_filestreams() with the mount point's per-ag
- * lock held.
+ * Pick a new allocation group for the current file and its file stream.
+ *
+ * This is called when the allocator can't find a suitable extent in the
+ * current AG, and we have to move the stream into a new AG with more space.
*/
int
xfs_filestream_new_ag(
struct xfs_bmalloca *ap,
xfs_agnumber_t *agp)
{
- int flags, err;
- xfs_inode_t *ip, *pip = NULL;
- xfs_mount_t *mp;
- xfs_mru_cache_t *cache;
- xfs_extlen_t minlen;
- fstrm_item_t *dir, *file;
- xfs_agnumber_t ag = NULLAGNUMBER;
-
- ip = ap->ip;
- mp = ip->i_mount;
- cache = mp->m_filestream;
- minlen = ap->length;
- *agp = NULLAGNUMBER;
+ struct xfs_inode *ip = ap->ip, *pip;
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_extlen_t minlen = ap->length;
+ xfs_agnumber_t startag = 0;
+ int flags, err = 0;
+ struct xfs_mru_cache_elem *mru;
- /*
- * Look for the file in the cache, removing it if it's found. Doing
- * this allows it to be held across the dir lookup that follows.
- */
- file = xfs_mru_cache_remove(cache, ip->i_ino);
- if (file) {
- ASSERT(ip == file->ip);
-
- /* Save the file's parent inode and old AG number for later. */
- pip = file->pip;
- ag = file->ag;
-
- /* Look for the file's directory in the cache. */
- dir = xfs_mru_cache_lookup(cache, pip->i_ino);
- if (dir) {
- ASSERT(pip == dir->ip);
-
- /*
- * If the directory has already moved on to a new AG,
- * use that AG as the new AG for the file. Don't
- * forget to twiddle the AG refcounts to match the
- * movement.
- */
- if (dir->ag != file->ag) {
- xfs_filestream_put_ag(mp, file->ag);
- xfs_filestream_get_ag(mp, dir->ag);
- *agp = file->ag = dir->ag;
- }
-
- xfs_mru_cache_done(cache);
- }
+ *agp = NULLAGNUMBER;
- /*
- * Put the file back in the cache. If this fails, the free
- * function needs to be called to tidy up in the same way as if
- * the item had simply expired from the cache.
- */
- err = xfs_mru_cache_insert(cache, ip->i_ino, file);
- if (err) {
- xfs_fstrm_free_func(ip->i_ino, file);
- return err;
- }
+ pip = xfs_filestream_get_parent(ip);
+ if (!pip)
+ goto exit;
- /*
- * If the file's AG was moved to the directory's new AG, there's
- * nothing more to be done.
- */
- if (*agp != NULLAGNUMBER) {
- TRACE_MOVEAG(mp, ip, pip,
- ag, xfs_filestream_peek_ag(mp, ag),
- *agp, xfs_filestream_peek_ag(mp, *agp));
- return 0;
- }
+ mru = xfs_mru_cache_remove(mp->m_filestream, pip->i_ino);
+ if (mru) {
+ struct xfs_fstrm_item *item =
+ container_of(mru, struct xfs_fstrm_item, mru);
+ startag = (item->ag + 1) % mp->m_sb.sb_agcount;
}
- /*
- * If the file's parent directory is known, take its iolock in exclusive
- * mode to prevent two sibling files from racing each other to migrate
- * themselves and their parent to different AGs.
- *
- * Note that we lock the parent directory iolock inside the child
- * iolock here. That's fine as we never hold both parent and child
- * iolock in any other place. This is different from the ilock,
- * which requires locking of the child after the parent for namespace
- * operations.
- */
- if (pip)
- xfs_ilock(pip, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT);
-
- /*
- * A new AG needs to be found for the file. If the file's parent
- * directory is also known, it will be moved to the new AG as well to
- * ensure that files created inside it in future use the new AG.
- */
- ag = (ag == NULLAGNUMBER) ? 0 : (ag + 1) % mp->m_sb.sb_agcount;
flags = (ap->userdata ? XFS_PICK_USERDATA : 0) |
(ap->flist->xbf_low ? XFS_PICK_LOWSPACE : 0);
- err = _xfs_filestream_pick_ag(mp, ag, agp, flags, minlen);
- if (err || *agp == NULLAGNUMBER)
- goto exit;
+ err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen);
/*
- * If the file wasn't found in the file cache, then its parent directory
- * inode isn't known. For this to have happened, the file must either
- * be pre-existing, or it was created long enough ago that its cache
- * entry has expired. This isn't the sort of usage that the filestreams
- * allocator is trying to optimise, so there's no point trying to track
- * its new AG somehow in the filestream data structures.
+ * Only free the item here so we skip over the old AG earlier.
*/
- if (!pip) {
- TRACE_ORPHAN(mp, ip, *agp);
- goto exit;
- }
-
- /* Associate the parent inode with the AG. */
- err = _xfs_filestream_update_ag(pip, NULL, *agp);
- if (err)
- goto exit;
-
- /* Associate the file inode with the AG. */
- err = _xfs_filestream_update_ag(ip, pip, *agp);
- if (err)
- goto exit;
-
- TRACE_MOVEAG(mp, ip, pip, NULLAGNUMBER, 0,
- *agp, xfs_filestream_peek_ag(mp, *agp));
+ if (mru)
+ xfs_fstrm_free_func(mru);
+ IRELE(pip);
exit:
- /*
- * If _xfs_filestream_pick_ag() returned a valid AG, remove the
- * reference it took on it, since the file and directory will have taken
- * their own now if they were successfully cached.
- */
- if (*agp != NULLAGNUMBER)
- xfs_filestream_put_ag(mp, *agp);
- else
+ if (*agp == NULLAGNUMBER)
*agp = 0;
-
- if (pip)
- xfs_iunlock(pip, XFS_IOLOCK_EXCL);
-
return err;
}
-/*
- * Remove an association between an inode and a filestream object.
- * Typically this is done on last close of an unlinked file.
- */
void
xfs_filestream_deassociate(
- xfs_inode_t *ip)
+ struct xfs_inode *ip)
{
- xfs_mru_cache_t *cache = ip->i_mount->m_filestream;
+ xfs_mru_cache_delete(ip->i_mount->m_filestream, ip->i_ino);
+}
- xfs_mru_cache_delete(cache, ip->i_ino);
+int
+xfs_filestream_mount(
+ xfs_mount_t *mp)
+{
+ /*
+ * The filestream timer tunable is currently fixed within the range of
+ * one second to four minutes, with five seconds being the default. The
+ * group count is somewhat arbitrary, but it'd be nice to adhere to the
+ * timer tunable to within about 10 percent. This requires at least 10
+ * groups.
+ */
+ return xfs_mru_cache_create(&mp->m_filestream, xfs_fstrm_centisecs * 10,
+ 10, xfs_fstrm_free_func);
+}
+
+void
+xfs_filestream_unmount(
+ xfs_mount_t *mp)
+{
+ xfs_mru_cache_destroy(mp->m_filestream);
}
diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h
index 6d61dbee856..2ef43406e53 100644
--- a/fs/xfs/xfs_filestream.h
+++ b/fs/xfs/xfs_filestream.h
@@ -20,50 +20,20 @@
struct xfs_mount;
struct xfs_inode;
-struct xfs_perag;
struct xfs_bmalloca;
-#ifdef XFS_FILESTREAMS_TRACE
-#define XFS_FSTRM_KTRACE_INFO 1
-#define XFS_FSTRM_KTRACE_AGSCAN 2
-#define XFS_FSTRM_KTRACE_AGPICK1 3
-#define XFS_FSTRM_KTRACE_AGPICK2 4
-#define XFS_FSTRM_KTRACE_UPDATE 5
-#define XFS_FSTRM_KTRACE_FREE 6
-#define XFS_FSTRM_KTRACE_ITEM_LOOKUP 7
-#define XFS_FSTRM_KTRACE_ASSOCIATE 8
-#define XFS_FSTRM_KTRACE_MOVEAG 9
-#define XFS_FSTRM_KTRACE_ORPHAN 10
-
-#define XFS_FSTRM_KTRACE_SIZE 16384
-extern ktrace_t *xfs_filestreams_trace_buf;
-
-#endif
-
-/* allocation selection flags */
-typedef enum xfs_fstrm_alloc {
- XFS_PICK_USERDATA = 1,
- XFS_PICK_LOWSPACE = 2,
-} xfs_fstrm_alloc_t;
-
-/* prototypes for filestream.c */
-int xfs_filestream_init(void);
-void xfs_filestream_uninit(void);
int xfs_filestream_mount(struct xfs_mount *mp);
void xfs_filestream_unmount(struct xfs_mount *mp);
-xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
-int xfs_filestream_associate(struct xfs_inode *dip, struct xfs_inode *ip);
void xfs_filestream_deassociate(struct xfs_inode *ip);
+xfs_agnumber_t xfs_filestream_lookup_ag(struct xfs_inode *ip);
int xfs_filestream_new_ag(struct xfs_bmalloca *ap, xfs_agnumber_t *agp);
+int xfs_filestream_peek_ag(struct xfs_mount *mp, xfs_agnumber_t agno);
-
-/* filestreams for the inode? */
static inline int
xfs_inode_is_filestream(
struct xfs_inode *ip)
{
return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) ||
- xfs_iflags_test(ip, XFS_IFILESTREAM) ||
(ip->i_d.di_flags & XFS_DIFLAG_FILESTREAM);
}
diff --git a/fs/xfs/xfs_format.h b/fs/xfs/xfs_format.h
index 35c08ff54ca..34d85aca305 100644
--- a/fs/xfs/xfs_format.h
+++ b/fs/xfs/xfs_format.h
@@ -145,6 +145,8 @@ struct xfs_dsymlink_hdr {
__be64 sl_lsn;
};
+#define XFS_SYMLINK_CRC_OFF offsetof(struct xfs_dsymlink_hdr, sl_crc)
+
/*
* The maximum pathlen is 1024 bytes. Since the minimum file system
* blocksize is 512 bytes, we can get a max of 3 extents back from
@@ -156,14 +158,271 @@ struct xfs_dsymlink_hdr {
((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \
sizeof(struct xfs_dsymlink_hdr) : 0))
-int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
-int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
- uint32_t size, struct xfs_buf *bp);
-bool xfs_symlink_hdr_ok(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
- uint32_t size, struct xfs_buf *bp);
-void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
- struct xfs_inode *ip, struct xfs_ifork *ifp);
-extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+/*
+ * Allocation Btree format definitions
+ *
+ * There are two on-disk btrees, one sorted by blockno and one sorted
+ * by blockcount and blockno. All blocks look the same to make the code
+ * simpler; if we have time later, we'll make the optimizations.
+ */
+#define XFS_ABTB_MAGIC 0x41425442 /* 'ABTB' for bno tree */
+#define XFS_ABTB_CRC_MAGIC 0x41423342 /* 'AB3B' */
+#define XFS_ABTC_MAGIC 0x41425443 /* 'ABTC' for cnt tree */
+#define XFS_ABTC_CRC_MAGIC 0x41423343 /* 'AB3C' */
+
+/*
+ * Data record/key structure
+ */
+typedef struct xfs_alloc_rec {
+ __be32 ar_startblock; /* starting block number */
+ __be32 ar_blockcount; /* count of free blocks */
+} xfs_alloc_rec_t, xfs_alloc_key_t;
+
+typedef struct xfs_alloc_rec_incore {
+ xfs_agblock_t ar_startblock; /* starting block number */
+ xfs_extlen_t ar_blockcount; /* count of free blocks */
+} xfs_alloc_rec_incore_t;
+
+/* btree pointer type */
+typedef __be32 xfs_alloc_ptr_t;
+
+/*
+ * Block numbers in the AG:
+ * SB is sector 0, AGF is sector 1, AGI is sector 2, AGFL is sector 3.
+ */
+#define XFS_BNO_BLOCK(mp) ((xfs_agblock_t)(XFS_AGFL_BLOCK(mp) + 1))
+#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
+
+
+/*
+ * Inode Allocation Btree format definitions
+ *
+ * There is a btree for the inode map per allocation group.
+ */
+#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */
+#define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */
+#define XFS_FIBT_MAGIC 0x46494254 /* 'FIBT' */
+#define XFS_FIBT_CRC_MAGIC 0x46494233 /* 'FIB3' */
+
+typedef __uint64_t xfs_inofree_t;
+#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t))
+#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3)
+#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
+#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
+
+static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
+{
+ return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
+}
+
+/*
+ * Data record structure
+ */
+typedef struct xfs_inobt_rec {
+ __be32 ir_startino; /* starting inode number */
+ __be32 ir_freecount; /* count of free inodes (set bits) */
+ __be64 ir_free; /* free inode mask */
+} xfs_inobt_rec_t;
+
+typedef struct xfs_inobt_rec_incore {
+ xfs_agino_t ir_startino; /* starting inode number */
+ __int32_t ir_freecount; /* count of free inodes (set bits) */
+ xfs_inofree_t ir_free; /* free inode mask */
+} xfs_inobt_rec_incore_t;
+
+
+/*
+ * Key structure
+ */
+typedef struct xfs_inobt_key {
+ __be32 ir_startino; /* starting inode number */
+} xfs_inobt_key_t;
+
+/* btree pointer type */
+typedef __be32 xfs_inobt_ptr_t;
+
+/*
+ * block numbers in the AG.
+ */
+#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
+#define XFS_FIBT_BLOCK(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
+
+/*
+ * The first data block of an AG depends on whether the filesystem was formatted
+ * with the finobt feature. If so, account for the finobt reserved root btree
+ * block.
+ */
+#define XFS_PREALLOC_BLOCKS(mp) \
+ (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \
+ XFS_FIBT_BLOCK(mp) + 1 : \
+ XFS_IBT_BLOCK(mp) + 1)
+
+
+
+/*
+ * BMAP Btree format definitions
+ *
+ * This includes both the root block definition that sits inside an inode fork
+ * and the record/pointer formats for the leaf/node in the blocks.
+ */
+#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */
+#define XFS_BMAP_CRC_MAGIC 0x424d4133 /* 'BMA3' */
+
+/*
+ * Bmap root header, on-disk form only.
+ */
+typedef struct xfs_bmdr_block {
+ __be16 bb_level; /* 0 is a leaf */
+ __be16 bb_numrecs; /* current # of data records */
+} xfs_bmdr_block_t;
+
+/*
+ * Bmap btree record and extent descriptor.
+ * l0:63 is an extent flag (value 1 indicates non-normal).
+ * l0:9-62 are startoff.
+ * l0:0-8 and l1:21-63 are startblock.
+ * l1:0-20 are blockcount.
+ */
+#define BMBT_EXNTFLAG_BITLEN 1
+#define BMBT_STARTOFF_BITLEN 54
+#define BMBT_STARTBLOCK_BITLEN 52
+#define BMBT_BLOCKCOUNT_BITLEN 21
+
+typedef struct xfs_bmbt_rec {
+ __be64 l0, l1;
+} xfs_bmbt_rec_t;
+
+typedef __uint64_t xfs_bmbt_rec_base_t; /* use this for casts */
+typedef xfs_bmbt_rec_t xfs_bmdr_rec_t;
+
+typedef struct xfs_bmbt_rec_host {
+ __uint64_t l0, l1;
+} xfs_bmbt_rec_host_t;
+
+/*
+ * Values and macros for delayed-allocation startblock fields.
+ */
+#define STARTBLOCKVALBITS 17
+#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20)
+#define DSTARTBLOCKMASKBITS (15 + 20)
+#define STARTBLOCKMASK \
+ (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+#define DSTARTBLOCKMASK \
+ (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
+
+static inline int isnullstartblock(xfs_fsblock_t x)
+{
+ return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
+}
+
+static inline int isnulldstartblock(xfs_dfsbno_t x)
+{
+ return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK;
+}
+
+static inline xfs_fsblock_t nullstartblock(int k)
+{
+ ASSERT(k < (1 << STARTBLOCKVALBITS));
+ return STARTBLOCKMASK | (k);
+}
+
+static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
+{
+ return (xfs_filblks_t)((x) & ~STARTBLOCKMASK);
+}
+
+/*
+ * Possible extent formats.
+ */
+typedef enum {
+ XFS_EXTFMT_NOSTATE = 0,
+ XFS_EXTFMT_HASSTATE
+} xfs_exntfmt_t;
+
+/*
+ * Possible extent states.
+ */
+typedef enum {
+ XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
+ XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID
+} xfs_exntst_t;
+
+/*
+ * Incore version of above.
+ */
+typedef struct xfs_bmbt_irec
+{
+ xfs_fileoff_t br_startoff; /* starting file offset */
+ xfs_fsblock_t br_startblock; /* starting block number */
+ xfs_filblks_t br_blockcount; /* number of blocks */
+ xfs_exntst_t br_state; /* extent state */
+} xfs_bmbt_irec_t;
+
+/*
+ * Key structure for non-leaf levels of the tree.
+ */
+typedef struct xfs_bmbt_key {
+ __be64 br_startoff; /* starting file offset */
+} xfs_bmbt_key_t, xfs_bmdr_key_t;
+
+/* btree pointer type */
+typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
+
+
+/*
+ * Generic Btree block format definitions
+ *
+ * This is a combination of the actual format used on disk for short and long
+ * format btrees. The first three fields are shared by both format, but the
+ * pointers are different and should be used with care.
+ *
+ * To get the size of the actual short or long form headers please use the size
+ * macros below. Never use sizeof(xfs_btree_block).
+ *
+ * The blkno, crc, lsn, owner and uuid fields are only available in filesystems
+ * with the crc feature bit, and all accesses to them must be conditional on
+ * that flag.
+ */
+struct xfs_btree_block {
+ __be32 bb_magic; /* magic number for block type */
+ __be16 bb_level; /* 0 is a leaf */
+ __be16 bb_numrecs; /* current # of data records */
+ union {
+ struct {
+ __be32 bb_leftsib;
+ __be32 bb_rightsib;
+
+ __be64 bb_blkno;
+ __be64 bb_lsn;
+ uuid_t bb_uuid;
+ __be32 bb_owner;
+ __le32 bb_crc;
+ } s; /* short form pointers */
+ struct {
+ __be64 bb_leftsib;
+ __be64 bb_rightsib;
+
+ __be64 bb_blkno;
+ __be64 bb_lsn;
+ uuid_t bb_uuid;
+ __be64 bb_owner;
+ __le32 bb_crc;
+ __be32 bb_pad; /* padding for alignment */
+ } l; /* long form pointers */
+ } bb_u; /* rest */
+};
+
+#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */
+#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */
+
+/* sizes of CRC enabled btree blocks */
+#define XFS_BTREE_SBLOCK_CRC_LEN (XFS_BTREE_SBLOCK_LEN + 40)
+#define XFS_BTREE_LBLOCK_CRC_LEN (XFS_BTREE_LBLOCK_LEN + 48)
+
+#define XFS_BTREE_SBLOCK_CRC_OFF \
+ offsetof(struct xfs_btree_block, bb_u.s.bb_crc)
+#define XFS_BTREE_LBLOCK_CRC_OFF \
+ offsetof(struct xfs_btree_block, bb_u.l.bb_crc)
#endif /* __XFS_FORMAT_H__ */
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 18272c766a5..d34703dbcb4 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -233,11 +233,12 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */
#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */
#define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */
-#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */
+#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */
#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */
#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */
#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */
-
+#define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */
+#define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */
/*
* Minimum and maximum sizes need for growth checks.
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index e64ee5288b8..d2295561570 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -17,28 +17,31 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
-#include "xfs_btree.h"
#include "xfs_error.h"
+#include "xfs_btree.h"
+#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_fsops.h"
#include "xfs_itable.h"
#include "xfs_trans_space.h"
#include "xfs_rtalloc.h"
-#include "xfs_filestream.h"
#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_dinode.h"
+#include "xfs_filestream.h"
/*
* File system operations
@@ -73,23 +76,18 @@ xfs_fs_geometry(
}
if (new_version >= 3) {
geo->version = XFS_FSOP_GEOM_VERSION;
- geo->flags =
+ geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK |
+ XFS_FSOP_GEOM_FLAGS_DIRV2 |
(xfs_sb_version_hasattr(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_ATTR : 0) |
- (xfs_sb_version_hasnlink(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_NLINK : 0) |
(xfs_sb_version_hasquota(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_QUOTA : 0) |
(xfs_sb_version_hasalign(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_IALIGN : 0) |
(xfs_sb_version_hasdalign(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_DALIGN : 0) |
- (xfs_sb_version_hasshared(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_SHARED : 0) |
(xfs_sb_version_hasextflgbit(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_EXTFLG : 0) |
- (xfs_sb_version_hasdirv2(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_DIRV2 : 0) |
(xfs_sb_version_hassector(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_SECTOR : 0) |
(xfs_sb_version_hasasciici(&mp->m_sb) ?
@@ -101,11 +99,15 @@ xfs_fs_geometry(
(xfs_sb_version_hasprojid32bit(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) |
(xfs_sb_version_hascrc(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_V5SB : 0);
+ XFS_FSOP_GEOM_FLAGS_V5SB : 0) |
+ (xfs_sb_version_hasftype(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_FTYPE : 0) |
+ (xfs_sb_version_hasfinobt(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_FINOBT : 0);
geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
mp->m_sb.sb_logsectsize : BBSIZE;
geo->rtsectsize = mp->m_sb.sb_blocksize;
- geo->dirblocksize = mp->m_dirblksize;
+ geo->dirblocksize = mp->m_dir_geo->blksize;
}
if (new_version >= 4) {
geo->flags |=
@@ -153,7 +155,7 @@ xfs_growfs_data_private(
xfs_buf_t *bp;
int bucket;
int dpct;
- int error;
+ int error, saved_error = 0;
xfs_agnumber_t nagcount;
xfs_agnumber_t nagimax = 0;
xfs_rfsblock_t nb, nb_mod;
@@ -217,6 +219,8 @@ xfs_growfs_data_private(
*/
nfree = 0;
for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
+ __be32 *agfl_bno;
+
/*
* AG freespace header block
*/
@@ -276,8 +280,10 @@ xfs_growfs_data_private(
agfl->agfl_seqno = cpu_to_be32(agno);
uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid);
}
+
+ agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
- agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
+ agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
@@ -309,6 +315,10 @@ xfs_growfs_data_private(
agi->agi_dirino = cpu_to_be32(NULLAGINO);
if (xfs_sb_version_hascrc(&mp->m_sb))
uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_uuid);
+ if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp));
+ agi->agi_free_level = cpu_to_be32(1);
+ }
for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++)
agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
@@ -400,6 +410,34 @@ xfs_growfs_data_private(
xfs_buf_relse(bp);
if (error)
goto error0;
+
+ /*
+ * FINO btree root block
+ */
+ if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)),
+ BTOBB(mp->m_sb.sb_blocksize), 0,
+ &xfs_inobt_buf_ops);
+ if (!bp) {
+ error = ENOMEM;
+ goto error0;
+ }
+
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xfs_btree_init_block(mp, bp, XFS_FIBT_CRC_MAGIC,
+ 0, 0, agno,
+ XFS_BTREE_CRC_BLOCKS);
+ else
+ xfs_btree_init_block(mp, bp, XFS_FIBT_MAGIC, 0,
+ 0, agno, 0);
+
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error)
+ goto error0;
+ }
+
}
xfs_trans_agblocks_delta(tp, nfree);
/*
@@ -496,29 +534,33 @@ xfs_growfs_data_private(
error = ENOMEM;
}
+ /*
+ * If we get an error reading or writing alternate superblocks,
+ * continue. xfs_repair chooses the "best" superblock based
+ * on most matches; if we break early, we'll leave more
+ * superblocks un-updated than updated, and xfs_repair may
+ * pick them over the properly-updated primary.
+ */
if (error) {
xfs_warn(mp,
"error %d reading secondary superblock for ag %d",
error, agno);
- break;
+ saved_error = error;
+ continue;
}
xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS);
- /*
- * If we get an error writing out the alternate superblocks,
- * just issue a warning and continue. The real work is
- * already done and committed.
- */
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
if (error) {
xfs_warn(mp,
"write error %d updating secondary superblock for ag %d",
error, agno);
- break; /* no point in continuing */
+ saved_error = error;
+ continue;
}
}
- return error;
+ return saved_error ? saved_error : error;
error0:
xfs_trans_cancel(tp, XFS_TRANS_ABORT);
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index ccf2fb14396..5960e5593fe 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -17,29 +17,30 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_bmap.h"
#include "xfs_cksum.h"
+#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_icreate_item.h"
#include "xfs_icache.h"
+#include "xfs_dinode.h"
+#include "xfs_trace.h"
/*
@@ -51,7 +52,7 @@ xfs_ialloc_cluster_alignment(
{
if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
args->mp->m_sb.sb_inoalignmt >=
- XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp)))
+ XFS_B_TO_FSBT(args->mp, args->mp->m_inode_cluster_size))
return args->mp->m_sb.sb_inoalignmt;
return 1;
}
@@ -111,6 +112,66 @@ xfs_inobt_get_rec(
}
/*
+ * Insert a single inobt record. Cursor must already point to desired location.
+ */
+STATIC int
+xfs_inobt_insert_rec(
+ struct xfs_btree_cur *cur,
+ __int32_t freecount,
+ xfs_inofree_t free,
+ int *stat)
+{
+ cur->bc_rec.i.ir_freecount = freecount;
+ cur->bc_rec.i.ir_free = free;
+ return xfs_btree_insert(cur, stat);
+}
+
+/*
+ * Insert records describing a newly allocated inode chunk into the inobt.
+ */
+STATIC int
+xfs_inobt_insert(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agino_t newino,
+ xfs_agino_t newlen,
+ xfs_btnum_t btnum)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ xfs_agino_t thisino;
+ int i;
+ int error;
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
+
+ for (thisino = newino;
+ thisino < newino + newlen;
+ thisino += XFS_INODES_PER_CHUNK) {
+ error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
+ if (error) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+ }
+ ASSERT(i == 0);
+
+ error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK,
+ XFS_INOBT_ALL_FREE, &i);
+ if (error) {
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+ }
+ ASSERT(i == 1);
+ }
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+
+ return 0;
+}
+
+/*
* Verify that the number of free inodes in the AGI is correct.
*/
#ifdef DEBUG
@@ -169,27 +230,20 @@ xfs_ialloc_inode_init(
{
struct xfs_buf *fbuf;
struct xfs_dinode *free;
- int blks_per_cluster, nbufs, ninodes;
+ int nbufs, blks_per_cluster, inodes_per_cluster;
int version;
int i, j;
xfs_daddr_t d;
xfs_ino_t ino = 0;
/*
- * Loop over the new block(s), filling in the inodes.
- * For small block sizes, manipulate the inodes in buffers
- * which are multiples of the blocks size.
+ * Loop over the new block(s), filling in the inodes. For small block
+ * sizes, manipulate the inodes in buffers which are multiples of the
+ * blocks size.
*/
- if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
- blks_per_cluster = 1;
- nbufs = length;
- ninodes = mp->m_sb.sb_inopblock;
- } else {
- blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
- mp->m_sb.sb_blocksize;
- nbufs = length / blks_per_cluster;
- ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
- }
+ blks_per_cluster = xfs_icluster_size_fsb(mp);
+ inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
+ nbufs = length / blks_per_cluster;
/*
* Figure out what version number to use in the inodes we create. If
@@ -224,12 +278,10 @@ xfs_ialloc_inode_init(
* they track in the AIL as if they were physically logged.
*/
if (tp)
- xfs_icreate_log(tp, agno, agbno, XFS_IALLOC_INODES(mp),
+ xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos,
mp->m_sb.sb_inodesize, length, gen);
- } else if (xfs_sb_version_hasnlink(&mp->m_sb))
+ } else
version = 2;
- else
- version = 1;
for (j = 0; j < nbufs; j++) {
/*
@@ -245,7 +297,7 @@ xfs_ialloc_inode_init(
/* Initialize the inode buffers and log them appropriately. */
fbuf->b_ops = &xfs_inode_buf_ops;
xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
- for (i = 0; i < ninodes; i++) {
+ for (i = 0; i < inodes_per_cluster; i++) {
int ioffset = i << mp->m_sb.sb_inodelog;
uint isize = xfs_dinode_size(version);
@@ -309,13 +361,10 @@ xfs_ialloc_ag_alloc(
{
xfs_agi_t *agi; /* allocation group header */
xfs_alloc_arg_t args; /* allocation argument structure */
- xfs_btree_cur_t *cur; /* inode btree cursor */
xfs_agnumber_t agno;
int error;
- int i;
xfs_agino_t newino; /* new first inode's number */
xfs_agino_t newlen; /* new number of inodes */
- xfs_agino_t thisino; /* current inode number, for loop */
int isaligned = 0; /* inode allocation at stripe unit */
/* boundary */
struct xfs_perag *pag;
@@ -328,11 +377,11 @@ xfs_ialloc_ag_alloc(
* Locking will ensure that we don't have two callers in here
* at one time.
*/
- newlen = XFS_IALLOC_INODES(args.mp);
+ newlen = args.mp->m_ialloc_inos;
if (args.mp->m_maxicount &&
args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
return XFS_ERROR(ENOSPC);
- args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp);
+ args.minlen = args.maxlen = args.mp->m_ialloc_blks;
/*
* First try to allocate inodes contiguous with the last-allocated
* chunk of inodes. If the filesystem is striped, this will fill
@@ -342,7 +391,7 @@ xfs_ialloc_ag_alloc(
newino = be32_to_cpu(agi->agi_newino);
agno = be32_to_cpu(agi->agi_seqno);
args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
- XFS_IALLOC_BLOCKS(args.mp);
+ args.mp->m_ialloc_blks;
if (likely(newino != NULLAGINO &&
(args.agbno < be32_to_cpu(agi->agi_length)))) {
args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
@@ -369,6 +418,18 @@ xfs_ialloc_ag_alloc(
args.minleft = args.mp->m_in_maxlevels - 1;
if ((error = xfs_alloc_vextent(&args)))
return error;
+
+ /*
+ * This request might have dirtied the transaction if the AG can
+ * satisfy the request, but the exact block was not available.
+ * If the allocation did fail, subsequent requests will relax
+ * the exact agbno requirement and increase the alignment
+ * instead. It is critical that the total size of the request
+ * (len + alignment + slop) does not increase from this point
+ * on, so reset minalignslop to ensure it is not included in
+ * subsequent requests.
+ */
+ args.minalignslop = 0;
} else
args.fsbno = NULLFSBLOCK;
@@ -453,29 +514,19 @@ xfs_ialloc_ag_alloc(
agi->agi_newino = cpu_to_be32(newino);
/*
- * Insert records describing the new inode chunk into the btree.
+ * Insert records describing the new inode chunk into the btrees.
*/
- cur = xfs_inobt_init_cursor(args.mp, tp, agbp, agno);
- for (thisino = newino;
- thisino < newino + newlen;
- thisino += XFS_INODES_PER_CHUNK) {
- cur->bc_rec.i.ir_startino = thisino;
- cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK;
- cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE;
- error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i);
- if (error) {
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- return error;
- }
- ASSERT(i == 0);
- error = xfs_btree_insert(cur, &i);
- if (error) {
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+ XFS_BTNUM_INO);
+ if (error)
+ return error;
+
+ if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
+ error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
+ XFS_BTNUM_FINO);
+ if (error)
return error;
- }
- ASSERT(i == 1);
}
- xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
/*
* Log allocation group header fields
*/
@@ -584,7 +635,7 @@ xfs_ialloc_ag_select(
* Is there enough free space for the file plus a block of
* inodes? (if we need to allocate some)?
*/
- ineed = XFS_IALLOC_BLOCKS(mp);
+ ineed = mp->m_ialloc_blks;
longest = pag->pagf_longest;
if (!longest)
longest = pag->pagf_flcount > 0;
@@ -669,13 +720,10 @@ xfs_ialloc_get_rec(
}
/*
- * Allocate an inode.
- *
- * The caller selected an AG for us, and made sure that free inodes are
- * available.
+ * Allocate an inode using the inobt-only algorithm.
*/
STATIC int
-xfs_dialloc_ag(
+xfs_dialloc_ag_inobt(
struct xfs_trans *tp,
struct xfs_buf *agbp,
xfs_ino_t parent,
@@ -701,7 +749,7 @@ xfs_dialloc_ag(
ASSERT(pag->pagi_freecount > 0);
restart_pagno:
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
/*
* If pagino is 0 (this is the root inode allocation) use newino.
* This must work because we've just allocated some.
@@ -934,6 +982,294 @@ error0:
}
/*
+ * Use the free inode btree to allocate an inode based on distance from the
+ * parent. Note that the provided cursor may be deleted and replaced.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_near(
+ xfs_agino_t pagino,
+ struct xfs_btree_cur **ocur,
+ struct xfs_inobt_rec_incore *rec)
+{
+ struct xfs_btree_cur *lcur = *ocur; /* left search cursor */
+ struct xfs_btree_cur *rcur; /* right search cursor */
+ struct xfs_inobt_rec_incore rrec;
+ int error;
+ int i, j;
+
+ error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
+ if (error)
+ return error;
+
+ if (i == 1) {
+ error = xfs_inobt_get_rec(lcur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+ /*
+ * See if we've landed in the parent inode record. The finobt
+ * only tracks chunks with at least one free inode, so record
+ * existence is enough.
+ */
+ if (pagino >= rec->ir_startino &&
+ pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
+ return 0;
+ }
+
+ error = xfs_btree_dup_cursor(lcur, &rcur);
+ if (error)
+ return error;
+
+ error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
+ if (error)
+ goto error_rcur;
+ if (j == 1) {
+ error = xfs_inobt_get_rec(rcur, &rrec, &j);
+ if (error)
+ goto error_rcur;
+ XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur);
+ }
+
+ XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur);
+ if (i == 1 && j == 1) {
+ /*
+ * Both the left and right records are valid. Choose the closer
+ * inode chunk to the target.
+ */
+ if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) >
+ (rrec.ir_startino - pagino)) {
+ *rec = rrec;
+ xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+ *ocur = rcur;
+ } else {
+ xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+ }
+ } else if (j == 1) {
+ /* only the right record is valid */
+ *rec = rrec;
+ xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
+ *ocur = rcur;
+ } else if (i == 1) {
+ /* only the left record is valid */
+ xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
+ }
+
+ return 0;
+
+error_rcur:
+ xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Use the free inode btree to find a free inode based on a newino hint. If
+ * the hint is NULL, find the first free inode in the AG.
+ */
+STATIC int
+xfs_dialloc_ag_finobt_newino(
+ struct xfs_agi *agi,
+ struct xfs_btree_cur *cur,
+ struct xfs_inobt_rec_incore *rec)
+{
+ int error;
+ int i;
+
+ if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+ error = xfs_inobt_lookup(cur, agi->agi_newino, XFS_LOOKUP_EQ,
+ &i);
+ if (error)
+ return error;
+ if (i == 1) {
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+ return 0;
+ }
+ }
+
+ /*
+ * Find the first inode available in the AG.
+ */
+ error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+ error = xfs_inobt_get_rec(cur, rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+ return 0;
+}
+
+/*
+ * Update the inobt based on a modification made to the finobt. Also ensure that
+ * the records from both trees are equivalent post-modification.
+ */
+STATIC int
+xfs_dialloc_ag_update_inobt(
+ struct xfs_btree_cur *cur, /* inobt cursor */
+ struct xfs_inobt_rec_incore *frec, /* finobt record */
+ int offset) /* inode offset */
+{
+ struct xfs_inobt_rec_incore rec;
+ int error;
+ int i;
+
+ error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ return error;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
+ XFS_INODES_PER_CHUNK) == 0);
+
+ rec.ir_free &= ~XFS_INOBT_MASK(offset);
+ rec.ir_freecount--;
+
+ XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) &&
+ (rec.ir_freecount == frec->ir_freecount));
+
+ error = xfs_inobt_update(cur, &rec);
+ if (error)
+ return error;
+
+ return 0;
+}
+
+/*
+ * Allocate an inode using the free inode btree, if available. Otherwise, fall
+ * back to the inobt search algorithm.
+ *
+ * The caller selected an AG for us, and made sure that free inodes are
+ * available.
+ */
+STATIC int
+xfs_dialloc_ag(
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_ino_t parent,
+ xfs_ino_t *inop)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ xfs_agnumber_t pagno = XFS_INO_TO_AGNO(mp, parent);
+ xfs_agino_t pagino = XFS_INO_TO_AGINO(mp, parent);
+ struct xfs_perag *pag;
+ struct xfs_btree_cur *cur; /* finobt cursor */
+ struct xfs_btree_cur *icur; /* inobt cursor */
+ struct xfs_inobt_rec_incore rec;
+ xfs_ino_t ino;
+ int error;
+ int offset;
+ int i;
+
+ if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+ return xfs_dialloc_ag_inobt(tp, agbp, parent, inop);
+
+ pag = xfs_perag_get(mp, agno);
+
+ /*
+ * If pagino is 0 (this is the root inode allocation) use newino.
+ * This must work because we've just allocated some.
+ */
+ if (!pagino)
+ pagino = be32_to_cpu(agi->agi_newino);
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error_cur;
+
+ /*
+ * The search algorithm depends on whether we're in the same AG as the
+ * parent. If so, find the closest available inode to the parent. If
+ * not, consider the agi hint or find the first free inode in the AG.
+ */
+ if (agno == pagno)
+ error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
+ else
+ error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
+ if (error)
+ goto error_cur;
+
+ offset = xfs_lowbit64(rec.ir_free);
+ ASSERT(offset >= 0);
+ ASSERT(offset < XFS_INODES_PER_CHUNK);
+ ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
+ XFS_INODES_PER_CHUNK) == 0);
+ ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
+
+ /*
+ * Modify or remove the finobt record.
+ */
+ rec.ir_free &= ~XFS_INOBT_MASK(offset);
+ rec.ir_freecount--;
+ if (rec.ir_freecount)
+ error = xfs_inobt_update(cur, &rec);
+ else
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto error_cur;
+
+ /*
+ * The finobt has now been updated appropriately. We haven't updated the
+ * agi and superblock yet, so we can create an inobt cursor and validate
+ * the original freecount. If all is well, make the equivalent update to
+ * the inobt using the finobt record and offset information.
+ */
+ icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
+
+ error = xfs_check_agi_freecount(icur, agi);
+ if (error)
+ goto error_icur;
+
+ error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
+ if (error)
+ goto error_icur;
+
+ /*
+ * Both trees have now been updated. We must update the perag and
+ * superblock before we can check the freecount for each btree.
+ */
+ be32_add_cpu(&agi->agi_freecount, -1);
+ xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
+ pag->pagi_freecount--;
+
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
+
+ error = xfs_check_agi_freecount(icur, agi);
+ if (error)
+ goto error_icur;
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error_icur;
+
+ xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ xfs_perag_put(pag);
+ *inop = ino;
+ return 0;
+
+error_icur:
+ xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
+error_cur:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ xfs_perag_put(pag);
+ return error;
+}
+
+/*
* Allocate an inode on disk.
*
* Mode is used to tell whether the new inode will need space, and whether it
@@ -998,7 +1334,7 @@ xfs_dialloc(
* inode.
*/
if (mp->m_maxicount &&
- mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) {
+ mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) {
noroom = 1;
okalloc = 0;
}
@@ -1092,78 +1428,34 @@ out_error:
return XFS_ERROR(error);
}
-/*
- * Free disk inode. Carefully avoids touching the incore inode, all
- * manipulations incore are the caller's responsibility.
- * The on-disk inode is not changed by this operation, only the
- * btree (free inode mask) is changed.
- */
-int
-xfs_difree(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_ino_t inode, /* inode to be freed */
- xfs_bmap_free_t *flist, /* extents to free */
- int *delete, /* set if inode cluster was deleted */
- xfs_ino_t *first_ino) /* first inode in deleted cluster */
+STATIC int
+xfs_difree_inobt(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agino_t agino,
+ struct xfs_bmap_free *flist,
+ int *deleted,
+ xfs_ino_t *first_ino,
+ struct xfs_inobt_rec_incore *orec)
{
- /* REFERENCED */
- xfs_agblock_t agbno; /* block number containing inode */
- xfs_buf_t *agbp; /* buffer containing allocation group header */
- xfs_agino_t agino; /* inode number relative to allocation group */
- xfs_agnumber_t agno; /* allocation group number */
- xfs_agi_t *agi; /* allocation group header */
- xfs_btree_cur_t *cur; /* inode btree cursor */
- int error; /* error return value */
- int i; /* result code */
- int ilen; /* inodes in an inode cluster */
- xfs_mount_t *mp; /* mount structure for filesystem */
- int off; /* offset of inode in inode chunk */
- xfs_inobt_rec_incore_t rec; /* btree record */
- struct xfs_perag *pag;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ struct xfs_perag *pag;
+ struct xfs_btree_cur *cur;
+ struct xfs_inobt_rec_incore rec;
+ int ilen;
+ int error;
+ int i;
+ int off;
- mp = tp->t_mountp;
-
- /*
- * Break up inode number into its components.
- */
- agno = XFS_INO_TO_AGNO(mp, inode);
- if (agno >= mp->m_sb.sb_agcount) {
- xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
- __func__, agno, mp->m_sb.sb_agcount);
- ASSERT(0);
- return XFS_ERROR(EINVAL);
- }
- agino = XFS_INO_TO_AGINO(mp, inode);
- if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
- xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
- __func__, (unsigned long long)inode,
- (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
- ASSERT(0);
- return XFS_ERROR(EINVAL);
- }
- agbno = XFS_AGINO_TO_AGBNO(mp, agino);
- if (agbno >= mp->m_sb.sb_agblocks) {
- xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
- __func__, agbno, mp->m_sb.sb_agblocks);
- ASSERT(0);
- return XFS_ERROR(EINVAL);
- }
- /*
- * Get the allocation group header.
- */
- error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
- if (error) {
- xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
- __func__, error);
- return error;
- }
- agi = XFS_BUF_TO_AGI(agbp);
ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
- ASSERT(agbno < be32_to_cpu(agi->agi_length));
+ ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));
+
/*
* Initialize the cursor.
*/
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
error = xfs_check_agi_freecount(cur, agi);
if (error)
@@ -1201,9 +1493,9 @@ xfs_difree(
* When an inode cluster is free, it becomes eligible for removal
*/
if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
- (rec.ir_freecount == XFS_IALLOC_INODES(mp))) {
+ (rec.ir_freecount == mp->m_ialloc_inos)) {
- *delete = 1;
+ *deleted = 1;
*first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
/*
@@ -1211,7 +1503,7 @@ xfs_difree(
* AGI and Superblock inode counts, and mark the disk space
* to be freed when the transaction is committed.
*/
- ilen = XFS_IALLOC_INODES(mp);
+ ilen = mp->m_ialloc_inos;
be32_add_cpu(&agi->agi_count, -ilen);
be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
@@ -1227,11 +1519,11 @@ xfs_difree(
goto error0;
}
- xfs_bmap_add_free(XFS_AGB_TO_FSB(mp,
- agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)),
- XFS_IALLOC_BLOCKS(mp), flist, mp);
+ xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno,
+ XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)),
+ mp->m_ialloc_blks, flist, mp);
} else {
- *delete = 0;
+ *deleted = 0;
error = xfs_inobt_update(cur, &rec);
if (error) {
@@ -1255,6 +1547,7 @@ xfs_difree(
if (error)
goto error0;
+ *orec = rec;
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
return 0;
@@ -1263,6 +1556,182 @@ error0:
return error;
}
+/*
+ * Free an inode in the free inode btree.
+ */
+STATIC int
+xfs_difree_finobt(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_buf *agbp,
+ xfs_agino_t agino,
+ struct xfs_inobt_rec_incore *ibtrec) /* inobt record */
+{
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno);
+ struct xfs_btree_cur *cur;
+ struct xfs_inobt_rec_incore rec;
+ int offset = agino - ibtrec->ir_startino;
+ int error;
+ int i;
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
+
+ error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
+ if (error)
+ goto error;
+ if (i == 0) {
+ /*
+ * If the record does not exist in the finobt, we must have just
+ * freed an inode in a previously fully allocated chunk. If not,
+ * something is out of sync.
+ */
+ XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error);
+
+ error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount,
+ ibtrec->ir_free, &i);
+ if (error)
+ goto error;
+ ASSERT(i == 1);
+
+ goto out;
+ }
+
+ /*
+ * Read and update the existing record. We could just copy the ibtrec
+ * across here, but that would defeat the purpose of having redundant
+ * metadata. By making the modifications independently, we can catch
+ * corruptions that we wouldn't see if we just copied from one record
+ * to another.
+ */
+ error = xfs_inobt_get_rec(cur, &rec, &i);
+ if (error)
+ goto error;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error);
+
+ rec.ir_free |= XFS_INOBT_MASK(offset);
+ rec.ir_freecount++;
+
+ XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) &&
+ (rec.ir_freecount == ibtrec->ir_freecount),
+ error);
+
+ /*
+ * The content of inobt records should always match between the inobt
+ * and finobt. The lifecycle of records in the finobt is different from
+ * the inobt in that the finobt only tracks records with at least one
+ * free inode. Hence, if all of the inodes are free and we aren't
+ * keeping inode chunks permanently on disk, remove the record.
+ * Otherwise, update the record with the new information.
+ */
+ if (rec.ir_freecount == mp->m_ialloc_inos &&
+ !(mp->m_flags & XFS_MOUNT_IKEEP)) {
+ error = xfs_btree_delete(cur, &i);
+ if (error)
+ goto error;
+ ASSERT(i == 1);
+ } else {
+ error = xfs_inobt_update(cur, &rec);
+ if (error)
+ goto error;
+ }
+
+out:
+ error = xfs_check_agi_freecount(cur, agi);
+ if (error)
+ goto error;
+
+ xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+ return 0;
+
+error:
+ xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Free disk inode. Carefully avoids touching the incore inode, all
+ * manipulations incore are the caller's responsibility.
+ * The on-disk inode is not changed by this operation, only the
+ * btree (free inode mask) is changed.
+ */
+int
+xfs_difree(
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_ino_t inode, /* inode to be freed */
+ struct xfs_bmap_free *flist, /* extents to free */
+ int *deleted,/* set if inode cluster was deleted */
+ xfs_ino_t *first_ino)/* first inode in deleted cluster */
+{
+ /* REFERENCED */
+ xfs_agblock_t agbno; /* block number containing inode */
+ struct xfs_buf *agbp; /* buffer for allocation group header */
+ xfs_agino_t agino; /* allocation group inode number */
+ xfs_agnumber_t agno; /* allocation group number */
+ int error; /* error return value */
+ struct xfs_mount *mp; /* mount structure for filesystem */
+ struct xfs_inobt_rec_incore rec;/* btree record */
+
+ mp = tp->t_mountp;
+
+ /*
+ * Break up inode number into its components.
+ */
+ agno = XFS_INO_TO_AGNO(mp, inode);
+ if (agno >= mp->m_sb.sb_agcount) {
+ xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
+ __func__, agno, mp->m_sb.sb_agcount);
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ agino = XFS_INO_TO_AGINO(mp, inode);
+ if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
+ xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
+ __func__, (unsigned long long)inode,
+ (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+ if (agbno >= mp->m_sb.sb_agblocks) {
+ xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
+ __func__, agbno, mp->m_sb.sb_agblocks);
+ ASSERT(0);
+ return XFS_ERROR(EINVAL);
+ }
+ /*
+ * Get the allocation group header.
+ */
+ error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
+ if (error) {
+ xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
+ __func__, error);
+ return error;
+ }
+
+ /*
+ * Fix up the inode allocation btree.
+ */
+ error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino,
+ &rec);
+ if (error)
+ goto error0;
+
+ /*
+ * Fix up the free inode btree.
+ */
+ if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
+ error = xfs_difree_finobt(mp, tp, agbp, agino, &rec);
+ if (error)
+ goto error0;
+ }
+
+ return 0;
+
+error0:
+ return error;
+}
+
STATIC int
xfs_imap_lookup(
struct xfs_mount *mp,
@@ -1294,7 +1763,7 @@ xfs_imap_lookup(
* we have a record, we need to ensure it contains the inode number
* we are looking up.
*/
- cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
if (!error) {
if (i)
@@ -1310,7 +1779,7 @@ xfs_imap_lookup(
/* check that the returned record contains the required inode */
if (rec.ir_startino > agino ||
- rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino)
+ rec.ir_startino + mp->m_ialloc_inos <= agino)
return EINVAL;
/* for untrusted inodes check it is allocated first */
@@ -1383,7 +1852,7 @@ xfs_imap(
return XFS_ERROR(EINVAL);
}
- blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
+ blks_per_cluster = xfs_icluster_size_fsb(mp);
/*
* For bulkstat and handle lookups, we have an untrusted inode number
@@ -1404,7 +1873,7 @@ xfs_imap(
* If the inode cluster size is the same as the blocksize or
* smaller we get to the buffer by simple arithmetics.
*/
- if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) {
+ if (blks_per_cluster == 1) {
offset = XFS_INO_TO_OFFSET(mp, ino);
ASSERT(offset < mp->m_sb.sb_inopblock);
@@ -1482,7 +1951,16 @@ xfs_ialloc_compute_maxlevels(
}
/*
- * Log specified fields for the ag hdr (inode section)
+ * Log specified fields for the ag hdr (inode section). The growth of the agi
+ * structure over time requires that we interpret the buffer as two logical
+ * regions delineated by the end of the unlinked list. This is due to the size
+ * of the hash table and its location in the middle of the agi.
+ *
+ * For example, a request to log a field before agi_unlinked and a field after
+ * agi_unlinked could cause us to log the entire hash table and use an excessive
+ * amount of log space. To avoid this behavior, log the region up through
+ * agi_unlinked in one call and the region after agi_unlinked through the end of
+ * the structure in another.
*/
void
xfs_ialloc_log_agi(
@@ -1505,6 +1983,8 @@ xfs_ialloc_log_agi(
offsetof(xfs_agi_t, agi_newino),
offsetof(xfs_agi_t, agi_dirino),
offsetof(xfs_agi_t, agi_unlinked),
+ offsetof(xfs_agi_t, agi_free_root),
+ offsetof(xfs_agi_t, agi_free_level),
sizeof(xfs_agi_t)
};
#ifdef DEBUG
@@ -1513,15 +1993,30 @@ xfs_ialloc_log_agi(
agi = XFS_BUF_TO_AGI(bp);
ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
#endif
+
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF);
+
/*
- * Compute byte offsets for the first and last fields.
+ * Compute byte offsets for the first and last fields in the first
+ * region and log the agi buffer. This only logs up through
+ * agi_unlinked.
*/
- xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last);
+ if (fields & XFS_AGI_ALL_BITS_R1) {
+ xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
+ &first, &last);
+ xfs_trans_log_buf(tp, bp, first, last);
+ }
+
/*
- * Log the allocation group inode header buffer.
+ * Mask off the bits in the first region and calculate the first and
+ * last field offsets for any bits in the second region.
*/
- xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF);
- xfs_trans_log_buf(tp, bp, first, last);
+ fields &= ~XFS_AGI_ALL_BITS_R1;
+ if (fields) {
+ xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
+ &first, &last);
+ xfs_trans_log_buf(tp, bp, first, last);
+ }
}
#ifdef DEBUG
@@ -1574,18 +2069,17 @@ xfs_agi_read_verify(
struct xfs_buf *bp)
{
struct xfs_mount *mp = bp->b_target->bt_mount;
- int agi_ok = 1;
-
- if (xfs_sb_version_hascrc(&mp->m_sb))
- agi_ok = xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
- offsetof(struct xfs_agi, agi_crc));
- agi_ok = agi_ok && xfs_agi_verify(bp);
- if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
- XFS_RANDOM_IALLOC_READ_AGI))) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ if (xfs_sb_version_hascrc(&mp->m_sb) &&
+ !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp,
+ XFS_ERRTAG_IALLOC_READ_AGI,
+ XFS_RANDOM_IALLOC_READ_AGI))
xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
}
static void
@@ -1596,8 +2090,8 @@ xfs_agi_write_verify(
struct xfs_buf_log_item *bip = bp->b_fspriv;
if (!xfs_agi_verify(bp)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
return;
}
@@ -1606,8 +2100,7 @@ xfs_agi_write_verify(
if (bip)
XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
- xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
- offsetof(struct xfs_agi, agi_crc));
+ xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
}
const struct xfs_buf_ops xfs_agi_buf_ops = {
@@ -1627,15 +2120,15 @@ xfs_read_agi(
{
int error;
- ASSERT(agno != NULLAGNUMBER);
+ trace_xfs_read_agi(mp, agno);
+ ASSERT(agno != NULLAGNUMBER);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
if (error)
return error;
- ASSERT(!xfs_buf_geterror(*bpp));
xfs_buf_set_ref(*bpp, XFS_AGI_REF);
return 0;
}
@@ -1651,6 +2144,8 @@ xfs_ialloc_read_agi(
struct xfs_perag *pag; /* per allocation group data */
int error;
+ trace_xfs_ialloc_read_agi(mp, agno);
+
error = xfs_read_agi(mp, tp, agno, bpp);
if (error)
return error;
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 68c07320f09..95ad1c002d6 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -23,18 +23,20 @@ struct xfs_dinode;
struct xfs_imap;
struct xfs_mount;
struct xfs_trans;
+struct xfs_btree_cur;
-/*
- * Allocation parameters for inode allocation.
- */
-#define XFS_IALLOC_INODES(mp) (mp)->m_ialloc_inos
-#define XFS_IALLOC_BLOCKS(mp) (mp)->m_ialloc_blks
-
-/*
- * Move inodes in clusters of this size.
- */
+/* Move inodes in clusters of this size */
#define XFS_INODE_BIG_CLUSTER_SIZE 8192
-#define XFS_INODE_CLUSTER_SIZE(mp) (mp)->m_inode_cluster_size
+
+/* Calculate and return the number of filesystem blocks per inode cluster */
+static inline int
+xfs_icluster_size_fsb(
+ struct xfs_mount *mp)
+{
+ if (mp->m_sb.sb_blocksize >= mp->m_inode_cluster_size)
+ return 1;
+ return mp->m_inode_cluster_size >> mp->m_sb.sb_blocklog;
+}
/*
* Make an inode pointer out of the buffer/offset.
@@ -42,7 +44,7 @@ struct xfs_trans;
static inline struct xfs_dinode *
xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o)
{
- return (xfs_dinode_t *)
+ return (struct xfs_dinode *)
(xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog));
}
@@ -88,7 +90,7 @@ xfs_difree(
struct xfs_trans *tp, /* transaction pointer */
xfs_ino_t inode, /* inode to be freed */
struct xfs_bmap_free *flist, /* extents to free */
- int *delete, /* set if inode cluster was deleted */
+ int *deleted, /* set if inode cluster was deleted */
xfs_ino_t *first_ino); /* first inode in deleted cluster */
/*
@@ -158,6 +160,4 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, xfs_agblock_t agbno,
xfs_agblock_t length, unsigned int gen);
-extern const struct xfs_buf_ops xfs_agi_buf_ops;
-
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 5448eb6b8c1..726f83a681a 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -17,24 +17,23 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
+#include "xfs_trans.h"
STATIC int
@@ -50,7 +49,8 @@ xfs_inobt_dup_cursor(
struct xfs_btree_cur *cur)
{
return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agbp, cur->bc_private.a.agno);
+ cur->bc_private.a.agbp, cur->bc_private.a.agno,
+ cur->bc_btnum);
}
STATIC void
@@ -67,12 +67,26 @@ xfs_inobt_set_root(
xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
}
+STATIC void
+xfs_finobt_set_root(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *nptr,
+ int inc) /* level change */
+{
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+
+ agi->agi_free_root = nptr->s;
+ be32_add_cpu(&agi->agi_free_level, inc);
+ xfs_ialloc_log_agi(cur->bc_tp, agbp,
+ XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL);
+}
+
STATIC int
xfs_inobt_alloc_block(
struct xfs_btree_cur *cur,
union xfs_btree_ptr *start,
union xfs_btree_ptr *new,
- int length,
int *stat)
{
xfs_alloc_arg_t args; /* block allocation args */
@@ -174,6 +188,17 @@ xfs_inobt_init_ptr_from_cur(
ptr->s = agi->agi_root;
}
+STATIC void
+xfs_finobt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
+
+ ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
+ ptr->s = agi->agi_free_root;
+}
+
STATIC __int64_t
xfs_inobt_key_diff(
struct xfs_btree_cur *cur,
@@ -204,6 +229,7 @@ xfs_inobt_verify(
*/
switch (block->bb_magic) {
case cpu_to_be32(XFS_IBT_CRC_MAGIC):
+ case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
if (!xfs_sb_version_hascrc(&mp->m_sb))
return false;
if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid))
@@ -215,6 +241,7 @@ xfs_inobt_verify(
return false;
/* fall through */
case cpu_to_be32(XFS_IBT_MAGIC):
+ case cpu_to_be32(XFS_FIBT_MAGIC):
break;
default:
return 0;
@@ -244,12 +271,14 @@ static void
xfs_inobt_read_verify(
struct xfs_buf *bp)
{
- if (!(xfs_btree_sblock_verify_crc(bp) &&
- xfs_inobt_verify(bp))) {
- trace_xfs_btree_corrupt(bp, _RET_IP_);
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
- bp->b_target->bt_mount, bp->b_addr);
+ if (!xfs_btree_sblock_verify_crc(bp))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_inobt_verify(bp))
xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+ if (bp->b_error) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ xfs_verifier_error(bp);
}
}
@@ -259,9 +288,9 @@ xfs_inobt_write_verify(
{
if (!xfs_inobt_verify(bp)) {
trace_xfs_btree_corrupt(bp, _RET_IP_);
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
- bp->b_target->bt_mount, bp->b_addr);
xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
+ return;
}
xfs_btree_sblock_calc_crc(bp);
@@ -316,6 +345,28 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
#endif
};
+static const struct xfs_btree_ops xfs_finobt_ops = {
+ .rec_len = sizeof(xfs_inobt_rec_t),
+ .key_len = sizeof(xfs_inobt_key_t),
+
+ .dup_cursor = xfs_inobt_dup_cursor,
+ .set_root = xfs_finobt_set_root,
+ .alloc_block = xfs_inobt_alloc_block,
+ .free_block = xfs_inobt_free_block,
+ .get_minrecs = xfs_inobt_get_minrecs,
+ .get_maxrecs = xfs_inobt_get_maxrecs,
+ .init_key_from_rec = xfs_inobt_init_key_from_rec,
+ .init_rec_from_key = xfs_inobt_init_rec_from_key,
+ .init_rec_from_cur = xfs_inobt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_finobt_init_ptr_from_cur,
+ .key_diff = xfs_inobt_key_diff,
+ .buf_ops = &xfs_inobt_buf_ops,
+#if defined(DEBUG) || defined(XFS_WARN)
+ .keys_inorder = xfs_inobt_keys_inorder,
+ .recs_inorder = xfs_inobt_recs_inorder,
+#endif
+};
+
/*
* Allocate a new inode btree cursor.
*/
@@ -324,7 +375,8 @@ xfs_inobt_init_cursor(
struct xfs_mount *mp, /* file system mount point */
struct xfs_trans *tp, /* transaction pointer */
struct xfs_buf *agbp, /* buffer for agi structure */
- xfs_agnumber_t agno) /* allocation group number */
+ xfs_agnumber_t agno, /* allocation group number */
+ xfs_btnum_t btnum) /* ialloc or free ino btree */
{
struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
struct xfs_btree_cur *cur;
@@ -333,11 +385,17 @@ xfs_inobt_init_cursor(
cur->bc_tp = tp;
cur->bc_mp = mp;
- cur->bc_nlevels = be32_to_cpu(agi->agi_level);
- cur->bc_btnum = XFS_BTNUM_INO;
+ cur->bc_btnum = btnum;
+ if (btnum == XFS_BTNUM_INO) {
+ cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+ cur->bc_ops = &xfs_inobt_ops;
+ } else {
+ cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+ cur->bc_ops = &xfs_finobt_ops;
+ }
+
cur->bc_blocklog = mp->m_sb.sb_blocklog;
- cur->bc_ops = &xfs_inobt_ops;
if (xfs_sb_version_hascrc(&mp->m_sb))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index 3ac36b7642e..d7ebea72c2d 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -27,55 +27,6 @@ struct xfs_btree_cur;
struct xfs_mount;
/*
- * There is a btree for the inode map per allocation group.
- */
-#define XFS_IBT_MAGIC 0x49414254 /* 'IABT' */
-#define XFS_IBT_CRC_MAGIC 0x49414233 /* 'IAB3' */
-
-typedef __uint64_t xfs_inofree_t;
-#define XFS_INODES_PER_CHUNK (NBBY * sizeof(xfs_inofree_t))
-#define XFS_INODES_PER_CHUNK_LOG (XFS_NBBYLOG + 3)
-#define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1)
-#define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i))
-
-static inline xfs_inofree_t xfs_inobt_maskn(int i, int n)
-{
- return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i;
-}
-
-/*
- * Data record structure
- */
-typedef struct xfs_inobt_rec {
- __be32 ir_startino; /* starting inode number */
- __be32 ir_freecount; /* count of free inodes (set bits) */
- __be64 ir_free; /* free inode mask */
-} xfs_inobt_rec_t;
-
-typedef struct xfs_inobt_rec_incore {
- xfs_agino_t ir_startino; /* starting inode number */
- __int32_t ir_freecount; /* count of free inodes (set bits) */
- xfs_inofree_t ir_free; /* free inode mask */
-} xfs_inobt_rec_incore_t;
-
-
-/*
- * Key structure
- */
-typedef struct xfs_inobt_key {
- __be32 ir_startino; /* starting inode number */
-} xfs_inobt_key_t;
-
-/* btree pointer type */
-typedef __be32 xfs_inobt_ptr_t;
-
-/*
- * block numbers in the AG.
- */
-#define XFS_IBT_BLOCK(mp) ((xfs_agblock_t)(XFS_CNT_BLOCK(mp) + 1))
-#define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
-
-/*
* Btree block header size depends on a superblock flag.
*/
#define XFS_INOBT_BLOCK_LEN(mp) \
@@ -107,9 +58,8 @@ typedef __be32 xfs_inobt_ptr_t;
((index) - 1) * sizeof(xfs_inobt_ptr_t)))
extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
- struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t);
+ struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t,
+ xfs_btnum_t);
extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
-extern const struct xfs_buf_ops xfs_inobt_buf_ops;
-
#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 474807a401c..c48df5f25b9 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -18,24 +18,19 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_log_priv.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
-#include "xfs_dinode.h"
#include "xfs_error.h"
-#include "xfs_filestream.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
-#include "xfs_fsops.h"
#include "xfs_icache.h"
#include "xfs_bmap_util.h"
@@ -500,11 +495,6 @@ xfs_inode_ag_walk_grab(
if (!igrab(inode))
return ENOENT;
- if (is_bad_inode(inode)) {
- IRELE(ip);
- return ENOENT;
- }
-
/* inode is valid */
return 0;
@@ -517,8 +507,7 @@ STATIC int
xfs_inode_ag_walk(
struct xfs_mount *mp,
struct xfs_perag *pag,
- int (*execute)(struct xfs_inode *ip,
- struct xfs_perag *pag, int flags,
+ int (*execute)(struct xfs_inode *ip, int flags,
void *args),
int flags,
void *args,
@@ -592,7 +581,7 @@ restart:
for (i = 0; i < nr_found; i++) {
if (!batch[i])
continue;
- error = execute(batch[i], pag, flags, args);
+ error = execute(batch[i], flags, args);
IRELE(batch[i]);
if (error == EAGAIN) {
skipped++;
@@ -646,8 +635,7 @@ xfs_eofblocks_worker(
int
xfs_inode_ag_iterator(
struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip,
- struct xfs_perag *pag, int flags,
+ int (*execute)(struct xfs_inode *ip, int flags,
void *args),
int flags,
void *args)
@@ -674,8 +662,7 @@ xfs_inode_ag_iterator(
int
xfs_inode_ag_iterator_tag(
struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip,
- struct xfs_perag *pag, int flags,
+ int (*execute)(struct xfs_inode *ip, int flags,
void *args),
int flags,
void *args,
@@ -918,8 +905,6 @@ restart:
xfs_iflock(ip);
}
- if (is_bad_inode(VFS_I(ip)))
- goto reclaim;
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
xfs_iunpin_wait(ip);
xfs_iflush_abort(ip, false);
@@ -1221,7 +1206,6 @@ xfs_inode_match_id(
STATIC int
xfs_inode_free_eofblocks(
struct xfs_inode *ip,
- struct xfs_perag *pag,
int flags,
void *args)
{
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 9ed68bb750f..9cf017b899b 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -60,12 +60,10 @@ int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
void xfs_eofblocks_worker(struct work_struct *);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
- int flags, void *args),
+ int (*execute)(struct xfs_inode *ip, int flags, void *args),
int flags, void *args);
int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
- int flags, void *args),
+ int (*execute)(struct xfs_inode *ip, int flags, void *args),
int flags, void *args, int tag);
static inline int
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 5a5a593994d..7e454923325 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -17,16 +17,18 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_error.h"
#include "xfs_icreate_item.h"
+#include "xfs_log.h"
kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
@@ -57,13 +59,14 @@ xfs_icreate_item_size(
STATIC void
xfs_icreate_item_format(
struct xfs_log_item *lip,
- struct xfs_log_iovec *log_vector)
+ struct xfs_log_vec *lv)
{
struct xfs_icreate_item *icp = ICR_ITEM(lip);
+ struct xfs_log_iovec *vecp = NULL;
- log_vector->i_addr = (xfs_caddr_t)&icp->ic_format;
- log_vector->i_len = sizeof(struct xfs_icreate_log);
- log_vector->i_type = XLOG_REG_TYPE_ICREATE;
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICREATE,
+ &icp->ic_format,
+ sizeof(struct xfs_icreate_log));
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index e3d75385aa7..a6115fe1ac9 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -19,29 +19,24 @@
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
-#include "xfs_log.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_trans_space.h"
-#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
#include "xfs_dir2.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
#include "xfs_attr_sf.h"
#include "xfs_attr.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
+#include "xfs_trans_space.h"
+#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
-#include "xfs_btree.h"
-#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
@@ -52,6 +47,9 @@
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_symlink.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_bmap_btree.h"
kmem_zone_t *xfs_inode_zone;
@@ -63,6 +61,8 @@ kmem_zone_t *xfs_inode_zone;
STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
+STATIC int xfs_iunlink_remove(xfs_trans_t *, xfs_inode_t *);
+
/*
* helper function to extract extent size hint from inode
*/
@@ -78,48 +78,44 @@ xfs_get_extsz_hint(
}
/*
- * This is a wrapper routine around the xfs_ilock() routine used to centralize
- * some grungy code. It is used in places that wish to lock the inode solely
- * for reading the extents. The reason these places can't just call
- * xfs_ilock(SHARED) is that the inode lock also guards to bringing in of the
- * extents from disk for a file in b-tree format. If the inode is in b-tree
- * format, then we need to lock the inode exclusively until the extents are read
- * in. Locking it exclusively all the time would limit our parallelism
- * unnecessarily, though. What we do instead is check to see if the extents
- * have been read in yet, and only lock the inode exclusively if they have not.
+ * These two are wrapper routines around the xfs_ilock() routine used to
+ * centralize some grungy code. They are used in places that wish to lock the
+ * inode solely for reading the extents. The reason these places can't just
+ * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards to
+ * bringing in of the extents from disk for a file in b-tree format. If the
+ * inode is in b-tree format, then we need to lock the inode exclusively until
+ * the extents are read in. Locking it exclusively all the time would limit
+ * our parallelism unnecessarily, though. What we do instead is check to see
+ * if the extents have been read in yet, and only lock the inode exclusively
+ * if they have not.
*
- * The function returns a value which should be given to the corresponding
- * xfs_iunlock_map_shared(). This value is the mode in which the lock was
- * actually taken.
+ * The functions return a value which should be given to the corresponding
+ * xfs_iunlock() call.
*/
uint
-xfs_ilock_map_shared(
- xfs_inode_t *ip)
+xfs_ilock_data_map_shared(
+ struct xfs_inode *ip)
{
- uint lock_mode;
+ uint lock_mode = XFS_ILOCK_SHARED;
- if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
- ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
+ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
+ (ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
lock_mode = XFS_ILOCK_EXCL;
- } else {
- lock_mode = XFS_ILOCK_SHARED;
- }
-
xfs_ilock(ip, lock_mode);
-
return lock_mode;
}
-/*
- * This is simply the unlock routine to go with xfs_ilock_map_shared().
- * All it does is call xfs_iunlock() with the given lock_mode.
- */
-void
-xfs_iunlock_map_shared(
- xfs_inode_t *ip,
- unsigned int lock_mode)
+uint
+xfs_ilock_attr_map_shared(
+ struct xfs_inode *ip)
{
- xfs_iunlock(ip, lock_mode);
+ uint lock_mode = XFS_ILOCK_SHARED;
+
+ if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE &&
+ (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0)
+ lock_mode = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, lock_mode);
+ return lock_mode;
}
/*
@@ -589,9 +585,9 @@ xfs_lookup(
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return XFS_ERROR(EIO);
- lock_mode = xfs_ilock_map_shared(dp);
+ lock_mode = xfs_ilock_data_map_shared(dp);
error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
- xfs_iunlock_map_shared(dp, lock_mode);
+ xfs_iunlock(dp, lock_mode);
if (error)
goto out;
@@ -659,7 +655,6 @@ xfs_ialloc(
uint flags;
int error;
timespec_t tv;
- int filestreams = 0;
/*
* Call the space management code to pick
@@ -686,6 +681,14 @@ xfs_ialloc(
return error;
ASSERT(ip != NULL);
+ /*
+ * We always convert v1 inodes to v2 now - we only support filesystems
+ * with >= v2 inode capability, so there is no reason for ever leaving
+ * an inode in v1 format.
+ */
+ if (ip->i_d.di_version == 1)
+ ip->i_d.di_version = 2;
+
ip->i_d.di_mode = mode;
ip->i_d.di_onlink = 0;
ip->i_d.di_nlink = nlink;
@@ -695,27 +698,6 @@ xfs_ialloc(
xfs_set_projid(ip, prid);
memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
- /*
- * If the superblock version is up to where we support new format
- * inodes and this is currently an old format inode, then change
- * the inode version number now. This way we only do the conversion
- * here rather than here and in the flush/logging code.
- */
- if (xfs_sb_version_hasnlink(&mp->m_sb) &&
- ip->i_d.di_version == 1) {
- ip->i_d.di_version = 2;
- /*
- * We've already zeroed the old link count, the projid field,
- * and the pad field.
- */
- }
-
- /*
- * Project ids won't be stored on disk if we are using a version 1 inode.
- */
- if ((prid != 0) && (ip->i_d.di_version == 1))
- xfs_bump_ino_vers2(tp, ip);
-
if (pip && XFS_INHERIT_GID(pip)) {
ip->i_d.di_gid = pip->i_d.di_gid;
if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) {
@@ -776,13 +758,6 @@ xfs_ialloc(
flags |= XFS_ILOG_DEV;
break;
case S_IFREG:
- /*
- * we can't set up filestreams until after the VFS inode
- * is set up properly.
- */
- if (pip && xfs_inode_is_filestream(pip))
- filestreams = 1;
- /* fall through */
case S_IFDIR:
if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
uint di_flags = 0;
@@ -848,15 +823,6 @@ xfs_ialloc(
/* now that we have an i_mode we can setup inode ops and unlock */
xfs_setup_inode(ip);
- /* now we have set up the vfs inode we can associate the filestream */
- if (filestreams) {
- error = xfs_filestream_associate(pip, ip);
- if (error < 0)
- return -error;
- if (!error)
- xfs_iflags_set(ip, XFS_IFILESTREAM);
- }
-
*ipp = ip;
return 0;
}
@@ -1077,40 +1043,6 @@ xfs_droplink(
}
/*
- * This gets called when the inode's version needs to be changed from 1 to 2.
- * Currently this happens when the nlink field overflows the old 16-bit value
- * or when chproj is called to change the project for the first time.
- * As a side effect the superblock version will also get rev'd
- * to contain the NLINK bit.
- */
-void
-xfs_bump_ino_vers2(
- xfs_trans_t *tp,
- xfs_inode_t *ip)
-{
- xfs_mount_t *mp;
-
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(ip->i_d.di_version == 1);
-
- ip->i_d.di_version = 2;
- ip->i_d.di_onlink = 0;
- memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
- mp = tp->t_mountp;
- if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
- spin_lock(&mp->m_sb_lock);
- if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
- xfs_sb_version_addnlink(&mp->m_sb);
- spin_unlock(&mp->m_sb_lock);
- xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
- } else {
- spin_unlock(&mp->m_sb_lock);
- }
- }
- /* Caller must log the inode */
-}
-
-/*
* Increment the link count on an inode & log the change.
*/
int
@@ -1120,22 +1052,10 @@ xfs_bumplink(
{
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
- ASSERT(ip->i_d.di_nlink > 0);
+ ASSERT(ip->i_d.di_version > 1);
+ ASSERT(ip->i_d.di_nlink > 0 || (VFS_I(ip)->i_state & I_LINKABLE));
ip->i_d.di_nlink++;
inc_nlink(VFS_I(ip));
- if ((ip->i_d.di_version == 1) &&
- (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
- /*
- * The inode has increased its number of links beyond
- * what can fit in an old format inode. It now needs
- * to be converted to a version 2 inode with a 32 bit
- * link count. If this is the first inode in the file
- * system to do this, then we need to bump the superblock
- * version number as well.
- */
- xfs_bump_ino_vers2(tp, ip);
- }
-
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
return 0;
}
@@ -1170,10 +1090,7 @@ xfs_create(
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
- if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
- prid = xfs_get_projid(dp);
- else
- prid = XFS_PROJID_DEFAULT;
+ prid = xfs_get_initial_prid(dp);
/*
* Make sure that we have allocated dquot(s) on disk.
@@ -1338,6 +1255,114 @@ xfs_create(
}
int
+xfs_create_tmpfile(
+ struct xfs_inode *dp,
+ struct dentry *dentry,
+ umode_t mode,
+ struct xfs_inode **ipp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_inode *ip = NULL;
+ struct xfs_trans *tp = NULL;
+ int error;
+ uint cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
+ prid_t prid;
+ struct xfs_dquot *udqp = NULL;
+ struct xfs_dquot *gdqp = NULL;
+ struct xfs_dquot *pdqp = NULL;
+ struct xfs_trans_res *tres;
+ uint resblks;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return XFS_ERROR(EIO);
+
+ prid = xfs_get_initial_prid(dp);
+
+ /*
+ * Make sure that we have allocated dquot(s) on disk.
+ */
+ error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
+ xfs_kgid_to_gid(current_fsgid()), prid,
+ XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
+ &udqp, &gdqp, &pdqp);
+ if (error)
+ return error;
+
+ resblks = XFS_IALLOC_SPACE_RES(mp);
+ tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE_TMPFILE);
+
+ tres = &M_RES(mp)->tr_create_tmpfile;
+ error = xfs_trans_reserve(tp, tres, resblks, 0);
+ if (error == ENOSPC) {
+ /* No space at all so try a "no-allocation" reservation */
+ resblks = 0;
+ error = xfs_trans_reserve(tp, tres, 0, 0);
+ }
+ if (error) {
+ cancel_flags = 0;
+ goto out_trans_cancel;
+ }
+
+ error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
+ pdqp, resblks, 1, 0);
+ if (error)
+ goto out_trans_cancel;
+
+ error = xfs_dir_ialloc(&tp, dp, mode, 1, 0,
+ prid, resblks > 0, &ip, NULL);
+ if (error) {
+ if (error == ENOSPC)
+ goto out_trans_cancel;
+ goto out_trans_abort;
+ }
+
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
+ xfs_trans_set_sync(tp);
+
+ /*
+ * Attach the dquot(s) to the inodes and modify them incore.
+ * These ids of the inode couldn't have changed since the new
+ * inode has been locked ever since it was created.
+ */
+ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
+
+ ip->i_d.di_nlink--;
+ error = xfs_iunlink(tp, ip);
+ if (error)
+ goto out_trans_abort;
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto out_release_inode;
+
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ *ipp = ip;
+ return 0;
+
+ out_trans_abort:
+ cancel_flags |= XFS_TRANS_ABORT;
+ out_trans_cancel:
+ xfs_trans_cancel(tp, cancel_flags);
+ out_release_inode:
+ /*
+ * Wait until after the current transaction is aborted to
+ * release the inode. This prevents recursive transactions
+ * and deadlocks from xfs_inactive.
+ */
+ if (ip)
+ IRELE(ip);
+
+ xfs_qm_dqrele(udqp);
+ xfs_qm_dqrele(gdqp);
+ xfs_qm_dqrele(pdqp);
+
+ return error;
+}
+
+int
xfs_link(
xfs_inode_t *tdp,
xfs_inode_t *sip,
@@ -1402,6 +1427,12 @@ xfs_link(
xfs_bmap_init(&free_list, &first_block);
+ if (sip->i_d.di_nlink == 0) {
+ error = xfs_iunlink_remove(tp, sip);
+ if (error)
+ goto abort_return;
+ }
+
error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
&first_block, &free_list, resblks);
if (error)
@@ -1592,16 +1623,6 @@ xfs_release(
int truncated;
/*
- * If we are using filestreams, and we have an unlinked
- * file that we are processing the last close on, then nothing
- * will be able to reopen and write to this file. Purge this
- * inode from the filestreams cache so that it doesn't delay
- * teardown of the inode.
- */
- if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
- xfs_filestream_deassociate(ip);
-
- /*
* If we previously truncated this file and removed old data
* in the process, we want to initiate "early" writeout on
* the last close. This is an attempt to combat the notorious
@@ -1663,6 +1684,150 @@ xfs_release(
}
/*
+ * xfs_inactive_truncate
+ *
+ * Called to perform a truncate when an inode becomes unlinked.
+ */
+STATIC int
+xfs_inactive_truncate(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+ if (error) {
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ /*
+ * Log the inode size first to prevent stale data exposure in the event
+ * of a system crash before the truncate completes. See the related
+ * comment in xfs_setattr_size() for details.
+ */
+ ip->i_d.di_size = 0;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+ error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
+ if (error)
+ goto error_trans_cancel;
+
+ ASSERT(ip->i_d.di_nextents == 0);
+
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto error_unlock;
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return 0;
+
+error_trans_cancel:
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+error_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+/*
+ * xfs_inactive_ifree()
+ *
+ * Perform the inode free when an inode is unlinked.
+ */
+STATIC int
+xfs_inactive_ifree(
+ struct xfs_inode *ip)
+{
+ xfs_bmap_free_t free_list;
+ xfs_fsblock_t first_block;
+ int committed;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ int error;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+
+ /*
+ * The ifree transaction might need to allocate blocks for record
+ * insertion to the finobt. We don't want to fail here at ENOSPC, so
+ * allow ifree to dip into the reserved block pool if necessary.
+ *
+ * Freeing large sets of inodes generally means freeing inode chunks,
+ * directory and file data blocks, so this should be relatively safe.
+ * Only under severe circumstances should it be possible to free enough
+ * inodes to exhaust the reserve block pool via finobt expansion while
+ * at the same time not creating free space in the filesystem.
+ *
+ * Send a warning if the reservation does happen to fail, as the inode
+ * now remains allocated and sits on the unlinked list until the fs is
+ * repaired.
+ */
+ tp->t_flags |= XFS_TRANS_RESERVE;
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree,
+ XFS_IFREE_SPACE_RES(mp), 0);
+ if (error) {
+ if (error == ENOSPC) {
+ xfs_warn_ratelimited(mp,
+ "Failed to remove inode(s) from unlinked list. "
+ "Please free space, unmount and run xfs_repair.");
+ } else {
+ ASSERT(XFS_FORCED_SHUTDOWN(mp));
+ }
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
+ return error;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ xfs_bmap_init(&free_list, &first_block);
+ error = xfs_ifree(tp, ip, &free_list);
+ if (error) {
+ /*
+ * If we fail to free the inode, shut down. The cancel
+ * might do that, we need to make sure. Otherwise the
+ * inode might be lost for a long time or forever.
+ */
+ if (!XFS_FORCED_SHUTDOWN(mp)) {
+ xfs_notice(mp, "%s: xfs_ifree returned error %d",
+ __func__, error);
+ xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ }
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+ }
+
+ /*
+ * Credit the quota account(s). The inode is gone.
+ */
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
+
+ /*
+ * Just ignore errors at this point. There is nothing we can
+ * do except to try to keep going. Make sure it's not a silent
+ * error.
+ */
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
+ __func__, error);
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
+ __func__, error);
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return 0;
+}
+
+/*
* xfs_inactive
*
* This is called when the vnode reference count for the vnode
@@ -1670,16 +1835,11 @@ xfs_release(
* now be truncated. Also, we clear all of the read-ahead state
* kept for the inode here since the file is now closed.
*/
-int
+void
xfs_inactive(
xfs_inode_t *ip)
{
- xfs_bmap_free_t free_list;
- xfs_fsblock_t first_block;
- int committed;
- struct xfs_trans *tp;
struct xfs_mount *mp;
- struct xfs_trans_res *resp;
int error;
int truncate = 0;
@@ -1687,19 +1847,17 @@ xfs_inactive(
* If the inode is already free, then there can be nothing
* to clean up here.
*/
- if (ip->i_d.di_mode == 0 || is_bad_inode(VFS_I(ip))) {
+ if (ip->i_d.di_mode == 0) {
ASSERT(ip->i_df.if_real_bytes == 0);
ASSERT(ip->i_df.if_broot_bytes == 0);
- return VN_INACTIVE_CACHE;
+ return;
}
mp = ip->i_mount;
- error = 0;
-
/* If this is a read-only mount, don't do this (would generate I/O) */
if (mp->m_flags & XFS_MOUNT_RDONLY)
- goto out;
+ return;
if (ip->i_d.di_nlink != 0) {
/*
@@ -1707,12 +1865,10 @@ xfs_inactive(
* cache. Post-eof blocks must be freed, lest we end up with
* broken free space accounting.
*/
- if (xfs_can_free_eofblocks(ip, true)) {
- error = xfs_free_eofblocks(mp, ip, false);
- if (error)
- return VN_INACTIVE_CACHE;
- }
- goto out;
+ if (xfs_can_free_eofblocks(ip, true))
+ xfs_free_eofblocks(mp, ip, false);
+
+ return;
}
if (S_ISREG(ip->i_d.di_mode) &&
@@ -1722,36 +1878,14 @@ xfs_inactive(
error = xfs_qm_dqattach(ip, 0);
if (error)
- return VN_INACTIVE_CACHE;
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
- resp = (truncate || S_ISLNK(ip->i_d.di_mode)) ?
- &M_RES(mp)->tr_itruncate : &M_RES(mp)->tr_ifree;
-
- error = xfs_trans_reserve(tp, resp, 0, 0);
- if (error) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
- xfs_trans_cancel(tp, 0);
- return VN_INACTIVE_CACHE;
- }
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
- if (S_ISLNK(ip->i_d.di_mode)) {
- error = xfs_inactive_symlink(ip, &tp);
- if (error)
- goto out_cancel;
- } else if (truncate) {
- ip->i_d.di_size = 0;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ return;
- error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
- if (error)
- goto out_cancel;
-
- ASSERT(ip->i_d.di_nextents == 0);
- }
+ if (S_ISLNK(ip->i_d.di_mode))
+ error = xfs_inactive_symlink(ip);
+ else if (truncate)
+ error = xfs_inactive_truncate(ip);
+ if (error)
+ return;
/*
* If there are attributes associated with the file then blow them away
@@ -1762,25 +1896,9 @@ xfs_inactive(
if (ip->i_d.di_anextents > 0) {
ASSERT(ip->i_d.di_forkoff != 0);
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- if (error)
- goto out_unlock;
-
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
error = xfs_attr_inactive(ip);
if (error)
- goto out;
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, 0, 0);
- if (error) {
- xfs_trans_cancel(tp, 0);
- goto out;
- }
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
+ return;
}
if (ip->i_afp)
@@ -1791,52 +1909,14 @@ xfs_inactive(
/*
* Free the inode.
*/
- xfs_bmap_init(&free_list, &first_block);
- error = xfs_ifree(tp, ip, &free_list);
- if (error) {
- /*
- * If we fail to free the inode, shut down. The cancel
- * might do that, we need to make sure. Otherwise the
- * inode might be lost for a long time or forever.
- */
- if (!XFS_FORCED_SHUTDOWN(mp)) {
- xfs_notice(mp, "%s: xfs_ifree returned error %d",
- __func__, error);
- xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
- }
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
- } else {
- /*
- * Credit the quota account(s). The inode is gone.
- */
- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
-
- /*
- * Just ignore errors at this point. There is nothing we can
- * do except to try to keep going. Make sure it's not a silent
- * error.
- */
- error = xfs_bmap_finish(&tp, &free_list, &committed);
- if (error)
- xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
- __func__, error);
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- if (error)
- xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
- __func__, error);
- }
+ error = xfs_inactive_ifree(ip);
+ if (error)
+ return;
/*
* Release the dquots held by inode, if any.
*/
xfs_qm_dqdetach(ip);
-out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-out:
- return VN_INACTIVE_CACHE;
-out_cancel:
- xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
- goto out_unlock;
}
/*
@@ -2107,8 +2187,8 @@ xfs_ifree_cluster(
{
xfs_mount_t *mp = free_ip->i_mount;
int blks_per_cluster;
+ int inodes_per_cluster;
int nbufs;
- int ninodes;
int i, j;
xfs_daddr_t blkno;
xfs_buf_t *bp;
@@ -2118,18 +2198,11 @@ xfs_ifree_cluster(
struct xfs_perag *pag;
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
- if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
- blks_per_cluster = 1;
- ninodes = mp->m_sb.sb_inopblock;
- nbufs = XFS_IALLOC_BLOCKS(mp);
- } else {
- blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
- mp->m_sb.sb_blocksize;
- ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
- nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
- }
+ blks_per_cluster = xfs_icluster_size_fsb(mp);
+ inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
+ nbufs = mp->m_ialloc_blks / blks_per_cluster;
- for (j = 0; j < nbufs; j++, inum += ninodes) {
+ for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
XFS_INO_TO_AGBNO(mp, inum));
@@ -2191,7 +2264,7 @@ xfs_ifree_cluster(
* transaction stale above, which means there is no point in
* even trying to lock them.
*/
- for (i = 0; i < ninodes; i++) {
+ for (i = 0; i < inodes_per_cluster; i++) {
retry:
rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root,
@@ -2370,6 +2443,33 @@ xfs_iunpin_wait(
__xfs_iunpin_wait(ip);
}
+/*
+ * Removing an inode from the namespace involves removing the directory entry
+ * and dropping the link count on the inode. Removing the directory entry can
+ * result in locking an AGF (directory blocks were freed) and removing a link
+ * count can result in placing the inode on an unlinked list which results in
+ * locking an AGI.
+ *
+ * The big problem here is that we have an ordering constraint on AGF and AGI
+ * locking - inode allocation locks the AGI, then can allocate a new extent for
+ * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
+ * removes the inode from the unlinked list, requiring that we lock the AGI
+ * first, and then freeing the inode can result in an inode chunk being freed
+ * and hence freeing disk space requiring that we lock an AGF.
+ *
+ * Hence the ordering that is imposed by other parts of the code is AGI before
+ * AGF. This means we cannot remove the directory entry before we drop the inode
+ * reference count and put it on the unlinked list as this results in a lock
+ * order of AGF then AGI, and this can deadlock against inode allocation and
+ * freeing. Therefore we must drop the link counts before we remove the
+ * directory entry.
+ *
+ * This is still safe from a transactional point of view - it is not until we
+ * get to xfs_bmap_finish() that we have the possibility of multiple
+ * transactions in this operation. Hence as long as we remove the directory
+ * entry and drop the link count in the first transaction of the remove
+ * operation, there are no transactional constraints on the ordering here.
+ */
int
xfs_remove(
xfs_inode_t *dp,
@@ -2439,6 +2539,7 @@ xfs_remove(
/*
* If we're removing a directory perform some additional validation.
*/
+ cancel_flags |= XFS_TRANS_ABORT;
if (is_dir) {
ASSERT(ip->i_d.di_nlink >= 2);
if (ip->i_d.di_nlink != 2) {
@@ -2449,31 +2550,16 @@ xfs_remove(
error = XFS_ERROR(ENOTEMPTY);
goto out_trans_cancel;
}
- }
-
- xfs_bmap_init(&free_list, &first_block);
- error = xfs_dir_removename(tp, dp, name, ip->i_ino,
- &first_block, &free_list, resblks);
- if (error) {
- ASSERT(error != ENOENT);
- goto out_bmap_cancel;
- }
- xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- if (is_dir) {
- /*
- * Drop the link from ip's "..".
- */
+ /* Drop the link from ip's "..". */
error = xfs_droplink(tp, dp);
if (error)
- goto out_bmap_cancel;
+ goto out_trans_cancel;
- /*
- * Drop the "." link from ip to self.
- */
+ /* Drop the "." link from ip to self. */
error = xfs_droplink(tp, ip);
if (error)
- goto out_bmap_cancel;
+ goto out_trans_cancel;
} else {
/*
* When removing a non-directory we need to log the parent
@@ -2482,20 +2568,24 @@ xfs_remove(
*/
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
}
+ xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- /*
- * Drop the link from dp to ip.
- */
+ /* Drop the link from dp to ip. */
error = xfs_droplink(tp, ip);
if (error)
- goto out_bmap_cancel;
+ goto out_trans_cancel;
- /*
- * Determine if this is the last link while
- * we are in the transaction.
- */
+ /* Determine if this is the last link while the inode is locked */
link_zero = (ip->i_d.di_nlink == 0);
+ xfs_bmap_init(&free_list, &first_block);
+ error = xfs_dir_removename(tp, dp, name, ip->i_ino,
+ &first_block, &free_list, resblks);
+ if (error) {
+ ASSERT(error != ENOENT);
+ goto out_bmap_cancel;
+ }
+
/*
* If this is a synchronous mount, make sure that the
* remove transaction goes to disk before returning to
@@ -2512,20 +2602,13 @@ xfs_remove(
if (error)
goto std_return;
- /*
- * If we are using filestreams, kill the stream association.
- * If the file is still open it may get a new one but that
- * will get killed on last close in xfs_close() so we don't
- * have to worry about that.
- */
- if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
+ if (is_dir && xfs_inode_is_filestream(ip))
xfs_filestream_deassociate(ip);
return 0;
out_bmap_cancel:
xfs_bmap_cancel(&free_list);
- cancel_flags |= XFS_TRANS_ABORT;
out_trans_cancel:
xfs_trans_cancel(tp, cancel_flags);
std_return:
@@ -2856,13 +2939,13 @@ xfs_iflush_cluster(
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
- inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
+ inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
if (!ilist)
goto out_put;
- mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
+ mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
rcu_read_lock();
/* really need a gang lookup range call here */
@@ -3107,6 +3190,7 @@ xfs_iflush_int(
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
ASSERT(iip != NULL && iip->ili_fields != 0);
+ ASSERT(ip->i_d.di_version > 1);
/* set *dip = inode's place in the buffer */
dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -3167,7 +3251,7 @@ xfs_iflush_int(
}
/*
- * Inode item log recovery for v1/v2 inodes are dependent on the
+ * Inode item log recovery for v2 inodes are dependent on the
* di_flushiter count for correct sequencing. We bump the flush
* iteration count so we can detect flushes which postdate a log record
* during recovery. This is redundant as we now log every change and
@@ -3190,40 +3274,9 @@ xfs_iflush_int(
if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
ip->i_d.di_flushiter = 0;
- /*
- * If this is really an old format inode and the superblock version
- * has not been updated to support only new format inodes, then
- * convert back to the old inode format. If the superblock version
- * has been updated, then make the conversion permanent.
- */
- ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
- if (ip->i_d.di_version == 1) {
- if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
- /*
- * Convert it back.
- */
- ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
- dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink);
- } else {
- /*
- * The superblock version has already been bumped,
- * so just make the conversion to the new inode
- * format permanent.
- */
- ip->i_d.di_version = 2;
- dip->di_version = 2;
- ip->i_d.di_onlink = 0;
- dip->di_onlink = 0;
- memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
- memset(&(dip->di_pad[0]), 0,
- sizeof(dip->di_pad));
- ASSERT(xfs_get_projid(ip) == 0);
- }
- }
-
- xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp);
+ xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
if (XFS_IFORK_Q(ip))
- xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
+ xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
xfs_inobp_check(mp, bp);
/*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 4a91358c147..f72bffa6726 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -20,11 +20,11 @@
#include "xfs_inode_buf.h"
#include "xfs_inode_fork.h"
+#include "xfs_dinode.h"
/*
* Kernel only inode definitions
*/
-
struct xfs_dinode;
struct xfs_inode;
struct xfs_buf;
@@ -50,6 +50,9 @@ typedef struct xfs_inode {
xfs_ifork_t *i_afp; /* attribute fork pointer */
xfs_ifork_t i_df; /* data fork */
+ /* operations vectors */
+ const struct xfs_dir_ops *d_ops; /* directory ops vector */
+
/* Transaction and locking information. */
struct xfs_inode_log_item *i_itemp; /* logging information */
mrlock_t i_lock; /* inode lock */
@@ -190,6 +193,15 @@ xfs_set_projid(struct xfs_inode *ip,
ip->i_d.di_projid_lo = (__uint16_t) (projid & 0xffff);
}
+static inline prid_t
+xfs_get_initial_prid(struct xfs_inode *dp)
+{
+ if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
+ return xfs_get_projid(dp);
+
+ return XFS_PROJID_DEFAULT;
+}
+
/*
* In-core inode flags.
*/
@@ -197,7 +209,6 @@ xfs_set_projid(struct xfs_inode *ip,
#define XFS_ISTALE (1 << 1) /* inode has been staled */
#define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */
#define XFS_INEW (1 << 3) /* inode has just been allocated */
-#define XFS_IFILESTREAM (1 << 4) /* inode is in a filestream dir. */
#define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */
#define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */
#define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */
@@ -213,8 +224,7 @@ xfs_set_projid(struct xfs_inode *ip,
*/
#define XFS_IRECLAIM_RESET_FLAGS \
(XFS_IRECLAIMABLE | XFS_IRECLAIM | \
- XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | \
- XFS_IFILESTREAM);
+ XFS_IDIRTY_RELEASE | XFS_ITRUNCATED)
/*
* Synchronize processes attempting to flush the in-core inode back to disk.
@@ -316,11 +326,13 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
int xfs_release(struct xfs_inode *ip);
-int xfs_inactive(struct xfs_inode *ip);
+void xfs_inactive(struct xfs_inode *ip);
int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
struct xfs_inode **ipp, struct xfs_name *ci_name);
int xfs_create(struct xfs_inode *dp, struct xfs_name *name,
umode_t mode, xfs_dev_t rdev, struct xfs_inode **ipp);
+int xfs_create_tmpfile(struct xfs_inode *dp, struct dentry *dentry,
+ umode_t mode, struct xfs_inode **ipp);
int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
struct xfs_inode *ip);
int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
@@ -335,8 +347,8 @@ int xfs_ilock_nowait(xfs_inode_t *, uint);
void xfs_iunlock(xfs_inode_t *, uint);
void xfs_ilock_demote(xfs_inode_t *, uint);
int xfs_isilocked(xfs_inode_t *, uint);
-uint xfs_ilock_map_shared(xfs_inode_t *);
-void xfs_iunlock_map_shared(xfs_inode_t *, uint);
+uint xfs_ilock_data_map_shared(struct xfs_inode *);
+uint xfs_ilock_attr_map_shared(struct xfs_inode *);
int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
xfs_nlink_t, xfs_dev_t, prid_t, int,
struct xfs_buf **, xfs_inode_t **);
@@ -365,7 +377,6 @@ int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
struct xfs_inode **, int *);
int xfs_droplink(struct xfs_trans *, struct xfs_inode *);
int xfs_bumplink(struct xfs_trans *, struct xfs_inode *);
-void xfs_bump_ino_vers2(struct xfs_trans *, struct xfs_inode *);
/* from xfs_file.c */
int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c
index 63382d37f56..cb35ae41d4a 100644
--- a/fs/xfs/xfs_inode_buf.c
+++ b/fs/xfs/xfs_inode_buf.c
@@ -17,20 +17,20 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_error.h"
#include "xfs_cksum.h"
#include "xfs_icache.h"
+#include "xfs_trans.h"
#include "xfs_ialloc.h"
+#include "xfs_dinode.h"
/*
* Check that none of the inode's in the buffer have a next
@@ -102,8 +102,7 @@ xfs_inode_buf_verify(
}
xfs_buf_ioerror(bp, EFSCORRUPTED);
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
- mp, dip);
+ xfs_verifier_error(bp);
#ifdef DEBUG
xfs_alert(mp,
"bad inode magic/vsn daddr %lld #%d (magic=%x)",
@@ -306,7 +305,7 @@ xfs_dinode_verify(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return false;
if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
- offsetof(struct xfs_dinode, di_crc)))
+ XFS_DINODE_CRC_OFF))
return false;
if (be64_to_cpu(dip->di_ino) != ip->i_ino)
return false;
@@ -327,7 +326,7 @@ xfs_dinode_calc_crc(
ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
- offsetof(struct xfs_dinode, di_crc));
+ XFS_DINODE_CRC_OFF);
dip->di_crc = xfs_end_cksum(crc);
}
@@ -438,17 +437,16 @@ xfs_iread(
}
/*
- * The inode format changed when we moved the link count and
- * made it 32 bits long. If this is an old format inode,
- * convert it in memory to look like a new one. If it gets
- * flushed to disk we will convert back before flushing or
- * logging it. We zero out the new projid field and the old link
- * count field. We'll handle clearing the pad field (the remains
- * of the old uuid field) when we actually convert the inode to
- * the new format. We don't change the version number so that we
- * can distinguish this from a real new format inode.
+ * Automatically convert version 1 inode formats in memory to version 2
+ * inode format. If the inode is modified, it will get logged and
+ * rewritten as a version 2 inode. We can do this because we set the
+ * superblock feature bit for v2 inodes unconditionally during mount
+ * and it means the reast of the code can assume the inode version is 2
+ * or higher.
*/
if (ip->i_d.di_version == 1) {
+ ip->i_d.di_version = 2;
+ memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
ip->i_d.di_nlink = ip->i_d.di_onlink;
ip->i_d.di_onlink = 0;
xfs_set_projid(ip, 0);
diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/xfs_inode_buf.h
index abba0ae8cf2..9308c47f2a5 100644
--- a/fs/xfs/xfs_inode_buf.h
+++ b/fs/xfs/xfs_inode_buf.h
@@ -47,7 +47,4 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
#define xfs_inobp_check(mp, bp)
#endif /* DEBUG */
-extern const struct xfs_buf_ops xfs_inode_buf_ops;
-extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
-
#endif /* __XFS_INODE_BUF_H__ */
diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/xfs_inode_fork.c
index 02f1083955b..b031e8d0d92 100644
--- a/fs/xfs/xfs_inode_fork.c
+++ b/fs/xfs/xfs_inode_fork.c
@@ -20,31 +20,21 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
-#include "xfs_log.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_buf_item.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
-#include "xfs_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_ialloc.h"
+#include "xfs_bmap_btree.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
-#include "xfs_quota.h"
-#include "xfs_filestream.h"
-#include "xfs_cksum.h"
#include "xfs_trace.h"
-#include "xfs_icache.h"
+#include "xfs_attr_sf.h"
+#include "xfs_dinode.h"
kmem_zone_t *xfs_ifork_zone;
@@ -441,6 +431,8 @@ xfs_iread_extents(
xfs_ifork_t *ifp;
xfs_extnum_t nextents;
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
ip->i_mount);
@@ -731,15 +723,16 @@ xfs_idestroy_fork(
}
/*
- * xfs_iextents_copy()
+ * Convert in-core extents to on-disk form
*
- * This is called to copy the REAL extents (as opposed to the delayed
- * allocation extents) from the inode into the given buffer. It
- * returns the number of bytes copied into the buffer.
+ * For either the data or attr fork in extent format, we need to endian convert
+ * the in-core extent as we place them into the on-disk inode.
*
- * If there are no delayed allocation extents, then we can just
- * memcpy() the extents into the buffer. Otherwise, we need to
- * examine each extent in turn and skip those which are delayed.
+ * In the case of the data fork, the in-core and on-disk fork sizes can be
+ * different due to delayed allocation extents. We only copy on-disk extents
+ * here, so callers must always use the physical fork size to determine the
+ * size of the buffer passed to this routine. We will return the size actually
+ * used.
*/
int
xfs_iextents_copy(
@@ -805,8 +798,7 @@ xfs_iflush_fork(
xfs_inode_t *ip,
xfs_dinode_t *dip,
xfs_inode_log_item_t *iip,
- int whichfork,
- xfs_buf_t *bp)
+ int whichfork)
{
char *cp;
xfs_ifork_t *ifp;
@@ -1031,15 +1023,14 @@ xfs_iext_add(
* the next index needed in the indirection array.
*/
else {
- int count = ext_diff;
+ uint count = ext_diff;
while (count) {
erp = xfs_iext_irec_new(ifp, erp_idx);
- erp->er_extcount = count;
- count -= MIN(count, (int)XFS_LINEAR_EXTS);
- if (count) {
+ erp->er_extcount = min(count, XFS_LINEAR_EXTS);
+ count -= erp->er_extcount;
+ if (count)
erp_idx++;
- }
}
}
}
@@ -1359,7 +1350,7 @@ xfs_iext_remove_indirect(
void
xfs_iext_realloc_direct(
xfs_ifork_t *ifp, /* inode fork pointer */
- int new_size) /* new size of extents */
+ int new_size) /* new size of extents after adding */
{
int rnew_size; /* real new size of extents */
@@ -1397,13 +1388,8 @@ xfs_iext_realloc_direct(
rnew_size - ifp->if_real_bytes);
}
}
- /*
- * Switch from the inline extent buffer to a direct
- * extent list. Be sure to include the inline extent
- * bytes in new_size.
- */
+ /* Switch from the inline extent buffer to a direct extent list */
else {
- new_size += ifp->if_bytes;
if (!is_power_of_2(new_size)) {
rnew_size = roundup_pow_of_two(new_size);
}
diff --git a/fs/xfs/xfs_inode_fork.h b/fs/xfs/xfs_inode_fork.h
index 28661a0d905..7d3b1ed6dcb 100644
--- a/fs/xfs/xfs_inode_fork.h
+++ b/fs/xfs/xfs_inode_fork.h
@@ -19,6 +19,7 @@
#define __XFS_INODE_FORK_H__
struct xfs_inode_log_item;
+struct xfs_dinode;
/*
* The following xfs_ext_irec_t struct introduces a second (top) level
@@ -126,8 +127,7 @@ typedef struct xfs_ifork {
int xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);
void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
- struct xfs_inode_log_item *, int,
- struct xfs_buf *);
+ struct xfs_inode_log_item *, int);
void xfs_idestroy_fork(struct xfs_inode *, int);
void xfs_idata_realloc(struct xfs_inode *, int, int);
void xfs_iroot_realloc(struct xfs_inode *, int, int);
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 37808110984..a640137b357 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -17,19 +17,20 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_trans_priv.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_error.h"
#include "xfs_trace.h"
+#include "xfs_trans_priv.h"
+#include "xfs_dinode.h"
+#include "xfs_log.h"
kmem_zone_t *xfs_ili_zone; /* inode log item zone */
@@ -39,27 +40,14 @@ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
return container_of(lip, struct xfs_inode_log_item, ili_item);
}
-
-/*
- * This returns the number of iovecs needed to log the given inode item.
- *
- * We need one iovec for the inode log format structure, one for the
- * inode core, and possibly one for the inode data/extents/b-tree root
- * and one for the inode attribute data/extents/b-tree root.
- */
STATIC void
-xfs_inode_item_size(
- struct xfs_log_item *lip,
+xfs_inode_item_data_fork_size(
+ struct xfs_inode_log_item *iip,
int *nvecs,
int *nbytes)
{
- struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
- *nvecs += 2;
- *nbytes += sizeof(struct xfs_inode_log_format) +
- xfs_icdinode_size(ip->i_d.di_version);
-
switch (ip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
if ((iip->ili_fields & XFS_ILOG_DEXT) &&
@@ -70,7 +58,6 @@ xfs_inode_item_size(
*nvecs += 1;
}
break;
-
case XFS_DINODE_FMT_BTREE:
if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
ip->i_df.if_broot_bytes > 0) {
@@ -78,7 +65,6 @@ xfs_inode_item_size(
*nvecs += 1;
}
break;
-
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
ip->i_df.if_bytes > 0) {
@@ -90,19 +76,20 @@ xfs_inode_item_size(
case XFS_DINODE_FMT_DEV:
case XFS_DINODE_FMT_UUID:
break;
-
default:
ASSERT(0);
break;
}
+}
- if (!XFS_IFORK_Q(ip))
- return;
-
+STATIC void
+xfs_inode_item_attr_fork_size(
+ struct xfs_inode_log_item *iip,
+ int *nvecs,
+ int *nbytes)
+{
+ struct xfs_inode *ip = iip->ili_inode;
- /*
- * Log any necessary attribute data.
- */
switch (ip->i_d.di_aformat) {
case XFS_DINODE_FMT_EXTENTS:
if ((iip->ili_fields & XFS_ILOG_AEXT) &&
@@ -113,7 +100,6 @@ xfs_inode_item_size(
*nvecs += 1;
}
break;
-
case XFS_DINODE_FMT_BTREE:
if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
ip->i_afp->if_broot_bytes > 0) {
@@ -121,7 +107,6 @@ xfs_inode_item_size(
*nvecs += 1;
}
break;
-
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
ip->i_afp->if_bytes > 0) {
@@ -129,7 +114,6 @@ xfs_inode_item_size(
*nvecs += 1;
}
break;
-
default:
ASSERT(0);
break;
@@ -137,98 +121,39 @@ xfs_inode_item_size(
}
/*
- * xfs_inode_item_format_extents - convert in-core extents to on-disk form
- *
- * For either the data or attr fork in extent format, we need to endian convert
- * the in-core extent as we place them into the on-disk inode. In this case, we
- * need to do this conversion before we write the extents into the log. Because
- * we don't have the disk inode to write into here, we allocate a buffer and
- * format the extents into it via xfs_iextents_copy(). We free the buffer in
- * the unlock routine after the copy for the log has been made.
+ * This returns the number of iovecs needed to log the given inode item.
*
- * In the case of the data fork, the in-core and on-disk fork sizes can be
- * different due to delayed allocation extents. We only log on-disk extents
- * here, so always use the physical fork size to determine the size of the
- * buffer we need to allocate.
+ * We need one iovec for the inode log format structure, one for the
+ * inode core, and possibly one for the inode data/extents/b-tree root
+ * and one for the inode attribute data/extents/b-tree root.
*/
STATIC void
-xfs_inode_item_format_extents(
- struct xfs_inode *ip,
- struct xfs_log_iovec *vecp,
- int whichfork,
- int type)
+xfs_inode_item_size(
+ struct xfs_log_item *lip,
+ int *nvecs,
+ int *nbytes)
{
- xfs_bmbt_rec_t *ext_buffer;
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
- ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP);
- if (whichfork == XFS_DATA_FORK)
- ip->i_itemp->ili_extents_buf = ext_buffer;
- else
- ip->i_itemp->ili_aextents_buf = ext_buffer;
+ *nvecs += 2;
+ *nbytes += sizeof(struct xfs_inode_log_format) +
+ xfs_icdinode_size(ip->i_d.di_version);
- vecp->i_addr = ext_buffer;
- vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork);
- vecp->i_type = type;
+ xfs_inode_item_data_fork_size(iip, nvecs, nbytes);
+ if (XFS_IFORK_Q(ip))
+ xfs_inode_item_attr_fork_size(iip, nvecs, nbytes);
}
-/*
- * This is called to fill in the vector of log iovecs for the
- * given inode log item. It fills the first item with an inode
- * log format structure, the second with the on-disk inode structure,
- * and a possible third and/or fourth with the inode data/extents/b-tree
- * root and inode attributes data/extents/b-tree root.
- */
STATIC void
-xfs_inode_item_format(
- struct xfs_log_item *lip,
- struct xfs_log_iovec *vecp)
+xfs_inode_item_format_data_fork(
+ struct xfs_inode_log_item *iip,
+ struct xfs_inode_log_format *ilf,
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp)
{
- struct xfs_inode_log_item *iip = INODE_ITEM(lip);
struct xfs_inode *ip = iip->ili_inode;
- uint nvecs;
size_t data_bytes;
- xfs_mount_t *mp;
-
- vecp->i_addr = &iip->ili_format;
- vecp->i_len = sizeof(xfs_inode_log_format_t);
- vecp->i_type = XLOG_REG_TYPE_IFORMAT;
- vecp++;
- nvecs = 1;
-
- vecp->i_addr = &ip->i_d;
- vecp->i_len = xfs_icdinode_size(ip->i_d.di_version);
- vecp->i_type = XLOG_REG_TYPE_ICORE;
- vecp++;
- nvecs++;
-
- /*
- * If this is really an old format inode, then we need to
- * log it as such. This means that we have to copy the link
- * count from the new field to the old. We don't have to worry
- * about the new fields, because nothing trusts them as long as
- * the old inode version number is there. If the superblock already
- * has a new version number, then we don't bother converting back.
- */
- mp = ip->i_mount;
- ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
- if (ip->i_d.di_version == 1) {
- if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
- /*
- * Convert it back.
- */
- ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
- ip->i_d.di_onlink = ip->i_d.di_nlink;
- } else {
- /*
- * The superblock version has already been bumped,
- * so just make the conversion to the new inode
- * format permanent.
- */
- ip->i_d.di_version = 2;
- ip->i_d.di_onlink = 0;
- memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
- }
- }
switch (ip->i_d.di_format) {
case XFS_DINODE_FMT_EXTENTS:
@@ -239,36 +164,23 @@ xfs_inode_item_format(
if ((iip->ili_fields & XFS_ILOG_DEXT) &&
ip->i_d.di_nextents > 0 &&
ip->i_df.if_bytes > 0) {
+ struct xfs_bmbt_rec *p;
+
ASSERT(ip->i_df.if_u1.if_extents != NULL);
ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0);
- ASSERT(iip->ili_extents_buf == NULL);
-
-#ifdef XFS_NATIVE_HOST
- if (ip->i_d.di_nextents == ip->i_df.if_bytes /
- (uint)sizeof(xfs_bmbt_rec_t)) {
- /*
- * There are no delayed allocation
- * extents, so just point to the
- * real extents array.
- */
- vecp->i_addr = ip->i_df.if_u1.if_extents;
- vecp->i_len = ip->i_df.if_bytes;
- vecp->i_type = XLOG_REG_TYPE_IEXT;
- } else
-#endif
- {
- xfs_inode_item_format_extents(ip, vecp,
- XFS_DATA_FORK, XLOG_REG_TYPE_IEXT);
- }
- ASSERT(vecp->i_len <= ip->i_df.if_bytes);
- iip->ili_format.ilf_dsize = vecp->i_len;
- vecp++;
- nvecs++;
+
+ p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT);
+ data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK);
+ xlog_finish_iovec(lv, *vecp, data_bytes);
+
+ ASSERT(data_bytes <= ip->i_df.if_bytes);
+
+ ilf->ilf_dsize = data_bytes;
+ ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_DEXT;
}
break;
-
case XFS_DINODE_FMT_BTREE:
iip->ili_fields &=
~(XFS_ILOG_DDATA | XFS_ILOG_DEXT |
@@ -277,80 +189,70 @@ xfs_inode_item_format(
if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
ip->i_df.if_broot_bytes > 0) {
ASSERT(ip->i_df.if_broot != NULL);
- vecp->i_addr = ip->i_df.if_broot;
- vecp->i_len = ip->i_df.if_broot_bytes;
- vecp->i_type = XLOG_REG_TYPE_IBROOT;
- vecp++;
- nvecs++;
- iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
+ xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IBROOT,
+ ip->i_df.if_broot,
+ ip->i_df.if_broot_bytes);
+ ilf->ilf_dsize = ip->i_df.if_broot_bytes;
+ ilf->ilf_size++;
} else {
ASSERT(!(iip->ili_fields &
XFS_ILOG_DBROOT));
iip->ili_fields &= ~XFS_ILOG_DBROOT;
}
break;
-
case XFS_DINODE_FMT_LOCAL:
iip->ili_fields &=
~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT |
XFS_ILOG_DEV | XFS_ILOG_UUID);
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
ip->i_df.if_bytes > 0) {
- ASSERT(ip->i_df.if_u1.if_data != NULL);
- ASSERT(ip->i_d.di_size > 0);
-
- vecp->i_addr = ip->i_df.if_u1.if_data;
/*
* Round i_bytes up to a word boundary.
* The underlying memory is guaranteed to
* to be there by xfs_idata_realloc().
*/
data_bytes = roundup(ip->i_df.if_bytes, 4);
- ASSERT((ip->i_df.if_real_bytes == 0) ||
- (ip->i_df.if_real_bytes == data_bytes));
- vecp->i_len = (int)data_bytes;
- vecp->i_type = XLOG_REG_TYPE_ILOCAL;
- vecp++;
- nvecs++;
- iip->ili_format.ilf_dsize = (unsigned)data_bytes;
+ ASSERT(ip->i_df.if_real_bytes == 0 ||
+ ip->i_df.if_real_bytes == data_bytes);
+ ASSERT(ip->i_df.if_u1.if_data != NULL);
+ ASSERT(ip->i_d.di_size > 0);
+ xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
+ ip->i_df.if_u1.if_data, data_bytes);
+ ilf->ilf_dsize = (unsigned)data_bytes;
+ ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_DDATA;
}
break;
-
case XFS_DINODE_FMT_DEV:
iip->ili_fields &=
~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
XFS_ILOG_DEXT | XFS_ILOG_UUID);
- if (iip->ili_fields & XFS_ILOG_DEV) {
- iip->ili_format.ilf_u.ilfu_rdev =
- ip->i_df.if_u2.if_rdev;
- }
+ if (iip->ili_fields & XFS_ILOG_DEV)
+ ilf->ilf_u.ilfu_rdev = ip->i_df.if_u2.if_rdev;
break;
-
case XFS_DINODE_FMT_UUID:
iip->ili_fields &=
~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT |
XFS_ILOG_DEXT | XFS_ILOG_DEV);
- if (iip->ili_fields & XFS_ILOG_UUID) {
- iip->ili_format.ilf_u.ilfu_uuid =
- ip->i_df.if_u2.if_uuid;
- }
+ if (iip->ili_fields & XFS_ILOG_UUID)
+ ilf->ilf_u.ilfu_uuid = ip->i_df.if_u2.if_uuid;
break;
-
default:
ASSERT(0);
break;
}
+}
- /*
- * If there are no attributes associated with the file, then we're done.
- */
- if (!XFS_IFORK_Q(ip)) {
- iip->ili_fields &=
- ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
- goto out;
- }
+STATIC void
+xfs_inode_item_format_attr_fork(
+ struct xfs_inode_log_item *iip,
+ struct xfs_inode_log_format *ilf,
+ struct xfs_log_vec *lv,
+ struct xfs_log_iovec **vecp)
+{
+ struct xfs_inode *ip = iip->ili_inode;
+ size_t data_bytes;
switch (ip->i_d.di_aformat) {
case XFS_DINODE_FMT_EXTENTS:
@@ -360,30 +262,22 @@ xfs_inode_item_format(
if ((iip->ili_fields & XFS_ILOG_AEXT) &&
ip->i_d.di_anextents > 0 &&
ip->i_afp->if_bytes > 0) {
+ struct xfs_bmbt_rec *p;
+
ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) ==
ip->i_d.di_anextents);
ASSERT(ip->i_afp->if_u1.if_extents != NULL);
-#ifdef XFS_NATIVE_HOST
- /*
- * There are not delayed allocation extents
- * for attributes, so just point at the array.
- */
- vecp->i_addr = ip->i_afp->if_u1.if_extents;
- vecp->i_len = ip->i_afp->if_bytes;
- vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
-#else
- ASSERT(iip->ili_aextents_buf == NULL);
- xfs_inode_item_format_extents(ip, vecp,
- XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT);
-#endif
- iip->ili_format.ilf_asize = vecp->i_len;
- vecp++;
- nvecs++;
+
+ p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT);
+ data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK);
+ xlog_finish_iovec(lv, *vecp, data_bytes);
+
+ ilf->ilf_asize = data_bytes;
+ ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_AEXT;
}
break;
-
case XFS_DINODE_FMT_BTREE:
iip->ili_fields &=
~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
@@ -392,61 +286,89 @@ xfs_inode_item_format(
ip->i_afp->if_broot_bytes > 0) {
ASSERT(ip->i_afp->if_broot != NULL);
- vecp->i_addr = ip->i_afp->if_broot;
- vecp->i_len = ip->i_afp->if_broot_bytes;
- vecp->i_type = XLOG_REG_TYPE_IATTR_BROOT;
- vecp++;
- nvecs++;
- iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
+ xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT,
+ ip->i_afp->if_broot,
+ ip->i_afp->if_broot_bytes);
+ ilf->ilf_asize = ip->i_afp->if_broot_bytes;
+ ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_ABROOT;
}
break;
-
case XFS_DINODE_FMT_LOCAL:
iip->ili_fields &=
~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
ip->i_afp->if_bytes > 0) {
- ASSERT(ip->i_afp->if_u1.if_data != NULL);
-
- vecp->i_addr = ip->i_afp->if_u1.if_data;
/*
* Round i_bytes up to a word boundary.
* The underlying memory is guaranteed to
* to be there by xfs_idata_realloc().
*/
data_bytes = roundup(ip->i_afp->if_bytes, 4);
- ASSERT((ip->i_afp->if_real_bytes == 0) ||
- (ip->i_afp->if_real_bytes == data_bytes));
- vecp->i_len = (int)data_bytes;
- vecp->i_type = XLOG_REG_TYPE_IATTR_LOCAL;
- vecp++;
- nvecs++;
- iip->ili_format.ilf_asize = (unsigned)data_bytes;
+ ASSERT(ip->i_afp->if_real_bytes == 0 ||
+ ip->i_afp->if_real_bytes == data_bytes);
+ ASSERT(ip->i_afp->if_u1.if_data != NULL);
+ xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
+ ip->i_afp->if_u1.if_data,
+ data_bytes);
+ ilf->ilf_asize = (unsigned)data_bytes;
+ ilf->ilf_size++;
} else {
iip->ili_fields &= ~XFS_ILOG_ADATA;
}
break;
-
default:
ASSERT(0);
break;
}
-
-out:
- /*
- * Now update the log format that goes out to disk from the in-core
- * values. We always write the inode core to make the arithmetic
- * games in recovery easier, which isn't a big deal as just about any
- * transaction would dirty it anyway.
- */
- iip->ili_format.ilf_fields = XFS_ILOG_CORE |
- (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);
- iip->ili_format.ilf_size = nvecs;
}
+/*
+ * This is called to fill in the vector of log iovecs for the given inode
+ * log item. It fills the first item with an inode log format structure,
+ * the second with the on-disk inode structure, and a possible third and/or
+ * fourth with the inode data/extents/b-tree root and inode attributes
+ * data/extents/b-tree root.
+ */
+STATIC void
+xfs_inode_item_format(
+ struct xfs_log_item *lip,
+ struct xfs_log_vec *lv)
+{
+ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+ struct xfs_inode *ip = iip->ili_inode;
+ struct xfs_inode_log_format *ilf;
+ struct xfs_log_iovec *vecp = NULL;
+
+ ASSERT(ip->i_d.di_version > 1);
+
+ ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT);
+ ilf->ilf_type = XFS_LI_INODE;
+ ilf->ilf_ino = ip->i_ino;
+ ilf->ilf_blkno = ip->i_imap.im_blkno;
+ ilf->ilf_len = ip->i_imap.im_len;
+ ilf->ilf_boffset = ip->i_imap.im_boffset;
+ ilf->ilf_fields = XFS_ILOG_CORE;
+ ilf->ilf_size = 2; /* format + core */
+ xlog_finish_iovec(lv, vecp, sizeof(struct xfs_inode_log_format));
+
+ xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_ICORE,
+ &ip->i_d,
+ xfs_icdinode_size(ip->i_d.di_version));
+
+ xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
+ if (XFS_IFORK_Q(ip)) {
+ xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp);
+ } else {
+ iip->ili_fields &=
+ ~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
+ }
+
+ /* update the format with the exact fields we actually logged */
+ ilf->ilf_fields |= (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);
+}
/*
* This is called to pin the inode associated with the inode log
@@ -563,27 +485,6 @@ xfs_inode_item_unlock(
ASSERT(ip->i_itemp != NULL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- /*
- * If the inode needed a separate buffer with which to log
- * its extents, then free it now.
- */
- if (iip->ili_extents_buf != NULL) {
- ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS);
- ASSERT(ip->i_d.di_nextents > 0);
- ASSERT(iip->ili_fields & XFS_ILOG_DEXT);
- ASSERT(ip->i_df.if_bytes > 0);
- kmem_free(iip->ili_extents_buf);
- iip->ili_extents_buf = NULL;
- }
- if (iip->ili_aextents_buf != NULL) {
- ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS);
- ASSERT(ip->i_d.di_anextents > 0);
- ASSERT(iip->ili_fields & XFS_ILOG_AEXT);
- ASSERT(ip->i_afp->if_bytes > 0);
- kmem_free(iip->ili_aextents_buf);
- iip->ili_aextents_buf = NULL;
- }
-
lock_flags = iip->ili_lock_flags;
iip->ili_lock_flags = 0;
if (lock_flags)
@@ -670,11 +571,6 @@ xfs_inode_item_init(
iip->ili_inode = ip;
xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
&xfs_inode_item_ops);
- iip->ili_format.ilf_type = XFS_LI_INODE;
- iip->ili_format.ilf_ino = ip->i_ino;
- iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
- iip->ili_format.ilf_len = ip->i_imap.im_len;
- iip->ili_format.ilf_boffset = ip->i_imap.im_boffset;
}
/*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index dce4d656768..488d81254e2 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -34,11 +34,6 @@ typedef struct xfs_inode_log_item {
unsigned short ili_logged; /* flushed logged data */
unsigned int ili_last_fields; /* fields when flushed */
unsigned int ili_fields; /* fields to be logged */
- struct xfs_bmbt_rec *ili_extents_buf; /* array of logged
- data exts */
- struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged
- attr exts */
- xfs_inode_log_format_t ili_format; /* logged structure */
} xfs_inode_log_item_t;
static inline int xfs_inode_clean(xfs_inode_t *ip)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 668e8f4ccf5..8bc1bbce745 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -17,32 +17,31 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_alloc.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_ioctl.h"
+#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
#include "xfs_itable.h"
#include "xfs_error.h"
#include "xfs_attr.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
-#include "xfs_buf_item.h"
#include "xfs_fsops.h"
#include "xfs_discard.h"
#include "xfs_quota.h"
-#include "xfs_inode_item.h"
#include "xfs_export.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_symlink.h"
+#include "xfs_dinode.h"
+#include "xfs_trans.h"
#include <linux/capability.h>
#include <linux/dcache.h>
@@ -113,15 +112,11 @@ xfs_find_handle(
memset(&handle.ha_fid, 0, sizeof(handle.ha_fid));
hsize = sizeof(xfs_fsid_t);
} else {
- int lock_mode;
-
- lock_mode = xfs_ilock_map_shared(ip);
handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
sizeof(handle.ha_fid.fid_len);
handle.ha_fid.fid_pad = 0;
handle.ha_fid.fid_gen = ip->i_d.di_gen;
handle.ha_fid.fid_ino = ip->i_ino;
- xfs_iunlock_map_shared(ip, lock_mode);
hsize = XFS_HSIZE(handle);
}
@@ -276,32 +271,6 @@ xfs_open_by_handle(
return error;
}
-/*
- * This is a copy from fs/namei.c:vfs_readlink(), except for removing it's
- * unused first argument.
- */
-STATIC int
-do_readlink(
- char __user *buffer,
- int buflen,
- const char *link)
-{
- int len;
-
- len = PTR_ERR(link);
- if (IS_ERR(link))
- goto out;
-
- len = strlen(link);
- if (len > (unsigned) buflen)
- len = buflen;
- if (copy_to_user(buffer, link, len))
- len = -EFAULT;
- out:
- return len;
-}
-
-
int
xfs_readlink_by_handle(
struct file *parfilp,
@@ -339,7 +308,7 @@ xfs_readlink_by_handle(
error = -xfs_readlink(XFS_I(dentry->d_inode), link);
if (error)
goto out_kfree;
- error = do_readlink(hreq->ohandle, olen, link);
+ error = readlink_copy(hreq->ohandle, olen, link);
if (error)
goto out_kfree;
@@ -443,7 +412,8 @@ xfs_attrlist_by_handle(
return -XFS_ERROR(EPERM);
if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
return -XFS_ERROR(EFAULT);
- if (al_hreq.buflen > XATTR_LIST_MAX)
+ if (al_hreq.buflen < sizeof(struct attrlist) ||
+ al_hreq.buflen > XATTR_LIST_MAX)
return -XFS_ERROR(EINVAL);
/*
@@ -573,10 +543,11 @@ xfs_attrmulti_by_handle(
ops = memdup_user(am_hreq.ops, size);
if (IS_ERR(ops)) {
- error = PTR_ERR(ops);
+ error = -PTR_ERR(ops);
goto out_dput;
}
+ error = ENOMEM;
attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
if (!attr_name)
goto out_kfree_ops;
@@ -586,7 +557,7 @@ xfs_attrmulti_by_handle(
ops[i].am_error = strncpy_from_user((char *)attr_name,
ops[i].am_attrname, MAXNAMELEN);
if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
- error = -ERANGE;
+ error = ERANGE;
if (ops[i].am_error < 0)
break;
@@ -641,7 +612,11 @@ xfs_ioc_space(
unsigned int cmd,
xfs_flock64_t *bf)
{
- int attr_flags = 0;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ struct iattr iattr;
+ bool setprealloc = false;
+ bool clrprealloc = false;
int error;
/*
@@ -661,19 +636,128 @@ xfs_ioc_space(
if (!S_ISREG(inode->i_mode))
return -XFS_ERROR(EINVAL);
- if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
- attr_flags |= XFS_ATTR_NONBLOCK;
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
- if (filp->f_flags & O_DSYNC)
- attr_flags |= XFS_ATTR_SYNC;
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+ switch (bf->l_whence) {
+ case 0: /*SEEK_SET*/
+ break;
+ case 1: /*SEEK_CUR*/
+ bf->l_start += filp->f_pos;
+ break;
+ case 2: /*SEEK_END*/
+ bf->l_start += XFS_ISIZE(ip);
+ break;
+ default:
+ error = XFS_ERROR(EINVAL);
+ goto out_unlock;
+ }
- if (ioflags & IO_INVIS)
- attr_flags |= XFS_ATTR_DMI;
+ /*
+ * length of <= 0 for resv/unresv/zero is invalid. length for
+ * alloc/free is ignored completely and we have no idea what userspace
+ * might have set it to, so set it to zero to allow range
+ * checks to pass.
+ */
+ switch (cmd) {
+ case XFS_IOC_ZERO_RANGE:
+ case XFS_IOC_RESVSP:
+ case XFS_IOC_RESVSP64:
+ case XFS_IOC_UNRESVSP:
+ case XFS_IOC_UNRESVSP64:
+ if (bf->l_len <= 0) {
+ error = XFS_ERROR(EINVAL);
+ goto out_unlock;
+ }
+ break;
+ default:
+ bf->l_len = 0;
+ break;
+ }
+
+ if (bf->l_start < 0 ||
+ bf->l_start > mp->m_super->s_maxbytes ||
+ bf->l_start + bf->l_len < 0 ||
+ bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) {
+ error = XFS_ERROR(EINVAL);
+ goto out_unlock;
+ }
+
+ switch (cmd) {
+ case XFS_IOC_ZERO_RANGE:
+ error = xfs_zero_file_space(ip, bf->l_start, bf->l_len);
+ if (!error)
+ setprealloc = true;
+ break;
+ case XFS_IOC_RESVSP:
+ case XFS_IOC_RESVSP64:
+ error = xfs_alloc_file_space(ip, bf->l_start, bf->l_len,
+ XFS_BMAPI_PREALLOC);
+ if (!error)
+ setprealloc = true;
+ break;
+ case XFS_IOC_UNRESVSP:
+ case XFS_IOC_UNRESVSP64:
+ error = xfs_free_file_space(ip, bf->l_start, bf->l_len);
+ break;
+ case XFS_IOC_ALLOCSP:
+ case XFS_IOC_ALLOCSP64:
+ case XFS_IOC_FREESP:
+ case XFS_IOC_FREESP64:
+ if (bf->l_start > XFS_ISIZE(ip)) {
+ error = xfs_alloc_file_space(ip, XFS_ISIZE(ip),
+ bf->l_start - XFS_ISIZE(ip), 0);
+ if (error)
+ goto out_unlock;
+ }
+
+ iattr.ia_valid = ATTR_SIZE;
+ iattr.ia_size = bf->l_start;
+
+ error = xfs_setattr_size(ip, &iattr);
+ if (!error)
+ clrprealloc = true;
+ break;
+ default:
+ ASSERT(0);
+ error = XFS_ERROR(EINVAL);
+ }
- error = mnt_want_write_file(filp);
if (error)
- return error;
- error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags);
+ goto out_unlock;
+
+ tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ goto out_unlock;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+ if (!(ioflags & IO_INVIS)) {
+ ip->i_d.di_mode &= ~S_ISUID;
+ if (ip->i_d.di_mode & S_IXGRP)
+ ip->i_d.di_mode &= ~S_ISGID;
+ xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+ }
+
+ if (setprealloc)
+ ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
+ else if (clrprealloc)
+ ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ if (filp->f_flags & O_DSYNC)
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp, 0);
+
+out_unlock:
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
mnt_drop_write_file(filp);
return -error;
}
@@ -1132,7 +1216,7 @@ xfs_ioctl_setattr(
* cleared upon successful return from chown()
*/
if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
- !inode_capable(VFS_I(ip), CAP_FSETID))
+ !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
/*
@@ -1144,15 +1228,8 @@ xfs_ioctl_setattr(
olddquot = xfs_qm_vop_chown(tp, ip,
&ip->i_pdquot, pdqp);
}
+ ASSERT(ip->i_d.di_version > 1);
xfs_set_projid(ip, fa->fsx_projid);
-
- /*
- * We may have to rev the inode as well as
- * the superblock version number since projids didn't
- * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
- */
- if (ip->i_d.di_version == 1)
- xfs_bump_ino_vers2(tp, ip);
}
}
@@ -1474,7 +1551,7 @@ xfs_file_ioctl(
XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
- da.d_mem = da.d_miniosz = 1 << target->bt_sshift;
+ da.d_mem = da.d_miniosz = target->bt_logical_sectorsize;
da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
if (copy_to_user(arg, &da, sizeof(da)))
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index f671f7e472a..944d5baa710 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -22,14 +22,13 @@
#include <asm/uaccess.h>
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
#include "xfs_vnode.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_itable.h"
#include "xfs_error.h"
@@ -357,7 +356,8 @@ xfs_compat_attrlist_by_handle(
if (copy_from_user(&al_hreq, arg,
sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
return -XFS_ERROR(EFAULT);
- if (al_hreq.buflen > XATTR_LIST_MAX)
+ if (al_hreq.buflen < sizeof(struct attrlist) ||
+ al_hreq.buflen > XATTR_LIST_MAX)
return -XFS_ERROR(EINVAL);
/*
@@ -424,10 +424,11 @@ xfs_compat_attrmulti_by_handle(
ops = memdup_user(compat_ptr(am_hreq.ops), size);
if (IS_ERR(ops)) {
- error = PTR_ERR(ops);
+ error = -PTR_ERR(ops);
goto out_dput;
}
+ error = ENOMEM;
attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
if (!attr_name)
goto out_kfree_ops;
@@ -438,7 +439,7 @@ xfs_compat_attrmulti_by_handle(
compat_ptr(ops[i].am_attrname),
MAXNAMELEN);
if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
- error = -ERANGE;
+ error = ERANGE;
if (ops[i].am_error < 0)
break;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 8d4d49b6fbf..6d3ec2b6ee2 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -17,34 +17,28 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_inode_item.h"
#include "xfs_btree.h"
+#include "xfs_bmap_btree.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
-#include "xfs_rtalloc.h"
#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
+#include "xfs_trans.h"
#include "xfs_trans_space.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_quota.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
+#include "xfs_dinode.h"
#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
@@ -110,7 +104,7 @@ xfs_alert_fsblock_zero(
xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
"Access to block zero in inode %llu "
"start_block: %llx start_off: %llx "
- "blkcnt: %llx extent-state: %x\n",
+ "blkcnt: %llx extent-state: %x",
(unsigned long long)ip->i_ino,
(unsigned long long)imap->br_startblock,
(unsigned long long)imap->br_startoff,
@@ -134,7 +128,6 @@ xfs_iomap_write_direct(
xfs_fsblock_t firstfsb;
xfs_extlen_t extsz, temp;
int nimaps;
- int bmapi_flag;
int quota_flag;
int rt;
xfs_trans_t *tp;
@@ -206,18 +199,15 @@ xfs_iomap_write_direct(
xfs_trans_ijoin(tp, ip, 0);
- bmapi_flag = 0;
- if (offset < XFS_ISIZE(ip) || extsz)
- bmapi_flag |= XFS_BMAPI_PREALLOC;
-
/*
* From this point onwards we overwrite the imap pointer that the
* caller gave to us.
*/
xfs_bmap_init(&free_list, &firstfsb);
nimaps = 1;
- error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flag,
- &firstfsb, 0, imap, &nimaps, &free_list);
+ error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
+ XFS_BMAPI_PREALLOC, &firstfsb, 0,
+ imap, &nimaps, &free_list);
if (error)
goto out_bmap_cancel;
@@ -655,7 +645,6 @@ int
xfs_iomap_write_allocate(
xfs_inode_t *ip,
xfs_off_t offset,
- size_t count,
xfs_bmbt_irec_t *imap)
{
xfs_mount_t *mp = ip->i_mount;
@@ -741,7 +730,7 @@ xfs_iomap_write_allocate(
*/
nimaps = 1;
end_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
- error = xfs_bmap_last_offset(NULL, ip, &last_block,
+ error = xfs_bmap_last_offset(ip, &last_block,
XFS_DATA_FORK);
if (error)
goto trans_cancel;
@@ -760,8 +749,7 @@ xfs_iomap_write_allocate(
* pointer that the caller gave to us.
*/
error = xfs_bmapi_write(tp, ip, map_start_fsb,
- count_fsb,
- XFS_BMAPI_STACK_SWITCH,
+ count_fsb, 0,
&first_block, 1,
imap, &nimaps, &free_list);
if (error)
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 80615760959..411fbb8919e 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -21,12 +21,12 @@
struct xfs_inode;
struct xfs_bmbt_irec;
-extern int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
+int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
struct xfs_bmbt_irec *, int);
-extern int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
+int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
struct xfs_bmbt_irec *);
-extern int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t, size_t,
+int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
struct xfs_bmbt_irec *);
-extern int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
+int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 2b8952d9199..205613a0606 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -17,32 +17,29 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
-#include "xfs_acl.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
+#include "xfs_da_format.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
-#include "xfs_rtalloc.h"
+#include "xfs_acl.h"
+#include "xfs_quota.h"
#include "xfs_error.h"
-#include "xfs_itable.h"
#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_inode_item.h"
+#include "xfs_trans.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_symlink.h"
#include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
#include "xfs_dir2_priv.h"
+#include "xfs_dinode.h"
+#include "xfs_trans_space.h"
#include <linux/capability.h>
#include <linux/xattr.h>
@@ -52,6 +49,18 @@
#include <linux/fiemap.h>
#include <linux/slab.h>
+/*
+ * Directories have different lock order w.r.t. mmap_sem compared to regular
+ * files. This is due to readdir potentially triggering page faults on a user
+ * buffer inside filldir(), and this happens with the ilock on the directory
+ * held. For regular files, the lock order is the other way around - the
+ * mmap_sem is taken during the page fault, and then we lock the ilock to do
+ * block mapping. Hence we need a different class for the directory ilock so
+ * that lockdep can tell them apart.
+ */
+static struct lock_class_key xfs_nondir_ilock_class;
+static struct lock_class_key xfs_dir_ilock_class;
+
static int
xfs_initxattrs(
struct inode *inode,
@@ -63,8 +72,8 @@ xfs_initxattrs(
int error = 0;
for (xattr = xattr_array; xattr->name != NULL; xattr++) {
- error = xfs_attr_set(ip, xattr->name, xattr->value,
- xattr->value_len, ATTR_SECURE);
+ error = -xfs_attr_set(ip, xattr->name, xattr->value,
+ xattr->value_len, ATTR_SECURE);
if (error < 0)
break;
}
@@ -84,8 +93,8 @@ xfs_init_security(
struct inode *dir,
const struct qstr *qstr)
{
- return security_inode_init_security(inode, dir, qstr,
- &xfs_initxattrs, NULL);
+ return -security_inode_init_security(inode, dir, qstr,
+ &xfs_initxattrs, NULL);
}
static void
@@ -115,19 +124,19 @@ xfs_cleanup_inode(
xfs_dentry_to_name(&teardown, dentry, 0);
xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
- iput(inode);
}
STATIC int
-xfs_vn_mknod(
+xfs_generic_create(
struct inode *dir,
struct dentry *dentry,
umode_t mode,
- dev_t rdev)
+ dev_t rdev,
+ bool tmpfile) /* unnamed file */
{
struct inode *inode;
struct xfs_inode *ip = NULL;
- struct posix_acl *default_acl = NULL;
+ struct posix_acl *default_acl, *acl;
struct xfs_name name;
int error;
@@ -143,17 +152,16 @@ xfs_vn_mknod(
rdev = 0;
}
- if (IS_POSIXACL(dir)) {
- default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT);
- if (IS_ERR(default_acl))
- return PTR_ERR(default_acl);
+ error = posix_acl_create(dir, &mode, &default_acl, &acl);
+ if (error)
+ return error;
- if (!default_acl)
- mode &= ~current_umask();
+ if (!tmpfile) {
+ xfs_dentry_to_name(&name, dentry, mode);
+ error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
+ } else {
+ error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip);
}
-
- xfs_dentry_to_name(&name, dentry, mode);
- error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
if (unlikely(error))
goto out_free_acl;
@@ -163,22 +171,46 @@ xfs_vn_mknod(
if (unlikely(error))
goto out_cleanup_inode;
+#ifdef CONFIG_XFS_POSIX_ACL
if (default_acl) {
- error = -xfs_inherit_acl(inode, default_acl);
- default_acl = NULL;
- if (unlikely(error))
+ error = -xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+ if (error)
+ goto out_cleanup_inode;
+ }
+ if (acl) {
+ error = -xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
+ if (error)
goto out_cleanup_inode;
}
+#endif
+ if (tmpfile)
+ d_tmpfile(dentry, inode);
+ else
+ d_instantiate(dentry, inode);
- d_instantiate(dentry, inode);
+ out_free_acl:
+ if (default_acl)
+ posix_acl_release(default_acl);
+ if (acl)
+ posix_acl_release(acl);
return -error;
out_cleanup_inode:
- xfs_cleanup_inode(dir, inode, dentry);
- out_free_acl:
- posix_acl_release(default_acl);
- return -error;
+ if (!tmpfile)
+ xfs_cleanup_inode(dir, inode, dentry);
+ iput(inode);
+ goto out_free_acl;
+}
+
+STATIC int
+xfs_vn_mknod(
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode,
+ dev_t rdev)
+{
+ return xfs_generic_create(dir, dentry, mode, rdev, false);
}
STATIC int
@@ -341,6 +373,7 @@ xfs_vn_symlink(
out_cleanup_inode:
xfs_cleanup_inode(dir, inode, dentry);
+ iput(inode);
out:
return -error;
}
@@ -395,18 +428,6 @@ xfs_vn_follow_link(
return NULL;
}
-STATIC void
-xfs_vn_put_link(
- struct dentry *dentry,
- struct nameidata *nd,
- void *p)
-{
- char *s = nd_get_link(nd);
-
- if (!IS_ERR(s))
- kfree(s);
-}
-
STATIC int
xfs_vn_getattr(
struct vfsmount *mnt,
@@ -463,14 +484,12 @@ xfs_vn_getattr(
static void
xfs_setattr_mode(
- struct xfs_trans *tp,
struct xfs_inode *ip,
struct iattr *iattr)
{
- struct inode *inode = VFS_I(ip);
- umode_t mode = iattr->ia_mode;
+ struct inode *inode = VFS_I(ip);
+ umode_t mode = iattr->ia_mode;
- ASSERT(tp);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ip->i_d.di_mode &= S_IFMT;
@@ -480,6 +499,32 @@ xfs_setattr_mode(
inode->i_mode |= mode & ~S_IFMT;
}
+static void
+xfs_setattr_time(
+ struct xfs_inode *ip,
+ struct iattr *iattr)
+{
+ struct inode *inode = VFS_I(ip);
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ if (iattr->ia_valid & ATTR_ATIME) {
+ inode->i_atime = iattr->ia_atime;
+ ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
+ ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
+ }
+ if (iattr->ia_valid & ATTR_CTIME) {
+ inode->i_ctime = iattr->ia_ctime;
+ ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
+ ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
+ }
+ if (iattr->ia_valid & ATTR_MTIME) {
+ inode->i_mtime = iattr->ia_mtime;
+ ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
+ ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
+ }
+}
+
int
xfs_setattr_nonsize(
struct xfs_inode *ip,
@@ -622,7 +667,8 @@ xfs_setattr_nonsize(
}
if (!gid_eq(igid, gid)) {
if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
- ASSERT(!XFS_IS_PQUOTA_ON(mp));
+ ASSERT(xfs_sb_version_has_pquotino(&mp->m_sb) ||
+ !XFS_IS_PQUOTA_ON(mp));
ASSERT(mask & ATTR_GID);
ASSERT(gdqp);
olddquot2 = xfs_qm_vop_chown(tp, ip,
@@ -633,30 +679,10 @@ xfs_setattr_nonsize(
}
}
- /*
- * Change file access modes.
- */
if (mask & ATTR_MODE)
- xfs_setattr_mode(tp, ip, iattr);
-
- /*
- * Change file access or modified times.
- */
- if (mask & ATTR_ATIME) {
- inode->i_atime = iattr->ia_atime;
- ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
- ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
- }
- if (mask & ATTR_CTIME) {
- inode->i_ctime = iattr->ia_ctime;
- ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
- ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
- }
- if (mask & ATTR_MTIME) {
- inode->i_mtime = iattr->ia_mtime;
- ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
- ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
- }
+ xfs_setattr_mode(ip, iattr);
+ if (mask & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
+ xfs_setattr_time(ip, iattr);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -687,7 +713,7 @@ xfs_setattr_nonsize(
* Posix ACL code seems to care about this issue either.
*/
if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) {
- error = -xfs_acl_chmod(inode);
+ error = -posix_acl_chmod(inode, inode->i_mode);
if (error)
return XFS_ERROR(error);
}
@@ -709,12 +735,10 @@ out_dqrele:
int
xfs_setattr_size(
struct xfs_inode *ip,
- struct iattr *iattr,
- int flags)
+ struct iattr *iattr)
{
struct xfs_mount *mp = ip->i_mount;
struct inode *inode = VFS_I(ip);
- int mask = iattr->ia_valid;
xfs_off_t oldsize, newsize;
struct xfs_trans *tp;
int error;
@@ -733,14 +757,10 @@ xfs_setattr_size(
if (error)
return XFS_ERROR(error);
+ ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
ASSERT(S_ISREG(ip->i_d.di_mode));
- ASSERT((mask & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
- ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
-
- if (!(flags & XFS_ATTR_NOLOCK)) {
- lock_flags |= XFS_IOLOCK_EXCL;
- xfs_ilock(ip, lock_flags);
- }
+ ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
+ ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
oldsize = inode->i_size;
newsize = iattr->ia_size;
@@ -749,13 +769,12 @@ xfs_setattr_size(
* Short circuit the truncate case for zero length files.
*/
if (newsize == 0 && oldsize == 0 && ip->i_d.di_nextents == 0) {
- if (!(mask & (ATTR_CTIME|ATTR_MTIME)))
- goto out_unlock;
+ if (!(iattr->ia_valid & (ATTR_CTIME|ATTR_MTIME)))
+ return 0;
/*
* Use the regular setattr path to update the timestamps.
*/
- xfs_iunlock(ip, lock_flags);
iattr->ia_valid &= ~ATTR_SIZE;
return xfs_setattr_nonsize(ip, iattr, 0);
}
@@ -765,7 +784,7 @@ xfs_setattr_size(
*/
error = xfs_qm_dqattach(ip, 0);
if (error)
- goto out_unlock;
+ return error;
/*
* Now we can make the changes. Before we join the inode to the
@@ -783,7 +802,7 @@ xfs_setattr_size(
*/
error = xfs_zero_eof(ip, newsize, oldsize);
if (error)
- goto out_unlock;
+ return error;
}
/*
@@ -802,7 +821,7 @@ xfs_setattr_size(
error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
ip->i_d.di_size, newsize);
if (error)
- goto out_unlock;
+ return error;
}
/*
@@ -810,22 +829,34 @@ xfs_setattr_size(
*/
inode_dio_wait(inode);
+ /*
+ * Do all the page cache truncate work outside the transaction context
+ * as the "lock" order is page lock->log space reservation. i.e.
+ * locking pages inside the transaction can ABBA deadlock with
+ * writeback. We have to do the VFS inode size update before we truncate
+ * the pagecache, however, to avoid racing with page faults beyond the
+ * new EOF they are not serialised against truncate operations except by
+ * page locks and size updates.
+ *
+ * Hence we are in a situation where a truncate can fail with ENOMEM
+ * from xfs_trans_reserve(), but having already truncated the in-memory
+ * version of the file (i.e. made user visible changes). There's not
+ * much we can do about this, except to hope that the caller sees ENOMEM
+ * and retries the truncate operation.
+ */
error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
if (error)
- goto out_unlock;
+ return error;
+ truncate_setsize(inode, newsize);
tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error)
goto out_trans_cancel;
- truncate_setsize(inode, newsize);
-
commit_flags = XFS_TRANS_RELEASE_LOG_RES;
lock_flags |= XFS_ILOCK_EXCL;
-
xfs_ilock(ip, XFS_ILOCK_EXCL);
-
xfs_trans_ijoin(tp, ip, 0);
/*
@@ -838,10 +869,11 @@ xfs_setattr_size(
* these flags set. For all other operations the VFS set these flags
* explicitly if it wants a timestamp update.
*/
- if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
+ if (newsize != oldsize &&
+ !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) {
iattr->ia_ctime = iattr->ia_mtime =
current_fs_time(inode->i_sb);
- mask |= ATTR_CTIME | ATTR_MTIME;
+ iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME;
}
/*
@@ -877,22 +909,10 @@ xfs_setattr_size(
xfs_inode_clear_eofblocks_tag(ip);
}
- /*
- * Change file access modes.
- */
- if (mask & ATTR_MODE)
- xfs_setattr_mode(tp, ip, iattr);
-
- if (mask & ATTR_CTIME) {
- inode->i_ctime = iattr->ia_ctime;
- ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
- ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
- }
- if (mask & ATTR_MTIME) {
- inode->i_mtime = iattr->ia_mtime;
- ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
- ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
- }
+ if (iattr->ia_valid & ATTR_MODE)
+ xfs_setattr_mode(ip, iattr);
+ if (iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME))
+ xfs_setattr_time(ip, iattr);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -916,12 +936,21 @@ out_trans_cancel:
STATIC int
xfs_vn_setattr(
- struct dentry *dentry,
- struct iattr *iattr)
+ struct dentry *dentry,
+ struct iattr *iattr)
{
- if (iattr->ia_valid & ATTR_SIZE)
- return -xfs_setattr_size(XFS_I(dentry->d_inode), iattr, 0);
- return -xfs_setattr_nonsize(XFS_I(dentry->d_inode), iattr, 0);
+ struct xfs_inode *ip = XFS_I(dentry->d_inode);
+ int error;
+
+ if (iattr->ia_valid & ATTR_SIZE) {
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+ error = xfs_setattr_size(ip, iattr);
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ } else {
+ error = xfs_setattr_nonsize(ip, iattr, 0);
+ }
+
+ return -error;
}
STATIC int
@@ -1051,8 +1080,18 @@ xfs_vn_fiemap(
return 0;
}
+STATIC int
+xfs_vn_tmpfile(
+ struct inode *dir,
+ struct dentry *dentry,
+ umode_t mode)
+{
+ return xfs_generic_create(dir, dentry, mode, 0, true);
+}
+
static const struct inode_operations xfs_inode_operations = {
.get_acl = xfs_get_acl,
+ .set_acl = xfs_set_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
.setxattr = generic_setxattr,
@@ -1080,6 +1119,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
.mknod = xfs_vn_mknod,
.rename = xfs_vn_rename,
.get_acl = xfs_get_acl,
+ .set_acl = xfs_set_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
.setxattr = generic_setxattr,
@@ -1087,6 +1127,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
.removexattr = generic_removexattr,
.listxattr = xfs_vn_listxattr,
.update_time = xfs_vn_update_time,
+ .tmpfile = xfs_vn_tmpfile,
};
static const struct inode_operations xfs_dir_ci_inode_operations = {
@@ -1106,6 +1147,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
.mknod = xfs_vn_mknod,
.rename = xfs_vn_rename,
.get_acl = xfs_get_acl,
+ .set_acl = xfs_set_acl,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
.setxattr = generic_setxattr,
@@ -1113,13 +1155,13 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
.removexattr = generic_removexattr,
.listxattr = xfs_vn_listxattr,
.update_time = xfs_vn_update_time,
+ .tmpfile = xfs_vn_tmpfile,
};
static const struct inode_operations xfs_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = xfs_vn_follow_link,
- .put_link = xfs_vn_put_link,
- .get_acl = xfs_get_acl,
+ .put_link = kfree_put_link,
.getattr = xfs_vn_getattr,
.setattr = xfs_vn_setattr,
.setxattr = generic_setxattr,
@@ -1169,6 +1211,7 @@ xfs_setup_inode(
struct xfs_inode *ip)
{
struct inode *inode = &ip->i_vnode;
+ gfp_t gfp_mask;
inode->i_ino = ip->i_ino;
inode->i_state = I_NEW;
@@ -1204,6 +1247,8 @@ xfs_setup_inode(
inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
xfs_diflags_to_iflags(inode, ip);
+ ip->d_ops = ip->i_mount->m_nondir_inode_ops;
+ lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_op = &xfs_inode_operations;
@@ -1211,11 +1256,13 @@ xfs_setup_inode(
inode->i_mapping->a_ops = &xfs_address_space_operations;
break;
case S_IFDIR:
+ lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
inode->i_op = &xfs_dir_ci_inode_operations;
else
inode->i_op = &xfs_dir_inode_operations;
inode->i_fop = &xfs_dir_file_operations;
+ ip->d_ops = ip->i_mount->m_dir_inode_ops;
break;
case S_IFLNK:
inode->i_op = &xfs_symlink_inode_operations;
@@ -1229,6 +1276,14 @@ xfs_setup_inode(
}
/*
+ * Ensure all page cache allocations are done from GFP_NOFS context to
+ * prevent direct reclaim recursion back into the filesystem and blowing
+ * stacks or deadlocking.
+ */
+ gfp_mask = mapping_gfp_mask(inode->i_mapping);
+ mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS)));
+
+ /*
* If there is no attribute fork no ACL can exist on this inode,
* and it can't have any file capabilities attached to it either.
*/
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index d81fb41205e..1c34e433592 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -30,14 +30,10 @@ extern void xfs_setup_inode(struct xfs_inode *);
/*
* Internal setattr interfaces.
*/
-#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */
-#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if op would block */
-#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */
-#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */
-#define XFS_ATTR_SYNC 0x10 /* synchronous operation required */
+#define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */
extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
int flags);
-extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap, int flags);
+extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap);
#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 084b3e1741f..cb64f222d60 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -17,24 +17,23 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_btree.h"
#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
#include "xfs_itable.h"
#include "xfs_error.h"
-#include "xfs_btree.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_dinode.h"
STATIC int
xfs_internal_inum(
@@ -210,9 +209,8 @@ xfs_bulkstat(
xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */
xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */
xfs_ino_t lastino; /* last inode number returned */
- int nbcluster; /* # of blocks in a cluster */
- int nicluster; /* # of inodes in a cluster */
- int nimask; /* mask for inode clusters */
+ int blks_per_cluster; /* # of blocks per cluster */
+ int inodes_per_cluster;/* # of inodes per cluster */
int nirbuf; /* size of irbuf */
int rval; /* return value error code */
int tmp; /* result value from btree calls */
@@ -244,11 +242,8 @@ xfs_bulkstat(
*done = 0;
fmterror = 0;
ubufp = ubuffer;
- nicluster = mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp) ?
- mp->m_sb.sb_inopblock :
- (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
- nimask = ~(nicluster - 1);
- nbcluster = nicluster >> mp->m_sb.sb_inopblog;
+ blks_per_cluster = xfs_icluster_size_fsb(mp);
+ inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
if (!irbuf)
return ENOMEM;
@@ -275,7 +270,8 @@ xfs_bulkstat(
/*
* Allocate and initialize a btree cursor for ialloc btree.
*/
- cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
+ cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
+ XFS_BTNUM_INO);
irbp = irbuf;
irbufend = irbuf + nirbuf;
end_of_ag = 0;
@@ -391,12 +387,12 @@ xfs_bulkstat(
agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
for (chunkidx = 0;
chunkidx < XFS_INODES_PER_CHUNK;
- chunkidx += nicluster,
- agbno += nbcluster) {
- if (xfs_inobt_maskn(chunkidx, nicluster)
- & ~r.ir_free)
+ chunkidx += inodes_per_cluster,
+ agbno += blks_per_cluster) {
+ if (xfs_inobt_maskn(chunkidx,
+ inodes_per_cluster) & ~r.ir_free)
xfs_btree_reada_bufs(mp, agno,
- agbno, nbcluster,
+ agbno, blks_per_cluster,
&xfs_inode_buf_ops);
}
blk_finish_plug(&plug);
@@ -626,7 +622,8 @@ xfs_inumbers(
agino = 0;
continue;
}
- cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
+ cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
+ XFS_BTNUM_INO);
error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
&tmp);
if (error) {
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index f9bb590acc0..825249d2dfc 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -119,6 +119,7 @@ typedef __uint64_t __psunsigned_t;
#include "xfs_iops.h"
#include "xfs_aops.h"
#include "xfs_super.h"
+#include "xfs_cksum.h"
#include "xfs_buf.h"
#include "xfs_message.h"
@@ -178,6 +179,7 @@ typedef __uint64_t __psunsigned_t;
#define ENOATTR ENODATA /* Attribute not found */
#define EWRONGFS EINVAL /* Mount with wrong filesystem type */
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
+#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define SYNCHRONIZE() barrier()
#define __return_address __builtin_return_address(0)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index a2dea108071..292308dede6 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -17,21 +17,19 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
#include "xfs_log_priv.h"
-#include "xfs_buf_item.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
#include "xfs_log_recover.h"
-#include "xfs_trans_priv.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_trace.h"
#include "xfs_fsops.h"
@@ -618,11 +616,13 @@ xfs_log_mount(
int error = 0;
int min_logfsbs;
- if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
- xfs_notice(mp, "Mounting Filesystem");
- else {
+ if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
+ xfs_notice(mp, "Mounting V%d Filesystem",
+ XFS_SB_VERSION_NUM(&mp->m_sb));
+ } else {
xfs_notice(mp,
-"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent.");
+"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.",
+ XFS_SB_VERSION_NUM(&mp->m_sb));
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
}
@@ -1000,27 +1000,34 @@ xfs_log_space_wake(
}
/*
- * Determine if we have a transaction that has gone to disk
- * that needs to be covered. To begin the transition to the idle state
- * firstly the log needs to be idle (no AIL and nothing in the iclogs).
- * If we are then in a state where covering is needed, the caller is informed
- * that dummy transactions are required to move the log into the idle state.
+ * Determine if we have a transaction that has gone to disk that needs to be
+ * covered. To begin the transition to the idle state firstly the log needs to
+ * be idle. That means the CIL, the AIL and the iclogs needs to be empty before
+ * we start attempting to cover the log.
+ *
+ * Only if we are then in a state where covering is needed, the caller is
+ * informed that dummy transactions are required to move the log into the idle
+ * state.
*
- * Because this is called as part of the sync process, we should also indicate
- * that dummy transactions should be issued in anything but the covered or
- * idle states. This ensures that the log tail is accurately reflected in
- * the log at the end of the sync, hence if a crash occurrs avoids replay
- * of transactions where the metadata is already on disk.
+ * If there are any items in the AIl or CIL, then we do not want to attempt to
+ * cover the log as we may be in a situation where there isn't log space
+ * available to run a dummy transaction and this can lead to deadlocks when the
+ * tail of the log is pinned by an item that is modified in the CIL. Hence
+ * there's no point in running a dummy transaction at this point because we
+ * can't start trying to idle the log until both the CIL and AIL are empty.
*/
int
xfs_log_need_covered(xfs_mount_t *mp)
{
- int needed = 0;
struct xlog *log = mp->m_log;
+ int needed = 0;
if (!xfs_fs_writable(mp))
return 0;
+ if (!xlog_cil_empty(log))
+ return 0;
+
spin_lock(&log->l_icloglock);
switch (log->l_covered_state) {
case XLOG_STATE_COVER_DONE:
@@ -1029,14 +1036,17 @@ xfs_log_need_covered(xfs_mount_t *mp)
break;
case XLOG_STATE_COVER_NEED:
case XLOG_STATE_COVER_NEED2:
- if (!xfs_ail_min_lsn(log->l_ailp) &&
- xlog_iclogs_empty(log)) {
- if (log->l_covered_state == XLOG_STATE_COVER_NEED)
- log->l_covered_state = XLOG_STATE_COVER_DONE;
- else
- log->l_covered_state = XLOG_STATE_COVER_DONE2;
- }
- /* FALLTHRU */
+ if (xfs_ail_min_lsn(log->l_ailp))
+ break;
+ if (!xlog_iclogs_empty(log))
+ break;
+
+ needed = 1;
+ if (log->l_covered_state == XLOG_STATE_COVER_NEED)
+ log->l_covered_state = XLOG_STATE_COVER_DONE;
+ else
+ log->l_covered_state = XLOG_STATE_COVER_DONE2;
+ break;
default:
needed = 1;
break;
@@ -1068,6 +1078,7 @@ xlog_assign_tail_lsn_locked(
tail_lsn = lip->li_lsn;
else
tail_lsn = atomic64_read(&log->l_last_sync_lsn);
+ trace_xfs_log_assign_tail_lsn(log, tail_lsn);
atomic64_set(&log->l_tail_lsn, tail_lsn);
return tail_lsn;
}
@@ -1154,7 +1165,7 @@ xlog_iodone(xfs_buf_t *bp)
/*
* Race to shutdown the filesystem if we see an error.
*/
- if (XFS_TEST_ERROR((xfs_buf_geterror(bp)), l->l_mp,
+ if (XFS_TEST_ERROR(bp->b_error, l->l_mp,
XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
xfs_buf_ioerror_alert(bp, __func__);
xfs_buf_stale(bp);
@@ -1172,11 +1183,14 @@ xlog_iodone(xfs_buf_t *bp)
/* log I/O is always issued ASYNC */
ASSERT(XFS_BUF_ISASYNC(bp));
xlog_state_done_syncing(iclog, aborted);
+
/*
- * do not reference the buffer (bp) here as we could race
- * with it being freed after writing the unmount record to the
- * log.
+ * drop the buffer lock now that we are done. Nothing references
+ * the buffer after this, so an unmount waiting on this lock can now
+ * tear it down safely. As such, it is unsafe to reference the buffer
+ * (bp) after the unlock as we could race with it being freed.
*/
+ xfs_buf_unlock(bp);
}
/*
@@ -1359,8 +1373,16 @@ xlog_alloc_log(
bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0);
if (!bp)
goto out_free_log;
- bp->b_iodone = xlog_iodone;
+
+ /*
+ * The iclogbuf buffer locks are held over IO but we are not going to do
+ * IO yet. Hence unlock the buffer so that the log IO path can grab it
+ * when appropriately.
+ */
ASSERT(xfs_buf_islocked(bp));
+ xfs_buf_unlock(bp);
+
+ bp->b_iodone = xlog_iodone;
log->l_xbuf = bp;
spin_lock_init(&log->l_icloglock);
@@ -1389,6 +1411,9 @@ xlog_alloc_log(
if (!bp)
goto out_free_iclog;
+ ASSERT(xfs_buf_islocked(bp));
+ xfs_buf_unlock(bp);
+
bp->b_iodone = xlog_iodone;
iclog->ic_bp = bp;
iclog->ic_data = bp->b_addr;
@@ -1413,7 +1438,6 @@ xlog_alloc_log(
iclog->ic_callback_tail = &(iclog->ic_callback);
iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
- ASSERT(xfs_buf_islocked(iclog->ic_bp));
init_waitqueue_head(&iclog->ic_force_wait);
init_waitqueue_head(&iclog->ic_write_wait);
@@ -1622,6 +1646,12 @@ xlog_cksum(
* we transition the iclogs to IOERROR state *after* flushing all existing
* iclogs to disk. This is because we don't want anymore new transactions to be
* started or completed afterwards.
+ *
+ * We lock the iclogbufs here so that we can serialise against IO completion
+ * during unmount. We might be processing a shutdown triggered during unmount,
+ * and that can occur asynchronously to the unmount thread, and hence we need to
+ * ensure that completes before tearing down the iclogbufs. Hence we need to
+ * hold the buffer lock across the log IO to acheive that.
*/
STATIC int
xlog_bdstrat(
@@ -1629,6 +1659,7 @@ xlog_bdstrat(
{
struct xlog_in_core *iclog = bp->b_fspriv;
+ xfs_buf_lock(bp);
if (iclog->ic_state & XLOG_STATE_IOERROR) {
xfs_buf_ioerror(bp, EIO);
xfs_buf_stale(bp);
@@ -1636,7 +1667,8 @@ xlog_bdstrat(
/*
* It would seem logical to return EIO here, but we rely on
* the log state machine to propagate I/O errors instead of
- * doing it here.
+ * doing it here. Similarly, IO completion will unlock the
+ * buffer, so we don't do it here.
*/
return 0;
}
@@ -1838,14 +1870,28 @@ xlog_dealloc_log(
xlog_cil_destroy(log);
/*
- * always need to ensure that the extra buffer does not point to memory
- * owned by another log buffer before we free it.
+ * Cycle all the iclogbuf locks to make sure all log IO completion
+ * is done before we tear down these buffers.
*/
+ iclog = log->l_iclog;
+ for (i = 0; i < log->l_iclog_bufs; i++) {
+ xfs_buf_lock(iclog->ic_bp);
+ xfs_buf_unlock(iclog->ic_bp);
+ iclog = iclog->ic_next;
+ }
+
+ /*
+ * Always need to ensure that the extra buffer does not point to memory
+ * owned by another log buffer before we free it. Also, cycle the lock
+ * first to ensure we've completed IO on it.
+ */
+ xfs_buf_lock(log->l_xbuf);
+ xfs_buf_unlock(log->l_xbuf);
xfs_buf_set_empty(log->l_xbuf, BTOBB(log->l_iclog_size));
xfs_buf_free(log->l_xbuf);
iclog = log->l_iclog;
- for (i=0; i<log->l_iclog_bufs; i++) {
+ for (i = 0; i < log->l_iclog_bufs; i++) {
xfs_buf_free(iclog->ic_bp);
next_iclog = iclog->ic_next;
kmem_free(iclog);
@@ -1979,7 +2025,7 @@ xlog_print_tic_res(
for (i = 0; i < ticket->t_res_num; i++) {
uint r_type = ticket->t_res_arr[i].r_type;
- xfs_warn(mp, "region[%u]: %s - %u bytes\n", i,
+ xfs_warn(mp, "region[%u]: %s - %u bytes", i,
((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
"bad-rtype" : res_type_str[r_type-1]),
ticket->t_res_arr[i].r_len);
@@ -3702,11 +3748,9 @@ xlog_verify_iclog(
/* check validity of iclog pointers */
spin_lock(&log->l_icloglock);
icptr = log->l_iclog;
- for (i=0; i < log->l_iclog_bufs; i++) {
- if (icptr == NULL)
- xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
- icptr = icptr->ic_next;
- }
+ for (i = 0; i < log->l_iclog_bufs; i++, icptr = icptr->ic_next)
+ ASSERT(icptr);
+
if (icptr != log->l_iclog)
xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
spin_unlock(&log->l_icloglock);
@@ -3908,11 +3952,14 @@ xfs_log_force_umount(
retval = xlog_state_ioerror(log);
spin_unlock(&log->l_icloglock);
}
+
/*
- * Wake up everybody waiting on xfs_log_force.
- * Callback all log item committed functions as if the
- * log writes were completed.
+ * Wake up everybody waiting on xfs_log_force. Wake the CIL push first
+ * as if the log writes were completed. The abort handling in the log
+ * item committed callback functions will do this again under lock to
+ * avoid races.
*/
+ wake_up_all(&log->l_cilp->xc_commit_wait);
xlog_state_do_callback(log, XFS_LI_ABORTED, NULL);
#ifdef XFSERRORDEBUG
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 1c458487f00..84e0deb95ab 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -18,20 +18,71 @@
#ifndef __XFS_LOG_H__
#define __XFS_LOG_H__
-#include "xfs_log_format.h"
-
struct xfs_log_vec {
struct xfs_log_vec *lv_next; /* next lv in build list */
int lv_niovecs; /* number of iovecs in lv */
struct xfs_log_iovec *lv_iovecp; /* iovec array */
struct xfs_log_item *lv_item; /* owner */
char *lv_buf; /* formatted buffer */
- int lv_buf_len; /* size of formatted buffer */
+ int lv_bytes; /* accounted space in buffer */
+ int lv_buf_len; /* aligned size of buffer */
int lv_size; /* size of allocated lv */
};
#define XFS_LOG_VEC_ORDERED (-1)
+static inline void *
+xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+ uint type)
+{
+ struct xfs_log_iovec *vec = *vecp;
+
+ if (vec) {
+ ASSERT(vec - lv->lv_iovecp < lv->lv_niovecs);
+ vec++;
+ } else {
+ vec = &lv->lv_iovecp[0];
+ }
+
+ vec->i_type = type;
+ vec->i_addr = lv->lv_buf + lv->lv_buf_len;
+
+ ASSERT(IS_ALIGNED((unsigned long)vec->i_addr, sizeof(uint64_t)));
+
+ *vecp = vec;
+ return vec->i_addr;
+}
+
+/*
+ * We need to make sure the next buffer is naturally aligned for the biggest
+ * basic data type we put into it. We already accounted for this padding when
+ * sizing the buffer.
+ *
+ * However, this padding does not get written into the log, and hence we have to
+ * track the space used by the log vectors separately to prevent log space hangs
+ * due to inaccurate accounting (i.e. a leak) of the used log space through the
+ * CIL context ticket.
+ */
+static inline void
+xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len)
+{
+ lv->lv_buf_len += round_up(len, sizeof(uint64_t));
+ lv->lv_bytes += len;
+ vec->i_len = len;
+}
+
+static inline void *
+xlog_copy_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
+ uint type, void *data, int len)
+{
+ void *buf;
+
+ buf = xlog_prepare_iovec(lv, vecp, type);
+ memcpy(buf, data, len);
+ xlog_finish_iovec(lv, *vecp, len);
+ return buf;
+}
+
/*
* Structure used to pass callback function and the function's argument
* to the log manager.
@@ -82,11 +133,7 @@ struct xlog_ticket;
struct xfs_log_item;
struct xfs_item_ops;
struct xfs_trans;
-
-void xfs_log_item_init(struct xfs_mount *mp,
- struct xfs_log_item *item,
- int type,
- const struct xfs_item_ops *ops);
+struct xfs_log_callback;
xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
struct xlog_ticket *ticket,
@@ -114,7 +161,7 @@ xfs_lsn_t xlog_assign_tail_lsn_locked(struct xfs_mount *mp);
void xfs_log_space_wake(struct xfs_mount *mp);
int xfs_log_notify(struct xfs_mount *mp,
struct xlog_in_core *iclog,
- xfs_log_callback_t *callback_entry);
+ struct xfs_log_callback *callback_entry);
int xfs_log_release_iclog(struct xfs_mount *mp,
struct xlog_in_core *iclog);
int xfs_log_reserve(struct xfs_mount *mp,
@@ -135,7 +182,7 @@ void xlog_iodone(struct xfs_buf *);
struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
void xfs_log_ticket_put(struct xlog_ticket *ticket);
-int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
+void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_lsn_t *commit_lsn, int flags);
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index cfe97973ba3..b3425b34e3d 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -17,11 +17,9 @@
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-#include "xfs_log_priv.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
@@ -29,6 +27,10 @@
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_discard.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
/*
* Allocate a new ticket. Failing to get a new ticket makes it really hard to
@@ -80,36 +82,6 @@ xlog_cil_init_post_recovery(
log->l_curr_block);
}
-STATIC int
-xlog_cil_lv_item_format(
- struct xfs_log_item *lip,
- struct xfs_log_vec *lv)
-{
- int index;
- char *ptr;
-
- /* format new vectors into array */
- lip->li_ops->iop_format(lip, lv->lv_iovecp);
-
- /* copy data into existing array */
- ptr = lv->lv_buf;
- for (index = 0; index < lv->lv_niovecs; index++) {
- struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
-
- memcpy(ptr, vec->i_addr, vec->i_len);
- vec->i_addr = ptr;
- ptr += vec->i_len;
- }
-
- /*
- * some size calculations for log vectors over-estimate, so the caller
- * doesn't know the amount of space actually used by the item. Return
- * the byte count to the caller so they can check and store it
- * appropriately.
- */
- return ptr - lv->lv_buf;
-}
-
/*
* Prepare the log item for insertion into the CIL. Calculate the difference in
* log space and vectors it will consume, and if it is a new item pin it as
@@ -125,7 +97,7 @@ xfs_cil_prepare_item(
{
/* Account for the new LV being passed in */
if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
- *diff_len += lv->lv_buf_len;
+ *diff_len += lv->lv_bytes;
*diff_iovecs += lv->lv_niovecs;
}
@@ -139,7 +111,7 @@ xfs_cil_prepare_item(
else if (old_lv != lv) {
ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
- *diff_len -= old_lv->lv_buf_len;
+ *diff_len -= old_lv->lv_bytes;
*diff_iovecs -= old_lv->lv_niovecs;
kmem_free(old_lv);
}
@@ -230,12 +202,28 @@ xlog_cil_insert_format_items(
nbytes = 0;
}
+ /*
+ * We 64-bit align the length of each iovec so that the start
+ * of the next one is naturally aligned. We'll need to
+ * account for that slack space here. Then round nbytes up
+ * to 64-bit alignment so that the initial buffer alignment is
+ * easy to calculate and verify.
+ */
+ nbytes += niovecs * sizeof(uint64_t);
+ nbytes = round_up(nbytes, sizeof(uint64_t));
+
/* grab the old item if it exists for reservation accounting */
old_lv = lip->li_lv;
- /* calc buffer size */
- buf_size = sizeof(struct xfs_log_vec) + nbytes +
- niovecs * sizeof(struct xfs_log_iovec);
+ /*
+ * The data buffer needs to start 64-bit aligned, so round up
+ * that space to ensure we can align it appropriately and not
+ * overrun the buffer.
+ */
+ buf_size = nbytes +
+ round_up((sizeof(struct xfs_log_vec) +
+ niovecs * sizeof(struct xfs_log_iovec)),
+ sizeof(uint64_t));
/* compare to existing item size */
if (lip->li_lv && buf_size <= lip->li_lv->lv_size) {
@@ -251,35 +239,31 @@ xlog_cil_insert_format_items(
* that the space reservation accounting is correct.
*/
*diff_iovecs -= lv->lv_niovecs;
- *diff_len -= lv->lv_buf_len;
-
- /* Ensure the lv is set up according to ->iop_size */
- lv->lv_niovecs = niovecs;
- lv->lv_buf = (char *)lv + buf_size - nbytes;
-
- lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv);
- goto insert;
+ *diff_len -= lv->lv_bytes;
+ } else {
+ /* allocate new data chunk */
+ lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
+ lv->lv_item = lip;
+ lv->lv_size = buf_size;
+ if (ordered) {
+ /* track as an ordered logvec */
+ ASSERT(lip->li_lv == NULL);
+ lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+ goto insert;
+ }
+ lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
}
- /* allocate new data chunk */
- lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
- lv->lv_item = lip;
- lv->lv_size = buf_size;
+ /* Ensure the lv is set up according to ->iop_size */
lv->lv_niovecs = niovecs;
- if (ordered) {
- /* track as an ordered logvec */
- ASSERT(lip->li_lv == NULL);
- lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
- goto insert;
- }
-
- /* The allocated iovec region lies beyond the log vector. */
- lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
/* The allocated data region lies beyond the iovec region */
+ lv->lv_buf_len = 0;
+ lv->lv_bytes = 0;
lv->lv_buf = (char *)lv + buf_size - nbytes;
+ ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
- lv->lv_buf_len = xlog_cil_lv_item_format(lip, lv);
+ lip->li_ops->iop_format(lip, lv);
insert:
ASSERT(lv->lv_buf_len <= nbytes);
xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
@@ -402,7 +386,15 @@ xlog_cil_committed(
xfs_extent_busy_clear(mp, &ctx->busy_extents,
(mp->m_flags & XFS_MOUNT_DISCARD) && !abort);
+ /*
+ * If we are aborting the commit, wake up anyone waiting on the
+ * committing list. If we don't, then a shutdown we can leave processes
+ * waiting in xlog_cil_force_lsn() waiting on a sequence commit that
+ * will never happen because we aborted it.
+ */
spin_lock(&ctx->cil->xc_push_lock);
+ if (abort)
+ wake_up_all(&ctx->cil->xc_commit_wait);
list_del(&ctx->committing);
spin_unlock(&ctx->cil->xc_push_lock);
@@ -516,13 +508,6 @@ xlog_cil_push(
cil->xc_ctx = new_ctx;
/*
- * mirror the new sequence into the cil structure so that we can do
- * unlocked checks against the current sequence in log forces without
- * risking deferencing a freed context pointer.
- */
- cil->xc_current_sequence = new_ctx->sequence;
-
- /*
* The switch is now done, so we can drop the context lock and move out
* of a shared context. We can't just go straight to the commit record,
* though - we need to synchronise with previous and future commits so
@@ -540,8 +525,15 @@ xlog_cil_push(
* Hence we need to add this context to the committing context list so
* that higher sequences will wait for us to write out a commit record
* before they do.
+ *
+ * xfs_log_force_lsn requires us to mirror the new sequence into the cil
+ * structure atomically with the addition of this sequence to the
+ * committing list. This also ensures that we can do unlocked checks
+ * against the current sequence in log forces without risking
+ * deferencing a freed context pointer.
*/
spin_lock(&cil->xc_push_lock);
+ cil->xc_current_sequence = new_ctx->sequence;
list_add(&ctx->committing, &cil->xc_committing);
spin_unlock(&cil->xc_push_lock);
up_write(&cil->xc_ctx_lock);
@@ -581,8 +573,18 @@ restart:
spin_lock(&cil->xc_push_lock);
list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
/*
+ * Avoid getting stuck in this loop because we were woken by the
+ * shutdown, but then went back to sleep once already in the
+ * shutdown state.
+ */
+ if (XLOG_FORCED_SHUTDOWN(log)) {
+ spin_unlock(&cil->xc_push_lock);
+ goto out_abort_free_ticket;
+ }
+
+ /*
* Higher sequences will wait for this one so skip them.
- * Don't wait for own own sequence, either.
+ * Don't wait for our own sequence, either.
*/
if (new_ctx->sequence >= ctx->sequence)
continue;
@@ -679,8 +681,14 @@ xlog_cil_push_background(
}
+/*
+ * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence
+ * number that is passed. When it returns, the work will be queued for
+ * @push_seq, but it won't be completed. The caller is expected to do any
+ * waiting for push_seq to complete if it is required.
+ */
static void
-xlog_cil_push_foreground(
+xlog_cil_push_now(
struct xlog *log,
xfs_lsn_t push_seq)
{
@@ -705,10 +713,22 @@ xlog_cil_push_foreground(
}
cil->xc_push_seq = push_seq;
+ queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
spin_unlock(&cil->xc_push_lock);
+}
- /* do the push now */
- xlog_cil_push(log);
+bool
+xlog_cil_empty(
+ struct xlog *log)
+{
+ struct xfs_cil *cil = log->l_cilp;
+ bool empty = false;
+
+ spin_lock(&cil->xc_push_lock);
+ if (list_empty(&cil->xc_cil))
+ empty = true;
+ spin_unlock(&cil->xc_push_lock);
+ return empty;
}
/*
@@ -724,7 +744,7 @@ xlog_cil_push_foreground(
* background commit, returns without it held once background commits are
* allowed again.
*/
-int
+void
xfs_log_commit_cil(
struct xfs_mount *mp,
struct xfs_trans *tp,
@@ -770,7 +790,6 @@ xfs_log_commit_cil(
xlog_cil_push_background(log);
up_read(&cil->xc_ctx_lock);
- return 0;
}
/*
@@ -799,7 +818,8 @@ xlog_cil_force_lsn(
* xlog_cil_push() handles racing pushes for the same sequence,
* so no need to deal with it here.
*/
- xlog_cil_push_foreground(log, sequence);
+restart:
+ xlog_cil_push_now(log, sequence);
/*
* See if we can find a previous sequence still committing.
@@ -807,9 +827,15 @@ xlog_cil_force_lsn(
* before allowing the force of push_seq to go ahead. Hence block
* on commits for those as well.
*/
-restart:
spin_lock(&cil->xc_push_lock);
list_for_each_entry(ctx, &cil->xc_committing, committing) {
+ /*
+ * Avoid getting stuck in this loop because we were woken by the
+ * shutdown, but then went back to sleep once already in the
+ * shutdown state.
+ */
+ if (XLOG_FORCED_SHUTDOWN(log))
+ goto out_shutdown;
if (ctx->sequence > sequence)
continue;
if (!ctx->commit_lsn) {
@@ -825,8 +851,39 @@ restart:
/* found it! */
commit_lsn = ctx->commit_lsn;
}
+
+ /*
+ * The call to xlog_cil_push_now() executes the push in the background.
+ * Hence by the time we have got here it our sequence may not have been
+ * pushed yet. This is true if the current sequence still matches the
+ * push sequence after the above wait loop and the CIL still contains
+ * dirty objects.
+ *
+ * When the push occurs, it will empty the CIL and atomically increment
+ * the currect sequence past the push sequence and move it into the
+ * committing list. Of course, if the CIL is clean at the time of the
+ * push, it won't have pushed the CIL at all, so in that case we should
+ * try the push for this sequence again from the start just in case.
+ */
+ if (sequence == cil->xc_current_sequence &&
+ !list_empty(&cil->xc_cil)) {
+ spin_unlock(&cil->xc_push_lock);
+ goto restart;
+ }
+
spin_unlock(&cil->xc_push_lock);
return commit_lsn;
+
+ /*
+ * We detected a shutdown in progress. We need to trigger the log force
+ * to pass through it's iclog state machine error handling, even though
+ * we are already in a shutdown state. Hence we can't return
+ * NULLCOMMITLSN here as that has special meaning to log forces (i.e.
+ * LSN is already stable), so we return a zero LSN instead.
+ */
+out_shutdown:
+ spin_unlock(&cil->xc_push_lock);
+ return 0;
}
/*
diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/xfs_log_format.h
index ca7e28a8ed3..f0969c77bdb 100644
--- a/fs/xfs/xfs_log_format.h
+++ b/fs/xfs/xfs_log_format.h
@@ -234,178 +234,6 @@ typedef struct xfs_trans_header {
{ XFS_LI_ICREATE, "XFS_LI_ICREATE" }
/*
- * Transaction types. Used to distinguish types of buffers.
- */
-#define XFS_TRANS_SETATTR_NOT_SIZE 1
-#define XFS_TRANS_SETATTR_SIZE 2
-#define XFS_TRANS_INACTIVE 3
-#define XFS_TRANS_CREATE 4
-#define XFS_TRANS_CREATE_TRUNC 5
-#define XFS_TRANS_TRUNCATE_FILE 6
-#define XFS_TRANS_REMOVE 7
-#define XFS_TRANS_LINK 8
-#define XFS_TRANS_RENAME 9
-#define XFS_TRANS_MKDIR 10
-#define XFS_TRANS_RMDIR 11
-#define XFS_TRANS_SYMLINK 12
-#define XFS_TRANS_SET_DMATTRS 13
-#define XFS_TRANS_GROWFS 14
-#define XFS_TRANS_STRAT_WRITE 15
-#define XFS_TRANS_DIOSTRAT 16
-/* 17 was XFS_TRANS_WRITE_SYNC */
-#define XFS_TRANS_WRITEID 18
-#define XFS_TRANS_ADDAFORK 19
-#define XFS_TRANS_ATTRINVAL 20
-#define XFS_TRANS_ATRUNCATE 21
-#define XFS_TRANS_ATTR_SET 22
-#define XFS_TRANS_ATTR_RM 23
-#define XFS_TRANS_ATTR_FLAG 24
-#define XFS_TRANS_CLEAR_AGI_BUCKET 25
-#define XFS_TRANS_QM_SBCHANGE 26
-/*
- * Dummy entries since we use the transaction type to index into the
- * trans_type[] in xlog_recover_print_trans_head()
- */
-#define XFS_TRANS_DUMMY1 27
-#define XFS_TRANS_DUMMY2 28
-#define XFS_TRANS_QM_QUOTAOFF 29
-#define XFS_TRANS_QM_DQALLOC 30
-#define XFS_TRANS_QM_SETQLIM 31
-#define XFS_TRANS_QM_DQCLUSTER 32
-#define XFS_TRANS_QM_QINOCREATE 33
-#define XFS_TRANS_QM_QUOTAOFF_END 34
-#define XFS_TRANS_SB_UNIT 35
-#define XFS_TRANS_FSYNC_TS 36
-#define XFS_TRANS_GROWFSRT_ALLOC 37
-#define XFS_TRANS_GROWFSRT_ZERO 38
-#define XFS_TRANS_GROWFSRT_FREE 39
-#define XFS_TRANS_SWAPEXT 40
-#define XFS_TRANS_SB_COUNT 41
-#define XFS_TRANS_CHECKPOINT 42
-#define XFS_TRANS_ICREATE 43
-#define XFS_TRANS_TYPE_MAX 43
-/* new transaction types need to be reflected in xfs_logprint(8) */
-
-#define XFS_TRANS_TYPES \
- { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \
- { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \
- { XFS_TRANS_INACTIVE, "INACTIVE" }, \
- { XFS_TRANS_CREATE, "CREATE" }, \
- { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \
- { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \
- { XFS_TRANS_REMOVE, "REMOVE" }, \
- { XFS_TRANS_LINK, "LINK" }, \
- { XFS_TRANS_RENAME, "RENAME" }, \
- { XFS_TRANS_MKDIR, "MKDIR" }, \
- { XFS_TRANS_RMDIR, "RMDIR" }, \
- { XFS_TRANS_SYMLINK, "SYMLINK" }, \
- { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \
- { XFS_TRANS_GROWFS, "GROWFS" }, \
- { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \
- { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \
- { XFS_TRANS_WRITEID, "WRITEID" }, \
- { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \
- { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \
- { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \
- { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \
- { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \
- { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \
- { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \
- { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \
- { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \
- { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \
- { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \
- { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \
- { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \
- { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \
- { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \
- { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \
- { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \
- { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \
- { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \
- { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \
- { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \
- { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \
- { XFS_TRANS_DUMMY1, "DUMMY1" }, \
- { XFS_TRANS_DUMMY2, "DUMMY2" }, \
- { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" }
-
-/*
- * This structure is used to track log items associated with
- * a transaction. It points to the log item and keeps some
- * flags to track the state of the log item. It also tracks
- * the amount of space needed to log the item it describes
- * once we get to commit processing (see xfs_trans_commit()).
- */
-struct xfs_log_item_desc {
- struct xfs_log_item *lid_item;
- struct list_head lid_trans;
- unsigned char lid_flags;
-};
-
-#define XFS_LID_DIRTY 0x1
-
-/*
- * Values for t_flags.
- */
-#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */
-#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */
-#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */
-#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */
-#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */
-#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
-#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer
- count in superblock */
-
-/*
- * Values for call flags parameter.
- */
-#define XFS_TRANS_RELEASE_LOG_RES 0x4
-#define XFS_TRANS_ABORT 0x8
-
-/*
- * Field values for xfs_trans_mod_sb.
- */
-#define XFS_TRANS_SB_ICOUNT 0x00000001
-#define XFS_TRANS_SB_IFREE 0x00000002
-#define XFS_TRANS_SB_FDBLOCKS 0x00000004
-#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008
-#define XFS_TRANS_SB_FREXTENTS 0x00000010
-#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020
-#define XFS_TRANS_SB_DBLOCKS 0x00000040
-#define XFS_TRANS_SB_AGCOUNT 0x00000080
-#define XFS_TRANS_SB_IMAXPCT 0x00000100
-#define XFS_TRANS_SB_REXTSIZE 0x00000200
-#define XFS_TRANS_SB_RBMBLOCKS 0x00000400
-#define XFS_TRANS_SB_RBLOCKS 0x00000800
-#define XFS_TRANS_SB_REXTENTS 0x00001000
-#define XFS_TRANS_SB_REXTSLOG 0x00002000
-
-/*
- * Here we centralize the specification of XFS meta-data buffer
- * reference count values. This determine how hard the buffer
- * cache tries to hold onto the buffer.
- */
-#define XFS_AGF_REF 4
-#define XFS_AGI_REF 4
-#define XFS_AGFL_REF 3
-#define XFS_INO_BTREE_REF 3
-#define XFS_ALLOC_BTREE_REF 2
-#define XFS_BMAP_BTREE_REF 2
-#define XFS_DIR_BTREE_REF 2
-#define XFS_INO_REF 2
-#define XFS_ATTR_BTREE_REF 1
-#define XFS_DQUOT_REF 1
-
-/*
- * Flags for xfs_trans_ichgtime().
- */
-#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
-#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
-#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */
-
-
-/*
* Inode Log Item Format definitions.
*
* This is the structure used to lay out an inode log item in the
@@ -797,7 +625,6 @@ typedef struct xfs_qoff_logformat {
char qf_pad[12]; /* padding for future */
} xfs_qoff_logformat_t;
-
/*
* Disk quotas status in m_qflags, and also sb_qflags. 16 bits.
*/
@@ -849,8 +676,4 @@ struct xfs_icreate_log {
__be32 icl_gen; /* inode generation number to use */
};
-int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
-int xfs_log_calc_minimum_size(struct xfs_mount *);
-
-
#endif /* __XFS_LOG_FORMAT_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 136654b9400..9bc403a9e54 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -22,6 +22,7 @@ struct xfs_buf;
struct xlog;
struct xlog_ticket;
struct xfs_mount;
+struct xfs_log_callback;
/*
* Flags for log structure
@@ -227,8 +228,8 @@ typedef struct xlog_in_core {
/* Callback structures need their own cacheline */
spinlock_t ic_callback_lock ____cacheline_aligned_in_smp;
- xfs_log_callback_t *ic_callback;
- xfs_log_callback_t **ic_callback_tail;
+ struct xfs_log_callback *ic_callback;
+ struct xfs_log_callback **ic_callback_tail;
/* reference counts need their own cacheline */
atomic_t ic_refcnt ____cacheline_aligned_in_smp;
@@ -254,7 +255,7 @@ struct xfs_cil_ctx {
int space_used; /* aggregate size of regions */
struct list_head busy_extents; /* busy extents in chkpt */
struct xfs_log_vec *lv_chain; /* logvecs being pushed */
- xfs_log_callback_t log_cb; /* completion callback hook. */
+ struct xfs_log_callback log_cb; /* completion callback hook. */
struct list_head committing; /* ctx committing list */
};
@@ -514,12 +515,10 @@ xlog_assign_grant_head(atomic64_t *head, int cycle, int space)
/*
* Committed Item List interfaces
*/
-int
-xlog_cil_init(struct xlog *log);
-void
-xlog_cil_init_post_recovery(struct xlog *log);
-void
-xlog_cil_destroy(struct xlog *log);
+int xlog_cil_init(struct xlog *log);
+void xlog_cil_init_post_recovery(struct xlog *log);
+void xlog_cil_destroy(struct xlog *log);
+bool xlog_cil_empty(struct xlog *log);
/*
* CIL force routines
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 39797490a1f..981af0f6504 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -17,42 +17,34 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_error.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_btree.h"
-#include "xfs_dinode.h"
+#include "xfs_da_format.h"
#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_alloc.h"
-#include "xfs_ialloc.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
#include "xfs_log_priv.h"
-#include "xfs_buf_item.h"
#include "xfs_log_recover.h"
+#include "xfs_inode_item.h"
#include "xfs_extfree_item.h"
#include "xfs_trans_priv.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
#include "xfs_quota.h"
#include "xfs_cksum.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
-#include "xfs_icreate_item.h"
-
-/* Need all the magic numbers and buffer ops structures from these headers */
-#include "xfs_symlink.h"
-#include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_error.h"
#include "xfs_dir2.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_attr_remote.h"
#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
@@ -201,7 +193,10 @@ xlog_bread_noalign(
bp->b_io_length = nbblks;
bp->b_error = 0;
- xfsbdstrat(log->l_mp, bp);
+ if (XFS_FORCED_SHUTDOWN(log->l_mp))
+ return XFS_ERROR(EIO);
+
+ xfs_buf_iorequest(bp);
error = xfs_buf_iowait(bp);
if (error)
xfs_buf_ioerror_alert(bp, __func__);
@@ -305,9 +300,9 @@ xlog_header_check_dump(
xfs_mount_t *mp,
xlog_rec_header_t *head)
{
- xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d\n",
+ xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d",
__func__, &mp->m_sb.sb_uuid, XLOG_FMT);
- xfs_debug(mp, " log : uuid = %pU, fmt = %d\n",
+ xfs_debug(mp, " log : uuid = %pU, fmt = %d",
&head->h_fs_uuid, be32_to_cpu(head->h_fmt));
}
#else
@@ -1659,6 +1654,7 @@ xlog_recover_reorder_trans(
int pass)
{
xlog_recover_item_t *item, *n;
+ int error = 0;
LIST_HEAD(sort_list);
LIST_HEAD(cancel_list);
LIST_HEAD(buffer_list);
@@ -1700,9 +1696,17 @@ xlog_recover_reorder_trans(
"%s: unrecognized type of log operation",
__func__);
ASSERT(0);
- return XFS_ERROR(EIO);
+ /*
+ * return the remaining items back to the transaction
+ * item list so they can be freed in caller.
+ */
+ if (!list_empty(&sort_list))
+ list_splice_init(&sort_list, &trans->r_itemq);
+ error = XFS_ERROR(EIO);
+ goto out;
}
}
+out:
ASSERT(list_empty(&sort_list));
if (!list_empty(&buffer_list))
list_splice(&buffer_list, &trans->r_itemq);
@@ -1712,7 +1716,7 @@ xlog_recover_reorder_trans(
list_splice_tail(&inode_buffer_list, &trans->r_itemq);
if (!list_empty(&cancel_list))
list_splice_tail(&cancel_list, &trans->r_itemq);
- return 0;
+ return error;
}
/*
@@ -2134,7 +2138,9 @@ xlog_recover_validate_buf_type(
bp->b_ops = &xfs_allocbt_buf_ops;
break;
case XFS_IBT_CRC_MAGIC:
+ case XFS_FIBT_CRC_MAGIC:
case XFS_IBT_MAGIC:
+ case XFS_FIBT_MAGIC:
bp->b_ops = &xfs_inobt_buf_ops;
break;
case XFS_BMAP_CRC_MAGIC:
@@ -2362,7 +2368,7 @@ xlog_recover_do_reg_buffer(
item->ri_buf[i].i_len, __func__);
goto next;
}
- error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr,
+ error = xfs_dqcheck(mp, item->ri_buf[i].i_addr,
-1, 0, XFS_QMOPT_DOWARN,
"dquot_buf_recover");
if (error)
@@ -2394,133 +2400,6 @@ xlog_recover_do_reg_buffer(
}
/*
- * Do some primitive error checking on ondisk dquot data structures.
- */
-int
-xfs_qm_dqcheck(
- struct xfs_mount *mp,
- xfs_disk_dquot_t *ddq,
- xfs_dqid_t id,
- uint type, /* used only when IO_dorepair is true */
- uint flags,
- char *str)
-{
- xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
- int errs = 0;
-
- /*
- * We can encounter an uninitialized dquot buffer for 2 reasons:
- * 1. If we crash while deleting the quotainode(s), and those blks got
- * used for user data. This is because we take the path of regular
- * file deletion; however, the size field of quotainodes is never
- * updated, so all the tricks that we play in itruncate_finish
- * don't quite matter.
- *
- * 2. We don't play the quota buffers when there's a quotaoff logitem.
- * But the allocation will be replayed so we'll end up with an
- * uninitialized quota block.
- *
- * This is all fine; things are still consistent, and we haven't lost
- * any quota information. Just don't complain about bad dquot blks.
- */
- if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) {
- if (flags & XFS_QMOPT_DOWARN)
- xfs_alert(mp,
- "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
- str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
- errs++;
- }
- if (ddq->d_version != XFS_DQUOT_VERSION) {
- if (flags & XFS_QMOPT_DOWARN)
- xfs_alert(mp,
- "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
- str, id, ddq->d_version, XFS_DQUOT_VERSION);
- errs++;
- }
-
- if (ddq->d_flags != XFS_DQ_USER &&
- ddq->d_flags != XFS_DQ_PROJ &&
- ddq->d_flags != XFS_DQ_GROUP) {
- if (flags & XFS_QMOPT_DOWARN)
- xfs_alert(mp,
- "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
- str, id, ddq->d_flags);
- errs++;
- }
-
- if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
- if (flags & XFS_QMOPT_DOWARN)
- xfs_alert(mp,
- "%s : ondisk-dquot 0x%p, ID mismatch: "
- "0x%x expected, found id 0x%x",
- str, ddq, id, be32_to_cpu(ddq->d_id));
- errs++;
- }
-
- if (!errs && ddq->d_id) {
- if (ddq->d_blk_softlimit &&
- be64_to_cpu(ddq->d_bcount) >
- be64_to_cpu(ddq->d_blk_softlimit)) {
- if (!ddq->d_btimer) {
- if (flags & XFS_QMOPT_DOWARN)
- xfs_alert(mp,
- "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
- str, (int)be32_to_cpu(ddq->d_id), ddq);
- errs++;
- }
- }
- if (ddq->d_ino_softlimit &&
- be64_to_cpu(ddq->d_icount) >
- be64_to_cpu(ddq->d_ino_softlimit)) {
- if (!ddq->d_itimer) {
- if (flags & XFS_QMOPT_DOWARN)
- xfs_alert(mp,
- "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
- str, (int)be32_to_cpu(ddq->d_id), ddq);
- errs++;
- }
- }
- if (ddq->d_rtb_softlimit &&
- be64_to_cpu(ddq->d_rtbcount) >
- be64_to_cpu(ddq->d_rtb_softlimit)) {
- if (!ddq->d_rtbtimer) {
- if (flags & XFS_QMOPT_DOWARN)
- xfs_alert(mp,
- "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
- str, (int)be32_to_cpu(ddq->d_id), ddq);
- errs++;
- }
- }
- }
-
- if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
- return errs;
-
- if (flags & XFS_QMOPT_DOWARN)
- xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
-
- /*
- * Typically, a repair is only requested by quotacheck.
- */
- ASSERT(id != -1);
- ASSERT(flags & XFS_QMOPT_DQREPAIR);
- memset(d, 0, sizeof(xfs_dqblk_t));
-
- d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
- d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
- d->dd_diskdq.d_flags = type;
- d->dd_diskdq.d_id = cpu_to_be32(id);
-
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
- uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid);
- xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk),
- XFS_DQUOT_CRC_OFF);
- }
-
- return errs;
-}
-
-/*
* Perform a dquot buffer recovery.
* Simple algorithm: if we have found a QUOTAOFF log item of the same type
* (ie. USR or GRP), then just toss this buffer away; don't recover it.
@@ -2649,19 +2528,19 @@ xlog_recover_buffer_pass2(
*
* Also make sure that only inode buffers with good sizes stay in
* the buffer cache. The kernel moves inodes in buffers of 1 block
- * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode
+ * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode
* buffers in the log can be a different size if the log was generated
* by an older kernel using unclustered inode buffers or a newer kernel
* running with a different inode cluster size. Regardless, if the
- * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
- * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
+ * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size)
+ * for *our* value of mp->m_inode_cluster_size, then we need to keep
* the buffer out of the buffer cache so that the buffer won't
* overlap with future reads of those inodes.
*/
if (XFS_DINODE_MAGIC ==
be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
(BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize,
- (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
+ (__uint32_t)log->l_mp->m_inode_cluster_size))) {
xfs_buf_stale(bp);
error = xfs_bwrite(bp);
} else {
@@ -3125,7 +3004,7 @@ xlog_recover_dquot_pass2(
*/
dq_f = item->ri_buf[0].i_addr;
ASSERT(dq_f);
- error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
+ error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
"xlog_recover_dquot_pass2 (log copy)");
if (error)
return XFS_ERROR(EIO);
@@ -3145,7 +3024,7 @@ xlog_recover_dquot_pass2(
* was among a chunk of dquots created earlier, and we did some
* minimal initialization then.
*/
- error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
+ error = xfs_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
"xlog_recover_dquot_pass2");
if (error) {
xfs_buf_relse(bp);
@@ -3268,7 +3147,7 @@ xlog_recover_efd_pass2(
}
lip = xfs_trans_ail_cursor_next(ailp, &cur);
}
- xfs_trans_ail_cursor_done(ailp, &cur);
+ xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
return 0;
@@ -3334,10 +3213,10 @@ xlog_recover_do_icreate_pass2(
}
/* existing allocation is fixed value */
- ASSERT(count == XFS_IALLOC_INODES(mp));
- ASSERT(length == XFS_IALLOC_BLOCKS(mp));
- if (count != XFS_IALLOC_INODES(mp) ||
- length != XFS_IALLOC_BLOCKS(mp)) {
+ ASSERT(count == mp->m_ialloc_inos);
+ ASSERT(length == mp->m_ialloc_blks);
+ if (count != mp->m_ialloc_inos ||
+ length != mp->m_ialloc_blks) {
xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
return EINVAL;
}
@@ -3643,8 +3522,7 @@ out:
STATIC int
xlog_recover_unmount_trans(
- struct xlog *log,
- struct xlog_recover *trans)
+ struct xlog *log)
{
/* Do nothing now */
xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
@@ -3718,7 +3596,7 @@ xlog_recover_process_data(
trans, pass);
break;
case XLOG_UNMOUNT_TRANS:
- error = xlog_recover_unmount_trans(log, trans);
+ error = xlog_recover_unmount_trans(log);
break;
case XLOG_WAS_CONT_TRANS:
error = xlog_recover_add_to_cont_trans(log,
@@ -3743,8 +3621,10 @@ xlog_recover_process_data(
error = XFS_ERROR(EIO);
break;
}
- if (error)
+ if (error) {
+ xlog_recover_free_trans(trans);
return error;
+ }
}
dp += be32_to_cpu(ohead->oh_len);
num_logops--;
@@ -3878,7 +3758,7 @@ xlog_recover_process_efis(
lip = xfs_trans_ail_cursor_next(ailp, &cur);
}
out:
- xfs_trans_ail_cursor_done(ailp, &cur);
+ xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
return error;
}
@@ -4077,7 +3957,7 @@ xlog_unpack_data_crc(
if (crc != rhead->h_crc) {
if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
xfs_alert(log->l_mp,
- "log record CRC mismatch: found 0x%x, expected 0x%x.\n",
+ "log record CRC mismatch: found 0x%x, expected 0x%x.",
le32_to_cpu(rhead->h_crc),
le32_to_cpu(crc));
xfs_hex_dump(dp, 32);
@@ -4532,7 +4412,13 @@ xlog_do_recover(
XFS_BUF_READ(bp);
XFS_BUF_UNASYNC(bp);
bp->b_ops = &xfs_sb_buf_ops;
- xfsbdstrat(log->l_mp, bp);
+
+ if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
+ xfs_buf_relse(bp);
+ return XFS_ERROR(EIO);
+ }
+
+ xfs_buf_iorequest(bp);
error = xfs_buf_iowait(bp);
if (error) {
xfs_buf_ioerror_alert(bp, __func__);
diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/xfs_log_rlimit.c
index bbcec0bbc12..ee7e0e80246 100644
--- a/fs/xfs/xfs_log_rlimit.c
+++ b/fs/xfs/xfs_log_rlimit.c
@@ -17,16 +17,19 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_ag.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_trans_space.h"
-#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
#include "xfs_da_btree.h"
#include "xfs_attr_leaf.h"
+#include "xfs_bmap_btree.h"
/*
* Calculate the maximum length in bytes that would be required for a local
@@ -39,7 +42,7 @@ xfs_log_calc_max_attrsetm_res(
int size;
int nblks;
- size = xfs_attr_leaf_entsize_local_max(mp->m_sb.sb_blocksize) -
+ size = xfs_attr_leaf_entsize_local_max(mp->m_attr_geo->blksize) -
MAXNAMELEN - 1;
nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK);
nblks += XFS_B_TO_FSB(mp, size);
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index 9163dc14053..63ca2f0420b 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -17,9 +17,8 @@
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5dcc68019d1..3507cd0ec40 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -17,35 +17,31 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
-#include "xfs_dir2.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
+#include "xfs_da_format.h"
#include "xfs_inode.h"
-#include "xfs_btree.h"
+#include "xfs_dir2.h"
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
#include "xfs_bmap.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_fsops.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
-#include "xfs_cksum.h"
-#include "xfs_buf_item.h"
+#include "xfs_dinode.h"
#ifdef HAVE_PERCPU_SB
@@ -286,22 +282,29 @@ xfs_readsb(
struct xfs_sb *sbp = &mp->m_sb;
int error;
int loud = !(flags & XFS_MFSI_QUIET);
+ const struct xfs_buf_ops *buf_ops;
ASSERT(mp->m_sb_bp == NULL);
ASSERT(mp->m_ddev_targp != NULL);
/*
+ * For the initial read, we must guess at the sector
+ * size based on the block device. It's enough to
+ * get the sb_sectsize out of the superblock and
+ * then reread with the proper length.
+ * We don't verify it yet, because it may not be complete.
+ */
+ sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
+ buf_ops = NULL;
+
+ /*
* Allocate a (locked) buffer to hold the superblock.
* This will be kept around at all times to optimize
* access to the superblock.
*/
- sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
-
reread:
bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
- BTOBB(sector_size), 0,
- loud ? &xfs_sb_buf_ops
- : &xfs_sb_quiet_buf_ops);
+ BTOBB(sector_size), 0, buf_ops);
if (!bp) {
if (loud)
xfs_warn(mp, "SB buffer read failed");
@@ -311,14 +314,28 @@ reread:
error = bp->b_error;
if (loud)
xfs_warn(mp, "SB validate failed with error %d.", error);
+ /* bad CRC means corrupted metadata */
+ if (error == EFSBADCRC)
+ error = EFSCORRUPTED;
goto release_buf;
}
/*
* Initialize the mount structure from the superblock.
*/
- xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
- xfs_sb_quota_from_disk(&mp->m_sb);
+ xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
+ xfs_sb_quota_from_disk(sbp);
+
+ /*
+ * If we haven't validated the superblock, do so now before we try
+ * to check the sector size and reread the superblock appropriately.
+ */
+ if (sbp->sb_magicnum != XFS_SB_MAGIC) {
+ if (loud)
+ xfs_warn(mp, "Invalid superblock magic number");
+ error = EINVAL;
+ goto release_buf;
+ }
/*
* We must be able to do sector-sized and sector-aligned IO.
@@ -331,13 +348,14 @@ reread:
goto release_buf;
}
- /*
- * If device sector size is smaller than the superblock size,
- * re-read the superblock so the buffer is correctly sized.
- */
- if (sector_size < sbp->sb_sectsize) {
+ if (buf_ops == NULL) {
+ /*
+ * Re-read the superblock so the buffer is correctly sized,
+ * and properly verified.
+ */
xfs_buf_relse(bp);
sector_size = sbp->sb_sectsize;
+ buf_ops = loud ? &xfs_sb_buf_ops : &xfs_sb_quiet_buf_ops;
goto reread;
}
@@ -690,6 +708,12 @@ xfs_mountfs(
mp->m_update_flags |= XFS_SB_VERSIONNUM;
}
+ /* always use v2 inodes by default now */
+ if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) {
+ mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
+ mp->m_update_flags |= XFS_SB_VERSIONNUM;
+ }
+
/*
* Check if sb_agblocks is aligned at stripe boundary
* If sb_agblocks is NOT aligned turn off m_dalign since
@@ -723,8 +747,20 @@ xfs_mountfs(
* Set the inode cluster size.
* This may still be overridden by the file system
* block size if it is larger than the chosen cluster size.
+ *
+ * For v5 filesystems, scale the cluster size with the inode size to
+ * keep a constant ratio of inode per cluster buffer, but only if mkfs
+ * has set the inode alignment value appropriately for larger cluster
+ * sizes.
*/
mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ int new_size = mp->m_inode_cluster_size;
+
+ new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
+ if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
+ mp->m_inode_cluster_size = new_size;
+ }
/*
* Set inode alignment fields
@@ -755,12 +791,11 @@ xfs_mountfs(
mp->m_dmevmask = 0; /* not persistent; set after each mount */
- xfs_dir_mount(mp);
-
- /*
- * Initialize the attribute manager's entries.
- */
- mp->m_attr_magicpct = (mp->m_sb.sb_blocksize * 37) / 100;
+ error = xfs_da_mount(mp);
+ if (error) {
+ xfs_warn(mp, "Failed dir/attr init: %d", error);
+ goto out_remove_uuid;
+ }
/*
* Initialize the precomputed transaction reservations values.
@@ -775,7 +810,7 @@ xfs_mountfs(
error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
if (error) {
xfs_warn(mp, "Failed per-ag init: %d", error);
- goto out_remove_uuid;
+ goto out_free_dir;
}
if (!sbp->sb_logblocks) {
@@ -950,6 +985,8 @@ xfs_mountfs(
xfs_wait_buftarg(mp->m_ddev_targp);
out_free_perag:
xfs_free_perag(mp);
+ out_free_dir:
+ xfs_da_unmount(mp);
out_remove_uuid:
xfs_uuid_unmount(mp);
out:
@@ -1027,6 +1064,7 @@ xfs_unmountfs(
"Freespace may not be correct on next mount.");
xfs_log_unmount(mp);
+ xfs_da_unmount(mp);
xfs_uuid_unmount(mp);
#if defined(DEBUG)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1fa0584b562..7295a0b7c34 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -26,6 +26,8 @@ struct xfs_mru_cache;
struct xfs_nameops;
struct xfs_ail;
struct xfs_quotainfo;
+struct xfs_dir_ops;
+struct xfs_da_geometry;
#ifdef HAVE_PERCPU_SB
@@ -95,6 +97,8 @@ typedef struct xfs_mount {
uint m_readio_blocks; /* min read size blocks */
uint m_writeio_log; /* min write size log bytes */
uint m_writeio_blocks; /* min write size blocks */
+ struct xfs_da_geometry *m_dir_geo; /* directory block geometry */
+ struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */
struct xlog *m_log; /* log specific stuff */
int m_logbufs; /* number of log buffers */
int m_logbsize; /* size of each log buffer */
@@ -111,7 +115,7 @@ typedef struct xfs_mount {
__uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
__uint8_t m_agno_log; /* log #ag's */
__uint8_t m_agino_log; /* #bits for agino in inum */
- __uint16_t m_inode_cluster_size;/* min inode buf size */
+ uint m_inode_cluster_size;/* min inode buf size */
uint m_blockmask; /* sb_blocksize-1 */
uint m_blockwsize; /* sb_blocksize in words */
uint m_blockwmask; /* blockwsize-1 */
@@ -130,8 +134,6 @@ typedef struct xfs_mount {
int m_fixedfsid[2]; /* unchanged for life of FS */
uint m_dmevmask; /* DMI events for this FS */
__uint64_t m_flags; /* global mount flags */
- uint m_dir_node_ents; /* #entries in a dir danode */
- uint m_attr_node_ents; /* #entries in attr danode */
int m_ialloc_inos; /* inodes in inode allocation */
int m_ialloc_blks; /* blocks in inode allocation */
int m_inoalign_mask;/* mask sb_inoalignmt if used */
@@ -144,15 +146,10 @@ typedef struct xfs_mount {
int m_dalign; /* stripe unit */
int m_swidth; /* stripe width */
int m_sinoalign; /* stripe unit inode alignment */
- int m_attr_magicpct;/* 37% of the blocksize */
- int m_dir_magicpct; /* 37% of the dir blocksize */
__uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */
- int m_dirblksize; /* directory block sz--bytes */
- int m_dirblkfsbs; /* directory block sz--fsbs */
- xfs_dablk_t m_dirdatablk; /* blockno of dir data v2 */
- xfs_dablk_t m_dirleafblk; /* blockno of dir non-data v2 */
- xfs_dablk_t m_dirfreeblk; /* blockno of dirfreeindex v2 */
+ const struct xfs_dir_ops *m_dir_inode_ops; /* vector of dir inode ops */
+ const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */
uint m_chsize; /* size of next field */
atomic_t m_active_trans; /* number trans frozen */
#ifdef HAVE_PERCPU_SB
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 4aff5639573..f99b4933dc2 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -100,14 +100,20 @@
* likely result in a loop in one of the lists. That's a sure-fire recipe for
* an infinite loop in the code.
*/
-typedef struct xfs_mru_cache_elem
-{
- struct list_head list_node;
- unsigned long key;
- void *value;
-} xfs_mru_cache_elem_t;
+struct xfs_mru_cache {
+ struct radix_tree_root store; /* Core storage data structure. */
+ struct list_head *lists; /* Array of lists, one per grp. */
+ struct list_head reap_list; /* Elements overdue for reaping. */
+ spinlock_t lock; /* Lock to protect this struct. */
+ unsigned int grp_count; /* Number of discrete groups. */
+ unsigned int grp_time; /* Time period spanned by grps. */
+ unsigned int lru_grp; /* Group containing time zero. */
+ unsigned long time_zero; /* Time first element was added. */
+ xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */
+ struct delayed_work work; /* Workqueue data for reaping. */
+ unsigned int queued; /* work has been queued */
+};
-static kmem_zone_t *xfs_mru_elem_zone;
static struct workqueue_struct *xfs_mru_reap_wq;
/*
@@ -129,12 +135,12 @@ static struct workqueue_struct *xfs_mru_reap_wq;
*/
STATIC unsigned long
_xfs_mru_cache_migrate(
- xfs_mru_cache_t *mru,
- unsigned long now)
+ struct xfs_mru_cache *mru,
+ unsigned long now)
{
- unsigned int grp;
- unsigned int migrated = 0;
- struct list_head *lru_list;
+ unsigned int grp;
+ unsigned int migrated = 0;
+ struct list_head *lru_list;
/* Nothing to do if the data store is empty. */
if (!mru->time_zero)
@@ -193,11 +199,11 @@ _xfs_mru_cache_migrate(
*/
STATIC void
_xfs_mru_cache_list_insert(
- xfs_mru_cache_t *mru,
- xfs_mru_cache_elem_t *elem)
+ struct xfs_mru_cache *mru,
+ struct xfs_mru_cache_elem *elem)
{
- unsigned int grp = 0;
- unsigned long now = jiffies;
+ unsigned int grp = 0;
+ unsigned long now = jiffies;
/*
* If the data store is empty, initialise time zero, leave grp set to
@@ -231,10 +237,10 @@ _xfs_mru_cache_list_insert(
*/
STATIC void
_xfs_mru_cache_clear_reap_list(
- xfs_mru_cache_t *mru) __releases(mru->lock) __acquires(mru->lock)
-
+ struct xfs_mru_cache *mru)
+ __releases(mru->lock) __acquires(mru->lock)
{
- xfs_mru_cache_elem_t *elem, *next;
+ struct xfs_mru_cache_elem *elem, *next;
struct list_head tmp;
INIT_LIST_HEAD(&tmp);
@@ -252,15 +258,8 @@ _xfs_mru_cache_clear_reap_list(
spin_unlock(&mru->lock);
list_for_each_entry_safe(elem, next, &tmp, list_node) {
-
- /* Remove the element from the reap list. */
list_del_init(&elem->list_node);
-
- /* Call the client's free function with the key and value pointer. */
- mru->free_func(elem->key, elem->value);
-
- /* Free the element structure. */
- kmem_zone_free(xfs_mru_elem_zone, elem);
+ mru->free_func(elem);
}
spin_lock(&mru->lock);
@@ -277,7 +276,8 @@ STATIC void
_xfs_mru_cache_reap(
struct work_struct *work)
{
- xfs_mru_cache_t *mru = container_of(work, xfs_mru_cache_t, work.work);
+ struct xfs_mru_cache *mru =
+ container_of(work, struct xfs_mru_cache, work.work);
unsigned long now, next;
ASSERT(mru && mru->lists);
@@ -304,28 +304,16 @@ _xfs_mru_cache_reap(
int
xfs_mru_cache_init(void)
{
- xfs_mru_elem_zone = kmem_zone_init(sizeof(xfs_mru_cache_elem_t),
- "xfs_mru_cache_elem");
- if (!xfs_mru_elem_zone)
- goto out;
-
xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1);
if (!xfs_mru_reap_wq)
- goto out_destroy_mru_elem_zone;
-
+ return -ENOMEM;
return 0;
-
- out_destroy_mru_elem_zone:
- kmem_zone_destroy(xfs_mru_elem_zone);
- out:
- return -ENOMEM;
}
void
xfs_mru_cache_uninit(void)
{
destroy_workqueue(xfs_mru_reap_wq);
- kmem_zone_destroy(xfs_mru_elem_zone);
}
/*
@@ -336,14 +324,14 @@ xfs_mru_cache_uninit(void)
*/
int
xfs_mru_cache_create(
- xfs_mru_cache_t **mrup,
+ struct xfs_mru_cache **mrup,
unsigned int lifetime_ms,
unsigned int grp_count,
xfs_mru_cache_free_func_t free_func)
{
- xfs_mru_cache_t *mru = NULL;
- int err = 0, grp;
- unsigned int grp_time;
+ struct xfs_mru_cache *mru = NULL;
+ int err = 0, grp;
+ unsigned int grp_time;
if (mrup)
*mrup = NULL;
@@ -400,7 +388,7 @@ exit:
*/
static void
xfs_mru_cache_flush(
- xfs_mru_cache_t *mru)
+ struct xfs_mru_cache *mru)
{
if (!mru || !mru->lists)
return;
@@ -420,7 +408,7 @@ xfs_mru_cache_flush(
void
xfs_mru_cache_destroy(
- xfs_mru_cache_t *mru)
+ struct xfs_mru_cache *mru)
{
if (!mru || !mru->lists)
return;
@@ -438,38 +426,30 @@ xfs_mru_cache_destroy(
*/
int
xfs_mru_cache_insert(
- xfs_mru_cache_t *mru,
- unsigned long key,
- void *value)
+ struct xfs_mru_cache *mru,
+ unsigned long key,
+ struct xfs_mru_cache_elem *elem)
{
- xfs_mru_cache_elem_t *elem;
+ int error;
ASSERT(mru && mru->lists);
if (!mru || !mru->lists)
return EINVAL;
- elem = kmem_zone_zalloc(xfs_mru_elem_zone, KM_SLEEP);
- if (!elem)
+ if (radix_tree_preload(GFP_KERNEL))
return ENOMEM;
- if (radix_tree_preload(GFP_KERNEL)) {
- kmem_zone_free(xfs_mru_elem_zone, elem);
- return ENOMEM;
- }
-
INIT_LIST_HEAD(&elem->list_node);
elem->key = key;
- elem->value = value;
spin_lock(&mru->lock);
-
- radix_tree_insert(&mru->store, key, elem);
+ error = -radix_tree_insert(&mru->store, key, elem);
radix_tree_preload_end();
- _xfs_mru_cache_list_insert(mru, elem);
-
+ if (!error)
+ _xfs_mru_cache_list_insert(mru, elem);
spin_unlock(&mru->lock);
- return 0;
+ return error;
}
/*
@@ -478,13 +458,12 @@ xfs_mru_cache_insert(
* the client data pointer for the removed element is returned, otherwise this
* function will return a NULL pointer.
*/
-void *
+struct xfs_mru_cache_elem *
xfs_mru_cache_remove(
- xfs_mru_cache_t *mru,
- unsigned long key)
+ struct xfs_mru_cache *mru,
+ unsigned long key)
{
- xfs_mru_cache_elem_t *elem;
- void *value = NULL;
+ struct xfs_mru_cache_elem *elem;
ASSERT(mru && mru->lists);
if (!mru || !mru->lists)
@@ -492,17 +471,11 @@ xfs_mru_cache_remove(
spin_lock(&mru->lock);
elem = radix_tree_delete(&mru->store, key);
- if (elem) {
- value = elem->value;
+ if (elem)
list_del(&elem->list_node);
- }
-
spin_unlock(&mru->lock);
- if (elem)
- kmem_zone_free(xfs_mru_elem_zone, elem);
-
- return value;
+ return elem;
}
/*
@@ -511,13 +484,14 @@ xfs_mru_cache_remove(
*/
void
xfs_mru_cache_delete(
- xfs_mru_cache_t *mru,
- unsigned long key)
+ struct xfs_mru_cache *mru,
+ unsigned long key)
{
- void *value = xfs_mru_cache_remove(mru, key);
+ struct xfs_mru_cache_elem *elem;
- if (value)
- mru->free_func(key, value);
+ elem = xfs_mru_cache_remove(mru, key);
+ if (elem)
+ mru->free_func(elem);
}
/*
@@ -540,12 +514,12 @@ xfs_mru_cache_delete(
* status, we need to help it get it right by annotating the path that does
* not release the lock.
*/
-void *
+struct xfs_mru_cache_elem *
xfs_mru_cache_lookup(
- xfs_mru_cache_t *mru,
- unsigned long key)
+ struct xfs_mru_cache *mru,
+ unsigned long key)
{
- xfs_mru_cache_elem_t *elem;
+ struct xfs_mru_cache_elem *elem;
ASSERT(mru && mru->lists);
if (!mru || !mru->lists)
@@ -560,7 +534,7 @@ xfs_mru_cache_lookup(
} else
spin_unlock(&mru->lock);
- return elem ? elem->value : NULL;
+ return elem;
}
/*
@@ -570,7 +544,8 @@ xfs_mru_cache_lookup(
*/
void
xfs_mru_cache_done(
- xfs_mru_cache_t *mru) __releases(mru->lock)
+ struct xfs_mru_cache *mru)
+ __releases(mru->lock)
{
spin_unlock(&mru->lock);
}
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h
index 36dd3ec8b4e..fb5245ba5ff 100644
--- a/fs/xfs/xfs_mru_cache.h
+++ b/fs/xfs/xfs_mru_cache.h
@@ -18,24 +18,15 @@
#ifndef __XFS_MRU_CACHE_H__
#define __XFS_MRU_CACHE_H__
+struct xfs_mru_cache;
-/* Function pointer type for callback to free a client's data pointer. */
-typedef void (*xfs_mru_cache_free_func_t)(unsigned long, void*);
+struct xfs_mru_cache_elem {
+ struct list_head list_node;
+ unsigned long key;
+};
-typedef struct xfs_mru_cache
-{
- struct radix_tree_root store; /* Core storage data structure. */
- struct list_head *lists; /* Array of lists, one per grp. */
- struct list_head reap_list; /* Elements overdue for reaping. */
- spinlock_t lock; /* Lock to protect this struct. */
- unsigned int grp_count; /* Number of discrete groups. */
- unsigned int grp_time; /* Time period spanned by grps. */
- unsigned int lru_grp; /* Group containing time zero. */
- unsigned long time_zero; /* Time first element was added. */
- xfs_mru_cache_free_func_t free_func; /* Function pointer for freeing. */
- struct delayed_work work; /* Workqueue data for reaping. */
- unsigned int queued; /* work has been queued */
-} xfs_mru_cache_t;
+/* Function pointer type for callback to free a client's data pointer. */
+typedef void (*xfs_mru_cache_free_func_t)(struct xfs_mru_cache_elem *elem);
int xfs_mru_cache_init(void);
void xfs_mru_cache_uninit(void);
@@ -44,10 +35,12 @@ int xfs_mru_cache_create(struct xfs_mru_cache **mrup, unsigned int lifetime_ms,
xfs_mru_cache_free_func_t free_func);
void xfs_mru_cache_destroy(struct xfs_mru_cache *mru);
int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key,
- void *value);
-void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key);
+ struct xfs_mru_cache_elem *elem);
+struct xfs_mru_cache_elem *
+xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key);
void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key);
-void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key);
+struct xfs_mru_cache_elem *
+xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key);
void xfs_mru_cache_done(struct xfs_mru_cache *mru);
#endif /* __XFS_MRU_CACHE_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 3e6c2e6c9cd..6d26759c779 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -17,31 +17,28 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_ialloc.h"
#include "xfs_itable.h"
-#include "xfs_rtalloc.h"
+#include "xfs_quota.h"
#include "xfs_error.h"
#include "xfs_bmap.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans.h"
#include "xfs_trans_space.h"
#include "xfs_qm.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_cksum.h"
+#include "xfs_dinode.h"
/*
* The global quota manager. There is only one of these for the entire
@@ -137,8 +134,6 @@ xfs_qm_dqpurge(
{
struct xfs_mount *mp = dqp->q_mount;
struct xfs_quotainfo *qi = mp->m_quotainfo;
- struct xfs_dquot *gdqp = NULL;
- struct xfs_dquot *pdqp = NULL;
xfs_dqlock(dqp);
if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
@@ -146,21 +141,6 @@ xfs_qm_dqpurge(
return EAGAIN;
}
- /*
- * If this quota has a hint attached, prepare for releasing it now.
- */
- gdqp = dqp->q_gdquot;
- if (gdqp) {
- xfs_dqlock(gdqp);
- dqp->q_gdquot = NULL;
- }
-
- pdqp = dqp->q_pdquot;
- if (pdqp) {
- xfs_dqlock(pdqp);
- dqp->q_pdquot = NULL;
- }
-
dqp->dq_flags |= XFS_DQ_FREEING;
xfs_dqflock(dqp);
@@ -209,11 +189,6 @@ xfs_qm_dqpurge(
XFS_STATS_DEC(xs_qm_dquot_unused);
xfs_qm_dqdestroy(dqp);
-
- if (gdqp)
- xfs_qm_dqput(gdqp);
- if (pdqp)
- xfs_qm_dqput(pdqp);
return 0;
}
@@ -383,7 +358,6 @@ xfs_qm_dqattach_one(
xfs_dqid_t id,
uint type,
uint doalloc,
- xfs_dquot_t *udqhint, /* hint */
xfs_dquot_t **IO_idqpp)
{
xfs_dquot_t *dqp;
@@ -393,9 +367,9 @@ xfs_qm_dqattach_one(
error = 0;
/*
- * See if we already have it in the inode itself. IO_idqpp is
- * &i_udquot or &i_gdquot. This made the code look weird, but
- * made the logic a lot simpler.
+ * See if we already have it in the inode itself. IO_idqpp is &i_udquot
+ * or &i_gdquot. This made the code look weird, but made the logic a lot
+ * simpler.
*/
dqp = *IO_idqpp;
if (dqp) {
@@ -404,49 +378,10 @@ xfs_qm_dqattach_one(
}
/*
- * udqhint is the i_udquot field in inode, and is non-NULL only
- * when the type arg is group/project. Its purpose is to save a
- * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside
- * the user dquot.
- */
- if (udqhint) {
- ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
- xfs_dqlock(udqhint);
-
- /*
- * No need to take dqlock to look at the id.
- *
- * The ID can't change until it gets reclaimed, and it won't
- * be reclaimed as long as we have a ref from inode and we
- * hold the ilock.
- */
- if (type == XFS_DQ_GROUP)
- dqp = udqhint->q_gdquot;
- else
- dqp = udqhint->q_pdquot;
- if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) {
- ASSERT(*IO_idqpp == NULL);
-
- *IO_idqpp = xfs_qm_dqhold(dqp);
- xfs_dqunlock(udqhint);
- return 0;
- }
-
- /*
- * We can't hold a dquot lock when we call the dqget code.
- * We'll deadlock in no time, because of (not conforming to)
- * lock ordering - the inodelock comes before any dquot lock,
- * and we may drop and reacquire the ilock in xfs_qm_dqget().
- */
- xfs_dqunlock(udqhint);
- }
-
- /*
- * Find the dquot from somewhere. This bumps the
- * reference count of dquot and returns it locked.
- * This can return ENOENT if dquot didn't exist on
- * disk and we didn't ask it to allocate;
- * ESRCH if quotas got turned off suddenly.
+ * Find the dquot from somewhere. This bumps the reference count of
+ * dquot and returns it locked. This can return ENOENT if dquot didn't
+ * exist on disk and we didn't ask it to allocate; ESRCH if quotas got
+ * turned off suddenly.
*/
error = xfs_qm_dqget(ip->i_mount, ip, id, type,
doalloc | XFS_QMOPT_DOWARN, &dqp);
@@ -464,48 +399,6 @@ xfs_qm_dqattach_one(
return 0;
}
-
-/*
- * Given a udquot and group/project type, attach the group/project
- * dquot pointer to the udquot as a hint for future lookups.
- */
-STATIC void
-xfs_qm_dqattach_hint(
- struct xfs_inode *ip,
- int type)
-{
- struct xfs_dquot **dqhintp;
- struct xfs_dquot *dqp;
- struct xfs_dquot *udq = ip->i_udquot;
-
- ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
-
- xfs_dqlock(udq);
-
- if (type == XFS_DQ_GROUP) {
- dqp = ip->i_gdquot;
- dqhintp = &udq->q_gdquot;
- } else {
- dqp = ip->i_pdquot;
- dqhintp = &udq->q_pdquot;
- }
-
- if (*dqhintp) {
- struct xfs_dquot *tmp;
-
- if (*dqhintp == dqp)
- goto done;
-
- tmp = *dqhintp;
- *dqhintp = NULL;
- xfs_qm_dqrele(tmp);
- }
-
- *dqhintp = xfs_qm_dqhold(dqp);
-done:
- xfs_dqunlock(udq);
-}
-
static bool
xfs_qm_need_dqattach(
struct xfs_inode *ip)
@@ -536,7 +429,6 @@ xfs_qm_dqattach_locked(
uint flags)
{
xfs_mount_t *mp = ip->i_mount;
- uint nquotas = 0;
int error = 0;
if (!xfs_qm_need_dqattach(ip))
@@ -544,77 +436,39 @@ xfs_qm_dqattach_locked(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (XFS_IS_UQUOTA_ON(mp)) {
+ if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
flags & XFS_QMOPT_DQALLOC,
- NULL, &ip->i_udquot);
+ &ip->i_udquot);
if (error)
goto done;
- nquotas++;
+ ASSERT(ip->i_udquot);
}
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (XFS_IS_GQUOTA_ON(mp)) {
+ if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) {
error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
flags & XFS_QMOPT_DQALLOC,
- ip->i_udquot, &ip->i_gdquot);
- /*
- * Don't worry about the udquot that we may have
- * attached above. It'll get detached, if not already.
- */
+ &ip->i_gdquot);
if (error)
goto done;
- nquotas++;
+ ASSERT(ip->i_gdquot);
}
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- if (XFS_IS_PQUOTA_ON(mp)) {
+ if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) {
error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
flags & XFS_QMOPT_DQALLOC,
- ip->i_udquot, &ip->i_pdquot);
- /*
- * Don't worry about the udquot that we may have
- * attached above. It'll get detached, if not already.
- */
+ &ip->i_pdquot);
if (error)
goto done;
- nquotas++;
+ ASSERT(ip->i_pdquot);
}
+done:
/*
- * Attach this group/project quota to the user quota as a hint.
- * This WON'T, in general, result in a thrash.
+ * Don't worry about the dquots that we may have attached before any
+ * error - they'll get detached later if it has not already been done.
*/
- if (nquotas > 1 && ip->i_udquot) {
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(ip->i_gdquot || !XFS_IS_GQUOTA_ON(mp));
- ASSERT(ip->i_pdquot || !XFS_IS_PQUOTA_ON(mp));
-
- /*
- * We do not have i_udquot locked at this point, but this check
- * is OK since we don't depend on the i_gdquot to be accurate
- * 100% all the time. It is just a hint, and this will
- * succeed in general.
- */
- if (ip->i_udquot->q_gdquot != ip->i_gdquot)
- xfs_qm_dqattach_hint(ip, XFS_DQ_GROUP);
-
- if (ip->i_udquot->q_pdquot != ip->i_pdquot)
- xfs_qm_dqattach_hint(ip, XFS_DQ_PROJ);
- }
-
- done:
-#ifdef DEBUG
- if (!error) {
- if (XFS_IS_UQUOTA_ON(mp))
- ASSERT(ip->i_udquot);
- if (XFS_IS_GQUOTA_ON(mp))
- ASSERT(ip->i_gdquot);
- if (XFS_IS_PQUOTA_ON(mp))
- ASSERT(ip->i_pdquot);
- }
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-#endif
return error;
}
@@ -664,20 +518,6 @@ xfs_qm_dqdetach(
}
}
-int
-xfs_qm_calc_dquots_per_chunk(
- struct xfs_mount *mp,
- unsigned int nbblks) /* basic block units */
-{
- unsigned int ndquots;
-
- ASSERT(nbblks > 0);
- ndquots = BBTOB(nbblks);
- do_div(ndquots, sizeof(xfs_dqblk_t));
-
- return ndquots;
-}
-
struct xfs_qm_isolate {
struct list_head buffers;
struct list_head dispose;
@@ -831,22 +671,17 @@ xfs_qm_init_quotainfo(
qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
- if ((error = list_lru_init(&qinf->qi_lru))) {
- kmem_free(qinf);
- mp->m_quotainfo = NULL;
- return error;
- }
+ error = -list_lru_init(&qinf->qi_lru);
+ if (error)
+ goto out_free_qinf;
/*
* See if quotainodes are setup, and if not, allocate them,
* and change the superblock accordingly.
*/
- if ((error = xfs_qm_init_quotainos(mp))) {
- list_lru_destroy(&qinf->qi_lru);
- kmem_free(qinf);
- mp->m_quotainfo = NULL;
- return error;
- }
+ error = xfs_qm_init_quotainos(mp);
+ if (error)
+ goto out_free_lru;
INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
@@ -858,8 +693,7 @@ xfs_qm_init_quotainfo(
/* Precalc some constants */
qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
- qinf->qi_dqperchunk = xfs_qm_calc_dquots_per_chunk(mp,
- qinf->qi_dqchunklen);
+ qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen);
mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
@@ -906,7 +740,7 @@ xfs_qm_init_quotainfo(
qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
-
+
xfs_qm_dqdestroy(dqp);
} else {
qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
@@ -923,6 +757,13 @@ xfs_qm_init_quotainfo(
qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
register_shrinker(&qinf->qi_shrinker);
return 0;
+
+out_free_lru:
+ list_lru_destroy(&qinf->qi_lru);
+out_free_qinf:
+ kmem_free(qinf);
+ mp->m_quotainfo = NULL;
+ return error;
}
@@ -1092,10 +933,10 @@ xfs_qm_reset_dqcounts(
/*
* Do a sanity check, and if needed, repair the dqblk. Don't
* output any warnings because it's perfectly possible to
- * find uninitialised dquot blks. See comment in xfs_qm_dqcheck.
+ * find uninitialised dquot blks. See comment in xfs_dqcheck.
*/
- (void) xfs_qm_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
- "xfs_quotacheck");
+ xfs_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
+ "xfs_quotacheck");
ddq->d_bcount = 0;
ddq->d_icount = 0;
ddq->d_rtbcount = 0;
@@ -1210,16 +1051,18 @@ xfs_qm_dqiterate(
lblkno = 0;
maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
do {
+ uint lock_mode;
+
nmaps = XFS_DQITER_MAP_SIZE;
/*
* We aren't changing the inode itself. Just changing
* some of its data. No new blocks are added here, and
* the inode is never added to the transaction.
*/
- xfs_ilock(qip, XFS_ILOCK_SHARED);
+ lock_mode = xfs_ilock_data_map_shared(qip);
error = xfs_bmapi_read(qip, lblkno, maxlblkcnt - lblkno,
map, &nmaps, 0);
- xfs_iunlock(qip, XFS_ILOCK_SHARED);
+ xfs_iunlock(qip, lock_mode);
if (error)
break;
@@ -2099,24 +1942,21 @@ xfs_qm_vop_create_dqattach(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
- if (udqp) {
+ if (udqp && XFS_IS_UQUOTA_ON(mp)) {
ASSERT(ip->i_udquot == NULL);
- ASSERT(XFS_IS_UQUOTA_ON(mp));
ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
ip->i_udquot = xfs_qm_dqhold(udqp);
xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
}
- if (gdqp) {
+ if (gdqp && XFS_IS_GQUOTA_ON(mp)) {
ASSERT(ip->i_gdquot == NULL);
- ASSERT(XFS_IS_GQUOTA_ON(mp));
ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id));
ip->i_gdquot = xfs_qm_dqhold(gdqp);
xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
}
- if (pdqp) {
+ if (pdqp && XFS_IS_PQUOTA_ON(mp)) {
ASSERT(ip->i_pdquot == NULL);
- ASSERT(XFS_IS_PQUOTA_ON(mp));
ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id));
ip->i_pdquot = xfs_qm_dqhold(pdqp);
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 2b602df9c24..797fd463627 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -20,13 +20,29 @@
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
-#include "xfs_quota_priv.h"
struct xfs_inode;
extern struct kmem_zone *xfs_qm_dqtrxzone;
/*
+ * Number of bmaps that we ask from bmapi when doing a quotacheck.
+ * We make this restriction to keep the memory usage to a minimum.
+ */
+#define XFS_DQITER_MAP_SIZE 10
+
+#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
+ !dqp->q_core.d_blk_hardlimit && \
+ !dqp->q_core.d_blk_softlimit && \
+ !dqp->q_core.d_rtb_hardlimit && \
+ !dqp->q_core.d_rtb_softlimit && \
+ !dqp->q_core.d_ino_hardlimit && \
+ !dqp->q_core.d_ino_softlimit && \
+ !dqp->q_core.d_bcount && \
+ !dqp->q_core.d_rtbcount && \
+ !dqp->q_core.d_icount)
+
+/*
* This defines the unit of allocation of dquots.
* Currently, it is just one file system block, and a 4K blk contains 30
* (136 * 30 = 4080) dquots. It's probably not worth trying to make
@@ -103,8 +119,6 @@ xfs_dq_to_quota_inode(struct xfs_dquot *dqp)
return NULL;
}
-extern int xfs_qm_calc_dquots_per_chunk(struct xfs_mount *mp,
- unsigned int nbblks);
extern void xfs_trans_mod_dquot(struct xfs_trans *,
struct xfs_dquot *, uint, long);
extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 3af50ccdfac..e9be63abd8d 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -18,21 +18,15 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_alloc.h"
#include "xfs_quota.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
-#include "xfs_itable.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
#include "xfs_error.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
+#include "xfs_trans.h"
#include "xfs_qm.h"
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 8174aad0b38..bbc813caba4 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -20,24 +20,18 @@
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_itable.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
+#include "xfs_trans.h"
#include "xfs_error.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
+#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
@@ -284,22 +278,29 @@ xfs_qm_scall_trunc_qfiles(
xfs_mount_t *mp,
uint flags)
{
- int error = 0, error2 = 0;
+ int error = EINVAL;
- if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
- xfs_debug(mp, "%s: flags=%x m_qflags=%x\n",
+ if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 ||
+ (flags & ~XFS_DQ_ALLTYPES)) {
+ xfs_debug(mp, "%s: flags=%x m_qflags=%x",
__func__, flags, mp->m_qflags);
return XFS_ERROR(EINVAL);
}
- if (flags & XFS_DQ_USER)
+ if (flags & XFS_DQ_USER) {
error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
- if (flags & XFS_DQ_GROUP)
- error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
+ if (error)
+ return error;
+ }
+ if (flags & XFS_DQ_GROUP) {
+ error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
+ if (error)
+ return error;
+ }
if (flags & XFS_DQ_PROJ)
- error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
+ error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_pquotino);
- return error ? error : error2;
+ return error;
}
/*
@@ -325,7 +326,7 @@ xfs_qm_scall_quotaon(
sbflags = 0;
if (flags == 0) {
- xfs_debug(mp, "%s: zero flags, m_qflags=%x\n",
+ xfs_debug(mp, "%s: zero flags, m_qflags=%x",
__func__, mp->m_qflags);
return XFS_ERROR(EINVAL);
}
@@ -348,7 +349,7 @@ xfs_qm_scall_quotaon(
(mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
(flags & XFS_PQUOTA_ENFD))) {
xfs_debug(mp,
- "%s: Can't enforce without acct, flags=%x sbflags=%x\n",
+ "%s: Can't enforce without acct, flags=%x sbflags=%x",
__func__, flags, mp->m_sb.sb_qflags);
return XFS_ERROR(EINVAL);
}
@@ -648,7 +649,7 @@ xfs_qm_scall_setqlim(
q->qi_bsoftlimit = soft;
}
} else {
- xfs_debug(mp, "blkhard %Ld < blksoft %Ld\n", hard, soft);
+ xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft);
}
hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
(xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
@@ -664,7 +665,7 @@ xfs_qm_scall_setqlim(
q->qi_rtbsoftlimit = soft;
}
} else {
- xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
+ xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft);
}
hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
@@ -681,7 +682,7 @@ xfs_qm_scall_setqlim(
q->qi_isoftlimit = soft;
}
} else {
- xfs_debug(mp, "ihard %Ld < isoft %Ld\n", hard, soft);
+ xfs_debug(mp, "ihard %Ld < isoft %Ld", hard, soft);
}
/*
@@ -959,7 +960,6 @@ xfs_qm_export_flags(
STATIC int
xfs_dqrele_inode(
struct xfs_inode *ip,
- struct xfs_perag *pag,
int flags,
void *args)
{
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index e7d84d2d868..5376dd406ba 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -150,10 +150,6 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp,
xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, pd, nb, ni, \
f | XFS_QMOPT_RES_REGBLKS)
-extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *,
- xfs_dqid_t, uint, uint, char *);
extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
-extern const struct xfs_buf_ops xfs_dquot_buf_ops;
-
#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_quota_defs.h b/fs/xfs/xfs_quota_defs.h
index e6b0d6e1f4f..137e2093707 100644
--- a/fs/xfs/xfs_quota_defs.h
+++ b/fs/xfs/xfs_quota_defs.h
@@ -154,4 +154,8 @@ typedef __uint16_t xfs_qwarncnt_t;
(XFS_QMOPT_UQUOTA | XFS_QMOPT_PQUOTA | XFS_QMOPT_GQUOTA)
#define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
+extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq,
+ xfs_dqid_t id, uint type, uint flags, char *str);
+extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
+
#endif /* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h
deleted file mode 100644
index 6d86219d93d..00000000000
--- a/fs/xfs/xfs_quota_priv.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_QUOTA_PRIV_H__
-#define __XFS_QUOTA_PRIV_H__
-
-/*
- * Number of bmaps that we ask from bmapi when doing a quotacheck.
- * We make this restriction to keep the memory usage to a minimum.
- */
-#define XFS_DQITER_MAP_SIZE 10
-
-#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
- !dqp->q_core.d_blk_hardlimit && \
- !dqp->q_core.d_blk_softlimit && \
- !dqp->q_core.d_rtb_hardlimit && \
- !dqp->q_core.d_rtb_softlimit && \
- !dqp->q_core.d_ino_hardlimit && \
- !dqp->q_core.d_ino_softlimit && \
- !dqp->q_core.d_bcount && \
- !dqp->q_core.d_rtbcount && \
- !dqp->q_core.d_icount)
-
-#define DQFLAGTO_TYPESTR(d) (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
- (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
- (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
-
-#endif /* __XFS_QUOTA_PRIV_H__ */
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 1326d81596c..2ad1b9822e9 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -17,15 +17,14 @@
*/
#include "xfs.h"
#include "xfs_format.h"
+#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_log.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_inode.h"
#include "xfs_quota.h"
#include "xfs_trans.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
#include "xfs_qm.h"
#include <linux/quota.h>
@@ -101,16 +100,36 @@ xfs_fs_set_xstate(
if (!XFS_IS_QUOTA_ON(mp))
return -EINVAL;
return -xfs_qm_scall_quotaoff(mp, flags);
- case Q_XQUOTARM:
- if (XFS_IS_QUOTA_ON(mp))
- return -EINVAL;
- return -xfs_qm_scall_trunc_qfiles(mp, flags);
}
return -EINVAL;
}
STATIC int
+xfs_fs_rm_xquota(
+ struct super_block *sb,
+ unsigned int uflags)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+ unsigned int flags = 0;
+
+ if (sb->s_flags & MS_RDONLY)
+ return -EROFS;
+
+ if (XFS_IS_QUOTA_ON(mp))
+ return -EINVAL;
+
+ if (uflags & FS_USER_QUOTA)
+ flags |= XFS_DQ_USER;
+ if (uflags & FS_GROUP_QUOTA)
+ flags |= XFS_DQ_GROUP;
+ if (uflags & FS_USER_QUOTA)
+ flags |= XFS_DQ_PROJ;
+
+ return -xfs_qm_scall_trunc_qfiles(mp, flags);
+}
+
+STATIC int
xfs_fs_get_dqblk(
struct super_block *sb,
struct kqid qid,
@@ -150,6 +169,7 @@ const struct quotactl_ops xfs_quotactl_operations = {
.get_xstatev = xfs_fs_get_xstatev,
.get_xstate = xfs_fs_get_xstate,
.set_xstate = xfs_fs_set_xstate,
+ .rm_xquota = xfs_fs_rm_xquota,
.get_dqblk = xfs_fs_get_dqblk,
.set_dqblk = xfs_fs_set_dqblk,
};
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6f9e63c9fc2..ec5ca65c621 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -17,172 +17,260 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
-#include "xfs_rtalloc.h"
-#include "xfs_fsops.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc.h"
#include "xfs_error.h"
-#include "xfs_inode_item.h"
+#include "xfs_trans.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
#include "xfs_buf.h"
#include "xfs_icache.h"
+#include "xfs_dinode.h"
+#include "xfs_rtalloc.h"
/*
- * Prototypes for internal functions.
+ * Read and return the summary information for a given extent size,
+ * bitmap block combination.
+ * Keeps track of a current summary block, so we don't keep reading
+ * it from the buffer cache.
*/
+STATIC int /* error */
+xfs_rtget_summary(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ int log, /* log2 of extent size */
+ xfs_rtblock_t bbno, /* bitmap block number */
+ xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb, /* in/out: summary block number */
+ xfs_suminfo_t *sum) /* out: summary info for this block */
+{
+ xfs_buf_t *bp; /* buffer for summary block */
+ int error; /* error value */
+ xfs_fsblock_t sb; /* summary fsblock */
+ int so; /* index into the summary file */
+ xfs_suminfo_t *sp; /* pointer to returned data */
+ /*
+ * Compute entry number in the summary file.
+ */
+ so = XFS_SUMOFFS(mp, log, bbno);
+ /*
+ * Compute the block number in the summary file.
+ */
+ sb = XFS_SUMOFFSTOBLOCK(mp, so);
+ /*
+ * If we have an old buffer, and the block number matches, use that.
+ */
+ if (rbpp && *rbpp && *rsb == sb)
+ bp = *rbpp;
+ /*
+ * Otherwise we have to get the buffer.
+ */
+ else {
+ /*
+ * If there was an old one, get rid of it first.
+ */
+ if (rbpp && *rbpp)
+ xfs_trans_brelse(tp, *rbpp);
+ error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
+ if (error) {
+ return error;
+ }
+ /*
+ * Remember this buffer and block for the next call.
+ */
+ if (rbpp) {
+ *rbpp = bp;
+ *rsb = sb;
+ }
+ }
+ /*
+ * Point to the summary information & copy it out.
+ */
+ sp = XFS_SUMPTR(mp, bp, so);
+ *sum = *sp;
+ /*
+ * Drop the buffer if we're not asked to remember it.
+ */
+ if (!rbpp)
+ xfs_trans_brelse(tp, bp);
+ return 0;
+}
-STATIC int xfs_rtallocate_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
- xfs_extlen_t, xfs_buf_t **, xfs_fsblock_t *);
-STATIC int xfs_rtany_summary(xfs_mount_t *, xfs_trans_t *, int, int,
- xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, int *);
-STATIC int xfs_rtcheck_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
- xfs_extlen_t, int, xfs_rtblock_t *, int *);
-STATIC int xfs_rtfind_back(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
- xfs_rtblock_t, xfs_rtblock_t *);
-STATIC int xfs_rtfind_forw(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
- xfs_rtblock_t, xfs_rtblock_t *);
-STATIC int xfs_rtget_summary( xfs_mount_t *, xfs_trans_t *, int,
- xfs_rtblock_t, xfs_buf_t **, xfs_fsblock_t *, xfs_suminfo_t *);
-STATIC int xfs_rtmodify_range(xfs_mount_t *, xfs_trans_t *, xfs_rtblock_t,
- xfs_extlen_t, int);
-STATIC int xfs_rtmodify_summary(xfs_mount_t *, xfs_trans_t *, int,
- xfs_rtblock_t, int, xfs_buf_t **, xfs_fsblock_t *);
-
-/*
- * Internal functions.
- */
/*
- * Allocate space to the bitmap or summary file, and zero it, for growfs.
+ * Return whether there are any free extents in the size range given
+ * by low and high, for the bitmap block bbno.
*/
STATIC int /* error */
-xfs_growfs_rt_alloc(
- xfs_mount_t *mp, /* file system mount point */
- xfs_extlen_t oblocks, /* old count of blocks */
- xfs_extlen_t nblocks, /* new count of blocks */
- xfs_inode_t *ip) /* inode (bitmap/summary) */
+xfs_rtany_summary(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ int low, /* low log2 extent size */
+ int high, /* high log2 extent size */
+ xfs_rtblock_t bbno, /* bitmap block number */
+ xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb, /* in/out: summary block number */
+ int *stat) /* out: any good extents here? */
{
- xfs_fileoff_t bno; /* block number in file */
- xfs_buf_t *bp; /* temporary buffer for zeroing */
- int committed; /* transaction committed flag */
- xfs_daddr_t d; /* disk block address */
- int error; /* error return value */
- xfs_fsblock_t firstblock; /* first block allocated in xaction */
- xfs_bmap_free_t flist; /* list of freed blocks */
- xfs_fsblock_t fsbno; /* filesystem block for bno */
- xfs_bmbt_irec_t map; /* block map output */
- int nmap; /* number of block maps */
- int resblks; /* space reservation */
+ int error; /* error value */
+ int log; /* loop counter, log2 of ext. size */
+ xfs_suminfo_t sum; /* summary data */
/*
- * Allocate space to the file, as necessary.
+ * Loop over logs of extent sizes. Order is irrelevant.
*/
- while (oblocks < nblocks) {
- int cancelflags = 0;
- xfs_trans_t *tp;
-
- tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
- resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
+ for (log = low; log <= high; log++) {
/*
- * Reserve space & log for one extent added to the file.
+ * Get one summary datum.
*/
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata,
- resblks, 0);
- if (error)
- goto error_cancel;
- cancelflags = XFS_TRANS_RELEASE_LOG_RES;
+ error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum);
+ if (error) {
+ return error;
+ }
/*
- * Lock the inode.
+ * If there are any, return success.
*/
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ if (sum) {
+ *stat = 1;
+ return 0;
+ }
+ }
+ /*
+ * Found nothing, return failure.
+ */
+ *stat = 0;
+ return 0;
+}
- xfs_bmap_init(&flist, &firstblock);
- /*
- * Allocate blocks to the bitmap file.
- */
- nmap = 1;
- cancelflags |= XFS_TRANS_ABORT;
- error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
- XFS_BMAPI_METADATA, &firstblock,
- resblks, &map, &nmap, &flist);
- if (!error && nmap < 1)
- error = XFS_ERROR(ENOSPC);
- if (error)
- goto error_cancel;
- /*
- * Free any blocks freed up in the transaction, then commit.
- */
- error = xfs_bmap_finish(&tp, &flist, &committed);
- if (error)
- goto error_cancel;
- error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
- if (error)
- goto error;
- /*
- * Now we need to clear the allocated blocks.
- * Do this one block per transaction, to keep it simple.
- */
- cancelflags = 0;
- for (bno = map.br_startoff, fsbno = map.br_startblock;
- bno < map.br_startoff + map.br_blockcount;
- bno++, fsbno++) {
- tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO);
- /*
- * Reserve log for one block zeroing.
- */
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero,
- 0, 0);
+
+/*
+ * Copy and transform the summary file, given the old and new
+ * parameters in the mount structures.
+ */
+STATIC int /* error */
+xfs_rtcopy_summary(
+ xfs_mount_t *omp, /* old file system mount point */
+ xfs_mount_t *nmp, /* new file system mount point */
+ xfs_trans_t *tp) /* transaction pointer */
+{
+ xfs_rtblock_t bbno; /* bitmap block number */
+ xfs_buf_t *bp; /* summary buffer */
+ int error; /* error return value */
+ int log; /* summary level number (log length) */
+ xfs_suminfo_t sum; /* summary data */
+ xfs_fsblock_t sumbno; /* summary block number */
+
+ bp = NULL;
+ for (log = omp->m_rsumlevels - 1; log >= 0; log--) {
+ for (bbno = omp->m_sb.sb_rbmblocks - 1;
+ (xfs_srtblock_t)bbno >= 0;
+ bbno--) {
+ error = xfs_rtget_summary(omp, tp, log, bbno, &bp,
+ &sumbno, &sum);
if (error)
- goto error_cancel;
- /*
- * Lock the bitmap inode.
- */
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- /*
- * Get a buffer for the block.
- */
- d = XFS_FSB_TO_DADDR(mp, fsbno);
- bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
- mp->m_bsize, 0);
- if (bp == NULL) {
- error = XFS_ERROR(EIO);
-error_cancel:
- xfs_trans_cancel(tp, cancelflags);
- goto error;
- }
- memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
- xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
- /*
- * Commit the transaction.
- */
- error = xfs_trans_commit(tp, 0);
+ return error;
+ if (sum == 0)
+ continue;
+ error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum,
+ &bp, &sumbno);
if (error)
- goto error;
+ return error;
+ error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum,
+ &bp, &sumbno);
+ if (error)
+ return error;
+ ASSERT(sum > 0);
}
- /*
- * Go on to the next extent, if any.
- */
- oblocks = map.br_startoff + map.br_blockcount;
}
return 0;
+}
+/*
+ * Mark an extent specified by start and len allocated.
+ * Updates all the summary information as well as the bitmap.
+ */
+STATIC int /* error */
+xfs_rtallocate_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* start block to allocate */
+ xfs_extlen_t len, /* length to allocate */
+ xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb) /* in/out: summary block number */
+{
+ xfs_rtblock_t end; /* end of the allocated extent */
+ int error; /* error value */
+ xfs_rtblock_t postblock = 0; /* first block allocated > end */
+ xfs_rtblock_t preblock = 0; /* first block allocated < start */
-error:
+ end = start + len - 1;
+ /*
+ * Assume we're allocating out of the middle of a free extent.
+ * We need to find the beginning and end of the extent so we can
+ * properly update the summary.
+ */
+ error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+ if (error) {
+ return error;
+ }
+ /*
+ * Find the next allocated block (end of free extent).
+ */
+ error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
+ &postblock);
+ if (error) {
+ return error;
+ }
+ /*
+ * Decrement the summary information corresponding to the entire
+ * (old) free extent.
+ */
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(postblock + 1 - preblock),
+ XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ /*
+ * If there are blocks not being allocated at the front of the
+ * old extent, add summary data for them to be free.
+ */
+ if (preblock < start) {
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(start - preblock),
+ XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ }
+ /*
+ * If there are blocks not being allocated at the end of the
+ * old extent, add summary data for them to be free.
+ */
+ if (postblock > end) {
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(postblock - end),
+ XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ }
+ /*
+ * Modify the bitmap to mark this extent allocated.
+ */
+ error = xfs_rtmodify_range(mp, tp, start, len, 0);
return error;
}
@@ -721,1112 +809,126 @@ xfs_rtallocate_extent_size(
}
/*
- * Mark an extent specified by start and len allocated.
- * Updates all the summary information as well as the bitmap.
+ * Allocate space to the bitmap or summary file, and zero it, for growfs.
*/
STATIC int /* error */
-xfs_rtallocate_range(
+xfs_growfs_rt_alloc(
xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* start block to allocate */
- xfs_extlen_t len, /* length to allocate */
- xfs_buf_t **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb) /* in/out: summary block number */
+ xfs_extlen_t oblocks, /* old count of blocks */
+ xfs_extlen_t nblocks, /* new count of blocks */
+ xfs_inode_t *ip) /* inode (bitmap/summary) */
{
- xfs_rtblock_t end; /* end of the allocated extent */
- int error; /* error value */
- xfs_rtblock_t postblock = 0; /* first block allocated > end */
- xfs_rtblock_t preblock = 0; /* first block allocated < start */
+ xfs_fileoff_t bno; /* block number in file */
+ xfs_buf_t *bp; /* temporary buffer for zeroing */
+ int committed; /* transaction committed flag */
+ xfs_daddr_t d; /* disk block address */
+ int error; /* error return value */
+ xfs_fsblock_t firstblock; /* first block allocated in xaction */
+ xfs_bmap_free_t flist; /* list of freed blocks */
+ xfs_fsblock_t fsbno; /* filesystem block for bno */
+ xfs_bmbt_irec_t map; /* block map output */
+ int nmap; /* number of block maps */
+ int resblks; /* space reservation */
- end = start + len - 1;
- /*
- * Assume we're allocating out of the middle of a free extent.
- * We need to find the beginning and end of the extent so we can
- * properly update the summary.
- */
- error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
- if (error) {
- return error;
- }
- /*
- * Find the next allocated block (end of free extent).
- */
- error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
- &postblock);
- if (error) {
- return error;
- }
- /*
- * Decrement the summary information corresponding to the entire
- * (old) free extent.
- */
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock + 1 - preblock),
- XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
- if (error) {
- return error;
- }
- /*
- * If there are blocks not being allocated at the front of the
- * old extent, add summary data for them to be free.
- */
- if (preblock < start) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(start - preblock),
- XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
- if (error) {
- return error;
- }
- }
- /*
- * If there are blocks not being allocated at the end of the
- * old extent, add summary data for them to be free.
- */
- if (postblock > end) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock - end),
- XFS_BITTOBLOCK(mp, end + 1), 1, rbpp, rsb);
- if (error) {
- return error;
- }
- }
/*
- * Modify the bitmap to mark this extent allocated.
+ * Allocate space to the file, as necessary.
*/
- error = xfs_rtmodify_range(mp, tp, start, len, 0);
- return error;
-}
-
-/*
- * Return whether there are any free extents in the size range given
- * by low and high, for the bitmap block bbno.
- */
-STATIC int /* error */
-xfs_rtany_summary(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- int low, /* low log2 extent size */
- int high, /* high log2 extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- xfs_buf_t **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- int *stat) /* out: any good extents here? */
-{
- int error; /* error value */
- int log; /* loop counter, log2 of ext. size */
- xfs_suminfo_t sum; /* summary data */
+ while (oblocks < nblocks) {
+ int cancelflags = 0;
+ xfs_trans_t *tp;
- /*
- * Loop over logs of extent sizes. Order is irrelevant.
- */
- for (log = low; log <= high; log++) {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
+ resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
/*
- * Get one summary datum.
+ * Reserve space & log for one extent added to the file.
*/
- error = xfs_rtget_summary(mp, tp, log, bbno, rbpp, rsb, &sum);
- if (error) {
- return error;
- }
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtalloc,
+ resblks, 0);
+ if (error)
+ goto error_cancel;
+ cancelflags = XFS_TRANS_RELEASE_LOG_RES;
/*
- * If there are any, return success.
+ * Lock the inode.
*/
- if (sum) {
- *stat = 1;
- return 0;
- }
- }
- /*
- * Found nothing, return failure.
- */
- *stat = 0;
- return 0;
-}
-
-/*
- * Get a buffer for the bitmap or summary file block specified.
- * The buffer is returned read and locked.
- */
-STATIC int /* error */
-xfs_rtbuf_get(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t block, /* block number in bitmap or summary */
- int issum, /* is summary not bitmap */
- xfs_buf_t **bpp) /* output: buffer for the block */
-{
- xfs_buf_t *bp; /* block buffer, result */
- xfs_inode_t *ip; /* bitmap or summary inode */
- xfs_bmbt_irec_t map;
- int nmap = 1;
- int error; /* error value */
-
- ip = issum ? mp->m_rsumip : mp->m_rbmip;
-
- error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK);
- if (error)
- return error;
-
- ASSERT(map.br_startblock != NULLFSBLOCK);
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- XFS_FSB_TO_DADDR(mp, map.br_startblock),
- mp->m_bsize, 0, &bp, NULL);
- if (error)
- return error;
- ASSERT(!xfs_buf_geterror(bp));
- *bpp = bp;
- return 0;
-}
-
-#ifdef DEBUG
-/*
- * Check that the given extent (block range) is allocated already.
- */
-STATIC int /* error */
-xfs_rtcheck_alloc_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number of extent */
- xfs_extlen_t len, /* length of extent */
- int *stat) /* out: 1 for allocated, 0 for not */
-{
- xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */
-
- return xfs_rtcheck_range(mp, tp, bno, len, 0, &new, stat);
-}
-#endif
-
-/*
- * Check that the given range is either all allocated (val = 0) or
- * all free (val = 1).
- */
-STATIC int /* error */
-xfs_rtcheck_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block number of extent */
- xfs_extlen_t len, /* length of extent */
- int val, /* 1 for free, 0 for allocated */
- xfs_rtblock_t *new, /* out: first block not matching */
- int *stat) /* out: 1 for matches, 0 for not */
-{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- xfs_buf_t *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtblock_t i; /* current bit number rel. to start */
- xfs_rtblock_t lastbit; /* last useful bit in word */
- xfs_rtword_t mask; /* mask of relevant bits for value */
- xfs_rtword_t wdiff; /* difference from wanted value */
- int word; /* word number in the buffer */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- /*
- * Compute starting bitmap block number
- */
- block = XFS_BITTOBLOCK(mp, start);
- /*
- * Read the bitmap block.
- */
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
- return error;
- }
- bufp = bp->b_addr;
- /*
- * Compute the starting word's address, and starting bit.
- */
- word = XFS_BITTOWORD(mp, start);
- b = &bufp[word];
- bit = (int)(start & (XFS_NBWORD - 1));
- /*
- * 0 (allocated) => all zero's; 1 (free) => all one's.
- */
- val = -val;
- /*
- * If not starting on a word boundary, deal with the first
- * (partial) word.
- */
- if (bit) {
- /*
- * Compute first bit not examined.
- */
- lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
- /*
- * Mask of relevant bits.
- */
- mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
- /*
- * Compute difference between actual and desired value.
- */
- if ((wdiff = (*b ^ val) & mask)) {
- /*
- * Different, compute first wrong bit and return.
- */
- xfs_trans_brelse(tp, bp);
- i = XFS_RTLOBIT(wdiff) - bit;
- *new = start + i;
- *stat = 0;
- return 0;
- }
- i = lastbit - bit;
- /*
- * Go on to next block if that's where the next word is
- * and we need the next word.
- */
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
- /*
- * If done with this block, get the next one.
- */
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
- return error;
- }
- b = bufp = bp->b_addr;
- word = 0;
- } else {
- /*
- * Go on to the next word in the buffer.
- */
- b++;
- }
- } else {
- /*
- * Starting on a word boundary, no partial word.
- */
- i = 0;
- }
- /*
- * Loop over whole words in buffers. When we use up one buffer
- * we move on to the next one.
- */
- while (len - i >= XFS_NBWORD) {
- /*
- * Compute difference between actual and desired value.
- */
- if ((wdiff = *b ^ val)) {
- /*
- * Different, compute first wrong bit and return.
- */
- xfs_trans_brelse(tp, bp);
- i += XFS_RTLOBIT(wdiff);
- *new = start + i;
- *stat = 0;
- return 0;
- }
- i += XFS_NBWORD;
+ xfs_bmap_init(&flist, &firstblock);
/*
- * Go on to next block if that's where the next word is
- * and we need the next word.
+ * Allocate blocks to the bitmap file.
*/
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
- /*
- * If done with this block, get the next one.
- */
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
- return error;
- }
- b = bufp = bp->b_addr;
- word = 0;
- } else {
- /*
- * Go on to the next word in the buffer.
- */
- b++;
- }
- }
- /*
- * If not ending on a word boundary, deal with the last
- * (partial) word.
- */
- if ((lastbit = len - i)) {
+ nmap = 1;
+ cancelflags |= XFS_TRANS_ABORT;
+ error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
+ XFS_BMAPI_METADATA, &firstblock,
+ resblks, &map, &nmap, &flist);
+ if (!error && nmap < 1)
+ error = XFS_ERROR(ENOSPC);
+ if (error)
+ goto error_cancel;
/*
- * Mask of relevant bits.
+ * Free any blocks freed up in the transaction, then commit.
*/
- mask = ((xfs_rtword_t)1 << lastbit) - 1;
+ error = xfs_bmap_finish(&tp, &flist, &committed);
+ if (error)
+ goto error_cancel;
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+ if (error)
+ goto error;
/*
- * Compute difference between actual and desired value.
+ * Now we need to clear the allocated blocks.
+ * Do this one block per transaction, to keep it simple.
*/
- if ((wdiff = (*b ^ val) & mask)) {
+ cancelflags = 0;
+ for (bno = map.br_startoff, fsbno = map.br_startblock;
+ bno < map.br_startoff + map.br_blockcount;
+ bno++, fsbno++) {
+ tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ZERO);
/*
- * Different, compute first wrong bit and return.
+ * Reserve log for one block zeroing.
*/
- xfs_trans_brelse(tp, bp);
- i += XFS_RTLOBIT(wdiff);
- *new = start + i;
- *stat = 0;
- return 0;
- } else
- i = len;
- }
- /*
- * Successful, return.
- */
- xfs_trans_brelse(tp, bp);
- *new = start + i;
- *stat = 1;
- return 0;
-}
-
-/*
- * Copy and transform the summary file, given the old and new
- * parameters in the mount structures.
- */
-STATIC int /* error */
-xfs_rtcopy_summary(
- xfs_mount_t *omp, /* old file system mount point */
- xfs_mount_t *nmp, /* new file system mount point */
- xfs_trans_t *tp) /* transaction pointer */
-{
- xfs_rtblock_t bbno; /* bitmap block number */
- xfs_buf_t *bp; /* summary buffer */
- int error; /* error return value */
- int log; /* summary level number (log length) */
- xfs_suminfo_t sum; /* summary data */
- xfs_fsblock_t sumbno; /* summary block number */
-
- bp = NULL;
- for (log = omp->m_rsumlevels - 1; log >= 0; log--) {
- for (bbno = omp->m_sb.sb_rbmblocks - 1;
- (xfs_srtblock_t)bbno >= 0;
- bbno--) {
- error = xfs_rtget_summary(omp, tp, log, bbno, &bp,
- &sumbno, &sum);
- if (error)
- return error;
- if (sum == 0)
- continue;
- error = xfs_rtmodify_summary(omp, tp, log, bbno, -sum,
- &bp, &sumbno);
- if (error)
- return error;
- error = xfs_rtmodify_summary(nmp, tp, log, bbno, sum,
- &bp, &sumbno);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growrtzero,
+ 0, 0);
if (error)
- return error;
- ASSERT(sum > 0);
- }
- }
- return 0;
-}
-
-/*
- * Searching backward from start to limit, find the first block whose
- * allocated/free state is different from start's.
- */
-STATIC int /* error */
-xfs_rtfind_back(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to look at */
- xfs_rtblock_t limit, /* last block to look at */
- xfs_rtblock_t *rtblock) /* out: start block found */
-{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- xfs_buf_t *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtblock_t firstbit; /* first useful bit in the word */
- xfs_rtblock_t i; /* current bit number rel. to start */
- xfs_rtblock_t len; /* length of inspected area */
- xfs_rtword_t mask; /* mask of relevant bits for value */
- xfs_rtword_t want; /* mask for "good" values */
- xfs_rtword_t wdiff; /* difference from wanted value */
- int word; /* word number in the buffer */
-
- /*
- * Compute and read in starting bitmap block for starting block.
- */
- block = XFS_BITTOBLOCK(mp, start);
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
- return error;
- }
- bufp = bp->b_addr;
- /*
- * Get the first word's index & point to it.
- */
- word = XFS_BITTOWORD(mp, start);
- b = &bufp[word];
- bit = (int)(start & (XFS_NBWORD - 1));
- len = start - limit + 1;
- /*
- * Compute match value, based on the bit at start: if 1 (free)
- * then all-ones, else all-zeroes.
- */
- want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
- /*
- * If the starting position is not word-aligned, deal with the
- * partial word.
- */
- if (bit < XFS_NBWORD - 1) {
- /*
- * Calculate first (leftmost) bit number to look at,
- * and mask for all the relevant bits in this word.
- */
- firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
- mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
- firstbit;
- /*
- * Calculate the difference between the value there
- * and what we're looking for.
- */
- if ((wdiff = (*b ^ want) & mask)) {
- /*
- * Different. Mark where we are and return.
- */
- xfs_trans_brelse(tp, bp);
- i = bit - XFS_RTHIBIT(wdiff);
- *rtblock = start - i + 1;
- return 0;
- }
- i = bit - firstbit + 1;
- /*
- * Go on to previous block if that's where the previous word is
- * and we need the previous word.
- */
- if (--word == -1 && i < len) {
- /*
- * If done with this block, get the previous one.
- */
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
- if (error) {
- return error;
- }
- bufp = bp->b_addr;
- word = XFS_BLOCKWMASK(mp);
- b = &bufp[word];
- } else {
- /*
- * Go on to the previous word in the buffer.
- */
- b--;
- }
- } else {
- /*
- * Starting on a word boundary, no partial word.
- */
- i = 0;
- }
- /*
- * Loop over whole words in buffers. When we use up one buffer
- * we move on to the previous one.
- */
- while (len - i >= XFS_NBWORD) {
- /*
- * Compute difference between actual and desired value.
- */
- if ((wdiff = *b ^ want)) {
- /*
- * Different, mark where we are and return.
- */
- xfs_trans_brelse(tp, bp);
- i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
- *rtblock = start - i + 1;
- return 0;
- }
- i += XFS_NBWORD;
- /*
- * Go on to previous block if that's where the previous word is
- * and we need the previous word.
- */
- if (--word == -1 && i < len) {
- /*
- * If done with this block, get the previous one.
- */
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
- if (error) {
- return error;
- }
- bufp = bp->b_addr;
- word = XFS_BLOCKWMASK(mp);
- b = &bufp[word];
- } else {
- /*
- * Go on to the previous word in the buffer.
- */
- b--;
- }
- }
- /*
- * If not ending on a word boundary, deal with the last
- * (partial) word.
- */
- if (len - i) {
- /*
- * Calculate first (leftmost) bit number to look at,
- * and mask for all the relevant bits in this word.
- */
- firstbit = XFS_NBWORD - (len - i);
- mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit;
- /*
- * Compute difference between actual and desired value.
- */
- if ((wdiff = (*b ^ want) & mask)) {
- /*
- * Different, mark where we are and return.
- */
- xfs_trans_brelse(tp, bp);
- i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
- *rtblock = start - i + 1;
- return 0;
- } else
- i = len;
- }
- /*
- * No match, return that we scanned the whole area.
- */
- xfs_trans_brelse(tp, bp);
- *rtblock = start - i + 1;
- return 0;
-}
-
-/*
- * Searching forward from start to limit, find the first block whose
- * allocated/free state is different from start's.
- */
-STATIC int /* error */
-xfs_rtfind_forw(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to look at */
- xfs_rtblock_t limit, /* last block to look at */
- xfs_rtblock_t *rtblock) /* out: start block found */
-{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- xfs_buf_t *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtblock_t i; /* current bit number rel. to start */
- xfs_rtblock_t lastbit; /* last useful bit in the word */
- xfs_rtblock_t len; /* length of inspected area */
- xfs_rtword_t mask; /* mask of relevant bits for value */
- xfs_rtword_t want; /* mask for "good" values */
- xfs_rtword_t wdiff; /* difference from wanted value */
- int word; /* word number in the buffer */
-
- /*
- * Compute and read in starting bitmap block for starting block.
- */
- block = XFS_BITTOBLOCK(mp, start);
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
- return error;
- }
- bufp = bp->b_addr;
- /*
- * Get the first word's index & point to it.
- */
- word = XFS_BITTOWORD(mp, start);
- b = &bufp[word];
- bit = (int)(start & (XFS_NBWORD - 1));
- len = limit - start + 1;
- /*
- * Compute match value, based on the bit at start: if 1 (free)
- * then all-ones, else all-zeroes.
- */
- want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
- /*
- * If the starting position is not word-aligned, deal with the
- * partial word.
- */
- if (bit) {
- /*
- * Calculate last (rightmost) bit number to look at,
- * and mask for all the relevant bits in this word.
- */
- lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
- mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
- /*
- * Calculate the difference between the value there
- * and what we're looking for.
- */
- if ((wdiff = (*b ^ want) & mask)) {
- /*
- * Different. Mark where we are and return.
- */
- xfs_trans_brelse(tp, bp);
- i = XFS_RTLOBIT(wdiff) - bit;
- *rtblock = start + i - 1;
- return 0;
- }
- i = lastbit - bit;
- /*
- * Go on to next block if that's where the next word is
- * and we need the next word.
- */
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
- /*
- * If done with this block, get the previous one.
- */
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
- return error;
- }
- b = bufp = bp->b_addr;
- word = 0;
- } else {
- /*
- * Go on to the previous word in the buffer.
- */
- b++;
- }
- } else {
- /*
- * Starting on a word boundary, no partial word.
- */
- i = 0;
- }
- /*
- * Loop over whole words in buffers. When we use up one buffer
- * we move on to the next one.
- */
- while (len - i >= XFS_NBWORD) {
- /*
- * Compute difference between actual and desired value.
- */
- if ((wdiff = *b ^ want)) {
+ goto error_cancel;
/*
- * Different, mark where we are and return.
+ * Lock the bitmap inode.
*/
- xfs_trans_brelse(tp, bp);
- i += XFS_RTLOBIT(wdiff);
- *rtblock = start + i - 1;
- return 0;
- }
- i += XFS_NBWORD;
- /*
- * Go on to next block if that's where the next word is
- * and we need the next word.
- */
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
/*
- * If done with this block, get the next one.
+ * Get a buffer for the block.
*/
- xfs_trans_brelse(tp, bp);
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
- return error;
+ d = XFS_FSB_TO_DADDR(mp, fsbno);
+ bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+ mp->m_bsize, 0);
+ if (bp == NULL) {
+ error = XFS_ERROR(EIO);
+error_cancel:
+ xfs_trans_cancel(tp, cancelflags);
+ goto error;
}
- b = bufp = bp->b_addr;
- word = 0;
- } else {
+ memset(bp->b_addr, 0, mp->m_sb.sb_blocksize);
+ xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
/*
- * Go on to the next word in the buffer.
+ * Commit the transaction.
*/
- b++;
+ error = xfs_trans_commit(tp, 0);
+ if (error)
+ goto error;
}
- }
- /*
- * If not ending on a word boundary, deal with the last
- * (partial) word.
- */
- if ((lastbit = len - i)) {
/*
- * Calculate mask for all the relevant bits in this word.
- */
- mask = ((xfs_rtword_t)1 << lastbit) - 1;
- /*
- * Compute difference between actual and desired value.
+ * Go on to the next extent, if any.
*/
- if ((wdiff = (*b ^ want) & mask)) {
- /*
- * Different, mark where we are and return.
- */
- xfs_trans_brelse(tp, bp);
- i += XFS_RTLOBIT(wdiff);
- *rtblock = start + i - 1;
- return 0;
- } else
- i = len;
+ oblocks = map.br_startoff + map.br_blockcount;
}
- /*
- * No match, return that we scanned the whole area.
- */
- xfs_trans_brelse(tp, bp);
- *rtblock = start + i - 1;
return 0;
-}
-/*
- * Mark an extent specified by start and len freed.
- * Updates all the summary information as well as the bitmap.
- */
-STATIC int /* error */
-xfs_rtfree_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to free */
- xfs_extlen_t len, /* length to free */
- xfs_buf_t **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb) /* in/out: summary block number */
-{
- xfs_rtblock_t end; /* end of the freed extent */
- int error; /* error value */
- xfs_rtblock_t postblock; /* first block freed > end */
- xfs_rtblock_t preblock; /* first block freed < start */
-
- end = start + len - 1;
- /*
- * Modify the bitmap to mark this extent freed.
- */
- error = xfs_rtmodify_range(mp, tp, start, len, 1);
- if (error) {
- return error;
- }
- /*
- * Assume we're freeing out of the middle of an allocated extent.
- * We need to find the beginning and end of the extent so we can
- * properly update the summary.
- */
- error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
- if (error) {
- return error;
- }
- /*
- * Find the next allocated block (end of allocated extent).
- */
- error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
- &postblock);
- if (error)
- return error;
- /*
- * If there are blocks not being freed at the front of the
- * old extent, add summary data for them to be allocated.
- */
- if (preblock < start) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(start - preblock),
- XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
- if (error) {
- return error;
- }
- }
- /*
- * If there are blocks not being freed at the end of the
- * old extent, add summary data for them to be allocated.
- */
- if (postblock > end) {
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock - end),
- XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
- if (error) {
- return error;
- }
- }
- /*
- * Increment the summary information corresponding to the entire
- * (new) free extent.
- */
- error = xfs_rtmodify_summary(mp, tp,
- XFS_RTBLOCKLOG(postblock + 1 - preblock),
- XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+error:
return error;
}
/*
- * Read and return the summary information for a given extent size,
- * bitmap block combination.
- * Keeps track of a current summary block, so we don't keep reading
- * it from the buffer cache.
- */
-STATIC int /* error */
-xfs_rtget_summary(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- int log, /* log2 of extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- xfs_buf_t **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb, /* in/out: summary block number */
- xfs_suminfo_t *sum) /* out: summary info for this block */
-{
- xfs_buf_t *bp; /* buffer for summary block */
- int error; /* error value */
- xfs_fsblock_t sb; /* summary fsblock */
- int so; /* index into the summary file */
- xfs_suminfo_t *sp; /* pointer to returned data */
-
- /*
- * Compute entry number in the summary file.
- */
- so = XFS_SUMOFFS(mp, log, bbno);
- /*
- * Compute the block number in the summary file.
- */
- sb = XFS_SUMOFFSTOBLOCK(mp, so);
- /*
- * If we have an old buffer, and the block number matches, use that.
- */
- if (rbpp && *rbpp && *rsb == sb)
- bp = *rbpp;
- /*
- * Otherwise we have to get the buffer.
- */
- else {
- /*
- * If there was an old one, get rid of it first.
- */
- if (rbpp && *rbpp)
- xfs_trans_brelse(tp, *rbpp);
- error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
- if (error) {
- return error;
- }
- /*
- * Remember this buffer and block for the next call.
- */
- if (rbpp) {
- *rbpp = bp;
- *rsb = sb;
- }
- }
- /*
- * Point to the summary information & copy it out.
- */
- sp = XFS_SUMPTR(mp, bp, so);
- *sum = *sp;
- /*
- * Drop the buffer if we're not asked to remember it.
- */
- if (!rbpp)
- xfs_trans_brelse(tp, bp);
- return 0;
-}
-
-/*
- * Set the given range of bitmap bits to the given value.
- * Do whatever I/O and logging is required.
- */
-STATIC int /* error */
-xfs_rtmodify_range(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t start, /* starting block to modify */
- xfs_extlen_t len, /* length of extent to modify */
- int val) /* 1 for free, 0 for allocated */
-{
- xfs_rtword_t *b; /* current word in buffer */
- int bit; /* bit number in the word */
- xfs_rtblock_t block; /* bitmap block number */
- xfs_buf_t *bp; /* buf for the block */
- xfs_rtword_t *bufp; /* starting word in buffer */
- int error; /* error value */
- xfs_rtword_t *first; /* first used word in the buffer */
- int i; /* current bit number rel. to start */
- int lastbit; /* last useful bit in word */
- xfs_rtword_t mask; /* mask o frelevant bits for value */
- int word; /* word number in the buffer */
-
- /*
- * Compute starting bitmap block number.
- */
- block = XFS_BITTOBLOCK(mp, start);
- /*
- * Read the bitmap block, and point to its data.
- */
- error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
- if (error) {
- return error;
- }
- bufp = bp->b_addr;
- /*
- * Compute the starting word's address, and starting bit.
- */
- word = XFS_BITTOWORD(mp, start);
- first = b = &bufp[word];
- bit = (int)(start & (XFS_NBWORD - 1));
- /*
- * 0 (allocated) => all zeroes; 1 (free) => all ones.
- */
- val = -val;
- /*
- * If not starting on a word boundary, deal with the first
- * (partial) word.
- */
- if (bit) {
- /*
- * Compute first bit not changed and mask of relevant bits.
- */
- lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
- mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
- /*
- * Set/clear the active bits.
- */
- if (val)
- *b |= mask;
- else
- *b &= ~mask;
- i = lastbit - bit;
- /*
- * Go on to the next block if that's where the next word is
- * and we need the next word.
- */
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
- /*
- * Log the changed part of this block.
- * Get the next one.
- */
- xfs_trans_log_buf(tp, bp,
- (uint)((char *)first - (char *)bufp),
- (uint)((char *)b - (char *)bufp));
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
- return error;
- }
- first = b = bufp = bp->b_addr;
- word = 0;
- } else {
- /*
- * Go on to the next word in the buffer
- */
- b++;
- }
- } else {
- /*
- * Starting on a word boundary, no partial word.
- */
- i = 0;
- }
- /*
- * Loop over whole words in buffers. When we use up one buffer
- * we move on to the next one.
- */
- while (len - i >= XFS_NBWORD) {
- /*
- * Set the word value correctly.
- */
- *b = val;
- i += XFS_NBWORD;
- /*
- * Go on to the next block if that's where the next word is
- * and we need the next word.
- */
- if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
- /*
- * Log the changed part of this block.
- * Get the next one.
- */
- xfs_trans_log_buf(tp, bp,
- (uint)((char *)first - (char *)bufp),
- (uint)((char *)b - (char *)bufp));
- error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
- if (error) {
- return error;
- }
- first = b = bufp = bp->b_addr;
- word = 0;
- } else {
- /*
- * Go on to the next word in the buffer
- */
- b++;
- }
- }
- /*
- * If not ending on a word boundary, deal with the last
- * (partial) word.
- */
- if ((lastbit = len - i)) {
- /*
- * Compute a mask of relevant bits.
- */
- bit = 0;
- mask = ((xfs_rtword_t)1 << lastbit) - 1;
- /*
- * Set/clear the active bits.
- */
- if (val)
- *b |= mask;
- else
- *b &= ~mask;
- b++;
- }
- /*
- * Log any remaining changed bytes.
- */
- if (b > first)
- xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
- (uint)((char *)b - (char *)bufp - 1));
- return 0;
-}
-
-/*
- * Read and modify the summary information for a given extent size,
- * bitmap block combination.
- * Keeps track of a current summary block, so we don't keep reading
- * it from the buffer cache.
- */
-STATIC int /* error */
-xfs_rtmodify_summary(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- int log, /* log2 of extent size */
- xfs_rtblock_t bbno, /* bitmap block number */
- int delta, /* change to make to summary info */
- xfs_buf_t **rbpp, /* in/out: summary block buffer */
- xfs_fsblock_t *rsb) /* in/out: summary block number */
-{
- xfs_buf_t *bp; /* buffer for the summary block */
- int error; /* error value */
- xfs_fsblock_t sb; /* summary fsblock */
- int so; /* index into the summary file */
- xfs_suminfo_t *sp; /* pointer to returned data */
-
- /*
- * Compute entry number in the summary file.
- */
- so = XFS_SUMOFFS(mp, log, bbno);
- /*
- * Compute the block number in the summary file.
- */
- sb = XFS_SUMOFFSTOBLOCK(mp, so);
- /*
- * If we have an old buffer, and the block number matches, use that.
- */
- if (rbpp && *rbpp && *rsb == sb)
- bp = *rbpp;
- /*
- * Otherwise we have to get the buffer.
- */
- else {
- /*
- * If there was an old one, get rid of it first.
- */
- if (rbpp && *rbpp)
- xfs_trans_brelse(tp, *rbpp);
- error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
- if (error) {
- return error;
- }
- /*
- * Remember this buffer and block for the next call.
- */
- if (rbpp) {
- *rbpp = bp;
- *rsb = sb;
- }
- }
- /*
- * Point to the summary information, modify and log it.
- */
- sp = XFS_SUMPTR(mp, bp, so);
- *sp += delta;
- xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr),
- (uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1));
- return 0;
-}
-
-/*
* Visible (exported) functions.
*/
@@ -2129,66 +1231,6 @@ xfs_rtallocate_extent(
}
/*
- * Free an extent in the realtime subvolume. Length is expressed in
- * realtime extents, as is the block number.
- */
-int /* error */
-xfs_rtfree_extent(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_rtblock_t bno, /* starting block number to free */
- xfs_extlen_t len) /* length of extent freed */
-{
- int error; /* error value */
- xfs_mount_t *mp; /* file system mount structure */
- xfs_fsblock_t sb; /* summary file block number */
- xfs_buf_t *sumbp; /* summary file block buffer */
-
- mp = tp->t_mountp;
-
- ASSERT(mp->m_rbmip->i_itemp != NULL);
- ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
-
-#ifdef DEBUG
- /*
- * Check to see that this whole range is currently allocated.
- */
- {
- int stat; /* result from checking range */
-
- error = xfs_rtcheck_alloc_range(mp, tp, bno, len, &stat);
- if (error) {
- return error;
- }
- ASSERT(stat);
- }
-#endif
- sumbp = NULL;
- /*
- * Free the range of realtime blocks.
- */
- error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
- if (error) {
- return error;
- }
- /*
- * Mark more blocks free in the superblock.
- */
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
- /*
- * If we've now freed all the blocks, reset the file sequence
- * number to 0.
- */
- if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
- mp->m_sb.sb_rextents) {
- if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
- mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
- *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
- xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
- }
- return 0;
-}
-
-/*
* Initialize realtime fields in the mount structure.
*/
int /* error */
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index b2a1a24c0e2..752b63d1030 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -95,6 +95,30 @@ xfs_growfs_rt(
struct xfs_mount *mp, /* file system mount structure */
xfs_growfs_rt_t *in); /* user supplied growfs struct */
+/*
+ * From xfs_rtbitmap.c
+ */
+int xfs_rtbuf_get(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t block, int issum, struct xfs_buf **bpp);
+int xfs_rtcheck_range(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_extlen_t len, int val,
+ xfs_rtblock_t *new, int *stat);
+int xfs_rtfind_back(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_rtblock_t limit,
+ xfs_rtblock_t *rtblock);
+int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_rtblock_t limit,
+ xfs_rtblock_t *rtblock);
+int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_extlen_t len, int val);
+int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
+ xfs_rtblock_t bbno, int delta, xfs_buf_t **rbpp,
+ xfs_fsblock_t *rsb);
+int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_rtblock_t start, xfs_extlen_t len,
+ struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
+
+
#else
# define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb) (ENOSYS)
# define xfs_rtfree_extent(t,b,l) (ENOSYS)
diff --git a/fs/xfs/xfs_rtbitmap.c b/fs/xfs/xfs_rtbitmap.c
new file mode 100644
index 00000000000..f4dd697cac0
--- /dev/null
+++ b/fs/xfs/xfs_rtbitmap.c
@@ -0,0 +1,973 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trans.h"
+#include "xfs_trans_space.h"
+#include "xfs_trace.h"
+#include "xfs_buf.h"
+#include "xfs_icache.h"
+#include "xfs_dinode.h"
+#include "xfs_rtalloc.h"
+
+
+/*
+ * Realtime allocator bitmap functions shared with userspace.
+ */
+
+/*
+ * Get a buffer for the bitmap or summary file block specified.
+ * The buffer is returned read and locked.
+ */
+int
+xfs_rtbuf_get(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t block, /* block number in bitmap or summary */
+ int issum, /* is summary not bitmap */
+ xfs_buf_t **bpp) /* output: buffer for the block */
+{
+ xfs_buf_t *bp; /* block buffer, result */
+ xfs_inode_t *ip; /* bitmap or summary inode */
+ xfs_bmbt_irec_t map;
+ int nmap = 1;
+ int error; /* error value */
+
+ ip = issum ? mp->m_rsumip : mp->m_rbmip;
+
+ error = xfs_bmapi_read(ip, block, 1, &map, &nmap, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ ASSERT(map.br_startblock != NULLFSBLOCK);
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, map.br_startblock),
+ mp->m_bsize, 0, &bp, NULL);
+ if (error)
+ return error;
+ *bpp = bp;
+ return 0;
+}
+
+/*
+ * Searching backward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+int
+xfs_rtfind_back(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block to look at */
+ xfs_rtblock_t limit, /* last block to look at */
+ xfs_rtblock_t *rtblock) /* out: start block found */
+{
+ xfs_rtword_t *b; /* current word in buffer */
+ int bit; /* bit number in the word */
+ xfs_rtblock_t block; /* bitmap block number */
+ xfs_buf_t *bp; /* buf for the block */
+ xfs_rtword_t *bufp; /* starting word in buffer */
+ int error; /* error value */
+ xfs_rtblock_t firstbit; /* first useful bit in the word */
+ xfs_rtblock_t i; /* current bit number rel. to start */
+ xfs_rtblock_t len; /* length of inspected area */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t want; /* mask for "good" values */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ int word; /* word number in the buffer */
+
+ /*
+ * Compute and read in starting bitmap block for starting block.
+ */
+ block = XFS_BITTOBLOCK(mp, start);
+ error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ /*
+ * Get the first word's index & point to it.
+ */
+ word = XFS_BITTOWORD(mp, start);
+ b = &bufp[word];
+ bit = (int)(start & (XFS_NBWORD - 1));
+ len = start - limit + 1;
+ /*
+ * Compute match value, based on the bit at start: if 1 (free)
+ * then all-ones, else all-zeroes.
+ */
+ want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+ /*
+ * If the starting position is not word-aligned, deal with the
+ * partial word.
+ */
+ if (bit < XFS_NBWORD - 1) {
+ /*
+ * Calculate first (leftmost) bit number to look at,
+ * and mask for all the relevant bits in this word.
+ */
+ firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
+ mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
+ firstbit;
+ /*
+ * Calculate the difference between the value there
+ * and what we're looking for.
+ */
+ if ((wdiff = (*b ^ want) & mask)) {
+ /*
+ * Different. Mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i = bit - XFS_RTHIBIT(wdiff);
+ *rtblock = start - i + 1;
+ return 0;
+ }
+ i = bit - firstbit + 1;
+ /*
+ * Go on to previous block if that's where the previous word is
+ * and we need the previous word.
+ */
+ if (--word == -1 && i < len) {
+ /*
+ * If done with this block, get the previous one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ word = XFS_BLOCKWMASK(mp);
+ b = &bufp[word];
+ } else {
+ /*
+ * Go on to the previous word in the buffer.
+ */
+ b--;
+ }
+ } else {
+ /*
+ * Starting on a word boundary, no partial word.
+ */
+ i = 0;
+ }
+ /*
+ * Loop over whole words in buffers. When we use up one buffer
+ * we move on to the previous one.
+ */
+ while (len - i >= XFS_NBWORD) {
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = *b ^ want)) {
+ /*
+ * Different, mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+ *rtblock = start - i + 1;
+ return 0;
+ }
+ i += XFS_NBWORD;
+ /*
+ * Go on to previous block if that's where the previous word is
+ * and we need the previous word.
+ */
+ if (--word == -1 && i < len) {
+ /*
+ * If done with this block, get the previous one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, --block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ word = XFS_BLOCKWMASK(mp);
+ b = &bufp[word];
+ } else {
+ /*
+ * Go on to the previous word in the buffer.
+ */
+ b--;
+ }
+ }
+ /*
+ * If not ending on a word boundary, deal with the last
+ * (partial) word.
+ */
+ if (len - i) {
+ /*
+ * Calculate first (leftmost) bit number to look at,
+ * and mask for all the relevant bits in this word.
+ */
+ firstbit = XFS_NBWORD - (len - i);
+ mask = (((xfs_rtword_t)1 << (len - i)) - 1) << firstbit;
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = (*b ^ want) & mask)) {
+ /*
+ * Different, mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+ *rtblock = start - i + 1;
+ return 0;
+ } else
+ i = len;
+ }
+ /*
+ * No match, return that we scanned the whole area.
+ */
+ xfs_trans_brelse(tp, bp);
+ *rtblock = start - i + 1;
+ return 0;
+}
+
+/*
+ * Searching forward from start to limit, find the first block whose
+ * allocated/free state is different from start's.
+ */
+int
+xfs_rtfind_forw(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block to look at */
+ xfs_rtblock_t limit, /* last block to look at */
+ xfs_rtblock_t *rtblock) /* out: start block found */
+{
+ xfs_rtword_t *b; /* current word in buffer */
+ int bit; /* bit number in the word */
+ xfs_rtblock_t block; /* bitmap block number */
+ xfs_buf_t *bp; /* buf for the block */
+ xfs_rtword_t *bufp; /* starting word in buffer */
+ int error; /* error value */
+ xfs_rtblock_t i; /* current bit number rel. to start */
+ xfs_rtblock_t lastbit; /* last useful bit in the word */
+ xfs_rtblock_t len; /* length of inspected area */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t want; /* mask for "good" values */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ int word; /* word number in the buffer */
+
+ /*
+ * Compute and read in starting bitmap block for starting block.
+ */
+ block = XFS_BITTOBLOCK(mp, start);
+ error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ /*
+ * Get the first word's index & point to it.
+ */
+ word = XFS_BITTOWORD(mp, start);
+ b = &bufp[word];
+ bit = (int)(start & (XFS_NBWORD - 1));
+ len = limit - start + 1;
+ /*
+ * Compute match value, based on the bit at start: if 1 (free)
+ * then all-ones, else all-zeroes.
+ */
+ want = (*b & ((xfs_rtword_t)1 << bit)) ? -1 : 0;
+ /*
+ * If the starting position is not word-aligned, deal with the
+ * partial word.
+ */
+ if (bit) {
+ /*
+ * Calculate last (rightmost) bit number to look at,
+ * and mask for all the relevant bits in this word.
+ */
+ lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+ /*
+ * Calculate the difference between the value there
+ * and what we're looking for.
+ */
+ if ((wdiff = (*b ^ want) & mask)) {
+ /*
+ * Different. Mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i = XFS_RTLOBIT(wdiff) - bit;
+ *rtblock = start + i - 1;
+ return 0;
+ }
+ i = lastbit - bit;
+ /*
+ * Go on to next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * If done with this block, get the previous one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the previous word in the buffer.
+ */
+ b++;
+ }
+ } else {
+ /*
+ * Starting on a word boundary, no partial word.
+ */
+ i = 0;
+ }
+ /*
+ * Loop over whole words in buffers. When we use up one buffer
+ * we move on to the next one.
+ */
+ while (len - i >= XFS_NBWORD) {
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = *b ^ want)) {
+ /*
+ * Different, mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_RTLOBIT(wdiff);
+ *rtblock = start + i - 1;
+ return 0;
+ }
+ i += XFS_NBWORD;
+ /*
+ * Go on to next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * If done with this block, get the next one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer.
+ */
+ b++;
+ }
+ }
+ /*
+ * If not ending on a word boundary, deal with the last
+ * (partial) word.
+ */
+ if ((lastbit = len - i)) {
+ /*
+ * Calculate mask for all the relevant bits in this word.
+ */
+ mask = ((xfs_rtword_t)1 << lastbit) - 1;
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = (*b ^ want) & mask)) {
+ /*
+ * Different, mark where we are and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_RTLOBIT(wdiff);
+ *rtblock = start + i - 1;
+ return 0;
+ } else
+ i = len;
+ }
+ /*
+ * No match, return that we scanned the whole area.
+ */
+ xfs_trans_brelse(tp, bp);
+ *rtblock = start + i - 1;
+ return 0;
+}
+
+/*
+ * Read and modify the summary information for a given extent size,
+ * bitmap block combination.
+ * Keeps track of a current summary block, so we don't keep reading
+ * it from the buffer cache.
+ */
+int
+xfs_rtmodify_summary(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ int log, /* log2 of extent size */
+ xfs_rtblock_t bbno, /* bitmap block number */
+ int delta, /* change to make to summary info */
+ xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb) /* in/out: summary block number */
+{
+ xfs_buf_t *bp; /* buffer for the summary block */
+ int error; /* error value */
+ xfs_fsblock_t sb; /* summary fsblock */
+ int so; /* index into the summary file */
+ xfs_suminfo_t *sp; /* pointer to returned data */
+
+ /*
+ * Compute entry number in the summary file.
+ */
+ so = XFS_SUMOFFS(mp, log, bbno);
+ /*
+ * Compute the block number in the summary file.
+ */
+ sb = XFS_SUMOFFSTOBLOCK(mp, so);
+ /*
+ * If we have an old buffer, and the block number matches, use that.
+ */
+ if (rbpp && *rbpp && *rsb == sb)
+ bp = *rbpp;
+ /*
+ * Otherwise we have to get the buffer.
+ */
+ else {
+ /*
+ * If there was an old one, get rid of it first.
+ */
+ if (rbpp && *rbpp)
+ xfs_trans_brelse(tp, *rbpp);
+ error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
+ if (error) {
+ return error;
+ }
+ /*
+ * Remember this buffer and block for the next call.
+ */
+ if (rbpp) {
+ *rbpp = bp;
+ *rsb = sb;
+ }
+ }
+ /*
+ * Point to the summary information, modify and log it.
+ */
+ sp = XFS_SUMPTR(mp, bp, so);
+ *sp += delta;
+ xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr),
+ (uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1));
+ return 0;
+}
+
+/*
+ * Set the given range of bitmap bits to the given value.
+ * Do whatever I/O and logging is required.
+ */
+int
+xfs_rtmodify_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block to modify */
+ xfs_extlen_t len, /* length of extent to modify */
+ int val) /* 1 for free, 0 for allocated */
+{
+ xfs_rtword_t *b; /* current word in buffer */
+ int bit; /* bit number in the word */
+ xfs_rtblock_t block; /* bitmap block number */
+ xfs_buf_t *bp; /* buf for the block */
+ xfs_rtword_t *bufp; /* starting word in buffer */
+ int error; /* error value */
+ xfs_rtword_t *first; /* first used word in the buffer */
+ int i; /* current bit number rel. to start */
+ int lastbit; /* last useful bit in word */
+ xfs_rtword_t mask; /* mask o frelevant bits for value */
+ int word; /* word number in the buffer */
+
+ /*
+ * Compute starting bitmap block number.
+ */
+ block = XFS_BITTOBLOCK(mp, start);
+ /*
+ * Read the bitmap block, and point to its data.
+ */
+ error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ /*
+ * Compute the starting word's address, and starting bit.
+ */
+ word = XFS_BITTOWORD(mp, start);
+ first = b = &bufp[word];
+ bit = (int)(start & (XFS_NBWORD - 1));
+ /*
+ * 0 (allocated) => all zeroes; 1 (free) => all ones.
+ */
+ val = -val;
+ /*
+ * If not starting on a word boundary, deal with the first
+ * (partial) word.
+ */
+ if (bit) {
+ /*
+ * Compute first bit not changed and mask of relevant bits.
+ */
+ lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+ /*
+ * Set/clear the active bits.
+ */
+ if (val)
+ *b |= mask;
+ else
+ *b &= ~mask;
+ i = lastbit - bit;
+ /*
+ * Go on to the next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * Log the changed part of this block.
+ * Get the next one.
+ */
+ xfs_trans_log_buf(tp, bp,
+ (uint)((char *)first - (char *)bufp),
+ (uint)((char *)b - (char *)bufp));
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ first = b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer
+ */
+ b++;
+ }
+ } else {
+ /*
+ * Starting on a word boundary, no partial word.
+ */
+ i = 0;
+ }
+ /*
+ * Loop over whole words in buffers. When we use up one buffer
+ * we move on to the next one.
+ */
+ while (len - i >= XFS_NBWORD) {
+ /*
+ * Set the word value correctly.
+ */
+ *b = val;
+ i += XFS_NBWORD;
+ /*
+ * Go on to the next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * Log the changed part of this block.
+ * Get the next one.
+ */
+ xfs_trans_log_buf(tp, bp,
+ (uint)((char *)first - (char *)bufp),
+ (uint)((char *)b - (char *)bufp));
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ first = b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer
+ */
+ b++;
+ }
+ }
+ /*
+ * If not ending on a word boundary, deal with the last
+ * (partial) word.
+ */
+ if ((lastbit = len - i)) {
+ /*
+ * Compute a mask of relevant bits.
+ */
+ bit = 0;
+ mask = ((xfs_rtword_t)1 << lastbit) - 1;
+ /*
+ * Set/clear the active bits.
+ */
+ if (val)
+ *b |= mask;
+ else
+ *b &= ~mask;
+ b++;
+ }
+ /*
+ * Log any remaining changed bytes.
+ */
+ if (b > first)
+ xfs_trans_log_buf(tp, bp, (uint)((char *)first - (char *)bufp),
+ (uint)((char *)b - (char *)bufp - 1));
+ return 0;
+}
+
+/*
+ * Mark an extent specified by start and len freed.
+ * Updates all the summary information as well as the bitmap.
+ */
+int
+xfs_rtfree_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block to free */
+ xfs_extlen_t len, /* length to free */
+ xfs_buf_t **rbpp, /* in/out: summary block buffer */
+ xfs_fsblock_t *rsb) /* in/out: summary block number */
+{
+ xfs_rtblock_t end; /* end of the freed extent */
+ int error; /* error value */
+ xfs_rtblock_t postblock; /* first block freed > end */
+ xfs_rtblock_t preblock; /* first block freed < start */
+
+ end = start + len - 1;
+ /*
+ * Modify the bitmap to mark this extent freed.
+ */
+ error = xfs_rtmodify_range(mp, tp, start, len, 1);
+ if (error) {
+ return error;
+ }
+ /*
+ * Assume we're freeing out of the middle of an allocated extent.
+ * We need to find the beginning and end of the extent so we can
+ * properly update the summary.
+ */
+ error = xfs_rtfind_back(mp, tp, start, 0, &preblock);
+ if (error) {
+ return error;
+ }
+ /*
+ * Find the next allocated block (end of allocated extent).
+ */
+ error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1,
+ &postblock);
+ if (error)
+ return error;
+ /*
+ * If there are blocks not being freed at the front of the
+ * old extent, add summary data for them to be allocated.
+ */
+ if (preblock < start) {
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(start - preblock),
+ XFS_BITTOBLOCK(mp, preblock), -1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ }
+ /*
+ * If there are blocks not being freed at the end of the
+ * old extent, add summary data for them to be allocated.
+ */
+ if (postblock > end) {
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(postblock - end),
+ XFS_BITTOBLOCK(mp, end + 1), -1, rbpp, rsb);
+ if (error) {
+ return error;
+ }
+ }
+ /*
+ * Increment the summary information corresponding to the entire
+ * (new) free extent.
+ */
+ error = xfs_rtmodify_summary(mp, tp,
+ XFS_RTBLOCKLOG(postblock + 1 - preblock),
+ XFS_BITTOBLOCK(mp, preblock), 1, rbpp, rsb);
+ return error;
+}
+
+/*
+ * Check that the given range is either all allocated (val = 0) or
+ * all free (val = 1).
+ */
+int
+xfs_rtcheck_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t start, /* starting block number of extent */
+ xfs_extlen_t len, /* length of extent */
+ int val, /* 1 for free, 0 for allocated */
+ xfs_rtblock_t *new, /* out: first block not matching */
+ int *stat) /* out: 1 for matches, 0 for not */
+{
+ xfs_rtword_t *b; /* current word in buffer */
+ int bit; /* bit number in the word */
+ xfs_rtblock_t block; /* bitmap block number */
+ xfs_buf_t *bp; /* buf for the block */
+ xfs_rtword_t *bufp; /* starting word in buffer */
+ int error; /* error value */
+ xfs_rtblock_t i; /* current bit number rel. to start */
+ xfs_rtblock_t lastbit; /* last useful bit in word */
+ xfs_rtword_t mask; /* mask of relevant bits for value */
+ xfs_rtword_t wdiff; /* difference from wanted value */
+ int word; /* word number in the buffer */
+
+ /*
+ * Compute starting bitmap block number
+ */
+ block = XFS_BITTOBLOCK(mp, start);
+ /*
+ * Read the bitmap block.
+ */
+ error = xfs_rtbuf_get(mp, tp, block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ bufp = bp->b_addr;
+ /*
+ * Compute the starting word's address, and starting bit.
+ */
+ word = XFS_BITTOWORD(mp, start);
+ b = &bufp[word];
+ bit = (int)(start & (XFS_NBWORD - 1));
+ /*
+ * 0 (allocated) => all zero's; 1 (free) => all one's.
+ */
+ val = -val;
+ /*
+ * If not starting on a word boundary, deal with the first
+ * (partial) word.
+ */
+ if (bit) {
+ /*
+ * Compute first bit not examined.
+ */
+ lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ /*
+ * Mask of relevant bits.
+ */
+ mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = (*b ^ val) & mask)) {
+ /*
+ * Different, compute first wrong bit and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i = XFS_RTLOBIT(wdiff) - bit;
+ *new = start + i;
+ *stat = 0;
+ return 0;
+ }
+ i = lastbit - bit;
+ /*
+ * Go on to next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * If done with this block, get the next one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer.
+ */
+ b++;
+ }
+ } else {
+ /*
+ * Starting on a word boundary, no partial word.
+ */
+ i = 0;
+ }
+ /*
+ * Loop over whole words in buffers. When we use up one buffer
+ * we move on to the next one.
+ */
+ while (len - i >= XFS_NBWORD) {
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = *b ^ val)) {
+ /*
+ * Different, compute first wrong bit and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_RTLOBIT(wdiff);
+ *new = start + i;
+ *stat = 0;
+ return 0;
+ }
+ i += XFS_NBWORD;
+ /*
+ * Go on to next block if that's where the next word is
+ * and we need the next word.
+ */
+ if (++word == XFS_BLOCKWSIZE(mp) && i < len) {
+ /*
+ * If done with this block, get the next one.
+ */
+ xfs_trans_brelse(tp, bp);
+ error = xfs_rtbuf_get(mp, tp, ++block, 0, &bp);
+ if (error) {
+ return error;
+ }
+ b = bufp = bp->b_addr;
+ word = 0;
+ } else {
+ /*
+ * Go on to the next word in the buffer.
+ */
+ b++;
+ }
+ }
+ /*
+ * If not ending on a word boundary, deal with the last
+ * (partial) word.
+ */
+ if ((lastbit = len - i)) {
+ /*
+ * Mask of relevant bits.
+ */
+ mask = ((xfs_rtword_t)1 << lastbit) - 1;
+ /*
+ * Compute difference between actual and desired value.
+ */
+ if ((wdiff = (*b ^ val) & mask)) {
+ /*
+ * Different, compute first wrong bit and return.
+ */
+ xfs_trans_brelse(tp, bp);
+ i += XFS_RTLOBIT(wdiff);
+ *new = start + i;
+ *stat = 0;
+ return 0;
+ } else
+ i = len;
+ }
+ /*
+ * Successful, return.
+ */
+ xfs_trans_brelse(tp, bp);
+ *new = start + i;
+ *stat = 1;
+ return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check that the given extent (block range) is allocated already.
+ */
+STATIC int /* error */
+xfs_rtcheck_alloc_range(
+ xfs_mount_t *mp, /* file system mount point */
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t bno, /* starting block number of extent */
+ xfs_extlen_t len) /* length of extent */
+{
+ xfs_rtblock_t new; /* dummy for xfs_rtcheck_range */
+ int stat;
+ int error;
+
+ error = xfs_rtcheck_range(mp, tp, bno, len, 0, &new, &stat);
+ if (error)
+ return error;
+ ASSERT(stat);
+ return 0;
+}
+#else
+#define xfs_rtcheck_alloc_range(m,t,b,l) (0)
+#endif
+/*
+ * Free an extent in the realtime subvolume. Length is expressed in
+ * realtime extents, as is the block number.
+ */
+int /* error */
+xfs_rtfree_extent(
+ xfs_trans_t *tp, /* transaction pointer */
+ xfs_rtblock_t bno, /* starting block number to free */
+ xfs_extlen_t len) /* length of extent freed */
+{
+ int error; /* error value */
+ xfs_mount_t *mp; /* file system mount structure */
+ xfs_fsblock_t sb; /* summary file block number */
+ xfs_buf_t *sumbp = NULL; /* summary file block buffer */
+
+ mp = tp->t_mountp;
+
+ ASSERT(mp->m_rbmip->i_itemp != NULL);
+ ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+
+ error = xfs_rtcheck_alloc_range(mp, tp, bno, len);
+ if (error)
+ return error;
+
+ /*
+ * Free the range of realtime blocks.
+ */
+ error = xfs_rtfree_range(mp, tp, bno, len, &sumbp, &sb);
+ if (error) {
+ return error;
+ }
+ /*
+ * Mark more blocks free in the superblock.
+ */
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, (long)len);
+ /*
+ * If we've now freed all the blocks, reset the file sequence
+ * number to 0.
+ */
+ if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
+ mp->m_sb.sb_rextents) {
+ if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
+ mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
+ *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
+ xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
+ }
+ return 0;
+}
+
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
index a5b59d92eb7..7703fa6770f 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/xfs_sb.c
@@ -17,34 +17,26 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
-#include "xfs_dir2.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
-#include "xfs_rtalloc.h"
-#include "xfs_bmap.h"
#include "xfs_error.h"
-#include "xfs_quota.h"
-#include "xfs_fsops.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
+#include "xfs_trans.h"
#include "xfs_buf_item.h"
+#include "xfs_dinode.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -209,10 +201,6 @@ xfs_mount_validate_sb(
* write validation, we don't need to check feature masks.
*/
if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
- xfs_alert(mp,
-"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n"
-"Use of these features in this kernel is at your own risk!");
-
if (xfs_sb_has_compat_feature(sbp,
XFS_SB_FEAT_COMPAT_UNKNOWN)) {
xfs_warn(mp,
@@ -249,13 +237,13 @@ xfs_mount_validate_sb(
if (xfs_sb_version_has_pquotino(sbp)) {
if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
xfs_notice(mp,
- "Version 5 of Super block has XFS_OQUOTA bits.\n");
+ "Version 5 of Super block has XFS_OQUOTA bits.");
return XFS_ERROR(EFSCORRUPTED);
}
} else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
xfs_notice(mp,
-"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.\n");
+"Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.");
return XFS_ERROR(EFSCORRUPTED);
}
@@ -296,15 +284,16 @@ xfs_mount_validate_sb(
sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||
sbp->sb_inodelog > XFS_DINODE_MAX_LOG ||
sbp->sb_inodesize != (1 << sbp->sb_inodelog) ||
+ sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) ||
(sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) ||
(sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
(sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
(sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) ||
sbp->sb_dblocks == 0 ||
sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) ||
- sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
- XFS_CORRUPTION_ERROR("SB sanity check failed",
- XFS_ERRLEVEL_LOW, mp, sbp);
+ sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp) ||
+ sbp->sb_shared_vn != 0)) {
+ xfs_notice(mp, "SB sanity check failed");
return XFS_ERROR(EFSCORRUPTED);
}
@@ -345,15 +334,6 @@ xfs_mount_validate_sb(
xfs_warn(mp, "Offline file system operation in progress!");
return XFS_ERROR(EFSCORRUPTED);
}
-
- /*
- * Version 1 directory format has never worked on Linux.
- */
- if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
- xfs_warn(mp, "file system using version 1 directory format");
- return XFS_ERROR(ENOSYS);
- }
-
return 0;
}
@@ -503,10 +483,16 @@ xfs_sb_quota_to_disk(
}
/*
- * GQUOTINO and PQUOTINO cannot be used together in versions
- * of superblock that do not have pquotino. from->sb_flags
- * tells us which quota is active and should be copied to
- * disk.
+ * GQUOTINO and PQUOTINO cannot be used together in versions of
+ * superblock that do not have pquotino. from->sb_flags tells us which
+ * quota is active and should be copied to disk. If neither are active,
+ * make sure we write NULLFSINO to the sb_gquotino field as a quota
+ * inode value of "0" is invalid when the XFS_SB_VERSION_QUOTA feature
+ * bit is set.
+ *
+ * Note that we don't need to handle the sb_uquotino or sb_pquotino here
+ * as they do not require any translation. Hence the main sb field loop
+ * will write them appropriately from the in-core superblock.
*/
if ((*fields & XFS_SB_GQUOTINO) &&
(from->sb_qflags & XFS_GQUOTA_ACCT))
@@ -514,6 +500,17 @@ xfs_sb_quota_to_disk(
else if ((*fields & XFS_SB_PQUOTINO) &&
(from->sb_qflags & XFS_PQUOTA_ACCT))
to->sb_gquotino = cpu_to_be64(from->sb_pquotino);
+ else {
+ /*
+ * We can't rely on just the fields being logged to tell us
+ * that it is safe to write NULLFSINO - we should only do that
+ * if quotas are not actually enabled. Hence only write
+ * NULLFSINO if both in-core quota inodes are NULL.
+ */
+ if (from->sb_gquotino == NULLFSINO &&
+ from->sb_pquotino == NULLFSINO)
+ to->sb_gquotino = cpu_to_be64(NULLFSINO);
+ }
*fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO);
}
@@ -596,6 +593,11 @@ xfs_sb_verify(
* single bit error could clear the feature bit and unused parts of the
* superblock are supposed to be zero. Hence a non-null crc field indicates that
* we've potentially lost a feature bit and we should check it anyway.
+ *
+ * However, past bugs (i.e. in growfs) left non-zeroed regions beyond the
+ * last field in V4 secondary superblocks. So for secondary superblocks,
+ * we are more forgiving, and ignore CRC failures if the primary doesn't
+ * indicate that the fs version is V5.
*/
static void
xfs_sb_read_verify(
@@ -614,19 +616,22 @@ xfs_sb_read_verify(
XFS_SB_VERSION_5) ||
dsb->sb_crc != 0)) {
- if (!xfs_verify_cksum(bp->b_addr, be16_to_cpu(dsb->sb_sectsize),
- offsetof(struct xfs_sb, sb_crc))) {
- error = EFSCORRUPTED;
- goto out_error;
+ if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) {
+ /* Only fail bad secondaries on a known V5 filesystem */
+ if (bp->b_bn == XFS_SB_DADDR ||
+ xfs_sb_version_hascrc(&mp->m_sb)) {
+ error = EFSBADCRC;
+ goto out_error;
+ }
}
}
error = xfs_sb_verify(bp, true);
out_error:
if (error) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
- mp, bp->b_addr);
xfs_buf_ioerror(bp, error);
+ if (error == EFSCORRUPTED || error == EFSBADCRC)
+ xfs_verifier_error(bp);
}
}
@@ -642,7 +647,6 @@ xfs_sb_quiet_read_verify(
{
struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp);
-
if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) {
/* XFS filesystem, verify noisily! */
xfs_sb_read_verify(bp);
@@ -662,9 +666,8 @@ xfs_sb_write_verify(
error = xfs_sb_verify(bp, false);
if (error) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
- mp, bp->b_addr);
xfs_buf_ioerror(bp, error);
+ xfs_verifier_error(bp);
return;
}
@@ -674,8 +677,7 @@ xfs_sb_write_verify(
if (bip)
XFS_BUF_TO_SBP(bp)->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
- xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
- offsetof(struct xfs_sb, sb_crc));
+ xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF);
}
const struct xfs_buf_ops xfs_sb_buf_ops = {
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 6835b44f850..c43c2d609a2 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -36,8 +36,6 @@ struct xfs_trans;
#define XFS_SB_VERSION_5 5 /* CRC enabled filesystem */
#define XFS_SB_VERSION_NUMBITS 0x000f
#define XFS_SB_VERSION_ALLFBITS 0xfff0
-#define XFS_SB_VERSION_SASHFBITS 0xf000
-#define XFS_SB_VERSION_REALFBITS 0x0ff0
#define XFS_SB_VERSION_ATTRBIT 0x0010
#define XFS_SB_VERSION_NLINKBIT 0x0020
#define XFS_SB_VERSION_QUOTABIT 0x0040
@@ -50,24 +48,15 @@ struct xfs_trans;
#define XFS_SB_VERSION_DIRV2BIT 0x2000
#define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */
#define XFS_SB_VERSION_MOREBITSBIT 0x8000
-#define XFS_SB_VERSION_OKSASHFBITS \
- (XFS_SB_VERSION_EXTFLGBIT | \
- XFS_SB_VERSION_DIRV2BIT | \
- XFS_SB_VERSION_BORGBIT)
-#define XFS_SB_VERSION_OKREALFBITS \
- (XFS_SB_VERSION_ATTRBIT | \
- XFS_SB_VERSION_NLINKBIT | \
- XFS_SB_VERSION_QUOTABIT | \
- XFS_SB_VERSION_ALIGNBIT | \
- XFS_SB_VERSION_DALIGNBIT | \
- XFS_SB_VERSION_SHAREDBIT | \
- XFS_SB_VERSION_LOGV2BIT | \
- XFS_SB_VERSION_SECTORBIT | \
- XFS_SB_VERSION_MOREBITSBIT)
-#define XFS_SB_VERSION_OKREALBITS \
- (XFS_SB_VERSION_NUMBITS | \
- XFS_SB_VERSION_OKREALFBITS | \
- XFS_SB_VERSION_OKSASHFBITS)
+
+/*
+ * Supported feature bit list is just all bits in the versionnum field because
+ * we've used them all up and understand them all. Except, of course, for the
+ * shared superblock bit, which nobody knows what it does and so is unsupported.
+ */
+#define XFS_SB_VERSION_OKBITS \
+ ((XFS_SB_VERSION_NUMBITS | XFS_SB_VERSION_ALLFBITS) & \
+ ~XFS_SB_VERSION_SHAREDBIT)
/*
* There are two words to hold XFS "feature" bits: the original
@@ -76,7 +65,6 @@ struct xfs_trans;
*
* These defines represent bits in sb_features2.
*/
-#define XFS_SB_VERSION2_REALFBITS 0x00ffffff /* Mask: features */
#define XFS_SB_VERSION2_RESERVED1BIT 0x00000001
#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */
#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
@@ -86,16 +74,11 @@ struct xfs_trans;
#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */
#define XFS_SB_VERSION2_FTYPE 0x00000200 /* inode type in dir */
-#define XFS_SB_VERSION2_OKREALFBITS \
+#define XFS_SB_VERSION2_OKBITS \
(XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
XFS_SB_VERSION2_ATTR2BIT | \
XFS_SB_VERSION2_PROJID32BIT | \
XFS_SB_VERSION2_FTYPE)
-#define XFS_SB_VERSION2_OKSASHFBITS \
- (0)
-#define XFS_SB_VERSION2_OKREALBITS \
- (XFS_SB_VERSION2_OKREALFBITS | \
- XFS_SB_VERSION2_OKSASHFBITS )
/*
* Superblock - in core version. Must match the ondisk version below.
@@ -182,6 +165,8 @@ typedef struct xfs_sb {
/* must be padded to 64 bit alignment */
} xfs_sb_t;
+#define XFS_SB_CRC_OFF offsetof(struct xfs_sb, sb_crc)
+
/*
* Superblock - on disk version. Must match the in core version above.
* Must be padded to 64 bit alignment.
@@ -343,214 +328,140 @@ typedef enum {
#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
-static inline int xfs_sb_good_version(xfs_sb_t *sbp)
-{
- /* We always support version 1-3 */
- if (sbp->sb_versionnum >= XFS_SB_VERSION_1 &&
- sbp->sb_versionnum <= XFS_SB_VERSION_3)
- return 1;
-
- /* We support version 4 if all feature bits are supported */
- if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) {
- if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) ||
- ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
- (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS)))
- return 0;
-
- if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
- return 0;
- return 1;
- }
- if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
- return 1;
-
- return 0;
-}
-
/*
- * Detect a mismatched features2 field. Older kernels read/wrote
- * this into the wrong slot, so to be safe we keep them in sync.
+ * The first XFS version we support is a v4 superblock with V2 directories.
*/
-static inline int xfs_sb_has_mismatched_features2(xfs_sb_t *sbp)
+static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp)
{
- return (sbp->sb_bad_features2 != sbp->sb_features2);
-}
-
-static inline unsigned xfs_sb_version_tonew(unsigned v)
-{
- if (v == XFS_SB_VERSION_1)
- return XFS_SB_VERSION_4;
-
- if (v == XFS_SB_VERSION_2)
- return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
+ if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
+ return false;
- return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT |
- XFS_SB_VERSION_NLINKBIT;
-}
+ /* check for unknown features in the fs */
+ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
+ ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
+ (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS)))
+ return false;
-static inline unsigned xfs_sb_version_toold(unsigned v)
-{
- if (v & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT))
- return 0;
- if (v & XFS_SB_VERSION_NLINKBIT)
- return XFS_SB_VERSION_3;
- if (v & XFS_SB_VERSION_ATTRBIT)
- return XFS_SB_VERSION_2;
- return XFS_SB_VERSION_1;
+ return true;
}
-static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp)
+static inline bool xfs_sb_good_version(struct xfs_sb *sbp)
{
- return sbp->sb_versionnum == XFS_SB_VERSION_2 ||
- sbp->sb_versionnum == XFS_SB_VERSION_3 ||
- (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT));
+ if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5)
+ return true;
+ if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
+ return xfs_sb_good_v4_features(sbp);
+ return false;
}
-static inline void xfs_sb_version_addattr(xfs_sb_t *sbp)
+/*
+ * Detect a mismatched features2 field. Older kernels read/wrote
+ * this into the wrong slot, so to be safe we keep them in sync.
+ */
+static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp)
{
- if (sbp->sb_versionnum == XFS_SB_VERSION_1)
- sbp->sb_versionnum = XFS_SB_VERSION_2;
- else if (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4)
- sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
- else
- sbp->sb_versionnum = XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
+ return sbp->sb_bad_features2 != sbp->sb_features2;
}
-static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp)
{
- return sbp->sb_versionnum == XFS_SB_VERSION_3 ||
- (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT));
+ return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT);
}
-static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp)
+static inline void xfs_sb_version_addattr(struct xfs_sb *sbp)
{
- if (sbp->sb_versionnum <= XFS_SB_VERSION_2)
- sbp->sb_versionnum = XFS_SB_VERSION_3;
- else
- sbp->sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
+ sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
}
-static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
+ return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
}
-static inline void xfs_sb_version_addquota(xfs_sb_t *sbp)
+static inline void xfs_sb_version_addquota(struct xfs_sb *sbp)
{
- if (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4)
- sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
- else
- sbp->sb_versionnum = xfs_sb_version_tonew(sbp->sb_versionnum) |
- XFS_SB_VERSION_QUOTABIT;
+ sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
}
-static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp)
{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
- (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
(sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT));
}
-static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
-}
-
-static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT);
-}
-
-static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp)
-{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
- (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT));
+ return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
}
-static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp)
{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
- (XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT));
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+ (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
}
-static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp)
{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
- (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT));
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+ (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
}
-static inline int xfs_sb_version_hassector(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
+ return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
}
-static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp)
{
- return XFS_SB_VERSION_NUM(sbp) >= XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
+ return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
}
-static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp)
{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
- (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
- (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT));
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
+ (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
}
/*
* sb_features2 bit version macros.
- *
- * For example, for a bit defined as XFS_SB_VERSION2_FUNBIT, has a macro:
- *
- * SB_VERSION_HASFUNBIT(xfs_sb_t *sbp)
- * ((xfs_sb_version_hasmorebits(sbp) &&
- * ((sbp)->sb_features2 & XFS_SB_VERSION2_FUNBIT)
*/
-
-static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp)
{
return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
(xfs_sb_version_hasmorebits(sbp) &&
(sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT));
}
-static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp)
{
return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
(xfs_sb_version_hasmorebits(sbp) &&
(sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT));
}
-static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp)
+static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
{
sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
+ sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT;
}
-static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
+static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp)
{
sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
+ sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
if (!sbp->sb_features2)
sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
}
-static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
+static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp)
{
return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) ||
(xfs_sb_version_hasmorebits(sbp) &&
(sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT));
}
-static inline void xfs_sb_version_addprojid32bit(xfs_sb_t *sbp)
+static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
{
sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
@@ -585,7 +496,9 @@ xfs_sb_has_compat_feature(
return (sbp->sb_features_compat & feature) != 0;
}
-#define XFS_SB_FEAT_RO_COMPAT_ALL 0
+#define XFS_SB_FEAT_RO_COMPAT_FINOBT (1 << 0) /* free inode btree */
+#define XFS_SB_FEAT_RO_COMPAT_ALL \
+ (XFS_SB_FEAT_RO_COMPAT_FINOBT)
#define XFS_SB_FEAT_RO_COMPAT_UNKNOWN ~XFS_SB_FEAT_RO_COMPAT_ALL
static inline bool
xfs_sb_has_ro_compat_feature(
@@ -621,12 +534,12 @@ xfs_sb_has_incompat_log_feature(
/*
* V5 superblock specific feature checks
*/
-static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp)
+static inline int xfs_sb_version_hascrc(struct xfs_sb *sbp)
{
return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
}
-static inline int xfs_sb_version_has_pquotino(xfs_sb_t *sbp)
+static inline int xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
{
return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
}
@@ -639,6 +552,12 @@ static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp)
(sbp->sb_features2 & XFS_SB_VERSION2_FTYPE));
}
+static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp)
+{
+ return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) &&
+ (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT);
+}
+
/*
* end of superblock version macros
*/
@@ -699,7 +618,4 @@ extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp);
-extern const struct xfs_buf_ops xfs_sb_buf_ops;
-extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
-
#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/xfs_shared.h b/fs/xfs/xfs_shared.h
new file mode 100644
index 00000000000..82404da2ca6
--- /dev/null
+++ b/fs/xfs/xfs_shared.h
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_SHARED_H__
+#define __XFS_SHARED_H__
+
+/*
+ * Definitions shared between kernel and userspace that don't fit into any other
+ * header file that is shared with userspace.
+ */
+struct xfs_ifork;
+struct xfs_buf;
+struct xfs_buf_ops;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_inode;
+
+/*
+ * Buffer verifier operations are widely used, including userspace tools
+ */
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+extern const struct xfs_buf_ops xfs_agfl_buf_ops;
+extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
+extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
+extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
+extern const struct xfs_buf_ops xfs_da3_node_buf_ops;
+extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+extern const struct xfs_buf_ops xfs_inode_buf_ops;
+extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
+extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+extern const struct xfs_buf_ops xfs_sb_buf_ops;
+extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
+extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+
+/*
+ * Transaction types. Used to distinguish types of buffers. These never reach
+ * the log.
+ */
+#define XFS_TRANS_SETATTR_NOT_SIZE 1
+#define XFS_TRANS_SETATTR_SIZE 2
+#define XFS_TRANS_INACTIVE 3
+#define XFS_TRANS_CREATE 4
+#define XFS_TRANS_CREATE_TRUNC 5
+#define XFS_TRANS_TRUNCATE_FILE 6
+#define XFS_TRANS_REMOVE 7
+#define XFS_TRANS_LINK 8
+#define XFS_TRANS_RENAME 9
+#define XFS_TRANS_MKDIR 10
+#define XFS_TRANS_RMDIR 11
+#define XFS_TRANS_SYMLINK 12
+#define XFS_TRANS_SET_DMATTRS 13
+#define XFS_TRANS_GROWFS 14
+#define XFS_TRANS_STRAT_WRITE 15
+#define XFS_TRANS_DIOSTRAT 16
+/* 17 was XFS_TRANS_WRITE_SYNC */
+#define XFS_TRANS_WRITEID 18
+#define XFS_TRANS_ADDAFORK 19
+#define XFS_TRANS_ATTRINVAL 20
+#define XFS_TRANS_ATRUNCATE 21
+#define XFS_TRANS_ATTR_SET 22
+#define XFS_TRANS_ATTR_RM 23
+#define XFS_TRANS_ATTR_FLAG 24
+#define XFS_TRANS_CLEAR_AGI_BUCKET 25
+#define XFS_TRANS_QM_SBCHANGE 26
+/*
+ * Dummy entries since we use the transaction type to index into the
+ * trans_type[] in xlog_recover_print_trans_head()
+ */
+#define XFS_TRANS_DUMMY1 27
+#define XFS_TRANS_DUMMY2 28
+#define XFS_TRANS_QM_QUOTAOFF 29
+#define XFS_TRANS_QM_DQALLOC 30
+#define XFS_TRANS_QM_SETQLIM 31
+#define XFS_TRANS_QM_DQCLUSTER 32
+#define XFS_TRANS_QM_QINOCREATE 33
+#define XFS_TRANS_QM_QUOTAOFF_END 34
+#define XFS_TRANS_SB_UNIT 35
+#define XFS_TRANS_FSYNC_TS 36
+#define XFS_TRANS_GROWFSRT_ALLOC 37
+#define XFS_TRANS_GROWFSRT_ZERO 38
+#define XFS_TRANS_GROWFSRT_FREE 39
+#define XFS_TRANS_SWAPEXT 40
+#define XFS_TRANS_SB_COUNT 41
+#define XFS_TRANS_CHECKPOINT 42
+#define XFS_TRANS_ICREATE 43
+#define XFS_TRANS_CREATE_TMPFILE 44
+#define XFS_TRANS_TYPE_MAX 44
+/* new transaction types need to be reflected in xfs_logprint(8) */
+
+#define XFS_TRANS_TYPES \
+ { XFS_TRANS_SETATTR_NOT_SIZE, "SETATTR_NOT_SIZE" }, \
+ { XFS_TRANS_SETATTR_SIZE, "SETATTR_SIZE" }, \
+ { XFS_TRANS_INACTIVE, "INACTIVE" }, \
+ { XFS_TRANS_CREATE, "CREATE" }, \
+ { XFS_TRANS_CREATE_TMPFILE, "CREATE_TMPFILE" }, \
+ { XFS_TRANS_CREATE_TRUNC, "CREATE_TRUNC" }, \
+ { XFS_TRANS_TRUNCATE_FILE, "TRUNCATE_FILE" }, \
+ { XFS_TRANS_REMOVE, "REMOVE" }, \
+ { XFS_TRANS_LINK, "LINK" }, \
+ { XFS_TRANS_RENAME, "RENAME" }, \
+ { XFS_TRANS_MKDIR, "MKDIR" }, \
+ { XFS_TRANS_RMDIR, "RMDIR" }, \
+ { XFS_TRANS_SYMLINK, "SYMLINK" }, \
+ { XFS_TRANS_SET_DMATTRS, "SET_DMATTRS" }, \
+ { XFS_TRANS_GROWFS, "GROWFS" }, \
+ { XFS_TRANS_STRAT_WRITE, "STRAT_WRITE" }, \
+ { XFS_TRANS_DIOSTRAT, "DIOSTRAT" }, \
+ { XFS_TRANS_WRITEID, "WRITEID" }, \
+ { XFS_TRANS_ADDAFORK, "ADDAFORK" }, \
+ { XFS_TRANS_ATTRINVAL, "ATTRINVAL" }, \
+ { XFS_TRANS_ATRUNCATE, "ATRUNCATE" }, \
+ { XFS_TRANS_ATTR_SET, "ATTR_SET" }, \
+ { XFS_TRANS_ATTR_RM, "ATTR_RM" }, \
+ { XFS_TRANS_ATTR_FLAG, "ATTR_FLAG" }, \
+ { XFS_TRANS_CLEAR_AGI_BUCKET, "CLEAR_AGI_BUCKET" }, \
+ { XFS_TRANS_QM_SBCHANGE, "QM_SBCHANGE" }, \
+ { XFS_TRANS_QM_QUOTAOFF, "QM_QUOTAOFF" }, \
+ { XFS_TRANS_QM_DQALLOC, "QM_DQALLOC" }, \
+ { XFS_TRANS_QM_SETQLIM, "QM_SETQLIM" }, \
+ { XFS_TRANS_QM_DQCLUSTER, "QM_DQCLUSTER" }, \
+ { XFS_TRANS_QM_QINOCREATE, "QM_QINOCREATE" }, \
+ { XFS_TRANS_QM_QUOTAOFF_END, "QM_QOFF_END" }, \
+ { XFS_TRANS_SB_UNIT, "SB_UNIT" }, \
+ { XFS_TRANS_FSYNC_TS, "FSYNC_TS" }, \
+ { XFS_TRANS_GROWFSRT_ALLOC, "GROWFSRT_ALLOC" }, \
+ { XFS_TRANS_GROWFSRT_ZERO, "GROWFSRT_ZERO" }, \
+ { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \
+ { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \
+ { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \
+ { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \
+ { XFS_TRANS_DUMMY1, "DUMMY1" }, \
+ { XFS_TRANS_DUMMY2, "DUMMY2" }, \
+ { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" }
+
+/*
+ * This structure is used to track log items associated with
+ * a transaction. It points to the log item and keeps some
+ * flags to track the state of the log item. It also tracks
+ * the amount of space needed to log the item it describes
+ * once we get to commit processing (see xfs_trans_commit()).
+ */
+struct xfs_log_item_desc {
+ struct xfs_log_item *lid_item;
+ struct list_head lid_trans;
+ unsigned char lid_flags;
+};
+
+#define XFS_LID_DIRTY 0x1
+
+/* log size calculation functions */
+int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
+int xfs_log_calc_minimum_size(struct xfs_mount *);
+
+
+/*
+ * Values for t_flags.
+ */
+#define XFS_TRANS_DIRTY 0x01 /* something needs to be logged */
+#define XFS_TRANS_SB_DIRTY 0x02 /* superblock is modified */
+#define XFS_TRANS_PERM_LOG_RES 0x04 /* xact took a permanent log res */
+#define XFS_TRANS_SYNC 0x08 /* make commit synchronous */
+#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */
+#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
+#define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer
+ count in superblock */
+/*
+ * Values for call flags parameter.
+ */
+#define XFS_TRANS_RELEASE_LOG_RES 0x4
+#define XFS_TRANS_ABORT 0x8
+
+/*
+ * Field values for xfs_trans_mod_sb.
+ */
+#define XFS_TRANS_SB_ICOUNT 0x00000001
+#define XFS_TRANS_SB_IFREE 0x00000002
+#define XFS_TRANS_SB_FDBLOCKS 0x00000004
+#define XFS_TRANS_SB_RES_FDBLOCKS 0x00000008
+#define XFS_TRANS_SB_FREXTENTS 0x00000010
+#define XFS_TRANS_SB_RES_FREXTENTS 0x00000020
+#define XFS_TRANS_SB_DBLOCKS 0x00000040
+#define XFS_TRANS_SB_AGCOUNT 0x00000080
+#define XFS_TRANS_SB_IMAXPCT 0x00000100
+#define XFS_TRANS_SB_REXTSIZE 0x00000200
+#define XFS_TRANS_SB_RBMBLOCKS 0x00000400
+#define XFS_TRANS_SB_RBLOCKS 0x00000800
+#define XFS_TRANS_SB_REXTENTS 0x00001000
+#define XFS_TRANS_SB_REXTSLOG 0x00002000
+
+/*
+ * Here we centralize the specification of XFS meta-data buffer reference count
+ * values. This determines how hard the buffer cache tries to hold onto the
+ * buffer.
+ */
+#define XFS_AGF_REF 4
+#define XFS_AGI_REF 4
+#define XFS_AGFL_REF 3
+#define XFS_INO_BTREE_REF 3
+#define XFS_ALLOC_BTREE_REF 2
+#define XFS_BMAP_BTREE_REF 2
+#define XFS_DIR_BTREE_REF 2
+#define XFS_INO_REF 2
+#define XFS_ATTR_BTREE_REF 1
+#define XFS_DQUOT_REF 1
+
+/*
+ * Flags for xfs_trans_ichgtime().
+ */
+#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
+#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
+#define XFS_ICHGTIME_CREATE 0x4 /* inode create timestamp */
+
+
+/*
+ * Symlink decoding/encoding functions
+ */
+int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
+int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
+ uint32_t size, struct xfs_buf *bp);
+bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
+ uint32_t size, struct xfs_buf *bp);
+void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
+ struct xfs_inode *ip, struct xfs_ifork *ifp);
+
+#endif /* __XFS_SHARED_H__ */
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index ce372b7d564..f2240383d4b 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -59,6 +59,7 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v)
{ "abtc2", XFSSTAT_END_ABTC_V2 },
{ "bmbt2", XFSSTAT_END_BMBT_V2 },
{ "ibt2", XFSSTAT_END_IBT_V2 },
+ { "fibt2", XFSSTAT_END_FIBT_V2 },
/* we print both series of quota information together */
{ "qm", XFSSTAT_END_QM },
};
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index c03ad38ceae..c8f238b8299 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -183,7 +183,23 @@ struct xfsstats {
__uint32_t xs_ibt_2_alloc;
__uint32_t xs_ibt_2_free;
__uint32_t xs_ibt_2_moves;
-#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_IBT_V2+6)
+#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2+15)
+ __uint32_t xs_fibt_2_lookup;
+ __uint32_t xs_fibt_2_compare;
+ __uint32_t xs_fibt_2_insrec;
+ __uint32_t xs_fibt_2_delrec;
+ __uint32_t xs_fibt_2_newroot;
+ __uint32_t xs_fibt_2_killroot;
+ __uint32_t xs_fibt_2_increment;
+ __uint32_t xs_fibt_2_decrement;
+ __uint32_t xs_fibt_2_lshift;
+ __uint32_t xs_fibt_2_rshift;
+ __uint32_t xs_fibt_2_split;
+ __uint32_t xs_fibt_2_join;
+ __uint32_t xs_fibt_2_alloc;
+ __uint32_t xs_fibt_2_free;
+ __uint32_t xs_fibt_2_moves;
+#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_FIBT_V2+6)
__uint32_t xs_qm_dqreclaims;
__uint32_t xs_qm_dqreclaim_misses;
__uint32_t xs_qm_dquot_dups;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 15188cc9944..8f0333b3f7a 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -17,34 +17,26 @@
*/
#include "xfs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
-#include "xfs_log.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
+#include "xfs_da_format.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
-#include "xfs_ialloc.h"
#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
+#include "xfs_alloc.h"
#include "xfs_error.h"
-#include "xfs_itable.h"
#include "xfs_fsops.h"
-#include "xfs_attr.h"
+#include "xfs_trans.h"
#include "xfs_buf_item.h"
+#include "xfs_log.h"
#include "xfs_log_priv.h"
-#include "xfs_trans_priv.h"
-#include "xfs_filestream.h"
#include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
#include "xfs_dir2.h"
#include "xfs_extfree_item.h"
#include "xfs_mru_cache.h"
@@ -52,6 +44,9 @@
#include "xfs_icache.h"
#include "xfs_trace.h"
#include "xfs_icreate_item.h"
+#include "xfs_dinode.h"
+#include "xfs_filestream.h"
+#include "xfs_quota.h"
#include <linux/namei.h>
#include <linux/init.h>
@@ -770,20 +765,18 @@ xfs_open_devices(
* Setup xfs_mount buffer target pointers
*/
error = ENOMEM;
- mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname);
+ mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
if (!mp->m_ddev_targp)
goto out_close_rtdev;
if (rtdev) {
- mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1,
- mp->m_fsname);
+ mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev);
if (!mp->m_rtdev_targp)
goto out_free_ddev_targ;
}
if (logdev && logdev != ddev) {
- mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1,
- mp->m_fsname);
+ mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev);
if (!mp->m_logdev_targp)
goto out_free_rtdev_targ;
} else {
@@ -816,8 +809,7 @@ xfs_setup_devices(
{
int error;
- error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize,
- mp->m_sb.sb_sectsize);
+ error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize);
if (error)
return error;
@@ -827,14 +819,12 @@ xfs_setup_devices(
if (xfs_sb_version_hassector(&mp->m_sb))
log_sector_size = mp->m_sb.sb_logsectsize;
error = xfs_setsize_buftarg(mp->m_logdev_targp,
- mp->m_sb.sb_blocksize,
log_sector_size);
if (error)
return error;
}
if (mp->m_rtdev_targp) {
error = xfs_setsize_buftarg(mp->m_rtdev_targp,
- mp->m_sb.sb_blocksize,
mp->m_sb.sb_sectsize);
if (error)
return error;
@@ -946,10 +936,6 @@ xfs_fs_destroy_inode(
XFS_STATS_INC(vn_reclaim);
- /* bad inode, get out here ASAP */
- if (is_bad_inode(inode))
- goto out_reclaim;
-
ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
/*
@@ -965,7 +951,6 @@ xfs_fs_destroy_inode(
* this more efficiently than we can here, so simply let background
* reclaim tear down all inodes.
*/
-out_reclaim:
xfs_inode_set_reclaim_tag(ip);
}
@@ -1006,7 +991,7 @@ xfs_fs_evict_inode(
trace_xfs_evict_inode(ip);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
XFS_STATS_INC(vn_rele);
XFS_STATS_INC(vn_remove);
@@ -1165,7 +1150,7 @@ xfs_restore_resvblks(struct xfs_mount *mp)
* Note: xfs_log_quiesce() stops background log work - the callers must ensure
* it is started again when appropriate.
*/
-void
+static void
xfs_quiesce_attr(
struct xfs_mount *mp)
{
@@ -1207,6 +1192,7 @@ xfs_fs_remount(
char *p;
int error;
+ sync_filesystem(sb);
while ((p = strsep(&options, ",")) != NULL) {
int token;
@@ -1246,7 +1232,7 @@ xfs_fs_remount(
*/
#if 0
xfs_info(mp,
- "mount option \"%s\" not supported for remount\n", p);
+ "mount option \"%s\" not supported for remount", p);
return -EINVAL;
#else
break;
@@ -1442,11 +1428,11 @@ xfs_fs_fill_super(
if (error)
goto out_free_fsname;
- error = xfs_init_mount_workqueues(mp);
+ error = -xfs_init_mount_workqueues(mp);
if (error)
goto out_close_devices;
- error = xfs_icsb_init_counters(mp);
+ error = -xfs_icsb_init_counters(mp);
if (error)
goto out_destroy_workqueues;
@@ -1491,10 +1477,6 @@ xfs_fs_fill_super(
error = ENOENT;
goto out_unmount;
}
- if (is_bad_inode(root)) {
- error = EINVAL;
- goto out_unmount;
- }
sb->s_root = d_make_root(root);
if (!sb->s_root) {
error = ENOMEM;
@@ -1767,13 +1749,9 @@ init_xfs_fs(void)
if (error)
goto out_destroy_wq;
- error = xfs_filestream_init();
- if (error)
- goto out_mru_cache_uninit;
-
error = xfs_buf_init();
if (error)
- goto out_filestream_uninit;
+ goto out_mru_cache_uninit;
error = xfs_init_procfs();
if (error)
@@ -1800,8 +1778,6 @@ init_xfs_fs(void)
xfs_cleanup_procfs();
out_buf_terminate:
xfs_buf_terminate();
- out_filestream_uninit:
- xfs_filestream_uninit();
out_mru_cache_uninit:
xfs_mru_cache_uninit();
out_destroy_wq:
@@ -1820,7 +1796,6 @@ exit_xfs_fs(void)
xfs_sysctl_unregister();
xfs_cleanup_procfs();
xfs_buf_terminate();
- xfs_filestream_uninit();
xfs_mru_cache_uninit();
xfs_destroy_workqueues();
xfs_destroy_zones();
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index f622a97a7e3..d69363c833e 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -17,31 +17,32 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "xfs.h"
+#include "xfs_shared.h"
#include "xfs_fs.h"
#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_dir2_format.h"
#include "xfs_dir2.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
#include "xfs_bmap_util.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
#include "xfs_symlink.h"
-#include "xfs_buf_item.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
+#include "xfs_dinode.h"
/* ----- Kernel only functions below ----- */
STATIC int
@@ -80,6 +81,10 @@ xfs_readlink_bmap(
if (error) {
xfs_buf_ioerror_alert(bp, __func__);
xfs_buf_relse(bp);
+
+ /* bad CRC means corrupted metadata */
+ if (error == EFSBADCRC)
+ error = EFSCORRUPTED;
goto out;
}
byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
@@ -88,7 +93,7 @@ xfs_readlink_bmap(
cur_chunk = bp->b_addr;
if (xfs_sb_version_hascrc(&mp->m_sb)) {
- if (!xfs_symlink_hdr_ok(mp, ip->i_ino, offset,
+ if (!xfs_symlink_hdr_ok(ip->i_ino, offset,
byte_cnt, bp)) {
error = EFSCORRUPTED;
xfs_alert(mp,
@@ -208,10 +213,7 @@ xfs_symlink(
return XFS_ERROR(ENAMETOOLONG);
udqp = gdqp = NULL;
- if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
- prid = xfs_get_projid(dp);
- else
- prid = XFS_PROJID_DEFAULT;
+ prid = xfs_get_initial_prid(dp);
/*
* Make sure that we have allocated dquot(s) on disk.
@@ -424,8 +426,7 @@ xfs_symlink(
*/
STATIC int
xfs_inactive_symlink_rmt(
- xfs_inode_t *ip,
- xfs_trans_t **tpp)
+ struct xfs_inode *ip)
{
xfs_buf_t *bp;
int committed;
@@ -437,11 +438,9 @@ xfs_inactive_symlink_rmt(
xfs_mount_t *mp;
xfs_bmbt_irec_t mval[XFS_SYMLINK_MAPS];
int nmaps;
- xfs_trans_t *ntp;
int size;
xfs_trans_t *tp;
- tp = *tpp;
mp = ip->i_mount;
ASSERT(ip->i_df.if_flags & XFS_IFEXTENTS);
/*
@@ -453,6 +452,16 @@ xfs_inactive_symlink_rmt(
*/
ASSERT(ip->i_d.di_nextents > 0 && ip->i_d.di_nextents <= 2);
+ tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
/*
* Lock the inode, fix the size, and join it to the transaction.
* Hold it so in the normal path, we still have it locked for
@@ -471,7 +480,7 @@ xfs_inactive_symlink_rmt(
error = xfs_bmapi_read(ip, 0, xfs_symlink_blocks(mp, size),
mval, &nmaps, 0);
if (error)
- goto error0;
+ goto error_trans_cancel;
/*
* Invalidate the block(s). No validation is done.
*/
@@ -481,22 +490,24 @@ xfs_inactive_symlink_rmt(
XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
if (!bp) {
error = ENOMEM;
- goto error1;
+ goto error_bmap_cancel;
}
xfs_trans_binval(tp, bp);
}
/*
* Unmap the dead block(s) to the free_list.
*/
- if ((error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
- &first_block, &free_list, &done)))
- goto error1;
+ error = xfs_bunmapi(tp, ip, 0, size, XFS_BMAPI_METADATA, nmaps,
+ &first_block, &free_list, &done);
+ if (error)
+ goto error_bmap_cancel;
ASSERT(done);
/*
* Commit the first transaction. This logs the EFI and the inode.
*/
- if ((error = xfs_bmap_finish(&tp, &free_list, &committed)))
- goto error1;
+ error = xfs_bmap_finish(&tp, &free_list, &committed);
+ if (error)
+ goto error_bmap_cancel;
/*
* The transaction must have been committed, since there were
* actually extents freed by xfs_bunmapi. See xfs_bmap_finish.
@@ -511,26 +522,13 @@ xfs_inactive_symlink_rmt(
xfs_trans_ijoin(tp, ip, 0);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
/*
- * Get a new, empty transaction to return to our caller.
- */
- ntp = xfs_trans_dup(tp);
- /*
* Commit the transaction containing extent freeing and EFDs.
- * If we get an error on the commit here or on the reserve below,
- * we need to unlock the inode since the new transaction doesn't
- * have the inode attached.
*/
- error = xfs_trans_commit(tp, 0);
- tp = ntp;
+ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
if (error) {
ASSERT(XFS_FORCED_SHUTDOWN(mp));
- goto error0;
+ goto error_unlock;
}
- /*
- * transaction commit worked ok so we can drop the extra ticket
- * reference that we gained in xfs_trans_dup()
- */
- xfs_log_ticket_put(tp->t_ticket);
/*
* Remove the memory for extent descriptions (just bookkeeping).
@@ -538,23 +536,16 @@ xfs_inactive_symlink_rmt(
if (ip->i_df.if_bytes)
xfs_idata_realloc(ip, -ip->i_df.if_bytes, XFS_DATA_FORK);
ASSERT(ip->i_df.if_bytes == 0);
- /*
- * Put an itruncate log reservation in the new transaction
- * for our caller.
- */
- error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
- if (error) {
- ASSERT(XFS_FORCED_SHUTDOWN(mp));
- goto error0;
- }
- xfs_trans_ijoin(tp, ip, 0);
- *tpp = tp;
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return 0;
- error1:
+error_bmap_cancel:
xfs_bmap_cancel(&free_list);
- error0:
+error_trans_cancel:
+ xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
+error_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -563,41 +554,46 @@ xfs_inactive_symlink_rmt(
*/
int
xfs_inactive_symlink(
- struct xfs_inode *ip,
- struct xfs_trans **tp)
+ struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
int pathlen;
trace_xfs_inactive_symlink(ip);
- ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
/*
* Zero length symlinks _can_ exist.
*/
pathlen = (int)ip->i_d.di_size;
- if (!pathlen)
+ if (!pathlen) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return 0;
+ }
if (pathlen < 0 || pathlen > MAXPATHLEN) {
xfs_alert(mp, "%s: inode (0x%llx) bad symlink length (%d)",
__func__, (unsigned long long)ip->i_ino, pathlen);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
ASSERT(0);
return XFS_ERROR(EFSCORRUPTED);
}
if (ip->i_df.if_flags & XFS_IFINLINE) {
- if (ip->i_df.if_bytes > 0)
+ if (ip->i_df.if_bytes > 0)
xfs_idata_realloc(ip, -(ip->i_df.if_bytes),
XFS_DATA_FORK);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
ASSERT(ip->i_df.if_bytes == 0);
return 0;
}
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
/* remove the remote symlink */
- return xfs_inactive_symlink_rmt(ip, tp);
+ return xfs_inactive_symlink_rmt(ip);
}
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
index 99338ba666a..e75245d0911 100644
--- a/fs/xfs/xfs_symlink.h
+++ b/fs/xfs/xfs_symlink.h
@@ -22,6 +22,6 @@
int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
const char *target_path, umode_t mode, struct xfs_inode **ipp);
int xfs_readlink(struct xfs_inode *ip, char *link);
-int xfs_inactive_symlink(struct xfs_inode *ip, struct xfs_trans **tpp);
+int xfs_inactive_symlink(struct xfs_inode *ip);
#endif /* __XFS_SYMLINK_H */
diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/xfs_symlink_remote.c
index 01c85e3f647..23c2f2577c8 100644
--- a/fs/xfs/xfs_symlink_remote.c
+++ b/fs/xfs/xfs_symlink_remote.c
@@ -19,8 +19,9 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_shared.h"
+#include "xfs_trans_resv.h"
#include "xfs_ag.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
@@ -30,6 +31,7 @@
#include "xfs_trace.h"
#include "xfs_symlink.h"
#include "xfs_cksum.h"
+#include "xfs_trans.h"
#include "xfs_buf_item.h"
@@ -78,7 +80,6 @@ xfs_symlink_hdr_set(
*/
bool
xfs_symlink_hdr_ok(
- struct xfs_mount *mp,
xfs_ino_t ino,
uint32_t offset,
uint32_t size,
@@ -131,12 +132,13 @@ xfs_symlink_read_verify(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return;
- if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
- offsetof(struct xfs_dsymlink_hdr, sl_crc)) ||
- !xfs_symlink_verify(bp)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
+ if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF))
+ xfs_buf_ioerror(bp, EFSBADCRC);
+ else if (!xfs_symlink_verify(bp))
xfs_buf_ioerror(bp, EFSCORRUPTED);
- }
+
+ if (bp->b_error)
+ xfs_verifier_error(bp);
}
static void
@@ -151,8 +153,8 @@ xfs_symlink_write_verify(
return;
if (!xfs_symlink_verify(bp)) {
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr);
xfs_buf_ioerror(bp, EFSCORRUPTED);
+ xfs_verifier_error(bp);
return;
}
@@ -160,8 +162,7 @@ xfs_symlink_write_verify(
struct xfs_dsymlink_hdr *dsl = bp->b_addr;
dsl->sl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
}
- xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
- offsetof(struct xfs_dsymlink_hdr, sl_crc));
+ xfs_buf_update_cksum(bp, XFS_SYMLINK_CRC_OFF);
}
const struct xfs_buf_ops xfs_symlink_buf_ops = {
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 5d7b3e40705..1e85bcd0e41 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -17,19 +17,16 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
-#include "xfs_mount.h"
#include "xfs_da_btree.h"
#include "xfs_ialloc.h"
#include "xfs_itable.h"
@@ -37,6 +34,8 @@
#include "xfs_bmap.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_buf_item.h"
#include "xfs_quota.h"
@@ -46,6 +45,8 @@
#include "xfs_dquot.h"
#include "xfs_log_recover.h"
#include "xfs_inode_item.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_filestream.h"
/*
* We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 47910e638c1..152f8278263 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -31,8 +31,8 @@ struct xfs_da_args;
struct xfs_da_node_entry;
struct xfs_dquot;
struct xfs_log_item;
-struct xlog_ticket;
struct xlog;
+struct xlog_ticket;
struct xlog_recover;
struct xlog_recover_item;
struct xfs_buf_log_format;
@@ -135,6 +135,31 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks);
DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks);
+DECLARE_EVENT_CLASS(xfs_ag_class,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
+ TP_ARGS(mp, agno),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ ),
+ TP_printk("dev %d:%d agno %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno)
+);
+#define DEFINE_AG_EVENT(name) \
+DEFINE_EVENT(xfs_ag_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), \
+ TP_ARGS(mp, agno))
+
+DEFINE_AG_EVENT(xfs_read_agf);
+DEFINE_AG_EVENT(xfs_alloc_read_agf);
+DEFINE_AG_EVENT(xfs_read_agi);
+DEFINE_AG_EVENT(xfs_ialloc_read_agi);
+
TRACE_EVENT(xfs_attr_list_node_descend,
TP_PROTO(struct xfs_attr_list_context *ctx,
struct xfs_da_node_entry *btree),
@@ -513,6 +538,64 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered);
+DECLARE_EVENT_CLASS(xfs_filestream_class,
+ TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno),
+ TP_ARGS(ip, agno),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_agnumber_t, agno)
+ __field(int, streams)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->agno = agno;
+ __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx agno %u streams %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->agno,
+ __entry->streams)
+)
+#define DEFINE_FILESTREAM_EVENT(name) \
+DEFINE_EVENT(xfs_filestream_class, name, \
+ TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), \
+ TP_ARGS(ip, agno))
+DEFINE_FILESTREAM_EVENT(xfs_filestream_free);
+DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup);
+DEFINE_FILESTREAM_EVENT(xfs_filestream_scan);
+
+TRACE_EVENT(xfs_filestream_pick,
+ TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno,
+ xfs_extlen_t free, int nscan),
+ TP_ARGS(ip, agno, free, nscan),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_agnumber_t, agno)
+ __field(int, streams)
+ __field(xfs_extlen_t, free)
+ __field(int, nscan)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->agno = agno;
+ __entry->streams = xfs_filestream_peek_ag(ip->i_mount, agno);
+ __entry->free = free;
+ __entry->nscan = nscan;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx agno %u streams %d free %d nscan %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->agno,
+ __entry->streams,
+ __entry->free,
+ __entry->nscan)
+);
+
DECLARE_EVENT_CLASS(xfs_lock_class,
TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
unsigned long caller_ip),
@@ -578,6 +661,8 @@ DEFINE_INODE_EVENT(xfs_readlink);
DEFINE_INODE_EVENT(xfs_inactive_symlink);
DEFINE_INODE_EVENT(xfs_alloc_file_space);
DEFINE_INODE_EVENT(xfs_free_file_space);
+DEFINE_INODE_EVENT(xfs_zero_file_space);
+DEFINE_INODE_EVENT(xfs_collapse_file_space);
DEFINE_INODE_EVENT(xfs_readdir);
#ifdef CONFIG_XFS_POSIX_ACL
DEFINE_INODE_EVENT(xfs_get_acl);
@@ -938,6 +1023,63 @@ DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing);
+DECLARE_EVENT_CLASS(xfs_ail_class,
+ TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn),
+ TP_ARGS(lip, old_lsn, new_lsn),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(void *, lip)
+ __field(uint, type)
+ __field(uint, flags)
+ __field(xfs_lsn_t, old_lsn)
+ __field(xfs_lsn_t, new_lsn)
+ ),
+ TP_fast_assign(
+ __entry->dev = lip->li_mountp->m_super->s_dev;
+ __entry->lip = lip;
+ __entry->type = lip->li_type;
+ __entry->flags = lip->li_flags;
+ __entry->old_lsn = old_lsn;
+ __entry->new_lsn = new_lsn;
+ ),
+ TP_printk("dev %d:%d lip 0x%p old lsn %d/%d new lsn %d/%d type %s flags %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->lip,
+ CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn),
+ CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn),
+ __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
+ __print_flags(__entry->flags, "|", XFS_LI_FLAGS))
+)
+
+#define DEFINE_AIL_EVENT(name) \
+DEFINE_EVENT(xfs_ail_class, name, \
+ TP_PROTO(struct xfs_log_item *lip, xfs_lsn_t old_lsn, xfs_lsn_t new_lsn), \
+ TP_ARGS(lip, old_lsn, new_lsn))
+DEFINE_AIL_EVENT(xfs_ail_insert);
+DEFINE_AIL_EVENT(xfs_ail_move);
+DEFINE_AIL_EVENT(xfs_ail_delete);
+
+TRACE_EVENT(xfs_log_assign_tail_lsn,
+ TP_PROTO(struct xlog *log, xfs_lsn_t new_lsn),
+ TP_ARGS(log, new_lsn),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_lsn_t, new_lsn)
+ __field(xfs_lsn_t, old_lsn)
+ __field(xfs_lsn_t, last_sync_lsn)
+ ),
+ TP_fast_assign(
+ __entry->dev = log->l_mp->m_super->s_dev;
+ __entry->new_lsn = new_lsn;
+ __entry->old_lsn = atomic64_read(&log->l_tail_lsn);
+ __entry->last_sync_lsn = atomic64_read(&log->l_last_sync_lsn);
+ ),
+ TP_printk("dev %d:%d new tail lsn %d/%d, old lsn %d/%d, last sync %d/%d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ CYCLE_LSN(__entry->new_lsn), BLOCK_LSN(__entry->new_lsn),
+ CYCLE_LSN(__entry->old_lsn), BLOCK_LSN(__entry->old_lsn),
+ CYCLE_LSN(__entry->last_sync_lsn), BLOCK_LSN(__entry->last_sync_lsn))
+)
DECLARE_EVENT_CLASS(xfs_file_class,
TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
@@ -976,7 +1118,6 @@ DEFINE_RW_EVENT(xfs_file_read);
DEFINE_RW_EVENT(xfs_file_buffered_write);
DEFINE_RW_EVENT(xfs_file_direct_write);
DEFINE_RW_EVENT(xfs_file_splice_read);
-DEFINE_RW_EVENT(xfs_file_splice_write);
DECLARE_EVENT_CLASS(xfs_page_class,
TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 5411e01ab45..d03932564cc 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -18,32 +18,21 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_error.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
-#include "xfs_bmap.h"
#include "xfs_quota.h"
-#include "xfs_qm.h"
+#include "xfs_trans.h"
#include "xfs_trans_priv.h"
-#include "xfs_trans_space.h"
-#include "xfs_inode_item.h"
-#include "xfs_log_priv.h"
-#include "xfs_buf_item.h"
+#include "xfs_log.h"
#include "xfs_trace.h"
+#include "xfs_error.h"
kmem_zone_t *xfs_trans_zone;
kmem_zone_t *xfs_log_item_desc_zone;
@@ -838,7 +827,7 @@ xfs_trans_committed_bulk(
xfs_log_item_batch_insert(ailp, &cur, log_items, i, commit_lsn);
spin_lock(&ailp->xa_lock);
- xfs_trans_ail_cursor_done(ailp, &cur);
+ xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
}
@@ -898,12 +887,7 @@ xfs_trans_commit(
xfs_trans_apply_sb_deltas(tp);
xfs_trans_apply_dquot_deltas(tp);
- error = xfs_log_commit_cil(mp, tp, &commit_lsn, flags);
- if (error == ENOMEM) {
- xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
- error = XFS_ERROR(EIO);
- goto out_unreserve;
- }
+ xfs_log_commit_cil(mp, tp, &commit_lsn, flags);
current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
xfs_trans_free(tp);
@@ -913,10 +897,7 @@ xfs_trans_commit(
* log out now and wait for it.
*/
if (sync) {
- if (!error) {
- error = _xfs_log_force_lsn(mp, commit_lsn,
- XFS_LOG_SYNC, NULL);
- }
+ error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL);
XFS_STATS_INC(xs_trans_sync);
} else {
XFS_STATS_INC(xs_trans_async);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 09cf40b89e8..b5bc1ab3c4d 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -18,10 +18,6 @@
#ifndef __XFS_TRANS_H__
#define __XFS_TRANS_H__
-struct xfs_log_item;
-
-#include "xfs_trans_resv.h"
-
/* kernel only transaction subsystem defines */
struct xfs_buf;
@@ -68,7 +64,7 @@ typedef struct xfs_log_item {
struct xfs_item_ops {
void (*iop_size)(xfs_log_item_t *, int *, int *);
- void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
+ void (*iop_format)(xfs_log_item_t *, struct xfs_log_vec *);
void (*iop_pin)(xfs_log_item_t *);
void (*iop_unpin)(xfs_log_item_t *, int remove);
uint (*iop_push)(struct xfs_log_item *, struct list_head *);
@@ -77,6 +73,9 @@ struct xfs_item_ops {
void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
};
+void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
+ int type, const struct xfs_item_ops *ops);
+
/*
* Return values for the iop_push() routines.
*/
@@ -85,18 +84,12 @@ struct xfs_item_ops {
#define XFS_ITEM_LOCKED 2
#define XFS_ITEM_FLUSHING 3
-/*
- * This is the type of function which can be given to xfs_trans_callback()
- * to be called upon the transaction's commit to disk.
- */
-typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
/*
* This is the structure maintained for every active transaction.
*/
typedef struct xfs_trans {
unsigned int t_magic; /* magic number */
- xfs_log_callback_t t_logcb; /* log callback struct */
unsigned int t_type; /* transaction type */
unsigned int t_log_res; /* amt of log space resvd */
unsigned int t_log_count; /* count for perm log res */
@@ -132,7 +125,6 @@ typedef struct xfs_trans {
int64_t t_rextents_delta;/* superblocks rextents chg */
int64_t t_rextslog_delta;/* superblocks rextslog chg */
struct list_head t_items; /* log item descriptors */
- xfs_trans_header_t t_header; /* header for in-log trans */
struct list_head t_busy; /* list of busy extents */
unsigned long t_pflags; /* saved process flags state */
} xfs_trans_t;
@@ -237,10 +229,16 @@ void xfs_trans_log_efd_extent(xfs_trans_t *,
xfs_fsblock_t,
xfs_extlen_t);
int xfs_trans_commit(xfs_trans_t *, uint flags);
+int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
void xfs_trans_cancel(xfs_trans_t *, int);
int xfs_trans_ail_init(struct xfs_mount *);
void xfs_trans_ail_destroy(struct xfs_mount *);
+void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *,
+ enum xfs_blft);
+void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
+ struct xfs_buf *src_bp);
+
extern kmem_zone_t *xfs_trans_zone;
extern kmem_zone_t *xfs_log_item_desc_zone;
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 21c6d7ddbc0..cb0f3a84cc6 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -18,15 +18,16 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_trace.h"
#include "xfs_error.h"
+#include "xfs_log.h"
#ifdef DEBUG
/*
@@ -172,7 +173,6 @@ xfs_trans_ail_cursor_next(
*/
void
xfs_trans_ail_cursor_done(
- struct xfs_ail *ailp,
struct xfs_ail_cursor *cur)
{
cur->item = NULL;
@@ -367,7 +367,7 @@ xfsaild_push(
* If the AIL is empty or our push has reached the end we are
* done now.
*/
- xfs_trans_ail_cursor_done(ailp, &cur);
+ xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
goto out_done;
}
@@ -452,7 +452,7 @@ xfsaild_push(
break;
lsn = lip->li_lsn;
}
- xfs_trans_ail_cursor_done(ailp, &cur);
+ xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list))
@@ -658,11 +658,13 @@ xfs_trans_ail_update_bulk(
if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0)
continue;
+ trace_xfs_ail_move(lip, lip->li_lsn, lsn);
xfs_ail_delete(ailp, lip);
if (mlip == lip)
mlip_changed = 1;
} else {
lip->li_flags |= XFS_LI_IN_AIL;
+ trace_xfs_ail_insert(lip, 0, lsn);
}
lip->li_lsn = lsn;
list_add(&lip->li_ail, &tmp);
@@ -731,6 +733,7 @@ xfs_trans_ail_delete_bulk(
return;
}
+ trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
xfs_ail_delete(ailp, lip);
lip->li_flags &= ~XFS_LI_IN_AIL;
lip->li_lsn = 0;
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 8c75b8f6727..b8eef0549f3 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -17,17 +17,15 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_trans_priv.h"
#include "xfs_error.h"
@@ -277,6 +275,10 @@ xfs_trans_read_buf_map(
XFS_BUF_UNDONE(bp);
xfs_buf_stale(bp);
xfs_buf_relse(bp);
+
+ /* bad CRC means corrupted metadata */
+ if (error == EFSBADCRC)
+ error = EFSCORRUPTED;
return error;
}
#ifdef DEBUG
@@ -316,7 +318,18 @@ xfs_trans_read_buf_map(
ASSERT(bp->b_iodone == NULL);
XFS_BUF_READ(bp);
bp->b_ops = ops;
- xfsbdstrat(tp->t_mountp, bp);
+
+ /*
+ * XXX(hch): clean up the error handling here to be less
+ * of a mess..
+ */
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ trace_xfs_bdstrat_shut(bp, _RET_IP_);
+ xfs_bioerror_relse(bp);
+ } else {
+ xfs_buf_iorequest(bp);
+ }
+
error = xfs_buf_iowait(bp);
if (error) {
xfs_buf_ioerror_alert(bp, __func__);
@@ -329,6 +342,9 @@ xfs_trans_read_buf_map(
if (tp->t_flags & XFS_TRANS_DIRTY)
xfs_force_shutdown(tp->t_mountp,
SHUTDOWN_META_IO_ERROR);
+ /* bad CRC means corrupted metadata */
+ if (error == EFSBADCRC)
+ error = EFSCORRUPTED;
return error;
}
}
@@ -366,6 +382,10 @@ xfs_trans_read_buf_map(
if (tp->t_flags & XFS_TRANS_DIRTY)
xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR);
xfs_buf_relse(bp);
+
+ /* bad CRC means corrupted metadata */
+ if (error == EFSBADCRC)
+ error = EFSCORRUPTED;
return error;
}
#ifdef DEBUG
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 54ee3c5dee7..41172861e85 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -17,23 +17,18 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
-#include "xfs_itable.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
#include "xfs_error.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
+#include "xfs_trans.h"
#include "xfs_trans_priv.h"
+#include "xfs_quota.h"
#include "xfs_qm.h"
STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *);
@@ -300,8 +295,8 @@ xfs_trans_mod_dquot(
/*
* Given an array of dqtrx structures, lock all the dquots associated and join
* them to the transaction, provided they have been modified. We know that the
- * highest number of dquots of one type - usr, grp OR prj - involved in a
- * transaction is 2 so we don't need to make this very generic.
+ * highest number of dquots of one type - usr, grp and prj - involved in a
+ * transaction is 3 so we don't need to make this very generic.
*/
STATIC void
xfs_trans_dqlockedjoin(
diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c
index 8d71b16ecca..47978ba89da 100644
--- a/fs/xfs/xfs_trans_extfree.c
+++ b/fs/xfs/xfs_trans_extfree.c
@@ -17,12 +17,13 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
+#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_extfree_item.h"
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 53dfe46f368..50c3f561428 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -17,18 +17,15 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_btree.h"
+#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_inode_item.h"
#include "xfs_trace.h"
@@ -114,12 +111,14 @@ xfs_trans_log_inode(
/*
* First time we log the inode in a transaction, bump the inode change
- * counter if it is configured for this to occur.
+ * counter if it is configured for this to occur. We don't use
+ * inode_inc_version() because there is no need for extra locking around
+ * i_version as we already hold the inode locked exclusively for
+ * metadata modification.
*/
if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
IS_I_VERSION(VFS_I(ip))) {
- inode_inc_iversion(VFS_I(ip));
- ip->i_d.di_changecount = VFS_I(ip)->i_version;
+ ip->i_d.di_changecount = ++VFS_I(ip)->i_version;
flags |= XFS_ILOG_CORE;
}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index c52def0b441..bd1281862ad 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -27,7 +27,6 @@ struct xfs_log_vec;
void xfs_trans_init(struct xfs_mount *);
-int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
void xfs_trans_del_item(struct xfs_log_item *);
void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
@@ -134,8 +133,7 @@ struct xfs_log_item * xfs_trans_ail_cursor_last(struct xfs_ail *ailp,
xfs_lsn_t lsn);
struct xfs_log_item * xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
struct xfs_ail_cursor *cur);
-void xfs_trans_ail_cursor_done(struct xfs_ail *ailp,
- struct xfs_ail_cursor *cur);
+void xfs_trans_ail_cursor_done(struct xfs_ail_cursor *cur);
#if BITS_PER_LONG != 64
static inline void
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
index a65a3cc4061..f2bda7c76b8 100644
--- a/fs/xfs/xfs_trans_resv.c
+++ b/fs/xfs/xfs_trans_resv.c
@@ -18,27 +18,20 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_shared.h"
#include "xfs_format.h"
-#include "xfs_log.h"
+#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_error.h"
+#include "xfs_da_format.h"
#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
#include "xfs_inode.h"
-#include "xfs_btree.h"
+#include "xfs_bmap_btree.h"
#include "xfs_ialloc.h"
-#include "xfs_alloc.h"
-#include "xfs_extent_busy.h"
-#include "xfs_bmap.h"
-#include "xfs_bmap_util.h"
#include "xfs_quota.h"
+#include "xfs_trans.h"
#include "xfs_qm.h"
#include "xfs_trans_space.h"
#include "xfs_trace.h"
@@ -89,20 +82,69 @@ xfs_calc_buf_res(
* on disk. Hence we need an inode reservation function that calculates all this
* correctly. So, we log:
*
- * - log op headers for object
+ * - 4 log op headers for object
+ * - for the ilf, the inode core and 2 forks
* - inode log format object
- * - the entire inode contents (core + 2 forks)
- * - two bmap btree block headers
+ * - the inode core
+ * - two inode forks containing bmap btree root blocks.
+ * - the btree data contained by both forks will fit into the inode size,
+ * hence when combined with the inode core above, we have a total of the
+ * actual inode size.
+ * - the BMBT headers need to be accounted separately, as they are
+ * additional to the records and pointers that fit inside the inode
+ * forks.
*/
STATIC uint
xfs_calc_inode_res(
struct xfs_mount *mp,
uint ninodes)
{
- return ninodes * (sizeof(struct xlog_op_header) +
- sizeof(struct xfs_inode_log_format) +
- mp->m_sb.sb_inodesize +
- 2 * XFS_BMBT_BLOCK_LEN(mp));
+ return ninodes *
+ (4 * sizeof(struct xlog_op_header) +
+ sizeof(struct xfs_inode_log_format) +
+ mp->m_sb.sb_inodesize +
+ 2 * XFS_BMBT_BLOCK_LEN(mp));
+}
+
+/*
+ * The free inode btree is a conditional feature and the log reservation
+ * requirements differ slightly from that of the traditional inode allocation
+ * btree. The finobt tracks records for inode chunks with at least one free
+ * inode. A record can be removed from the tree for an inode allocation
+ * or free and thus the finobt reservation is unconditional across:
+ *
+ * - inode allocation
+ * - inode free
+ * - inode chunk allocation
+ *
+ * The 'modify' param indicates to include the record modification scenario. The
+ * 'alloc' param indicates to include the reservation for free space btree
+ * modifications on behalf of finobt modifications. This is required only for
+ * transactions that do not already account for free space btree modifications.
+ *
+ * the free inode btree: max depth * block size
+ * the allocation btrees: 2 trees * (max depth - 1) * block size
+ * the free inode btree entry: block size
+ */
+STATIC uint
+xfs_calc_finobt_res(
+ struct xfs_mount *mp,
+ int alloc,
+ int modify)
+{
+ uint res;
+
+ if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+ return 0;
+
+ res = xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1));
+ if (alloc)
+ res += xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
+ XFS_FSB_TO_B(mp, 1));
+ if (modify)
+ res += (uint)XFS_FSB_TO_B(mp, 1);
+
+ return res;
}
/*
@@ -182,7 +224,7 @@ xfs_calc_itruncate_reservation(
xfs_calc_buf_res(5, 0) +
xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
XFS_FSB_TO_B(mp, 1)) +
- xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
+ xfs_calc_buf_res(2 + mp->m_ialloc_blks +
mp->m_in_maxlevels, 0)));
}
@@ -212,6 +254,19 @@ xfs_calc_rename_reservation(
}
/*
+ * For removing an inode from unlinked list at first, we can modify:
+ * the agi hash list and counters: sector size
+ * the on disk inode before ours in the agi hash list: inode cluster size
+ */
+STATIC uint
+xfs_calc_iunlink_remove_reservation(
+ struct xfs_mount *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ max_t(uint, XFS_FSB_TO_B(mp, 1), mp->m_inode_cluster_size);
+}
+
+/*
* For creating a link to an inode:
* the parent directory inode: inode size
* the linked inode: inode size
@@ -228,6 +283,7 @@ xfs_calc_link_reservation(
struct xfs_mount *mp)
{
return XFS_DQUOT_LOGRES(mp) +
+ xfs_calc_iunlink_remove_reservation(mp) +
MAX((xfs_calc_inode_res(mp, 2) +
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
@@ -237,6 +293,18 @@ xfs_calc_link_reservation(
}
/*
+ * For adding an inode to unlinked list we can modify:
+ * the agi hash list: sector size
+ * the unlinked inode: inode size
+ */
+STATIC uint
+xfs_calc_iunlink_add_reservation(xfs_mount_t *mp)
+{
+ return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
+ xfs_calc_inode_res(mp, 1);
+}
+
+/*
* For removing a directory entry we can modify:
* the parent directory inode: inode size
* the removed inode: inode size
@@ -253,10 +321,11 @@ xfs_calc_remove_reservation(
struct xfs_mount *mp)
{
return XFS_DQUOT_LOGRES(mp) +
- MAX((xfs_calc_inode_res(mp, 2) +
+ xfs_calc_iunlink_add_reservation(mp) +
+ MAX((xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1))),
- (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+ (xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2),
XFS_FSB_TO_B(mp, 1))));
}
@@ -275,6 +344,7 @@ xfs_calc_remove_reservation(
* the superblock for the nlink flag: sector size
* the directory btree: (max depth + v2) * dir block size
* the directory inode's bmap btree: (max depth + v2) * block size
+ * the finobt (record modification and allocation btrees)
*/
STATIC uint
xfs_calc_create_resv_modify(
@@ -283,14 +353,15 @@ xfs_calc_create_resv_modify(
return xfs_calc_inode_res(mp, 2) +
xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
(uint)XFS_FSB_TO_B(mp, 1) +
- xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
+ xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_finobt_res(mp, 1, 1);
}
/*
* For create we can allocate some inodes giving:
* the agi and agf of the ag getting the new inodes: 2 * sectorsize
* the superblock for the nlink flag: sector size
- * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
+ * the inode blocks allocated: mp->m_ialloc_blks * blocksize
* the inode btree: max depth * blocksize
* the allocation btrees: 2 trees * (max depth - 1) * block size
*/
@@ -300,7 +371,7 @@ xfs_calc_create_resv_alloc(
{
return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
mp->m_sb.sb_sectsize +
- xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_buf_res(mp->m_ialloc_blks, XFS_FSB_TO_B(mp, 1)) +
xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
XFS_FSB_TO_B(mp, 1));
@@ -321,6 +392,7 @@ __xfs_calc_create_reservation(
* the superblock for the nlink flag: sector size
* the inode btree: max depth * blocksize
* the allocation btrees: 2 trees * (max depth - 1) * block size
+ * the finobt (record insertion)
*/
STATIC uint
xfs_calc_icreate_resv_alloc(
@@ -330,7 +402,8 @@ xfs_calc_icreate_resv_alloc(
mp->m_sb.sb_sectsize +
xfs_calc_buf_res(mp->m_in_maxlevels, XFS_FSB_TO_B(mp, 1)) +
xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
- XFS_FSB_TO_B(mp, 1));
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_finobt_res(mp, 0, 0);
}
STATIC uint
@@ -351,6 +424,20 @@ xfs_calc_create_reservation(
}
+STATIC uint
+xfs_calc_create_tmpfile_reservation(
+ struct xfs_mount *mp)
+{
+ uint res = XFS_DQUOT_LOGRES(mp);
+
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ res += xfs_calc_icreate_resv_alloc(mp);
+ else
+ res += xfs_calc_create_resv_alloc(mp);
+
+ return res + xfs_calc_iunlink_add_reservation(mp);
+}
+
/*
* Making a new directory is the same as creating a new file.
*/
@@ -384,6 +471,7 @@ xfs_calc_symlink_reservation(
* the on disk inode before ours in the agi hash list: inode cluster size
* the inode btree: max depth * blocksize
* the allocation btrees: 2 trees * (max depth - 1) * block size
+ * the finobt (record insertion, removal or modification)
*/
STATIC uint
xfs_calc_ifree_reservation(
@@ -391,15 +479,15 @@ xfs_calc_ifree_reservation(
{
return XFS_DQUOT_LOGRES(mp) +
xfs_calc_inode_res(mp, 1) +
- xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) +
xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
- MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
- XFS_INODE_CLUSTER_SIZE(mp)) +
+ xfs_calc_iunlink_remove_reservation(mp) +
xfs_calc_buf_res(1, 0) +
- xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
+ xfs_calc_buf_res(2 + mp->m_ialloc_blks +
mp->m_in_maxlevels, 0) +
xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
- XFS_FSB_TO_B(mp, 1));
+ XFS_FSB_TO_B(mp, 1)) +
+ xfs_calc_finobt_res(mp, 0, 1);
}
/*
@@ -522,7 +610,7 @@ xfs_calc_addafork_reservation(
return XFS_DQUOT_LOGRES(mp) +
xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(1, mp->m_dirblksize) +
+ xfs_calc_buf_res(1, mp->m_dir_geo->blksize) +
xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1,
XFS_FSB_TO_B(mp, 1)) +
xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1),
@@ -653,15 +741,14 @@ xfs_calc_qm_setqlim_reservation(
/*
* Allocating quota on disk if needed.
- * the write transaction log space: M_RES(mp)->tr_write.tr_logres
+ * the write transaction log space for quota file extent allocation
* the unit of quota allocation: one system block size
*/
STATIC uint
xfs_calc_qm_dqalloc_reservation(
struct xfs_mount *mp)
{
- ASSERT(M_RES(mp)->tr_write.tr_logres);
- return M_RES(mp)->tr_write.tr_logres +
+ return xfs_calc_write_reservation(mp) +
xfs_calc_buf_res(1,
XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1);
}
@@ -738,6 +825,11 @@ xfs_trans_resv_calc(
resp->tr_create.tr_logcount = XFS_CREATE_LOG_COUNT;
resp->tr_create.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+ resp->tr_create_tmpfile.tr_logres =
+ xfs_calc_create_tmpfile_reservation(mp);
+ resp->tr_create_tmpfile.tr_logcount = XFS_CREATE_TMPFILE_LOG_COUNT;
+ resp->tr_create_tmpfile.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
+
resp->tr_mkdir.tr_logres = xfs_calc_mkdir_reservation(mp);
resp->tr_mkdir.tr_logcount = XFS_MKDIR_LOG_COUNT;
resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
@@ -793,7 +885,6 @@ xfs_trans_resv_calc(
/* The following transaction are logged in logical format */
resp->tr_ichange.tr_logres = xfs_calc_ichange_reservation(mp);
resp->tr_growdata.tr_logres = xfs_calc_growdata_reservation(mp);
- resp->tr_swrite.tr_logres = xfs_calc_swrite_reservation(mp);
resp->tr_fsyncts.tr_logres = xfs_calc_swrite_reservation(mp);
resp->tr_writeid.tr_logres = xfs_calc_writeid_reservation(mp);
resp->tr_attrsetrt.tr_logres = xfs_calc_attrsetrt_reservation(mp);
diff --git a/fs/xfs/xfs_trans_resv.h b/fs/xfs/xfs_trans_resv.h
index de7de9aaad8..1097d14cd58 100644
--- a/fs/xfs/xfs_trans_resv.h
+++ b/fs/xfs/xfs_trans_resv.h
@@ -38,11 +38,11 @@ struct xfs_trans_resv {
struct xfs_trans_res tr_remove; /* unlink trans */
struct xfs_trans_res tr_symlink; /* symlink trans */
struct xfs_trans_res tr_create; /* create trans */
+ struct xfs_trans_res tr_create_tmpfile; /* create O_TMPFILE trans */
struct xfs_trans_res tr_mkdir; /* mkdir trans */
struct xfs_trans_res tr_ifree; /* inode free trans */
struct xfs_trans_res tr_ichange; /* inode update trans */
struct xfs_trans_res tr_growdata; /* fs data section grow trans */
- struct xfs_trans_res tr_swrite; /* sync write inode trans */
struct xfs_trans_res tr_addafork; /* add inode attr fork trans */
struct xfs_trans_res tr_writeid; /* write setuid/setgid file */
struct xfs_trans_res tr_attrinval; /* attr fork buffer
@@ -100,6 +100,7 @@ struct xfs_trans_resv {
#define XFS_ITRUNCATE_LOG_COUNT 2
#define XFS_INACTIVE_LOG_COUNT 2
#define XFS_CREATE_LOG_COUNT 2
+#define XFS_CREATE_TMPFILE_LOG_COUNT 2
#define XFS_MKDIR_LOG_COUNT 3
#define XFS_SYMLINK_LOG_COUNT 3
#define XFS_REMOVE_LOG_COUNT 2
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/xfs_trans_space.h
index 7d2c920dfb9..bf9c4579334 100644
--- a/fs/xfs/xfs_trans_space.h
+++ b/fs/xfs/xfs_trans_space.h
@@ -28,7 +28,8 @@
(((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
XFS_EXTENTADD_SPACE_RES(mp,w))
-#define XFS_DAENTER_1B(mp,w) ((w) == XFS_DATA_FORK ? (mp)->m_dirblkfsbs : 1)
+#define XFS_DAENTER_1B(mp,w) \
+ ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1)
#define XFS_DAENTER_DBS(mp,w) \
(XFS_DA_NODE_MAXDEPTH + (((w) == XFS_DATA_FORK) ? 2 : 0))
#define XFS_DAENTER_BLOCKS(mp,w) \
@@ -47,13 +48,15 @@
#define XFS_DIRREMOVE_SPACE_RES(mp) \
XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
#define XFS_IALLOC_SPACE_RES(mp) \
- (XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels - 1)
+ ((mp)->m_ialloc_blks + \
+ (xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1 * \
+ ((mp)->m_in_maxlevels - 1)))
/*
* Space reservation values for various transactions.
*/
#define XFS_ADDAFORK_SPACE_RES(mp) \
- ((mp)->m_dirblkfsbs + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK))
+ ((mp)->m_dir_geo->fsbcount + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK))
#define XFS_ATTRRM_SPACE_RES(mp) \
XFS_DAREMOVE_SPACE_RES(mp, XFS_ATTR_FORK)
/* This macro is not used - see inline code in xfs_attr_set */
@@ -82,5 +85,8 @@
(XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl))
#define XFS_SYMLINK_SPACE_RES(mp,nl,b) \
(XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b))
+#define XFS_IFREE_SPACE_RES(mp) \
+ (xfs_sb_version_hasfinobt(&mp->m_sb) ? (mp)->m_in_maxlevels : 0)
+
#endif /* __XFS_TRANS_SPACE_H__ */
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 82bbc34d54a..65c6e6650b1 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -134,7 +134,7 @@ typedef enum {
typedef enum {
XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_BMAPi, XFS_BTNUM_INOi,
- XFS_BTNUM_MAX
+ XFS_BTNUM_FINOi, XFS_BTNUM_MAX
} xfs_btnum_t;
struct xfs_name {
diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h
index db14d0c0868..e8a77383c0d 100644
--- a/fs/xfs/xfs_vnode.h
+++ b/fs/xfs/xfs_vnode.h
@@ -25,14 +25,6 @@ struct xfs_inode;
struct attrlist_cursor_kern;
/*
- * Return values for xfs_inactive. A return value of
- * VN_INACTIVE_NOCACHE implies that the file system behavior
- * has disassociated its state and bhv_desc_t from the vnode.
- */
-#define VN_INACTIVE_CACHE 0
-#define VN_INACTIVE_NOCACHE 1
-
-/*
* Flags for read/write calls - same values as IRIX
*/
#define IO_ISDIRECT 0x00004 /* bypass page cache */
@@ -43,15 +35,6 @@ struct attrlist_cursor_kern;
{ IO_INVIS, "INVIS"}
/*
- * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
- */
-#define FI_NONE 0 /* none */
-#define FI_REMAPF 1 /* Do a remapf prior to the operation */
-#define FI_REMAPF_LOCKED 2 /* Do a remapf prior to the operation.
- Prevent VM access to the pages until
- the operation completes. */
-
-/*
* Some useful predicates.
*/
#define VN_MAPPED(vp) mapping_mapped(vp->i_mapping)
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index e01f35ea76b..78ed92a46fd 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -17,9 +17,13 @@
*/
#include "xfs.h"
+#include "xfs_format.h"
#include "xfs_log_format.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_da_format.h"
#include "xfs_inode.h"
#include "xfs_attr.h"
#include "xfs_attr_leaf.h"
@@ -98,8 +102,8 @@ const struct xattr_handler *xfs_xattr_handlers[] = {
&xfs_xattr_trusted_handler,
&xfs_xattr_security_handler,
#ifdef CONFIG_XFS_POSIX_ACL
- &xfs_xattr_acl_access_handler,
- &xfs_xattr_acl_default_handler,
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
#endif
NULL
};