diff options
Diffstat (limited to 'fs')
193 files changed, 6596 insertions, 5430 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig index 814ac4e213a..0a93dc1cb4a 100644 --- a/fs/9p/Kconfig +++ b/fs/9p/Kconfig @@ -1,6 +1,6 @@ config 9P_FS - tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)" - depends on INET && NET_9P && EXPERIMENTAL + tristate "Plan 9 Resource Sharing Support (9P2000)" + depends on INET && NET_9P help If you say Y here, you will get experimental support for Plan 9 resource sharing via the 9P2000 protocol. @@ -10,7 +10,6 @@ config 9P_FS If unsure, say N. if 9P_FS - config 9P_FSCACHE bool "Enable 9P client caching support (EXPERIMENTAL)" depends on EXPERIMENTAL diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 82a7c38ddad..691c78f58be 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -259,7 +259,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, if (IS_ERR(inode_fid)) { err = PTR_ERR(inode_fid); mutex_unlock(&v9inode->v_mutex); - goto error; + goto err_clunk_old_fid; } v9inode->writeback_fid = (void *) inode_fid; } @@ -267,8 +267,8 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, /* Since we are opening a file, assign the open fid to the file */ filp = lookup_instantiate_filp(nd, dentry, generic_file_open); if (IS_ERR(filp)) { - p9_client_clunk(ofid); - return PTR_ERR(filp); + err = PTR_ERR(filp); + goto err_clunk_old_fid; } filp->private_data = ofid; #ifdef CONFIG_9P_FSCACHE @@ -278,10 +278,11 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode, return 0; error: - if (ofid) - p9_client_clunk(ofid); if (fid) p9_client_clunk(fid); +err_clunk_old_fid: + if (ofid) + p9_client_clunk(ofid); return err; } diff --git a/fs/Kconfig b/fs/Kconfig index f3aa9b08b22..979992dcb38 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -121,9 +121,25 @@ config TMPFS See <file:Documentation/filesystems/tmpfs.txt> for details. +config TMPFS_XATTR + bool "Tmpfs extended attributes" + depends on TMPFS + default n + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + <http://acl.bestbits.at/> for details). + + Currently this enables support for the trusted.* and + security.* namespaces. + + If unsure, say N. + + You need this for POSIX ACL support on tmpfs. + config TMPFS_POSIX_ACL bool "Tmpfs POSIX Access Control Lists" - depends on TMPFS + depends on TMPFS_XATTR select GENERIC_ACL help POSIX Access Control Lists (ACLs) support permissions for users and diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 397d3057d33..1bffbe0ed77 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -820,6 +820,8 @@ static int load_flat_shared_library(int id, struct lib_info *libs) int res; char buf[16]; + memset(&bprm, 0, sizeof(bprm)); + /* Create the file name */ sprintf(buf, "/lib/lib%d.so", id); @@ -835,6 +837,12 @@ static int load_flat_shared_library(int id, struct lib_info *libs) if (!bprm.cred) goto out; + /* We don't really care about recalculating credentials at this point + * as we're past the point of no return and are dealing with shared + * libraries. + */ + bprm.cred_prepared = 1; + res = prepare_binprm(&bprm); if (!IS_ERR_VALUE(res)) diff --git a/fs/block_dev.c b/fs/block_dev.c index 257b00e9842..1f2b1997833 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1120,6 +1120,15 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto restart; } } + + if (!ret && !bdev->bd_openers) { + bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + bdi = blk_get_backing_dev_info(bdev); + if (bdi == NULL) + bdi = &default_backing_dev_info; + bdev_inode_switch_bdi(bdev->bd_inode, bdi); + } + /* * If the device is invalidated, rescan partition * if open succeeded or failed with -ENOMEDIUM. @@ -1130,14 +1139,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) rescan_partitions(disk, bdev); if (ret) goto out_clear; - - if (!bdev->bd_openers) { - bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); - bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - bdi = &default_backing_dev_info; - bdev_inode_switch_bdi(bdev->bd_inode, bdi); - } } else { struct block_device *whole; whole = bdget_disk(disk, 0); @@ -1237,6 +1238,8 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) res = __blkdev_get(bdev, mode, 0); if (whole) { + struct gendisk *disk = whole->bd_disk; + /* finish claiming */ mutex_lock(&bdev->bd_mutex); spin_lock(&bdev_lock); @@ -1263,15 +1266,16 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) spin_unlock(&bdev_lock); /* - * Block event polling for write claims. Any write - * holder makes the write_holder state stick until all - * are released. This is good enough and tracking - * individual writeable reference is too fragile given - * the way @mode is used in blkdev_get/put(). + * Block event polling for write claims if requested. Any + * write holder makes the write_holder state stick until + * all are released. This is good enough and tracking + * individual writeable reference is too fragile given the + * way @mode is used in blkdev_get/put(). */ - if (!res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) { + if ((disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE) && + !res && (mode & FMODE_WRITE) && !bdev->bd_write_holder) { bdev->bd_write_holder = true; - disk_block_events(bdev->bd_disk); + disk_block_events(disk); } mutex_unlock(&bdev->bd_mutex); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ba41da59e31..96fcfa522da 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -10,6 +10,7 @@ #include <linux/swap.h> #include <linux/writeback.h> #include <linux/pagevec.h> +#include <linux/prefetch.h> #include "extent_io.h" #include "extent_map.h" #include "compat.h" diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 199a8013431..f340f7c99d0 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -709,7 +709,7 @@ again: WARN_ON(cur->checked); if (!list_empty(&cur->upper)) { /* - * the backref was added previously when processsing + * the backref was added previously when processing * backref of type BTRFS_TREE_BLOCK_REF_KEY */ BUG_ON(!list_is_singular(&cur->upper)); diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 7cb0f7f847e..75c47cd8d08 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -7,6 +7,7 @@ config CIFS select CRYPTO_MD5 select CRYPTO_HMAC select CRYPTO_ARC4 + select CRYPTO_DES help This is the client VFS module for the Common Internet File System (CIFS) protocol which is the successor to the Server Message Block @@ -152,16 +153,28 @@ config CIFS_ACL Allows to fetch CIFS/NTFS ACL from the server. The DACL blob is handed over to the application/caller. -config CIFS_EXPERIMENTAL - bool "CIFS Experimental Features (EXPERIMENTAL)" +config CIFS_SMB2 + bool "SMB2 network file system support (EXPERIMENTAL)" + depends on EXPERIMENTAL && INET && BROKEN + select NLS + select KEYS + select FSCACHE + select DNS_RESOLVER + + help + This enables experimental support for the SMB2 (Server Message Block + version 2) protocol. The SMB2 protocol is the successor to the + popular CIFS and SMB network file sharing protocols. SMB2 is the + native file sharing mechanism for recent versions of Windows + operating systems (since Vista). SMB2 enablement will eventually + allow users better performance, security and features, than would be + possible with cifs. Note that smb2 mount options also are simpler + (compared to cifs) due to protocol improvements. + + Unless you are a developer or tester, say N. + +config CIFS_NFSD_EXPORT + bool "Allow nfsd to export CIFS file system (EXPERIMENTAL)" depends on CIFS && EXPERIMENTAL help - Enables cifs features under testing. These features are - experimental and currently include DFS support and directory - change notification ie fcntl(F_DNOTIFY), as well as the upcall - mechanism which will be used for Kerberos session negotiation - and uid remapping. Some of these features also may depend on - setting a value of 1 to the pseudo-file /proc/fs/cifs/Experimental - (which is disabled by default). See the file fs/cifs/README - for more details. If unsure, say N. - + Allows NFS server to export a CIFS mounted share (nfsd over cifs) diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index d87558448e3..005d524c3a4 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -4,7 +4,7 @@ obj-$(CONFIG_CIFS) += cifs.o cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \ - link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \ + link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \ cifs_unicode.o nterr.o xattr.o cifsencrypt.o \ readdir.o ioctl.o sess.o export.o diff --git a/fs/cifs/README b/fs/cifs/README index 74ab165fc64..4a3ca0e5ca2 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -704,18 +704,6 @@ the start of smb requests and responses can be enabled via: echo 1 > /proc/fs/cifs/traceSMB -Two other experimental features are under development. To test these -requires enabling CONFIG_CIFS_EXPERIMENTAL - - cifsacl support needed to retrieve approximated mode bits based on - the contents on the CIFS ACL. - - lease support: cifs will check the oplock state before calling into - the vfs to see if we can grant a lease on a file. - - DNOTIFY fcntl: needed for support of directory change - notification and perhaps later for file leases) - Per share (per client mount) statistics are available in /proc/fs/cifs/Stats if the kernel was configured with cifs statistics enabled. The statistics represent the number of successful (ie non-zero return code from the server) diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 30d01bc9085..18f4272d904 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -63,7 +63,7 @@ void cifs_dump_detail(struct smb_hdr *smb) cERROR(1, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d", smb->Command, smb->Status.CifsError, smb->Flags, smb->Flags2, smb->Mid, smb->Pid); - cERROR(1, "smb buf %p len %d", smb, smbCalcSize_LE(smb)); + cERROR(1, "smb buf %p len %d", smb, smbCalcSize(smb)); } diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index ac51cd2d33a..a9d5692e0c2 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -58,9 +58,7 @@ struct cifs_sb_info { unsigned int mnt_cifs_flags; int prepathlen; char *prepath; /* relative path under the share to mount to */ -#ifdef CONFIG_CIFS_DFS_UPCALL - char *mountdata; /* mount options received at mount time */ -#endif + char *mountdata; /* options received at mount time or via DFS refs */ struct backing_dev_info bdi; struct delayed_work prune_tlinks; }; diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h index 644dd882a56..6d02fd56056 100644 --- a/fs/cifs/cifs_unicode.h +++ b/fs/cifs/cifs_unicode.h @@ -82,6 +82,9 @@ int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *); char *cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode, const struct nls_table *codepage); +extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen, + const struct nls_table *cp, int mapChars); + #endif /* diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index beeebf19423..f3c6fb9942a 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -23,24 +23,16 @@ #include <linux/fs.h> #include <linux/slab.h> +#include <linux/string.h> +#include <linux/keyctl.h> +#include <linux/key-type.h> +#include <keys/user-type.h> #include "cifspdu.h" #include "cifsglob.h" #include "cifsacl.h" #include "cifsproto.h" #include "cifs_debug.h" - -static struct cifs_wksid wksidarr[NUM_WK_SIDS] = { - {{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"}, - {{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"}, - {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"}, - {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(18), 0, 0, 0, 0} }, "sys"}, - {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(544), 0, 0, 0} }, "root"}, - {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(545), 0, 0, 0} }, "users"}, - {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(546), 0, 0, 0} }, "guest"} } -; - - /* security id for everyone/world system group */ static const struct cifs_sid sid_everyone = { 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; @@ -50,50 +42,385 @@ static const struct cifs_sid sid_authusers = { /* group users */ static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; +const struct cred *root_cred; -int match_sid(struct cifs_sid *ctsid) +static void +shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem, + int *nr_del) { - int i, j; - int num_subauth, num_sat, num_saw; - struct cifs_sid *cwsid; + struct rb_node *node; + struct rb_node *tmp; + struct cifs_sid_id *psidid; + + node = rb_first(root); + while (node) { + tmp = node; + node = rb_next(tmp); + psidid = rb_entry(tmp, struct cifs_sid_id, rbnode); + if (nr_to_scan == 0 || *nr_del == nr_to_scan) + ++(*nr_rem); + else { + if (time_after(jiffies, psidid->time + SID_MAP_EXPIRE) + && psidid->refcount == 0) { + rb_erase(tmp, root); + ++(*nr_del); + } else + ++(*nr_rem); + } + } +} + +/* + * Run idmap cache shrinker. + */ +static int +cifs_idmap_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) +{ + int nr_del = 0; + int nr_rem = 0; + struct rb_root *root; + + root = &uidtree; + spin_lock(&siduidlock); + shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); + spin_unlock(&siduidlock); + + root = &gidtree; + spin_lock(&sidgidlock); + shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); + spin_unlock(&sidgidlock); + + return nr_rem; +} + +static struct shrinker cifs_shrinker = { + .shrink = cifs_idmap_shrinker, + .seeks = DEFAULT_SEEKS, +}; + +static int +cifs_idmap_key_instantiate(struct key *key, const void *data, size_t datalen) +{ + char *payload; + + payload = kmalloc(datalen, GFP_KERNEL); + if (!payload) + return -ENOMEM; + + memcpy(payload, data, datalen); + key->payload.data = payload; + return 0; +} + +static inline void +cifs_idmap_key_destroy(struct key *key) +{ + kfree(key->payload.data); +} - if (!ctsid) - return -1; +struct key_type cifs_idmap_key_type = { + .name = "cifs.idmap", + .instantiate = cifs_idmap_key_instantiate, + .destroy = cifs_idmap_key_destroy, + .describe = user_describe, + .match = user_match, +}; + +static void +sid_to_str(struct cifs_sid *sidptr, char *sidstr) +{ + int i; + unsigned long saval; + char *strptr; - for (i = 0; i < NUM_WK_SIDS; ++i) { - cwsid = &(wksidarr[i].cifssid); + strptr = sidstr; - /* compare the revision */ - if (ctsid->revision != cwsid->revision) - continue; + sprintf(strptr, "%s", "S"); + strptr = sidstr + strlen(sidstr); - /* compare all of the six auth values */ - for (j = 0; j < 6; ++j) { - if (ctsid->authority[j] != cwsid->authority[j]) - break; + sprintf(strptr, "-%d", sidptr->revision); + strptr = sidstr + strlen(sidstr); + + for (i = 0; i < 6; ++i) { + if (sidptr->authority[i]) { + sprintf(strptr, "-%d", sidptr->authority[i]); + strptr = sidstr + strlen(sidstr); } - if (j < 6) - continue; /* all of the auth values did not match */ - - /* compare all of the subauth values if any */ - num_sat = ctsid->num_subauth; - num_saw = cwsid->num_subauth; - num_subauth = num_sat < num_saw ? num_sat : num_saw; - if (num_subauth) { - for (j = 0; j < num_subauth; ++j) { - if (ctsid->sub_auth[j] != cwsid->sub_auth[j]) - break; - } - if (j < num_subauth) - continue; /* all sub_auth values do not match */ + } + + for (i = 0; i < sidptr->num_subauth; ++i) { + saval = le32_to_cpu(sidptr->sub_auth[i]); + sprintf(strptr, "-%ld", saval); + strptr = sidstr + strlen(sidstr); + } +} + +static void +id_rb_insert(struct rb_root *root, struct cifs_sid *sidptr, + struct cifs_sid_id **psidid, char *typestr) +{ + int rc; + char *strptr; + struct rb_node *node = root->rb_node; + struct rb_node *parent = NULL; + struct rb_node **linkto = &(root->rb_node); + struct cifs_sid_id *lsidid; + + while (node) { + lsidid = rb_entry(node, struct cifs_sid_id, rbnode); + parent = node; + rc = compare_sids(sidptr, &((lsidid)->sid)); + if (rc > 0) { + linkto = &(node->rb_left); + node = node->rb_left; + } else if (rc < 0) { + linkto = &(node->rb_right); + node = node->rb_right; + } + } + + memcpy(&(*psidid)->sid, sidptr, sizeof(struct cifs_sid)); + (*psidid)->time = jiffies - (SID_MAP_RETRY + 1); + (*psidid)->refcount = 0; + + sprintf((*psidid)->sidstr, "%s", typestr); + strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr); + sid_to_str(&(*psidid)->sid, strptr); + + clear_bit(SID_ID_PENDING, &(*psidid)->state); + clear_bit(SID_ID_MAPPED, &(*psidid)->state); + + rb_link_node(&(*psidid)->rbnode, parent, linkto); + rb_insert_color(&(*psidid)->rbnode, root); +} + +static struct cifs_sid_id * +id_rb_search(struct rb_root *root, struct cifs_sid *sidptr) +{ + int rc; + struct rb_node *node = root->rb_node; + struct cifs_sid_id *lsidid; + + while (node) { + lsidid = rb_entry(node, struct cifs_sid_id, rbnode); + rc = compare_sids(sidptr, &((lsidid)->sid)); + if (rc > 0) { + node = node->rb_left; + } else if (rc < 0) { + node = node->rb_right; + } else /* node found */ + return lsidid; + } + + return NULL; +} + +static int +sidid_pending_wait(void *unused) +{ + schedule(); + return signal_pending(current) ? -ERESTARTSYS : 0; +} + +static int +sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, + struct cifs_fattr *fattr, uint sidtype) +{ + int rc; + unsigned long cid; + struct key *idkey; + const struct cred *saved_cred; + struct cifs_sid_id *psidid, *npsidid; + struct rb_root *cidtree; + spinlock_t *cidlock; + + if (sidtype == SIDOWNER) { + cid = cifs_sb->mnt_uid; /* default uid, in case upcall fails */ + cidlock = &siduidlock; + cidtree = &uidtree; + } else if (sidtype == SIDGROUP) { + cid = cifs_sb->mnt_gid; /* default gid, in case upcall fails */ + cidlock = &sidgidlock; + cidtree = &gidtree; + } else + return -ENOENT; + + spin_lock(cidlock); + psidid = id_rb_search(cidtree, psid); + + if (!psidid) { /* node does not exist, allocate one & attempt adding */ + spin_unlock(cidlock); + npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL); + if (!npsidid) + return -ENOMEM; + + npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL); + if (!npsidid->sidstr) { + kfree(npsidid); + return -ENOMEM; + } + + spin_lock(cidlock); + psidid = id_rb_search(cidtree, psid); + if (psidid) { /* node happened to get inserted meanwhile */ + ++psidid->refcount; + spin_unlock(cidlock); + kfree(npsidid->sidstr); + kfree(npsidid); + } else { + psidid = npsidid; + id_rb_insert(cidtree, psid, &psidid, + sidtype == SIDOWNER ? "os:" : "gs:"); + ++psidid->refcount; + spin_unlock(cidlock); } + } else { + ++psidid->refcount; + spin_unlock(cidlock); + } + + /* + * If we are here, it is safe to access psidid and its fields + * since a reference was taken earlier while holding the spinlock. + * A reference on the node is put without holding the spinlock + * and it is OK to do so in this case, shrinker will not erase + * this node until all references are put and we do not access + * any fields of the node after a reference is put . + */ + if (test_bit(SID_ID_MAPPED, &psidid->state)) { + cid = psidid->id; + psidid->time = jiffies; /* update ts for accessing */ + goto sid_to_id_out; + } - cFYI(1, "matching sid: %s\n", wksidarr[i].sidname); - return 0; /* sids compare/match */ + if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) + goto sid_to_id_out; + + if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { + saved_cred = override_creds(root_cred); + idkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); + if (IS_ERR(idkey)) + cFYI(1, "%s: Can't map SID to an id", __func__); + else { + cid = *(unsigned long *)idkey->payload.value; + psidid->id = cid; + set_bit(SID_ID_MAPPED, &psidid->state); + key_put(idkey); + kfree(psidid->sidstr); + } + revert_creds(saved_cred); + psidid->time = jiffies; /* update ts for accessing */ + clear_bit(SID_ID_PENDING, &psidid->state); + wake_up_bit(&psidid->state, SID_ID_PENDING); + } else { + rc = wait_on_bit(&psidid->state, SID_ID_PENDING, + sidid_pending_wait, TASK_INTERRUPTIBLE); + if (rc) { + cFYI(1, "%s: sidid_pending_wait interrupted %d", + __func__, rc); + --psidid->refcount; /* decremented without spinlock */ + return rc; + } + if (test_bit(SID_ID_MAPPED, &psidid->state)) + cid = psidid->id; } - cFYI(1, "No matching sid"); - return -1; +sid_to_id_out: + --psidid->refcount; /* decremented without spinlock */ + if (sidtype == SIDOWNER) + fattr->cf_uid = cid; + else + fattr->cf_gid = cid; + + return 0; +} + +int +init_cifs_idmap(void) +{ + struct cred *cred; + struct key *keyring; + int ret; + + cFYI(1, "Registering the %s key type\n", cifs_idmap_key_type.name); + + /* create an override credential set with a special thread keyring in + * which requests are cached + * + * this is used to prevent malicious redirections from being installed + * with add_key(). + */ + cred = prepare_kernel_cred(NULL); + if (!cred) + return -ENOMEM; + + keyring = key_alloc(&key_type_keyring, ".cifs_idmap", 0, 0, cred, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | + KEY_USR_VIEW | KEY_USR_READ, + KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(keyring)) { + ret = PTR_ERR(keyring); + goto failed_put_cred; + } + + ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); + if (ret < 0) + goto failed_put_key; + + ret = register_key_type(&cifs_idmap_key_type); + if (ret < 0) + goto failed_put_key; + + /* instruct request_key() to use this special keyring as a cache for + * the results it looks up */ + cred->thread_keyring = keyring; + cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; + root_cred = cred; + + spin_lock_init(&siduidlock); + uidtree = RB_ROOT; + spin_lock_init(&sidgidlock); + gidtree = RB_ROOT; + + register_shrinker(&cifs_shrinker); + + cFYI(1, "cifs idmap keyring: %d\n", key_serial(keyring)); + return 0; + +failed_put_key: + key_put(keyring); +failed_put_cred: + put_cred(cred); + return ret; +} + +void +exit_cifs_idmap(void) +{ + key_revoke(root_cred->thread_keyring); + unregister_key_type(&cifs_idmap_key_type); + put_cred(root_cred); + unregister_shrinker(&cifs_shrinker); + cFYI(1, "Unregistered %s key type\n", cifs_idmap_key_type.name); +} + +void +cifs_destroy_idmaptrees(void) +{ + struct rb_root *root; + struct rb_node *node; + + root = &uidtree; + spin_lock(&siduidlock); + while ((node = rb_first(root))) + rb_erase(node, root); + spin_unlock(&siduidlock); + + root = &gidtree; + spin_lock(&sidgidlock); + while ((node = rb_first(root))) + rb_erase(node, root); + spin_unlock(&sidgidlock); } /* if the two SIDs (roughly equivalent to a UUID for a user or group) are @@ -104,16 +431,24 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) int num_subauth, num_sat, num_saw; if ((!ctsid) || (!cwsid)) - return 0; + return 1; /* compare the revision */ - if (ctsid->revision != cwsid->revision) - return 0; + if (ctsid->revision != cwsid->revision) { + if (ctsid->revision > cwsid->revision) + return 1; + else + return -1; + } /* compare all of the six auth values */ for (i = 0; i < 6; ++i) { - if (ctsid->authority[i] != cwsid->authority[i]) - return 0; + if (ctsid->authority[i] != cwsid->authority[i]) { + if (ctsid->authority[i] > cwsid->authority[i]) + return 1; + else + return -1; + } } /* compare all of the subauth values if any */ @@ -122,12 +457,16 @@ int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) num_subauth = num_sat < num_saw ? num_sat : num_saw; if (num_subauth) { for (i = 0; i < num_subauth; ++i) { - if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) - return 0; + if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) { + if (ctsid->sub_auth[i] > cwsid->sub_auth[i]) + return 1; + else + return -1; + } } } - return 1; /* sids compare/match */ + return 0; /* sids compare/match */ } @@ -382,22 +721,22 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, #ifdef CONFIG_CIFS_DEBUG2 dump_ace(ppace[i], end_of_acl); #endif - if (compare_sids(&(ppace[i]->sid), pownersid)) + if (compare_sids(&(ppace[i]->sid), pownersid) == 0) access_flags_to_mode(ppace[i]->access_req, ppace[i]->type, &fattr->cf_mode, &user_mask); - if (compare_sids(&(ppace[i]->sid), pgrpsid)) + if (compare_sids(&(ppace[i]->sid), pgrpsid) == 0) access_flags_to_mode(ppace[i]->access_req, ppace[i]->type, &fattr->cf_mode, &group_mask); - if (compare_sids(&(ppace[i]->sid), &sid_everyone)) + if (compare_sids(&(ppace[i]->sid), &sid_everyone) == 0) access_flags_to_mode(ppace[i]->access_req, ppace[i]->type, &fattr->cf_mode, &other_mask); - if (compare_sids(&(ppace[i]->sid), &sid_authusers)) + if (compare_sids(&(ppace[i]->sid), &sid_authusers) == 0) access_flags_to_mode(ppace[i]->access_req, ppace[i]->type, &fattr->cf_mode, @@ -475,10 +814,10 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl) /* Convert CIFS ACL to POSIX form */ -static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len, - struct cifs_fattr *fattr) +static int parse_sec_desc(struct cifs_sb_info *cifs_sb, + struct cifs_ntsd *pntsd, int acl_len, struct cifs_fattr *fattr) { - int rc; + int rc = 0; struct cifs_sid *owner_sid_ptr, *group_sid_ptr; struct cifs_acl *dacl_ptr; /* no need for SACL ptr */ char *end_of_acl = ((char *)pntsd) + acl_len; @@ -500,12 +839,26 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len, le32_to_cpu(pntsd->sacloffset), dacloffset); /* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */ rc = parse_sid(owner_sid_ptr, end_of_acl); - if (rc) + if (rc) { + cFYI(1, "%s: Error %d parsing Owner SID", __func__, rc); + return rc; + } + rc = sid_to_id(cifs_sb, owner_sid_ptr, fattr, SIDOWNER); + if (rc) { + cFYI(1, "%s: Error %d mapping Owner SID to uid", __func__, rc); return rc; + } rc = parse_sid(group_sid_ptr, end_of_acl); - if (rc) + if (rc) { + cFYI(1, "%s: Error %d mapping Owner SID to gid", __func__, rc); return rc; + } + rc = sid_to_id(cifs_sb, group_sid_ptr, fattr, SIDGROUP); + if (rc) { + cFYI(1, "%s: Error %d mapping Group SID to gid", __func__, rc); + return rc; + } if (dacloffset) parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, @@ -520,7 +873,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len, memcpy((void *)(&(cifscred->gsid)), (void *)group_sid_ptr, sizeof(struct cifs_sid)); */ - return 0; + return rc; } @@ -688,7 +1041,7 @@ out: } /* Set an ACL on the server */ -static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, +int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, struct inode *inode, const char *path) { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -727,7 +1080,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, rc = PTR_ERR(pntsd); cERROR(1, "%s: error %d getting sec desc", __func__, rc); } else { - rc = parse_sec_desc(pntsd, acllen, fattr); + rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr); kfree(pntsd); if (rc) cERROR(1, "parse sec desc failed rc = %d", rc); diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index c4ae7d03656..5c902c7ce52 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -39,6 +39,15 @@ #define ACCESS_ALLOWED 0 #define ACCESS_DENIED 1 +#define SIDOWNER 1 +#define SIDGROUP 2 +#define SIDLEN 150 /* S- 1 revision- 6 authorities- max 5 sub authorities */ + +#define SID_ID_MAPPED 0 +#define SID_ID_PENDING 1 +#define SID_MAP_EXPIRE (3600 * HZ) /* map entry expires after one hour */ +#define SID_MAP_RETRY (300 * HZ) /* wait 5 minutes for next attempt to map */ + struct cifs_ntsd { __le16 revision; /* revision level */ __le16 type; @@ -74,7 +83,21 @@ struct cifs_wksid { char sidname[SIDNAMELENGTH]; } __attribute__((packed)); -extern int match_sid(struct cifs_sid *); +struct cifs_sid_id { + unsigned int refcount; /* increment with spinlock, decrement without */ + unsigned long id; + unsigned long time; + unsigned long state; + char *sidstr; + struct rb_node rbnode; + struct cifs_sid sid; +}; + +#ifdef __KERNEL__ +extern struct key_type cifs_idmap_key_type; +extern const struct cred *root_cred; +#endif /* KERNEL */ + extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *); #endif /* _CIFSACL_H */ diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index d1a016be73b..45c3f78c8f8 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -60,7 +60,7 @@ static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, server->session_key.response, server->session_key.len); crypto_shash_update(&server->secmech.sdescmd5->shash, - cifs_pdu->Protocol, cifs_pdu->smb_buf_length); + cifs_pdu->Protocol, be32_to_cpu(cifs_pdu->smb_buf_length)); rc = crypto_shash_final(&server->secmech.sdescmd5->shash, signature); @@ -268,10 +268,11 @@ int setup_ntlm_response(struct cifsSesInfo *ses) } #ifdef CONFIG_CIFS_WEAK_PW_HASH -void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, +int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, char *lnm_session_key) { int i; + int rc; char password_with_pad[CIFS_ENCPWD_SIZE]; memset(password_with_pad, 0, CIFS_ENCPWD_SIZE); @@ -282,7 +283,7 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE); memcpy(lnm_session_key, password_with_pad, CIFS_ENCPWD_SIZE); - return; + return 0; } /* calculate old style session key */ @@ -299,10 +300,9 @@ void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, for (i = 0; i < CIFS_ENCPWD_SIZE; i++) password_with_pad[i] = toupper(password_with_pad[i]); - SMBencrypt(password_with_pad, cryptkey, lnm_session_key); + rc = SMBencrypt(password_with_pad, cryptkey, lnm_session_key); - /* clear password before we return/free memory */ - memset(password_with_pad, 0, CIFS_ENCPWD_SIZE); + return rc; } #endif /* CIFS_WEAK_PW_HASH */ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 5c412b33cd7..493b74ca564 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -128,29 +128,22 @@ cifs_read_super(struct super_block *sb, void *data, } cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages; -#ifdef CONFIG_CIFS_DFS_UPCALL - /* copy mount params to sb for use in submounts */ - /* BB: should we move this after the mount so we - * do not have to do the copy on failed mounts? - * BB: May be it is better to do simple copy before - * complex operation (mount), and in case of fail - * just exit instead of doing mount and attempting - * undo it if this copy fails?*/ + /* + * Copy mount params to sb for use in submounts. Better to do + * the copy here and deal with the error before cleanup gets + * complicated post-mount. + */ if (data) { - int len = strlen(data); - cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL); + cifs_sb->mountdata = kstrndup(data, PAGE_SIZE, GFP_KERNEL); if (cifs_sb->mountdata == NULL) { bdi_destroy(&cifs_sb->bdi); kfree(sb->s_fs_info); sb->s_fs_info = NULL; return -ENOMEM; } - strncpy(cifs_sb->mountdata, data, len + 1); - cifs_sb->mountdata[len] = '\0'; } -#endif - rc = cifs_mount(sb, cifs_sb, data, devname); + rc = cifs_mount(sb, cifs_sb, devname); if (rc) { if (!silent) @@ -163,7 +156,7 @@ cifs_read_super(struct super_block *sb, void *data, sb->s_bdi = &cifs_sb->bdi; sb->s_blocksize = CIFS_MAX_MSGSIZE; sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ - inode = cifs_root_iget(sb, ROOT_I); + inode = cifs_root_iget(sb); if (IS_ERR(inode)) { rc = PTR_ERR(inode); @@ -184,12 +177,12 @@ cifs_read_super(struct super_block *sb, void *data, else sb->s_d_op = &cifs_dentry_ops; -#ifdef CONFIG_CIFS_EXPERIMENTAL +#ifdef CIFS_NFSD_EXPORT if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { cFYI(1, "export ops supported"); sb->s_export_op = &cifs_export_ops; } -#endif /* EXPERIMENTAL */ +#endif /* CIFS_NFSD_EXPORT */ return 0; @@ -202,12 +195,10 @@ out_no_root: out_mount_failed: if (cifs_sb) { -#ifdef CONFIG_CIFS_DFS_UPCALL if (cifs_sb->mountdata) { kfree(cifs_sb->mountdata); cifs_sb->mountdata = NULL; } -#endif unload_nls(cifs_sb->local_nls); bdi_destroy(&cifs_sb->bdi); kfree(cifs_sb); @@ -231,12 +222,10 @@ cifs_put_super(struct super_block *sb) rc = cifs_umount(sb, cifs_sb); if (rc) cERROR(1, "cifs_umount failed with return code %d", rc); -#ifdef CONFIG_CIFS_DFS_UPCALL if (cifs_sb->mountdata) { kfree(cifs_sb->mountdata); cifs_sb->mountdata = NULL; } -#endif unload_nls(cifs_sb->local_nls); bdi_destroy(&cifs_sb->bdi); @@ -618,16 +607,31 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin) { /* origin == SEEK_END => we must revalidate the cached file length */ if (origin == SEEK_END) { - int retval; - - /* some applications poll for the file length in this strange - way so we must seek to end on non-oplocked files by - setting the revalidate time to zero */ - CIFS_I(file->f_path.dentry->d_inode)->time = 0; - - retval = cifs_revalidate_file(file); - if (retval < 0) - return (loff_t)retval; + int rc; + struct inode *inode = file->f_path.dentry->d_inode; + + /* + * We need to be sure that all dirty pages are written and the + * server has the newest file length. + */ + if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping && + inode->i_mapping->nrpages != 0) { + rc = filemap_fdatawait(inode->i_mapping); + if (rc) { + mapping_set_error(inode->i_mapping, rc); + return rc; + } + } + /* + * Some applications poll for the file length in this strange + * way so we must seek to end on non-oplocked files by + * setting the revalidate time to zero. + */ + CIFS_I(inode)->time = 0; + + rc = cifs_revalidate_file_attr(file); + if (rc < 0) + return (loff_t)rc; } return generic_file_llseek_unlocked(file, offset, origin); } @@ -760,10 +764,11 @@ const struct file_operations cifs_file_strict_ops = { }; const struct file_operations cifs_file_direct_ops = { - /* no aio, no readv - - BB reevaluate whether they can be done with directio, no cache */ - .read = cifs_user_read, - .write = cifs_user_write, + /* BB reevaluate whether they can be done with directio, no cache */ + .read = do_sync_read, + .write = do_sync_write, + .aio_read = cifs_user_readv, + .aio_write = cifs_user_writev, .open = cifs_open, .release = cifs_close, .lock = cifs_lock, @@ -815,10 +820,11 @@ const struct file_operations cifs_file_strict_nobrl_ops = { }; const struct file_operations cifs_file_direct_nobrl_ops = { - /* no mmap, no aio, no readv - - BB reevaluate whether they can be done with directio, no cache */ - .read = cifs_user_read, - .write = cifs_user_write, + /* BB reevaluate whether they can be done with directio, no cache */ + .read = do_sync_read, + .write = do_sync_write, + .aio_read = cifs_user_readv, + .aio_write = cifs_user_writev, .open = cifs_open, .release = cifs_close, .fsync = cifs_fsync, @@ -981,10 +987,10 @@ init_cifs(void) int rc = 0; cifs_proc_init(); INIT_LIST_HEAD(&cifs_tcp_ses_list); -#ifdef CONFIG_CIFS_EXPERIMENTAL +#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ INIT_LIST_HEAD(&GlobalDnotifyReqList); INIT_LIST_HEAD(&GlobalDnotifyRsp_Q); -#endif +#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ /* * Initialize Global counters */ @@ -1033,22 +1039,33 @@ init_cifs(void) if (rc) goto out_destroy_mids; - rc = register_filesystem(&cifs_fs_type); - if (rc) - goto out_destroy_request_bufs; #ifdef CONFIG_CIFS_UPCALL rc = register_key_type(&cifs_spnego_key_type); if (rc) - goto out_unregister_filesystem; -#endif + goto out_destroy_request_bufs; +#endif /* CONFIG_CIFS_UPCALL */ + +#ifdef CONFIG_CIFS_ACL + rc = init_cifs_idmap(); + if (rc) + goto out_register_key_type; +#endif /* CONFIG_CIFS_ACL */ + + rc = register_filesystem(&cifs_fs_type); + if (rc) + goto out_init_cifs_idmap; return 0; -#ifdef CONFIG_CIFS_UPCALL -out_unregister_filesystem: - unregister_filesystem(&cifs_fs_type); +out_init_cifs_idmap: +#ifdef CONFIG_CIFS_ACL + exit_cifs_idmap(); +out_register_key_type: #endif +#ifdef CONFIG_CIFS_UPCALL + unregister_key_type(&cifs_spnego_key_type); out_destroy_request_bufs: +#endif cifs_destroy_request_bufs(); out_destroy_mids: cifs_destroy_mids(); @@ -1070,6 +1087,10 @@ exit_cifs(void) #ifdef CONFIG_CIFS_DFS_UPCALL cifs_dfs_release_automount_timer(); #endif +#ifdef CONFIG_CIFS_ACL + cifs_destroy_idmaptrees(); + exit_cifs_idmap(); +#endif #ifdef CONFIG_CIFS_UPCALL unregister_key_type(&cifs_spnego_key_type); #endif diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index a9371b6578c..64313f778eb 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -47,7 +47,7 @@ extern void cifs_sb_deactive(struct super_block *sb); /* Functions related to inodes */ extern const struct inode_operations cifs_dir_inode_ops; -extern struct inode *cifs_root_iget(struct super_block *, unsigned long); +extern struct inode *cifs_root_iget(struct super_block *); extern int cifs_create(struct inode *, struct dentry *, int, struct nameidata *); extern struct dentry *cifs_lookup(struct inode *, struct dentry *, @@ -59,9 +59,11 @@ extern int cifs_mkdir(struct inode *, struct dentry *, int); extern int cifs_rmdir(struct inode *, struct dentry *); extern int cifs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); +extern int cifs_revalidate_file_attr(struct file *filp); +extern int cifs_revalidate_dentry_attr(struct dentry *); extern int cifs_revalidate_file(struct file *filp); extern int cifs_revalidate_dentry(struct dentry *); -extern void cifs_invalidate_mapping(struct inode *inode); +extern int cifs_invalidate_mapping(struct inode *inode); extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int cifs_setattr(struct dentry *, struct iattr *); @@ -80,12 +82,12 @@ extern const struct file_operations cifs_file_strict_nobrl_ops; extern int cifs_open(struct inode *inode, struct file *file); extern int cifs_close(struct inode *inode, struct file *file); extern int cifs_closedir(struct inode *inode, struct file *file); -extern ssize_t cifs_user_read(struct file *file, char __user *read_data, - size_t read_size, loff_t *poffset); +extern ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); extern ssize_t cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos); -extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, - size_t write_size, loff_t *poffset); +extern ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); extern ssize_t cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos); extern int cifs_lock(struct file *, int, struct file_lock *); @@ -123,9 +125,9 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); -#ifdef CONFIG_CIFS_EXPERIMENTAL +#ifdef CIFS_NFSD_EXPORT extern const struct export_operations cifs_export_ops; -#endif /* EXPERIMENTAL */ +#endif /* CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "1.71" +#define CIFS_VERSION "1.72" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index a5d1106fcbd..76b4517e74b 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -274,7 +274,8 @@ struct cifsSesInfo { int capabilities; char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for TCP names - will ipv6 and sctp addresses fit? */ - char *user_name; + char *user_name; /* must not be null except during init of sess + and after mount option parsing we fill it */ char *domainName; char *password; struct session_key auth_key; @@ -780,10 +781,12 @@ GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock; */ GLOBAL_EXTERN spinlock_t cifs_file_list_lock; +#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ /* Outstanding dir notify requests */ GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; /* DirNotify response queue */ GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q; +#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ /* * Global transaction id (XID) information @@ -830,6 +833,11 @@ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ /* reconnect after this many failed echo attempts */ GLOBAL_EXTERN unsigned short echo_retries; +GLOBAL_EXTERN struct rb_root uidtree; +GLOBAL_EXTERN struct rb_root gidtree; +GLOBAL_EXTERN spinlock_t siduidlock; +GLOBAL_EXTERN spinlock_t sidgidlock; + void cifs_oplock_break(struct work_struct *work); void cifs_oplock_break_get(struct cifsFileInfo *cfile); void cifs_oplock_break_put(struct cifsFileInfo *cfile); diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index b5c8cc5d7a7..de3aa285de0 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -397,9 +397,9 @@ #define GETU32(var) (*((__u32 *)var)) /* BB check for endian issues */ struct smb_hdr { - __u32 smb_buf_length; /* big endian on wire *//* BB length is only two - or three bytes - with one or two byte type preceding it that are - zero - we could mask the type byte off just in case BB */ + __be32 smb_buf_length; /* BB length is only two (rarely three) bytes, + with one or two byte "type" preceding it that will be + zero - we could mask the type byte off */ __u8 Protocol[4]; __u8 Command; union { @@ -428,43 +428,28 @@ struct smb_hdr { __u8 WordCount; } __attribute__((packed)); -/* given a pointer to an smb_hdr retrieve a char pointer to the byte count */ -#define BCC(smb_var) ((unsigned char *)(smb_var) + sizeof(struct smb_hdr) + \ - (2 * (smb_var)->WordCount)) +/* given a pointer to an smb_hdr, retrieve a void pointer to the ByteCount */ +static inline void * +BCC(struct smb_hdr *smb) +{ + return (void *)smb + sizeof(*smb) + 2 * smb->WordCount; +} /* given a pointer to an smb_hdr retrieve the pointer to the byte area */ #define pByteArea(smb_var) (BCC(smb_var) + 2) -/* get the converted ByteCount for a SMB packet and return it */ -static inline __u16 -get_bcc(struct smb_hdr *hdr) -{ - __u16 *bc_ptr = (__u16 *)BCC(hdr); - - return get_unaligned(bc_ptr); -} - /* get the unconverted ByteCount for a SMB packet and return it */ static inline __u16 -get_bcc_le(struct smb_hdr *hdr) +get_bcc(struct smb_hdr *hdr) { __le16 *bc_ptr = (__le16 *)BCC(hdr); return get_unaligned_le16(bc_ptr); } -/* set the ByteCount for a SMB packet in host-byte order */ -static inline void -put_bcc(__u16 count, struct smb_hdr *hdr) -{ - __u16 *bc_ptr = (__u16 *)BCC(hdr); - - put_unaligned(count, bc_ptr); -} - /* set the ByteCount for a SMB packet in little-endian */ static inline void -put_bcc_le(__u16 count, struct smb_hdr *hdr) +put_bcc(__u16 count, struct smb_hdr *hdr) { __le16 *bc_ptr = (__le16 *)BCC(hdr); diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 8096f27ad9a..6e69e06a30b 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -53,6 +53,9 @@ do { \ cFYI(1, "CIFS VFS: leaving %s (xid = %d) rc = %d", \ __func__, curr_xid, (int)rc); \ } while (0) +extern int init_cifs_idmap(void); +extern void exit_cifs_idmap(void); +extern void cifs_destroy_idmaptrees(void); extern char *build_path_from_dentry(struct dentry *); extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb, struct cifsTconInfo *tcon); @@ -90,7 +93,6 @@ extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, bool); extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); extern unsigned int smbCalcSize(struct smb_hdr *ptr); -extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); extern int decode_negTokenInit(unsigned char *security_blob, int length, struct TCP_Server_Info *server); extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); @@ -143,8 +145,10 @@ extern int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, extern int mode_to_cifs_acl(struct inode *inode, const char *path, __u64); extern struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *, struct inode *, const char *, u32 *); +extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, + const char *); -extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *, +extern int cifs_mount(struct super_block *, struct cifs_sb_info *, const char *); extern int cifs_umount(struct super_block *, struct cifs_sb_info *); extern void cifs_dfs_release_automount_timer(void); @@ -304,12 +308,13 @@ extern int CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon, const unsigned char *searchName, char **syminfo, const struct nls_table *nls_codepage); +#ifdef CONFIG_CIFS_SYMLINK_EXPERIMENTAL extern int CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon, const unsigned char *searchName, char *symlinkinfo, const int buflen, __u16 fid, const struct nls_table *nls_codepage); - +#endif /* temporarily unused until cifs_symlink fixed */ extern int CIFSSMBOpen(const int xid, struct cifsTconInfo *tcon, const char *fileName, const int disposition, const int access_flags, const int omode, @@ -348,8 +353,6 @@ extern int CIFSGetSrvInodeNumber(const int xid, struct cifsTconInfo *tcon, const unsigned char *searchName, __u64 *inode_number, const struct nls_table *nls_codepage, int remap_special_chars); -extern int cifsConvertToUCS(__le16 *target, const char *source, int maxlen, - const struct nls_table *cp, int mapChars); extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, const __u16 netfid, const __u64 len, @@ -383,9 +386,15 @@ extern void cifs_crypto_shash_release(struct TCP_Server_Info *); extern int calc_seckey(struct cifsSesInfo *); #ifdef CONFIG_CIFS_WEAK_PW_HASH -extern void calc_lanman_hash(const char *password, const char *cryptkey, +extern int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, char *lnm_session_key); #endif /* CIFS_WEAK_PW_HASH */ +#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ +extern int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon, + const int notify_subdirs, const __u16 netfid, + __u32 filter, struct file *file, int multishot, + const struct nls_table *nls_codepage); +#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ extern int CIFSSMBCopy(int xid, struct cifsTconInfo *source_tcon, const char *fromName, @@ -393,10 +402,6 @@ extern int CIFSSMBCopy(int xid, const char *toName, const int flags, const struct nls_table *nls_codepage, int remap_special_chars); -extern int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon, - const int notify_subdirs, const __u16 netfid, - __u32 filter, struct file *file, int multishot, - const struct nls_table *nls_codepage); extern ssize_t CIFSSMBQAllEAs(const int xid, struct cifsTconInfo *tcon, const unsigned char *searchName, const unsigned char *ea_name, char *EAData, @@ -427,9 +432,6 @@ extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb, int xid); extern int mdfour(unsigned char *, unsigned char *, int); extern int E_md4hash(const unsigned char *passwd, unsigned char *p16); -extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8, - unsigned char *p24); -extern void E_P16(unsigned char *p14, unsigned char *p16); -extern void E_P24(unsigned char *p21, const unsigned char *c8, +extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24); #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index df959bae672..83df937b814 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -339,12 +339,13 @@ static int validate_t2(struct smb_t2_rsp *pSMB) get_unaligned_le16(&pSMB->t2_rsp.DataOffset) > 1024) goto vt2_err; - /* check that bcc is at least as big as parms + data */ - /* check that bcc is less than negotiated smb buffer */ total_size = get_unaligned_le16(&pSMB->t2_rsp.ParameterCount); if (total_size >= 512) goto vt2_err; + /* check that bcc is at least as big as parms + data, and that it is + * less than negotiated smb buffer + */ total_size += get_unaligned_le16(&pSMB->t2_rsp.DataCount); if (total_size > get_bcc(&pSMB->hdr) || total_size >= CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) @@ -357,6 +358,13 @@ vt2_err: return -EINVAL; } +static inline void inc_rfc1001_len(void *pSMB, int count) +{ + struct smb_hdr *hdr = (struct smb_hdr *)pSMB; + + be32_add_cpu(&hdr->smb_buf_length, count); +} + int CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) { @@ -409,7 +417,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) count += strlen(protocols[i].name) + 1; /* null at end of source and target buffers anyway */ } - pSMB->hdr.smb_buf_length += count; + inc_rfc1001_len(pSMB, count); pSMB->ByteCount = cpu_to_le16(count); rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB, @@ -541,10 +549,6 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) server->secType = RawNTLMSSP; else if (secFlags & CIFSSEC_MAY_LANMAN) server->secType = LANMAN; -/* #ifdef CONFIG_CIFS_EXPERIMENTAL - else if (secFlags & CIFSSEC_MAY_PLNTXT) - server->secType = ?? -#endif */ else { rc = -EOPNOTSUPP; cERROR(1, "Invalid security type"); @@ -578,7 +582,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) && (server->capabilities & CAP_EXTENDED_SECURITY)) { - count = pSMBr->ByteCount; + count = get_bcc(&pSMBr->hdr); if (count < 16) { rc = -EIO; goto neg_err_exit; @@ -732,9 +736,9 @@ CIFSSMBEcho(struct TCP_Server_Info *server) smb->hdr.Tid = 0xffff; smb->hdr.WordCount = 1; put_unaligned_le16(1, &smb->EchoCount); - put_bcc_le(1, &smb->hdr); + put_bcc(1, &smb->hdr); smb->Data[0] = 'a'; - smb->hdr.smb_buf_length += 3; + inc_rfc1001_len(smb, 3); rc = cifs_call_async(server, (struct smb_hdr *)smb, cifs_echo_callback, server); @@ -852,7 +856,7 @@ PsxDelete: pSMB->TotalParameterCount = pSMB->ParameterCount; pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_UNLINK); pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); @@ -898,7 +902,7 @@ DelFileRetry: pSMB->SearchAttributes = cpu_to_le16(ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM); pSMB->BufferFormat = 0x04; - pSMB->hdr.smb_buf_length += name_len + 1; + inc_rfc1001_len(pSMB, name_len + 1); pSMB->ByteCount = cpu_to_le16(name_len + 1); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); @@ -942,7 +946,7 @@ RmDirRetry: } pSMB->BufferFormat = 0x04; - pSMB->hdr.smb_buf_length += name_len + 1; + inc_rfc1001_len(pSMB, name_len + 1); pSMB->ByteCount = cpu_to_le16(name_len + 1); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); @@ -985,7 +989,7 @@ MkDirRetry: } pSMB->BufferFormat = 0x04; - pSMB->hdr.smb_buf_length += name_len + 1; + inc_rfc1001_len(pSMB, name_len + 1); pSMB->ByteCount = cpu_to_le16(name_len + 1); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); @@ -1063,7 +1067,7 @@ PsxCreat: pSMB->TotalParameterCount = pSMB->ParameterCount; pSMB->InformationLevel = cpu_to_le16(SMB_POSIX_OPEN); pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); @@ -1075,7 +1079,7 @@ PsxCreat: cFYI(1, "copying inode info"); rc = validate_t2((struct smb_t2_rsp *)pSMBr); - if (rc || (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP))) { + if (rc || get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP)) { rc = -EIO; /* bad smb */ goto psx_create_err; } @@ -1096,7 +1100,7 @@ PsxCreat: pRetData->Type = cpu_to_le32(-1); /* unknown */ cFYI(DBG2, "unknown type"); } else { - if (pSMBr->ByteCount < sizeof(OPEN_PSX_RSP) + if (get_bcc(&pSMBr->hdr) < sizeof(OPEN_PSX_RSP) + sizeof(FILE_UNIX_BASIC_INFO)) { cERROR(1, "Open response data too small"); pRetData->Type = cpu_to_le32(-1); @@ -1228,7 +1232,7 @@ OldOpenRetry: pSMB->Sattr = cpu_to_le16(ATTR_HIDDEN | ATTR_SYSTEM | ATTR_DIRECTORY); pSMB->OpenFunction = cpu_to_le16(convert_disposition(openDisposition)); count += name_len; - pSMB->hdr.smb_buf_length += count; + inc_rfc1001_len(pSMB, count); pSMB->ByteCount = cpu_to_le16(count); /* long_op set to 1 to allow for oplock break timeouts */ @@ -1341,7 +1345,7 @@ openRetry: SECURITY_CONTEXT_TRACKING | SECURITY_EFFECTIVE_ONLY; count += name_len; - pSMB->hdr.smb_buf_length += count; + inc_rfc1001_len(pSMB, count); pSMB->ByteCount = cpu_to_le16(count); /* long_op set to 1 to allow for oplock break timeouts */ @@ -1426,7 +1430,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, const int netfid, } iov[0].iov_base = (char *)pSMB; - iov[0].iov_len = pSMB->hdr.smb_buf_length + 4; + iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4; rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */, &resp_buf_type, CIFS_LOG_ERROR); cifs_stats_inc(&tcon->num_reads); @@ -1560,7 +1564,7 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon, pSMB->DataLengthLow = cpu_to_le16(bytes_sent & 0xFFFF); pSMB->DataLengthHigh = cpu_to_le16(bytes_sent >> 16); - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); if (wct == 14) pSMB->ByteCount = cpu_to_le16(byte_count); @@ -1644,11 +1648,12 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon, pSMB->DataLengthLow = cpu_to_le16(count & 0xFFFF); pSMB->DataLengthHigh = cpu_to_le16(count >> 16); - smb_hdr_len = pSMB->hdr.smb_buf_length + 1; /* hdr + 1 byte pad */ + /* header + 1 byte pad */ + smb_hdr_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 1; if (wct == 14) - pSMB->hdr.smb_buf_length += count+1; + inc_rfc1001_len(pSMB, count + 1); else /* wct == 12 */ - pSMB->hdr.smb_buf_length += count+5; /* smb data starts later */ + inc_rfc1001_len(pSMB, count + 5); /* smb data starts later */ if (wct == 14) pSMB->ByteCount = cpu_to_le16(count + 1); else /* wct == 12 */ /* bigger pad, smaller smb hdr, keep offset ok */ { @@ -1748,7 +1753,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, /* oplock break */ count = 0; } - pSMB->hdr.smb_buf_length += count; + inc_rfc1001_len(pSMB, count); pSMB->ByteCount = cpu_to_le16(count); if (waitFlag) { @@ -1839,14 +1844,14 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, pSMB->Fid = smb_file_id; pSMB->InformationLevel = cpu_to_le16(SMB_SET_POSIX_LOCK); pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); if (waitFlag) { rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned); } else { iov[0].iov_base = (char *)pSMB; - iov[0].iov_len = pSMB->hdr.smb_buf_length + 4; + iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4; rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */, &resp_buf_type, timeout); pSMB = NULL; /* request buf already freed by SendReceive2. Do @@ -1862,7 +1867,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, __u16 data_count; rc = validate_t2((struct smb_t2_rsp *)pSMBr); - if (rc || (pSMBr->ByteCount < sizeof(struct cifs_posix_lock))) { + if (rc || get_bcc(&pSMBr->hdr) < sizeof(*parm_data)) { rc = -EIO; /* bad smb */ goto plk_err_exit; } @@ -2012,7 +2017,7 @@ renameRetry: } count = 1 /* 1st signature byte */ + name_len + name_len2; - pSMB->hdr.smb_buf_length += count; + inc_rfc1001_len(pSMB, count); pSMB->ByteCount = cpu_to_le16(count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -2092,7 +2097,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon, pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_RENAME_INFORMATION); pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, pTcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); @@ -2159,7 +2164,7 @@ copyRetry: } count = 1 /* 1st signature byte */ + name_len + name_len2; - pSMB->hdr.smb_buf_length += count; + inc_rfc1001_len(pSMB, count); pSMB->ByteCount = cpu_to_le16(count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -2249,7 +2254,7 @@ createSymLinkRetry: pSMB->DataOffset = cpu_to_le16(offset); pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_LINK); pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); @@ -2335,7 +2340,7 @@ createHardLinkRetry: pSMB->DataOffset = cpu_to_le16(offset); pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_HLINK); pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); @@ -2406,7 +2411,7 @@ winCreateHardLinkRetry: } count = 1 /* string type byte */ + name_len + name_len2; - pSMB->hdr.smb_buf_length += count; + inc_rfc1001_len(pSMB, count); pSMB->ByteCount = cpu_to_le16(count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -2477,7 +2482,7 @@ querySymLinkRetry: pSMB->ParameterCount = pSMB->TotalParameterCount; pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_LINK); pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -2489,7 +2494,7 @@ querySymLinkRetry: rc = validate_t2((struct smb_t2_rsp *)pSMBr); /* BB also check enough total bytes returned */ - if (rc || (pSMBr->ByteCount < 2)) + if (rc || get_bcc(&pSMBr->hdr) < 2) rc = -EIO; else { bool is_unicode; @@ -2516,7 +2521,17 @@ querySymLinkRetry: return rc; } -#ifdef CONFIG_CIFS_EXPERIMENTAL +#ifdef CONFIG_CIFS_SYMLINK_EXPERIMENTAL +/* + * Recent Windows versions now create symlinks more frequently + * and they use the "reparse point" mechanism below. We can of course + * do symlinks nicely to Samba and other servers which support the + * CIFS Unix Extensions and we can also do SFU symlinks and "client only" + * "MF" symlinks optionally, but for recent Windows we really need to + * reenable the code below and fix the cifs_symlink callers to handle this. + * In the interim this code has been moved to its own config option so + * it is not compiled in by default until callers fixed up and more tested. + */ int CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon, const unsigned char *searchName, @@ -2561,14 +2576,14 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon, } else { /* decode response */ __u32 data_offset = le32_to_cpu(pSMBr->DataOffset); __u32 data_count = le32_to_cpu(pSMBr->DataCount); - if ((pSMBr->ByteCount < 2) || (data_offset > 512)) { - /* BB also check enough total bytes returned */ + if (get_bcc(&pSMBr->hdr) < 2 || data_offset > 512) { + /* BB also check enough total bytes returned */ rc = -EIO; /* bad smb */ goto qreparse_out; } if (data_count && (data_count < 2048)) { char *end_of_smb = 2 /* sizeof byte count */ + - pSMBr->ByteCount + (char *)&pSMBr->ByteCount; + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount; struct reparse_data *reparse_buf = (struct reparse_data *) @@ -2618,7 +2633,7 @@ qreparse_out: return rc; } -#endif /* CIFS_EXPERIMENTAL */ +#endif /* CIFS_SYMLINK_EXPERIMENTAL */ /* BB temporarily unused */ #ifdef CONFIG_CIFS_POSIX @@ -2814,7 +2829,7 @@ queryAclRetry: pSMB->ParameterCount = pSMB->TotalParameterCount; pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_ACL); pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -2826,8 +2841,8 @@ queryAclRetry: /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); - if (rc || (pSMBr->ByteCount < 2)) /* BB also check enough total bytes returned */ + if (rc || get_bcc(&pSMBr->hdr) < 2) rc = -EIO; /* bad smb */ else { __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); @@ -2908,7 +2923,7 @@ setAclRetry: pSMB->ParameterCount = cpu_to_le16(params); pSMB->TotalParameterCount = pSMB->ParameterCount; pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); @@ -2966,7 +2981,7 @@ GetExtAttrRetry: pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_ATTR_FLAGS); pSMB->Pad = 0; pSMB->Fid = netfid; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->t2.ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -2976,8 +2991,8 @@ GetExtAttrRetry: } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); - if (rc || (pSMBr->ByteCount < 2)) /* BB also check enough total bytes returned */ + if (rc || get_bcc(&pSMBr->hdr) < 2) /* If rc should we check for EOPNOSUPP and disable the srvino flag? or in caller? */ rc = -EIO; /* bad smb */ @@ -3052,6 +3067,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata, char *end_of_smb; __u32 data_count, data_offset, parm_count, parm_offset; struct smb_com_ntransact_rsp *pSMBr; + u16 bcc; *pdatalen = 0; *pparmlen = 0; @@ -3061,8 +3077,8 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata, pSMBr = (struct smb_com_ntransact_rsp *)buf; - /* ByteCount was converted from little endian in SendReceive */ - end_of_smb = 2 /* sizeof byte count */ + pSMBr->ByteCount + + bcc = get_bcc(&pSMBr->hdr); + end_of_smb = 2 /* sizeof byte count */ + bcc + (char *)&pSMBr->ByteCount; data_offset = le32_to_cpu(pSMBr->DataOffset); @@ -3088,7 +3104,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata, *ppdata, data_count, (data_count + *ppdata), end_of_smb, pSMBr); return -EINVAL; - } else if (parm_count + data_count > pSMBr->ByteCount) { + } else if (parm_count + data_count > bcc) { cFYI(1, "parm count and data count larger than SMB"); return -EINVAL; } @@ -3124,9 +3140,9 @@ CIFSSMBGetCIFSACL(const int xid, struct cifsTconInfo *tcon, __u16 fid, pSMB->AclFlags = cpu_to_le32(CIFS_ACL_OWNER | CIFS_ACL_GROUP | CIFS_ACL_DACL); pSMB->ByteCount = cpu_to_le16(11); /* 3 bytes pad + 8 bytes parm */ - pSMB->hdr.smb_buf_length += 11; + inc_rfc1001_len(pSMB, 11); iov[0].iov_base = (char *)pSMB; - iov[0].iov_len = pSMB->hdr.smb_buf_length + 4; + iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4; rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type, 0); @@ -3235,10 +3251,9 @@ setCifsAclRetry: memcpy((char *) &pSMBr->hdr.Protocol + data_offset, (char *) pntsd, acllen); - pSMB->hdr.smb_buf_length += (byte_count + data_count); - + inc_rfc1001_len(pSMB, byte_count + data_count); } else - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); @@ -3289,7 +3304,7 @@ QInfRetry: } pSMB->BufferFormat = 0x04; name_len++; /* account for buffer type byte */ - pSMB->hdr.smb_buf_length += (__u16) name_len; + inc_rfc1001_len(pSMB, (__u16)name_len); pSMB->ByteCount = cpu_to_le16(name_len); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -3364,7 +3379,7 @@ QFileInfoRetry: pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO); pSMB->Pad = 0; pSMB->Fid = netfid; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); @@ -3375,7 +3390,7 @@ QFileInfoRetry: if (rc) /* BB add auto retry on EOPNOTSUPP? */ rc = -EIO; - else if (pSMBr->ByteCount < 40) + else if (get_bcc(&pSMBr->hdr) < 40) rc = -EIO; /* bad smb */ else if (pFindData) { __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); @@ -3451,7 +3466,7 @@ QPathInfoRetry: else pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO); pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -3463,9 +3478,9 @@ QPathInfoRetry: if (rc) /* BB add auto retry on EOPNOTSUPP? */ rc = -EIO; - else if (!legacy && (pSMBr->ByteCount < 40)) + else if (!legacy && get_bcc(&pSMBr->hdr) < 40) rc = -EIO; /* bad smb */ - else if (legacy && (pSMBr->ByteCount < 24)) + else if (legacy && get_bcc(&pSMBr->hdr) < 24) rc = -EIO; /* 24 or 26 expected but we do not read last field */ else if (pFindData) { @@ -3532,7 +3547,7 @@ UnixQFileInfoRetry: pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC); pSMB->Pad = 0; pSMB->Fid = netfid; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); @@ -3541,7 +3556,7 @@ UnixQFileInfoRetry: } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); - if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) { + if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) { cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n" "Unix Extensions can be disabled on mount " "by specifying the nosfu mount option."); @@ -3617,7 +3632,7 @@ UnixQPathInfoRetry: pSMB->ParameterCount = pSMB->TotalParameterCount; pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC); pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -3627,7 +3642,7 @@ UnixQPathInfoRetry: } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); - if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) { + if (rc || get_bcc(&pSMBr->hdr) < sizeof(FILE_UNIX_BASIC_INFO)) { cERROR(1, "Malformed FILE_UNIX_BASIC_INFO response.\n" "Unix Extensions can be disabled on mount " "by specifying the nosfu mount option."); @@ -3731,7 +3746,7 @@ findFirstRetry: /* BB what should we set StorageType to? Does it matter? BB */ pSMB->SearchStorageType = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -3860,7 +3875,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon, byte_count = params + 1 /* pad */ ; pSMB->TotalParameterCount = cpu_to_le16(params); pSMB->ParameterCount = pSMB->TotalParameterCount; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -4022,7 +4037,7 @@ GetInodeNumberRetry: pSMB->ParameterCount = pSMB->TotalParameterCount; pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_INTERNAL_INFO); pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -4032,8 +4047,8 @@ GetInodeNumberRetry: } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); - if (rc || (pSMBr->ByteCount < 2)) /* BB also check enough total bytes returned */ + if (rc || get_bcc(&pSMBr->hdr) < 2) /* If rc should we check for EOPNOSUPP and disable the srvino flag? or in caller? */ rc = -EIO; /* bad smb */ @@ -4246,7 +4261,7 @@ getDFSRetry: pSMB->ParameterCount = cpu_to_le16(params); pSMB->TotalParameterCount = pSMB->ParameterCount; pSMB->MaxReferralLevel = cpu_to_le16(3); - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB, @@ -4258,13 +4273,13 @@ getDFSRetry: rc = validate_t2((struct smb_t2_rsp *)pSMBr); /* BB Also check if enough total bytes returned? */ - if (rc || (pSMBr->ByteCount < 17)) { + if (rc || get_bcc(&pSMBr->hdr) < 17) { rc = -EIO; /* bad smb */ goto GetDFSRefExit; } cFYI(1, "Decoding GetDFSRefer response BCC: %d Offset %d", - pSMBr->ByteCount, + get_bcc(&pSMBr->hdr), le16_to_cpu(pSMBr->t2.DataOffset)); /* parse returned result into more usable form */ @@ -4320,7 +4335,7 @@ oldQFSInfoRetry: pSMB->Reserved3 = 0; pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); pSMB->InformationLevel = cpu_to_le16(SMB_INFO_ALLOCATION); - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -4330,12 +4345,12 @@ oldQFSInfoRetry: } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); - if (rc || (pSMBr->ByteCount < 18)) + if (rc || get_bcc(&pSMBr->hdr) < 18) rc = -EIO; /* bad smb */ else { __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); cFYI(1, "qfsinf resp BCC: %d Offset %d", - pSMBr->ByteCount, data_offset); + get_bcc(&pSMBr->hdr), data_offset); response_data = (FILE_SYSTEM_ALLOC_INFO *) (((char *) &pSMBr->hdr.Protocol) + data_offset); @@ -4399,7 +4414,7 @@ QFSInfoRetry: pSMB->Reserved3 = 0; pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_SIZE_INFO); - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -4409,7 +4424,7 @@ QFSInfoRetry: } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); - if (rc || (pSMBr->ByteCount < 24)) + if (rc || get_bcc(&pSMBr->hdr) < 24) rc = -EIO; /* bad smb */ else { __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); @@ -4479,7 +4494,7 @@ QFSAttributeRetry: pSMB->Reserved3 = 0; pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_ATTRIBUTE_INFO); - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -4489,7 +4504,7 @@ QFSAttributeRetry: } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); - if (rc || (pSMBr->ByteCount < 13)) { + if (rc || get_bcc(&pSMBr->hdr) < 13) { /* BB also check if enough bytes returned */ rc = -EIO; /* bad smb */ } else { @@ -4550,7 +4565,7 @@ QFSDeviceRetry: pSMB->Reserved3 = 0; pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FS_DEVICE_INFO); - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -4560,7 +4575,8 @@ QFSDeviceRetry: } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); - if (rc || (pSMBr->ByteCount < sizeof(FILE_SYSTEM_DEVICE_INFO))) + if (rc || get_bcc(&pSMBr->hdr) < + sizeof(FILE_SYSTEM_DEVICE_INFO)) rc = -EIO; /* bad smb */ else { __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); @@ -4619,7 +4635,7 @@ QFSUnixRetry: pSMB->Reserved3 = 0; pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_CIFS_UNIX_INFO); - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -4629,7 +4645,7 @@ QFSUnixRetry: } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); - if (rc || (pSMBr->ByteCount < 13)) { + if (rc || get_bcc(&pSMBr->hdr) < 13) { rc = -EIO; /* bad smb */ } else { __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); @@ -4702,7 +4718,7 @@ SETFSUnixRetry: pSMB->ClientUnixMinor = cpu_to_le16(CIFS_UNIX_MINOR_VERSION); pSMB->ClientUnixCap = cpu_to_le64(cap); - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -4764,7 +4780,7 @@ QFSPosixRetry: pSMB->Reserved3 = 0; pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FS_INFORMATION); pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_POSIX_FS_INFO); - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -4774,7 +4790,7 @@ QFSPosixRetry: } else { /* decode response */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); - if (rc || (pSMBr->ByteCount < 13)) { + if (rc || get_bcc(&pSMBr->hdr) < 13) { rc = -EIO; /* bad smb */ } else { __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset); @@ -4890,7 +4906,7 @@ SetEOFRetry: pSMB->ParameterCount = cpu_to_le16(params); pSMB->TotalParameterCount = pSMB->ParameterCount; pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); parm_data->FileSize = cpu_to_le64(size); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -4969,7 +4985,7 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size, cpu_to_le16(SMB_SET_FILE_END_OF_FILE_INFO); } pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); if (rc) { @@ -5037,7 +5053,7 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon, else pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO); pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); memcpy(data_offset, data, sizeof(FILE_BASIC_INFO)); rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); @@ -5096,7 +5112,7 @@ CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon, pSMB->Fid = fid; pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_DISPOSITION_INFO); pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); *data_offset = delete_file ? 1 : 0; rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); @@ -5169,7 +5185,7 @@ SetTimesRetry: else pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_BASIC_INFO); pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); memcpy(data_offset, data, sizeof(FILE_BASIC_INFO)); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -5221,7 +5237,7 @@ SetAttrLgcyRetry: } pSMB->attr = cpu_to_le16(dos_attrs); pSMB->BufferFormat = 0x04; - pSMB->hdr.smb_buf_length += name_len + 1; + inc_rfc1001_len(pSMB, name_len + 1); pSMB->ByteCount = cpu_to_le16(name_len + 1); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); @@ -5326,7 +5342,7 @@ CIFSSMBUnixSetFileInfo(const int xid, struct cifsTconInfo *tcon, pSMB->Fid = fid; pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC); pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); cifs_fill_unix_set_info(data_offset, args); @@ -5402,7 +5418,7 @@ setPermsRetry: pSMB->TotalDataCount = pSMB->DataCount; pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_UNIX_BASIC); pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); cifs_fill_unix_set_info(data_offset, args); @@ -5418,79 +5434,6 @@ setPermsRetry: return rc; } -int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon, - const int notify_subdirs, const __u16 netfid, - __u32 filter, struct file *pfile, int multishot, - const struct nls_table *nls_codepage) -{ - int rc = 0; - struct smb_com_transaction_change_notify_req *pSMB = NULL; - struct smb_com_ntransaction_change_notify_rsp *pSMBr = NULL; - struct dir_notify_req *dnotify_req; - int bytes_returned; - - cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid); - rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, - (void **) &pSMBr); - if (rc) - return rc; - - pSMB->TotalParameterCount = 0 ; - pSMB->TotalDataCount = 0; - pSMB->MaxParameterCount = cpu_to_le32(2); - /* BB find exact data count max from sess structure BB */ - pSMB->MaxDataCount = 0; /* same in little endian or be */ -/* BB VERIFY verify which is correct for above BB */ - pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - - MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); - - pSMB->MaxSetupCount = 4; - pSMB->Reserved = 0; - pSMB->ParameterOffset = 0; - pSMB->DataCount = 0; - pSMB->DataOffset = 0; - pSMB->SetupCount = 4; /* single byte does not need le conversion */ - pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_NOTIFY_CHANGE); - pSMB->ParameterCount = pSMB->TotalParameterCount; - if (notify_subdirs) - pSMB->WatchTree = 1; /* one byte - no le conversion needed */ - pSMB->Reserved2 = 0; - pSMB->CompletionFilter = cpu_to_le32(filter); - pSMB->Fid = netfid; /* file handle always le */ - pSMB->ByteCount = 0; - - rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *)pSMBr, &bytes_returned, - CIFS_ASYNC_OP); - if (rc) { - cFYI(1, "Error in Notify = %d", rc); - } else { - /* Add file to outstanding requests */ - /* BB change to kmem cache alloc */ - dnotify_req = kmalloc( - sizeof(struct dir_notify_req), - GFP_KERNEL); - if (dnotify_req) { - dnotify_req->Pid = pSMB->hdr.Pid; - dnotify_req->PidHigh = pSMB->hdr.PidHigh; - dnotify_req->Mid = pSMB->hdr.Mid; - dnotify_req->Tid = pSMB->hdr.Tid; - dnotify_req->Uid = pSMB->hdr.Uid; - dnotify_req->netfid = netfid; - dnotify_req->pfile = pfile; - dnotify_req->filter = filter; - dnotify_req->multishot = multishot; - spin_lock(&GlobalMid_Lock); - list_add_tail(&dnotify_req->lhead, - &GlobalDnotifyReqList); - spin_unlock(&GlobalMid_Lock); - } else - rc = -ENOMEM; - } - cifs_buf_release(pSMB); - return rc; -} - #ifdef CONFIG_CIFS_XATTR /* * Do a path-based QUERY_ALL_EAS call and parse the result. This is a common @@ -5560,7 +5503,7 @@ QAllEAsRetry: pSMB->ParameterCount = pSMB->TotalParameterCount; pSMB->InformationLevel = cpu_to_le16(SMB_INFO_QUERY_ALL_EAS); pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -5576,7 +5519,7 @@ QAllEAsRetry: of these trans2 responses */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); - if (rc || (pSMBr->ByteCount < 4)) { + if (rc || get_bcc(&pSMBr->hdr) < 4) { rc = -EIO; /* bad smb */ goto QAllEAsOut; } @@ -5773,7 +5716,7 @@ SetEARetry: pSMB->ParameterCount = cpu_to_le16(params); pSMB->TotalParameterCount = pSMB->ParameterCount; pSMB->Reserved4 = 0; - pSMB->hdr.smb_buf_length += byte_count; + inc_rfc1001_len(pSMB, byte_count); pSMB->ByteCount = cpu_to_le16(byte_count); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); @@ -5787,5 +5730,99 @@ SetEARetry: return rc; } - #endif + +#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* BB unused temporarily */ +/* + * Years ago the kernel added a "dnotify" function for Samba server, + * to allow network clients (such as Windows) to display updated + * lists of files in directory listings automatically when + * files are added by one user when another user has the + * same directory open on their desktop. The Linux cifs kernel + * client hooked into the kernel side of this interface for + * the same reason, but ironically when the VFS moved from + * "dnotify" to "inotify" it became harder to plug in Linux + * network file system clients (the most obvious use case + * for notify interfaces is when multiple users can update + * the contents of the same directory - exactly what network + * file systems can do) although the server (Samba) could + * still use it. For the short term we leave the worker + * function ifdeffed out (below) until inotify is fixed + * in the VFS to make it easier to plug in network file + * system clients. If inotify turns out to be permanently + * incompatible for network fs clients, we could instead simply + * expose this config flag by adding a future cifs (and smb2) notify ioctl. + */ +int CIFSSMBNotify(const int xid, struct cifsTconInfo *tcon, + const int notify_subdirs, const __u16 netfid, + __u32 filter, struct file *pfile, int multishot, + const struct nls_table *nls_codepage) +{ + int rc = 0; + struct smb_com_transaction_change_notify_req *pSMB = NULL; + struct smb_com_ntransaction_change_notify_rsp *pSMBr = NULL; + struct dir_notify_req *dnotify_req; + int bytes_returned; + + cFYI(1, "In CIFSSMBNotify for file handle %d", (int)netfid); + rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + pSMB->TotalParameterCount = 0 ; + pSMB->TotalDataCount = 0; + pSMB->MaxParameterCount = cpu_to_le32(2); + /* BB find exact data count max from sess structure BB */ + pSMB->MaxDataCount = 0; /* same in little endian or be */ +/* BB VERIFY verify which is correct for above BB */ + pSMB->MaxDataCount = cpu_to_le32((tcon->ses->server->maxBuf - + MAX_CIFS_HDR_SIZE) & 0xFFFFFF00); + + pSMB->MaxSetupCount = 4; + pSMB->Reserved = 0; + pSMB->ParameterOffset = 0; + pSMB->DataCount = 0; + pSMB->DataOffset = 0; + pSMB->SetupCount = 4; /* single byte does not need le conversion */ + pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_NOTIFY_CHANGE); + pSMB->ParameterCount = pSMB->TotalParameterCount; + if (notify_subdirs) + pSMB->WatchTree = 1; /* one byte - no le conversion needed */ + pSMB->Reserved2 = 0; + pSMB->CompletionFilter = cpu_to_le32(filter); + pSMB->Fid = netfid; /* file handle always le */ + pSMB->ByteCount = 0; + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *)pSMBr, &bytes_returned, + CIFS_ASYNC_OP); + if (rc) { + cFYI(1, "Error in Notify = %d", rc); + } else { + /* Add file to outstanding requests */ + /* BB change to kmem cache alloc */ + dnotify_req = kmalloc( + sizeof(struct dir_notify_req), + GFP_KERNEL); + if (dnotify_req) { + dnotify_req->Pid = pSMB->hdr.Pid; + dnotify_req->PidHigh = pSMB->hdr.PidHigh; + dnotify_req->Mid = pSMB->hdr.Mid; + dnotify_req->Tid = pSMB->hdr.Tid; + dnotify_req->Uid = pSMB->hdr.Uid; + dnotify_req->netfid = netfid; + dnotify_req->pfile = pfile; + dnotify_req->filter = filter; + dnotify_req->multishot = multishot; + spin_lock(&GlobalMid_Lock); + list_add_tail(&dnotify_req->lhead, + &GlobalDnotifyReqList); + spin_unlock(&GlobalMid_Lock); + } else + rc = -ENOMEM; + } + cifs_buf_release(pSMB); + return rc; +} +#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 277262a8e82..da284e3cb65 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -102,6 +102,7 @@ struct smb_vol { bool fsc:1; /* enable fscache */ bool mfsymlinks:1; /* use Minshall+French Symlinks */ bool multiuser:1; + bool use_smb2:1; /* force smb2 use on mount instead of cifs */ unsigned int rsize; unsigned int wsize; bool sockopt_tcp_nodelay:1; @@ -316,19 +317,19 @@ static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) put_unaligned_le16(total_in_buf, &pSMBt->t2_rsp.DataCount); /* fix up the BCC */ - byte_count = get_bcc_le(pTargetSMB); + byte_count = get_bcc(pTargetSMB); byte_count += total_in_buf2; /* is the result too big for the field? */ if (byte_count > USHRT_MAX) return -EPROTO; - put_bcc_le(byte_count, pTargetSMB); + put_bcc(byte_count, pTargetSMB); - byte_count = pTargetSMB->smb_buf_length; + byte_count = be32_to_cpu(pTargetSMB->smb_buf_length); byte_count += total_in_buf2; /* don't allow buffer to overflow */ if (byte_count > CIFSMaxBufSize) return -ENOBUFS; - pTargetSMB->smb_buf_length = byte_count; + pTargetSMB->smb_buf_length = cpu_to_be32(byte_count); memcpy(data_area_of_target, data_area_of_buf2, total_in_buf2); @@ -495,8 +496,7 @@ incomplete_rcv: /* Note that FC 1001 length is big endian on the wire, but we convert it here so it is always manipulated as host byte order */ - pdu_length = be32_to_cpu((__force __be32)smb_buffer->smb_buf_length); - smb_buffer->smb_buf_length = pdu_length; + pdu_length = be32_to_cpu(smb_buffer->smb_buf_length); cFYI(1, "rfc1002 length 0x%x", pdu_length+4); @@ -735,7 +735,7 @@ multi_t2_fnd: sock_release(csocket); server->ssocket = NULL; } - /* buffer usuallly freed in free_mid - need to free it here on exit */ + /* buffer usually freed in free_mid - need to free it here on exit */ cifs_buf_release(bigbuf); if (smallbuf) /* no sense logging a debug message if NULL */ cifs_small_buf_release(smallbuf); @@ -818,10 +818,11 @@ extract_hostname(const char *unc) } static int -cifs_parse_mount_options(char *options, const char *devname, +cifs_parse_mount_options(const char *mountdata, const char *devname, struct smb_vol *vol) { char *value, *data, *end; + char *mountdata_copy, *options; unsigned int temp_len, i, j; char separator[2]; short int override_uid = -1; @@ -861,9 +862,14 @@ cifs_parse_mount_options(char *options, const char *devname, vol->actimeo = CIFS_DEF_ACTIMEO; - if (!options) - return 1; + if (!mountdata) + goto cifs_parse_mount_err; + + mountdata_copy = kstrndup(mountdata, PAGE_SIZE, GFP_KERNEL); + if (!mountdata_copy) + goto cifs_parse_mount_err; + options = mountdata_copy; end = options + strlen(options); if (strncmp(options, "sep=", 4) == 0) { if (options[4] != 0) { @@ -889,17 +895,22 @@ cifs_parse_mount_options(char *options, const char *devname, if (!value) { printk(KERN_WARNING "CIFS: invalid or missing username\n"); - return 1; /* needs_arg; */ + goto cifs_parse_mount_err; } else if (!*value) { /* null user, ie anonymous, authentication */ vol->nullauth = 1; } if (strnlen(value, MAX_USERNAME_SIZE) < MAX_USERNAME_SIZE) { - vol->username = value; + vol->username = kstrdup(value, GFP_KERNEL); + if (!vol->username) { + printk(KERN_WARNING "CIFS: no memory " + "for username\n"); + goto cifs_parse_mount_err; + } } else { printk(KERN_WARNING "CIFS: username too long\n"); - return 1; + goto cifs_parse_mount_err; } } else if (strnicmp(data, "pass", 4) == 0) { if (!value) { @@ -963,7 +974,7 @@ cifs_parse_mount_options(char *options, const char *devname, if (vol->password == NULL) { printk(KERN_WARNING "CIFS: no memory " "for password\n"); - return 1; + goto cifs_parse_mount_err; } for (i = 0, j = 0; i < temp_len; i++, j++) { vol->password[j] = value[i]; @@ -979,7 +990,7 @@ cifs_parse_mount_options(char *options, const char *devname, if (vol->password == NULL) { printk(KERN_WARNING "CIFS: no memory " "for password\n"); - return 1; + goto cifs_parse_mount_err; } strcpy(vol->password, value); } @@ -989,11 +1000,16 @@ cifs_parse_mount_options(char *options, const char *devname, vol->UNCip = NULL; } else if (strnlen(value, INET6_ADDRSTRLEN) < INET6_ADDRSTRLEN) { - vol->UNCip = value; + vol->UNCip = kstrdup(value, GFP_KERNEL); + if (!vol->UNCip) { + printk(KERN_WARNING "CIFS: no memory " + "for UNC IP\n"); + goto cifs_parse_mount_err; + } } else { printk(KERN_WARNING "CIFS: ip address " "too long\n"); - return 1; + goto cifs_parse_mount_err; } } else if (strnicmp(data, "sec", 3) == 0) { if (!value || !*value) { @@ -1006,7 +1022,7 @@ cifs_parse_mount_options(char *options, const char *devname, /* vol->secFlg |= CIFSSEC_MUST_SEAL | CIFSSEC_MAY_KRB5; */ cERROR(1, "Krb5 cifs privacy not supported"); - return 1; + goto cifs_parse_mount_err; } else if (strnicmp(value, "krb5", 4) == 0) { vol->secFlg |= CIFSSEC_MAY_KRB5; } else if (strnicmp(value, "ntlmsspi", 8) == 0) { @@ -1036,7 +1052,23 @@ cifs_parse_mount_options(char *options, const char *devname, vol->nullauth = 1; } else { cERROR(1, "bad security option: %s", value); - return 1; + goto cifs_parse_mount_err; + } + } else if (strnicmp(data, "vers", 3) == 0) { + if (!value || !*value) { + cERROR(1, "no protocol version specified" + " after vers= mount option"); + } else if ((strnicmp(value, "cifs", 4) == 0) || + (strnicmp(value, "1", 1) == 0)) { + /* this is the default */ + continue; + } else if ((strnicmp(value, "smb2", 4) == 0) || + (strnicmp(value, "2", 1) == 0)) { +#ifdef CONFIG_CIFS_SMB2 + vol->use_smb2 = true; +#else + cERROR(1, "smb2 support not enabled"); +#endif /* CONFIG_CIFS_SMB2 */ } } else if ((strnicmp(data, "unc", 3) == 0) || (strnicmp(data, "target", 6) == 0) @@ -1044,12 +1076,12 @@ cifs_parse_mount_options(char *options, const char *devname, if (!value || !*value) { printk(KERN_WARNING "CIFS: invalid path to " "network resource\n"); - return 1; /* needs_arg; */ + goto cifs_parse_mount_err; } if ((temp_len = strnlen(value, 300)) < 300) { vol->UNC = kmalloc(temp_len+1, GFP_KERNEL); if (vol->UNC == NULL) - return 1; + goto cifs_parse_mount_err; strcpy(vol->UNC, value); if (strncmp(vol->UNC, "//", 2) == 0) { vol->UNC[0] = '\\'; @@ -1058,27 +1090,32 @@ cifs_parse_mount_options(char *options, const char *devname, printk(KERN_WARNING "CIFS: UNC Path does not begin " "with // or \\\\ \n"); - return 1; + goto cifs_parse_mount_err; } } else { printk(KERN_WARNING "CIFS: UNC name too long\n"); - return 1; + goto cifs_parse_mount_err; } } else if ((strnicmp(data, "domain", 3) == 0) || (strnicmp(data, "workgroup", 5) == 0)) { if (!value || !*value) { printk(KERN_WARNING "CIFS: invalid domain name\n"); - return 1; /* needs_arg; */ + goto cifs_parse_mount_err; } /* BB are there cases in which a comma can be valid in a domain name and need special handling? */ if (strnlen(value, 256) < 256) { - vol->domainname = value; + vol->domainname = kstrdup(value, GFP_KERNEL); + if (!vol->domainname) { + printk(KERN_WARNING "CIFS: no memory " + "for domainname\n"); + goto cifs_parse_mount_err; + } cFYI(1, "Domain name set"); } else { printk(KERN_WARNING "CIFS: domain name too " "long\n"); - return 1; + goto cifs_parse_mount_err; } } else if (strnicmp(data, "srcaddr", 7) == 0) { vol->srcaddr.ss_family = AF_UNSPEC; @@ -1086,7 +1123,7 @@ cifs_parse_mount_options(char *options, const char *devname, if (!value || !*value) { printk(KERN_WARNING "CIFS: srcaddr value" " not specified.\n"); - return 1; /* needs_arg; */ + goto cifs_parse_mount_err; } i = cifs_convert_address((struct sockaddr *)&vol->srcaddr, value, strlen(value)); @@ -1094,20 +1131,20 @@ cifs_parse_mount_options(char *options, const char *devname, printk(KERN_WARNING "CIFS: Could not parse" " srcaddr: %s\n", value); - return 1; + goto cifs_parse_mount_err; } } else if (strnicmp(data, "prefixpath", 10) == 0) { if (!value || !*value) { printk(KERN_WARNING "CIFS: invalid path prefix\n"); - return 1; /* needs_argument */ + goto cifs_parse_mount_err; } if ((temp_len = strnlen(value, 1024)) < 1024) { if (value[0] != '/') temp_len++; /* missing leading slash */ vol->prepath = kmalloc(temp_len+1, GFP_KERNEL); if (vol->prepath == NULL) - return 1; + goto cifs_parse_mount_err; if (value[0] != '/') { vol->prepath[0] = '/'; strcpy(vol->prepath+1, value); @@ -1116,24 +1153,33 @@ cifs_parse_mount_options(char *options, const char *devname, cFYI(1, "prefix path %s", vol->prepath); } else { printk(KERN_WARNING "CIFS: prefix too long\n"); - return 1; + goto cifs_parse_mount_err; } } else if (strnicmp(data, "iocharset", 9) == 0) { if (!value || !*value) { printk(KERN_WARNING "CIFS: invalid iocharset " "specified\n"); - return 1; /* needs_arg; */ + goto cifs_parse_mount_err; } if (strnlen(value, 65) < 65) { - if (strnicmp(value, "default", 7)) - vol->iocharset = value; + if (strnicmp(value, "default", 7)) { + vol->iocharset = kstrdup(value, + GFP_KERNEL); + + if (!vol->iocharset) { + printk(KERN_WARNING "CIFS: no " + "memory for" + "charset\n"); + goto cifs_parse_mount_err; + } + } /* if iocharset not set then load_nls_default is used by caller */ cFYI(1, "iocharset set to %s", value); } else { printk(KERN_WARNING "CIFS: iocharset name " "too long.\n"); - return 1; + goto cifs_parse_mount_err; } } else if (!strnicmp(data, "uid", 3) && value && *value) { vol->linux_uid = simple_strtoul(value, &value, 0); @@ -1246,7 +1292,7 @@ cifs_parse_mount_options(char *options, const char *devname, if (vol->actimeo > CIFS_MAX_ACTIMEO) { cERROR(1, "CIFS: attribute cache" "timeout too large"); - return 1; + goto cifs_parse_mount_err; } } } else if (strnicmp(data, "credentials", 4) == 0) { @@ -1390,7 +1436,7 @@ cifs_parse_mount_options(char *options, const char *devname, #ifndef CONFIG_CIFS_FSCACHE cERROR(1, "FS-Cache support needs CONFIG_CIFS_FSCACHE" "kernel config option set"); - return 1; + goto cifs_parse_mount_err; #endif vol->fsc = true; } else if (strnicmp(data, "mfsymlinks", 10) == 0) { @@ -1405,12 +1451,12 @@ cifs_parse_mount_options(char *options, const char *devname, if (devname == NULL) { printk(KERN_WARNING "CIFS: Missing UNC name for mount " "target\n"); - return 1; + goto cifs_parse_mount_err; } if ((temp_len = strnlen(devname, 300)) < 300) { vol->UNC = kmalloc(temp_len+1, GFP_KERNEL); if (vol->UNC == NULL) - return 1; + goto cifs_parse_mount_err; strcpy(vol->UNC, devname); if (strncmp(vol->UNC, "//", 2) == 0) { vol->UNC[0] = '\\'; @@ -1418,21 +1464,21 @@ cifs_parse_mount_options(char *options, const char *devname, } else if (strncmp(vol->UNC, "\\\\", 2) != 0) { printk(KERN_WARNING "CIFS: UNC Path does not " "begin with // or \\\\ \n"); - return 1; + goto cifs_parse_mount_err; } value = strpbrk(vol->UNC+2, "/\\"); if (value) *value = '\\'; } else { printk(KERN_WARNING "CIFS: UNC name too long\n"); - return 1; + goto cifs_parse_mount_err; } } if (vol->multiuser && !(vol->secFlg & CIFSSEC_MAY_KRB5)) { cERROR(1, "Multiuser mounts currently require krb5 " "authentication!"); - return 1; + goto cifs_parse_mount_err; } if (vol->UNCip == NULL) @@ -1450,7 +1496,12 @@ cifs_parse_mount_options(char *options, const char *devname, printk(KERN_NOTICE "CIFS: ignoring forcegid mount option " "specified with no gid= option.\n"); + kfree(mountdata_copy); return 0; + +cifs_parse_mount_err: + kfree(mountdata_copy); + return 1; } /** Returns true if srcaddr isn't specified and rhs isn't @@ -2280,7 +2331,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) smb_buf = (struct smb_hdr *)ses_init_buf; /* sizeof RFC1002_SESSION_REQUEST with no scope */ - smb_buf->smb_buf_length = 0x81000044; + smb_buf->smb_buf_length = cpu_to_be32(0x81000044); rc = smb_send(server, smb_buf, 0x44); kfree(ses_init_buf); /* @@ -2691,8 +2742,12 @@ cleanup_volume_info(struct smb_vol **pvolume_info) return; volume_info = *pvolume_info; + kfree(volume_info->username); kzfree(volume_info->password); kfree(volume_info->UNC); + kfree(volume_info->UNCip); + kfree(volume_info->domainname); + kfree(volume_info->iocharset); kfree(volume_info->prepath); kfree(volume_info); *pvolume_info = NULL; @@ -2729,11 +2784,65 @@ build_unc_path_to_root(const struct smb_vol *volume_info, full_path[unc_len + cifs_sb->prepathlen] = 0; /* add trailing null */ return full_path; } + +/* + * Perform a dfs referral query for a share and (optionally) prefix + * + * If a referral is found, cifs_sb->mountdata will be (re-)allocated + * to a string containing updated options for the submount. Otherwise it + * will be left untouched. + * + * Returns the rc from get_dfs_path to the caller, which can be used to + * determine whether there were referrals. + */ +static int +expand_dfs_referral(int xid, struct cifsSesInfo *pSesInfo, + struct smb_vol *volume_info, struct cifs_sb_info *cifs_sb, + int check_prefix) +{ + int rc; + unsigned int num_referrals = 0; + struct dfs_info3_param *referrals = NULL; + char *full_path = NULL, *ref_path = NULL, *mdata = NULL; + + full_path = build_unc_path_to_root(volume_info, cifs_sb); + if (IS_ERR(full_path)) + return PTR_ERR(full_path); + + /* For DFS paths, skip the first '\' of the UNC */ + ref_path = check_prefix ? full_path + 1 : volume_info->UNC + 1; + + rc = get_dfs_path(xid, pSesInfo , ref_path, cifs_sb->local_nls, + &num_referrals, &referrals, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (!rc && num_referrals > 0) { + char *fake_devname = NULL; + + mdata = cifs_compose_mount_options(cifs_sb->mountdata, + full_path + 1, referrals, + &fake_devname); + + free_dfs_info_array(referrals, num_referrals); + kfree(fake_devname); + + if (cifs_sb->mountdata != NULL) + kfree(cifs_sb->mountdata); + + if (IS_ERR(mdata)) { + rc = PTR_ERR(mdata); + mdata = NULL; + } + cifs_sb->mountdata = mdata; + } + kfree(full_path); + return rc; +} #endif int cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, - char *mount_data_global, const char *devname) + const char *devname) { int rc; int xid; @@ -2742,13 +2851,20 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, struct cifsTconInfo *tcon; struct TCP_Server_Info *srvTcp; char *full_path; - char *mount_data = mount_data_global; struct tcon_link *tlink; #ifdef CONFIG_CIFS_DFS_UPCALL - struct dfs_info3_param *referrals = NULL; - unsigned int num_referrals = 0; int referral_walks_count = 0; try_mount_again: + /* cleanup activities if we're chasing a referral */ + if (referral_walks_count) { + if (tcon) + cifs_put_tcon(tcon); + else if (pSesInfo) + cifs_put_smb_ses(pSesInfo); + + cleanup_volume_info(&volume_info); + FreeXid(xid); + } #endif rc = 0; tcon = NULL; @@ -2765,7 +2881,8 @@ try_mount_again: goto out; } - if (cifs_parse_mount_options(mount_data, devname, volume_info)) { + if (cifs_parse_mount_options(cifs_sb->mountdata, devname, + volume_info)) { rc = -EINVAL; goto out; } @@ -2861,6 +2978,24 @@ try_mount_again: (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE)); remote_path_check: +#ifdef CONFIG_CIFS_DFS_UPCALL + /* + * Perform an unconditional check for whether there are DFS + * referrals for this path without prefix, to provide support + * for DFS referrals from w2k8 servers which don't seem to respond + * with PATH_NOT_COVERED to requests that include the prefix. + * Chase the referral if found, otherwise continue normally. + */ + if (referral_walks_count == 0) { + int refrc = expand_dfs_referral(xid, pSesInfo, volume_info, + cifs_sb, false); + if (!refrc) { + referral_walks_count++; + goto try_mount_again; + } + } +#endif + /* check if a whole path (including prepath) is not remote */ if (!rc && tcon) { /* build_path_to_root works only when we have a valid tcon */ @@ -2894,46 +3029,15 @@ remote_path_check: if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) == 0) convert_delimiter(cifs_sb->prepath, CIFS_DIR_SEP(cifs_sb)); - full_path = build_unc_path_to_root(volume_info, cifs_sb); - if (IS_ERR(full_path)) { - rc = PTR_ERR(full_path); - goto mount_fail_check; - } - - cFYI(1, "Getting referral for: %s", full_path); - rc = get_dfs_path(xid, pSesInfo , full_path + 1, - cifs_sb->local_nls, &num_referrals, &referrals, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - if (!rc && num_referrals > 0) { - char *fake_devname = NULL; - - if (mount_data != mount_data_global) - kfree(mount_data); - mount_data = cifs_compose_mount_options( - cifs_sb->mountdata, full_path + 1, - referrals, &fake_devname); + rc = expand_dfs_referral(xid, pSesInfo, volume_info, cifs_sb, + true); - free_dfs_info_array(referrals, num_referrals); - kfree(fake_devname); - kfree(full_path); - - if (IS_ERR(mount_data)) { - rc = PTR_ERR(mount_data); - mount_data = NULL; - goto mount_fail_check; - } - - if (tcon) - cifs_put_tcon(tcon); - else if (pSesInfo) - cifs_put_smb_ses(pSesInfo); - - cleanup_volume_info(&volume_info); + if (!rc) { referral_walks_count++; - FreeXid(xid); goto try_mount_again; } + goto mount_fail_check; #else /* No DFS support, return error on mount */ rc = -EOPNOTSUPP; #endif @@ -2966,8 +3070,6 @@ remote_path_check: mount_fail_check: /* on error free sesinfo and tcon struct if needed */ if (rc) { - if (mount_data != mount_data_global) - kfree(mount_data); /* If find_unc succeeded then rc == 0 so we can not end */ /* up accidentally freeing someone elses tcon struct */ if (tcon) @@ -3083,7 +3185,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr += strlen("?????"); bcc_ptr += 1; count = bcc_ptr - &pSMB->Password[0]; - pSMB->hdr.smb_buf_length += count; + pSMB->hdr.smb_buf_length = cpu_to_be32(be32_to_cpu( + pSMB->hdr.smb_buf_length) + count); pSMB->ByteCount = cpu_to_le16(count); rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, &length, @@ -3258,7 +3361,9 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, uid_t fsuid) struct cifsSesInfo *ses; struct cifsTconInfo *tcon = NULL; struct smb_vol *vol_info; - char username[MAX_USERNAME_SIZE + 1]; + char username[28]; /* big enough for "krb50x" + hex of ULONG_MAX 6+16 */ + /* We used to have this as MAX_USERNAME which is */ + /* way too big now (256 instead of 32) */ vol_info = kzalloc(sizeof(*vol_info), GFP_KERNEL); if (vol_info == NULL) { diff --git a/fs/cifs/export.c b/fs/cifs/export.c index 993f82045bf..55d87ac5200 100644 --- a/fs/cifs/export.c +++ b/fs/cifs/export.c @@ -45,7 +45,7 @@ #include "cifs_debug.h" #include "cifsfs.h" -#ifdef CONFIG_CIFS_EXPERIMENTAL +#ifdef CIFS_NFSD_EXPORT static struct dentry *cifs_get_parent(struct dentry *dentry) { /* BB need to add code here eventually to enable export via NFSD */ @@ -63,5 +63,5 @@ const struct export_operations cifs_export_ops = { .encode_fs = */ }; -#endif /* EXPERIMENTAL */ +#endif /* CIFS_NFSD_EXPORT */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index faf59529e84..c672afef0c0 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -857,95 +857,6 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset, cifsi->server_eof = end_of_write; } -ssize_t cifs_user_write(struct file *file, const char __user *write_data, - size_t write_size, loff_t *poffset) -{ - struct inode *inode = file->f_path.dentry->d_inode; - int rc = 0; - unsigned int bytes_written = 0; - unsigned int total_written; - struct cifs_sb_info *cifs_sb; - struct cifsTconInfo *pTcon; - int xid; - struct cifsFileInfo *open_file; - struct cifsInodeInfo *cifsi = CIFS_I(inode); - - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - - /* cFYI(1, " write %d bytes to offset %lld of %s", write_size, - *poffset, file->f_path.dentry->d_name.name); */ - - if (file->private_data == NULL) - return -EBADF; - - open_file = file->private_data; - pTcon = tlink_tcon(open_file->tlink); - - rc = generic_write_checks(file, poffset, &write_size, 0); - if (rc) - return rc; - - xid = GetXid(); - - for (total_written = 0; write_size > total_written; - total_written += bytes_written) { - rc = -EAGAIN; - while (rc == -EAGAIN) { - if (file->private_data == NULL) { - /* file has been closed on us */ - FreeXid(xid); - /* if we have gotten here we have written some data - and blocked, and the file has been freed on us while - we blocked so return what we managed to write */ - return total_written; - } - if (open_file->invalidHandle) { - /* we could deadlock if we called - filemap_fdatawait from here so tell - reopen_file not to flush data to server - now */ - rc = cifs_reopen_file(open_file, false); - if (rc != 0) - break; - } - - rc = CIFSSMBWrite(xid, pTcon, - open_file->netfid, - min_t(const int, cifs_sb->wsize, - write_size - total_written), - *poffset, &bytes_written, - NULL, write_data + total_written, 0); - } - if (rc || (bytes_written == 0)) { - if (total_written) - break; - else { - FreeXid(xid); - return rc; - } - } else { - cifs_update_eof(cifsi, *poffset, bytes_written); - *poffset += bytes_written; - } - } - - cifs_stats_bytes_written(pTcon, total_written); - -/* Do not update local mtime - server will set its actual value on write - * inode->i_ctime = inode->i_mtime = - * current_fs_time(inode->i_sb);*/ - if (total_written > 0) { - spin_lock(&inode->i_lock); - if (*poffset > inode->i_size) - i_size_write(inode, *poffset); - spin_unlock(&inode->i_lock); - } - mark_inode_dirty_sync(inode); - - FreeXid(xid); - return total_written; -} - static ssize_t cifs_write(struct cifsFileInfo *open_file, const char *write_data, size_t write_size, loff_t *poffset) @@ -1420,9 +1331,10 @@ retry_write: return rc; } -static int cifs_writepage(struct page *page, struct writeback_control *wbc) +static int +cifs_writepage_locked(struct page *page, struct writeback_control *wbc) { - int rc = -EFAULT; + int rc; int xid; xid = GetXid(); @@ -1442,15 +1354,29 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc) * to fail to update with the state of the page correctly. */ set_page_writeback(page); +retry_write: rc = cifs_partialpagewrite(page, 0, PAGE_CACHE_SIZE); - SetPageUptodate(page); /* BB add check for error and Clearuptodate? */ - unlock_page(page); + if (rc == -EAGAIN && wbc->sync_mode == WB_SYNC_ALL) + goto retry_write; + else if (rc == -EAGAIN) + redirty_page_for_writepage(wbc, page); + else if (rc != 0) + SetPageError(page); + else + SetPageUptodate(page); end_page_writeback(page); page_cache_release(page); FreeXid(xid); return rc; } +static int cifs_writepage(struct page *page, struct writeback_control *wbc) +{ + int rc = cifs_writepage_locked(page, wbc); + unlock_page(page); + return rc; +} + static int cifs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) @@ -1519,8 +1445,13 @@ int cifs_strict_fsync(struct file *file, int datasync) cFYI(1, "Sync file - name: %s datasync: 0x%x", file->f_path.dentry->d_name.name, datasync); - if (!CIFS_I(inode)->clientCanCacheRead) - cifs_invalidate_mapping(inode); + if (!CIFS_I(inode)->clientCanCacheRead) { + rc = cifs_invalidate_mapping(inode); + if (rc) { + cFYI(1, "rc: %d during invalidate phase", rc); + rc = 0; /* don't care about it in fsync */ + } + } tcon = tlink_tcon(smbfile->tlink); if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) @@ -1726,7 +1657,7 @@ cifs_iovec_write(struct file *file, const struct iovec *iov, return total_written; } -static ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, +ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { ssize_t written; @@ -1849,17 +1780,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, return total_read; } -ssize_t cifs_user_read(struct file *file, char __user *read_data, - size_t read_size, loff_t *poffset) -{ - struct iovec iov; - iov.iov_base = read_data; - iov.iov_len = read_size; - - return cifs_iovec_read(file, &iov, 1, poffset); -} - -static ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, +ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { ssize_t read; @@ -1987,8 +1908,11 @@ int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) xid = GetXid(); - if (!CIFS_I(inode)->clientCanCacheRead) - cifs_invalidate_mapping(inode); + if (!CIFS_I(inode)->clientCanCacheRead) { + rc = cifs_invalidate_mapping(inode); + if (rc) + return rc; + } rc = generic_file_mmap(file, vma); if (rc == 0) @@ -2415,6 +2339,27 @@ static void cifs_invalidate_page(struct page *page, unsigned long offset) cifs_fscache_invalidate_page(page, &cifsi->vfs_inode); } +static int cifs_launder_page(struct page *page) +{ + int rc = 0; + loff_t range_start = page_offset(page); + loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, + .range_start = range_start, + .range_end = range_end, + }; + + cFYI(1, "Launder page: %p", page); + + if (clear_page_dirty_for_io(page)) + rc = cifs_writepage_locked(page, &wbc); + + cifs_fscache_invalidate_page(page, page->mapping->host); + return rc; +} + void cifs_oplock_break(struct work_struct *work) { struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, @@ -2486,7 +2431,7 @@ const struct address_space_operations cifs_addr_ops = { .set_page_dirty = __set_page_dirty_nobuffers, .releasepage = cifs_release_page, .invalidatepage = cifs_invalidate_page, - /* .direct_IO = */ + .launder_page = cifs_launder_page, }; /* @@ -2503,5 +2448,5 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .set_page_dirty = __set_page_dirty_nobuffers, .releasepage = cifs_release_page, .invalidatepage = cifs_invalidate_page, - /* .direct_IO = */ + .launder_page = cifs_launder_page, }; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 8852470b4fb..de02ed5e25c 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -878,7 +878,7 @@ retry_iget5_locked: } /* gets root inode */ -struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) +struct inode *cifs_root_iget(struct super_block *sb) { int xid; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -1683,71 +1683,70 @@ cifs_inode_needs_reval(struct inode *inode) /* * Zap the cache. Called when invalid_mapping flag is set. */ -void +int cifs_invalidate_mapping(struct inode *inode) { - int rc; + int rc = 0; struct cifsInodeInfo *cifs_i = CIFS_I(inode); cifs_i->invalid_mapping = false; - /* write back any cached data */ if (inode->i_mapping && inode->i_mapping->nrpages != 0) { - rc = filemap_write_and_wait(inode->i_mapping); - mapping_set_error(inode->i_mapping, rc); + rc = invalidate_inode_pages2(inode->i_mapping); + if (rc) { + cERROR(1, "%s: could not invalidate inode %p", __func__, + inode); + cifs_i->invalid_mapping = true; + } } - invalidate_remote_inode(inode); + cifs_fscache_reset_inode_cookie(inode); + return rc; } -int cifs_revalidate_file(struct file *filp) +int cifs_revalidate_file_attr(struct file *filp) { int rc = 0; struct inode *inode = filp->f_path.dentry->d_inode; struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data; if (!cifs_inode_needs_reval(inode)) - goto check_inval; + return rc; if (tlink_tcon(cfile->tlink)->unix_ext) rc = cifs_get_file_info_unix(filp); else rc = cifs_get_file_info(filp); -check_inval: - if (CIFS_I(inode)->invalid_mapping) - cifs_invalidate_mapping(inode); - return rc; } -/* revalidate a dentry's inode attributes */ -int cifs_revalidate_dentry(struct dentry *dentry) +int cifs_revalidate_dentry_attr(struct dentry *dentry) { int xid; int rc = 0; - char *full_path = NULL; struct inode *inode = dentry->d_inode; struct super_block *sb = dentry->d_sb; + char *full_path = NULL; if (inode == NULL) return -ENOENT; - xid = GetXid(); - if (!cifs_inode_needs_reval(inode)) - goto check_inval; + return rc; + + xid = GetXid(); /* can not safely grab the rename sem here if rename calls revalidate since that would deadlock */ full_path = build_path_from_dentry(dentry); if (full_path == NULL) { rc = -ENOMEM; - goto check_inval; + goto out; } - cFYI(1, "Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld " - "jiffies %ld", full_path, inode, inode->i_count.counter, + cFYI(1, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time " + "%ld jiffies %ld", full_path, inode, inode->i_count.counter, dentry, dentry->d_time, jiffies); if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext) @@ -1756,41 +1755,83 @@ int cifs_revalidate_dentry(struct dentry *dentry) rc = cifs_get_inode_info(&inode, full_path, NULL, sb, xid, NULL); -check_inval: - if (CIFS_I(inode)->invalid_mapping) - cifs_invalidate_mapping(inode); - +out: kfree(full_path); FreeXid(xid); return rc; } +int cifs_revalidate_file(struct file *filp) +{ + int rc; + struct inode *inode = filp->f_path.dentry->d_inode; + + rc = cifs_revalidate_file_attr(filp); + if (rc) + return rc; + + if (CIFS_I(inode)->invalid_mapping) + rc = cifs_invalidate_mapping(inode); + return rc; +} + +/* revalidate a dentry's inode attributes */ +int cifs_revalidate_dentry(struct dentry *dentry) +{ + int rc; + struct inode *inode = dentry->d_inode; + + rc = cifs_revalidate_dentry_attr(dentry); + if (rc) + return rc; + + if (CIFS_I(inode)->invalid_mapping) + rc = cifs_invalidate_mapping(inode); + return rc; +} + int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); struct cifsTconInfo *tcon = cifs_sb_master_tcon(cifs_sb); - int err = cifs_revalidate_dentry(dentry); - - if (!err) { - generic_fillattr(dentry->d_inode, stat); - stat->blksize = CIFS_MAX_MSGSIZE; - stat->ino = CIFS_I(dentry->d_inode)->uniqueid; + struct inode *inode = dentry->d_inode; + int rc; - /* - * If on a multiuser mount without unix extensions, and the - * admin hasn't overridden them, set the ownership to the - * fsuid/fsgid of the current process. - */ - if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) && - !tcon->unix_ext) { - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) - stat->uid = current_fsuid(); - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)) - stat->gid = current_fsgid(); + /* + * We need to be sure that all dirty pages are written and the server + * has actual ctime, mtime and file length. + */ + if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping && + inode->i_mapping->nrpages != 0) { + rc = filemap_fdatawait(inode->i_mapping); + if (rc) { + mapping_set_error(inode->i_mapping, rc); + return rc; } } - return err; + + rc = cifs_revalidate_dentry_attr(dentry); + if (rc) + return rc; + + generic_fillattr(inode, stat); + stat->blksize = CIFS_MAX_MSGSIZE; + stat->ino = CIFS_I(inode)->uniqueid; + + /* + * If on a multiuser mount without unix extensions, and the admin hasn't + * overridden them, set the ownership to the fsuid/fsgid of the current + * process. + */ + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) && + !tcon->unix_ext) { + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) + stat->uid = current_fsuid(); + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)) + stat->gid = current_fsgid(); + } + return rc; } static int cifs_truncate_page(struct address_space *mapping, loff_t from) diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 0c684ae4c07..907531ac588 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -304,12 +304,10 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ , memset(temp, 0, 256); /* bigger than MAX_CIFS_HDR_SIZE */ - buffer->smb_buf_length = + buffer->smb_buf_length = cpu_to_be32( (2 * word_count) + sizeof(struct smb_hdr) - 4 /* RFC 1001 length field does not count */ + - 2 /* for bcc field itself */ ; - /* Note that this is the only network field that has to be converted - to big endian and it is done just before we send it */ + 2 /* for bcc field itself */) ; buffer->Protocol[0] = 0xFF; buffer->Protocol[1] = 'S'; @@ -424,7 +422,7 @@ check_smb_hdr(struct smb_hdr *smb, __u16 mid) int checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) { - __u32 len = smb->smb_buf_length; + __u32 len = be32_to_cpu(smb->smb_buf_length); __u32 clc_len; /* calculated length */ cFYI(0, "checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len); @@ -464,7 +462,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length) if (check_smb_hdr(smb, mid)) return 1; - clc_len = smbCalcSize_LE(smb); + clc_len = smbCalcSize(smb); if (4 + len != length) { cERROR(1, "Length read does not match RFC1001 length %d", @@ -521,7 +519,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) (struct smb_com_transaction_change_notify_rsp *)buf; struct file_notify_information *pnotify; __u32 data_offset = 0; - if (get_bcc_le(buf) > sizeof(struct file_notify_information)) { + if (get_bcc(buf) > sizeof(struct file_notify_information)) { data_offset = le32_to_cpu(pSMBr->DataOffset); pnotify = (struct file_notify_information *) diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 79f641eeda3..79b71c2c7c9 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -919,13 +919,6 @@ smbCalcSize(struct smb_hdr *ptr) 2 /* size of the bcc field */ + get_bcc(ptr)); } -unsigned int -smbCalcSize_LE(struct smb_hdr *ptr) -{ - return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + - 2 /* size of the bcc field */ + get_bcc_le(ptr)); -} - /* The following are taken from fs/ntfs/util.c */ #define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000) diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 645114ad0a1..7dd46210037 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -621,7 +621,7 @@ ssetup_ntlmssp_authenticate: and rest of bcc area. This allows us to avoid a large buffer 17K allocation */ iov[0].iov_base = (char *)pSMB; - iov[0].iov_len = smb_buf->smb_buf_length + 4; + iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4; /* setting this here allows the code at the end of the function to free the request buffer if there's an error */ @@ -656,7 +656,7 @@ ssetup_ntlmssp_authenticate: * to use challenge/response method (i.e. Password bit is 1). */ - calc_lanman_hash(ses->password, ses->server->cryptkey, + rc = calc_lanman_hash(ses->password, ses->server->cryptkey, ses->server->secMode & SECMODE_PW_ENCRYPT ? true : false, lnm_session_key); @@ -859,9 +859,10 @@ ssetup_ntlmssp_authenticate: iov[2].iov_len = (long) bcc_ptr - (long) str_area; count = iov[1].iov_len + iov[2].iov_len; - smb_buf->smb_buf_length += count; + smb_buf->smb_buf_length = + cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count); - put_bcc_le(count, smb_buf); + put_bcc(count, smb_buf); rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type, CIFS_LOG_ERROR); diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c deleted file mode 100644 index 04721485925..00000000000 --- a/fs/cifs/smbdes.c +++ /dev/null @@ -1,418 +0,0 @@ -/* - Unix SMB/Netbios implementation. - Version 1.9. - - a partial implementation of DES designed for use in the - SMB authentication protocol - - Copyright (C) Andrew Tridgell 1998 - Modified by Steve French (sfrench@us.ibm.com) 2002,2004 - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -/* NOTES: - - This code makes no attempt to be fast! In fact, it is a very - slow implementation - - This code is NOT a complete DES implementation. It implements only - the minimum necessary for SMB authentication, as used by all SMB - products (including every copy of Microsoft Windows95 ever sold) - - In particular, it can only do a unchained forward DES pass. This - means it is not possible to use this code for encryption/decryption - of data, instead it is only useful as a "hash" algorithm. - - There is no entry point into this code that allows normal DES operation. - - I believe this means that this code does not come under ITAR - regulations but this is NOT a legal opinion. If you are concerned - about the applicability of ITAR regulations to this code then you - should confirm it for yourself (and maybe let me know if you come - up with a different answer to the one above) -*/ -#include <linux/slab.h> -#define uchar unsigned char - -static uchar perm1[56] = { 57, 49, 41, 33, 25, 17, 9, - 1, 58, 50, 42, 34, 26, 18, - 10, 2, 59, 51, 43, 35, 27, - 19, 11, 3, 60, 52, 44, 36, - 63, 55, 47, 39, 31, 23, 15, - 7, 62, 54, 46, 38, 30, 22, - 14, 6, 61, 53, 45, 37, 29, - 21, 13, 5, 28, 20, 12, 4 -}; - -static uchar perm2[48] = { 14, 17, 11, 24, 1, 5, - 3, 28, 15, 6, 21, 10, - 23, 19, 12, 4, 26, 8, - 16, 7, 27, 20, 13, 2, - 41, 52, 31, 37, 47, 55, - 30, 40, 51, 45, 33, 48, - 44, 49, 39, 56, 34, 53, - 46, 42, 50, 36, 29, 32 -}; - -static uchar perm3[64] = { 58, 50, 42, 34, 26, 18, 10, 2, - 60, 52, 44, 36, 28, 20, 12, 4, - 62, 54, 46, 38, 30, 22, 14, 6, - 64, 56, 48, 40, 32, 24, 16, 8, - 57, 49, 41, 33, 25, 17, 9, 1, - 59, 51, 43, 35, 27, 19, 11, 3, - 61, 53, 45, 37, 29, 21, 13, 5, - 63, 55, 47, 39, 31, 23, 15, 7 -}; - -static uchar perm4[48] = { 32, 1, 2, 3, 4, 5, - 4, 5, 6, 7, 8, 9, - 8, 9, 10, 11, 12, 13, - 12, 13, 14, 15, 16, 17, - 16, 17, 18, 19, 20, 21, - 20, 21, 22, 23, 24, 25, - 24, 25, 26, 27, 28, 29, - 28, 29, 30, 31, 32, 1 -}; - -static uchar perm5[32] = { 16, 7, 20, 21, - 29, 12, 28, 17, - 1, 15, 23, 26, - 5, 18, 31, 10, - 2, 8, 24, 14, - 32, 27, 3, 9, - 19, 13, 30, 6, - 22, 11, 4, 25 -}; - -static uchar perm6[64] = { 40, 8, 48, 16, 56, 24, 64, 32, - 39, 7, 47, 15, 55, 23, 63, 31, - 38, 6, 46, 14, 54, 22, 62, 30, - 37, 5, 45, 13, 53, 21, 61, 29, - 36, 4, 44, 12, 52, 20, 60, 28, - 35, 3, 43, 11, 51, 19, 59, 27, - 34, 2, 42, 10, 50, 18, 58, 26, - 33, 1, 41, 9, 49, 17, 57, 25 -}; - -static uchar sc[16] = { 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 }; - -static uchar sbox[8][4][16] = { - {{14, 4, 13, 1, 2, 15, 11, 8, 3, 10, 6, 12, 5, 9, 0, 7}, - {0, 15, 7, 4, 14, 2, 13, 1, 10, 6, 12, 11, 9, 5, 3, 8}, - {4, 1, 14, 8, 13, 6, 2, 11, 15, 12, 9, 7, 3, 10, 5, 0}, - {15, 12, 8, 2, 4, 9, 1, 7, 5, 11, 3, 14, 10, 0, 6, 13} }, - - {{15, 1, 8, 14, 6, 11, 3, 4, 9, 7, 2, 13, 12, 0, 5, 10}, - {3, 13, 4, 7, 15, 2, 8, 14, 12, 0, 1, 10, 6, 9, 11, 5}, - {0, 14, 7, 11, 10, 4, 13, 1, 5, 8, 12, 6, 9, 3, 2, 15}, - {13, 8, 10, 1, 3, 15, 4, 2, 11, 6, 7, 12, 0, 5, 14, 9} }, - - {{10, 0, 9, 14, 6, 3, 15, 5, 1, 13, 12, 7, 11, 4, 2, 8}, - {13, 7, 0, 9, 3, 4, 6, 10, 2, 8, 5, 14, 12, 11, 15, 1}, - {13, 6, 4, 9, 8, 15, 3, 0, 11, 1, 2, 12, 5, 10, 14, 7}, - {1, 10, 13, 0, 6, 9, 8, 7, 4, 15, 14, 3, 11, 5, 2, 12} }, - - {{7, 13, 14, 3, 0, 6, 9, 10, 1, 2, 8, 5, 11, 12, 4, 15}, - {13, 8, 11, 5, 6, 15, 0, 3, 4, 7, 2, 12, 1, 10, 14, 9}, - {10, 6, 9, 0, 12, 11, 7, 13, 15, 1, 3, 14, 5, 2, 8, 4}, - {3, 15, 0, 6, 10, 1, 13, 8, 9, 4, 5, 11, 12, 7, 2, 14} }, - - {{2, 12, 4, 1, 7, 10, 11, 6, 8, 5, 3, 15, 13, 0, 14, 9}, - {14, 11, 2, 12, 4, 7, 13, 1, 5, 0, 15, 10, 3, 9, 8, 6}, - {4, 2, 1, 11, 10, 13, 7, 8, 15, 9, 12, 5, 6, 3, 0, 14}, - {11, 8, 12, 7, 1, 14, 2, 13, 6, 15, 0, 9, 10, 4, 5, 3} }, - - {{12, 1, 10, 15, 9, 2, 6, 8, 0, 13, 3, 4, 14, 7, 5, 11}, - {10, 15, 4, 2, 7, 12, 9, 5, 6, 1, 13, 14, 0, 11, 3, 8}, - {9, 14, 15, 5, 2, 8, 12, 3, 7, 0, 4, 10, 1, 13, 11, 6}, - {4, 3, 2, 12, 9, 5, 15, 10, 11, 14, 1, 7, 6, 0, 8, 13} }, - - {{4, 11, 2, 14, 15, 0, 8, 13, 3, 12, 9, 7, 5, 10, 6, 1}, - {13, 0, 11, 7, 4, 9, 1, 10, 14, 3, 5, 12, 2, 15, 8, 6}, - {1, 4, 11, 13, 12, 3, 7, 14, 10, 15, 6, 8, 0, 5, 9, 2}, - {6, 11, 13, 8, 1, 4, 10, 7, 9, 5, 0, 15, 14, 2, 3, 12} }, - - {{13, 2, 8, 4, 6, 15, 11, 1, 10, 9, 3, 14, 5, 0, 12, 7}, - {1, 15, 13, 8, 10, 3, 7, 4, 12, 5, 6, 11, 0, 14, 9, 2}, - {7, 11, 4, 1, 9, 12, 14, 2, 0, 6, 10, 13, 15, 3, 5, 8}, - {2, 1, 14, 7, 4, 10, 8, 13, 15, 12, 9, 0, 3, 5, 6, 11} } -}; - -static void -permute(char *out, char *in, uchar *p, int n) -{ - int i; - for (i = 0; i < n; i++) - out[i] = in[p[i] - 1]; -} - -static void -lshift(char *d, int count, int n) -{ - char out[64]; - int i; - for (i = 0; i < n; i++) - out[i] = d[(i + count) % n]; - for (i = 0; i < n; i++) - d[i] = out[i]; -} - -static void -concat(char *out, char *in1, char *in2, int l1, int l2) -{ - while (l1--) - *out++ = *in1++; - while (l2--) - *out++ = *in2++; -} - -static void -xor(char *out, char *in1, char *in2, int n) -{ - int i; - for (i = 0; i < n; i++) - out[i] = in1[i] ^ in2[i]; -} - -static void -dohash(char *out, char *in, char *key, int forw) -{ - int i, j, k; - char *pk1; - char c[28]; - char d[28]; - char *cd; - char (*ki)[48]; - char *pd1; - char l[32], r[32]; - char *rl; - - /* Have to reduce stack usage */ - pk1 = kmalloc(56+56+64+64, GFP_KERNEL); - if (pk1 == NULL) - return; - - ki = kmalloc(16*48, GFP_KERNEL); - if (ki == NULL) { - kfree(pk1); - return; - } - - cd = pk1 + 56; - pd1 = cd + 56; - rl = pd1 + 64; - - permute(pk1, key, perm1, 56); - - for (i = 0; i < 28; i++) - c[i] = pk1[i]; - for (i = 0; i < 28; i++) - d[i] = pk1[i + 28]; - - for (i = 0; i < 16; i++) { - lshift(c, sc[i], 28); - lshift(d, sc[i], 28); - - concat(cd, c, d, 28, 28); - permute(ki[i], cd, perm2, 48); - } - - permute(pd1, in, perm3, 64); - - for (j = 0; j < 32; j++) { - l[j] = pd1[j]; - r[j] = pd1[j + 32]; - } - - for (i = 0; i < 16; i++) { - char *er; /* er[48] */ - char *erk; /* erk[48] */ - char b[8][6]; - char *cb; /* cb[32] */ - char *pcb; /* pcb[32] */ - char *r2; /* r2[32] */ - - er = kmalloc(48+48+32+32+32, GFP_KERNEL); - if (er == NULL) { - kfree(pk1); - kfree(ki); - return; - } - erk = er+48; - cb = erk+48; - pcb = cb+32; - r2 = pcb+32; - - permute(er, r, perm4, 48); - - xor(erk, er, ki[forw ? i : 15 - i], 48); - - for (j = 0; j < 8; j++) - for (k = 0; k < 6; k++) - b[j][k] = erk[j * 6 + k]; - - for (j = 0; j < 8; j++) { - int m, n; - m = (b[j][0] << 1) | b[j][5]; - - n = (b[j][1] << 3) | (b[j][2] << 2) | (b[j][3] << - 1) | b[j][4]; - - for (k = 0; k < 4; k++) - b[j][k] = - (sbox[j][m][n] & (1 << (3 - k))) ? 1 : 0; - } - - for (j = 0; j < 8; j++) - for (k = 0; k < 4; k++) - cb[j * 4 + k] = b[j][k]; - permute(pcb, cb, perm5, 32); - - xor(r2, l, pcb, 32); - - for (j = 0; j < 32; j++) - l[j] = r[j]; - - for (j = 0; j < 32; j++) - r[j] = r2[j]; - - kfree(er); - } - - concat(rl, r, l, 32, 32); - - permute(out, rl, perm6, 64); - kfree(pk1); - kfree(ki); -} - -static void -str_to_key(unsigned char *str, unsigned char *key) -{ - int i; - - key[0] = str[0] >> 1; - key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2); - key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3); - key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4); - key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5); - key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6); - key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7); - key[7] = str[6] & 0x7F; - for (i = 0; i < 8; i++) - key[i] = (key[i] << 1); -} - -static void -smbhash(unsigned char *out, const unsigned char *in, unsigned char *key, - int forw) -{ - int i; - char *outb; /* outb[64] */ - char *inb; /* inb[64] */ - char *keyb; /* keyb[64] */ - unsigned char key2[8]; - - outb = kmalloc(64 * 3, GFP_KERNEL); - if (outb == NULL) - return; - - inb = outb + 64; - keyb = inb + 64; - - str_to_key(key, key2); - - for (i = 0; i < 64; i++) { - inb[i] = (in[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0; - keyb[i] = (key2[i / 8] & (1 << (7 - (i % 8)))) ? 1 : 0; - outb[i] = 0; - } - - dohash(outb, inb, keyb, forw); - - for (i = 0; i < 8; i++) - out[i] = 0; - - for (i = 0; i < 64; i++) { - if (outb[i]) - out[i / 8] |= (1 << (7 - (i % 8))); - } - kfree(outb); -} - -void -E_P16(unsigned char *p14, unsigned char *p16) -{ - unsigned char sp8[8] = - { 0x4b, 0x47, 0x53, 0x21, 0x40, 0x23, 0x24, 0x25 }; - smbhash(p16, sp8, p14, 1); - smbhash(p16 + 8, sp8, p14 + 7, 1); -} - -void -E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24) -{ - smbhash(p24, c8, p21, 1); - smbhash(p24 + 8, c8, p21 + 7, 1); - smbhash(p24 + 16, c8, p21 + 14, 1); -} - -#if 0 /* currently unused */ -static void -D_P16(unsigned char *p14, unsigned char *in, unsigned char *out) -{ - smbhash(out, in, p14, 0); - smbhash(out + 8, in + 8, p14 + 7, 0); -} - -static void -E_old_pw_hash(unsigned char *p14, unsigned char *in, unsigned char *out) -{ - smbhash(out, in, p14, 1); - smbhash(out + 8, in + 8, p14 + 7, 1); -} -/* these routines are currently unneeded, but may be - needed later */ -void -cred_hash1(unsigned char *out, unsigned char *in, unsigned char *key) -{ - unsigned char buf[8]; - - smbhash(buf, in, key, 1); - smbhash(out, buf, key + 9, 1); -} - -void -cred_hash2(unsigned char *out, unsigned char *in, unsigned char *key) -{ - unsigned char buf[8]; - static unsigned char key2[8]; - - smbhash(buf, in, key, 1); - key2[0] = key[7]; - smbhash(out, buf, key2, 1); -} - -void -cred_hash3(unsigned char *out, unsigned char *in, unsigned char *key, int forw) -{ - static unsigned char key2[8]; - - smbhash(out, in, key, forw); - key2[0] = key[7]; - smbhash(out + 8, in + 8, key2, forw); -} -#endif /* unneeded routines */ diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index b5041c84998..1525d5e662b 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -47,6 +47,88 @@ #define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8) #define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val))) +static void +str_to_key(unsigned char *str, unsigned char *key) +{ + int i; + + key[0] = str[0] >> 1; + key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2); + key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3); + key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4); + key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5); + key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6); + key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7); + key[7] = str[6] & 0x7F; + for (i = 0; i < 8; i++) + key[i] = (key[i] << 1); +} + +static int +smbhash(unsigned char *out, const unsigned char *in, unsigned char *key) +{ + int rc; + unsigned char key2[8]; + struct crypto_blkcipher *tfm_des; + struct scatterlist sgin, sgout; + struct blkcipher_desc desc; + + str_to_key(key, key2); + + tfm_des = crypto_alloc_blkcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm_des)) { + rc = PTR_ERR(tfm_des); + cERROR(1, "could not allocate des crypto API\n"); + goto smbhash_err; + } + + desc.tfm = tfm_des; + + crypto_blkcipher_setkey(tfm_des, key2, 8); + + sg_init_one(&sgin, in, 8); + sg_init_one(&sgout, out, 8); + + rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, 8); + if (rc) { + cERROR(1, "could not encrypt crypt key rc: %d\n", rc); + crypto_free_blkcipher(tfm_des); + goto smbhash_err; + } + +smbhash_err: + return rc; +} + +static int +E_P16(unsigned char *p14, unsigned char *p16) +{ + int rc; + unsigned char sp8[8] = + { 0x4b, 0x47, 0x53, 0x21, 0x40, 0x23, 0x24, 0x25 }; + + rc = smbhash(p16, sp8, p14); + if (rc) + return rc; + rc = smbhash(p16 + 8, sp8, p14 + 7); + return rc; +} + +static int +E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24) +{ + int rc; + + rc = smbhash(p24, c8, p21); + if (rc) + return rc; + rc = smbhash(p24 + 8, c8, p21 + 7); + if (rc) + return rc; + rc = smbhash(p24 + 16, c8, p21 + 14); + return rc; +} + /* produce a md4 message digest from data of length n bytes */ int mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len) @@ -87,40 +169,30 @@ mdfour_err: return rc; } -/* Does the des encryption from the NT or LM MD4 hash. */ -static void -SMBOWFencrypt(unsigned char passwd[16], const unsigned char *c8, - unsigned char p24[24]) -{ - unsigned char p21[21]; - - memset(p21, '\0', 21); - - memcpy(p21, passwd, 16); - E_P24(p21, c8, p24); -} - /* This implements the X/Open SMB password encryption It takes a password, a 8 byte "crypt key" and puts 24 bytes of encrypted password into p24 */ /* Note that password must be uppercased and null terminated */ -void +int SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24) { - unsigned char p14[15], p21[21]; + int rc; + unsigned char p14[14], p16[16], p21[21]; - memset(p21, '\0', 21); memset(p14, '\0', 14); - strncpy((char *) p14, (char *) passwd, 14); + memset(p16, '\0', 16); + memset(p21, '\0', 21); -/* strupper((char *)p14); *//* BB at least uppercase the easy range */ - E_P16(p14, p21); + memcpy(p14, passwd, 14); + rc = E_P16(p14, p16); + if (rc) + return rc; - SMBOWFencrypt(p21, c8, p24); + memcpy(p21, p16, 16); + rc = E_P24(p21, c8, p24); - memset(p14, 0, 15); - memset(p21, 0, 21); + return rc; } /* Routines for Windows NT MD4 Hash functions. */ @@ -279,16 +351,18 @@ int SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24) { int rc; - unsigned char p21[21]; + unsigned char p16[16], p21[21]; + memset(p16, '\0', 16); memset(p21, '\0', 21); - rc = E_md4hash(passwd, p21); + rc = E_md4hash(passwd, p16); if (rc) { cFYI(1, "%s Can't generate NT hash, error: %d", __func__, rc); return rc; } - SMBOWFencrypt(p21, c8, p24); + memcpy(p21, p16, 16); + rc = E_P24(p21, c8, p24); return rc; } diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 46d8756f2b2..f2513fb8c39 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -129,7 +129,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) unsigned int len = iov[0].iov_len; unsigned int total_len; int first_vec = 0; - unsigned int smb_buf_length = smb_buffer->smb_buf_length; + unsigned int smb_buf_length = be32_to_cpu(smb_buffer->smb_buf_length); struct socket *ssocket = server->ssocket; if (ssocket == NULL) @@ -144,17 +144,10 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) else smb_msg.msg_flags = MSG_NOSIGNAL; - /* smb header is converted in header_assemble. bcc and rest of SMB word - area, and byte area if necessary, is converted to littleendian in - cifssmb.c and RFC1001 len is converted to bigendian in smb_send - Flags2 is converted in SendReceive */ - - total_len = 0; for (i = 0; i < n_vec; i++) total_len += iov[i].iov_len; - smb_buffer->smb_buf_length = cpu_to_be32(smb_buffer->smb_buf_length); cFYI(1, "Sending smb: total_len %d", total_len); dump_smb(smb_buffer, len); @@ -243,7 +236,7 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec) /* Don't want to modify the buffer as a side effect of this call. */ - smb_buffer->smb_buf_length = smb_buf_length; + smb_buffer->smb_buf_length = cpu_to_be32(smb_buf_length); return rc; } @@ -387,7 +380,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf, #ifdef CONFIG_CIFS_STATS2 atomic_inc(&server->inSend); #endif - rc = smb_send(server, in_buf, in_buf->smb_buf_length); + rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); #ifdef CONFIG_CIFS_STATS2 atomic_dec(&server->inSend); mid->when_sent = jiffies; @@ -422,7 +415,7 @@ SendReceiveNoRsp(const unsigned int xid, struct cifsSesInfo *ses, int resp_buf_type; iov[0].iov_base = (char *)in_buf; - iov[0].iov_len = in_buf->smb_buf_length + 4; + iov[0].iov_len = be32_to_cpu(in_buf->smb_buf_length) + 4; flags |= CIFS_NO_RESP; rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags); cFYI(DBG2, "SendRcvNoRsp flags %d rc %d", flags, rc); @@ -488,10 +481,10 @@ send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf, int rc = 0; /* -4 for RFC1001 length and +2 for BCC field */ - in_buf->smb_buf_length = sizeof(struct smb_hdr) - 4 + 2; + in_buf->smb_buf_length = cpu_to_be32(sizeof(struct smb_hdr) - 4 + 2); in_buf->Command = SMB_COM_NT_CANCEL; in_buf->WordCount = 0; - put_bcc_le(0, in_buf); + put_bcc(0, in_buf); mutex_lock(&server->srv_mutex); rc = cifs_sign_smb(in_buf, server, &mid->sequence_number); @@ -499,7 +492,7 @@ send_nt_cancel(struct TCP_Server_Info *server, struct smb_hdr *in_buf, mutex_unlock(&server->srv_mutex); return rc; } - rc = smb_send(server, in_buf, in_buf->smb_buf_length); + rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); mutex_unlock(&server->srv_mutex); cFYI(1, "issued NT_CANCEL for mid %u, rc = %d", @@ -612,7 +605,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, return rc; } - receive_len = midQ->resp_buf->smb_buf_length; + receive_len = be32_to_cpu(midQ->resp_buf->smb_buf_length); if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { cERROR(1, "Frame too large received. Length: %d Xid: %d", @@ -651,11 +644,6 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, rc = map_smb_to_linux_error(midQ->resp_buf, flags & CIFS_LOG_ERROR); - /* convert ByteCount if necessary */ - if (receive_len >= sizeof(struct smb_hdr) - 4 - /* do not count RFC1001 header */ + - (2 * midQ->resp_buf->WordCount) + 2 /* bcc */ ) - put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf); if ((flags & CIFS_NO_RESP) == 0) midQ->resp_buf = NULL; /* mark it so buf will not be freed by @@ -698,9 +686,10 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, to the same server. We may make this configurable later or use ses->maxReq */ - if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { + if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize + + MAX_CIFS_HDR_SIZE - 4) { cERROR(1, "Illegal length, greater than maximum frame, %d", - in_buf->smb_buf_length); + be32_to_cpu(in_buf->smb_buf_length)); return -EIO; } @@ -733,7 +722,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, #ifdef CONFIG_CIFS_STATS2 atomic_inc(&ses->server->inSend); #endif - rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length); + rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); #ifdef CONFIG_CIFS_STATS2 atomic_dec(&ses->server->inSend); midQ->when_sent = jiffies; @@ -768,7 +757,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, return rc; } - receive_len = midQ->resp_buf->smb_buf_length; + receive_len = be32_to_cpu(midQ->resp_buf->smb_buf_length); if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { cERROR(1, "Frame too large received. Length: %d Xid: %d", @@ -781,7 +770,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, if (midQ->resp_buf && out_buf && (midQ->midState == MID_RESPONSE_RECEIVED)) { - out_buf->smb_buf_length = receive_len; + out_buf->smb_buf_length = cpu_to_be32(receive_len); memcpy((char *)out_buf + 4, (char *)midQ->resp_buf + 4, receive_len); @@ -800,16 +789,10 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, } } - *pbytes_returned = out_buf->smb_buf_length; + *pbytes_returned = be32_to_cpu(out_buf->smb_buf_length); /* BB special case reconnect tid and uid here? */ rc = map_smb_to_linux_error(out_buf, 0 /* no log */ ); - - /* convert ByteCount if necessary */ - if (receive_len >= sizeof(struct smb_hdr) - 4 - /* do not count RFC1001 header */ + - (2 * out_buf->WordCount) + 2 /* bcc */ ) - put_bcc(get_bcc_le(midQ->resp_buf), midQ->resp_buf); } else { rc = -EIO; cERROR(1, "Bad MID state?"); @@ -877,9 +860,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, to the same server. We may make this configurable later or use ses->maxReq */ - if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { + if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize + + MAX_CIFS_HDR_SIZE - 4) { cERROR(1, "Illegal length, greater than maximum frame, %d", - in_buf->smb_buf_length); + be32_to_cpu(in_buf->smb_buf_length)); return -EIO; } @@ -910,7 +894,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, #ifdef CONFIG_CIFS_STATS2 atomic_inc(&ses->server->inSend); #endif - rc = smb_send(ses->server, in_buf, in_buf->smb_buf_length); + rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); #ifdef CONFIG_CIFS_STATS2 atomic_dec(&ses->server->inSend); midQ->when_sent = jiffies; @@ -977,7 +961,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, if (rc != 0) return rc; - receive_len = midQ->resp_buf->smb_buf_length; + receive_len = be32_to_cpu(midQ->resp_buf->smb_buf_length); if (receive_len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE) { cERROR(1, "Frame too large received. Length: %d Xid: %d", receive_len, xid); @@ -993,7 +977,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, goto out; } - out_buf->smb_buf_length = receive_len; + out_buf->smb_buf_length = cpu_to_be32(receive_len); memcpy((char *)out_buf + 4, (char *)midQ->resp_buf + 4, receive_len); @@ -1012,17 +996,11 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, } } - *pbytes_returned = out_buf->smb_buf_length; + *pbytes_returned = be32_to_cpu(out_buf->smb_buf_length); /* BB special case reconnect tid and uid here? */ rc = map_smb_to_linux_error(out_buf, 0 /* no log */ ); - /* convert ByteCount if necessary */ - if (receive_len >= sizeof(struct smb_hdr) - 4 - /* do not count RFC1001 header */ + - (2 * out_buf->WordCount) + 2 /* bcc */ ) - put_bcc(get_bcc_le(out_buf), out_buf); - out: delete_mid(midQ); if (rstart && rc == -EACCES) diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index eae2a149160..912995e013e 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -112,6 +112,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, struct cifsTconInfo *pTcon; struct super_block *sb; char *full_path; + struct cifs_ntsd *pacl; if (direntry == NULL) return -EIO; @@ -166,6 +167,25 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, rc = CIFSSMBSetEA(xid, pTcon, full_path, ea_name, ea_value, (__u16)value_size, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, + strlen(CIFS_XATTR_CIFS_ACL)) == 0) { + pacl = kmalloc(value_size, GFP_KERNEL); + if (!pacl) { + cFYI(1, "%s: Can't allocate memory for ACL", + __func__); + rc = -ENOMEM; + } else { +#ifdef CONFIG_CIFS_ACL + memcpy(pacl, ea_value, value_size); + rc = set_cifs_acl(pacl, value_size, + direntry->d_inode, full_path); + if (rc == 0) /* force revalidate of the inode */ + CIFS_I(direntry->d_inode)->time = 0; + kfree(pacl); +#else + cFYI(1, "Set CIFS ACL not supported yet"); +#endif /* CONFIG_CIFS_ACL */ + } } else { int temp; temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, diff --git a/fs/compat.c b/fs/compat.c index 72fe6cda910..0ea00832de2 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1306,241 +1306,6 @@ compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, int return do_sys_open(dfd, filename, flags, mode); } -/* - * compat_count() counts the number of arguments/envelopes. It is basically - * a copy of count() from fs/exec.c, except that it works with 32 bit argv - * and envp pointers. - */ -static int compat_count(compat_uptr_t __user *argv, int max) -{ - int i = 0; - - if (argv != NULL) { - for (;;) { - compat_uptr_t p; - - if (get_user(p, argv)) - return -EFAULT; - if (!p) - break; - argv++; - if (i++ >= max) - return -E2BIG; - - if (fatal_signal_pending(current)) - return -ERESTARTNOHAND; - cond_resched(); - } - } - return i; -} - -/* - * compat_copy_strings() is basically a copy of copy_strings() from fs/exec.c - * except that it works with 32 bit argv and envp pointers. - */ -static int compat_copy_strings(int argc, compat_uptr_t __user *argv, - struct linux_binprm *bprm) -{ - struct page *kmapped_page = NULL; - char *kaddr = NULL; - unsigned long kpos = 0; - int ret; - - while (argc-- > 0) { - compat_uptr_t str; - int len; - unsigned long pos; - - if (get_user(str, argv+argc) || - !(len = strnlen_user(compat_ptr(str), MAX_ARG_STRLEN))) { - ret = -EFAULT; - goto out; - } - - if (len > MAX_ARG_STRLEN) { - ret = -E2BIG; - goto out; - } - - /* We're going to work our way backwords. */ - pos = bprm->p; - str += len; - bprm->p -= len; - - while (len > 0) { - int offset, bytes_to_copy; - - if (fatal_signal_pending(current)) { - ret = -ERESTARTNOHAND; - goto out; - } - cond_resched(); - - offset = pos % PAGE_SIZE; - if (offset == 0) - offset = PAGE_SIZE; - - bytes_to_copy = offset; - if (bytes_to_copy > len) - bytes_to_copy = len; - - offset -= bytes_to_copy; - pos -= bytes_to_copy; - str -= bytes_to_copy; - len -= bytes_to_copy; - - if (!kmapped_page || kpos != (pos & PAGE_MASK)) { - struct page *page; - - page = get_arg_page(bprm, pos, 1); - if (!page) { - ret = -E2BIG; - goto out; - } - - if (kmapped_page) { - flush_kernel_dcache_page(kmapped_page); - kunmap(kmapped_page); - put_page(kmapped_page); - } - kmapped_page = page; - kaddr = kmap(kmapped_page); - kpos = pos & PAGE_MASK; - flush_cache_page(bprm->vma, kpos, - page_to_pfn(kmapped_page)); - } - if (copy_from_user(kaddr+offset, compat_ptr(str), - bytes_to_copy)) { - ret = -EFAULT; - goto out; - } - } - } - ret = 0; -out: - if (kmapped_page) { - flush_kernel_dcache_page(kmapped_page); - kunmap(kmapped_page); - put_page(kmapped_page); - } - return ret; -} - -/* - * compat_do_execve() is mostly a copy of do_execve(), with the exception - * that it processes 32 bit argv and envp pointers. - */ -int compat_do_execve(char * filename, - compat_uptr_t __user *argv, - compat_uptr_t __user *envp, - struct pt_regs * regs) -{ - struct linux_binprm *bprm; - struct file *file; - struct files_struct *displaced; - bool clear_in_exec; - int retval; - - retval = unshare_files(&displaced); - if (retval) - goto out_ret; - - retval = -ENOMEM; - bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); - if (!bprm) - goto out_files; - - retval = prepare_bprm_creds(bprm); - if (retval) - goto out_free; - - retval = check_unsafe_exec(bprm); - if (retval < 0) - goto out_free; - clear_in_exec = retval; - current->in_execve = 1; - - file = open_exec(filename); - retval = PTR_ERR(file); - if (IS_ERR(file)) - goto out_unmark; - - sched_exec(); - - bprm->file = file; - bprm->filename = filename; - bprm->interp = filename; - - retval = bprm_mm_init(bprm); - if (retval) - goto out_file; - - bprm->argc = compat_count(argv, MAX_ARG_STRINGS); - if ((retval = bprm->argc) < 0) - goto out; - - bprm->envc = compat_count(envp, MAX_ARG_STRINGS); - if ((retval = bprm->envc) < 0) - goto out; - - retval = prepare_binprm(bprm); - if (retval < 0) - goto out; - - retval = copy_strings_kernel(1, &bprm->filename, bprm); - if (retval < 0) - goto out; - - bprm->exec = bprm->p; - retval = compat_copy_strings(bprm->envc, envp, bprm); - if (retval < 0) - goto out; - - retval = compat_copy_strings(bprm->argc, argv, bprm); - if (retval < 0) - goto out; - - retval = search_binary_handler(bprm, regs); - if (retval < 0) - goto out; - - /* execve succeeded */ - current->fs->in_exec = 0; - current->in_execve = 0; - acct_update_integrals(current); - free_bprm(bprm); - if (displaced) - put_files_struct(displaced); - return retval; - -out: - if (bprm->mm) { - acct_arg_size(bprm, 0); - mmput(bprm->mm); - } - -out_file: - if (bprm->file) { - allow_write_access(bprm->file); - fput(bprm->file); - } - -out_unmark: - if (clear_in_exec) - current->fs->in_exec = 0; - current->in_execve = 0; - -out_free: - free_bprm(bprm); - -out_files: - if (displaced) - reset_files_struct(displaced); -out_ret: - return retval; -} - #define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, diff --git a/fs/dcache.c b/fs/dcache.c index 22a0ef41bad..37f72ee5bf7 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -35,6 +35,7 @@ #include <linux/hardirq.h> #include <linux/bit_spinlock.h> #include <linux/rculist_bl.h> +#include <linux/prefetch.h> #include "internal.h" /* @@ -1219,7 +1220,7 @@ void shrink_dcache_parent(struct dentry * parent) EXPORT_SYMBOL(shrink_dcache_parent); /* - * Scan `nr' dentries and return the number which remain. + * Scan `sc->nr_slab_to_reclaim' dentries and return the number which remain. * * We need to avoid reentering the filesystem if the caller is performing a * GFP_NOFS allocation attempt. One example deadlock is: @@ -1230,8 +1231,12 @@ EXPORT_SYMBOL(shrink_dcache_parent); * * In this case we return -1 to tell the caller that we baled. */ -static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) +static int shrink_dcache_memory(struct shrinker *shrink, + struct shrink_control *sc) { + int nr = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + if (nr) { if (!(gfp_mask & __GFP_FS)) return -1; diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 89d394d8fe2..90f76575c05 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -428,26 +428,17 @@ static ssize_t write_file_bool(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { char buf[32]; - int buf_size; + size_t buf_size; + bool bv; u32 *val = file->private_data; buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT; - switch (buf[0]) { - case 'y': - case 'Y': - case '1': - *val = 1; - break; - case 'n': - case 'N': - case '0': - *val = 0; - break; - } - + if (strtobool(buf, &bv) == 0) + *val = bv; + return count; } diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 0d329ff8ed4..9b026ea8baa 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -100,6 +100,7 @@ struct dlm_cluster { unsigned int cl_log_debug; unsigned int cl_protocol; unsigned int cl_timewarn_cs; + unsigned int cl_waitwarn_us; }; enum { @@ -114,6 +115,7 @@ enum { CLUSTER_ATTR_LOG_DEBUG, CLUSTER_ATTR_PROTOCOL, CLUSTER_ATTR_TIMEWARN_CS, + CLUSTER_ATTR_WAITWARN_US, }; struct cluster_attribute { @@ -166,6 +168,7 @@ CLUSTER_ATTR(scan_secs, 1); CLUSTER_ATTR(log_debug, 0); CLUSTER_ATTR(protocol, 0); CLUSTER_ATTR(timewarn_cs, 1); +CLUSTER_ATTR(waitwarn_us, 0); static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr, @@ -179,6 +182,7 @@ static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr, [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol.attr, [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs.attr, + [CLUSTER_ATTR_WAITWARN_US] = &cluster_attr_waitwarn_us.attr, NULL, }; @@ -439,6 +443,7 @@ static struct config_group *make_cluster(struct config_group *g, cl->cl_log_debug = dlm_config.ci_log_debug; cl->cl_protocol = dlm_config.ci_protocol; cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; + cl->cl_waitwarn_us = dlm_config.ci_waitwarn_us; space_list = &sps->ss_group; comm_list = &cms->cs_group; @@ -986,6 +991,7 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_LOG_DEBUG 0 #define DEFAULT_PROTOCOL 0 #define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ +#define DEFAULT_WAITWARN_US 0 struct dlm_config_info dlm_config = { .ci_tcp_port = DEFAULT_TCP_PORT, @@ -998,6 +1004,7 @@ struct dlm_config_info dlm_config = { .ci_scan_secs = DEFAULT_SCAN_SECS, .ci_log_debug = DEFAULT_LOG_DEBUG, .ci_protocol = DEFAULT_PROTOCOL, - .ci_timewarn_cs = DEFAULT_TIMEWARN_CS + .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, + .ci_waitwarn_us = DEFAULT_WAITWARN_US }; diff --git a/fs/dlm/config.h b/fs/dlm/config.h index 4f1d6fce58c..dd0ce24d5a8 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -28,6 +28,7 @@ struct dlm_config_info { int ci_log_debug; int ci_protocol; int ci_timewarn_cs; + int ci_waitwarn_us; }; extern struct dlm_config_info dlm_config; diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index b9420491301..0262451eb9c 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -209,6 +209,7 @@ struct dlm_args { #define DLM_IFL_WATCH_TIMEWARN 0x00400000 #define DLM_IFL_TIMEOUT_CANCEL 0x00800000 #define DLM_IFL_DEADLOCK_CANCEL 0x01000000 +#define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */ #define DLM_IFL_USER 0x00000001 #define DLM_IFL_ORPHAN 0x00000002 @@ -245,6 +246,7 @@ struct dlm_lkb { int8_t lkb_wait_type; /* type of reply waiting for */ int8_t lkb_wait_count; + int lkb_wait_nodeid; /* for debugging */ struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ struct list_head lkb_statequeue; /* rsb g/c/w list */ @@ -254,6 +256,7 @@ struct dlm_lkb { struct list_head lkb_ownqueue; /* list of locks for a process */ struct list_head lkb_time_list; ktime_t lkb_timestamp; + ktime_t lkb_wait_time; unsigned long lkb_timeout_cs; struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE]; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 56d6bfcc1e4..f71d0b5abd9 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -799,10 +799,84 @@ static int msg_reply_type(int mstype) return -1; } +static int nodeid_warned(int nodeid, int num_nodes, int *warned) +{ + int i; + + for (i = 0; i < num_nodes; i++) { + if (!warned[i]) { + warned[i] = nodeid; + return 0; + } + if (warned[i] == nodeid) + return 1; + } + return 0; +} + +void dlm_scan_waiters(struct dlm_ls *ls) +{ + struct dlm_lkb *lkb; + ktime_t zero = ktime_set(0, 0); + s64 us; + s64 debug_maxus = 0; + u32 debug_scanned = 0; + u32 debug_expired = 0; + int num_nodes = 0; + int *warned = NULL; + + if (!dlm_config.ci_waitwarn_us) + return; + + mutex_lock(&ls->ls_waiters_mutex); + + list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { + if (ktime_equal(lkb->lkb_wait_time, zero)) + continue; + + debug_scanned++; + + us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time)); + + if (us < dlm_config.ci_waitwarn_us) + continue; + + lkb->lkb_wait_time = zero; + + debug_expired++; + if (us > debug_maxus) + debug_maxus = us; + + if (!num_nodes) { + num_nodes = ls->ls_num_nodes; + warned = kmalloc(GFP_KERNEL, num_nodes * sizeof(int)); + if (warned) + memset(warned, 0, num_nodes * sizeof(int)); + } + if (!warned) + continue; + if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned)) + continue; + + log_error(ls, "waitwarn %x %lld %d us check connection to " + "node %d", lkb->lkb_id, (long long)us, + dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid); + } + mutex_unlock(&ls->ls_waiters_mutex); + + if (warned) + kfree(warned); + + if (debug_expired) + log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us", + debug_scanned, debug_expired, + dlm_config.ci_waitwarn_us, (long long)debug_maxus); +} + /* add/remove lkb from global waiters list of lkb's waiting for a reply from a remote node */ -static int add_to_waiters(struct dlm_lkb *lkb, int mstype) +static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; int error = 0; @@ -842,6 +916,8 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype) lkb->lkb_wait_count++; lkb->lkb_wait_type = mstype; + lkb->lkb_wait_time = ktime_get(); + lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */ hold_lkb(lkb); list_add(&lkb->lkb_wait_reply, &ls->ls_waiters); out: @@ -961,10 +1037,10 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms) struct dlm_ls *ls = lkb->lkb_resource->res_ls; int error; - if (ms != &ls->ls_stub_ms) + if (ms->m_flags != DLM_IFL_STUB_MS) mutex_lock(&ls->ls_waiters_mutex); error = _remove_from_waiters(lkb, ms->m_type, ms); - if (ms != &ls->ls_stub_ms) + if (ms->m_flags != DLM_IFL_STUB_MS) mutex_unlock(&ls->ls_waiters_mutex); return error; } @@ -1157,6 +1233,16 @@ void dlm_adjust_timeouts(struct dlm_ls *ls) list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us); mutex_unlock(&ls->ls_timeout_mutex); + + if (!dlm_config.ci_waitwarn_us) + return; + + mutex_lock(&ls->ls_waiters_mutex); + list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { + if (ktime_to_us(lkb->lkb_wait_time)) + lkb->lkb_wait_time = ktime_get(); + } + mutex_unlock(&ls->ls_waiters_mutex); } /* lkb is master or local copy */ @@ -1376,14 +1462,8 @@ static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb) ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become compatible with other granted locks */ -static void munge_demoted(struct dlm_lkb *lkb, struct dlm_message *ms) +static void munge_demoted(struct dlm_lkb *lkb) { - if (ms->m_type != DLM_MSG_CONVERT_REPLY) { - log_print("munge_demoted %x invalid reply type %d", - lkb->lkb_id, ms->m_type); - return; - } - if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) { log_print("munge_demoted %x invalid modes gr %d rq %d", lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode); @@ -2844,12 +2924,12 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype) struct dlm_mhandle *mh; int to_nodeid, error; - error = add_to_waiters(lkb, mstype); + to_nodeid = r->res_nodeid; + + error = add_to_waiters(lkb, mstype, to_nodeid); if (error) return error; - to_nodeid = r->res_nodeid; - error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); if (error) goto fail; @@ -2880,9 +2960,9 @@ static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) /* down conversions go without a reply from the master */ if (!error && down_conversion(lkb)) { remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY); + r->res_ls->ls_stub_ms.m_flags = DLM_IFL_STUB_MS; r->res_ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; r->res_ls->ls_stub_ms.m_result = 0; - r->res_ls->ls_stub_ms.m_flags = lkb->lkb_flags; __receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms); } @@ -2951,12 +3031,12 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb) struct dlm_mhandle *mh; int to_nodeid, error; - error = add_to_waiters(lkb, DLM_MSG_LOOKUP); + to_nodeid = dlm_dir_nodeid(r); + + error = add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid); if (error) return error; - to_nodeid = dlm_dir_nodeid(r); - error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh); if (error) goto fail; @@ -3070,6 +3150,9 @@ static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms) static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms) { + if (ms->m_flags == DLM_IFL_STUB_MS) + return; + lkb->lkb_sbflags = ms->m_sbflags; lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) | (ms->m_flags & 0x0000FFFF); @@ -3612,7 +3695,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, /* convert was queued on remote master */ receive_flags_reply(lkb, ms); if (is_demoted(lkb)) - munge_demoted(lkb, ms); + munge_demoted(lkb); del_lkb(r, lkb); add_lkb(r, lkb, DLM_LKSTS_CONVERT); add_timeout(lkb); @@ -3622,7 +3705,7 @@ static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, /* convert was granted on remote master */ receive_flags_reply(lkb, ms); if (is_demoted(lkb)) - munge_demoted(lkb, ms); + munge_demoted(lkb); grant_lock_pc(r, lkb, ms); queue_cast(r, lkb, 0); break; @@ -3996,15 +4079,17 @@ void dlm_receive_buffer(union dlm_packet *p, int nodeid) dlm_put_lockspace(ls); } -static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb) +static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb, + struct dlm_message *ms_stub) { if (middle_conversion(lkb)) { hold_lkb(lkb); - ls->ls_stub_ms.m_type = DLM_MSG_CONVERT_REPLY; - ls->ls_stub_ms.m_result = -EINPROGRESS; - ls->ls_stub_ms.m_flags = lkb->lkb_flags; - ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; - _receive_convert_reply(lkb, &ls->ls_stub_ms); + memset(ms_stub, 0, sizeof(struct dlm_message)); + ms_stub->m_flags = DLM_IFL_STUB_MS; + ms_stub->m_type = DLM_MSG_CONVERT_REPLY; + ms_stub->m_result = -EINPROGRESS; + ms_stub->m_header.h_nodeid = lkb->lkb_nodeid; + _receive_convert_reply(lkb, ms_stub); /* Same special case as in receive_rcom_lock_args() */ lkb->lkb_grmode = DLM_LOCK_IV; @@ -4045,13 +4130,27 @@ static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb) void dlm_recover_waiters_pre(struct dlm_ls *ls) { struct dlm_lkb *lkb, *safe; + struct dlm_message *ms_stub; int wait_type, stub_unlock_result, stub_cancel_result; + ms_stub = kmalloc(GFP_KERNEL, sizeof(struct dlm_message)); + if (!ms_stub) { + log_error(ls, "dlm_recover_waiters_pre no mem"); + return; + } + mutex_lock(&ls->ls_waiters_mutex); list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) { - log_debug(ls, "pre recover waiter lkid %x type %d flags %x", - lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags); + + /* exclude debug messages about unlocks because there can be so + many and they aren't very interesting */ + + if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) { + log_debug(ls, "recover_waiter %x nodeid %d " + "msg %d to %d", lkb->lkb_id, lkb->lkb_nodeid, + lkb->lkb_wait_type, lkb->lkb_wait_nodeid); + } /* all outstanding lookups, regardless of destination will be resent after recovery is done */ @@ -4097,26 +4196,28 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) break; case DLM_MSG_CONVERT: - recover_convert_waiter(ls, lkb); + recover_convert_waiter(ls, lkb, ms_stub); break; case DLM_MSG_UNLOCK: hold_lkb(lkb); - ls->ls_stub_ms.m_type = DLM_MSG_UNLOCK_REPLY; - ls->ls_stub_ms.m_result = stub_unlock_result; - ls->ls_stub_ms.m_flags = lkb->lkb_flags; - ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; - _receive_unlock_reply(lkb, &ls->ls_stub_ms); + memset(ms_stub, 0, sizeof(struct dlm_message)); + ms_stub->m_flags = DLM_IFL_STUB_MS; + ms_stub->m_type = DLM_MSG_UNLOCK_REPLY; + ms_stub->m_result = stub_unlock_result; + ms_stub->m_header.h_nodeid = lkb->lkb_nodeid; + _receive_unlock_reply(lkb, ms_stub); dlm_put_lkb(lkb); break; case DLM_MSG_CANCEL: hold_lkb(lkb); - ls->ls_stub_ms.m_type = DLM_MSG_CANCEL_REPLY; - ls->ls_stub_ms.m_result = stub_cancel_result; - ls->ls_stub_ms.m_flags = lkb->lkb_flags; - ls->ls_stub_ms.m_header.h_nodeid = lkb->lkb_nodeid; - _receive_cancel_reply(lkb, &ls->ls_stub_ms); + memset(ms_stub, 0, sizeof(struct dlm_message)); + ms_stub->m_flags = DLM_IFL_STUB_MS; + ms_stub->m_type = DLM_MSG_CANCEL_REPLY; + ms_stub->m_result = stub_cancel_result; + ms_stub->m_header.h_nodeid = lkb->lkb_nodeid; + _receive_cancel_reply(lkb, ms_stub); dlm_put_lkb(lkb); break; @@ -4127,6 +4228,7 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) schedule(); } mutex_unlock(&ls->ls_waiters_mutex); + kfree(ms_stub); } static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls) @@ -4191,8 +4293,8 @@ int dlm_recover_waiters_post(struct dlm_ls *ls) ou = is_overlap_unlock(lkb); err = 0; - log_debug(ls, "recover_waiters_post %x type %d flags %x %s", - lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name); + log_debug(ls, "recover_waiter %x nodeid %d msg %d r_nodeid %d", + lkb->lkb_id, lkb->lkb_nodeid, mstype, r->res_nodeid); /* At this point we assume that we won't get a reply to any previous op or overlap op on this lock. First, do a big diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 88e93c80cc2..265017a7c3e 100644 --- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -24,6 +24,7 @@ int dlm_put_lkb(struct dlm_lkb *lkb); void dlm_scan_rsbs(struct dlm_ls *ls); int dlm_lock_recovery_try(struct dlm_ls *ls); void dlm_unlock_recovery(struct dlm_ls *ls); +void dlm_scan_waiters(struct dlm_ls *ls); void dlm_scan_timeout(struct dlm_ls *ls); void dlm_adjust_timeouts(struct dlm_ls *ls); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index f994a7dfda8..14cbf409975 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -243,7 +243,6 @@ static struct dlm_ls *find_ls_to_scan(void) static int dlm_scand(void *data) { struct dlm_ls *ls; - int timeout_jiffies = dlm_config.ci_scan_secs * HZ; while (!kthread_should_stop()) { ls = find_ls_to_scan(); @@ -252,13 +251,14 @@ static int dlm_scand(void *data) ls->ls_scan_time = jiffies; dlm_scan_rsbs(ls); dlm_scan_timeout(ls); + dlm_scan_waiters(ls); dlm_unlock_recovery(ls); } else { ls->ls_scan_time += HZ; } - } else { - schedule_timeout_interruptible(timeout_jiffies); + continue; } + schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ); } return 0; } diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index 30d8b85febb..e2b87800436 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -71,6 +71,36 @@ static void send_op(struct plock_op *op) wake_up(&send_wq); } +/* If a process was killed while waiting for the only plock on a file, + locks_remove_posix will not see any lock on the file so it won't + send an unlock-close to us to pass on to userspace to clean up the + abandoned waiter. So, we have to insert the unlock-close when the + lock call is interrupted. */ + +static void do_unlock_close(struct dlm_ls *ls, u64 number, + struct file *file, struct file_lock *fl) +{ + struct plock_op *op; + + op = kzalloc(sizeof(*op), GFP_NOFS); + if (!op) + return; + + op->info.optype = DLM_PLOCK_OP_UNLOCK; + op->info.pid = fl->fl_pid; + op->info.fsid = ls->ls_global_id; + op->info.number = number; + op->info.start = 0; + op->info.end = OFFSET_MAX; + if (fl->fl_lmops && fl->fl_lmops->fl_grant) + op->info.owner = (__u64) fl->fl_pid; + else + op->info.owner = (__u64)(long) fl->fl_owner; + + op->info.flags |= DLM_PLOCK_FL_CLOSE; + send_op(op); +} + int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, int cmd, struct file_lock *fl) { @@ -114,9 +144,19 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file, send_op(op); - if (xop->callback == NULL) - wait_event(recv_wq, (op->done != 0)); - else { + if (xop->callback == NULL) { + rv = wait_event_killable(recv_wq, (op->done != 0)); + if (rv == -ERESTARTSYS) { + log_debug(ls, "dlm_posix_lock: wait killed %llx", + (unsigned long long)number); + spin_lock(&ops_lock); + list_del(&op->list); + spin_unlock(&ops_lock); + kfree(xop); + do_unlock_close(ls, number, file, fl); + goto out; + } + } else { rv = FILE_LOCK_DEFERRED; goto out; } @@ -233,6 +273,13 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file, else op->info.owner = (__u64)(long) fl->fl_owner; + if (fl->fl_flags & FL_CLOSE) { + op->info.flags |= DLM_PLOCK_FL_CLOSE; + send_op(op); + rv = 0; + goto out; + } + send_op(op); wait_event(recv_wq, (op->done != 0)); @@ -334,7 +381,10 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count, spin_lock(&ops_lock); if (!list_empty(&send_list)) { op = list_entry(send_list.next, struct plock_op, list); - list_move(&op->list, &recv_list); + if (op->info.flags & DLM_PLOCK_FL_CLOSE) + list_del(&op->list); + else + list_move(&op->list, &recv_list); memcpy(&info, &op->info, sizeof(info)); } spin_unlock(&ops_lock); @@ -342,6 +392,13 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count, if (!op) return -EAGAIN; + /* there is no need to get a reply from userspace for unlocks + that were generated by the vfs cleaning up for a close + (the process did not make an unlock call). */ + + if (op->info.flags & DLM_PLOCK_FL_CLOSE) + kfree(op); + if (copy_to_user(u, &info, sizeof(info))) return -EFAULT; return sizeof(info); diff --git a/fs/dlm/user.c b/fs/dlm/user.c index d5ab3fe7c19..e96bf3e9be8 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -611,7 +611,6 @@ static ssize_t device_write(struct file *file, const char __user *buf, out_sig: sigprocmask(SIG_SETMASK, &tmpsig, NULL); - recalc_sigpending(); out_free: kfree(kbuf); return error; diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 98b77c89494..c00e055b628 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -40,9 +40,12 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) static void drop_slab(void) { int nr_objects; + struct shrink_control shrink = { + .gfp_mask = GFP_KERNEL, + }; do { - nr_objects = shrink_slab(1000, GFP_KERNEL, 1000); + nr_objects = shrink_slab(&shrink, 1000, 1000); } while (nr_objects > 10); } diff --git a/fs/exec.c b/fs/exec.c index 5e62d26a4fe..936f5776655 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -55,6 +55,7 @@ #include <linux/fs_struct.h> #include <linux/pipe_fs_i.h> #include <linux/oom.h> +#include <linux/compat.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -166,8 +167,13 @@ out: } #ifdef CONFIG_MMU - -void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +/* + * The nascent bprm->mm is not visible until exec_mmap() but it can + * use a lot of memory, account these pages in current->mm temporary + * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we + * change the counter back via acct_arg_size(0). + */ +static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { struct mm_struct *mm = current->mm; long diff = (long)(pages - bprm->vma_pages); @@ -186,7 +192,7 @@ void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) #endif } -struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, +static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; @@ -194,7 +200,7 @@ struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, #ifdef CONFIG_STACK_GROWSUP if (write) { - ret = expand_stack_downwards(bprm->vma, pos); + ret = expand_downwards(bprm->vma, pos); if (ret < 0) return NULL; } @@ -305,11 +311,11 @@ static bool valid_arg_len(struct linux_binprm *bprm, long len) #else -void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) +static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { } -struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, +static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; @@ -398,22 +404,56 @@ err: return err; } +struct user_arg_ptr { +#ifdef CONFIG_COMPAT + bool is_compat; +#endif + union { + const char __user *const __user *native; +#ifdef CONFIG_COMPAT + compat_uptr_t __user *compat; +#endif + } ptr; +}; + +static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr) +{ + const char __user *native; + +#ifdef CONFIG_COMPAT + if (unlikely(argv.is_compat)) { + compat_uptr_t compat; + + if (get_user(compat, argv.ptr.compat + nr)) + return ERR_PTR(-EFAULT); + + return compat_ptr(compat); + } +#endif + + if (get_user(native, argv.ptr.native + nr)) + return ERR_PTR(-EFAULT); + + return native; +} + /* * count() counts the number of strings in array ARGV. */ -static int count(const char __user * const __user * argv, int max) +static int count(struct user_arg_ptr argv, int max) { int i = 0; - if (argv != NULL) { + if (argv.ptr.native != NULL) { for (;;) { - const char __user * p; + const char __user *p = get_user_arg_ptr(argv, i); - if (get_user(p, argv)) - return -EFAULT; if (!p) break; - argv++; + + if (IS_ERR(p)) + return -EFAULT; + if (i++ >= max) return -E2BIG; @@ -430,7 +470,7 @@ static int count(const char __user * const __user * argv, int max) * processes's memory to the new process's stack. The call to get_user_pages() * ensures the destination page is created and not swapped out. */ -static int copy_strings(int argc, const char __user *const __user *argv, +static int copy_strings(int argc, struct user_arg_ptr argv, struct linux_binprm *bprm) { struct page *kmapped_page = NULL; @@ -443,16 +483,18 @@ static int copy_strings(int argc, const char __user *const __user *argv, int len; unsigned long pos; - if (get_user(str, argv+argc) || - !(len = strnlen_user(str, MAX_ARG_STRLEN))) { - ret = -EFAULT; + ret = -EFAULT; + str = get_user_arg_ptr(argv, argc); + if (IS_ERR(str)) goto out; - } - if (!valid_arg_len(bprm, len)) { - ret = -E2BIG; + len = strnlen_user(str, MAX_ARG_STRLEN); + if (!len) + goto out; + + ret = -E2BIG; + if (!valid_arg_len(bprm, len)) goto out; - } /* We're going to work our way backwords. */ pos = bprm->p; @@ -519,14 +561,19 @@ out: /* * Like copy_strings, but get argv and its values from kernel memory. */ -int copy_strings_kernel(int argc, const char *const *argv, +int copy_strings_kernel(int argc, const char *const *__argv, struct linux_binprm *bprm) { int r; mm_segment_t oldfs = get_fs(); + struct user_arg_ptr argv = { + .ptr.native = (const char __user *const __user *)__argv, + }; + set_fs(KERNEL_DS); - r = copy_strings(argc, (const char __user *const __user *)argv, bprm); + r = copy_strings(argc, argv, bprm); set_fs(oldfs); + return r; } EXPORT_SYMBOL(copy_strings_kernel); @@ -553,7 +600,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) unsigned long length = old_end - old_start; unsigned long new_start = old_start - shift; unsigned long new_end = old_end - shift; - struct mmu_gather *tlb; + struct mmu_gather tlb; BUG_ON(new_start > new_end); @@ -579,12 +626,12 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) return -ENOMEM; lru_add_drain(); - tlb = tlb_gather_mmu(mm, 0); + tlb_gather_mmu(&tlb, mm, 0); if (new_end > old_start) { /* * when the old and new regions overlap clear from new_end. */ - free_pgd_range(tlb, new_end, old_end, new_end, + free_pgd_range(&tlb, new_end, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : 0); } else { /* @@ -593,10 +640,10 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * have constraints on va-space that make this illegal (IA64) - * for the others its just a little faster. */ - free_pgd_range(tlb, old_start, old_end, new_end, + free_pgd_range(&tlb, old_start, old_end, new_end, vma->vm_next ? vma->vm_next->vm_start : 0); } - tlb_finish_mmu(tlb, new_end, old_end); + tlb_finish_mmu(&tlb, new_end, old_end); /* * Shrink the vma to just the new range. Always succeeds. @@ -1004,6 +1051,7 @@ char *get_task_comm(char *buf, struct task_struct *tsk) task_unlock(tsk); return buf; } +EXPORT_SYMBOL_GPL(get_task_comm); void set_task_comm(struct task_struct *tsk, char *buf) { @@ -1379,10 +1427,10 @@ EXPORT_SYMBOL(search_binary_handler); /* * sys_execve() executes a new program. */ -int do_execve(const char * filename, - const char __user *const __user *argv, - const char __user *const __user *envp, - struct pt_regs * regs) +static int do_execve_common(const char *filename, + struct user_arg_ptr argv, + struct user_arg_ptr envp, + struct pt_regs *regs) { struct linux_binprm *bprm; struct file *file; @@ -1489,6 +1537,34 @@ out_ret: return retval; } +int do_execve(const char *filename, + const char __user *const __user *__argv, + const char __user *const __user *__envp, + struct pt_regs *regs) +{ + struct user_arg_ptr argv = { .ptr.native = __argv }; + struct user_arg_ptr envp = { .ptr.native = __envp }; + return do_execve_common(filename, argv, envp, regs); +} + +#ifdef CONFIG_COMPAT +int compat_do_execve(char *filename, + compat_uptr_t __user *__argv, + compat_uptr_t __user *__envp, + struct pt_regs *regs) +{ + struct user_arg_ptr argv = { + .is_compat = true, + .ptr.compat = __argv, + }; + struct user_arg_ptr envp = { + .is_compat = true, + .ptr.compat = __envp, + }; + return do_execve_common(filename, argv, envp, regs); +} +#endif + void set_binfmt(struct linux_binfmt *new) { struct mm_struct *mm = current->mm; @@ -1659,6 +1735,7 @@ static int zap_process(struct task_struct *start, int exit_code) t = start; do { + task_clear_group_stop_pending(t); if (t != current && t->mm) { sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 0a78dae7e2c..1dd62ed35b8 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -898,7 +898,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); if (!sb_set_blocksize(sb, blocksize)) { - ext2_msg(sb, KERN_ERR, "error: blocksize is too small"); + ext2_msg(sb, KERN_ERR, + "error: bad blocksize %d", blocksize); goto failed_sbi; } diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 32f3b869585..34b6d9bfc48 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1416,10 +1416,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, frame->at = entries; frame->bh = bh; bh = bh2; + /* + * Mark buffers dirty here so that if do_split() fails we write a + * consistent set of buffers to disk. + */ + ext3_journal_dirty_metadata(handle, frame->bh); + ext3_journal_dirty_metadata(handle, bh); de = do_split(handle,dir, &bh, frame, &hinfo, &retval); - dx_release (frames); - if (!(de)) + if (!de) { + ext3_mark_inode_dirty(handle, dir); + dx_release(frames); return retval; + } + dx_release(frames); return add_dirent_to_buf(handle, dentry, inode, de, bh); } @@ -2189,6 +2198,7 @@ static int ext3_symlink (struct inode * dir, handle_t *handle; struct inode * inode; int l, err, retries = 0; + int credits; l = strlen(symname)+1; if (l > dir->i_sb->s_blocksize) @@ -2196,10 +2206,26 @@ static int ext3_symlink (struct inode * dir, dquot_initialize(dir); + if (l > EXT3_N_BLOCKS * 4) { + /* + * For non-fast symlinks, we just allocate inode and put it on + * orphan list in the first transaction => we need bitmap, + * group descriptor, sb, inode block, quota blocks. + */ + credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + } else { + /* + * Fast symlink. We have to add entry to directory + * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS), + * allocate new inode (bitmap, group descriptor, inode block, + * quota blocks, sb is already counted in previous macros). + */ + credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + } retry: - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + - EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + handle = ext3_journal_start(dir, credits); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2211,21 +2237,45 @@ retry: if (IS_ERR(inode)) goto out_stop; - if (l > sizeof (EXT3_I(inode)->i_data)) { + if (l > EXT3_N_BLOCKS * 4) { inode->i_op = &ext3_symlink_inode_operations; ext3_set_aops(inode); /* - * page_symlink() calls into ext3_prepare/commit_write. - * We have a transaction open. All is sweetness. It also sets - * i_size in generic_commit_write(). + * We cannot call page_symlink() with transaction started + * because it calls into ext3_write_begin() which acquires page + * lock which ranks below transaction start (and it can also + * wait for journal commit if we are running out of space). So + * we have to stop transaction now and restart it when symlink + * contents is written. + * + * To keep fs consistent in case of crash, we have to put inode + * to orphan list in the mean time. */ + drop_nlink(inode); + err = ext3_orphan_add(handle, inode); + ext3_journal_stop(handle); + if (err) + goto err_drop_inode; err = __page_symlink(inode, symname, l, 1); + if (err) + goto err_drop_inode; + /* + * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS + * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified + */ + handle = ext3_journal_start(dir, + EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto err_drop_inode; + } + inc_nlink(inode); + err = ext3_orphan_del(handle, inode); if (err) { + ext3_journal_stop(handle); drop_nlink(inode); - unlock_new_inode(inode); - ext3_mark_inode_dirty(handle, inode); - iput (inode); - goto out_stop; + goto err_drop_inode; } } else { inode->i_op = &ext3_fast_symlink_inode_operations; @@ -2239,6 +2289,10 @@ out_stop: if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; +err_drop_inode: + unlock_new_inode(inode); + iput(inode); + return err; } static int ext3_link (struct dentry * old_dentry, diff --git a/fs/fat/cache.c b/fs/fat/cache.c index ae8200f84e3..1cc7038e273 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -151,6 +151,13 @@ static void fat_cache_add(struct inode *inode, struct fat_cache_id *new) spin_unlock(&MSDOS_I(inode)->cache_lru_lock); tmp = fat_cache_alloc(inode); + if (!tmp) { + spin_lock(&MSDOS_I(inode)->cache_lru_lock); + MSDOS_I(inode)->nr_caches--; + spin_unlock(&MSDOS_I(inode)->cache_lru_lock); + return; + } + spin_lock(&MSDOS_I(inode)->cache_lru_lock); cache = fat_cache_merge(inode, new); if (cache != NULL) { diff --git a/fs/fat/dir.c b/fs/fat/dir.c index ee42b9e0b16..4ad64732cbc 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -98,7 +98,7 @@ next: *bh = sb_bread(sb, phys); if (*bh == NULL) { - printk(KERN_ERR "FAT: Directory bread(block %llu) failed\n", + fat_msg(sb, KERN_ERR, "Directory bread(block %llu) failed", (llu)phys); /* skip this block */ *pos = (iblock + 1) << sb->s_blocksize_bits; @@ -136,9 +136,10 @@ static inline int fat_get_entry(struct inode *dir, loff_t *pos, * but ignore that right now. * Ahem... Stack smashing in ring 0 isn't fun. Fixed. */ -static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len, - int uni_xlate, struct nls_table *nls) +static int uni16_to_x8(struct super_block *sb, unsigned char *ascii, + const wchar_t *uni, int len, struct nls_table *nls) { + int uni_xlate = MSDOS_SB(sb)->options.unicode_xlate; const wchar_t *ip; wchar_t ec; unsigned char *op; @@ -166,23 +167,23 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len, } if (unlikely(*ip)) { - printk(KERN_WARNING "FAT: filename was truncated while " - "converting."); + fat_msg(sb, KERN_WARNING, "filename was truncated while " + "converting."); } *op = 0; return (op - ascii); } -static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni, +static inline int fat_uni_to_x8(struct super_block *sb, const wchar_t *uni, unsigned char *buf, int size) { + struct msdos_sb_info *sbi = MSDOS_SB(sb); if (sbi->options.utf8) return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS, UTF16_HOST_ENDIAN, buf, size); else - return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate, - sbi->nls_io); + return uni16_to_x8(sb, buf, uni, size, sbi->nls_io); } static inline int @@ -419,7 +420,7 @@ parse_record: /* Compare shortname */ bufuname[last_u] = 0x0000; - len = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname)); + len = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname)); if (fat_name_match(sbi, name, name_len, bufname, len)) goto found; @@ -428,7 +429,7 @@ parse_record: int size = PATH_MAX - FAT_MAX_UNI_SIZE; /* Compare longname */ - len = fat_uni_to_x8(sbi, unicode, longname, size); + len = fat_uni_to_x8(sb, unicode, longname, size); if (fat_name_match(sbi, name, name_len, longname, len)) goto found; } @@ -545,7 +546,7 @@ parse_record: if (nr_slots) { void *longname = unicode + FAT_MAX_UNI_CHARS; int size = PATH_MAX - FAT_MAX_UNI_SIZE; - int len = fat_uni_to_x8(sbi, unicode, longname, size); + int len = fat_uni_to_x8(sb, unicode, longname, size); fill_name = longname; fill_len = len; @@ -621,7 +622,7 @@ parse_record: if (isvfat) { bufuname[j] = 0x0000; - i = fat_uni_to_x8(sbi, bufuname, bufname, sizeof(bufname)); + i = fat_uni_to_x8(sb, bufuname, bufname, sizeof(bufname)); } if (nr_slots) { /* hack for fat_ioctl_filldir() */ @@ -979,6 +980,7 @@ static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots) int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) { + struct super_block *sb = dir->i_sb; struct msdos_dir_entry *de; struct buffer_head *bh; int err = 0, nr_slots; @@ -1013,8 +1015,8 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) */ err = __fat_remove_entries(dir, sinfo->slot_off, nr_slots); if (err) { - printk(KERN_WARNING - "FAT: Couldn't remove the long name slots\n"); + fat_msg(sb, KERN_WARNING, + "Couldn't remove the long name slots"); } } @@ -1265,7 +1267,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots, if (sbi->fat_bits != 32) goto error; } else if (MSDOS_I(dir)->i_start == 0) { - printk(KERN_ERR "FAT: Corrupted directory (i_pos %lld)\n", + fat_msg(sb, KERN_ERR, "Corrupted directory (i_pos %lld)", MSDOS_I(dir)->i_pos); err = -EIO; goto error; diff --git a/fs/fat/fat.h b/fs/fat/fat.h index f50408901f7..8276cc282de 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -319,19 +319,20 @@ extern struct inode *fat_build_inode(struct super_block *sb, struct msdos_dir_entry *de, loff_t i_pos); extern int fat_sync_inode(struct inode *inode); extern int fat_fill_super(struct super_block *sb, void *data, int silent, - const struct inode_operations *fs_dir_inode_ops, - int isvfat, void (*setup)(struct super_block *)); + int isvfat, void (*setup)(struct super_block *)); extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2); /* fat/misc.c */ extern void -__fat_fs_error(struct super_block *s, int report, const char *fmt, ...) +__fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))) __cold; +#define fat_fs_error(sb, fmt, args...) \ + __fat_fs_error(sb, 1, fmt , ## args) +#define fat_fs_error_ratelimit(sb, fmt, args...) \ + __fat_fs_error(sb, __ratelimit(&MSDOS_SB(sb)->ratelimit), fmt , ## args) +void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) __attribute__ ((format (printf, 3, 4))) __cold; -#define fat_fs_error(s, fmt, args...) \ - __fat_fs_error(s, 1, fmt , ## args) -#define fat_fs_error_ratelimit(s, fmt, args...) \ - __fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args) extern int fat_clusters_flush(struct super_block *sb); extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index b47d2c9f4fa..2e81ac0df7e 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -95,7 +95,7 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent, err_brelse: brelse(bhs[0]); err: - printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", (llu)blocknr); + fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr); return -EIO; } @@ -108,7 +108,7 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent, fatent->fat_inode = MSDOS_SB(sb)->fat_inode; fatent->bhs[0] = sb_bread(sb, blocknr); if (!fatent->bhs[0]) { - printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n", + fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr); return -EIO; } diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 8d68690bdcf..cb8d8391ac0 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -581,7 +581,8 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = sbi->free_clusters; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); - buf->f_namelen = sbi->options.isvfat ? FAT_LFN_LEN : 12; + buf->f_namelen = + (sbi->options.isvfat ? FAT_LFN_LEN : 12) * NLS_MAX_CHARSET_SIZE; return 0; } @@ -619,8 +620,8 @@ retry: bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits); if (!bh) { - printk(KERN_ERR "FAT: unable to read inode block " - "for updating (i_pos %lld)\n", i_pos); + fat_msg(sb, KERN_ERR, "unable to read inode block " + "for updating (i_pos %lld)", i_pos); return -EIO; } spin_lock(&sbi->inode_hash_lock); @@ -976,8 +977,8 @@ static const match_table_t vfat_tokens = { {Opt_err, NULL} }; -static int parse_options(char *options, int is_vfat, int silent, int *debug, - struct fat_mount_options *opts) +static int parse_options(struct super_block *sb, char *options, int is_vfat, + int silent, int *debug, struct fat_mount_options *opts) { char *p; substring_t args[MAX_OPT_ARGS]; @@ -1168,15 +1169,15 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, /* obsolete mount options */ case Opt_obsolate: - printk(KERN_INFO "FAT: \"%s\" option is obsolete, " - "not supported now\n", p); + fat_msg(sb, KERN_INFO, "\"%s\" option is obsolete, " + "not supported now", p); break; /* unknown option */ default: if (!silent) { - printk(KERN_ERR - "FAT: Unrecognized mount option \"%s\" " - "or missing value\n", p); + fat_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" " + "or missing value", p); } return -EINVAL; } @@ -1185,7 +1186,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, out: /* UTF-8 doesn't provide FAT semantics */ if (!strcmp(opts->iocharset, "utf8")) { - printk(KERN_ERR "FAT: utf8 is not a recommended IO charset" + fat_msg(sb, KERN_ERR, "utf8 is not a recommended IO charset" " for FAT filesystems, filesystem will be " "case sensitive!\n"); } @@ -1238,8 +1239,7 @@ static int fat_read_root(struct inode *inode) /* * Read the super block of an MS-DOS FS. */ -int fat_fill_super(struct super_block *sb, void *data, int silent, - const struct inode_operations *fs_dir_inode_ops, int isvfat, +int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, void (*setup)(struct super_block *)) { struct inode *root_inode = NULL, *fat_inode = NULL; @@ -1268,11 +1268,10 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, sb->s_magic = MSDOS_SUPER_MAGIC; sb->s_op = &fat_sops; sb->s_export_op = &fat_export_ops; - sbi->dir_ops = fs_dir_inode_ops; ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - error = parse_options(data, isvfat, silent, &debug, &sbi->options); + error = parse_options(sb, data, isvfat, silent, &debug, &sbi->options); if (error) goto out_fail; @@ -1282,20 +1281,20 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, sb_min_blocksize(sb, 512); bh = sb_bread(sb, 0); if (bh == NULL) { - printk(KERN_ERR "FAT: unable to read boot sector\n"); + fat_msg(sb, KERN_ERR, "unable to read boot sector"); goto out_fail; } b = (struct fat_boot_sector *) bh->b_data; if (!b->reserved) { if (!silent) - printk(KERN_ERR "FAT: bogus number of reserved sectors\n"); + fat_msg(sb, KERN_ERR, "bogus number of reserved sectors"); brelse(bh); goto out_invalid; } if (!b->fats) { if (!silent) - printk(KERN_ERR "FAT: bogus number of FAT structure\n"); + fat_msg(sb, KERN_ERR, "bogus number of FAT structure"); brelse(bh); goto out_invalid; } @@ -1308,7 +1307,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, media = b->media; if (!fat_valid_media(media)) { if (!silent) - printk(KERN_ERR "FAT: invalid media value (0x%02x)\n", + fat_msg(sb, KERN_ERR, "invalid media value (0x%02x)", media); brelse(bh); goto out_invalid; @@ -1318,7 +1317,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, || (logical_sector_size < 512) || (logical_sector_size > 4096)) { if (!silent) - printk(KERN_ERR "FAT: bogus logical sector size %u\n", + fat_msg(sb, KERN_ERR, "bogus logical sector size %u", logical_sector_size); brelse(bh); goto out_invalid; @@ -1326,15 +1325,15 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, sbi->sec_per_clus = b->sec_per_clus; if (!is_power_of_2(sbi->sec_per_clus)) { if (!silent) - printk(KERN_ERR "FAT: bogus sectors per cluster %u\n", + fat_msg(sb, KERN_ERR, "bogus sectors per cluster %u", sbi->sec_per_clus); brelse(bh); goto out_invalid; } if (logical_sector_size < sb->s_blocksize) { - printk(KERN_ERR "FAT: logical sector size too small for device" - " (logical sector size = %u)\n", logical_sector_size); + fat_msg(sb, KERN_ERR, "logical sector size too small for device" + " (logical sector size = %u)", logical_sector_size); brelse(bh); goto out_fail; } @@ -1342,14 +1341,14 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, brelse(bh); if (!sb_set_blocksize(sb, logical_sector_size)) { - printk(KERN_ERR "FAT: unable to set blocksize %u\n", + fat_msg(sb, KERN_ERR, "unable to set blocksize %u", logical_sector_size); goto out_fail; } bh = sb_bread(sb, 0); if (bh == NULL) { - printk(KERN_ERR "FAT: unable to read boot sector" - " (logical sector size = %lu)\n", + fat_msg(sb, KERN_ERR, "unable to read boot sector" + " (logical sector size = %lu)", sb->s_blocksize); goto out_fail; } @@ -1385,16 +1384,16 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, fsinfo_bh = sb_bread(sb, sbi->fsinfo_sector); if (fsinfo_bh == NULL) { - printk(KERN_ERR "FAT: bread failed, FSINFO block" - " (sector = %lu)\n", sbi->fsinfo_sector); + fat_msg(sb, KERN_ERR, "bread failed, FSINFO block" + " (sector = %lu)", sbi->fsinfo_sector); brelse(bh); goto out_fail; } fsinfo = (struct fat_boot_fsinfo *)fsinfo_bh->b_data; if (!IS_FSINFO(fsinfo)) { - printk(KERN_WARNING "FAT: Invalid FSINFO signature: " - "0x%08x, 0x%08x (sector = %lu)\n", + fat_msg(sb, KERN_WARNING, "Invalid FSINFO signature: " + "0x%08x, 0x%08x (sector = %lu)", le32_to_cpu(fsinfo->signature1), le32_to_cpu(fsinfo->signature2), sbi->fsinfo_sector); @@ -1415,8 +1414,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, sbi->dir_entries = get_unaligned_le16(&b->dir_entries); if (sbi->dir_entries & (sbi->dir_per_block - 1)) { if (!silent) - printk(KERN_ERR "FAT: bogus directroy-entries per block" - " (%u)\n", sbi->dir_entries); + fat_msg(sb, KERN_ERR, "bogus directroy-entries per block" + " (%u)", sbi->dir_entries); brelse(bh); goto out_invalid; } @@ -1438,7 +1437,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, total_clusters = min(total_clusters, fat_clusters - FAT_START_ENT); if (total_clusters > MAX_FAT(sb)) { if (!silent) - printk(KERN_ERR "FAT: count of clusters too big (%u)\n", + fat_msg(sb, KERN_ERR, "count of clusters too big (%u)", total_clusters); brelse(bh); goto out_invalid; @@ -1471,7 +1470,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, sprintf(buf, "cp%d", sbi->options.codepage); sbi->nls_disk = load_nls(buf); if (!sbi->nls_disk) { - printk(KERN_ERR "FAT: codepage %s not found\n", buf); + fat_msg(sb, KERN_ERR, "codepage %s not found", buf); goto out_fail; } @@ -1479,7 +1478,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, if (sbi->options.isvfat) { sbi->nls_io = load_nls(sbi->options.iocharset); if (!sbi->nls_io) { - printk(KERN_ERR "FAT: IO charset %s not found\n", + fat_msg(sb, KERN_ERR, "IO charset %s not found", sbi->options.iocharset); goto out_fail; } @@ -1503,7 +1502,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, insert_inode_hash(root_inode); sb->s_root = d_alloc_root(root_inode); if (!sb->s_root) { - printk(KERN_ERR "FAT: get root inode failed\n"); + fat_msg(sb, KERN_ERR, "get root inode failed"); goto out_fail; } @@ -1512,8 +1511,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, out_invalid: error = -EINVAL; if (!silent) - printk(KERN_INFO "VFS: Can't find a valid FAT filesystem" - " on dev %s.\n", sb->s_id); + fat_msg(sb, KERN_INFO, "Can't find a valid FAT filesystem"); out_fail: if (fat_inode) diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 970e682ea75..6d93360ca0c 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -20,30 +20,46 @@ * In case the file system is remounted read-only, it can be made writable * again by remounting it. */ -void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...) +void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) { - struct fat_mount_options *opts = &MSDOS_SB(s)->options; + struct fat_mount_options *opts = &MSDOS_SB(sb)->options; va_list args; + struct va_format vaf; if (report) { - printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id); - - printk(KERN_ERR " "); va_start(args, fmt); - vprintk(fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_ERR "FAT-fs (%s): error, %pV\n", sb->s_id, &vaf); va_end(args); - printk("\n"); } if (opts->errors == FAT_ERRORS_PANIC) - panic("FAT: fs panic from previous error\n"); - else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) { - s->s_flags |= MS_RDONLY; - printk(KERN_ERR "FAT: Filesystem has been set read-only\n"); + panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id); + else if (opts->errors == FAT_ERRORS_RO && !(sb->s_flags & MS_RDONLY)) { + sb->s_flags |= MS_RDONLY; + printk(KERN_ERR "FAT-fs (%s): Filesystem has been " + "set read-only\n", sb->s_id); } } EXPORT_SYMBOL_GPL(__fat_fs_error); +/** + * fat_msg() - print preformated FAT specific messages. Every thing what is + * not fat_fs_error() should be fat_msg(). + */ +void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk("%sFAT-fs (%s): %pV\n", level, sb->s_id, &vaf); + va_end(args); +} + /* Flushes the number of free clusters on FAT32 */ /* XXX: Need to write one per FSINFO block. Currently only writes 1 */ int fat_clusters_flush(struct super_block *sb) @@ -57,15 +73,15 @@ int fat_clusters_flush(struct super_block *sb) bh = sb_bread(sb, sbi->fsinfo_sector); if (bh == NULL) { - printk(KERN_ERR "FAT: bread failed in fat_clusters_flush\n"); + fat_msg(sb, KERN_ERR, "bread failed in fat_clusters_flush"); return -EIO; } fsinfo = (struct fat_boot_fsinfo *)bh->b_data; /* Sanity check */ if (!IS_FSINFO(fsinfo)) { - printk(KERN_ERR "FAT: Invalid FSINFO signature: " - "0x%08x, 0x%08x (sector = %lu)\n", + fat_msg(sb, KERN_ERR, "Invalid FSINFO signature: " + "0x%08x, 0x%08x (sector = %lu)", le32_to_cpu(fsinfo->signature1), le32_to_cpu(fsinfo->signature2), sbi->fsinfo_sector); diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 711499040eb..3b222dafd15 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -659,14 +659,14 @@ static const struct inode_operations msdos_dir_inode_operations = { static void setup(struct super_block *sb) { + MSDOS_SB(sb)->dir_ops = &msdos_dir_inode_operations; sb->s_d_op = &msdos_dentry_operations; sb->s_flags |= MS_NOATIME; } static int msdos_fill_super(struct super_block *sb, void *data, int silent) { - return fat_fill_super(sb, data, silent, &msdos_dir_inode_operations, - 0, setup); + return fat_fill_super(sb, data, silent, 0, setup); } static struct dentry *msdos_mount(struct file_system_type *fs_type, diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index adae3fb7451..20b4ea53fdc 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -1065,6 +1065,7 @@ static const struct inode_operations vfat_dir_inode_operations = { static void setup(struct super_block *sb) { + MSDOS_SB(sb)->dir_ops = &vfat_dir_inode_operations; if (MSDOS_SB(sb)->options.name_check != 's') sb->s_d_op = &vfat_ci_dentry_ops; else @@ -1073,8 +1074,7 @@ static void setup(struct super_block *sb) static int vfat_fill_super(struct super_block *sb, void *data, int silent) { - return fat_fill_super(sb, data, silent, &vfat_dir_inode_operations, - 1, setup); + return fat_fill_super(sb, data, silent, 1, setup); } static struct dentry *vfat_mount(struct file_system_type *fs_type, diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 2ba6719ac61..1a4311437a8 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -272,7 +272,7 @@ vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip) * *ip: VFS inode * * Description: - * vxfs_put_fake_inode frees all data asssociated with @ip. + * vxfs_put_fake_inode frees all data associated with @ip. */ void vxfs_put_fake_inode(struct inode *ip) diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 48a18f184d5..30afdfa7aec 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -33,8 +33,6 @@ void fscache_enqueue_operation(struct fscache_operation *op) _enter("{OBJ%x OP%x,%u}", op->object->debug_id, op->debug_id, atomic_read(&op->usage)); - fscache_set_op_state(op, "EnQ"); - ASSERT(list_empty(&op->pend_link)); ASSERT(op->processor != NULL); ASSERTCMP(op->object->state, >=, FSCACHE_OBJECT_AVAILABLE); @@ -66,8 +64,6 @@ EXPORT_SYMBOL(fscache_enqueue_operation); static void fscache_run_op(struct fscache_object *object, struct fscache_operation *op) { - fscache_set_op_state(op, "Run"); - object->n_in_progress++; if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) wake_up_bit(&op->flags, FSCACHE_OP_WAITING); @@ -88,8 +84,6 @@ int fscache_submit_exclusive_op(struct fscache_object *object, _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); - fscache_set_op_state(op, "SubmitX"); - spin_lock(&object->lock); ASSERTCMP(object->n_ops, >=, object->n_in_progress); ASSERTCMP(object->n_ops, >=, object->n_exclusive); @@ -194,8 +188,6 @@ int fscache_submit_op(struct fscache_object *object, ASSERTCMP(atomic_read(&op->usage), >, 0); - fscache_set_op_state(op, "Submit"); - spin_lock(&object->lock); ASSERTCMP(object->n_ops, >=, object->n_in_progress); ASSERTCMP(object->n_ops, >=, object->n_exclusive); @@ -335,8 +327,6 @@ void fscache_put_operation(struct fscache_operation *op) if (!atomic_dec_and_test(&op->usage)) return; - fscache_set_op_state(op, "Put"); - _debug("PUT OP"); if (test_and_set_bit(FSCACHE_OP_DEAD, &op->flags)) BUG(); diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 41c441c2058..a2a5d19ece6 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -155,11 +155,9 @@ static void fscache_attr_changed_op(struct fscache_operation *op) fscache_stat(&fscache_n_attr_changed_calls); if (fscache_object_is_active(object)) { - fscache_set_op_state(op, "CallFS"); fscache_stat(&fscache_n_cop_attr_changed); ret = object->cache->ops->attr_changed(object); fscache_stat_d(&fscache_n_cop_attr_changed); - fscache_set_op_state(op, "Done"); if (ret < 0) fscache_abort_object(object); } @@ -190,7 +188,6 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) fscache_operation_init(op, fscache_attr_changed_op, NULL); op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); - fscache_set_op_name(op, "Attr"); spin_lock(&cookie->lock); @@ -257,7 +254,6 @@ static struct fscache_retrieval *fscache_alloc_retrieval( op->context = context; op->start_time = jiffies; INIT_LIST_HEAD(&op->to_do); - fscache_set_op_name(&op->op, "Retr"); return op; } @@ -368,7 +364,6 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, _leave(" = -ENOMEM"); return -ENOMEM; } - fscache_set_op_name(&op->op, "RetrRA1"); spin_lock(&cookie->lock); @@ -487,7 +482,6 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, op = fscache_alloc_retrieval(mapping, end_io_func, context); if (!op) return -ENOMEM; - fscache_set_op_name(&op->op, "RetrRAN"); spin_lock(&cookie->lock); @@ -589,7 +583,6 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, op = fscache_alloc_retrieval(page->mapping, NULL, NULL); if (!op) return -ENOMEM; - fscache_set_op_name(&op->op, "RetrAL1"); spin_lock(&cookie->lock); @@ -662,8 +655,6 @@ static void fscache_write_op(struct fscache_operation *_op) _enter("{OP%x,%d}", op->op.debug_id, atomic_read(&op->op.usage)); - fscache_set_op_state(&op->op, "GetPage"); - spin_lock(&object->lock); cookie = object->cookie; @@ -698,15 +689,12 @@ static void fscache_write_op(struct fscache_operation *_op) spin_unlock(&cookie->stores_lock); spin_unlock(&object->lock); - fscache_set_op_state(&op->op, "Store"); fscache_stat(&fscache_n_store_pages); fscache_stat(&fscache_n_cop_write_page); ret = object->cache->ops->write_page(op, page); fscache_stat_d(&fscache_n_cop_write_page); - fscache_set_op_state(&op->op, "EndWrite"); fscache_end_page_write(object, page); if (ret < 0) { - fscache_set_op_state(&op->op, "Abort"); fscache_abort_object(object); } else { fscache_enqueue_operation(&op->op); @@ -778,7 +766,6 @@ int __fscache_write_page(struct fscache_cookie *cookie, fscache_operation_init(&op->op, fscache_write_op, fscache_release_write_op); op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING); - fscache_set_op_name(&op->op, "Write1"); ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); if (ret < 0) diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index f3d23ef4e87..86128202384 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -1,9 +1,9 @@ ccflags-y := -I$(src) obj-$(CONFIG_GFS2_FS) += gfs2.o gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \ - glops.o inode.o log.o lops.o main.o meta_io.o \ + glops.o log.o lops.o main.o meta_io.o \ aops.o dentry.o export.o file.o \ - ops_fstype.o ops_inode.o quota.o \ + ops_fstype.o inode.o quota.o \ recovery.o rgrp.o super.o sys.o trans.o util.o gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 0f5c4f9d5d6..802ac5eeba2 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -1076,8 +1076,8 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) bd = bh->b_private; if (bd && bd->bd_ail) goto cannot_release; - gfs2_assert_warn(sdp, !buffer_pinned(bh)); - gfs2_assert_warn(sdp, !buffer_dirty(bh)); + if (buffer_pinned(bh) || buffer_dirty(bh)) + goto not_possible; bh = bh->b_this_page; } while(bh != head); gfs2_log_unlock(sdp); @@ -1107,6 +1107,10 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask) } while (bh != head); return try_to_free_buffers(page); + +not_possible: /* Should never happen */ + WARN_ON(buffer_dirty(bh)); + WARN_ON(buffer_pinned(bh)); cannot_release: gfs2_log_unlock(sdp); return 0; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 74add2ddcc3..e65493a8ac0 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -780,6 +780,8 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, metadata = (height != ip->i_height - 1); if (metadata) revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; + else if (ip->i_depth) + revokes = sdp->sd_inptrs; if (ip != GFS2_I(sdp->sd_rindex)) error = gfs2_rindex_hold(sdp, &ip->i_alloc->al_ri_gh); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index f789c5732b7..091ee477953 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -82,12 +82,9 @@ struct qstr gfs2_qdot __read_mostly; struct qstr gfs2_qdotdot __read_mostly; -typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len, - u64 leaf_no, void *data); typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent, const struct qstr *name, void *opaque); - int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, struct buffer_head **bhp) { @@ -1600,7 +1597,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) */ int gfs2_dir_add(struct inode *inode, const struct qstr *name, - const struct gfs2_inode *nip, unsigned type) + const struct gfs2_inode *nip) { struct gfs2_inode *ip = GFS2_I(inode); struct buffer_head *bh; @@ -1616,7 +1613,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, return PTR_ERR(dent); dent = gfs2_init_dirent(inode, dent, name, bh); gfs2_inum_out(nip, dent); - dent->de_type = cpu_to_be16(type); + dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); if (ip->i_diskflags & GFS2_DIF_EXHASH) { leaf = (struct gfs2_leaf *)bh->b_data; be16_add_cpu(&leaf->lf_entries, 1); @@ -1628,6 +1625,8 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, gfs2_trans_add_bh(ip->i_gl, bh, 1); ip->i_entries++; ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; + if (S_ISDIR(nip->i_inode.i_mode)) + inc_nlink(&ip->i_inode); gfs2_dinode_out(ip, bh->b_data); brelse(bh); error = 0; @@ -1672,8 +1671,9 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, * Returns: 0 on success, error code on failure */ -int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name) +int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) { + const struct qstr *name = &dentry->d_name; struct gfs2_dirent *dent, *prev = NULL; struct buffer_head *bh; int error; @@ -1714,6 +1714,8 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name) gfs2_trans_add_bh(dip->i_gl, bh, 1); dip->i_entries--; dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; + if (S_ISDIR(dentry->d_inode->i_mode)) + drop_nlink(&dip->i_inode); gfs2_dinode_out(dip, bh->b_data); brelse(bh); mark_inode_dirty(&dip->i_inode); @@ -1768,94 +1770,20 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, } /** - * foreach_leaf - call a function for each leaf in a directory - * @dip: the directory - * @lc: the function to call for each each - * @data: private data to pass to it - * - * Returns: errno - */ - -static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data) -{ - struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); - struct buffer_head *bh; - struct gfs2_leaf *leaf; - u32 hsize, len; - u32 ht_offset, lp_offset, ht_offset_cur = -1; - u32 index = 0; - __be64 *lp; - u64 leaf_no; - int error = 0; - - hsize = 1 << dip->i_depth; - if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) { - gfs2_consist_inode(dip); - return -EIO; - } - - lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS); - if (!lp) - return -ENOMEM; - - while (index < hsize) { - lp_offset = index & (sdp->sd_hash_ptrs - 1); - ht_offset = index - lp_offset; - - if (ht_offset_cur != ht_offset) { - error = gfs2_dir_read_data(dip, (char *)lp, - ht_offset * sizeof(__be64), - sdp->sd_hash_bsize, 1); - if (error != sdp->sd_hash_bsize) { - if (error >= 0) - error = -EIO; - goto out; - } - ht_offset_cur = ht_offset; - } - - leaf_no = be64_to_cpu(lp[lp_offset]); - if (leaf_no) { - error = get_leaf(dip, leaf_no, &bh); - if (error) - goto out; - leaf = (struct gfs2_leaf *)bh->b_data; - len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth)); - brelse(bh); - - error = lc(dip, index, len, leaf_no, data); - if (error) - goto out; - - index = (index & ~(len - 1)) + len; - } else - index++; - } - - if (index != hsize) { - gfs2_consist_inode(dip); - error = -EIO; - } - -out: - kfree(lp); - - return error; -} - -/** * leaf_dealloc - Deallocate a directory leaf * @dip: the directory * @index: the hash table offset in the directory * @len: the number of pointers to this leaf * @leaf_no: the leaf number - * @data: not used + * @leaf_bh: buffer_head for the starting leaf + * last_dealloc: 1 if this is the final dealloc for the leaf, else 0 * * Returns: errno */ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, - u64 leaf_no, void *data) + u64 leaf_no, struct buffer_head *leaf_bh, + int last_dealloc) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_leaf *tmp_leaf; @@ -1887,14 +1815,18 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, goto out_qs; /* Count the number of leaves */ + bh = leaf_bh; for (blk = leaf_no; blk; blk = nblk) { - error = get_leaf(dip, blk, &bh); - if (error) - goto out_rlist; + if (blk != leaf_no) { + error = get_leaf(dip, blk, &bh); + if (error) + goto out_rlist; + } tmp_leaf = (struct gfs2_leaf *)bh->b_data; nblk = be64_to_cpu(tmp_leaf->lf_next); - brelse(bh); + if (blk != leaf_no) + brelse(bh); gfs2_rlist_add(sdp, &rlist, blk); l_blocks++; @@ -1918,13 +1850,18 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, if (error) goto out_rg_gunlock; + bh = leaf_bh; + for (blk = leaf_no; blk; blk = nblk) { - error = get_leaf(dip, blk, &bh); - if (error) - goto out_end_trans; + if (blk != leaf_no) { + error = get_leaf(dip, blk, &bh); + if (error) + goto out_end_trans; + } tmp_leaf = (struct gfs2_leaf *)bh->b_data; nblk = be64_to_cpu(tmp_leaf->lf_next); - brelse(bh); + if (blk != leaf_no) + brelse(bh); gfs2_free_meta(dip, blk, 1); gfs2_add_inode_blocks(&dip->i_inode, -1); @@ -1942,6 +1879,10 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, goto out_end_trans; gfs2_trans_add_bh(dip->i_gl, dibh, 1); + /* On the last dealloc, make this a regular file in case we crash. + (We don't want to free these blocks a second time.) */ + if (last_dealloc) + dip->i_inode.i_mode = S_IFREG; gfs2_dinode_out(dip, dibh->b_data); brelse(dibh); @@ -1975,29 +1916,67 @@ int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct buffer_head *bh; - int error; + struct gfs2_leaf *leaf; + u32 hsize, len; + u32 ht_offset, lp_offset, ht_offset_cur = -1; + u32 index = 0, next_index; + __be64 *lp; + u64 leaf_no; + int error = 0, last; - /* Dealloc on-disk leaves to FREEMETA state */ - error = foreach_leaf(dip, leaf_dealloc, NULL); - if (error) - return error; + hsize = 1 << dip->i_depth; + if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) { + gfs2_consist_inode(dip); + return -EIO; + } - /* Make this a regular file in case we crash. - (We don't want to free these blocks a second time.) */ + lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS); + if (!lp) + return -ENOMEM; - error = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (error) - return error; + while (index < hsize) { + lp_offset = index & (sdp->sd_hash_ptrs - 1); + ht_offset = index - lp_offset; - error = gfs2_meta_inode_buffer(dip, &bh); - if (!error) { - gfs2_trans_add_bh(dip->i_gl, bh, 1); - ((struct gfs2_dinode *)bh->b_data)->di_mode = - cpu_to_be32(S_IFREG); - brelse(bh); + if (ht_offset_cur != ht_offset) { + error = gfs2_dir_read_data(dip, (char *)lp, + ht_offset * sizeof(__be64), + sdp->sd_hash_bsize, 1); + if (error != sdp->sd_hash_bsize) { + if (error >= 0) + error = -EIO; + goto out; + } + ht_offset_cur = ht_offset; + } + + leaf_no = be64_to_cpu(lp[lp_offset]); + if (leaf_no) { + error = get_leaf(dip, leaf_no, &bh); + if (error) + goto out; + leaf = (struct gfs2_leaf *)bh->b_data; + len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth)); + + next_index = (index & ~(len - 1)) + len; + last = ((next_index >= hsize) ? 1 : 0); + error = leaf_dealloc(dip, index, len, leaf_no, bh, + last); + brelse(bh); + if (error) + goto out; + index = next_index; + } else + index++; } - gfs2_trans_end(sdp); + if (index != hsize) { + gfs2_consist_inode(dip); + error = -EIO; + } + +out: + kfree(lp); return error; } diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index a98f644bd3d..e686af11bec 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -22,8 +22,8 @@ extern struct inode *gfs2_dir_search(struct inode *dir, extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, const struct gfs2_inode *ip); extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, - const struct gfs2_inode *ip, unsigned int type); -extern int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); + const struct gfs2_inode *ip); +extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, filldir_t filldir); extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index b5a5e60df0d..fe9945f2ff7 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -139,7 +139,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, struct gfs2_sbd *sdp = sb->s_fs_info; struct inode *inode; - inode = gfs2_ilookup(sb, inum->no_addr); + inode = gfs2_ilookup(sb, inum->no_addr, 0); if (inode) { if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { iput(inode); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index e48310885c4..a9f5cbe45cd 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -545,18 +545,10 @@ static int gfs2_close(struct inode *inode, struct file *file) /** * gfs2_fsync - sync the dirty data for a file (across the cluster) * @file: the file that points to the dentry (we ignore this) - * @dentry: the dentry that points to the inode to sync + * @datasync: set if we can ignore timestamp changes * - * The VFS will flush "normal" data for us. We only need to worry - * about metadata here. For journaled data, we just do a log flush - * as we can't avoid it. Otherwise we can just bale out if datasync - * is set. For stuffed inodes we must flush the log in order to - * ensure that all data is on disk. - * - * The call to write_inode_now() is there to write back metadata and - * the inode itself. It does also try and write the data, but thats - * (hopefully) a no-op due to the VFS having already called filemap_fdatawrite() - * for us. + * The VFS will flush data for us. We only need to worry + * about metadata here. * * Returns: errno */ @@ -565,22 +557,20 @@ static int gfs2_fsync(struct file *file, int datasync) { struct inode *inode = file->f_mapping->host; int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); - int ret = 0; - - if (gfs2_is_jdata(GFS2_I(inode))) { - gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl); - return 0; - } + struct gfs2_inode *ip = GFS2_I(inode); + int ret; - if (sync_state != 0) { - if (!datasync) - ret = write_inode_now(inode, 0); + if (datasync) + sync_state &= ~I_DIRTY_SYNC; - if (gfs2_is_stuffed(GFS2_I(inode))) - gfs2_log_flush(GFS2_SB(inode), GFS2_I(inode)->i_gl); + if (sync_state) { + ret = sync_inode_metadata(inode, 1); + if (ret) + return ret; + gfs2_ail_flush(ip->i_gl); } - return ret; + return 0; } /** @@ -826,6 +816,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t bytes, max_bytes; struct gfs2_alloc *al; int error; + loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; next = (next + 1) << sdp->sd_sb.sb_bsize_shift; @@ -833,13 +824,15 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, if (mode & ~FALLOC_FL_KEEP_SIZE) return -EOPNOTSUPP; - offset = (offset >> sdp->sd_sb.sb_bsize_shift) << - sdp->sd_sb.sb_bsize_shift; + offset &= bsize_mask; len = next - offset; bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; if (!bytes) bytes = UINT_MAX; + bytes &= bsize_mask; + if (bytes == 0) + bytes = sdp->sd_sb.sb_bsize; gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); error = gfs2_glock_nq(&ip->i_gh); @@ -870,6 +863,9 @@ retry: if (error) { if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { bytes >>= 1; + bytes &= bsize_mask; + if (bytes == 0) + bytes = sdp->sd_sb.sb_bsize; goto retry; } goto out_qunlock; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 7a4fb630a32..2792a790e50 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -143,14 +143,9 @@ static int demote_ok(const struct gfs2_glock *gl) { const struct gfs2_glock_operations *glops = gl->gl_ops; - /* assert_spin_locked(&gl->gl_spin); */ - if (gl->gl_state == LM_ST_UNLOCKED) return 0; - if (test_bit(GLF_LFLUSH, &gl->gl_flags)) - return 0; - if ((gl->gl_name.ln_type != LM_TYPE_INODE) && - !list_empty(&gl->gl_holders)) + if (!list_empty(&gl->gl_holders)) return 0; if (glops->go_demote_ok) return glops->go_demote_ok(gl); @@ -158,6 +153,31 @@ static int demote_ok(const struct gfs2_glock *gl) } +void gfs2_glock_add_to_lru(struct gfs2_glock *gl) +{ + spin_lock(&lru_lock); + + if (!list_empty(&gl->gl_lru)) + list_del_init(&gl->gl_lru); + else + atomic_inc(&lru_count); + + list_add_tail(&gl->gl_lru, &lru_list); + set_bit(GLF_LRU, &gl->gl_flags); + spin_unlock(&lru_lock); +} + +static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) +{ + spin_lock(&lru_lock); + if (!list_empty(&gl->gl_lru)) { + list_del_init(&gl->gl_lru); + atomic_dec(&lru_count); + clear_bit(GLF_LRU, &gl->gl_flags); + } + spin_unlock(&lru_lock); +} + /** * __gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list * @gl: the glock @@ -168,24 +188,8 @@ static int demote_ok(const struct gfs2_glock *gl) static void __gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) { - if (demote_ok(gl)) { - spin_lock(&lru_lock); - - if (!list_empty(&gl->gl_lru)) - list_del_init(&gl->gl_lru); - else - atomic_inc(&lru_count); - - list_add_tail(&gl->gl_lru, &lru_list); - spin_unlock(&lru_lock); - } -} - -void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) -{ - spin_lock(&gl->gl_spin); - __gfs2_glock_schedule_for_reclaim(gl); - spin_unlock(&gl->gl_spin); + if (demote_ok(gl)) + gfs2_glock_add_to_lru(gl); } /** @@ -217,12 +221,7 @@ void gfs2_glock_put(struct gfs2_glock *gl) spin_lock_bucket(gl->gl_hash); hlist_bl_del_rcu(&gl->gl_list); spin_unlock_bucket(gl->gl_hash); - spin_lock(&lru_lock); - if (!list_empty(&gl->gl_lru)) { - list_del_init(&gl->gl_lru); - atomic_dec(&lru_count); - } - spin_unlock(&lru_lock); + gfs2_glock_remove_from_lru(gl); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); GLOCK_BUG_ON(gl, mapping && mapping->nrpages); trace_gfs2_glock_put(gl); @@ -542,11 +541,6 @@ __acquires(&gl->gl_spin) clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); gfs2_glock_hold(gl); - if (target != LM_ST_UNLOCKED && (gl->gl_state == LM_ST_SHARED || - gl->gl_state == LM_ST_DEFERRED) && - !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) - lck_flags |= LM_FLAG_TRY_1CB; - if (sdp->sd_lockstruct.ls_ops->lm_lock) { /* lock_dlm */ ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); @@ -648,7 +642,7 @@ static void delete_work_func(struct work_struct *work) /* Note: Unsafe to dereference ip as we don't hold right refs/locks */ if (ip) - inode = gfs2_ilookup(sdp->sd_vfs, no_addr); + inode = gfs2_ilookup(sdp->sd_vfs, no_addr, 1); else inode = gfs2_lookup_by_inum(sdp, no_addr, NULL, GFS2_BLKST_UNLINKED); if (inode && !IS_ERR(inode)) { @@ -1025,6 +1019,9 @@ int gfs2_glock_nq(struct gfs2_holder *gh) if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) return -EIO; + if (test_bit(GLF_LRU, &gl->gl_flags)) + gfs2_glock_remove_from_lru(gl); + spin_lock(&gl->gl_spin); add_to_queue(gh); if ((LM_FLAG_NOEXP & gh->gh_flags) && @@ -1082,7 +1079,8 @@ void gfs2_glock_dq(struct gfs2_holder *gh) !test_bit(GLF_DEMOTE, &gl->gl_flags)) fast_path = 1; } - __gfs2_glock_schedule_for_reclaim(gl); + if (!test_bit(GLF_LFLUSH, &gl->gl_flags)) + __gfs2_glock_schedule_for_reclaim(gl); trace_gfs2_glock_queue(gh, 0); spin_unlock(&gl->gl_spin); if (likely(fast_path)) @@ -1348,11 +1346,14 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) } -static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) +static int gfs2_shrink_glock_memory(struct shrinker *shrink, + struct shrink_control *sc) { struct gfs2_glock *gl; int may_demote; int nr_skipped = 0; + int nr = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; LIST_HEAD(skipped); if (nr == 0) @@ -1365,6 +1366,7 @@ static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_m while(nr && !list_empty(&lru_list)) { gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); list_del_init(&gl->gl_lru); + clear_bit(GLF_LRU, &gl->gl_flags); atomic_dec(&lru_count); /* Test for being demotable */ @@ -1387,6 +1389,7 @@ static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_m } nr_skipped++; list_add(&gl->gl_lru, &skipped); + set_bit(GLF_LRU, &gl->gl_flags); } list_splice(&skipped, &lru_list); atomic_add(nr_skipped, &lru_count); @@ -1459,12 +1462,7 @@ static void thaw_glock(struct gfs2_glock *gl) static void clear_glock(struct gfs2_glock *gl) { - spin_lock(&lru_lock); - if (!list_empty(&gl->gl_lru)) { - list_del_init(&gl->gl_lru); - atomic_dec(&lru_count); - } - spin_unlock(&lru_lock); + gfs2_glock_remove_from_lru(gl); spin_lock(&gl->gl_spin); if (gl->gl_state != LM_ST_UNLOCKED) @@ -1599,9 +1597,11 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) return 0; } -static const char *gflags2str(char *buf, const unsigned long *gflags) +static const char *gflags2str(char *buf, const struct gfs2_glock *gl) { + const unsigned long *gflags = &gl->gl_flags; char *p = buf; + if (test_bit(GLF_LOCK, gflags)) *p++ = 'l'; if (test_bit(GLF_DEMOTE, gflags)) @@ -1624,6 +1624,10 @@ static const char *gflags2str(char *buf, const unsigned long *gflags) *p++ = 'F'; if (test_bit(GLF_QUEUED, gflags)) *p++ = 'q'; + if (test_bit(GLF_LRU, gflags)) + *p++ = 'L'; + if (gl->gl_object) + *p++ = 'o'; *p = 0; return buf; } @@ -1658,14 +1662,15 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) dtime *= 1000000/HZ; /* demote time in uSec */ if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) dtime = 0; - gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d r:%d\n", + gfs2_print_dbg(seq, "G: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d v:%d r:%d\n", state2str(gl->gl_state), gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number, - gflags2str(gflags_buf, &gl->gl_flags), + gflags2str(gflags_buf, gl), state2str(gl->gl_target), state2str(gl->gl_demote_state), dtime, atomic_read(&gl->gl_ail_count), + atomic_read(&gl->gl_revokes), atomic_read(&gl->gl_ref)); list_for_each_entry(gh, &gl->gl_holders, gh_list) { diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index aea160690e9..6b2f757b928 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -225,11 +225,10 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret); -extern void gfs2_reclaim_glock(struct gfs2_sbd *sdp); extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip); extern void gfs2_glock_thaw(struct gfs2_sbd *sdp); -extern void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl); +extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl); extern void gfs2_glock_free(struct gfs2_glock *gl); extern int __init gfs2_glock_init(void); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 25eeb2bcee4..8ef70f46473 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -28,33 +28,18 @@ #include "trans.h" /** - * ail_empty_gl - remove all buffers for a given lock from the AIL + * __gfs2_ail_flush - remove all buffers for a given lock from the AIL * @gl: the glock * * None of the buffers should be dirty, locked, or pinned. */ -static void gfs2_ail_empty_gl(struct gfs2_glock *gl) +static void __gfs2_ail_flush(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; struct list_head *head = &gl->gl_ail_list; struct gfs2_bufdata *bd; struct buffer_head *bh; - struct gfs2_trans tr; - - memset(&tr, 0, sizeof(tr)); - tr.tr_revokes = atomic_read(&gl->gl_ail_count); - - if (!tr.tr_revokes) - return; - - /* A shortened, inline version of gfs2_trans_begin() */ - tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64)); - tr.tr_ip = (unsigned long)__builtin_return_address(0); - INIT_LIST_HEAD(&tr.tr_list_buf); - gfs2_log_reserve(sdp, tr.tr_reserved); - BUG_ON(current->journal_info); - current->journal_info = &tr; spin_lock(&sdp->sd_ail_lock); while (!list_empty(head)) { @@ -76,7 +61,47 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) } gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); spin_unlock(&sdp->sd_ail_lock); +} + + +static void gfs2_ail_empty_gl(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_trans tr; + + memset(&tr, 0, sizeof(tr)); + tr.tr_revokes = atomic_read(&gl->gl_ail_count); + + if (!tr.tr_revokes) + return; + + /* A shortened, inline version of gfs2_trans_begin() */ + tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64)); + tr.tr_ip = (unsigned long)__builtin_return_address(0); + INIT_LIST_HEAD(&tr.tr_list_buf); + gfs2_log_reserve(sdp, tr.tr_reserved); + BUG_ON(current->journal_info); + current->journal_info = &tr; + + __gfs2_ail_flush(gl); + + gfs2_trans_end(sdp); + gfs2_log_flush(sdp, NULL); +} + +void gfs2_ail_flush(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + unsigned int revokes = atomic_read(&gl->gl_ail_count); + int ret; + + if (!revokes) + return; + ret = gfs2_trans_begin(sdp, 0, revokes); + if (ret) + return; + __gfs2_ail_flush(gl); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL); } @@ -227,6 +252,119 @@ static int inode_go_demote_ok(const struct gfs2_glock *gl) } /** + * gfs2_set_nlink - Set the inode's link count based on on-disk info + * @inode: The inode in question + * @nlink: The link count + * + * If the link count has hit zero, it must never be raised, whatever the + * on-disk inode might say. When new struct inodes are created the link + * count is set to 1, so that we can safely use this test even when reading + * in on disk information for the first time. + */ + +static void gfs2_set_nlink(struct inode *inode, u32 nlink) +{ + /* + * We will need to review setting the nlink count here in the + * light of the forthcoming ro bind mount work. This is a reminder + * to do that. + */ + if ((inode->i_nlink != nlink) && (inode->i_nlink != 0)) { + if (nlink == 0) + clear_nlink(inode); + else + inode->i_nlink = nlink; + } +} + +static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) +{ + const struct gfs2_dinode *str = buf; + struct timespec atime; + u16 height, depth; + + if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) + goto corrupt; + ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino); + ip->i_inode.i_mode = be32_to_cpu(str->di_mode); + ip->i_inode.i_rdev = 0; + switch (ip->i_inode.i_mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major), + be32_to_cpu(str->di_minor)); + break; + }; + + ip->i_inode.i_uid = be32_to_cpu(str->di_uid); + ip->i_inode.i_gid = be32_to_cpu(str->di_gid); + gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink)); + i_size_write(&ip->i_inode, be64_to_cpu(str->di_size)); + gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); + atime.tv_sec = be64_to_cpu(str->di_atime); + atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); + if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0) + ip->i_inode.i_atime = atime; + ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); + ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); + ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime); + ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); + + ip->i_goal = be64_to_cpu(str->di_goal_meta); + ip->i_generation = be64_to_cpu(str->di_generation); + + ip->i_diskflags = be32_to_cpu(str->di_flags); + gfs2_set_inode_flags(&ip->i_inode); + height = be16_to_cpu(str->di_height); + if (unlikely(height > GFS2_MAX_META_HEIGHT)) + goto corrupt; + ip->i_height = (u8)height; + + depth = be16_to_cpu(str->di_depth); + if (unlikely(depth > GFS2_DIR_MAX_DEPTH)) + goto corrupt; + ip->i_depth = (u8)depth; + ip->i_entries = be32_to_cpu(str->di_entries); + + ip->i_eattr = be64_to_cpu(str->di_eattr); + if (S_ISREG(ip->i_inode.i_mode)) + gfs2_set_aops(&ip->i_inode); + + return 0; +corrupt: + gfs2_consist_inode(ip); + return -EIO; +} + +/** + * gfs2_inode_refresh - Refresh the incore copy of the dinode + * @ip: The GFS2 inode + * + * Returns: errno + */ + +int gfs2_inode_refresh(struct gfs2_inode *ip) +{ + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) { + brelse(dibh); + return -EIO; + } + + error = gfs2_dinode_in(ip, dibh->b_data); + brelse(dibh); + clear_bit(GIF_INVALID, &ip->i_flags); + + return error; +} + +/** * inode_go_lock - operation done after an inode lock is locked by a process * @gl: the glock * @flags: diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h index b3aa2e3210f..6fce409b5a5 100644 --- a/fs/gfs2/glops.h +++ b/fs/gfs2/glops.h @@ -23,4 +23,6 @@ extern const struct gfs2_glock_operations gfs2_quota_glops; extern const struct gfs2_glock_operations gfs2_journal_glops; extern const struct gfs2_glock_operations *gfs2_glops_list[]; +extern void gfs2_ail_flush(struct gfs2_glock *gl); + #endif /* __GLOPS_DOT_H__ */ diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 870a89d6d4d..0a064e91ac7 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -20,7 +20,6 @@ #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 -#define DIO_ALL 0x00000100 struct gfs2_log_operations; struct gfs2_log_element; @@ -200,6 +199,8 @@ enum { GLF_INITIAL = 10, GLF_FROZEN = 11, GLF_QUEUED = 12, + GLF_LRU = 13, + GLF_OBJECT = 14, /* Used only for tracing */ }; struct gfs2_glock { @@ -234,6 +235,7 @@ struct gfs2_glock { struct list_head gl_ail_list; atomic_t gl_ail_count; + atomic_t gl_revokes; struct delayed_work gl_work; struct work_struct gl_delete; struct rcu_head gl_rcu; @@ -374,8 +376,6 @@ struct gfs2_ail { unsigned int ai_first; struct list_head ai_ail1_list; struct list_head ai_ail2_list; - - u64 ai_sync_gen; }; struct gfs2_journal_extent { @@ -488,7 +488,6 @@ struct gfs2_sb_host { char sb_lockproto[GFS2_LOCKNAME_LEN]; char sb_locktable[GFS2_LOCKNAME_LEN]; - u8 sb_uuid[16]; }; /* @@ -654,7 +653,6 @@ struct gfs2_sbd { spinlock_t sd_ail_lock; struct list_head sd_ail1_list; struct list_head sd_ail2_list; - u64 sd_ail_sync_gen; /* Replay stuff */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 9134dcb8947..03e0c529063 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1,23 +1,25 @@ /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + * Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License version 2. */ -#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> +#include <linux/namei.h> +#include <linux/mm.h> +#include <linux/xattr.h> #include <linux/posix_acl.h> -#include <linux/sort.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> +#include <linux/fiemap.h> #include <linux/security.h> -#include <linux/time.h> +#include <asm/uaccess.h> #include "gfs2.h" #include "incore.h" @@ -26,19 +28,14 @@ #include "dir.h" #include "xattr.h" #include "glock.h" -#include "glops.h" #include "inode.h" -#include "log.h" #include "meta_io.h" #include "quota.h" #include "rgrp.h" #include "trans.h" #include "util.h" - -struct gfs2_inum_range_host { - u64 ir_start; - u64 ir_length; -}; +#include "super.h" +#include "glops.h" struct gfs2_skip_data { u64 no_addr; @@ -74,14 +71,14 @@ static int iget_set(struct inode *inode, void *opaque) return 0; } -struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr) +struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int non_block) { unsigned long hash = (unsigned long)no_addr; struct gfs2_skip_data data; data.no_addr = no_addr; data.skipped = 0; - data.non_block = 0; + data.non_block = non_block; return ilookup5(sb, hash, iget_test, &data); } @@ -248,203 +245,6 @@ fail_iput: goto fail; } -static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) -{ - const struct gfs2_dinode *str = buf; - struct timespec atime; - u16 height, depth; - - if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) - goto corrupt; - ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino); - ip->i_inode.i_mode = be32_to_cpu(str->di_mode); - ip->i_inode.i_rdev = 0; - switch (ip->i_inode.i_mode & S_IFMT) { - case S_IFBLK: - case S_IFCHR: - ip->i_inode.i_rdev = MKDEV(be32_to_cpu(str->di_major), - be32_to_cpu(str->di_minor)); - break; - }; - - ip->i_inode.i_uid = be32_to_cpu(str->di_uid); - ip->i_inode.i_gid = be32_to_cpu(str->di_gid); - /* - * We will need to review setting the nlink count here in the - * light of the forthcoming ro bind mount work. This is a reminder - * to do that. - */ - ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); - i_size_write(&ip->i_inode, be64_to_cpu(str->di_size)); - gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); - atime.tv_sec = be64_to_cpu(str->di_atime); - atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); - if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0) - ip->i_inode.i_atime = atime; - ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); - ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); - ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime); - ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); - - ip->i_goal = be64_to_cpu(str->di_goal_meta); - ip->i_generation = be64_to_cpu(str->di_generation); - - ip->i_diskflags = be32_to_cpu(str->di_flags); - gfs2_set_inode_flags(&ip->i_inode); - height = be16_to_cpu(str->di_height); - if (unlikely(height > GFS2_MAX_META_HEIGHT)) - goto corrupt; - ip->i_height = (u8)height; - - depth = be16_to_cpu(str->di_depth); - if (unlikely(depth > GFS2_DIR_MAX_DEPTH)) - goto corrupt; - ip->i_depth = (u8)depth; - ip->i_entries = be32_to_cpu(str->di_entries); - - ip->i_eattr = be64_to_cpu(str->di_eattr); - if (S_ISREG(ip->i_inode.i_mode)) - gfs2_set_aops(&ip->i_inode); - - return 0; -corrupt: - if (gfs2_consist_inode(ip)) - gfs2_dinode_print(ip); - return -EIO; -} - -/** - * gfs2_inode_refresh - Refresh the incore copy of the dinode - * @ip: The GFS2 inode - * - * Returns: errno - */ - -int gfs2_inode_refresh(struct gfs2_inode *ip) -{ - struct buffer_head *dibh; - int error; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - return error; - - if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), dibh, GFS2_METATYPE_DI)) { - brelse(dibh); - return -EIO; - } - - error = gfs2_dinode_in(ip, dibh->b_data); - brelse(dibh); - clear_bit(GIF_INVALID, &ip->i_flags); - - return error; -} - -int gfs2_dinode_dealloc(struct gfs2_inode *ip) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_alloc *al; - struct gfs2_rgrpd *rgd; - int error; - - if (gfs2_get_inode_blocks(&ip->i_inode) != 1) { - if (gfs2_consist_inode(ip)) - gfs2_dinode_print(ip); - return -EIO; - } - - al = gfs2_alloc_get(ip); - if (!al) - return -ENOMEM; - - error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); - if (error) - goto out; - - error = gfs2_rindex_hold(sdp, &al->al_ri_gh); - if (error) - goto out_qs; - - rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); - if (!rgd) { - gfs2_consist_inode(ip); - error = -EIO; - goto out_rindex_relse; - } - - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, - &al->al_rgd_gh); - if (error) - goto out_rindex_relse; - - error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, 1); - if (error) - goto out_rg_gunlock; - - set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); - set_bit(GLF_LFLUSH, &ip->i_gl->gl_flags); - - gfs2_free_di(rgd, ip); - - gfs2_trans_end(sdp); - -out_rg_gunlock: - gfs2_glock_dq_uninit(&al->al_rgd_gh); -out_rindex_relse: - gfs2_glock_dq_uninit(&al->al_ri_gh); -out_qs: - gfs2_quota_unhold(ip); -out: - gfs2_alloc_put(ip); - return error; -} - -/** - * gfs2_change_nlink - Change nlink count on inode - * @ip: The GFS2 inode - * @diff: The change in the nlink count required - * - * Returns: errno - */ -int gfs2_change_nlink(struct gfs2_inode *ip, int diff) -{ - struct buffer_head *dibh; - u32 nlink; - int error; - - BUG_ON(diff != 1 && diff != -1); - nlink = ip->i_inode.i_nlink + diff; - - /* If we are reducing the nlink count, but the new value ends up being - bigger than the old one, we must have underflowed. */ - if (diff < 0 && nlink > ip->i_inode.i_nlink) { - if (gfs2_consist_inode(ip)) - gfs2_dinode_print(ip); - return -EIO; - } - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - return error; - - if (diff > 0) - inc_nlink(&ip->i_inode); - else - drop_nlink(&ip->i_inode); - - ip->i_inode.i_ctime = CURRENT_TIME; - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - mark_inode_dirty(&ip->i_inode); - - if (ip->i_inode.i_nlink == 0) - gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */ - - return error; -} struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) { @@ -543,7 +343,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, /* Don't create entries in an unlinked directory */ if (!dip->i_inode.i_nlink) - return -EPERM; + return -ENOENT; error = gfs2_dir_check(&dip->i_inode, name, NULL); switch (error) { @@ -613,21 +413,44 @@ out: return error; } +static void gfs2_init_dir(struct buffer_head *dibh, + const struct gfs2_inode *parent) +{ + struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; + struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1); + + gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent); + dent->de_inum = di->di_num; /* already GFS2 endian */ + dent->de_type = cpu_to_be16(DT_DIR); + + dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); + gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); + gfs2_inum_out(parent, dent); + dent->de_type = cpu_to_be16(DT_DIR); + +} + /** * init_dinode - Fill in a new dinode structure - * @dip: the directory this inode is being created in + * @dip: The directory this inode is being created in * @gl: The glock covering the new inode - * @inum: the inode number - * @mode: the file permissions - * @uid: - * @gid: + * @inum: The inode number + * @mode: The file permissions + * @uid: The uid of the new inode + * @gid: The gid of the new inode + * @generation: The generation number of the new inode + * @dev: The device number (if a device node) + * @symname: The symlink destination (if a symlink) + * @size: The inode size (ignored for directories) + * @bhp: The buffer head (returned to caller) * */ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, const struct gfs2_inum_host *inum, unsigned int mode, unsigned int uid, unsigned int gid, - const u64 *generation, dev_t dev, struct buffer_head **bhp) + const u64 *generation, dev_t dev, const char *symname, + unsigned size, struct buffer_head **bhp) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_dinode *di; @@ -646,7 +469,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, di->di_uid = cpu_to_be32(uid); di->di_gid = cpu_to_be32(gid); di->di_nlink = 0; - di->di_size = 0; + di->di_size = cpu_to_be64(size); di->di_blocks = cpu_to_be64(1); di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec); di->di_major = cpu_to_be32(MAJOR(dev)); @@ -654,16 +477,6 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr); di->di_generation = cpu_to_be64(*generation); di->di_flags = 0; - - if (S_ISREG(mode)) { - if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || - gfs2_tune_get(sdp, gt_new_files_jdata)) - di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); - } else if (S_ISDIR(mode)) { - di->di_flags |= cpu_to_be32(dip->i_diskflags & - GFS2_DIF_INHERIT_JDATA); - } - di->__pad1 = 0; di->di_payload_format = cpu_to_be32(S_ISDIR(mode) ? GFS2_FORMAT_DE : 0); di->di_height = 0; @@ -677,7 +490,26 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, di->di_mtime_nsec = cpu_to_be32(tv.tv_nsec); di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec); memset(&di->di_reserved, 0, sizeof(di->di_reserved)); - + + switch(mode & S_IFMT) { + case S_IFREG: + if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || + gfs2_tune_get(sdp, gt_new_files_jdata)) + di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); + break; + case S_IFDIR: + di->di_flags |= cpu_to_be32(dip->i_diskflags & + GFS2_DIF_INHERIT_JDATA); + di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); + di->di_size = cpu_to_be64(sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)); + di->di_entries = cpu_to_be32(2); + gfs2_init_dir(dibh, dip); + break; + case S_IFLNK: + memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, size); + break; + } + set_buffer_uptodate(dibh); *bhp = dibh; @@ -685,7 +517,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, unsigned int mode, const struct gfs2_inum_host *inum, - const u64 *generation, dev_t dev, struct buffer_head **bhp) + const u64 *generation, dev_t dev, const char *symname, + unsigned int size, struct buffer_head **bhp) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); unsigned int uid, gid; @@ -707,7 +540,7 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, if (error) goto out_quota; - init_dinode(dip, gl, inum, mode, uid, gid, generation, dev, bhp); + init_dinode(dip, gl, inum, mode, uid, gid, generation, dev, symname, size, bhp); gfs2_quota_change(dip, +1, uid, gid); gfs2_trans_end(sdp); @@ -761,14 +594,16 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, goto fail_quota_locks; } - error = gfs2_dir_add(&dip->i_inode, name, ip, IF2DT(ip->i_inode.i_mode)); + error = gfs2_dir_add(&dip->i_inode, name, ip); if (error) goto fail_end_trans; error = gfs2_meta_inode_buffer(ip, &dibh); if (error) goto fail_end_trans; - ip->i_inode.i_nlink = 1; + inc_nlink(&ip->i_inode); + if (S_ISDIR(ip->i_inode.i_mode)) + inc_nlink(&ip->i_inode); gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -815,27 +650,25 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip, } /** - * gfs2_createi - Create a new inode - * @ghs: An array of two holders - * @name: The name of the new file - * @mode: the permissions on the new inode - * - * @ghs[0] is an initialized holder for the directory - * @ghs[1] is the holder for the inode lock + * gfs2_create_inode - Create a new inode + * @dir: The parent directory + * @dentry: The new dentry + * @mode: The permissions on the new inode + * @dev: For device nodes, this is the device number + * @symname: For symlinks, this is the link destination + * @size: The initial size of the inode (ignored for directories) * - * If the return value is not NULL, the glocks on both the directory and the new - * file are held. A transaction has been started and an inplace reservation - * is held, as well. - * - * Returns: An inode + * Returns: 0 on success, or error code */ -struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, - unsigned int mode, dev_t dev) +static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, + unsigned int mode, dev_t dev, const char *symname, + unsigned int size) { + const struct qstr *name = &dentry->d_name; + struct gfs2_holder ghs[2]; struct inode *inode = NULL; - struct gfs2_inode *dip = ghs->gh_gl->gl_object; - struct inode *dir = &dip->i_inode; + struct gfs2_inode *dip = GFS2_I(dir); struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 }; int error; @@ -843,10 +676,9 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, struct buffer_head *bh = NULL; if (!name->len || name->len > GFS2_FNAMESIZE) - return ERR_PTR(-ENAMETOOLONG); + return -ENAMETOOLONG; - gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, ghs); - error = gfs2_glock_nq(ghs); + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); if (error) goto fail; @@ -864,7 +696,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, if (error) goto fail_gunlock; - error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev, &bh); + error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev, symname, size, &bh); if (error) goto fail_gunlock2; @@ -891,18 +723,852 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, if (bh) brelse(bh); - return inode; + + gfs2_trans_end(sdp); + if (dip->i_alloc->al_rgd) + gfs2_inplace_release(dip); + gfs2_quota_unlock(dip); + gfs2_alloc_put(dip); + gfs2_glock_dq_uninit_m(2, ghs); + mark_inode_dirty(inode); + d_instantiate(dentry, inode); + return 0; fail_gunlock2: gfs2_glock_dq_uninit(ghs + 1); if (inode && !IS_ERR(inode)) iput(inode); fail_gunlock: - gfs2_glock_dq(ghs); + gfs2_glock_dq_uninit(ghs); fail: if (bh) brelse(bh); - return ERR_PTR(error); + return error; +} + +/** + * gfs2_create - Create a file + * @dir: The directory in which to create the file + * @dentry: The dentry of the new file + * @mode: The mode of the new file + * + * Returns: errno + */ + +static int gfs2_create(struct inode *dir, struct dentry *dentry, + int mode, struct nameidata *nd) +{ + struct inode *inode; + int ret; + + for (;;) { + ret = gfs2_create_inode(dir, dentry, S_IFREG | mode, 0, NULL, 0); + if (ret != -EEXIST || (nd && (nd->flags & LOOKUP_EXCL))) + return ret; + + inode = gfs2_lookupi(dir, &dentry->d_name, 0); + if (inode) { + if (!IS_ERR(inode)) + break; + return PTR_ERR(inode); + } + } + + d_instantiate(dentry, inode); + return 0; +} + +/** + * gfs2_lookup - Look up a filename in a directory and return its inode + * @dir: The directory inode + * @dentry: The dentry of the new inode + * @nd: passed from Linux VFS, ignored by us + * + * Called by the VFS layer. Lock dir and call gfs2_lookupi() + * + * Returns: errno + */ + +static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode = NULL; + + inode = gfs2_lookupi(dir, &dentry->d_name, 0); + if (inode && IS_ERR(inode)) + return ERR_CAST(inode); + + if (inode) { + struct gfs2_glock *gl = GFS2_I(inode)->i_gl; + struct gfs2_holder gh; + int error; + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + if (error) { + iput(inode); + return ERR_PTR(error); + } + gfs2_glock_dq_uninit(&gh); + return d_splice_alias(inode, dentry); + } + d_add(dentry, inode); + + return NULL; +} + +/** + * gfs2_link - Link to a file + * @old_dentry: The inode to link + * @dir: Add link to this directory + * @dentry: The name of the link + * + * Link the inode in "old_dentry" into the directory "dir" with the + * name in "dentry". + * + * Returns: errno + */ + +static int gfs2_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct inode *inode = old_dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder ghs[2]; + struct buffer_head *dibh; + int alloc_required; + int error; + + if (S_ISDIR(inode->i_mode)) + return -EPERM; + + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); + + error = gfs2_glock_nq(ghs); /* parent */ + if (error) + goto out_parent; + + error = gfs2_glock_nq(ghs + 1); /* child */ + if (error) + goto out_child; + + error = -ENOENT; + if (inode->i_nlink == 0) + goto out_gunlock; + + error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0); + if (error) + goto out_gunlock; + + error = gfs2_dir_check(dir, &dentry->d_name, NULL); + switch (error) { + case -ENOENT: + break; + case 0: + error = -EEXIST; + default: + goto out_gunlock; + } + + error = -EINVAL; + if (!dip->i_inode.i_nlink) + goto out_gunlock; + error = -EFBIG; + if (dip->i_entries == (u32)-1) + goto out_gunlock; + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out_gunlock; + error = -EINVAL; + if (!ip->i_inode.i_nlink) + goto out_gunlock; + error = -EMLINK; + if (ip->i_inode.i_nlink == (u32)-1) + goto out_gunlock; + + alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name); + if (error < 0) + goto out_gunlock; + error = 0; + + if (alloc_required) { + struct gfs2_alloc *al = gfs2_alloc_get(dip); + if (!al) { + error = -ENOMEM; + goto out_gunlock; + } + + error = gfs2_quota_lock_check(dip); + if (error) + goto out_alloc; + + al->al_requested = sdp->sd_max_dirres; + + error = gfs2_inplace_reserve(dip); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + + gfs2_rg_blocks(al) + + 2 * RES_DINODE + RES_STATFS + + RES_QUOTA, 0); + if (error) + goto out_ipres; + } else { + error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0); + if (error) + goto out_ipres; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out_end_trans; + + error = gfs2_dir_add(dir, &dentry->d_name, ip); + if (error) + goto out_brelse; + + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + inc_nlink(&ip->i_inode); + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_dinode_out(ip, dibh->b_data); + mark_inode_dirty(&ip->i_inode); + +out_brelse: + brelse(dibh); +out_end_trans: + gfs2_trans_end(sdp); +out_ipres: + if (alloc_required) + gfs2_inplace_release(dip); +out_gunlock_q: + if (alloc_required) + gfs2_quota_unlock(dip); +out_alloc: + if (alloc_required) + gfs2_alloc_put(dip); +out_gunlock: + gfs2_glock_dq(ghs + 1); +out_child: + gfs2_glock_dq(ghs); +out_parent: + gfs2_holder_uninit(ghs); + gfs2_holder_uninit(ghs + 1); + if (!error) { + ihold(inode); + d_instantiate(dentry, inode); + mark_inode_dirty(inode); + } + return error; +} + +/* + * gfs2_unlink_ok - check to see that a inode is still in a directory + * @dip: the directory + * @name: the name of the file + * @ip: the inode + * + * Assumes that the lock on (at least) @dip is held. + * + * Returns: 0 if the parent/child relationship is correct, errno if it isn't + */ + +static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, + const struct gfs2_inode *ip) +{ + int error; + + if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) + return -EPERM; + + if ((dip->i_inode.i_mode & S_ISVTX) && + dip->i_inode.i_uid != current_fsuid() && + ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER)) + return -EPERM; + + if (IS_APPEND(&dip->i_inode)) + return -EPERM; + + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0); + if (error) + return error; + + error = gfs2_dir_check(&dip->i_inode, name, ip); + if (error) + return error; + + return 0; +} + +/** + * gfs2_unlink_inode - Removes an inode from its parent dir and unlinks it + * @dip: The parent directory + * @name: The name of the entry in the parent directory + * @bh: The inode buffer for the inode to be removed + * @inode: The inode to be removed + * + * Called with all the locks and in a transaction. This will only be + * called for a directory after it has been checked to ensure it is empty. + * + * Returns: 0 on success, or an error + */ + +static int gfs2_unlink_inode(struct gfs2_inode *dip, + const struct dentry *dentry, + struct buffer_head *bh) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + int error; + + error = gfs2_dir_del(dip, dentry); + if (error) + return error; + + ip->i_entries = 0; + inode->i_ctime = CURRENT_TIME; + if (S_ISDIR(inode->i_mode)) + clear_nlink(inode); + else + drop_nlink(inode); + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_dinode_out(ip, bh->b_data); + mark_inode_dirty(inode); + if (inode->i_nlink == 0) + gfs2_unlink_di(inode); + return 0; +} + + +/** + * gfs2_unlink - Unlink an inode (this does rmdir as well) + * @dir: The inode of the directory containing the inode to unlink + * @dentry: The file itself + * + * This routine uses the type of the inode as a flag to figure out + * whether this is an unlink or an rmdir. + * + * Returns: errno + */ + +static int gfs2_unlink(struct inode *dir, struct dentry *dentry) +{ + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *bh; + struct gfs2_holder ghs[3]; + struct gfs2_rgrpd *rgd; + struct gfs2_holder ri_gh; + int error; + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + return error; + + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); + + rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); + gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); + + + error = gfs2_glock_nq(ghs); /* parent */ + if (error) + goto out_parent; + + error = gfs2_glock_nq(ghs + 1); /* child */ + if (error) + goto out_child; + + error = -ENOENT; + if (inode->i_nlink == 0) + goto out_rgrp; + + if (S_ISDIR(inode->i_mode)) { + error = -ENOTEMPTY; + if (ip->i_entries > 2 || inode->i_nlink > 2) + goto out_rgrp; + } + + error = gfs2_glock_nq(ghs + 2); /* rgrp */ + if (error) + goto out_rgrp; + + error = gfs2_unlink_ok(dip, &dentry->d_name, ip); + if (error) + goto out_gunlock; + + error = gfs2_trans_begin(sdp, 2*RES_DINODE + 3*RES_LEAF + RES_RG_BIT, 0); + if (error) + goto out_gunlock; + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out_end_trans; + + error = gfs2_unlink_inode(dip, dentry, bh); + brelse(bh); + +out_end_trans: + gfs2_trans_end(sdp); +out_gunlock: + gfs2_glock_dq(ghs + 2); +out_rgrp: + gfs2_holder_uninit(ghs + 2); + gfs2_glock_dq(ghs + 1); +out_child: + gfs2_holder_uninit(ghs + 1); + gfs2_glock_dq(ghs); +out_parent: + gfs2_holder_uninit(ghs); + gfs2_glock_dq_uninit(&ri_gh); + return error; +} + +/** + * gfs2_symlink - Create a symlink + * @dir: The directory to create the symlink in + * @dentry: The dentry to put the symlink in + * @symname: The thing which the link points to + * + * Returns: errno + */ + +static int gfs2_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct gfs2_sbd *sdp = GFS2_SB(dir); + unsigned int size; + + size = strlen(symname); + if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) + return -ENAMETOOLONG; + + return gfs2_create_inode(dir, dentry, S_IFLNK | S_IRWXUGO, 0, symname, size); +} + +/** + * gfs2_mkdir - Make a directory + * @dir: The parent directory of the new one + * @dentry: The dentry of the new directory + * @mode: The mode of the new directory + * + * Returns: errno + */ + +static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + return gfs2_create_inode(dir, dentry, S_IFDIR | mode, 0, NULL, 0); +} + +/** + * gfs2_mknod - Make a special file + * @dir: The directory in which the special file will reside + * @dentry: The dentry of the special file + * @mode: The mode of the special file + * @dev: The device specification of the special file + * + */ + +static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t dev) +{ + return gfs2_create_inode(dir, dentry, mode, dev, NULL, 0); +} + +/* + * gfs2_ok_to_move - check if it's ok to move a directory to another directory + * @this: move this + * @to: to here + * + * Follow @to back to the root and make sure we don't encounter @this + * Assumes we already hold the rename lock. + * + * Returns: errno + */ + +static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) +{ + struct inode *dir = &to->i_inode; + struct super_block *sb = dir->i_sb; + struct inode *tmp; + int error = 0; + + igrab(dir); + + for (;;) { + if (dir == &this->i_inode) { + error = -EINVAL; + break; + } + if (dir == sb->s_root->d_inode) { + error = 0; + break; + } + + tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1); + if (IS_ERR(tmp)) { + error = PTR_ERR(tmp); + break; + } + + iput(dir); + dir = tmp; + } + + iput(dir); + + return error; +} + +/** + * gfs2_rename - Rename a file + * @odir: Parent directory of old file name + * @odentry: The old dentry of the file + * @ndir: Parent directory of new file name + * @ndentry: The new dentry of the file + * + * Returns: errno + */ + +static int gfs2_rename(struct inode *odir, struct dentry *odentry, + struct inode *ndir, struct dentry *ndentry) +{ + struct gfs2_inode *odip = GFS2_I(odir); + struct gfs2_inode *ndip = GFS2_I(ndir); + struct gfs2_inode *ip = GFS2_I(odentry->d_inode); + struct gfs2_inode *nip = NULL; + struct gfs2_sbd *sdp = GFS2_SB(odir); + struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh; + struct gfs2_rgrpd *nrgd; + unsigned int num_gh; + int dir_rename = 0; + int alloc_required = 0; + unsigned int x; + int error; + + if (ndentry->d_inode) { + nip = GFS2_I(ndentry->d_inode); + if (ip == nip) + return 0; + } + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + return error; + + if (odip != ndip) { + error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, + 0, &r_gh); + if (error) + goto out; + + if (S_ISDIR(ip->i_inode.i_mode)) { + dir_rename = 1; + /* don't move a dirctory into it's subdir */ + error = gfs2_ok_to_move(ip, ndip); + if (error) + goto out_gunlock_r; + } + } + + num_gh = 1; + gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + if (odip != ndip) { + gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + } + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + + if (nip) { + gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + /* grab the resource lock for unlink flag twiddling + * this is the case of the target file already existing + * so we unlink before doing the rename + */ + nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr); + if (nrgd) + gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++); + } + + for (x = 0; x < num_gh; x++) { + error = gfs2_glock_nq(ghs + x); + if (error) + goto out_gunlock; + } + + error = -ENOENT; + if (ip->i_inode.i_nlink == 0) + goto out_gunlock; + + /* Check out the old directory */ + + error = gfs2_unlink_ok(odip, &odentry->d_name, ip); + if (error) + goto out_gunlock; + + /* Check out the new directory */ + + if (nip) { + error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip); + if (error) + goto out_gunlock; + + if (nip->i_inode.i_nlink == 0) { + error = -EAGAIN; + goto out_gunlock; + } + + if (S_ISDIR(nip->i_inode.i_mode)) { + if (nip->i_entries < 2) { + gfs2_consist_inode(nip); + error = -EIO; + goto out_gunlock; + } + if (nip->i_entries > 2) { + error = -ENOTEMPTY; + goto out_gunlock; + } + } + } else { + error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0); + if (error) + goto out_gunlock; + + error = gfs2_dir_check(ndir, &ndentry->d_name, NULL); + switch (error) { + case -ENOENT: + error = 0; + break; + case 0: + error = -EEXIST; + default: + goto out_gunlock; + }; + + if (odip != ndip) { + if (!ndip->i_inode.i_nlink) { + error = -ENOENT; + goto out_gunlock; + } + if (ndip->i_entries == (u32)-1) { + error = -EFBIG; + goto out_gunlock; + } + if (S_ISDIR(ip->i_inode.i_mode) && + ndip->i_inode.i_nlink == (u32)-1) { + error = -EMLINK; + goto out_gunlock; + } + } + } + + /* Check out the dir to be renamed */ + + if (dir_rename) { + error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0); + if (error) + goto out_gunlock; + } + + if (nip == NULL) + alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); + error = alloc_required; + if (error < 0) + goto out_gunlock; + error = 0; + + if (alloc_required) { + struct gfs2_alloc *al = gfs2_alloc_get(ndip); + if (!al) { + error = -ENOMEM; + goto out_gunlock; + } + + error = gfs2_quota_lock_check(ndip); + if (error) + goto out_alloc; + + al->al_requested = sdp->sd_max_dirres; + + error = gfs2_inplace_reserve_ri(ndip); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + + gfs2_rg_blocks(al) + + 4 * RES_DINODE + 4 * RES_LEAF + + RES_STATFS + RES_QUOTA + 4, 0); + if (error) + goto out_ipreserv; + } else { + error = gfs2_trans_begin(sdp, 4 * RES_DINODE + + 5 * RES_LEAF + 4, 0); + if (error) + goto out_gunlock; + } + + /* Remove the target file, if it exists */ + + if (nip) { + struct buffer_head *bh; + error = gfs2_meta_inode_buffer(nip, &bh); + if (error) + goto out_end_trans; + error = gfs2_unlink_inode(ndip, ndentry, bh); + brelse(bh); + } + + if (dir_rename) { + error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); + if (error) + goto out_end_trans; + } else { + struct buffer_head *dibh; + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out_end_trans; + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_bh(ip->i_gl, dibh, 1); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + } + + error = gfs2_dir_del(odip, odentry); + if (error) + goto out_end_trans; + + error = gfs2_dir_add(ndir, &ndentry->d_name, ip); + if (error) + goto out_end_trans; + +out_end_trans: + gfs2_trans_end(sdp); +out_ipreserv: + if (alloc_required) + gfs2_inplace_release(ndip); +out_gunlock_q: + if (alloc_required) + gfs2_quota_unlock(ndip); +out_alloc: + if (alloc_required) + gfs2_alloc_put(ndip); +out_gunlock: + while (x--) { + gfs2_glock_dq(ghs + x); + gfs2_holder_uninit(ghs + x); + } +out_gunlock_r: + if (r_gh.gh_gl) + gfs2_glock_dq_uninit(&r_gh); +out: + gfs2_glock_dq_uninit(&ri_gh); + return error; +} + +/** + * gfs2_follow_link - Follow a symbolic link + * @dentry: The dentry of the link + * @nd: Data that we pass to vfs_follow_link() + * + * This can handle symlinks of any size. + * + * Returns: 0 on success or error code + */ + +static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_holder i_gh; + struct buffer_head *dibh; + unsigned int size; + char *buf; + int error; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); + error = gfs2_glock_nq(&i_gh); + if (error) { + gfs2_holder_uninit(&i_gh); + nd_set_link(nd, ERR_PTR(error)); + return NULL; + } + + size = (unsigned int)i_size_read(&ip->i_inode); + if (size == 0) { + gfs2_consist_inode(ip); + buf = ERR_PTR(-EIO); + goto out; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) { + buf = ERR_PTR(error); + goto out; + } + + buf = kzalloc(size + 1, GFP_NOFS); + if (!buf) + buf = ERR_PTR(-ENOMEM); + else + memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size); + brelse(dibh); +out: + gfs2_glock_dq_uninit(&i_gh); + nd_set_link(nd, buf); + return NULL; +} + +static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + kfree(s); +} + +/** + * gfs2_permission - + * @inode: The inode + * @mask: The mask to be tested + * @flags: Indicates whether this is an RCU path walk or not + * + * This may be called from the VFS directly, or from within GFS2 with the + * inode locked, so we look to see if the glock is already locked and only + * lock the glock if its not already been done. + * + * Returns: errno + */ + +int gfs2_permission(struct inode *inode, int mask, unsigned int flags) +{ + struct gfs2_inode *ip; + struct gfs2_holder i_gh; + int error; + int unlock = 0; + + + ip = GFS2_I(inode); + if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { + if (flags & IPERM_FLAG_RCU) + return -ECHILD; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return error; + unlock = 1; + } + + if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) + error = -EACCES; + else + error = generic_permission(inode, mask, flags, gfs2_check_acl); + if (unlock) + gfs2_glock_dq_uninit(&i_gh); + + return error; } static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) @@ -928,8 +1594,6 @@ static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) * @ip: * @attr: * - * Called with a reference on the vnode. - * * Returns: errno */ @@ -949,60 +1613,280 @@ int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) return error; } -void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) -{ - struct gfs2_dinode *str = buf; - - str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); - str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI); - str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI); - str->di_num.no_addr = cpu_to_be64(ip->i_no_addr); - str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); - str->di_mode = cpu_to_be32(ip->i_inode.i_mode); - str->di_uid = cpu_to_be32(ip->i_inode.i_uid); - str->di_gid = cpu_to_be32(ip->i_inode.i_gid); - str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); - str->di_size = cpu_to_be64(i_size_read(&ip->i_inode)); - str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); - str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); - str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); - str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec); - - str->di_goal_meta = cpu_to_be64(ip->i_goal); - str->di_goal_data = cpu_to_be64(ip->i_goal); - str->di_generation = cpu_to_be64(ip->i_generation); - - str->di_flags = cpu_to_be32(ip->i_diskflags); - str->di_height = cpu_to_be16(ip->i_height); - str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && - !(ip->i_diskflags & GFS2_DIF_EXHASH) ? - GFS2_FORMAT_DE : 0); - str->di_depth = cpu_to_be16(ip->i_depth); - str->di_entries = cpu_to_be32(ip->i_entries); - - str->di_eattr = cpu_to_be64(ip->i_eattr); - str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); - str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec); - str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec); -} - -void gfs2_dinode_print(const struct gfs2_inode *ip) -{ - printk(KERN_INFO " no_formal_ino = %llu\n", - (unsigned long long)ip->i_no_formal_ino); - printk(KERN_INFO " no_addr = %llu\n", - (unsigned long long)ip->i_no_addr); - printk(KERN_INFO " i_size = %llu\n", - (unsigned long long)i_size_read(&ip->i_inode)); - printk(KERN_INFO " blocks = %llu\n", - (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); - printk(KERN_INFO " i_goal = %llu\n", - (unsigned long long)ip->i_goal); - printk(KERN_INFO " i_diskflags = 0x%.8X\n", ip->i_diskflags); - printk(KERN_INFO " i_height = %u\n", ip->i_height); - printk(KERN_INFO " i_depth = %u\n", ip->i_depth); - printk(KERN_INFO " i_entries = %u\n", ip->i_entries); - printk(KERN_INFO " i_eattr = %llu\n", - (unsigned long long)ip->i_eattr); +static int setattr_chown(struct inode *inode, struct iattr *attr) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + u32 ouid, ogid, nuid, ngid; + int error; + + ouid = inode->i_uid; + ogid = inode->i_gid; + nuid = attr->ia_uid; + ngid = attr->ia_gid; + + if (!(attr->ia_valid & ATTR_UID) || ouid == nuid) + ouid = nuid = NO_QUOTA_CHANGE; + if (!(attr->ia_valid & ATTR_GID) || ogid == ngid) + ogid = ngid = NO_QUOTA_CHANGE; + + if (!gfs2_alloc_get(ip)) + return -ENOMEM; + + error = gfs2_quota_lock(ip, nuid, ngid); + if (error) + goto out_alloc; + + if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { + error = gfs2_quota_check(ip, nuid, ngid); + if (error) + goto out_gunlock_q; + } + + error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0); + if (error) + goto out_gunlock_q; + + error = gfs2_setattr_simple(ip, attr); + if (error) + goto out_end_trans; + + if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { + u64 blocks = gfs2_get_inode_blocks(&ip->i_inode); + gfs2_quota_change(ip, -blocks, ouid, ogid); + gfs2_quota_change(ip, blocks, nuid, ngid); + } + +out_end_trans: + gfs2_trans_end(sdp); +out_gunlock_q: + gfs2_quota_unlock(ip); +out_alloc: + gfs2_alloc_put(ip); + return error; +} + +/** + * gfs2_setattr - Change attributes on an inode + * @dentry: The dentry which is changing + * @attr: The structure describing the change + * + * The VFS layer wants to change one or more of an inodes attributes. Write + * that change out to disk. + * + * Returns: errno + */ + +static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + return error; + + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out; + + error = inode_change_ok(inode, attr); + if (error) + goto out; + + if (attr->ia_valid & ATTR_SIZE) + error = gfs2_setattr_size(inode, attr->ia_size); + else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) + error = setattr_chown(inode, attr); + else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) + error = gfs2_acl_chmod(ip, attr); + else + error = gfs2_setattr_simple(ip, attr); + +out: + gfs2_glock_dq_uninit(&i_gh); + if (!error) + mark_inode_dirty(inode); + return error; +} + +/** + * gfs2_getattr - Read out an inode's attributes + * @mnt: The vfsmount the inode is being accessed from + * @dentry: The dentry to stat + * @stat: The inode's stats + * + * This may be called from the VFS directly, or from within GFS2 with the + * inode locked, so we look to see if the glock is already locked and only + * lock the glock if its not already been done. Note that its the NFS + * readdirplus operation which causes this to be called (from filldir) + * with the glock already held. + * + * Returns: errno + */ + +static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + int unlock = 0; + + if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + if (error) + return error; + unlock = 1; + } + + generic_fillattr(inode, stat); + if (unlock) + gfs2_glock_dq_uninit(&gh); + + return 0; +} + +static int gfs2_setxattr(struct dentry *dentry, const char *name, + const void *data, size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = generic_setxattr(dentry, name, data, size, flags); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; +} + +static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, + void *data, size_t size) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = generic_getxattr(dentry, name, data, size); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; +} + +static int gfs2_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = generic_removexattr(dentry, name); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; +} + +static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (ret) + goto out; + + if (gfs2_is_stuffed(ip)) { + u64 phys = ip->i_no_addr << inode->i_blkbits; + u64 size = i_size_read(inode); + u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED| + FIEMAP_EXTENT_DATA_INLINE; + phys += sizeof(struct gfs2_dinode); + phys += start; + if (start + len > size) + len = size - start; + if (start < size) + ret = fiemap_fill_next_extent(fieinfo, start, phys, + len, flags); + if (ret == 1) + ret = 0; + } else { + ret = __generic_block_fiemap(inode, fieinfo, start, len, + gfs2_block_map); + } + + gfs2_glock_dq_uninit(&gh); +out: + mutex_unlock(&inode->i_mutex); + return ret; } +const struct inode_operations gfs2_file_iops = { + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + .listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, + .fiemap = gfs2_fiemap, +}; + +const struct inode_operations gfs2_dir_iops = { + .create = gfs2_create, + .lookup = gfs2_lookup, + .link = gfs2_link, + .unlink = gfs2_unlink, + .symlink = gfs2_symlink, + .mkdir = gfs2_mkdir, + .rmdir = gfs2_unlink, + .mknod = gfs2_mknod, + .rename = gfs2_rename, + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + .listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, + .fiemap = gfs2_fiemap, +}; + +const struct inode_operations gfs2_symlink_iops = { + .readlink = generic_readlink, + .follow_link = gfs2_follow_link, + .put_link = gfs2_put_link, + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .setxattr = gfs2_setxattr, + .getxattr = gfs2_getxattr, + .listxattr = gfs2_listxattr, + .removexattr = gfs2_removexattr, + .fiemap = gfs2_fiemap, +}; + diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 099ca305e51..31606076f70 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -102,22 +102,16 @@ extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, u64 *no_formal_ino, unsigned int blktype); -extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); +extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr, int nonblock); extern int gfs2_inode_refresh(struct gfs2_inode *ip); -extern int gfs2_dinode_dealloc(struct gfs2_inode *inode); -extern int gfs2_change_nlink(struct gfs2_inode *ip, int diff); extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, int is_root); -extern struct inode *gfs2_createi(struct gfs2_holder *ghs, - const struct qstr *name, - unsigned int mode, dev_t dev); extern int gfs2_permission(struct inode *inode, int mask, unsigned int flags); extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); -extern void gfs2_dinode_print(const struct gfs2_inode *ip); extern const struct inode_operations gfs2_file_iops; extern const struct inode_operations gfs2_dir_iops; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 5b102c1887f..903115f2bb3 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -18,6 +18,7 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/bio.h> +#include <linux/writeback.h> #include "gfs2.h" #include "incore.h" @@ -83,55 +84,97 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd) /** * gfs2_ail1_start_one - Start I/O on a part of the AIL * @sdp: the filesystem - * @tr: the part of the AIL + * @wbc: The writeback control structure + * @ai: The ail structure * */ -static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +static int gfs2_ail1_start_one(struct gfs2_sbd *sdp, + struct writeback_control *wbc, + struct gfs2_ail *ai) __releases(&sdp->sd_ail_lock) __acquires(&sdp->sd_ail_lock) { + struct gfs2_glock *gl = NULL; + struct address_space *mapping; struct gfs2_bufdata *bd, *s; struct buffer_head *bh; - int retry; - do { - retry = 0; + list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, bd_ail_st_list) { + bh = bd->bd_bh; - list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, - bd_ail_st_list) { - bh = bd->bd_bh; + gfs2_assert(sdp, bd->bd_ail == ai); - gfs2_assert(sdp, bd->bd_ail == ai); + if (!buffer_busy(bh)) { + if (!buffer_uptodate(bh)) + gfs2_io_error_bh(sdp, bh); + list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); + continue; + } - if (!buffer_busy(bh)) { - if (!buffer_uptodate(bh)) - gfs2_io_error_bh(sdp, bh); - list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); - continue; - } + if (!buffer_dirty(bh)) + continue; + if (gl == bd->bd_gl) + continue; + gl = bd->bd_gl; + list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list); + mapping = bh->b_page->mapping; + if (!mapping) + continue; + spin_unlock(&sdp->sd_ail_lock); + generic_writepages(mapping, wbc); + spin_lock(&sdp->sd_ail_lock); + if (wbc->nr_to_write <= 0) + break; + return 1; + } - if (!buffer_dirty(bh)) - continue; + return 0; +} - list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list); - get_bh(bh); - spin_unlock(&sdp->sd_ail_lock); - lock_buffer(bh); - if (test_clear_buffer_dirty(bh)) { - bh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE_SYNC, bh); - } else { - unlock_buffer(bh); - brelse(bh); - } - spin_lock(&sdp->sd_ail_lock); - - retry = 1; +/** + * gfs2_ail1_flush - start writeback of some ail1 entries + * @sdp: The super block + * @wbc: The writeback control structure + * + * Writes back some ail1 entries, according to the limits in the + * writeback control structure + */ + +void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc) +{ + struct list_head *head = &sdp->sd_ail1_list; + struct gfs2_ail *ai; + + trace_gfs2_ail_flush(sdp, wbc, 1); + spin_lock(&sdp->sd_ail_lock); +restart: + list_for_each_entry_reverse(ai, head, ai_list) { + if (wbc->nr_to_write <= 0) break; - } - } while (retry); + if (gfs2_ail1_start_one(sdp, wbc, ai)) + goto restart; + } + spin_unlock(&sdp->sd_ail_lock); + trace_gfs2_ail_flush(sdp, wbc, 0); +} + +/** + * gfs2_ail1_start - start writeback of all ail1 entries + * @sdp: The superblock + */ + +static void gfs2_ail1_start(struct gfs2_sbd *sdp) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = LONG_MAX, + .range_start = 0, + .range_end = LLONG_MAX, + }; + + return gfs2_ail1_flush(sdp, &wbc); } /** @@ -141,7 +184,7 @@ __acquires(&sdp->sd_ail_lock) * */ -static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags) +static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) { struct gfs2_bufdata *bd, *s; struct buffer_head *bh; @@ -149,76 +192,63 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int fl list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list, bd_ail_st_list) { bh = bd->bd_bh; - gfs2_assert(sdp, bd->bd_ail == ai); - - if (buffer_busy(bh)) { - if (flags & DIO_ALL) - continue; - else - break; - } - + if (buffer_busy(bh)) + continue; if (!buffer_uptodate(bh)) gfs2_io_error_bh(sdp, bh); - list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list); } - return list_empty(&ai->ai_ail1_list); } -static void gfs2_ail1_start(struct gfs2_sbd *sdp) -{ - struct list_head *head; - u64 sync_gen; - struct gfs2_ail *ai; - int done = 0; - - spin_lock(&sdp->sd_ail_lock); - head = &sdp->sd_ail1_list; - if (list_empty(head)) { - spin_unlock(&sdp->sd_ail_lock); - return; - } - sync_gen = sdp->sd_ail_sync_gen++; - - while(!done) { - done = 1; - list_for_each_entry_reverse(ai, head, ai_list) { - if (ai->ai_sync_gen >= sync_gen) - continue; - ai->ai_sync_gen = sync_gen; - gfs2_ail1_start_one(sdp, ai); /* This may drop ail lock */ - done = 0; - break; - } - } - - spin_unlock(&sdp->sd_ail_lock); -} +/** + * gfs2_ail1_empty - Try to empty the ail1 lists + * @sdp: The superblock + * + * Tries to empty the ail1 lists, starting with the oldest first + */ -static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) +static int gfs2_ail1_empty(struct gfs2_sbd *sdp) { struct gfs2_ail *ai, *s; int ret; spin_lock(&sdp->sd_ail_lock); - list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) { - if (gfs2_ail1_empty_one(sdp, ai, flags)) + gfs2_ail1_empty_one(sdp, ai); + if (list_empty(&ai->ai_ail1_list)) list_move(&ai->ai_list, &sdp->sd_ail2_list); - else if (!(flags & DIO_ALL)) + else break; } - ret = list_empty(&sdp->sd_ail1_list); - spin_unlock(&sdp->sd_ail_lock); return ret; } +static void gfs2_ail1_wait(struct gfs2_sbd *sdp) +{ + struct gfs2_ail *ai; + struct gfs2_bufdata *bd; + struct buffer_head *bh; + + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry_reverse(ai, &sdp->sd_ail1_list, ai_list) { + list_for_each_entry(bd, &ai->ai_ail1_list, bd_ail_st_list) { + bh = bd->bd_bh; + if (!buffer_locked(bh)) + continue; + get_bh(bh); + spin_unlock(&sdp->sd_ail_lock); + wait_on_buffer(bh); + brelse(bh); + return; + } + } + spin_unlock(&sdp->sd_ail_lock); +} /** * gfs2_ail2_empty_one - Check whether or not a trans in the AIL has been synced @@ -574,7 +604,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) set_buffer_uptodate(bh); clear_buffer_dirty(bh); - gfs2_ail1_empty(sdp, 0); + gfs2_ail1_empty(sdp); tail = current_tail(sdp); lh = (struct gfs2_log_header *)bh->b_data; @@ -869,9 +899,9 @@ void gfs2_meta_syncfs(struct gfs2_sbd *sdp) gfs2_log_flush(sdp, NULL); for (;;) { gfs2_ail1_start(sdp); - if (gfs2_ail1_empty(sdp, DIO_ALL)) + gfs2_ail1_wait(sdp); + if (gfs2_ail1_empty(sdp)) break; - msleep(10); } } @@ -905,20 +935,20 @@ int gfs2_logd(void *data) preflush = atomic_read(&sdp->sd_log_pinned); if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { - gfs2_ail1_empty(sdp, DIO_ALL); + gfs2_ail1_empty(sdp); gfs2_log_flush(sdp, NULL); - gfs2_ail1_empty(sdp, DIO_ALL); } if (gfs2_ail_flush_reqd(sdp)) { gfs2_ail1_start(sdp); - io_schedule(); - gfs2_ail1_empty(sdp, 0); + gfs2_ail1_wait(sdp); + gfs2_ail1_empty(sdp); gfs2_log_flush(sdp, NULL); - gfs2_ail1_empty(sdp, DIO_ALL); } - wake_up(&sdp->sd_log_waitq); + if (!gfs2_ail_flush_reqd(sdp)) + wake_up(&sdp->sd_log_waitq); + t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; if (freezing(current)) refrigerator(); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 0d007f92023..ab0621698b7 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -12,6 +12,7 @@ #include <linux/list.h> #include <linux/spinlock.h> +#include <linux/writeback.h> #include "incore.h" /** @@ -59,6 +60,7 @@ extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); +extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc); extern void gfs2_log_shutdown(struct gfs2_sbd *sdp); extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 51d27f00ebb..05bbb124699 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -40,7 +40,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) { struct gfs2_bufdata *bd; - gfs2_assert_withdraw(sdp, test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)); + BUG_ON(!current->journal_info); clear_buffer_dirty(bh); if (test_set_buffer_pinned(bh)) @@ -65,6 +65,7 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) * @sdp: the filesystem the buffer belongs to * @bh: The buffer to unpin * @ai: + * @flags: The inode dirty flags * */ @@ -73,10 +74,8 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, { struct gfs2_bufdata *bd = bh->b_private; - gfs2_assert_withdraw(sdp, buffer_uptodate(bh)); - - if (!buffer_pinned(bh)) - gfs2_assert_withdraw(sdp, 0); + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(!buffer_pinned(bh)); lock_buffer(bh); mark_buffer_dirty(bh); @@ -95,8 +94,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); spin_unlock(&sdp->sd_ail_lock); - if (test_and_clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags)) - gfs2_glock_schedule_for_reclaim(bd->bd_gl); + clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); trace_gfs2_pin(bd, 0); unlock_buffer(bh); atomic_dec(&sdp->sd_log_pinned); @@ -322,12 +320,16 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) { + struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le); + struct gfs2_glock *gl = bd->bd_gl; struct gfs2_trans *tr; tr = current->journal_info; tr->tr_touched = 1; tr->tr_num_revoke++; sdp->sd_log_num_revoke++; + atomic_inc(&gl->gl_revokes); + set_bit(GLF_LFLUSH, &gl->gl_flags); list_add(&le->le_list, &sdp->sd_log_le_revoke); } @@ -350,9 +352,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) ld->ld_data1 = cpu_to_be32(sdp->sd_log_num_revoke); offset = sizeof(struct gfs2_log_descriptor); - while (!list_empty(head)) { - bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list); - list_del_init(&bd->bd_le.le_list); + list_for_each_entry(bd, head, bd_le.le_list) { sdp->sd_log_num_revoke--; if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { @@ -367,8 +367,6 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) } *(__be64 *)(bh->b_data + offset) = cpu_to_be64(bd->bd_blkno); - kmem_cache_free(gfs2_bufdata_cachep, bd); - offset += sizeof(u64); } gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); @@ -376,6 +374,22 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) submit_bh(WRITE_SYNC, bh); } +static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) +{ + struct list_head *head = &sdp->sd_log_le_revoke; + struct gfs2_bufdata *bd; + struct gfs2_glock *gl; + + while (!list_empty(head)) { + bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list); + list_del_init(&bd->bd_le.le_list); + gl = bd->bd_gl; + atomic_dec(&gl->gl_revokes); + clear_bit(GLF_LFLUSH, &gl->gl_flags); + kmem_cache_free(gfs2_bufdata_cachep, bd); + } +} + static void revoke_lo_before_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, int pass) { @@ -749,6 +763,7 @@ const struct gfs2_log_operations gfs2_buf_lops = { const struct gfs2_log_operations gfs2_revoke_lops = { .lo_add = revoke_lo_add, .lo_before_commit = revoke_lo_before_commit, + .lo_after_commit = revoke_lo_after_commit, .lo_before_scan = revoke_lo_before_scan, .lo_scan_elements = revoke_lo_scan_elements, .lo_after_scan = revoke_lo_after_scan, diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 888a5f5a1a5..cfa327d3319 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -53,6 +53,7 @@ static void gfs2_init_glock_once(void *foo) INIT_LIST_HEAD(&gl->gl_lru); INIT_LIST_HEAD(&gl->gl_ail_list); atomic_set(&gl->gl_ail_count, 0); + atomic_set(&gl->gl_revokes, 0); } static void gfs2_init_gl_aspace_once(void *foo) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 675349b5a13..747238cd9f9 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -31,6 +31,7 @@ #include "rgrp.h" #include "trans.h" #include "util.h" +#include "trace_gfs2.h" static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) { @@ -310,6 +311,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int struct gfs2_bufdata *bd = bh->b_private; if (test_clear_buffer_pinned(bh)) { + trace_gfs2_pin(bd, 0); atomic_dec(&sdp->sd_log_pinned); list_del_init(&bd->bd_le.le_list); if (meta) { diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index 6a1d9ba1641..22c52659313 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -77,8 +77,6 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen); #define buffer_busy(bh) \ ((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned))) -#define buffer_in_io(bh) \ -((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock))) #endif /* __DIO_DOT_H__ */ diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index d3c69eb91c7..8ac9ae189b5 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -126,8 +126,10 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) * changed. */ -static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent) +static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) { + struct gfs2_sb_host *sb = &sdp->sd_sb; + if (sb->sb_magic != GFS2_MAGIC || sb->sb_type != GFS2_METATYPE_SB) { if (!silent) @@ -157,8 +159,10 @@ static void end_bio_io_page(struct bio *bio, int error) unlock_page(page); } -static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf) +static void gfs2_sb_in(struct gfs2_sbd *sdp, const void *buf) { + struct gfs2_sb_host *sb = &sdp->sd_sb; + struct super_block *s = sdp->sd_vfs; const struct gfs2_sb *str = buf; sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic); @@ -175,7 +179,7 @@ static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf) memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); - memcpy(sb->sb_uuid, str->sb_uuid, 16); + memcpy(s->s_uuid, str->sb_uuid, 16); } /** @@ -197,7 +201,7 @@ static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf) * Returns: 0 on success or error */ -static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector) +static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) { struct super_block *sb = sdp->sd_vfs; struct gfs2_sb *p; @@ -227,10 +231,10 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector) return -EIO; } p = kmap(page); - gfs2_sb_in(&sdp->sd_sb, p); + gfs2_sb_in(sdp, p); kunmap(page); __free_page(page); - return 0; + return gfs2_check_sb(sdp, silent); } /** @@ -247,17 +251,13 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) unsigned int x; int error; - error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); + error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift, silent); if (error) { if (!silent) fs_err(sdp, "can't read superblock\n"); return error; } - error = gfs2_check_sb(sdp, &sdp->sd_sb, silent); - if (error) - return error; - sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT; sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; @@ -340,14 +340,10 @@ static int init_names(struct gfs2_sbd *sdp, int silent) /* Try to autodetect */ if (!proto[0] || !table[0]) { - error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); + error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift, silent); if (error) return error; - error = gfs2_check_sb(sdp, &sdp->sd_sb, silent); - if (error) - goto out; - if (!proto[0]) proto = sdp->sd_sb.sb_lockproto; if (!table[0]) @@ -364,7 +360,6 @@ static int init_names(struct gfs2_sbd *sdp, int silent) while ((table = strchr(table, '/'))) *table = '_'; -out: return error; } @@ -1119,8 +1114,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent if (sdp->sd_args.ar_statfs_quantum) { sdp->sd_tune.gt_statfs_slow = 0; sdp->sd_tune.gt_statfs_quantum = sdp->sd_args.ar_statfs_quantum; - } - else { + } else { sdp->sd_tune.gt_statfs_slow = 1; sdp->sd_tune.gt_statfs_quantum = 30; } diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c deleted file mode 100644 index 09e436a5072..00000000000 --- a/fs/gfs2/ops_inode.c +++ /dev/null @@ -1,1344 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/completion.h> -#include <linux/buffer_head.h> -#include <linux/namei.h> -#include <linux/mm.h> -#include <linux/xattr.h> -#include <linux/posix_acl.h> -#include <linux/gfs2_ondisk.h> -#include <linux/crc32.h> -#include <linux/fiemap.h> -#include <asm/uaccess.h> - -#include "gfs2.h" -#include "incore.h" -#include "acl.h" -#include "bmap.h" -#include "dir.h" -#include "xattr.h" -#include "glock.h" -#include "inode.h" -#include "meta_io.h" -#include "quota.h" -#include "rgrp.h" -#include "trans.h" -#include "util.h" -#include "super.h" - -/** - * gfs2_create - Create a file - * @dir: The directory in which to create the file - * @dentry: The dentry of the new file - * @mode: The mode of the new file - * - * Returns: errno - */ - -static int gfs2_create(struct inode *dir, struct dentry *dentry, - int mode, struct nameidata *nd) -{ - struct gfs2_inode *dip = GFS2_I(dir); - struct gfs2_sbd *sdp = GFS2_SB(dir); - struct gfs2_holder ghs[2]; - struct inode *inode; - - gfs2_holder_init(dip->i_gl, 0, 0, ghs); - - for (;;) { - inode = gfs2_createi(ghs, &dentry->d_name, S_IFREG | mode, 0); - if (!IS_ERR(inode)) { - gfs2_trans_end(sdp); - if (dip->i_alloc->al_rgd) - gfs2_inplace_release(dip); - gfs2_quota_unlock(dip); - gfs2_alloc_put(dip); - gfs2_glock_dq_uninit_m(2, ghs); - mark_inode_dirty(inode); - break; - } else if (PTR_ERR(inode) != -EEXIST || - (nd && nd->flags & LOOKUP_EXCL)) { - gfs2_holder_uninit(ghs); - return PTR_ERR(inode); - } - - inode = gfs2_lookupi(dir, &dentry->d_name, 0); - if (inode) { - if (!IS_ERR(inode)) { - gfs2_holder_uninit(ghs); - break; - } else { - gfs2_holder_uninit(ghs); - return PTR_ERR(inode); - } - } - } - - d_instantiate(dentry, inode); - - return 0; -} - -/** - * gfs2_lookup - Look up a filename in a directory and return its inode - * @dir: The directory inode - * @dentry: The dentry of the new inode - * @nd: passed from Linux VFS, ignored by us - * - * Called by the VFS layer. Lock dir and call gfs2_lookupi() - * - * Returns: errno - */ - -static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) -{ - struct inode *inode = NULL; - - inode = gfs2_lookupi(dir, &dentry->d_name, 0); - if (inode && IS_ERR(inode)) - return ERR_CAST(inode); - - if (inode) { - struct gfs2_glock *gl = GFS2_I(inode)->i_gl; - struct gfs2_holder gh; - int error; - error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); - if (error) { - iput(inode); - return ERR_PTR(error); - } - gfs2_glock_dq_uninit(&gh); - return d_splice_alias(inode, dentry); - } - d_add(dentry, inode); - - return NULL; -} - -/** - * gfs2_link - Link to a file - * @old_dentry: The inode to link - * @dir: Add link to this directory - * @dentry: The name of the link - * - * Link the inode in "old_dentry" into the directory "dir" with the - * name in "dentry". - * - * Returns: errno - */ - -static int gfs2_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *dentry) -{ - struct gfs2_inode *dip = GFS2_I(dir); - struct gfs2_sbd *sdp = GFS2_SB(dir); - struct inode *inode = old_dentry->d_inode; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder ghs[2]; - int alloc_required; - int error; - - if (S_ISDIR(inode->i_mode)) - return -EPERM; - - gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); - - error = gfs2_glock_nq(ghs); /* parent */ - if (error) - goto out_parent; - - error = gfs2_glock_nq(ghs + 1); /* child */ - if (error) - goto out_child; - - error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0); - if (error) - goto out_gunlock; - - error = gfs2_dir_check(dir, &dentry->d_name, NULL); - switch (error) { - case -ENOENT: - break; - case 0: - error = -EEXIST; - default: - goto out_gunlock; - } - - error = -EINVAL; - if (!dip->i_inode.i_nlink) - goto out_gunlock; - error = -EFBIG; - if (dip->i_entries == (u32)-1) - goto out_gunlock; - error = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - goto out_gunlock; - error = -EINVAL; - if (!ip->i_inode.i_nlink) - goto out_gunlock; - error = -EMLINK; - if (ip->i_inode.i_nlink == (u32)-1) - goto out_gunlock; - - alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name); - if (error < 0) - goto out_gunlock; - error = 0; - - if (alloc_required) { - struct gfs2_alloc *al = gfs2_alloc_get(dip); - if (!al) { - error = -ENOMEM; - goto out_gunlock; - } - - error = gfs2_quota_lock_check(dip); - if (error) - goto out_alloc; - - al->al_requested = sdp->sd_max_dirres; - - error = gfs2_inplace_reserve(dip); - if (error) - goto out_gunlock_q; - - error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(al) + - 2 * RES_DINODE + RES_STATFS + - RES_QUOTA, 0); - if (error) - goto out_ipres; - } else { - error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0); - if (error) - goto out_ipres; - } - - error = gfs2_dir_add(dir, &dentry->d_name, ip, IF2DT(inode->i_mode)); - if (error) - goto out_end_trans; - - error = gfs2_change_nlink(ip, +1); - -out_end_trans: - gfs2_trans_end(sdp); -out_ipres: - if (alloc_required) - gfs2_inplace_release(dip); -out_gunlock_q: - if (alloc_required) - gfs2_quota_unlock(dip); -out_alloc: - if (alloc_required) - gfs2_alloc_put(dip); -out_gunlock: - gfs2_glock_dq(ghs + 1); -out_child: - gfs2_glock_dq(ghs); -out_parent: - gfs2_holder_uninit(ghs); - gfs2_holder_uninit(ghs + 1); - if (!error) { - ihold(inode); - d_instantiate(dentry, inode); - mark_inode_dirty(inode); - } - return error; -} - -/* - * gfs2_unlink_ok - check to see that a inode is still in a directory - * @dip: the directory - * @name: the name of the file - * @ip: the inode - * - * Assumes that the lock on (at least) @dip is held. - * - * Returns: 0 if the parent/child relationship is correct, errno if it isn't - */ - -static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, - const struct gfs2_inode *ip) -{ - int error; - - if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) - return -EPERM; - - if ((dip->i_inode.i_mode & S_ISVTX) && - dip->i_inode.i_uid != current_fsuid() && - ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER)) - return -EPERM; - - if (IS_APPEND(&dip->i_inode)) - return -EPERM; - - error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0); - if (error) - return error; - - error = gfs2_dir_check(&dip->i_inode, name, ip); - if (error) - return error; - - return 0; -} - -/** - * gfs2_unlink - Unlink a file - * @dir: The inode of the directory containing the file to unlink - * @dentry: The file itself - * - * Unlink a file. Call gfs2_unlinki() - * - * Returns: errno - */ - -static int gfs2_unlink(struct inode *dir, struct dentry *dentry) -{ - struct gfs2_inode *dip = GFS2_I(dir); - struct gfs2_sbd *sdp = GFS2_SB(dir); - struct gfs2_inode *ip = GFS2_I(dentry->d_inode); - struct gfs2_holder ghs[3]; - struct gfs2_rgrpd *rgd; - struct gfs2_holder ri_gh; - int error; - - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - return error; - - gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); - - rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); - gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); - - - error = gfs2_glock_nq(ghs); /* parent */ - if (error) - goto out_parent; - - error = gfs2_glock_nq(ghs + 1); /* child */ - if (error) - goto out_child; - - error = gfs2_glock_nq(ghs + 2); /* rgrp */ - if (error) - goto out_rgrp; - - error = gfs2_unlink_ok(dip, &dentry->d_name, ip); - if (error) - goto out_gunlock; - - error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0); - if (error) - goto out_gunlock; - - error = gfs2_dir_del(dip, &dentry->d_name); - if (error) - goto out_end_trans; - - error = gfs2_change_nlink(ip, -1); - -out_end_trans: - gfs2_trans_end(sdp); -out_gunlock: - gfs2_glock_dq(ghs + 2); -out_rgrp: - gfs2_holder_uninit(ghs + 2); - gfs2_glock_dq(ghs + 1); -out_child: - gfs2_holder_uninit(ghs + 1); - gfs2_glock_dq(ghs); -out_parent: - gfs2_holder_uninit(ghs); - gfs2_glock_dq_uninit(&ri_gh); - return error; -} - -/** - * gfs2_symlink - Create a symlink - * @dir: The directory to create the symlink in - * @dentry: The dentry to put the symlink in - * @symname: The thing which the link points to - * - * Returns: errno - */ - -static int gfs2_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) -{ - struct gfs2_inode *dip = GFS2_I(dir), *ip; - struct gfs2_sbd *sdp = GFS2_SB(dir); - struct gfs2_holder ghs[2]; - struct inode *inode; - struct buffer_head *dibh; - int size; - int error; - - /* Must be stuffed with a null terminator for gfs2_follow_link() */ - size = strlen(symname); - if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode) - 1) - return -ENAMETOOLONG; - - gfs2_holder_init(dip->i_gl, 0, 0, ghs); - - inode = gfs2_createi(ghs, &dentry->d_name, S_IFLNK | S_IRWXUGO, 0); - if (IS_ERR(inode)) { - gfs2_holder_uninit(ghs); - return PTR_ERR(inode); - } - - ip = ghs[1].gh_gl->gl_object; - - i_size_write(inode, size); - - error = gfs2_meta_inode_buffer(ip, &dibh); - - if (!gfs2_assert_withdraw(sdp, !error)) { - gfs2_dinode_out(ip, dibh->b_data); - memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, - size); - brelse(dibh); - } - - gfs2_trans_end(sdp); - if (dip->i_alloc->al_rgd) - gfs2_inplace_release(dip); - gfs2_quota_unlock(dip); - gfs2_alloc_put(dip); - - gfs2_glock_dq_uninit_m(2, ghs); - - d_instantiate(dentry, inode); - mark_inode_dirty(inode); - - return 0; -} - -/** - * gfs2_mkdir - Make a directory - * @dir: The parent directory of the new one - * @dentry: The dentry of the new directory - * @mode: The mode of the new directory - * - * Returns: errno - */ - -static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) -{ - struct gfs2_inode *dip = GFS2_I(dir), *ip; - struct gfs2_sbd *sdp = GFS2_SB(dir); - struct gfs2_holder ghs[2]; - struct inode *inode; - struct buffer_head *dibh; - int error; - - gfs2_holder_init(dip->i_gl, 0, 0, ghs); - - inode = gfs2_createi(ghs, &dentry->d_name, S_IFDIR | mode, 0); - if (IS_ERR(inode)) { - gfs2_holder_uninit(ghs); - return PTR_ERR(inode); - } - - ip = ghs[1].gh_gl->gl_object; - - ip->i_inode.i_nlink = 2; - i_size_write(inode, sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)); - ip->i_diskflags |= GFS2_DIF_JDATA; - ip->i_entries = 2; - - error = gfs2_meta_inode_buffer(ip, &dibh); - - if (!gfs2_assert_withdraw(sdp, !error)) { - struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; - struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1); - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent); - dent->de_inum = di->di_num; /* already GFS2 endian */ - dent->de_type = cpu_to_be16(DT_DIR); - di->di_entries = cpu_to_be32(1); - - dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); - gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); - - gfs2_inum_out(dip, dent); - dent->de_type = cpu_to_be16(DT_DIR); - - gfs2_dinode_out(ip, di); - - brelse(dibh); - } - - error = gfs2_change_nlink(dip, +1); - gfs2_assert_withdraw(sdp, !error); /* dip already pinned */ - - gfs2_trans_end(sdp); - if (dip->i_alloc->al_rgd) - gfs2_inplace_release(dip); - gfs2_quota_unlock(dip); - gfs2_alloc_put(dip); - - gfs2_glock_dq_uninit_m(2, ghs); - - d_instantiate(dentry, inode); - mark_inode_dirty(inode); - - return 0; -} - -/** - * gfs2_rmdiri - Remove a directory - * @dip: The parent directory of the directory to be removed - * @name: The name of the directory to be removed - * @ip: The GFS2 inode of the directory to be removed - * - * Assumes Glocks on dip and ip are held - * - * Returns: errno - */ - -static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip) -{ - int error; - - if (ip->i_entries != 2) { - if (gfs2_consist_inode(ip)) - gfs2_dinode_print(ip); - return -EIO; - } - - error = gfs2_dir_del(dip, name); - if (error) - return error; - - error = gfs2_change_nlink(dip, -1); - if (error) - return error; - - error = gfs2_dir_del(ip, &gfs2_qdot); - if (error) - return error; - - error = gfs2_dir_del(ip, &gfs2_qdotdot); - if (error) - return error; - - /* It looks odd, but it really should be done twice */ - error = gfs2_change_nlink(ip, -1); - if (error) - return error; - - error = gfs2_change_nlink(ip, -1); - if (error) - return error; - - return error; -} - -/** - * gfs2_rmdir - Remove a directory - * @dir: The parent directory of the directory to be removed - * @dentry: The dentry of the directory to remove - * - * Remove a directory. Call gfs2_rmdiri() - * - * Returns: errno - */ - -static int gfs2_rmdir(struct inode *dir, struct dentry *dentry) -{ - struct gfs2_inode *dip = GFS2_I(dir); - struct gfs2_sbd *sdp = GFS2_SB(dir); - struct gfs2_inode *ip = GFS2_I(dentry->d_inode); - struct gfs2_holder ghs[3]; - struct gfs2_rgrpd *rgd; - struct gfs2_holder ri_gh; - int error; - - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - return error; - gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); - - rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); - gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); - - error = gfs2_glock_nq(ghs); /* parent */ - if (error) - goto out_parent; - - error = gfs2_glock_nq(ghs + 1); /* child */ - if (error) - goto out_child; - - error = gfs2_glock_nq(ghs + 2); /* rgrp */ - if (error) - goto out_rgrp; - - error = gfs2_unlink_ok(dip, &dentry->d_name, ip); - if (error) - goto out_gunlock; - - if (ip->i_entries < 2) { - if (gfs2_consist_inode(ip)) - gfs2_dinode_print(ip); - error = -EIO; - goto out_gunlock; - } - if (ip->i_entries > 2) { - error = -ENOTEMPTY; - goto out_gunlock; - } - - error = gfs2_trans_begin(sdp, 2 * RES_DINODE + 3 * RES_LEAF + RES_RG_BIT, 0); - if (error) - goto out_gunlock; - - error = gfs2_rmdiri(dip, &dentry->d_name, ip); - - gfs2_trans_end(sdp); - -out_gunlock: - gfs2_glock_dq(ghs + 2); -out_rgrp: - gfs2_holder_uninit(ghs + 2); - gfs2_glock_dq(ghs + 1); -out_child: - gfs2_holder_uninit(ghs + 1); - gfs2_glock_dq(ghs); -out_parent: - gfs2_holder_uninit(ghs); - gfs2_glock_dq_uninit(&ri_gh); - return error; -} - -/** - * gfs2_mknod - Make a special file - * @dir: The directory in which the special file will reside - * @dentry: The dentry of the special file - * @mode: The mode of the special file - * @rdev: The device specification of the special file - * - */ - -static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode, - dev_t dev) -{ - struct gfs2_inode *dip = GFS2_I(dir); - struct gfs2_sbd *sdp = GFS2_SB(dir); - struct gfs2_holder ghs[2]; - struct inode *inode; - - gfs2_holder_init(dip->i_gl, 0, 0, ghs); - - inode = gfs2_createi(ghs, &dentry->d_name, mode, dev); - if (IS_ERR(inode)) { - gfs2_holder_uninit(ghs); - return PTR_ERR(inode); - } - - gfs2_trans_end(sdp); - if (dip->i_alloc->al_rgd) - gfs2_inplace_release(dip); - gfs2_quota_unlock(dip); - gfs2_alloc_put(dip); - - gfs2_glock_dq_uninit_m(2, ghs); - - d_instantiate(dentry, inode); - mark_inode_dirty(inode); - - return 0; -} - -/* - * gfs2_ok_to_move - check if it's ok to move a directory to another directory - * @this: move this - * @to: to here - * - * Follow @to back to the root and make sure we don't encounter @this - * Assumes we already hold the rename lock. - * - * Returns: errno - */ - -static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) -{ - struct inode *dir = &to->i_inode; - struct super_block *sb = dir->i_sb; - struct inode *tmp; - int error = 0; - - igrab(dir); - - for (;;) { - if (dir == &this->i_inode) { - error = -EINVAL; - break; - } - if (dir == sb->s_root->d_inode) { - error = 0; - break; - } - - tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1); - if (IS_ERR(tmp)) { - error = PTR_ERR(tmp); - break; - } - - iput(dir); - dir = tmp; - } - - iput(dir); - - return error; -} - -/** - * gfs2_rename - Rename a file - * @odir: Parent directory of old file name - * @odentry: The old dentry of the file - * @ndir: Parent directory of new file name - * @ndentry: The new dentry of the file - * - * Returns: errno - */ - -static int gfs2_rename(struct inode *odir, struct dentry *odentry, - struct inode *ndir, struct dentry *ndentry) -{ - struct gfs2_inode *odip = GFS2_I(odir); - struct gfs2_inode *ndip = GFS2_I(ndir); - struct gfs2_inode *ip = GFS2_I(odentry->d_inode); - struct gfs2_inode *nip = NULL; - struct gfs2_sbd *sdp = GFS2_SB(odir); - struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh; - struct gfs2_rgrpd *nrgd; - unsigned int num_gh; - int dir_rename = 0; - int alloc_required = 0; - unsigned int x; - int error; - - if (ndentry->d_inode) { - nip = GFS2_I(ndentry->d_inode); - if (ip == nip) - return 0; - } - - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - return error; - - if (odip != ndip) { - error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, - 0, &r_gh); - if (error) - goto out; - - if (S_ISDIR(ip->i_inode.i_mode)) { - dir_rename = 1; - /* don't move a dirctory into it's subdir */ - error = gfs2_ok_to_move(ip, ndip); - if (error) - goto out_gunlock_r; - } - } - - num_gh = 1; - gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); - if (odip != ndip) { - gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); - num_gh++; - } - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); - num_gh++; - - if (nip) { - gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); - num_gh++; - /* grab the resource lock for unlink flag twiddling - * this is the case of the target file already existing - * so we unlink before doing the rename - */ - nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr); - if (nrgd) - gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++); - } - - for (x = 0; x < num_gh; x++) { - error = gfs2_glock_nq(ghs + x); - if (error) - goto out_gunlock; - } - - /* Check out the old directory */ - - error = gfs2_unlink_ok(odip, &odentry->d_name, ip); - if (error) - goto out_gunlock; - - /* Check out the new directory */ - - if (nip) { - error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip); - if (error) - goto out_gunlock; - - if (S_ISDIR(nip->i_inode.i_mode)) { - if (nip->i_entries < 2) { - if (gfs2_consist_inode(nip)) - gfs2_dinode_print(nip); - error = -EIO; - goto out_gunlock; - } - if (nip->i_entries > 2) { - error = -ENOTEMPTY; - goto out_gunlock; - } - } - } else { - error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0); - if (error) - goto out_gunlock; - - error = gfs2_dir_check(ndir, &ndentry->d_name, NULL); - switch (error) { - case -ENOENT: - error = 0; - break; - case 0: - error = -EEXIST; - default: - goto out_gunlock; - }; - - if (odip != ndip) { - if (!ndip->i_inode.i_nlink) { - error = -EINVAL; - goto out_gunlock; - } - if (ndip->i_entries == (u32)-1) { - error = -EFBIG; - goto out_gunlock; - } - if (S_ISDIR(ip->i_inode.i_mode) && - ndip->i_inode.i_nlink == (u32)-1) { - error = -EMLINK; - goto out_gunlock; - } - } - } - - /* Check out the dir to be renamed */ - - if (dir_rename) { - error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0); - if (error) - goto out_gunlock; - } - - if (nip == NULL) - alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); - error = alloc_required; - if (error < 0) - goto out_gunlock; - error = 0; - - if (alloc_required) { - struct gfs2_alloc *al = gfs2_alloc_get(ndip); - if (!al) { - error = -ENOMEM; - goto out_gunlock; - } - - error = gfs2_quota_lock_check(ndip); - if (error) - goto out_alloc; - - al->al_requested = sdp->sd_max_dirres; - - error = gfs2_inplace_reserve_ri(ndip); - if (error) - goto out_gunlock_q; - - error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + - gfs2_rg_blocks(al) + - 4 * RES_DINODE + 4 * RES_LEAF + - RES_STATFS + RES_QUOTA + 4, 0); - if (error) - goto out_ipreserv; - } else { - error = gfs2_trans_begin(sdp, 4 * RES_DINODE + - 5 * RES_LEAF + 4, 0); - if (error) - goto out_gunlock; - } - - /* Remove the target file, if it exists */ - - if (nip) { - if (S_ISDIR(nip->i_inode.i_mode)) - error = gfs2_rmdiri(ndip, &ndentry->d_name, nip); - else { - error = gfs2_dir_del(ndip, &ndentry->d_name); - if (error) - goto out_end_trans; - error = gfs2_change_nlink(nip, -1); - } - if (error) - goto out_end_trans; - } - - if (dir_rename) { - error = gfs2_change_nlink(ndip, +1); - if (error) - goto out_end_trans; - error = gfs2_change_nlink(odip, -1); - if (error) - goto out_end_trans; - - error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); - if (error) - goto out_end_trans; - } else { - struct buffer_head *dibh; - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto out_end_trans; - ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - } - - error = gfs2_dir_del(odip, &odentry->d_name); - if (error) - goto out_end_trans; - - error = gfs2_dir_add(ndir, &ndentry->d_name, ip, IF2DT(ip->i_inode.i_mode)); - if (error) - goto out_end_trans; - -out_end_trans: - gfs2_trans_end(sdp); -out_ipreserv: - if (alloc_required) - gfs2_inplace_release(ndip); -out_gunlock_q: - if (alloc_required) - gfs2_quota_unlock(ndip); -out_alloc: - if (alloc_required) - gfs2_alloc_put(ndip); -out_gunlock: - while (x--) { - gfs2_glock_dq(ghs + x); - gfs2_holder_uninit(ghs + x); - } -out_gunlock_r: - if (r_gh.gh_gl) - gfs2_glock_dq_uninit(&r_gh); -out: - gfs2_glock_dq_uninit(&ri_gh); - return error; -} - -/** - * gfs2_follow_link - Follow a symbolic link - * @dentry: The dentry of the link - * @nd: Data that we pass to vfs_follow_link() - * - * This can handle symlinks of any size. - * - * Returns: 0 on success or error code - */ - -static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct gfs2_inode *ip = GFS2_I(dentry->d_inode); - struct gfs2_holder i_gh; - struct buffer_head *dibh; - unsigned int x, size; - char *buf; - int error; - - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); - error = gfs2_glock_nq(&i_gh); - if (error) { - gfs2_holder_uninit(&i_gh); - nd_set_link(nd, ERR_PTR(error)); - return NULL; - } - - size = (unsigned int)i_size_read(&ip->i_inode); - if (size == 0) { - gfs2_consist_inode(ip); - buf = ERR_PTR(-EIO); - goto out; - } - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) { - buf = ERR_PTR(error); - goto out; - } - - x = size + 1; - buf = kmalloc(x, GFP_NOFS); - if (!buf) - buf = ERR_PTR(-ENOMEM); - else - memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), x); - brelse(dibh); -out: - gfs2_glock_dq_uninit(&i_gh); - nd_set_link(nd, buf); - return NULL; -} - -static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p) -{ - char *s = nd_get_link(nd); - if (!IS_ERR(s)) - kfree(s); -} - -/** - * gfs2_permission - - * @inode: The inode - * @mask: The mask to be tested - * @flags: Indicates whether this is an RCU path walk or not - * - * This may be called from the VFS directly, or from within GFS2 with the - * inode locked, so we look to see if the glock is already locked and only - * lock the glock if its not already been done. - * - * Returns: errno - */ - -int gfs2_permission(struct inode *inode, int mask, unsigned int flags) -{ - struct gfs2_inode *ip; - struct gfs2_holder i_gh; - int error; - int unlock = 0; - - - ip = GFS2_I(inode); - if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { - if (flags & IPERM_FLAG_RCU) - return -ECHILD; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); - if (error) - return error; - unlock = 1; - } - - if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) - error = -EACCES; - else - error = generic_permission(inode, mask, flags, gfs2_check_acl); - if (unlock) - gfs2_glock_dq_uninit(&i_gh); - - return error; -} - -static int setattr_chown(struct inode *inode, struct iattr *attr) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - u32 ouid, ogid, nuid, ngid; - int error; - - ouid = inode->i_uid; - ogid = inode->i_gid; - nuid = attr->ia_uid; - ngid = attr->ia_gid; - - if (!(attr->ia_valid & ATTR_UID) || ouid == nuid) - ouid = nuid = NO_QUOTA_CHANGE; - if (!(attr->ia_valid & ATTR_GID) || ogid == ngid) - ogid = ngid = NO_QUOTA_CHANGE; - - if (!gfs2_alloc_get(ip)) - return -ENOMEM; - - error = gfs2_quota_lock(ip, nuid, ngid); - if (error) - goto out_alloc; - - if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { - error = gfs2_quota_check(ip, nuid, ngid); - if (error) - goto out_gunlock_q; - } - - error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0); - if (error) - goto out_gunlock_q; - - error = gfs2_setattr_simple(ip, attr); - if (error) - goto out_end_trans; - - if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { - u64 blocks = gfs2_get_inode_blocks(&ip->i_inode); - gfs2_quota_change(ip, -blocks, ouid, ogid); - gfs2_quota_change(ip, blocks, nuid, ngid); - } - -out_end_trans: - gfs2_trans_end(sdp); -out_gunlock_q: - gfs2_quota_unlock(ip); -out_alloc: - gfs2_alloc_put(ip); - return error; -} - -/** - * gfs2_setattr - Change attributes on an inode - * @dentry: The dentry which is changing - * @attr: The structure describing the change - * - * The VFS layer wants to change one or more of an inodes attributes. Write - * that change out to disk. - * - * Returns: errno - */ - -static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) -{ - struct inode *inode = dentry->d_inode; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder i_gh; - int error; - - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); - if (error) - return error; - - error = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - goto out; - - error = inode_change_ok(inode, attr); - if (error) - goto out; - - if (attr->ia_valid & ATTR_SIZE) - error = gfs2_setattr_size(inode, attr->ia_size); - else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) - error = setattr_chown(inode, attr); - else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) - error = gfs2_acl_chmod(ip, attr); - else - error = gfs2_setattr_simple(ip, attr); - -out: - gfs2_glock_dq_uninit(&i_gh); - if (!error) - mark_inode_dirty(inode); - return error; -} - -/** - * gfs2_getattr - Read out an inode's attributes - * @mnt: The vfsmount the inode is being accessed from - * @dentry: The dentry to stat - * @stat: The inode's stats - * - * This may be called from the VFS directly, or from within GFS2 with the - * inode locked, so we look to see if the glock is already locked and only - * lock the glock if its not already been done. Note that its the NFS - * readdirplus operation which causes this to be called (from filldir) - * with the glock already held. - * - * Returns: errno - */ - -static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, - struct kstat *stat) -{ - struct inode *inode = dentry->d_inode; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - int error; - int unlock = 0; - - if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); - if (error) - return error; - unlock = 1; - } - - generic_fillattr(inode, stat); - if (unlock) - gfs2_glock_dq_uninit(&gh); - - return 0; -} - -static int gfs2_setxattr(struct dentry *dentry, const char *name, - const void *data, size_t size, int flags) -{ - struct inode *inode = dentry->d_inode; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - int ret; - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - ret = gfs2_glock_nq(&gh); - if (ret == 0) { - ret = generic_setxattr(dentry, name, data, size, flags); - gfs2_glock_dq(&gh); - } - gfs2_holder_uninit(&gh); - return ret; -} - -static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, - void *data, size_t size) -{ - struct inode *inode = dentry->d_inode; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - int ret; - - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); - ret = gfs2_glock_nq(&gh); - if (ret == 0) { - ret = generic_getxattr(dentry, name, data, size); - gfs2_glock_dq(&gh); - } - gfs2_holder_uninit(&gh); - return ret; -} - -static int gfs2_removexattr(struct dentry *dentry, const char *name) -{ - struct inode *inode = dentry->d_inode; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - int ret; - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - ret = gfs2_glock_nq(&gh); - if (ret == 0) { - ret = generic_removexattr(dentry, name); - gfs2_glock_dq(&gh); - } - gfs2_holder_uninit(&gh); - return ret; -} - -static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - u64 start, u64 len) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - int ret; - - ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); - if (ret) - return ret; - - mutex_lock(&inode->i_mutex); - - ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); - if (ret) - goto out; - - if (gfs2_is_stuffed(ip)) { - u64 phys = ip->i_no_addr << inode->i_blkbits; - u64 size = i_size_read(inode); - u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED| - FIEMAP_EXTENT_DATA_INLINE; - phys += sizeof(struct gfs2_dinode); - phys += start; - if (start + len > size) - len = size - start; - if (start < size) - ret = fiemap_fill_next_extent(fieinfo, start, phys, - len, flags); - if (ret == 1) - ret = 0; - } else { - ret = __generic_block_fiemap(inode, fieinfo, start, len, - gfs2_block_map); - } - - gfs2_glock_dq_uninit(&gh); -out: - mutex_unlock(&inode->i_mutex); - return ret; -} - -const struct inode_operations gfs2_file_iops = { - .permission = gfs2_permission, - .setattr = gfs2_setattr, - .getattr = gfs2_getattr, - .setxattr = gfs2_setxattr, - .getxattr = gfs2_getxattr, - .listxattr = gfs2_listxattr, - .removexattr = gfs2_removexattr, - .fiemap = gfs2_fiemap, -}; - -const struct inode_operations gfs2_dir_iops = { - .create = gfs2_create, - .lookup = gfs2_lookup, - .link = gfs2_link, - .unlink = gfs2_unlink, - .symlink = gfs2_symlink, - .mkdir = gfs2_mkdir, - .rmdir = gfs2_rmdir, - .mknod = gfs2_mknod, - .rename = gfs2_rename, - .permission = gfs2_permission, - .setattr = gfs2_setattr, - .getattr = gfs2_getattr, - .setxattr = gfs2_setxattr, - .getxattr = gfs2_getxattr, - .listxattr = gfs2_listxattr, - .removexattr = gfs2_removexattr, - .fiemap = gfs2_fiemap, -}; - -const struct inode_operations gfs2_symlink_iops = { - .readlink = generic_readlink, - .follow_link = gfs2_follow_link, - .put_link = gfs2_put_link, - .permission = gfs2_permission, - .setattr = gfs2_setattr, - .getattr = gfs2_getattr, - .setxattr = gfs2_setxattr, - .getxattr = gfs2_getxattr, - .listxattr = gfs2_listxattr, - .removexattr = gfs2_removexattr, - .fiemap = gfs2_fiemap, -}; - diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index e23d9864c41..42e8d23bc04 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -38,6 +38,7 @@ #include <linux/sched.h> #include <linux/slab.h> +#include <linux/mm.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> @@ -77,19 +78,20 @@ static LIST_HEAD(qd_lru_list); static atomic_t qd_lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(qd_lru_lock); -int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) +int gfs2_shrink_qd_memory(struct shrinker *shrink, struct shrink_control *sc) { struct gfs2_quota_data *qd; struct gfs2_sbd *sdp; + int nr_to_scan = sc->nr_to_scan; - if (nr == 0) + if (nr_to_scan == 0) goto out; - if (!(gfp_mask & __GFP_FS)) + if (!(sc->gfp_mask & __GFP_FS)) return -1; spin_lock(&qd_lru_lock); - while (nr && !list_empty(&qd_lru_list)) { + while (nr_to_scan && !list_empty(&qd_lru_list)) { qd = list_entry(qd_lru_list.next, struct gfs2_quota_data, qd_reclaim); sdp = qd->qd_gl->gl_sbd; @@ -110,7 +112,7 @@ int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) spin_unlock(&qd_lru_lock); kmem_cache_free(gfs2_quotad_cachep, qd); spin_lock(&qd_lru_lock); - nr--; + nr_to_scan--; } spin_unlock(&qd_lru_lock); diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index e7d236ca48b..90bf1c302a9 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -12,6 +12,7 @@ struct gfs2_inode; struct gfs2_sbd; +struct shrink_control; #define NO_QUOTA_CHANGE ((u32)-1) @@ -51,7 +52,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) return ret; } -extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask); +extern int gfs2_shrink_qd_memory(struct shrinker *shrink, + struct shrink_control *sc); extern const struct quotactl_ops gfs2_quotactl_ops; #endif /* __QUOTA_DOT_H__ */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 6fcae8469f6..9b780df3fd5 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -78,10 +78,11 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1, unsigned char *buf2, unsigned int offset, - unsigned int buflen, u32 block, + struct gfs2_bitmap *bi, u32 block, unsigned char new_state) { unsigned char *byte1, *byte2, *end, cur_state; + unsigned int buflen = bi->bi_len; const unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; byte1 = buf1 + offset + (block / GFS2_NBBY); @@ -92,6 +93,16 @@ static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1, cur_state = (*byte1 >> bit) & GFS2_BIT_MASK; if (unlikely(!valid_change[new_state * 4 + cur_state])) { + printk(KERN_WARNING "GFS2: buf_blk = 0x%llx old_state=%d, " + "new_state=%d\n", + (unsigned long long)block, cur_state, new_state); + printk(KERN_WARNING "GFS2: rgrp=0x%llx bi_start=0x%lx\n", + (unsigned long long)rgd->rd_addr, + (unsigned long)bi->bi_start); + printk(KERN_WARNING "GFS2: bi_offset=0x%lx bi_len=0x%lx\n", + (unsigned long)bi->bi_offset, + (unsigned long)bi->bi_len); + dump_stack(); gfs2_consist_rgrpd(rgd); return; } @@ -381,6 +392,7 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp) if (gl) { gl->gl_object = NULL; + gfs2_glock_add_to_lru(gl); gfs2_glock_put(gl); } @@ -1365,7 +1377,7 @@ skip: gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, - bi->bi_len, blk, new_state); + bi, blk, new_state); goal = blk; while (*n < elen) { goal++; @@ -1375,7 +1387,7 @@ skip: GFS2_BLKST_FREE) break; gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, - bi->bi_len, goal, new_state); + bi, goal, new_state); (*n)++; } out: @@ -1432,7 +1444,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, } gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); gfs2_setbit(rgd, bi->bi_bh->b_data, NULL, bi->bi_offset, - bi->bi_len, buf_blk, new_state); + bi, buf_blk, new_state); } return rgd; @@ -1617,6 +1629,10 @@ void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); gfs2_trans_add_rg(rgd); + + /* Directories keep their data in the metadata address space */ + if (ip->i_depth) + gfs2_meta_wipe(ip, bstart, blen); } /** diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b9f28e66dad..ed540e7018b 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -23,6 +23,7 @@ #include <linux/time.h> #include <linux/wait.h> #include <linux/writeback.h> +#include <linux/backing-dev.h> #include "gfs2.h" #include "incore.h" @@ -700,11 +701,47 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp) mutex_unlock(&sdp->sd_freeze_lock); } +void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) +{ + struct gfs2_dinode *str = buf; + + str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI); + str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI); + str->di_num.no_addr = cpu_to_be64(ip->i_no_addr); + str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); + str->di_mode = cpu_to_be32(ip->i_inode.i_mode); + str->di_uid = cpu_to_be32(ip->i_inode.i_uid); + str->di_gid = cpu_to_be32(ip->i_inode.i_gid); + str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); + str->di_size = cpu_to_be64(i_size_read(&ip->i_inode)); + str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); + str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); + str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); + str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec); + + str->di_goal_meta = cpu_to_be64(ip->i_goal); + str->di_goal_data = cpu_to_be64(ip->i_goal); + str->di_generation = cpu_to_be64(ip->i_generation); + + str->di_flags = cpu_to_be32(ip->i_diskflags); + str->di_height = cpu_to_be16(ip->i_height); + str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && + !(ip->i_diskflags & GFS2_DIF_EXHASH) ? + GFS2_FORMAT_DE : 0); + str->di_depth = cpu_to_be16(ip->i_depth); + str->di_entries = cpu_to_be32(ip->i_entries); + + str->di_eattr = cpu_to_be64(ip->i_eattr); + str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); + str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec); + str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec); +} /** * gfs2_write_inode - Make sure the inode is stable on the disk * @inode: The inode - * @sync: synchronous write flag + * @wbc: The writeback control structure * * Returns: errno */ @@ -713,15 +750,17 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); + struct backing_dev_info *bdi = metamapping->backing_dev_info; struct gfs2_holder gh; struct buffer_head *bh; struct timespec atime; struct gfs2_dinode *di; - int ret = 0; + int ret = -EAGAIN; - /* Check this is a "normal" inode, etc */ + /* Skip timestamp update, if this is from a memalloc */ if (current->flags & PF_MEMALLOC) - return 0; + goto do_flush; ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); if (ret) goto do_flush; @@ -745,6 +784,13 @@ do_unlock: do_flush: if (wbc->sync_mode == WB_SYNC_ALL) gfs2_log_flush(GFS2_SB(inode), ip->i_gl); + filemap_fdatawrite(metamapping); + if (bdi->dirty_exceeded) + gfs2_ail1_flush(sdp, wbc); + if (!ret && (wbc->sync_mode == WB_SYNC_ALL)) + ret = filemap_fdatawait(metamapping); + if (ret) + mark_inode_dirty_sync(inode); return ret; } @@ -874,8 +920,9 @@ restart: static int gfs2_sync_fs(struct super_block *sb, int wait) { - if (wait && sb->s_fs_info) - gfs2_log_flush(sb->s_fs_info, NULL); + struct gfs2_sbd *sdp = sb->s_fs_info; + if (wait && sdp) + gfs2_log_flush(sdp, NULL); return 0; } @@ -1308,6 +1355,78 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) return 0; } +static void gfs2_final_release_pages(struct gfs2_inode *ip) +{ + struct inode *inode = &ip->i_inode; + struct gfs2_glock *gl = ip->i_gl; + + truncate_inode_pages(gfs2_glock2aspace(ip->i_gl), 0); + truncate_inode_pages(&inode->i_data, 0); + + if (atomic_read(&gl->gl_revokes) == 0) { + clear_bit(GLF_LFLUSH, &gl->gl_flags); + clear_bit(GLF_DIRTY, &gl->gl_flags); + } +} + +static int gfs2_dinode_dealloc(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc *al; + struct gfs2_rgrpd *rgd; + int error; + + if (gfs2_get_inode_blocks(&ip->i_inode) != 1) { + gfs2_consist_inode(ip); + return -EIO; + } + + al = gfs2_alloc_get(ip); + if (!al) + return -ENOMEM; + + error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); + if (error) + goto out; + + error = gfs2_rindex_hold(sdp, &al->al_ri_gh); + if (error) + goto out_qs; + + rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); + if (!rgd) { + gfs2_consist_inode(ip); + error = -EIO; + goto out_rindex_relse; + } + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, + &al->al_rgd_gh); + if (error) + goto out_rindex_relse; + + error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, + sdp->sd_jdesc->jd_blocks); + if (error) + goto out_rg_gunlock; + + gfs2_free_di(rgd, ip); + + gfs2_final_release_pages(ip); + + gfs2_trans_end(sdp); + +out_rg_gunlock: + gfs2_glock_dq_uninit(&al->al_rgd_gh); +out_rindex_relse: + gfs2_glock_dq_uninit(&al->al_ri_gh); +out_qs: + gfs2_quota_unhold(ip); +out: + gfs2_alloc_put(ip); + return error; +} + /* * We have to (at the moment) hold the inodes main lock to cover * the gap between unlocking the shared lock on the iopen lock and @@ -1371,15 +1490,13 @@ static void gfs2_evict_inode(struct inode *inode) } error = gfs2_dinode_dealloc(ip); - if (error) - goto out_unlock; + goto out_unlock; out_truncate: error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); if (error) goto out_unlock; - /* Needs to be done before glock release & also in a transaction */ - truncate_inode_pages(&inode->i_data, 0); + gfs2_final_release_pages(ip); gfs2_trans_end(sdp); out_unlock: @@ -1394,6 +1511,7 @@ out: end_writeback(inode); ip->i_gl->gl_object = NULL; + gfs2_glock_add_to_lru(ip->i_gl); gfs2_glock_put(ip->i_gl); ip->i_gl = NULL; if (ip->i_iopen_gh.gh_gl) { diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 748ccb557c1..e20eab37bc8 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -81,7 +81,8 @@ static int gfs2_uuid_valid(const u8 *uuid) static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf) { - const u8 *uuid = sdp->sd_sb.sb_uuid; + struct super_block *s = sdp->sd_vfs; + const u8 *uuid = s->s_uuid; buf[0] = '\0'; if (!gfs2_uuid_valid(uuid)) return 0; @@ -616,7 +617,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj, struct kobj_uevent_env *env) { struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); - const u8 *uuid = sdp->sd_sb.sb_uuid; + struct super_block *s = sdp->sd_vfs; + const u8 *uuid = s->s_uuid; add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index cedb0bb96d9..5d07609ec57 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -10,6 +10,7 @@ #include <linux/buffer_head.h> #include <linux/dlmconstants.h> #include <linux/gfs2_ondisk.h> +#include <linux/writeback.h> #include "incore.h" #include "glock.h" @@ -40,7 +41,9 @@ {(1UL << GLF_REPLY_PENDING), "r" }, \ {(1UL << GLF_INITIAL), "I" }, \ {(1UL << GLF_FROZEN), "F" }, \ - {(1UL << GLF_QUEUED), "q" }) + {(1UL << GLF_QUEUED), "q" }, \ + {(1UL << GLF_LRU), "L" }, \ + {(1UL << GLF_OBJECT), "o" }) #ifndef NUMPTY #define NUMPTY @@ -94,7 +97,7 @@ TRACE_EVENT(gfs2_glock_state_change, __entry->new_state = glock_trace_state(new_state); __entry->tgt_state = glock_trace_state(gl->gl_target); __entry->dmt_state = glock_trace_state(gl->gl_demote_state); - __entry->flags = gl->gl_flags; + __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0); ), TP_printk("%u,%u glock %d:%lld state %s to %s tgt:%s dmt:%s flags:%s", @@ -127,7 +130,7 @@ TRACE_EVENT(gfs2_glock_put, __entry->gltype = gl->gl_name.ln_type; __entry->glnum = gl->gl_name.ln_number; __entry->cur_state = glock_trace_state(gl->gl_state); - __entry->flags = gl->gl_flags; + __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0); ), TP_printk("%u,%u glock %d:%lld state %s => %s flags:%s", @@ -161,7 +164,7 @@ TRACE_EVENT(gfs2_demote_rq, __entry->glnum = gl->gl_name.ln_number; __entry->cur_state = glock_trace_state(gl->gl_state); __entry->dmt_state = glock_trace_state(gl->gl_demote_state); - __entry->flags = gl->gl_flags; + __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0); ), TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s", @@ -318,6 +321,33 @@ TRACE_EVENT(gfs2_log_blocks, MINOR(__entry->dev), __entry->blocks) ); +/* Writing back the AIL */ +TRACE_EVENT(gfs2_ail_flush, + + TP_PROTO(const struct gfs2_sbd *sdp, const struct writeback_control *wbc, int start), + + TP_ARGS(sdp, wbc, start), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, start ) + __field( int, sync_mode ) + __field( long, nr_to_write ) + ), + + TP_fast_assign( + __entry->dev = sdp->sd_vfs->s_dev; + __entry->start = start; + __entry->sync_mode = wbc->sync_mode; + __entry->nr_to_write = wbc->nr_to_write; + ), + + TP_printk("%u,%u ail flush %s %s %ld", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->start ? "start" : "end", + __entry->sync_mode == WB_SYNC_ALL ? "all" : "none", + __entry->nr_to_write) +); + /* Section 3 - bmap * * Objectives: diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b9eeb1cd03f..e7a035781b7 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -412,10 +412,10 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) pgoff = offset >> PAGE_SHIFT; i_size_write(inode, offset); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); if (!prio_tree_empty(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); truncate_hugepages(inode, offset); return 0; } diff --git a/fs/inode.c b/fs/inode.c index 33c963d08ab..990d284877a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -24,6 +24,7 @@ #include <linux/mount.h> #include <linux/async.h> #include <linux/posix_acl.h> +#include <linux/prefetch.h> #include <linux/ima.h> #include <linux/cred.h> #include "internal.h" @@ -325,12 +326,11 @@ void address_space_init_once(struct address_space *mapping) memset(mapping, 0, sizeof(*mapping)); INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC); spin_lock_init(&mapping->tree_lock); - spin_lock_init(&mapping->i_mmap_lock); + mutex_init(&mapping->i_mmap_mutex); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap); INIT_LIST_HEAD(&mapping->i_mmap_nonlinear); - mutex_init(&mapping->unmap_mutex); } EXPORT_SYMBOL(address_space_init_once); @@ -751,8 +751,12 @@ static void prune_icache(int nr_to_scan) * This function is passed the number of inodes to scan, and it returns the * total number of remaining possibly-reclaimable inodes. */ -static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) +static int shrink_icache_memory(struct shrinker *shrink, + struct shrink_control *sc) { + int nr = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; + if (nr) { /* * Nasty deadlock avoidance. We may hold various FS locks, diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 69b18045946..72ffa974b0b 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -302,12 +302,6 @@ void journal_commit_transaction(journal_t *journal) * all outstanding updates to complete. */ -#ifdef COMMIT_STATS - spin_lock(&journal->j_list_lock); - summarise_journal_usage(journal); - spin_unlock(&journal->j_list_lock); -#endif - /* Do we need to erase the effects of a prior journal_flush? */ if (journal->j_flags & JFS_FLUSHED) { jbd_debug(3, "super block updated\n"); @@ -722,8 +716,13 @@ wait_for_iobuf: required. */ JBUFFER_TRACE(jh, "file as BJ_Forget"); journal_file_buffer(jh, commit_transaction, BJ_Forget); - /* Wake up any transactions which were waiting for this - IO to complete */ + /* + * Wake up any transactions which were waiting for this + * IO to complete. The barrier must be here so that changes + * by journal_file_buffer() take effect before wake_up_bit() + * does the waitqueue check. + */ + smp_mb(); wake_up_bit(&bh->b_state, BH_Unshadow); JBUFFER_TRACE(jh, "brelse shadowed buffer"); __brelse(bh); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index b3713afaaa9..e2d4285fbe9 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -437,9 +437,12 @@ int __log_space_left(journal_t *journal) int __log_start_commit(journal_t *journal, tid_t target) { /* - * Are we already doing a recent enough commit? + * The only transaction we can possibly wait upon is the + * currently running transaction (if it exists). Otherwise, + * the target tid must be an old one. */ - if (!tid_geq(journal->j_commit_request, target)) { + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == target) { /* * We want a new commit: OK, mark the request and wakeup the * commit thread. We do _not_ do the commit ourselves. @@ -451,7 +454,14 @@ int __log_start_commit(journal_t *journal, tid_t target) journal->j_commit_sequence); wake_up(&journal->j_wait_commit); return 1; - } + } else if (!tid_geq(journal->j_commit_request, target)) + /* This should never happen, but if it does, preserve + the evidence before kjournald goes into a loop and + increments j_commit_sequence beyond all recognition. */ + WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", + journal->j_commit_request, journal->j_commit_sequence, + target, journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); return 0; } diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 60d2319651b..f7ee81a065d 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -266,7 +266,8 @@ static handle_t *new_handle(int nblocks) * This function is visible to journal users (like ext3fs), so is not * called with the journal already locked. * - * Return a pointer to a newly allocated handle, or NULL on failure + * Return a pointer to a newly allocated handle, or an ERR_PTR() value + * on failure. */ handle_t *journal_start(journal_t *journal, int nblocks) { diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6e28000a4b2..29148a81c78 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -338,12 +338,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) * all outstanding updates to complete. */ -#ifdef COMMIT_STATS - spin_lock(&journal->j_list_lock); - summarise_journal_usage(journal); - spin_unlock(&journal->j_list_lock); -#endif - /* Do we need to erase the effects of a prior jbd2_journal_flush? */ if (journal->j_flags & JBD2_FLUSHED) { jbd_debug(3, "super block updated\n"); diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 1adc8d455f0..df0de27c273 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -10,6 +10,7 @@ #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/gfp.h> +#include <linux/prefetch.h> #define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 9e22085231b..d8d09380c7d 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -481,7 +481,7 @@ static int inode_write_alias(struct super_block *sb, val = inode_val0(inode); break; case INODE_USED_OFS: - val = cpu_to_be64(li->li_used_bytes);; + val = cpu_to_be64(li->li_used_bytes); break; case INODE_SIZE_OFS: val = cpu_to_be64(i_size_read(inode)); diff --git a/fs/mbcache.c b/fs/mbcache.c index 2f174be0655..8c32ef3ba88 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -90,7 +90,8 @@ static DEFINE_SPINLOCK(mb_cache_spinlock); * What the mbcache registers as to get shrunk dynamically. */ -static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask); +static int mb_cache_shrink_fn(struct shrinker *shrink, + struct shrink_control *sc); static struct shrinker mb_cache_shrinker = { .shrink = mb_cache_shrink_fn, @@ -156,18 +157,19 @@ forget: * gets low. * * @shrink: (ignored) - * @nr_to_scan: Number of objects to scan - * @gfp_mask: (ignored) + * @sc: shrink_control passed from reclaim * * Returns the number of objects which are present in the cache. */ static int -mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) +mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc) { LIST_HEAD(free_list); struct mb_cache *cache; struct mb_cache_entry *entry, *tmp; int count = 0; + int nr_to_scan = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; mb_debug("trying to free %d entries", nr_to_scan); spin_lock(&mb_cache_spinlock); diff --git a/fs/namei.c b/fs/namei.c index e3c4f112ebf..6ff858c049c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1378,12 +1378,12 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) { int res; - BUG_ON(nd->depth >= MAX_NESTED_LINKS); if (unlikely(current->link_count >= MAX_NESTED_LINKS)) { path_put_conditional(path, nd); path_put(&nd->path); return -ELOOP; } + BUG_ON(nd->depth >= MAX_NESTED_LINKS); nd->depth++; current->link_count++; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 0250e4ce489..202f370526a 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -461,7 +461,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) #endif struct ncp_entry_info finfo; - data.wdog_pid = NULL; + memset(&data, 0, sizeof(data)); server = kzalloc(sizeof(struct ncp_server), GFP_KERNEL); if (!server) return -ENOMEM; @@ -496,7 +496,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) struct ncp_mount_data_v4* md = (struct ncp_mount_data_v4*)raw_data; data.flags = md->flags; - data.int_flags = 0; data.mounted_uid = md->mounted_uid; data.wdog_pid = find_get_pid(md->wdog_pid); data.ncp_fd = md->ncp_fd; @@ -507,7 +506,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) data.file_mode = md->file_mode; data.dir_mode = md->dir_mode; data.info_fd = -1; - data.mounted_vol[0] = 0; } break; default: diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 7237672216c..424e47773a8 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2042,11 +2042,14 @@ static void nfs_access_free_list(struct list_head *head) } } -int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) +int nfs_access_cache_shrinker(struct shrinker *shrink, + struct shrink_control *sc) { LIST_HEAD(head); struct nfs_inode *nfsi, *next; struct nfs_access_entry *cache; + int nr_to_scan = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL) return (nr_to_scan == 0) ? 0 : -1; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index ce118ce885d..2df6ca7b589 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -234,7 +234,7 @@ extern int nfs_init_client(struct nfs_client *clp, /* dir.c */ extern int nfs_access_cache_shrinker(struct shrinker *shrink, - int nr_to_scan, gfp_t gfp_mask); + struct shrink_control *sc); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 5232d3e8fb2..a2e2402b2af 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -8,7 +8,7 @@ * Statistsics for the reply cache * fh <stale> <total-lookups> <anonlookups> <dir-not-in-dcache> <nondir-not-in-dcache> * statistics for filehandle lookup - * io <bytes-read> <bytes-writtten> + * io <bytes-read> <bytes-written> * statistics for IO throughput * th <threads> <fullcnt> <10%-20%> <20%-30%> ... <90%-100%> <100%> * time (seconds) when nfsd thread usage above thresholds diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index f7684483785..eed4d7b2624 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -489,8 +489,8 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode, void nilfs_palloc_commit_alloc_entry(struct inode *inode, struct nilfs_palloc_req *req) { - nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh); - nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh); + mark_buffer_dirty(req->pr_bitmap_bh); + mark_buffer_dirty(req->pr_desc_bh); nilfs_mdt_mark_dirty(inode); brelse(req->pr_bitmap_bh); @@ -527,8 +527,8 @@ void nilfs_palloc_commit_free_entry(struct inode *inode, kunmap(req->pr_bitmap_bh->b_page); kunmap(req->pr_desc_bh->b_page); - nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh); - nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh); + mark_buffer_dirty(req->pr_desc_bh); + mark_buffer_dirty(req->pr_bitmap_bh); nilfs_mdt_mark_dirty(inode); brelse(req->pr_bitmap_bh); @@ -683,8 +683,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) kunmap(bitmap_bh->b_page); kunmap(desc_bh->b_page); - nilfs_mdt_mark_buffer_dirty(desc_bh); - nilfs_mdt_mark_buffer_dirty(bitmap_bh); + mark_buffer_dirty(desc_bh); + mark_buffer_dirty(bitmap_bh); nilfs_mdt_mark_dirty(inode); brelse(bitmap_bh); diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 4723f04e9b1..aadbd0b5e3e 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -34,7 +34,9 @@ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) { - return NILFS_I_NILFS(bmap->b_inode)->ns_dat; + struct the_nilfs *nilfs = bmap->b_inode->i_sb->s_fs_info; + + return nilfs->ns_dat; } static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap, diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 609cd223eea..a35ae35e693 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -34,12 +34,6 @@ #include "page.h" #include "btnode.h" -void nilfs_btnode_cache_init(struct address_space *btnc, - struct backing_dev_info *bdi) -{ - nilfs_mapping_init(btnc, bdi); -} - void nilfs_btnode_cache_clear(struct address_space *btnc) { invalidate_mapping_pages(btnc, 0, -1); @@ -62,7 +56,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) BUG(); } memset(bh->b_data, 0, 1 << inode->i_blkbits); - bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; + bh->b_bdev = inode->i_sb->s_bdev; bh->b_blocknr = blocknr; set_buffer_mapped(bh); set_buffer_uptodate(bh); @@ -94,10 +88,11 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, if (pblocknr == 0) { pblocknr = blocknr; if (inode->i_ino != NILFS_DAT_INO) { - struct inode *dat = NILFS_I_NILFS(inode)->ns_dat; + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; /* blocknr is a virtual block number */ - err = nilfs_dat_translate(dat, blocknr, &pblocknr); + err = nilfs_dat_translate(nilfs->ns_dat, blocknr, + &pblocknr); if (unlikely(err)) { brelse(bh); goto out_locked; @@ -120,7 +115,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, goto found; } set_buffer_mapped(bh); - bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; + bh->b_bdev = inode->i_sb->s_bdev; bh->b_blocknr = pblocknr; /* set block address for read */ bh->b_end_io = end_buffer_read_sync; get_bh(bh); @@ -259,7 +254,7 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, "invalid oldkey %lld (newkey=%lld)", (unsigned long long)oldkey, (unsigned long long)newkey); - nilfs_btnode_mark_dirty(obh); + mark_buffer_dirty(obh); spin_lock_irq(&btnc->tree_lock); radix_tree_delete(&btnc->page_tree, oldkey); @@ -271,7 +266,7 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc, unlock_page(opage); } else { nilfs_copy_buffer(nbh, obh); - nilfs_btnode_mark_dirty(nbh); + mark_buffer_dirty(nbh); nbh->b_blocknr = newkey; ctxt->bh = nbh; diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index 1b8ebd888c2..3a4dd2d8d3f 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt { struct buffer_head *newbh; }; -void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *); void nilfs_btnode_cache_clear(struct address_space *); struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr); @@ -51,7 +50,4 @@ void nilfs_btnode_commit_change_key(struct address_space *, void nilfs_btnode_abort_change_key(struct address_space *, struct nilfs_btnode_chkey_ctxt *); -#define nilfs_btnode_mark_dirty(bh) nilfs_mark_buffer_dirty(bh) - - #endif /* _NILFS_BTNODE_H */ diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index d451ae0e0bf..7eafe468a29 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -714,7 +714,7 @@ static void nilfs_btree_promote_key(struct nilfs_bmap *btree, nilfs_btree_get_nonroot_node(path, level), path[level].bp_index, key); if (!buffer_dirty(path[level].bp_bh)) - nilfs_btnode_mark_dirty(path[level].bp_bh); + mark_buffer_dirty(path[level].bp_bh); } while ((path[level].bp_index == 0) && (++level < nilfs_btree_height(btree) - 1)); } @@ -739,7 +739,7 @@ static void nilfs_btree_do_insert(struct nilfs_bmap *btree, nilfs_btree_node_insert(node, path[level].bp_index, *keyp, *ptrp, ncblk); if (!buffer_dirty(path[level].bp_bh)) - nilfs_btnode_mark_dirty(path[level].bp_bh); + mark_buffer_dirty(path[level].bp_bh); if (path[level].bp_index == 0) nilfs_btree_promote_key(btree, path, level + 1, @@ -777,9 +777,9 @@ static void nilfs_btree_carry_left(struct nilfs_bmap *btree, nilfs_btree_node_move_left(left, node, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) - nilfs_btnode_mark_dirty(path[level].bp_bh); + mark_buffer_dirty(path[level].bp_bh); if (!buffer_dirty(path[level].bp_sib_bh)) - nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + mark_buffer_dirty(path[level].bp_sib_bh); nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(node, 0)); @@ -823,9 +823,9 @@ static void nilfs_btree_carry_right(struct nilfs_bmap *btree, nilfs_btree_node_move_right(node, right, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) - nilfs_btnode_mark_dirty(path[level].bp_bh); + mark_buffer_dirty(path[level].bp_bh); if (!buffer_dirty(path[level].bp_sib_bh)) - nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + mark_buffer_dirty(path[level].bp_sib_bh); path[level + 1].bp_index++; nilfs_btree_promote_key(btree, path, level + 1, @@ -870,9 +870,9 @@ static void nilfs_btree_split(struct nilfs_bmap *btree, nilfs_btree_node_move_right(node, right, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) - nilfs_btnode_mark_dirty(path[level].bp_bh); + mark_buffer_dirty(path[level].bp_bh); if (!buffer_dirty(path[level].bp_sib_bh)) - nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + mark_buffer_dirty(path[level].bp_sib_bh); newkey = nilfs_btree_node_get_key(right, 0); newptr = path[level].bp_newreq.bpr_ptr; @@ -919,7 +919,7 @@ static void nilfs_btree_grow(struct nilfs_bmap *btree, nilfs_btree_node_set_level(root, level + 1); if (!buffer_dirty(path[level].bp_sib_bh)) - nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + mark_buffer_dirty(path[level].bp_sib_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; @@ -1194,7 +1194,7 @@ static void nilfs_btree_do_delete(struct nilfs_bmap *btree, nilfs_btree_node_delete(node, path[level].bp_index, keyp, ptrp, ncblk); if (!buffer_dirty(path[level].bp_bh)) - nilfs_btnode_mark_dirty(path[level].bp_bh); + mark_buffer_dirty(path[level].bp_bh); if (path[level].bp_index == 0) nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(node, 0)); @@ -1226,9 +1226,9 @@ static void nilfs_btree_borrow_left(struct nilfs_bmap *btree, nilfs_btree_node_move_right(left, node, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) - nilfs_btnode_mark_dirty(path[level].bp_bh); + mark_buffer_dirty(path[level].bp_bh); if (!buffer_dirty(path[level].bp_sib_bh)) - nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + mark_buffer_dirty(path[level].bp_sib_bh); nilfs_btree_promote_key(btree, path, level + 1, nilfs_btree_node_get_key(node, 0)); @@ -1258,9 +1258,9 @@ static void nilfs_btree_borrow_right(struct nilfs_bmap *btree, nilfs_btree_node_move_left(node, right, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) - nilfs_btnode_mark_dirty(path[level].bp_bh); + mark_buffer_dirty(path[level].bp_bh); if (!buffer_dirty(path[level].bp_sib_bh)) - nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + mark_buffer_dirty(path[level].bp_sib_bh); path[level + 1].bp_index++; nilfs_btree_promote_key(btree, path, level + 1, @@ -1289,7 +1289,7 @@ static void nilfs_btree_concat_left(struct nilfs_bmap *btree, nilfs_btree_node_move_left(left, node, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_sib_bh)) - nilfs_btnode_mark_dirty(path[level].bp_sib_bh); + mark_buffer_dirty(path[level].bp_sib_bh); nilfs_btnode_delete(path[level].bp_bh); path[level].bp_bh = path[level].bp_sib_bh; @@ -1315,7 +1315,7 @@ static void nilfs_btree_concat_right(struct nilfs_bmap *btree, nilfs_btree_node_move_left(node, right, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) - nilfs_btnode_mark_dirty(path[level].bp_bh); + mark_buffer_dirty(path[level].bp_bh); nilfs_btnode_delete(path[level].bp_sib_bh); path[level].bp_sib_bh = NULL; @@ -1709,7 +1709,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree, nilfs_btree_node_init(node, 0, 1, n, ncblk, keys, ptrs); nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, ncblk); if (!buffer_dirty(bh)) - nilfs_btnode_mark_dirty(bh); + mark_buffer_dirty(bh); if (!nilfs_bmap_dirty(btree)) nilfs_bmap_set_dirty(btree); @@ -1787,7 +1787,7 @@ static int nilfs_btree_propagate_p(struct nilfs_bmap *btree, { while ((++level < nilfs_btree_height(btree) - 1) && !buffer_dirty(path[level].bp_bh)) - nilfs_btnode_mark_dirty(path[level].bp_bh); + mark_buffer_dirty(path[level].bp_bh); return 0; } @@ -2229,7 +2229,7 @@ static int nilfs_btree_mark(struct nilfs_bmap *btree, __u64 key, int level) } if (!buffer_dirty(bh)) - nilfs_btnode_mark_dirty(bh); + mark_buffer_dirty(bh); brelse(bh); if (!nilfs_bmap_dirty(btree)) nilfs_bmap_set_dirty(btree); diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 5ff15a8a102..c9b342c8b50 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -216,14 +216,14 @@ int nilfs_cpfile_get_checkpoint(struct inode *cpfile, if (!nilfs_cpfile_is_in_first(cpfile, cno)) nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh, kaddr, 1); - nilfs_mdt_mark_buffer_dirty(cp_bh); + mark_buffer_dirty(cp_bh); kaddr = kmap_atomic(header_bh->b_page, KM_USER0); header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); le64_add_cpu(&header->ch_ncheckpoints, 1); kunmap_atomic(kaddr, KM_USER0); - nilfs_mdt_mark_buffer_dirty(header_bh); + mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(cpfile); } @@ -326,7 +326,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, } if (nicps > 0) { tnicps += nicps; - nilfs_mdt_mark_buffer_dirty(cp_bh); + mark_buffer_dirty(cp_bh); nilfs_mdt_mark_dirty(cpfile); if (!nilfs_cpfile_is_in_first(cpfile, cno)) { count = @@ -358,7 +358,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr); le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps); - nilfs_mdt_mark_buffer_dirty(header_bh); + mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(cpfile); kunmap_atomic(kaddr, KM_USER0); } @@ -671,10 +671,10 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno) le64_add_cpu(&header->ch_nsnapshots, 1); kunmap_atomic(kaddr, KM_USER0); - nilfs_mdt_mark_buffer_dirty(prev_bh); - nilfs_mdt_mark_buffer_dirty(curr_bh); - nilfs_mdt_mark_buffer_dirty(cp_bh); - nilfs_mdt_mark_buffer_dirty(header_bh); + mark_buffer_dirty(prev_bh); + mark_buffer_dirty(curr_bh); + mark_buffer_dirty(cp_bh); + mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(cpfile); brelse(prev_bh); @@ -774,10 +774,10 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) le64_add_cpu(&header->ch_nsnapshots, -1); kunmap_atomic(kaddr, KM_USER0); - nilfs_mdt_mark_buffer_dirty(next_bh); - nilfs_mdt_mark_buffer_dirty(prev_bh); - nilfs_mdt_mark_buffer_dirty(cp_bh); - nilfs_mdt_mark_buffer_dirty(header_bh); + mark_buffer_dirty(next_bh); + mark_buffer_dirty(prev_bh); + mark_buffer_dirty(cp_bh); + mark_buffer_dirty(header_bh); nilfs_mdt_mark_dirty(cpfile); brelse(prev_bh); diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 59e5fe742f7..fcc2f869af1 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -54,7 +54,7 @@ static int nilfs_dat_prepare_entry(struct inode *dat, static void nilfs_dat_commit_entry(struct inode *dat, struct nilfs_palloc_req *req) { - nilfs_mdt_mark_buffer_dirty(req->pr_entry_bh); + mark_buffer_dirty(req->pr_entry_bh); nilfs_mdt_mark_dirty(dat); brelse(req->pr_entry_bh); } @@ -361,7 +361,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) entry->de_blocknr = cpu_to_le64(blocknr); kunmap_atomic(kaddr, KM_USER0); - nilfs_mdt_mark_buffer_dirty(entry_bh); + mark_buffer_dirty(entry_bh); nilfs_mdt_mark_dirty(dat); brelse(entry_bh); diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 397e7325863..d7eeca62feb 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -111,7 +111,6 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) nilfs_transaction_commit(inode->i_sb); mapped: - SetPageChecked(page); wait_on_page_writeback(page); return VM_FAULT_LOCKED; } diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 1c2a3e23f8b..08a07a218d2 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -48,9 +48,6 @@ #include "dat.h" #include "ifile.h" -static const struct address_space_operations def_gcinode_aops = { -}; - /* * nilfs_gccache_submit_read_data() - add data buffer and submit read request * @inode - gc inode @@ -87,9 +84,9 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, goto out; if (pbn == 0) { - struct inode *dat_inode = NILFS_I_NILFS(inode)->ns_dat; - /* use original dat, not gc dat. */ - err = nilfs_dat_translate(dat_inode, vbn, &pbn); + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + + err = nilfs_dat_translate(nilfs->ns_dat, vbn, &pbn); if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */ brelse(bh); goto failed; @@ -103,7 +100,7 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, } if (!buffer_mapped(bh)) { - bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev; + bh->b_bdev = inode->i_sb->s_bdev; set_buffer_mapped(bh); } bh->b_blocknr = pbn; @@ -160,15 +157,11 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh) if (buffer_dirty(bh)) return -EEXIST; - if (buffer_nilfs_node(bh)) { - if (nilfs_btree_broken_node_block(bh)) { - clear_buffer_uptodate(bh); - return -EIO; - } - nilfs_btnode_mark_dirty(bh); - } else { - nilfs_mark_buffer_dirty(bh); + if (buffer_nilfs_node(bh) && nilfs_btree_broken_node_block(bh)) { + clear_buffer_uptodate(bh); + return -EIO; } + mark_buffer_dirty(bh); return 0; } @@ -178,7 +171,7 @@ int nilfs_init_gcinode(struct inode *inode) inode->i_mode = S_IFREG; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - inode->i_mapping->a_ops = &def_gcinode_aops; + inode->i_mapping->a_ops = &empty_aops; inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi; ii->i_flags = 0; diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index bfc73d3a30e..684d76300a8 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -80,7 +80,7 @@ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino, return ret; } nilfs_palloc_commit_alloc_entry(ifile, &req); - nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh); + mark_buffer_dirty(req.pr_entry_bh); nilfs_mdt_mark_dirty(ifile); *out_ino = (ino_t)req.pr_entry_nr; *out_bh = req.pr_entry_bh; @@ -128,7 +128,7 @@ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino) raw_inode->i_flags = 0; kunmap_atomic(kaddr, KM_USER0); - nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh); + mark_buffer_dirty(req.pr_entry_bh); brelse(req.pr_entry_bh); nilfs_palloc_commit_free_entry(ifile, &req); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index c0aa27490c0..587f1843283 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -74,14 +74,14 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, struct buffer_head *bh_result, int create) { struct nilfs_inode_info *ii = NILFS_I(inode); + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; __u64 blknum = 0; int err = 0, ret; - struct inode *dat = NILFS_I_NILFS(inode)->ns_dat; unsigned maxblocks = bh_result->b_size >> inode->i_blkbits; - down_read(&NILFS_MDT(dat)->mi_sem); + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks); - up_read(&NILFS_MDT(dat)->mi_sem); + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); if (ret >= 0) { /* found */ map_bh(bh_result, inode->i_sb, blknum); if (ret > 0) @@ -596,6 +596,16 @@ void nilfs_write_inode_common(struct inode *inode, raw_inode->i_flags = cpu_to_le32(ii->i_flags); raw_inode->i_generation = cpu_to_le32(inode->i_generation); + if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) { + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + + /* zero-fill unused portion in the case of super root block */ + raw_inode->i_xattr = 0; + raw_inode->i_pad = 0; + memset((void *)raw_inode + sizeof(*raw_inode), 0, + nilfs->ns_inode_size - sizeof(*raw_inode)); + } + if (has_bmap) nilfs_bmap_write(ii->i_bmap, raw_inode); else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) @@ -872,8 +882,7 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) return -EINVAL; /* NILFS_I_DIRTY may remain for freeing inode */ } - list_del(&ii->i_dirty); - list_add_tail(&ii->i_dirty, &nilfs->ns_dirty_files); + list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files); set_bit(NILFS_I_QUEUED, &ii->i_state); } spin_unlock(&nilfs->ns_inode_lock); @@ -892,7 +901,7 @@ int nilfs_mark_inode_dirty(struct inode *inode) return err; } nilfs_update_inode(inode, ibh); - nilfs_mdt_mark_buffer_dirty(ibh); + mark_buffer_dirty(ibh); nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile); brelse(ibh); return 0; @@ -931,7 +940,7 @@ void nilfs_dirty_inode(struct inode *inode) int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { - struct the_nilfs *nilfs = NILFS_I_NILFS(inode); + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; __u64 logical = 0, phys = 0, size = 0; __u32 flags = 0; loff_t isize; diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index f2469ba6246..41d6743d303 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -698,6 +698,63 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, return 0; } +static int nilfs_ioctl_resize(struct inode *inode, struct file *filp, + void __user *argp) +{ + __u64 newsize; + int ret = -EPERM; + + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + goto out; + + ret = -EFAULT; + if (copy_from_user(&newsize, argp, sizeof(newsize))) + goto out_drop_write; + + ret = nilfs_resize_fs(inode->i_sb, newsize); + +out_drop_write: + mnt_drop_write(filp->f_path.mnt); +out: + return ret; +} + +static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) +{ + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; + __u64 range[2]; + __u64 minseg, maxseg; + unsigned long segbytes; + int ret = -EPERM; + + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -EFAULT; + if (copy_from_user(range, argp, sizeof(__u64[2]))) + goto out; + + ret = -ERANGE; + if (range[1] > i_size_read(inode->i_sb->s_bdev->bd_inode)) + goto out; + + segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize; + + minseg = range[0] + segbytes - 1; + do_div(minseg, segbytes); + maxseg = NILFS_SB2_OFFSET_BYTES(range[1]); + do_div(maxseg, segbytes); + maxseg--; + + ret = nilfs_sufile_set_alloc_range(nilfs->ns_sufile, minseg, maxseg); +out: + return ret; +} + static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp, size_t membsz, @@ -763,6 +820,10 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return nilfs_ioctl_clean_segments(inode, filp, cmd, argp); case NILFS_IOCTL_SYNC: return nilfs_ioctl_sync(inode, filp, cmd, argp); + case NILFS_IOCTL_RESIZE: + return nilfs_ioctl_resize(inode, filp, argp); + case NILFS_IOCTL_SET_ALLOC_RANGE: + return nilfs_ioctl_set_alloc_range(inode, argp); default: return -ENOTTY; } diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index a649b05f706..800e8d78a83 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -66,7 +66,7 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block, kunmap_atomic(kaddr, KM_USER0); set_buffer_uptodate(bh); - nilfs_mark_buffer_dirty(bh); + mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(inode); return 0; } @@ -355,7 +355,7 @@ int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block) err = nilfs_mdt_read_block(inode, block, 0, &bh); if (unlikely(err)) return err; - nilfs_mark_buffer_dirty(bh); + mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(inode); brelse(bh); return 0; @@ -450,9 +450,9 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode, INIT_LIST_HEAD(&shadow->frozen_buffers); address_space_init_once(&shadow->frozen_data); - nilfs_mapping_init(&shadow->frozen_data, bdi); + nilfs_mapping_init(&shadow->frozen_data, inode, bdi); address_space_init_once(&shadow->frozen_btnodes); - nilfs_mapping_init(&shadow->frozen_btnodes, bdi); + nilfs_mapping_init(&shadow->frozen_btnodes, inode, bdi); mi->mi_shadow = shadow; return 0; } diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index ed68563ec70..ab20a4baa50 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -64,11 +64,6 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode) return inode->i_private; } -static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode) -{ - return inode->i_sb->s_fs_info; -} - /* Default GFP flags using highmem */ #define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM) @@ -93,8 +88,6 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh); struct buffer_head *nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh); -#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh) - static inline void nilfs_mdt_mark_dirty(struct inode *inode) { if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state)) @@ -108,7 +101,7 @@ static inline void nilfs_mdt_clear_dirty(struct inode *inode) static inline __u64 nilfs_mdt_cno(struct inode *inode) { - return NILFS_I_NILFS(inode)->ns_cno; + return ((struct the_nilfs *)inode->i_sb->s_fs_info)->ns_cno; } #define nilfs_mdt_bgl_lock(inode, bg) \ diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index a8dd344303c..a9c6a531f80 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -80,12 +80,6 @@ static inline struct inode *NILFS_BTNC_I(struct address_space *btnc) return &ii->vfs_inode; } -static inline struct inode *NILFS_AS_I(struct address_space *mapping) -{ - return (mapping->host) ? : - container_of(mapping, struct inode, i_data); -} - /* * Dynamic state flags of NILFS on-memory inode (i_state) */ @@ -298,6 +292,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb, int flip); int nilfs_commit_super(struct super_block *sb, int flag); int nilfs_cleanup_super(struct super_block *sb); +int nilfs_resize_fs(struct super_block *sb, __u64 newsize); int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt, struct nilfs_root **root); int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 1168059c7ef..65221a04c6f 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -37,8 +37,7 @@ #define NILFS_BUFFER_INHERENT_BITS \ ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \ - (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated) | \ - (1UL << BH_NILFS_Checked)) + (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Checked)) static struct buffer_head * __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index, @@ -59,19 +58,6 @@ __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index, return bh; } -/* - * Since the page cache of B-tree node pages or data page cache of pseudo - * inodes does not have a valid mapping->host pointer, calling - * mark_buffer_dirty() for their buffers causes a NULL pointer dereference; - * it calls __mark_inode_dirty(NULL) through __set_page_dirty(). - * To avoid this problem, the old style mark_buffer_dirty() is used instead. - */ -void nilfs_mark_buffer_dirty(struct buffer_head *bh) -{ - if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) - __set_page_dirty_nobuffers(bh->b_page); -} - struct buffer_head *nilfs_grab_buffer(struct inode *inode, struct address_space *mapping, unsigned long blkoff, @@ -183,7 +169,7 @@ int nilfs_page_buffers_clean(struct page *page) void nilfs_page_bug(struct page *page) { struct address_space *m; - unsigned long ino = 0; + unsigned long ino; if (unlikely(!page)) { printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n"); @@ -191,11 +177,8 @@ void nilfs_page_bug(struct page *page) } m = page->mapping; - if (m) { - struct inode *inode = NILFS_AS_I(m); - if (inode != NULL) - ino = inode->i_ino; - } + ino = m ? m->host->i_ino : 0; + printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx " "mapping=%p ino=%lu\n", page, atomic_read(&page->_count), @@ -217,56 +200,6 @@ void nilfs_page_bug(struct page *page) } /** - * nilfs_alloc_private_page - allocate a private page with buffer heads - * - * Return Value: On success, a pointer to the allocated page is returned. - * On error, NULL is returned. - */ -struct page *nilfs_alloc_private_page(struct block_device *bdev, int size, - unsigned long state) -{ - struct buffer_head *bh, *head, *tail; - struct page *page; - - page = alloc_page(GFP_NOFS); /* page_count of the returned page is 1 */ - if (unlikely(!page)) - return NULL; - - lock_page(page); - head = alloc_page_buffers(page, size, 0); - if (unlikely(!head)) { - unlock_page(page); - __free_page(page); - return NULL; - } - - bh = head; - do { - bh->b_state = (1UL << BH_NILFS_Allocated) | state; - tail = bh; - bh->b_bdev = bdev; - bh = bh->b_this_page; - } while (bh); - - tail->b_this_page = head; - attach_page_buffers(page, head); - - return page; -} - -void nilfs_free_private_page(struct page *page) -{ - BUG_ON(!PageLocked(page)); - BUG_ON(page->mapping); - - if (page_has_buffers(page) && !try_to_free_buffers(page)) - NILFS_PAGE_BUG(page, "failed to free page"); - - unlock_page(page); - __free_page(page); -} - -/** * nilfs_copy_page -- copy the page with buffers * @dst: destination page * @src: source page @@ -492,10 +425,10 @@ unsigned nilfs_page_count_clean_buffers(struct page *page, return nc; } -void nilfs_mapping_init(struct address_space *mapping, +void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, struct backing_dev_info *bdi) { - mapping->host = NULL; + mapping->host = inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->assoc_mapping = NULL; diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index f06b79ad749..fb7de71605a 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -38,14 +38,12 @@ enum { BH_NILFS_Redirected, }; -BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */ BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */ BUFFER_FNS(NILFS_Volatile, nilfs_volatile) BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */ BUFFER_FNS(NILFS_Redirected, nilfs_redirected) /* redirected to a copy */ -void nilfs_mark_buffer_dirty(struct buffer_head *bh); int __nilfs_clear_page_dirty(struct page *); struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *, @@ -54,14 +52,11 @@ void nilfs_forget_buffer(struct buffer_head *); void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *); int nilfs_page_buffers_clean(struct page *); void nilfs_page_bug(struct page *); -struct page *nilfs_alloc_private_page(struct block_device *, int, - unsigned long); -void nilfs_free_private_page(struct page *); int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); void nilfs_copy_back_pages(struct address_space *, struct address_space *); void nilfs_clear_dirty_pages(struct address_space *); -void nilfs_mapping_init(struct address_space *mapping, +void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, struct backing_dev_info *bdi); unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); unsigned long nilfs_find_uncommitted_extent(struct inode *inode, diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index ba4a64518f3..a604ac0331b 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -387,9 +387,9 @@ static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr, static void dispose_recovery_list(struct list_head *head) { while (!list_empty(head)) { - struct nilfs_recovery_block *rb - = list_entry(head->next, - struct nilfs_recovery_block, list); + struct nilfs_recovery_block *rb; + + rb = list_first_entry(head, struct nilfs_recovery_block, list); list_del(&rb->list); kfree(rb); } @@ -416,9 +416,9 @@ static int nilfs_segment_list_add(struct list_head *head, __u64 segnum) void nilfs_dispose_segment_list(struct list_head *head) { while (!list_empty(head)) { - struct nilfs_segment_entry *ent - = list_entry(head->next, - struct nilfs_segment_entry, list); + struct nilfs_segment_entry *ent; + + ent = list_first_entry(head, struct nilfs_segment_entry, list); list_del(&ent->list); kfree(ent); } diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 2853ff20f85..850a7c0228f 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -239,12 +239,15 @@ nilfs_segbuf_fill_in_super_root_crc(struct nilfs_segment_buffer *segbuf, u32 seed) { struct nilfs_super_root *raw_sr; + struct the_nilfs *nilfs = segbuf->sb_super->s_fs_info; + unsigned srsize; u32 crc; raw_sr = (struct nilfs_super_root *)segbuf->sb_super_root->b_data; + srsize = NILFS_SR_BYTES(nilfs->ns_inode_size); crc = crc32_le(seed, (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum), - NILFS_SR_BYTES - sizeof(raw_sr->sr_sum)); + srsize - sizeof(raw_sr->sr_sum)); raw_sr->sr_sum = cpu_to_le32(crc); } @@ -254,18 +257,6 @@ static void nilfs_release_buffers(struct list_head *list) list_for_each_entry_safe(bh, n, list, b_assoc_buffers) { list_del_init(&bh->b_assoc_buffers); - if (buffer_nilfs_allocated(bh)) { - struct page *clone_page = bh->b_page; - - /* remove clone page */ - brelse(bh); - page_cache_release(clone_page); /* for each bh */ - if (page_count(clone_page) <= 2) { - lock_page(clone_page); - nilfs_free_private_page(clone_page); - } - continue; - } brelse(bh); } } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index afe4f218345..141646e88fb 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -655,13 +655,10 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, if (unlikely(page->index > last)) break; - if (mapping->host) { - lock_page(page); - if (!page_has_buffers(page)) - create_empty_buffers(page, - 1 << inode->i_blkbits, 0); - unlock_page(page); - } + lock_page(page); + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << inode->i_blkbits, 0); + unlock_page(page); bh = head = page_buffers(page); do { @@ -809,7 +806,7 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) /* The following code is duplicated with cpfile. But, it is needed to collect the checkpoint even if it was not newly created */ - nilfs_mdt_mark_buffer_dirty(bh_cp); + mark_buffer_dirty(bh_cp); nilfs_mdt_mark_dirty(nilfs->ns_cpfile); nilfs_cpfile_put_checkpoint( nilfs->ns_cpfile, nilfs->ns_cno, bh_cp); @@ -889,12 +886,14 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, { struct buffer_head *bh_sr; struct nilfs_super_root *raw_sr; - unsigned isz = nilfs->ns_inode_size; + unsigned isz, srsz; bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root; raw_sr = (struct nilfs_super_root *)bh_sr->b_data; + isz = nilfs->ns_inode_size; + srsz = NILFS_SR_BYTES(isz); - raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES); + raw_sr->sr_bytes = cpu_to_le16(srsz); raw_sr->sr_nongc_ctime = cpu_to_le64(nilfs_doing_gc() ? nilfs->ns_nongc_ctime : sci->sc_seg_ctime); @@ -906,6 +905,7 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci, NILFS_SR_CPFILE_OFFSET(isz), 1); nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr + NILFS_SR_SUFILE_OFFSET(isz), 1); + memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz); } static void nilfs_redirty_inodes(struct list_head *head) @@ -954,8 +954,8 @@ static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci, dispose_buffers: while (!list_empty(listp)) { - bh = list_entry(listp->next, struct buffer_head, - b_assoc_buffers); + bh = list_first_entry(listp, struct buffer_head, + b_assoc_buffers); list_del_init(&bh->b_assoc_buffers); brelse(bh); } @@ -1500,10 +1500,7 @@ nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci, nblocks = le32_to_cpu(finfo->fi_nblocks); ndatablk = le32_to_cpu(finfo->fi_ndatablk); - if (buffer_nilfs_node(bh)) - inode = NILFS_BTNC_I(bh->b_page->mapping); - else - inode = NILFS_AS_I(bh->b_page->mapping); + inode = bh->b_page->mapping->host; if (mode == SC_LSEG_DSYNC) sc_op = &nilfs_sc_dsync_ops; @@ -1556,83 +1553,24 @@ static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode) return 0; } -static int -nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out) -{ - struct page *clone_page; - struct buffer_head *bh, *head, *bh2; - void *kaddr; - - bh = head = page_buffers(page); - - clone_page = nilfs_alloc_private_page(bh->b_bdev, bh->b_size, 0); - if (unlikely(!clone_page)) - return -ENOMEM; - - bh2 = page_buffers(clone_page); - kaddr = kmap_atomic(page, KM_USER0); - do { - if (list_empty(&bh->b_assoc_buffers)) - continue; - get_bh(bh2); - page_cache_get(clone_page); /* for each bh */ - memcpy(bh2->b_data, kaddr + bh_offset(bh), bh2->b_size); - bh2->b_blocknr = bh->b_blocknr; - list_replace(&bh->b_assoc_buffers, &bh2->b_assoc_buffers); - list_add_tail(&bh->b_assoc_buffers, out); - } while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head); - kunmap_atomic(kaddr, KM_USER0); - - if (!TestSetPageWriteback(clone_page)) - account_page_writeback(clone_page); - unlock_page(clone_page); - - return 0; -} - -static int nilfs_test_page_to_be_frozen(struct page *page) -{ - struct address_space *mapping = page->mapping; - - if (!mapping || !mapping->host || S_ISDIR(mapping->host->i_mode)) - return 0; - - if (page_mapped(page)) { - ClearPageChecked(page); - return 1; - } - return PageChecked(page); -} - -static int nilfs_begin_page_io(struct page *page, struct list_head *out) +static void nilfs_begin_page_io(struct page *page) { if (!page || PageWriteback(page)) /* For split b-tree node pages, this function may be called twice. We ignore the 2nd or later calls by this check. */ - return 0; + return; lock_page(page); clear_page_dirty_for_io(page); set_page_writeback(page); unlock_page(page); - - if (nilfs_test_page_to_be_frozen(page)) { - int err = nilfs_copy_replace_page_buffers(page, out); - if (unlikely(err)) - return err; - } - return 0; } -static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci, - struct page **failed_page) +static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) { struct nilfs_segment_buffer *segbuf; struct page *bd_page = NULL, *fs_page = NULL; - struct list_head *list = &sci->sc_copied_buffers; - int err; - *failed_page = NULL; list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) { struct buffer_head *bh; @@ -1662,11 +1600,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci, break; } if (bh->b_page != fs_page) { - err = nilfs_begin_page_io(fs_page, list); - if (unlikely(err)) { - *failed_page = fs_page; - goto out; - } + nilfs_begin_page_io(fs_page); fs_page = bh->b_page; } } @@ -1677,11 +1611,7 @@ static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci, set_page_writeback(bd_page); unlock_page(bd_page); } - err = nilfs_begin_page_io(fs_page, list); - if (unlikely(err)) - *failed_page = fs_page; - out: - return err; + nilfs_begin_page_io(fs_page); } static int nilfs_segctor_write(struct nilfs_sc_info *sci, @@ -1694,24 +1624,6 @@ static int nilfs_segctor_write(struct nilfs_sc_info *sci, return ret; } -static void __nilfs_end_page_io(struct page *page, int err) -{ - if (!err) { - if (!nilfs_page_buffers_clean(page)) - __set_page_dirty_nobuffers(page); - ClearPageError(page); - } else { - __set_page_dirty_nobuffers(page); - SetPageError(page); - } - - if (buffer_nilfs_allocated(page_buffers(page))) { - if (TestClearPageWriteback(page)) - dec_zone_page_state(page, NR_WRITEBACK); - } else - end_page_writeback(page); -} - static void nilfs_end_page_io(struct page *page, int err) { if (!page) @@ -1738,40 +1650,19 @@ static void nilfs_end_page_io(struct page *page, int err) return; } - __nilfs_end_page_io(page, err); -} - -static void nilfs_clear_copied_buffers(struct list_head *list, int err) -{ - struct buffer_head *bh, *head; - struct page *page; - - while (!list_empty(list)) { - bh = list_entry(list->next, struct buffer_head, - b_assoc_buffers); - page = bh->b_page; - page_cache_get(page); - head = bh = page_buffers(page); - do { - if (!list_empty(&bh->b_assoc_buffers)) { - list_del_init(&bh->b_assoc_buffers); - if (!err) { - set_buffer_uptodate(bh); - clear_buffer_dirty(bh); - clear_buffer_delay(bh); - clear_buffer_nilfs_volatile(bh); - } - brelse(bh); /* for b_assoc_buffers */ - } - } while ((bh = bh->b_this_page) != head); - - __nilfs_end_page_io(page, err); - page_cache_release(page); + if (!err) { + if (!nilfs_page_buffers_clean(page)) + __set_page_dirty_nobuffers(page); + ClearPageError(page); + } else { + __set_page_dirty_nobuffers(page); + SetPageError(page); } + + end_page_writeback(page); } -static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page, - int err) +static void nilfs_abort_logs(struct list_head *logs, int err) { struct nilfs_segment_buffer *segbuf; struct page *bd_page = NULL, *fs_page = NULL; @@ -1801,8 +1692,6 @@ static void nilfs_abort_logs(struct list_head *logs, struct page *failed_page, } if (bh->b_page != fs_page) { nilfs_end_page_io(fs_page, err); - if (fs_page && fs_page == failed_page) - return; fs_page = bh->b_page; } } @@ -1821,12 +1710,11 @@ static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci, list_splice_tail_init(&sci->sc_write_logs, &logs); ret = nilfs_wait_on_logs(&logs); - nilfs_abort_logs(&logs, NULL, ret ? : err); + nilfs_abort_logs(&logs, ret ? : err); list_splice_tail_init(&sci->sc_segbufs, &logs); nilfs_cancel_segusage(&logs, nilfs->ns_sufile); nilfs_free_incomplete_logs(&logs, nilfs); - nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err); if (sci->sc_stage.flags & NILFS_CF_SUFREED) { ret = nilfs_sufile_cancel_freev(nilfs->ns_sufile, @@ -1920,8 +1808,6 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) nilfs_end_page_io(fs_page, 0); - nilfs_clear_copied_buffers(&sci->sc_copied_buffers, 0); - nilfs_drop_collected_inodes(&sci->sc_dirty_files); if (nilfs_doing_gc()) @@ -1979,7 +1865,7 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, "failed to get inode block.\n"); return err; } - nilfs_mdt_mark_buffer_dirty(ibh); + mark_buffer_dirty(ibh); nilfs_mdt_mark_dirty(ifile); spin_lock(&nilfs->ns_inode_lock); if (likely(!ii->i_bh)) @@ -1991,8 +1877,7 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, clear_bit(NILFS_I_QUEUED, &ii->i_state); set_bit(NILFS_I_BUSY, &ii->i_state); - list_del(&ii->i_dirty); - list_add_tail(&ii->i_dirty, &sci->sc_dirty_files); + list_move_tail(&ii->i_dirty, &sci->sc_dirty_files); } spin_unlock(&nilfs->ns_inode_lock); @@ -2014,8 +1899,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, clear_bit(NILFS_I_BUSY, &ii->i_state); brelse(ii->i_bh); ii->i_bh = NULL; - list_del(&ii->i_dirty); - list_add_tail(&ii->i_dirty, &ti->ti_garbage); + list_move_tail(&ii->i_dirty, &ti->ti_garbage); } spin_unlock(&nilfs->ns_inode_lock); } @@ -2026,7 +1910,6 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) { struct the_nilfs *nilfs = sci->sc_super->s_fs_info; - struct page *failed_page; int err; sci->sc_stage.scnt = NILFS_ST_INIT; @@ -2081,11 +1964,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) nilfs_segctor_update_segusage(sci, nilfs->ns_sufile); /* Write partial segments */ - err = nilfs_segctor_prepare_write(sci, &failed_page); - if (err) { - nilfs_abort_logs(&sci->sc_segbufs, failed_page, err); - goto failed_to_write; - } + nilfs_segctor_prepare_write(sci); nilfs_add_checksums_on_logs(&sci->sc_segbufs, nilfs->ns_crc_seed); @@ -2687,7 +2566,6 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, INIT_LIST_HEAD(&sci->sc_segbufs); INIT_LIST_HEAD(&sci->sc_write_logs); INIT_LIST_HEAD(&sci->sc_gc_inodes); - INIT_LIST_HEAD(&sci->sc_copied_buffers); init_timer(&sci->sc_timer); sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; @@ -2741,8 +2619,6 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) if (flag || !nilfs_segctor_confirm(sci)) nilfs_segctor_write_out(sci); - WARN_ON(!list_empty(&sci->sc_copied_buffers)); - if (!list_empty(&sci->sc_dirty_files)) { nilfs_warning(sci->sc_super, __func__, "dirty file(s) after the final construction\n"); diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 6c02a86745f..38a1d001331 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -92,7 +92,6 @@ struct nilfs_segsum_pointer { * @sc_nblk_inc: Block count of current generation * @sc_dirty_files: List of files to be written * @sc_gc_inodes: List of GC inodes having blocks to be written - * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data * @sc_freesegs: array of segment numbers to be freed * @sc_nfreesegs: number of segments on @sc_freesegs * @sc_dsync_inode: inode whose data pages are written for a sync operation @@ -136,7 +135,6 @@ struct nilfs_sc_info { struct list_head sc_dirty_files; struct list_head sc_gc_inodes; - struct list_head sc_copied_buffers; __u64 *sc_freesegs; size_t sc_nfreesegs; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 1d6f488ccae..0a0aba617d8 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -33,7 +33,9 @@ struct nilfs_sufile_info { struct nilfs_mdt_info mi; - unsigned long ncleansegs; + unsigned long ncleansegs;/* number of clean segments */ + __u64 allocmin; /* lower limit of allocatable segment range */ + __u64 allocmax; /* upper limit of allocatable segment range */ }; static inline struct nilfs_sufile_info *NILFS_SUI(struct inode *sufile) @@ -96,6 +98,13 @@ nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum, create, NULL, bhp); } +static int nilfs_sufile_delete_segment_usage_block(struct inode *sufile, + __u64 segnum) +{ + return nilfs_mdt_delete_block(sufile, + nilfs_sufile_get_blkoff(sufile, segnum)); +} + static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, u64 ncleanadd, u64 ndirtyadd) { @@ -108,7 +117,7 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh, le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd); kunmap_atomic(kaddr, KM_USER0); - nilfs_mdt_mark_buffer_dirty(header_bh); + mark_buffer_dirty(header_bh); } /** @@ -248,6 +257,35 @@ int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create, } /** + * nilfs_sufile_set_alloc_range - limit range of segment to be allocated + * @sufile: inode of segment usage file + * @start: minimum segment number of allocatable region (inclusive) + * @end: maximum segment number of allocatable region (inclusive) + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-ERANGE - invalid segment region + */ +int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end) +{ + struct nilfs_sufile_info *sui = NILFS_SUI(sufile); + __u64 nsegs; + int ret = -ERANGE; + + down_write(&NILFS_MDT(sufile)->mi_sem); + nsegs = nilfs_sufile_get_nsegments(sufile); + + if (start <= end && end < nsegs) { + sui->allocmin = start; + sui->allocmax = end; + ret = 0; + } + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** * nilfs_sufile_alloc - allocate a segment * @sufile: inode of segment usage file * @segnump: pointer to segment number @@ -269,11 +307,12 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) struct buffer_head *header_bh, *su_bh; struct nilfs_sufile_header *header; struct nilfs_segment_usage *su; + struct nilfs_sufile_info *sui = NILFS_SUI(sufile); size_t susz = NILFS_MDT(sufile)->mi_entry_size; __u64 segnum, maxsegnum, last_alloc; void *kaddr; - unsigned long nsegments, ncleansegs, nsus; - int ret, i, j; + unsigned long nsegments, ncleansegs, nsus, cnt; + int ret, j; down_write(&NILFS_MDT(sufile)->mi_sem); @@ -287,13 +326,31 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) kunmap_atomic(kaddr, KM_USER0); nsegments = nilfs_sufile_get_nsegments(sufile); + maxsegnum = sui->allocmax; segnum = last_alloc + 1; - maxsegnum = nsegments - 1; - for (i = 0; i < nsegments; i += nsus) { - if (segnum >= nsegments) { - /* wrap around */ - segnum = 0; - maxsegnum = last_alloc; + if (segnum < sui->allocmin || segnum > sui->allocmax) + segnum = sui->allocmin; + + for (cnt = 0; cnt < nsegments; cnt += nsus) { + if (segnum > maxsegnum) { + if (cnt < sui->allocmax - sui->allocmin + 1) { + /* + * wrap around in the limited region. + * if allocation started from + * sui->allocmin, this never happens. + */ + segnum = sui->allocmin; + maxsegnum = last_alloc; + } else if (segnum > sui->allocmin && + sui->allocmax + 1 < nsegments) { + segnum = sui->allocmax + 1; + maxsegnum = nsegments - 1; + } else if (sui->allocmin > 0) { + segnum = 0; + maxsegnum = sui->allocmin - 1; + } else { + break; /* never happens */ + } } ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &su_bh); @@ -319,9 +376,9 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) header->sh_last_alloc = cpu_to_le64(segnum); kunmap_atomic(kaddr, KM_USER0); - NILFS_SUI(sufile)->ncleansegs--; - nilfs_mdt_mark_buffer_dirty(header_bh); - nilfs_mdt_mark_buffer_dirty(su_bh); + sui->ncleansegs--; + mark_buffer_dirty(header_bh); + mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); brelse(su_bh); *segnump = segnum; @@ -364,7 +421,7 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum, nilfs_sufile_mod_counter(header_bh, -1, 1); NILFS_SUI(sufile)->ncleansegs--; - nilfs_mdt_mark_buffer_dirty(su_bh); + mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); } @@ -395,7 +452,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum, nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1); NILFS_SUI(sufile)->ncleansegs -= clean; - nilfs_mdt_mark_buffer_dirty(su_bh); + mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); } @@ -421,7 +478,7 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, sudirty = nilfs_segment_usage_dirty(su); nilfs_segment_usage_set_clean(su); kunmap_atomic(kaddr, KM_USER0); - nilfs_mdt_mark_buffer_dirty(su_bh); + mark_buffer_dirty(su_bh); nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0); NILFS_SUI(sufile)->ncleansegs++; @@ -441,7 +498,7 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh); if (!ret) { - nilfs_mdt_mark_buffer_dirty(bh); + mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); brelse(bh); } @@ -476,7 +533,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, su->su_nblocks = cpu_to_le32(nblocks); kunmap_atomic(kaddr, KM_USER0); - nilfs_mdt_mark_buffer_dirty(bh); + mark_buffer_dirty(bh); nilfs_mdt_mark_dirty(sufile); brelse(bh); @@ -505,7 +562,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat) { struct buffer_head *header_bh; struct nilfs_sufile_header *header; - struct the_nilfs *nilfs = NILFS_I_NILFS(sufile); + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; void *kaddr; int ret; @@ -555,11 +612,183 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, nilfs_sufile_mod_counter(header_bh, -1, 0); NILFS_SUI(sufile)->ncleansegs--; } - nilfs_mdt_mark_buffer_dirty(su_bh); + mark_buffer_dirty(su_bh); nilfs_mdt_mark_dirty(sufile); } /** + * nilfs_sufile_truncate_range - truncate range of segment array + * @sufile: inode of segment usage file + * @start: start segment number (inclusive) + * @end: end segment number (inclusive) + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid number of segments specified + * + * %-EBUSY - Dirty or active segments are present in the range + */ +static int nilfs_sufile_truncate_range(struct inode *sufile, + __u64 start, __u64 end) +{ + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + struct buffer_head *header_bh; + struct buffer_head *su_bh; + struct nilfs_segment_usage *su, *su2; + size_t susz = NILFS_MDT(sufile)->mi_entry_size; + unsigned long segusages_per_block; + unsigned long nsegs, ncleaned; + __u64 segnum; + void *kaddr; + ssize_t n, nc; + int ret; + int j; + + nsegs = nilfs_sufile_get_nsegments(sufile); + + ret = -EINVAL; + if (start > end || start >= nsegs) + goto out; + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out; + + segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile); + ncleaned = 0; + + for (segnum = start; segnum <= end; segnum += n) { + n = min_t(unsigned long, + segusages_per_block - + nilfs_sufile_get_offset(sufile, segnum), + end - segnum + 1); + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, + &su_bh); + if (ret < 0) { + if (ret != -ENOENT) + goto out_header; + /* hole */ + continue; + } + kaddr = kmap_atomic(su_bh->b_page, KM_USER0); + su = nilfs_sufile_block_get_segment_usage( + sufile, segnum, su_bh, kaddr); + su2 = su; + for (j = 0; j < n; j++, su = (void *)su + susz) { + if ((le32_to_cpu(su->su_flags) & + ~(1UL << NILFS_SEGMENT_USAGE_ERROR)) || + nilfs_segment_is_active(nilfs, segnum + j)) { + ret = -EBUSY; + kunmap_atomic(kaddr, KM_USER0); + brelse(su_bh); + goto out_header; + } + } + nc = 0; + for (su = su2, j = 0; j < n; j++, su = (void *)su + susz) { + if (nilfs_segment_usage_error(su)) { + nilfs_segment_usage_set_clean(su); + nc++; + } + } + kunmap_atomic(kaddr, KM_USER0); + if (nc > 0) { + mark_buffer_dirty(su_bh); + ncleaned += nc; + } + brelse(su_bh); + + if (n == segusages_per_block) { + /* make hole */ + nilfs_sufile_delete_segment_usage_block(sufile, segnum); + } + } + ret = 0; + +out_header: + if (ncleaned > 0) { + NILFS_SUI(sufile)->ncleansegs += ncleaned; + nilfs_sufile_mod_counter(header_bh, ncleaned, 0); + nilfs_mdt_mark_dirty(sufile); + } + brelse(header_bh); +out: + return ret; +} + +/** + * nilfs_sufile_resize - resize segment array + * @sufile: inode of segment usage file + * @newnsegs: new number of segments + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOSPC - Enough free space is not left for shrinking + * + * %-EBUSY - Dirty or active segments exist in the region to be truncated + */ +int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs) +{ + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; + struct buffer_head *header_bh; + struct nilfs_sufile_header *header; + struct nilfs_sufile_info *sui = NILFS_SUI(sufile); + void *kaddr; + unsigned long nsegs, nrsvsegs; + int ret = 0; + + down_write(&NILFS_MDT(sufile)->mi_sem); + + nsegs = nilfs_sufile_get_nsegments(sufile); + if (nsegs == newnsegs) + goto out; + + ret = -ENOSPC; + nrsvsegs = nilfs_nrsvsegs(nilfs, newnsegs); + if (newnsegs < nsegs && nsegs - newnsegs + nrsvsegs > sui->ncleansegs) + goto out; + + ret = nilfs_sufile_get_header_block(sufile, &header_bh); + if (ret < 0) + goto out; + + if (newnsegs > nsegs) { + sui->ncleansegs += newnsegs - nsegs; + } else /* newnsegs < nsegs */ { + ret = nilfs_sufile_truncate_range(sufile, newnsegs, nsegs - 1); + if (ret < 0) + goto out_header; + + sui->ncleansegs -= nsegs - newnsegs; + } + + kaddr = kmap_atomic(header_bh->b_page, KM_USER0); + header = kaddr + bh_offset(header_bh); + header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs); + kunmap_atomic(kaddr, KM_USER0); + + mark_buffer_dirty(header_bh); + nilfs_mdt_mark_dirty(sufile); + nilfs_set_nsegments(nilfs, newnsegs); + +out_header: + brelse(header_bh); +out: + up_write(&NILFS_MDT(sufile)->mi_sem); + return ret; +} + +/** * nilfs_sufile_get_suinfo - * @sufile: inode of segment usage file * @segnum: segment number to start looking @@ -583,7 +812,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, struct nilfs_segment_usage *su; struct nilfs_suinfo *si = buf; size_t susz = NILFS_MDT(sufile)->mi_entry_size; - struct the_nilfs *nilfs = NILFS_I_NILFS(sufile); + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; void *kaddr; unsigned long nsegs, segusages_per_block; ssize_t n; @@ -679,6 +908,9 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize, kunmap_atomic(kaddr, KM_USER0); brelse(header_bh); + sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1; + sui->allocmin = 0; + unlock_new_inode(sufile); out: *inodep = sufile; diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index a943fbacb45..e84bc5b51fc 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -31,11 +31,12 @@ static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) { - return NILFS_I_NILFS(sufile)->ns_nsegments; + return ((struct the_nilfs *)sufile->i_sb->s_fs_info)->ns_nsegments; } unsigned long nilfs_sufile_get_ncleansegs(struct inode *sufile); +int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end); int nilfs_sufile_alloc(struct inode *, __u64 *); int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum); int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, @@ -61,6 +62,7 @@ void nilfs_sufile_do_cancel_free(struct inode *, __u64, struct buffer_head *, void nilfs_sufile_do_set_error(struct inode *, __u64, struct buffer_head *, struct buffer_head *); +int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs); int nilfs_sufile_read(struct super_block *sb, size_t susize, struct nilfs_inode *raw_inode, struct inode **inodep); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 062cca06519..8351c44a732 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -56,6 +56,7 @@ #include "btnode.h" #include "page.h" #include "cpfile.h" +#include "sufile.h" /* nilfs_sufile_resize(), nilfs_sufile_set_alloc_range() */ #include "ifile.h" #include "dat.h" #include "segment.h" @@ -165,7 +166,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) ii->i_state = 0; ii->i_cno = 0; ii->vfs_inode.i_version = 1; - nilfs_btnode_cache_init(&ii->i_btnode_cache, sb->s_bdi); + nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode, sb->s_bdi); return &ii->vfs_inode; } @@ -347,6 +348,134 @@ int nilfs_cleanup_super(struct super_block *sb) return ret; } +/** + * nilfs_move_2nd_super - relocate secondary super block + * @sb: super block instance + * @sb2off: new offset of the secondary super block (in bytes) + */ +static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct buffer_head *nsbh; + struct nilfs_super_block *nsbp; + sector_t blocknr, newblocknr; + unsigned long offset; + int sb2i = -1; /* array index of the secondary superblock */ + int ret = 0; + + /* nilfs->ns_sem must be locked by the caller. */ + if (nilfs->ns_sbh[1] && + nilfs->ns_sbh[1]->b_blocknr > nilfs->ns_first_data_block) { + sb2i = 1; + blocknr = nilfs->ns_sbh[1]->b_blocknr; + } else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) { + sb2i = 0; + blocknr = nilfs->ns_sbh[0]->b_blocknr; + } + if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off) + goto out; /* super block location is unchanged */ + + /* Get new super block buffer */ + newblocknr = sb2off >> nilfs->ns_blocksize_bits; + offset = sb2off & (nilfs->ns_blocksize - 1); + nsbh = sb_getblk(sb, newblocknr); + if (!nsbh) { + printk(KERN_WARNING + "NILFS warning: unable to move secondary superblock " + "to block %llu\n", (unsigned long long)newblocknr); + ret = -EIO; + goto out; + } + nsbp = (void *)nsbh->b_data + offset; + memset(nsbp, 0, nilfs->ns_blocksize); + + if (sb2i >= 0) { + memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize); + brelse(nilfs->ns_sbh[sb2i]); + nilfs->ns_sbh[sb2i] = nsbh; + nilfs->ns_sbp[sb2i] = nsbp; + } else if (nilfs->ns_sbh[0]->b_blocknr < nilfs->ns_first_data_block) { + /* secondary super block will be restored to index 1 */ + nilfs->ns_sbh[1] = nsbh; + nilfs->ns_sbp[1] = nsbp; + } else { + brelse(nsbh); + } +out: + return ret; +} + +/** + * nilfs_resize_fs - resize the filesystem + * @sb: super block instance + * @newsize: new size of the filesystem (in bytes) + */ +int nilfs_resize_fs(struct super_block *sb, __u64 newsize) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + struct nilfs_super_block **sbp; + __u64 devsize, newnsegs; + loff_t sb2off; + int ret; + + ret = -ERANGE; + devsize = i_size_read(sb->s_bdev->bd_inode); + if (newsize > devsize) + goto out; + + /* + * Write lock is required to protect some functions depending + * on the number of segments, the number of reserved segments, + * and so forth. + */ + down_write(&nilfs->ns_segctor_sem); + + sb2off = NILFS_SB2_OFFSET_BYTES(newsize); + newnsegs = sb2off >> nilfs->ns_blocksize_bits; + do_div(newnsegs, nilfs->ns_blocks_per_segment); + + ret = nilfs_sufile_resize(nilfs->ns_sufile, newnsegs); + up_write(&nilfs->ns_segctor_sem); + if (ret < 0) + goto out; + + ret = nilfs_construct_segment(sb); + if (ret < 0) + goto out; + + down_write(&nilfs->ns_sem); + nilfs_move_2nd_super(sb, sb2off); + ret = -EIO; + sbp = nilfs_prepare_super(sb, 0); + if (likely(sbp)) { + nilfs_set_log_cursor(sbp[0], nilfs); + /* + * Drop NILFS_RESIZE_FS flag for compatibility with + * mount-time resize which may be implemented in a + * future release. + */ + sbp[0]->s_state = cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & + ~NILFS_RESIZE_FS); + sbp[0]->s_dev_size = cpu_to_le64(newsize); + sbp[0]->s_nsegments = cpu_to_le64(nilfs->ns_nsegments); + if (sbp[1]) + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + ret = nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL); + } + up_write(&nilfs->ns_sem); + + /* + * Reset the range of allocatable segments last. This order + * is important in the case of expansion because the secondary + * superblock must be protected from log write until migration + * completes. + */ + if (!ret) + nilfs_sufile_set_alloc_range(nilfs->ns_sufile, 0, newnsegs - 1); +out: + return ret; +} + static void nilfs_put_super(struct super_block *sb) { struct the_nilfs *nilfs = sb->s_fs_info; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index d2acd1a651f..d3271409437 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -363,6 +363,24 @@ static unsigned long long nilfs_max_size(unsigned int blkbits) return res; } +/** + * nilfs_nrsvsegs - calculate the number of reserved segments + * @nilfs: nilfs object + * @nsegs: total number of segments + */ +unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs) +{ + return max_t(unsigned long, NILFS_MIN_NRSVSEGS, + DIV_ROUND_UP(nsegs * nilfs->ns_r_segments_percentage, + 100)); +} + +void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs) +{ + nilfs->ns_nsegments = nsegs; + nilfs->ns_nrsvsegs = nilfs_nrsvsegs(nilfs, nsegs); +} + static int nilfs_store_disk_layout(struct the_nilfs *nilfs, struct nilfs_super_block *sbp) { @@ -389,13 +407,9 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs, } nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block); - nilfs->ns_nsegments = le64_to_cpu(sbp->s_nsegments); nilfs->ns_r_segments_percentage = le32_to_cpu(sbp->s_r_segments_percentage); - nilfs->ns_nrsvsegs = - max_t(unsigned long, NILFS_MIN_NRSVSEGS, - DIV_ROUND_UP(nilfs->ns_nsegments * - nilfs->ns_r_segments_percentage, 100)); + nilfs_set_nsegments(nilfs, le64_to_cpu(sbp->s_nsegments)); nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed); return 0; } diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index f4968145c2a..9992b11312f 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -268,6 +268,8 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev); void destroy_nilfs(struct the_nilfs *nilfs); int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data); int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb); +unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs); +void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs); int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t); int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 5d32749c896..3c7606cff1a 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3706,7 +3706,7 @@ int ocfs2_refcount_cow_xattr(struct inode *inode, context->cow_start = cow_start; context->cow_len = cow_len; context->ref_tree = ref_tree; - context->ref_root_bh = ref_root_bh;; + context->ref_root_bh = ref_root_bh; context->cow_object = xv; context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd; diff --git a/fs/partitions/check.c b/fs/partitions/check.c index d545e97d99c..8ed4d343319 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -255,7 +255,11 @@ ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%u\n", p->discard_alignment); + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%u\n", + queue_limit_discard_alignment(&disk->queue->limits, + p->start_sect)); } ssize_t part_stat_show(struct device *dev, @@ -449,8 +453,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, p->start_sect = start; p->alignment_offset = queue_limit_alignment_offset(&disk->queue->limits, start); - p->discard_alignment = - queue_limit_discard_alignment(&disk->queue->limits, start); p->nr_sects = len; p->partno = partno; p->policy = get_disk_ro(disk); diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index ce4f6244042..af9fdf04676 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -565,7 +565,7 @@ static bool ldm_validate_partition_table(struct parsed_partitions *state) data = read_part_sector(state, 0, §); if (!data) { - ldm_crit ("Disk read failed."); + ldm_info ("Disk read failed."); return false; } @@ -1335,6 +1335,11 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags) list_add_tail (&f->list, frags); found: + if (rec >= f->num) { + ldm_error("REC value (%d) exceeds NUM value (%d)", rec, f->num); + return false; + } + if (f->map & (1 << rec)) { ldm_error ("Duplicate VBLK, part %d.", rec); f->map &= 0x7F; /* Mark the group as broken */ diff --git a/fs/proc/internal.h b/fs/proc/internal.h index c03e8d3a3a5..3763b436e69 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -61,6 +61,14 @@ extern const struct file_operations proc_pagemap_operations; extern const struct file_operations proc_net_operations; extern const struct inode_operations proc_net_inode_operations; +struct proc_maps_private { + struct pid *pid; + struct task_struct *task; +#ifdef CONFIG_MMU + struct vm_area_struct *tail_vma; +#endif +}; + void proc_init_inodecache(void); static inline struct pid *proc_pid(struct inode *inode) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 318d8654989..2c9db29ea35 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -858,7 +858,192 @@ const struct file_operations proc_pagemap_operations = { #endif /* CONFIG_PROC_PAGE_MONITOR */ #ifdef CONFIG_NUMA -extern int show_numa_map(struct seq_file *m, void *v); + +struct numa_maps { + struct vm_area_struct *vma; + unsigned long pages; + unsigned long anon; + unsigned long active; + unsigned long writeback; + unsigned long mapcount_max; + unsigned long dirty; + unsigned long swapcache; + unsigned long node[MAX_NUMNODES]; +}; + +struct numa_maps_private { + struct proc_maps_private proc_maps; + struct numa_maps md; +}; + +static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty) +{ + int count = page_mapcount(page); + + md->pages++; + if (pte_dirty || PageDirty(page)) + md->dirty++; + + if (PageSwapCache(page)) + md->swapcache++; + + if (PageActive(page) || PageUnevictable(page)) + md->active++; + + if (PageWriteback(page)) + md->writeback++; + + if (PageAnon(page)) + md->anon++; + + if (count > md->mapcount_max) + md->mapcount_max = count; + + md->node[page_to_nid(page)]++; +} + +static int gather_pte_stats(pmd_t *pmd, unsigned long addr, + unsigned long end, struct mm_walk *walk) +{ + struct numa_maps *md; + spinlock_t *ptl; + pte_t *orig_pte; + pte_t *pte; + + md = walk->private; + orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + do { + struct page *page; + int nid; + + if (!pte_present(*pte)) + continue; + + page = vm_normal_page(md->vma, addr, *pte); + if (!page) + continue; + + if (PageReserved(page)) + continue; + + nid = page_to_nid(page); + if (!node_isset(nid, node_states[N_HIGH_MEMORY])) + continue; + + gather_stats(page, md, pte_dirty(*pte)); + + } while (pte++, addr += PAGE_SIZE, addr != end); + pte_unmap_unlock(orig_pte, ptl); + return 0; +} +#ifdef CONFIG_HUGETLB_PAGE +static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, struct mm_walk *walk) +{ + struct numa_maps *md; + struct page *page; + + if (pte_none(*pte)) + return 0; + + page = pte_page(*pte); + if (!page) + return 0; + + md = walk->private; + gather_stats(page, md, pte_dirty(*pte)); + return 0; +} + +#else +static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, struct mm_walk *walk) +{ + return 0; +} +#endif + +/* + * Display pages allocated per node and memory policy via /proc. + */ +static int show_numa_map(struct seq_file *m, void *v) +{ + struct numa_maps_private *numa_priv = m->private; + struct proc_maps_private *proc_priv = &numa_priv->proc_maps; + struct vm_area_struct *vma = v; + struct numa_maps *md = &numa_priv->md; + struct file *file = vma->vm_file; + struct mm_struct *mm = vma->vm_mm; + struct mm_walk walk = {}; + struct mempolicy *pol; + int n; + char buffer[50]; + + if (!mm) + return 0; + + /* Ensure we start with an empty set of numa_maps statistics. */ + memset(md, 0, sizeof(*md)); + + md->vma = vma; + + walk.hugetlb_entry = gather_hugetbl_stats; + walk.pmd_entry = gather_pte_stats; + walk.private = md; + walk.mm = mm; + + pol = get_vma_policy(proc_priv->task, vma, vma->vm_start); + mpol_to_str(buffer, sizeof(buffer), pol, 0); + mpol_cond_put(pol); + + seq_printf(m, "%08lx %s", vma->vm_start, buffer); + + if (file) { + seq_printf(m, " file="); + seq_path(m, &file->f_path, "\n\t= "); + } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + seq_printf(m, " heap"); + } else if (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack) { + seq_printf(m, " stack"); + } + + walk_page_range(vma->vm_start, vma->vm_end, &walk); + + if (!md->pages) + goto out; + + if (md->anon) + seq_printf(m, " anon=%lu", md->anon); + + if (md->dirty) + seq_printf(m, " dirty=%lu", md->dirty); + + if (md->pages != md->anon && md->pages != md->dirty) + seq_printf(m, " mapped=%lu", md->pages); + + if (md->mapcount_max > 1) + seq_printf(m, " mapmax=%lu", md->mapcount_max); + + if (md->swapcache) + seq_printf(m, " swapcache=%lu", md->swapcache); + + if (md->active < md->pages && !is_vm_hugetlb_page(vma)) + seq_printf(m, " active=%lu", md->active); + + if (md->writeback) + seq_printf(m, " writeback=%lu", md->writeback); + + for_each_node_state(n, N_HIGH_MEMORY) + if (md->node[n]) + seq_printf(m, " N%d=%lu", n, md->node[n]); +out: + seq_putc(m, '\n'); + + if (m->count < m->size) + m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0; + return 0; +} static const struct seq_operations proc_pid_numa_maps_op = { .start = m_start, @@ -869,7 +1054,20 @@ static const struct seq_operations proc_pid_numa_maps_op = { static int numa_maps_open(struct inode *inode, struct file *file) { - return do_maps_open(inode, file, &proc_pid_numa_maps_op); + struct numa_maps_private *priv; + int ret = -ENOMEM; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->proc_maps.pid = proc_pid(inode); + ret = seq_open(file, &proc_pid_numa_maps_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } + } + return ret; } const struct file_operations proc_numa_maps_operations = { @@ -878,4 +1076,4 @@ const struct file_operations proc_numa_maps_operations = { .llseek = seq_lseek, .release = seq_release_private, }; -#endif +#endif /* CONFIG_NUMA */ diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index f835a25625f..f2c3ff20ea6 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -152,21 +152,27 @@ EXPORT_SYMBOL_GPL(pstore_register); void pstore_get_records(void) { struct pstore_info *psi = psinfo; - size_t size; + ssize_t size; u64 id; enum pstore_type_id type; struct timespec time; - int failed = 0; + int failed = 0, rc; if (!psi) return; mutex_lock(&psinfo->buf_mutex); + rc = psi->open(psi); + if (rc) + goto out; + while ((size = psi->read(&id, &type, &time)) > 0) { - if (pstore_mkfile(type, psi->name, id, psi->buf, size, + if (pstore_mkfile(type, psi->name, id, psi->buf, (size_t)size, time, psi->erase)) failed++; } + psi->close(psi); +out: mutex_unlock(&psinfo->buf_mutex); if (failed) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index d3c032f5fa0..5b572c89e6c 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -691,8 +691,11 @@ static void prune_dqcache(int count) * This is called from kswapd when we think we need some * more memory */ -static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) +static int shrink_dqcache_memory(struct shrinker *shrink, + struct shrink_control *sc) { + int nr = sc->nr_to_scan; + if (nr) { spin_lock(&dq_list_lock); prune_dqcache(nr); diff --git a/fs/splice.c b/fs/splice.c index 50a5d978da1..aa866d30969 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -162,6 +162,14 @@ static const struct pipe_buf_operations user_page_pipe_buf_ops = { .get = generic_pipe_buf_get, }; +static void wakeup_pipe_readers(struct pipe_inode_info *pipe) +{ + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); +} + /** * splice_to_pipe - fill passed data into a pipe * @pipe: pipe to fill @@ -247,12 +255,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, pipe_unlock(pipe); - if (do_wakeup) { - smp_mb(); - if (waitqueue_active(&pipe->wait)) - wake_up_interruptible(&pipe->wait); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - } + if (do_wakeup) + wakeup_pipe_readers(pipe); while (page_nr < spd_pages) spd->spd_release(spd, page_nr++); @@ -1892,12 +1896,9 @@ retry: /* * If we put data in the output pipe, wakeup any potential readers. */ - if (ret > 0) { - smp_mb(); - if (waitqueue_active(&opipe->wait)) - wake_up_interruptible(&opipe->wait); - kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); - } + if (ret > 0) + wakeup_pipe_readers(opipe); + if (input_wakeup) wakeup_pipe_writers(ipipe); @@ -1976,12 +1977,8 @@ static int link_pipe(struct pipe_inode_info *ipipe, /* * If we put data in the output pipe, wakeup any potential readers. */ - if (ret > 0) { - smp_mb(); - if (waitqueue_active(&opipe->wait)) - wake_up_interruptible(&opipe->wait); - kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); - } + if (ret > 0) + wakeup_pipe_readers(opipe); return ret; } diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index efc309fa303..7797218d0b3 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -42,7 +42,7 @@ config SQUASHFS_LZO select LZO_DECOMPRESS help Saying Y here includes support for reading Squashfs file systems - compressed with LZO compresssion. LZO compression is mainly + compressed with LZO compression. LZO compression is mainly aimed at embedded systems with slower CPUs where the overheads of zlib are too high. @@ -57,7 +57,7 @@ config SQUASHFS_XZ select XZ_DEC help Saying Y here includes support for reading Squashfs file systems - compressed with XZ compresssion. XZ gives better compression than + compressed with XZ compression. XZ gives better compression than the default zlib compression, at the expense of greater CPU and memory overhead. diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index c37b520132f..4b5a3fbb1f1 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -29,7 +29,7 @@ * plus functions layered ontop of the generic cache implementation to * access the metadata and fragment caches. * - * To avoid out of memory and fragmentation isssues with vmalloc the cache + * To avoid out of memory and fragmentation issues with vmalloc the cache * uses sequences of kmalloced PAGE_CACHE_SIZE buffers. * * It should be noted that the cache is not used for file datablocks, these diff --git a/fs/super.c b/fs/super.c index 8a06881b192..c04f7e0b7ed 100644 --- a/fs/super.c +++ b/fs/super.c @@ -948,8 +948,7 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE * but s_maxbytes was an unsigned long long for many releases. Throw * this warning for a little while to try and catch filesystems that - * violate this rule. This warning should be either removed or - * converted to a BUG() in 2.6.34. + * violate this rule. */ WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " "negative value (%lld)\n", type->name, sb->s_maxbytes); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index da3fefe91a8..1ad8c93c1b8 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -24,13 +24,6 @@ #include "sysfs.h" -/* used in crash dumps to help with debugging */ -static char last_sysfs_file[PATH_MAX]; -void sysfs_printk_last_file(void) -{ - printk(KERN_EMERG "last sysfs file: %s\n", last_sysfs_file); -} - /* * There's one sysfs_buffer for each open file and one * sysfs_open_dirent for each sysfs_dirent with one or more open @@ -337,11 +330,6 @@ static int sysfs_open_file(struct inode *inode, struct file *file) struct sysfs_buffer *buffer; const struct sysfs_ops *ops; int error = -EACCES; - char *p; - - p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file)); - if (!IS_ERR(p)) - memmove(last_sysfs_file, p, strlen(p) + 1); /* need attr_sd for attr and ops, its parent for kobj */ if (!sysfs_get_active(attr_sd)) diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index c8769dc222d..194414f8298 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -101,9 +101,9 @@ int sysfs_create_group(struct kobject *kobj, } /** - * sysfs_update_group - given a directory kobject, create an attribute group - * @kobj: The kobject to create the group on - * @grp: The attribute group to create + * sysfs_update_group - given a directory kobject, update an attribute group + * @kobj: The kobject to update the group on + * @grp: The attribute group to update * * This function updates an attribute group. Unlike * sysfs_create_group(), it will explicitly not warn or error if any diff --git a/fs/timerfd.c b/fs/timerfd.c index 8c4fc1425b3..f67acbdda5e 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -22,16 +22,24 @@ #include <linux/anon_inodes.h> #include <linux/timerfd.h> #include <linux/syscalls.h> +#include <linux/rcupdate.h> struct timerfd_ctx { struct hrtimer tmr; ktime_t tintv; + ktime_t moffs; wait_queue_head_t wqh; u64 ticks; int expired; int clockid; + struct rcu_head rcu; + struct list_head clist; + bool might_cancel; }; +static LIST_HEAD(cancel_list); +static DEFINE_SPINLOCK(cancel_lock); + /* * This gets called when the timer event triggers. We set the "expired" * flag, but we do not re-arm the timer (in case it's necessary, @@ -51,6 +59,63 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) return HRTIMER_NORESTART; } +/* + * Called when the clock was set to cancel the timers in the cancel + * list. + */ +void timerfd_clock_was_set(void) +{ + ktime_t moffs = ktime_get_monotonic_offset(); + struct timerfd_ctx *ctx; + unsigned long flags; + + rcu_read_lock(); + list_for_each_entry_rcu(ctx, &cancel_list, clist) { + if (!ctx->might_cancel) + continue; + spin_lock_irqsave(&ctx->wqh.lock, flags); + if (ctx->moffs.tv64 != moffs.tv64) { + ctx->moffs.tv64 = KTIME_MAX; + wake_up_locked(&ctx->wqh); + } + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + } + rcu_read_unlock(); +} + +static void timerfd_remove_cancel(struct timerfd_ctx *ctx) +{ + if (ctx->might_cancel) { + ctx->might_cancel = false; + spin_lock(&cancel_lock); + list_del_rcu(&ctx->clist); + spin_unlock(&cancel_lock); + } +} + +static bool timerfd_canceled(struct timerfd_ctx *ctx) +{ + if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX) + return false; + ctx->moffs = ktime_get_monotonic_offset(); + return true; +} + +static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags) +{ + if (ctx->clockid == CLOCK_REALTIME && (flags & TFD_TIMER_ABSTIME) && + (flags & TFD_TIMER_CANCEL_ON_SET)) { + if (!ctx->might_cancel) { + ctx->might_cancel = true; + spin_lock(&cancel_lock); + list_add_rcu(&ctx->clist, &cancel_list); + spin_unlock(&cancel_lock); + } + } else if (ctx->might_cancel) { + timerfd_remove_cancel(ctx); + } +} + static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) { ktime_t remaining; @@ -59,11 +124,12 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) return remaining.tv64 < 0 ? ktime_set(0, 0): remaining; } -static void timerfd_setup(struct timerfd_ctx *ctx, int flags, - const struct itimerspec *ktmr) +static int timerfd_setup(struct timerfd_ctx *ctx, int flags, + const struct itimerspec *ktmr) { enum hrtimer_mode htmode; ktime_t texp; + int clockid = ctx->clockid; htmode = (flags & TFD_TIMER_ABSTIME) ? HRTIMER_MODE_ABS: HRTIMER_MODE_REL; @@ -72,19 +138,24 @@ static void timerfd_setup(struct timerfd_ctx *ctx, int flags, ctx->expired = 0; ctx->ticks = 0; ctx->tintv = timespec_to_ktime(ktmr->it_interval); - hrtimer_init(&ctx->tmr, ctx->clockid, htmode); + hrtimer_init(&ctx->tmr, clockid, htmode); hrtimer_set_expires(&ctx->tmr, texp); ctx->tmr.function = timerfd_tmrproc; - if (texp.tv64 != 0) + if (texp.tv64 != 0) { hrtimer_start(&ctx->tmr, texp, htmode); + if (timerfd_canceled(ctx)) + return -ECANCELED; + } + return 0; } static int timerfd_release(struct inode *inode, struct file *file) { struct timerfd_ctx *ctx = file->private_data; + timerfd_remove_cancel(ctx); hrtimer_cancel(&ctx->tmr); - kfree(ctx); + kfree_rcu(ctx, rcu); return 0; } @@ -118,8 +189,21 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, res = -EAGAIN; else res = wait_event_interruptible_locked_irq(ctx->wqh, ctx->ticks); + + /* + * If clock has changed, we do not care about the + * ticks and we do not rearm the timer. Userspace must + * reevaluate anyway. + */ + if (timerfd_canceled(ctx)) { + ctx->ticks = 0; + ctx->expired = 0; + res = -ECANCELED; + } + if (ctx->ticks) { ticks = ctx->ticks; + if (ctx->expired && ctx->tintv.tv64) { /* * If tintv.tv64 != 0, this is a periodic timer that @@ -183,6 +267,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) init_waitqueue_head(&ctx->wqh); ctx->clockid = clockid; hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); + ctx->moffs = ktime_get_monotonic_offset(); ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); @@ -199,6 +284,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, struct file *file; struct timerfd_ctx *ctx; struct itimerspec ktmr, kotmr; + int ret; if (copy_from_user(&ktmr, utmr, sizeof(ktmr))) return -EFAULT; @@ -213,6 +299,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, return PTR_ERR(file); ctx = file->private_data; + timerfd_setup_cancel(ctx, flags); + /* * We need to stop the existing timer before reprogramming * it to the new values. @@ -240,14 +328,14 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, /* * Re-program the timer to the new value ... */ - timerfd_setup(ctx, flags, &ktmr); + ret = timerfd_setup(ctx, flags, &ktmr); spin_unlock_irq(&ctx->wqh.lock); fput(file); if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr))) return -EFAULT; - return 0; + return ret; } SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr) diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 8b3a7da531e..315de66e52b 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -106,7 +106,7 @@ static long long get_liability(struct ubifs_info *c) long long liab; spin_lock(&c->space_lock); - liab = c->budg_idx_growth + c->budg_data_growth + c->budg_dd_growth; + liab = c->bi.idx_growth + c->bi.data_growth + c->bi.dd_growth; spin_unlock(&c->space_lock); return liab; } @@ -180,7 +180,7 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c) int idx_lebs; long long idx_size; - idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; + idx_size = c->bi.old_idx_sz + c->bi.idx_growth + c->bi.uncommitted_idx; /* And make sure we have thrice the index size of space reserved */ idx_size += idx_size << 1; /* @@ -292,13 +292,13 @@ static int can_use_rp(struct ubifs_info *c) * budgeted index space to the size of the current index, multiplies this by 3, * and makes sure this does not exceed the amount of free LEBs. * - * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: + * Notes about @c->bi.min_idx_lebs and @c->lst.idx_lebs variables: * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might * be large, because UBIFS does not do any index consolidation as long as * there is free space. IOW, the index may take a lot of LEBs, but the LEBs * will contain a lot of dirt. - * o @c->min_idx_lebs is the number of LEBS the index presumably takes. IOW, - * the index may be consolidated to take up to @c->min_idx_lebs LEBs. + * o @c->bi.min_idx_lebs is the number of LEBS the index presumably takes. IOW, + * the index may be consolidated to take up to @c->bi.min_idx_lebs LEBs. * * This function returns zero in case of success, and %-ENOSPC in case of * failure. @@ -343,13 +343,13 @@ static int do_budget_space(struct ubifs_info *c) c->lst.taken_empty_lebs; if (unlikely(rsvd_idx_lebs > lebs)) { dbg_budg("out of indexing space: min_idx_lebs %d (old %d), " - "rsvd_idx_lebs %d", min_idx_lebs, c->min_idx_lebs, + "rsvd_idx_lebs %d", min_idx_lebs, c->bi.min_idx_lebs, rsvd_idx_lebs); return -ENOSPC; } available = ubifs_calc_available(c, min_idx_lebs); - outstanding = c->budg_data_growth + c->budg_dd_growth; + outstanding = c->bi.data_growth + c->bi.dd_growth; if (unlikely(available < outstanding)) { dbg_budg("out of data space: available %lld, outstanding %lld", @@ -360,7 +360,7 @@ static int do_budget_space(struct ubifs_info *c) if (available - outstanding <= c->rp_size && !can_use_rp(c)) return -ENOSPC; - c->min_idx_lebs = min_idx_lebs; + c->bi.min_idx_lebs = min_idx_lebs; return 0; } @@ -393,11 +393,11 @@ static int calc_data_growth(const struct ubifs_info *c, { int data_growth; - data_growth = req->new_ino ? c->inode_budget : 0; + data_growth = req->new_ino ? c->bi.inode_budget : 0; if (req->new_page) - data_growth += c->page_budget; + data_growth += c->bi.page_budget; if (req->new_dent) - data_growth += c->dent_budget; + data_growth += c->bi.dent_budget; data_growth += req->new_ino_d; return data_growth; } @@ -413,12 +413,12 @@ static int calc_dd_growth(const struct ubifs_info *c, { int dd_growth; - dd_growth = req->dirtied_page ? c->page_budget : 0; + dd_growth = req->dirtied_page ? c->bi.page_budget : 0; if (req->dirtied_ino) - dd_growth += c->inode_budget << (req->dirtied_ino - 1); + dd_growth += c->bi.inode_budget << (req->dirtied_ino - 1); if (req->mod_dent) - dd_growth += c->dent_budget; + dd_growth += c->bi.dent_budget; dd_growth += req->dirtied_ino_d; return dd_growth; } @@ -460,19 +460,19 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) again: spin_lock(&c->space_lock); - ubifs_assert(c->budg_idx_growth >= 0); - ubifs_assert(c->budg_data_growth >= 0); - ubifs_assert(c->budg_dd_growth >= 0); + ubifs_assert(c->bi.idx_growth >= 0); + ubifs_assert(c->bi.data_growth >= 0); + ubifs_assert(c->bi.dd_growth >= 0); - if (unlikely(c->nospace) && (c->nospace_rp || !can_use_rp(c))) { + if (unlikely(c->bi.nospace) && (c->bi.nospace_rp || !can_use_rp(c))) { dbg_budg("no space"); spin_unlock(&c->space_lock); return -ENOSPC; } - c->budg_idx_growth += idx_growth; - c->budg_data_growth += data_growth; - c->budg_dd_growth += dd_growth; + c->bi.idx_growth += idx_growth; + c->bi.data_growth += data_growth; + c->bi.dd_growth += dd_growth; err = do_budget_space(c); if (likely(!err)) { @@ -484,9 +484,9 @@ again: } /* Restore the old values */ - c->budg_idx_growth -= idx_growth; - c->budg_data_growth -= data_growth; - c->budg_dd_growth -= dd_growth; + c->bi.idx_growth -= idx_growth; + c->bi.data_growth -= data_growth; + c->bi.dd_growth -= dd_growth; spin_unlock(&c->space_lock); if (req->fast) { @@ -506,9 +506,9 @@ again: goto again; } dbg_budg("FS is full, -ENOSPC"); - c->nospace = 1; + c->bi.nospace = 1; if (can_use_rp(c) || c->rp_size == 0) - c->nospace_rp = 1; + c->bi.nospace_rp = 1; smp_wmb(); } else ubifs_err("cannot budget space, error %d", err); @@ -523,8 +523,8 @@ again: * This function releases the space budgeted by 'ubifs_budget_space()'. Note, * since the index changes (which were budgeted for in @req->idx_growth) will * only be written to the media on commit, this function moves the index budget - * from @c->budg_idx_growth to @c->budg_uncommitted_idx. The latter will be - * zeroed by the commit operation. + * from @c->bi.idx_growth to @c->bi.uncommitted_idx. The latter will be zeroed + * by the commit operation. */ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) { @@ -553,23 +553,23 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) if (!req->data_growth && !req->dd_growth) return; - c->nospace = c->nospace_rp = 0; + c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); spin_lock(&c->space_lock); - c->budg_idx_growth -= req->idx_growth; - c->budg_uncommitted_idx += req->idx_growth; - c->budg_data_growth -= req->data_growth; - c->budg_dd_growth -= req->dd_growth; - c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); - - ubifs_assert(c->budg_idx_growth >= 0); - ubifs_assert(c->budg_data_growth >= 0); - ubifs_assert(c->budg_dd_growth >= 0); - ubifs_assert(c->min_idx_lebs < c->main_lebs); - ubifs_assert(!(c->budg_idx_growth & 7)); - ubifs_assert(!(c->budg_data_growth & 7)); - ubifs_assert(!(c->budg_dd_growth & 7)); + c->bi.idx_growth -= req->idx_growth; + c->bi.uncommitted_idx += req->idx_growth; + c->bi.data_growth -= req->data_growth; + c->bi.dd_growth -= req->dd_growth; + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); + + ubifs_assert(c->bi.idx_growth >= 0); + ubifs_assert(c->bi.data_growth >= 0); + ubifs_assert(c->bi.dd_growth >= 0); + ubifs_assert(c->bi.min_idx_lebs < c->main_lebs); + ubifs_assert(!(c->bi.idx_growth & 7)); + ubifs_assert(!(c->bi.data_growth & 7)); + ubifs_assert(!(c->bi.dd_growth & 7)); spin_unlock(&c->space_lock); } @@ -586,13 +586,13 @@ void ubifs_convert_page_budget(struct ubifs_info *c) { spin_lock(&c->space_lock); /* Release the index growth reservation */ - c->budg_idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT; + c->bi.idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT; /* Release the data growth reservation */ - c->budg_data_growth -= c->page_budget; + c->bi.data_growth -= c->bi.page_budget; /* Increase the dirty data growth reservation instead */ - c->budg_dd_growth += c->page_budget; + c->bi.dd_growth += c->bi.page_budget; /* And re-calculate the indexing space reservation */ - c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); spin_unlock(&c->space_lock); } @@ -612,7 +612,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c, memset(&req, 0, sizeof(struct ubifs_budget_req)); /* The "no space" flags will be cleared because dd_growth is > 0 */ - req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8); + req.dd_growth = c->bi.inode_budget + ALIGN(ui->data_len, 8); ubifs_release_budget(c, &req); } @@ -682,9 +682,9 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c) int rsvd_idx_lebs, lebs; long long available, outstanding, free; - ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c)); - outstanding = c->budg_data_growth + c->budg_dd_growth; - available = ubifs_calc_available(c, c->min_idx_lebs); + ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c)); + outstanding = c->bi.data_growth + c->bi.dd_growth; + available = ubifs_calc_available(c, c->bi.min_idx_lebs); /* * When reporting free space to user-space, UBIFS guarantees that it is @@ -697,8 +697,8 @@ long long ubifs_get_free_space_nolock(struct ubifs_info *c) * Note, the calculations below are similar to what we have in * 'do_budget_space()', so refer there for comments. */ - if (c->min_idx_lebs > c->lst.idx_lebs) - rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; + if (c->bi.min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs; else rsvd_idx_lebs = 0; lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index 1bd01ded712..87cd0ead863 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -182,7 +182,7 @@ static int do_commit(struct ubifs_info *c) c->mst_node->root_len = cpu_to_le32(zroot.len); c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum); c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs); - c->mst_node->index_size = cpu_to_le64(c->old_idx_sz); + c->mst_node->index_size = cpu_to_le64(c->bi.old_idx_sz); c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum); c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs); c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum); diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 004d3745dc4..0bb2bcef0de 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -34,7 +34,6 @@ #include <linux/moduleparam.h> #include <linux/debugfs.h> #include <linux/math64.h> -#include <linux/slab.h> #ifdef CONFIG_UBIFS_FS_DEBUG @@ -43,15 +42,12 @@ DEFINE_SPINLOCK(dbg_lock); static char dbg_key_buf0[128]; static char dbg_key_buf1[128]; -unsigned int ubifs_msg_flags; unsigned int ubifs_chk_flags; unsigned int ubifs_tst_flags; -module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR); module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR); module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(debug_msgs, "Debug message type flags"); MODULE_PARM_DESC(debug_chks, "Debug check flags"); MODULE_PARM_DESC(debug_tsts, "Debug special test flags"); @@ -317,6 +313,8 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) printk(KERN_DEBUG "\tflags %#x\n", sup_flags); printk(KERN_DEBUG "\t big_lpt %u\n", !!(sup_flags & UBIFS_FLG_BIGLPT)); + printk(KERN_DEBUG "\t space_fixup %u\n", + !!(sup_flags & UBIFS_FLG_SPACE_FIXUP)); printk(KERN_DEBUG "\tmin_io_size %u\n", le32_to_cpu(sup->min_io_size)); printk(KERN_DEBUG "\tleb_size %u\n", @@ -602,7 +600,7 @@ void dbg_dump_lstats(const struct ubifs_lp_stats *lst) spin_unlock(&dbg_lock); } -void dbg_dump_budg(struct ubifs_info *c) +void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi) { int i; struct rb_node *rb; @@ -610,26 +608,42 @@ void dbg_dump_budg(struct ubifs_info *c) struct ubifs_gced_idx_leb *idx_gc; long long available, outstanding, free; - ubifs_assert(spin_is_locked(&c->space_lock)); + spin_lock(&c->space_lock); spin_lock(&dbg_lock); - printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, " - "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid, - c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth); - printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, " - "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth, - c->budg_data_growth + c->budg_dd_growth + c->budg_idx_growth, - c->freeable_cnt); - printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %lld, " - "calc_idx_sz %lld, idx_gc_cnt %d\n", c->min_idx_lebs, - c->old_idx_sz, c->calc_idx_sz, c->idx_gc_cnt); + printk(KERN_DEBUG "(pid %d) Budgeting info: data budget sum %lld, " + "total budget sum %lld\n", current->pid, + bi->data_growth + bi->dd_growth, + bi->data_growth + bi->dd_growth + bi->idx_growth); + printk(KERN_DEBUG "\tbudg_data_growth %lld, budg_dd_growth %lld, " + "budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth, + bi->idx_growth); + printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %llu, " + "uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz, + bi->uncommitted_idx); + printk(KERN_DEBUG "\tpage_budget %d, inode_budget %d, dent_budget %d\n", + bi->page_budget, bi->inode_budget, bi->dent_budget); + printk(KERN_DEBUG "\tnospace %u, nospace_rp %u\n", + bi->nospace, bi->nospace_rp); + printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n", + c->dark_wm, c->dead_wm, c->max_idx_node_sz); + + if (bi != &c->bi) + /* + * If we are dumping saved budgeting data, do not print + * additional information which is about the current state, not + * the old one which corresponded to the saved budgeting data. + */ + goto out_unlock; + + printk(KERN_DEBUG "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n", + c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt); printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, " "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt), atomic_long_read(&c->dirty_zn_cnt), atomic_long_read(&c->clean_zn_cnt)); - printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n", - c->dark_wm, c->dead_wm, c->max_idx_node_sz); printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n", c->gc_lnum, c->ihead_lnum); + /* If we are in R/O mode, journal heads do not exist */ if (c->jheads) for (i = 0; i < c->jhead_cnt; i++) @@ -648,13 +662,15 @@ void dbg_dump_budg(struct ubifs_info *c) printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state); /* Print budgeting predictions */ - available = ubifs_calc_available(c, c->min_idx_lebs); - outstanding = c->budg_data_growth + c->budg_dd_growth; + available = ubifs_calc_available(c, c->bi.min_idx_lebs); + outstanding = c->bi.data_growth + c->bi.dd_growth; free = ubifs_get_free_space_nolock(c); printk(KERN_DEBUG "Budgeting predictions:\n"); printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n", available, outstanding, free); +out_unlock: spin_unlock(&dbg_lock); + spin_unlock(&c->space_lock); } void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) @@ -729,7 +745,13 @@ void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) if (bud->lnum == lp->lnum) { int head = 0; for (i = 0; i < c->jhead_cnt; i++) { - if (lp->lnum == c->jheads[i].wbuf.lnum) { + /* + * Note, if we are in R/O mode or in the middle + * of mounting/re-mounting, the write-buffers do + * not exist. + */ + if (c->jheads && + lp->lnum == c->jheads[i].wbuf.lnum) { printk(KERN_CONT ", jhead %s", dbg_jhead(i)); head = 1; @@ -976,6 +998,8 @@ void dbg_save_space_info(struct ubifs_info *c) spin_lock(&c->space_lock); memcpy(&d->saved_lst, &c->lst, sizeof(struct ubifs_lp_stats)); + memcpy(&d->saved_bi, &c->bi, sizeof(struct ubifs_budg_info)); + d->saved_idx_gc_cnt = c->idx_gc_cnt; /* * We use a dirty hack here and zero out @c->freeable_cnt, because it @@ -1042,14 +1066,14 @@ int dbg_check_space_info(struct ubifs_info *c) out: ubifs_msg("saved lprops statistics dump"); dbg_dump_lstats(&d->saved_lst); - ubifs_get_lp_stats(c, &lst); - + ubifs_msg("saved budgeting info dump"); + dbg_dump_budg(c, &d->saved_bi); + ubifs_msg("saved idx_gc_cnt %d", d->saved_idx_gc_cnt); ubifs_msg("current lprops statistics dump"); + ubifs_get_lp_stats(c, &lst); dbg_dump_lstats(&lst); - - spin_lock(&c->space_lock); - dbg_dump_budg(c); - spin_unlock(&c->space_lock); + ubifs_msg("current budgeting info dump"); + dbg_dump_budg(c, &c->bi); dump_stack(); return -EINVAL; } @@ -1793,6 +1817,8 @@ static struct fsck_inode *add_inode(struct ubifs_info *c, struct rb_node **p, *parent = NULL; struct fsck_inode *fscki; ino_t inum = key_inum_flash(c, &ino->key); + struct inode *inode; + struct ubifs_inode *ui; p = &fsckd->inodes.rb_node; while (*p) { @@ -1816,19 +1842,46 @@ static struct fsck_inode *add_inode(struct ubifs_info *c, if (!fscki) return ERR_PTR(-ENOMEM); + inode = ilookup(c->vfs_sb, inum); + fscki->inum = inum; - fscki->nlink = le32_to_cpu(ino->nlink); - fscki->size = le64_to_cpu(ino->size); - fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt); - fscki->xattr_sz = le32_to_cpu(ino->xattr_size); - fscki->xattr_nms = le32_to_cpu(ino->xattr_names); - fscki->mode = le32_to_cpu(ino->mode); + /* + * If the inode is present in the VFS inode cache, use it instead of + * the on-flash inode which might be out-of-date. E.g., the size might + * be out-of-date. If we do not do this, the following may happen, for + * example: + * 1. A power cut happens + * 2. We mount the file-system R/O, the replay process fixes up the + * inode size in the VFS cache, but on on-flash. + * 3. 'check_leaf()' fails because it hits a data node beyond inode + * size. + */ + if (!inode) { + fscki->nlink = le32_to_cpu(ino->nlink); + fscki->size = le64_to_cpu(ino->size); + fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt); + fscki->xattr_sz = le32_to_cpu(ino->xattr_size); + fscki->xattr_nms = le32_to_cpu(ino->xattr_names); + fscki->mode = le32_to_cpu(ino->mode); + } else { + ui = ubifs_inode(inode); + fscki->nlink = inode->i_nlink; + fscki->size = inode->i_size; + fscki->xattr_cnt = ui->xattr_cnt; + fscki->xattr_sz = ui->xattr_size; + fscki->xattr_nms = ui->xattr_names; + fscki->mode = inode->i_mode; + iput(inode); + } + if (S_ISDIR(fscki->mode)) { fscki->calc_sz = UBIFS_INO_NODE_SZ; fscki->calc_cnt = 2; } + rb_link_node(&fscki->rb, parent, p); rb_insert_color(&fscki->rb, &fsckd->inodes); + return fscki; } @@ -2421,7 +2474,8 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) hashb = key_block(c, &sb->key); if (hasha > hashb) { - ubifs_err("larger hash %u goes before %u", hasha, hashb); + ubifs_err("larger hash %u goes before %u", + hasha, hashb); goto error_dump; } } @@ -2437,14 +2491,12 @@ error_dump: return 0; } -static int invocation_cnt; - int dbg_force_in_the_gaps(void) { - if (!dbg_force_in_the_gaps_enabled) + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) return 0; - /* Force in-the-gaps every 8th commit */ - return !((invocation_cnt++) & 0x7); + + return !(random32() & 7); } /* Failure mode for recovery testing */ @@ -2632,7 +2684,7 @@ int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, int len, int check) { if (in_failure_mode(desc)) - return -EIO; + return -EROFS; return ubi_leb_read(desc, lnum, buf, offset, len, check); } @@ -2642,7 +2694,7 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, int err, failing; if (in_failure_mode(desc)) - return -EIO; + return -EROFS; failing = do_fail(desc, lnum, 1); if (failing) cut_data(buf, len); @@ -2650,7 +2702,7 @@ int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, if (err) return err; if (failing) - return -EIO; + return -EROFS; return 0; } @@ -2660,12 +2712,12 @@ int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, int err; if (do_fail(desc, lnum, 1)) - return -EIO; + return -EROFS; err = ubi_leb_change(desc, lnum, buf, len, dtype); if (err) return err; if (do_fail(desc, lnum, 1)) - return -EIO; + return -EROFS; return 0; } @@ -2674,12 +2726,12 @@ int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum) int err; if (do_fail(desc, lnum, 0)) - return -EIO; + return -EROFS; err = ubi_leb_erase(desc, lnum); if (err) return err; if (do_fail(desc, lnum, 0)) - return -EIO; + return -EROFS; return 0; } @@ -2688,19 +2740,19 @@ int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum) int err; if (do_fail(desc, lnum, 0)) - return -EIO; + return -EROFS; err = ubi_leb_unmap(desc, lnum); if (err) return err; if (do_fail(desc, lnum, 0)) - return -EIO; + return -EROFS; return 0; } int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum) { if (in_failure_mode(desc)) - return -EIO; + return -EROFS; return ubi_is_mapped(desc, lnum); } @@ -2709,12 +2761,12 @@ int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype) int err; if (do_fail(desc, lnum, 0)) - return -EIO; + return -EROFS; err = ubi_leb_map(desc, lnum, dtype); if (err) return err; if (do_fail(desc, lnum, 0)) - return -EIO; + return -EROFS; return 0; } @@ -2784,7 +2836,7 @@ void dbg_debugfs_exit(void) static int open_debugfs_file(struct inode *inode, struct file *file) { file->private_data = inode->i_private; - return 0; + return nonseekable_open(inode, file); } static ssize_t write_debugfs_file(struct file *file, const char __user *buf, @@ -2795,18 +2847,15 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf, if (file->f_path.dentry == d->dfs_dump_lprops) dbg_dump_lprops(c); - else if (file->f_path.dentry == d->dfs_dump_budg) { - spin_lock(&c->space_lock); - dbg_dump_budg(c); - spin_unlock(&c->space_lock); - } else if (file->f_path.dentry == d->dfs_dump_tnc) { + else if (file->f_path.dentry == d->dfs_dump_budg) + dbg_dump_budg(c, &c->bi); + else if (file->f_path.dentry == d->dfs_dump_tnc) { mutex_lock(&c->tnc_mutex); dbg_dump_tnc(c); mutex_unlock(&c->tnc_mutex); } else return -EINVAL; - *ppos += count; return count; } @@ -2814,7 +2863,7 @@ static const struct file_operations dfs_fops = { .open = open_debugfs_file, .write = write_debugfs_file, .owner = THIS_MODULE, - .llseek = default_llseek, + .llseek = no_llseek, }; /** diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h index e6493cac193..a811ac4a26b 100644 --- a/fs/ubifs/debug.h +++ b/fs/ubifs/debug.h @@ -31,6 +31,8 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c, #ifdef CONFIG_UBIFS_FS_DEBUG +#include <linux/random.h> + /** * ubifs_debug_info - per-FS debugging information. * @old_zroot: old index root - used by 'dbg_check_old_index()' @@ -50,13 +52,15 @@ typedef int (*dbg_znode_callback)(struct ubifs_info *c, * @new_ihead_offs: used by debugging to check @c->ihead_offs * * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()') - * @saved_free: saved free space (used by 'dbg_save_space_info()') + * @saved_bi: saved budgeting information + * @saved_free: saved amount of free space + * @saved_idx_gc_cnt: saved value of @c->idx_gc_cnt * - * dfs_dir_name: name of debugfs directory containing this file-system's files - * dfs_dir: direntry object of the file-system debugfs directory - * dfs_dump_lprops: "dump lprops" debugfs knob - * dfs_dump_budg: "dump budgeting information" debugfs knob - * dfs_dump_tnc: "dump TNC" debugfs knob + * @dfs_dir_name: name of debugfs directory containing this file-system's files + * @dfs_dir: direntry object of the file-system debugfs directory + * @dfs_dump_lprops: "dump lprops" debugfs knob + * @dfs_dump_budg: "dump budgeting information" debugfs knob + * @dfs_dump_tnc: "dump TNC" debugfs knob */ struct ubifs_debug_info { struct ubifs_zbranch old_zroot; @@ -76,7 +80,9 @@ struct ubifs_debug_info { int new_ihead_offs; struct ubifs_lp_stats saved_lst; + struct ubifs_budg_info saved_bi; long long saved_free; + int saved_idx_gc_cnt; char dfs_dir_name[100]; struct dentry *dfs_dir; @@ -101,23 +107,7 @@ struct ubifs_debug_info { } \ } while (0) -#define dbg_dump_stack() do { \ - if (!dbg_failure_mode) \ - dump_stack(); \ -} while (0) - -/* Generic debugging messages */ -#define dbg_msg(fmt, ...) do { \ - spin_lock(&dbg_lock); \ - printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \ - __func__, ##__VA_ARGS__); \ - spin_unlock(&dbg_lock); \ -} while (0) - -#define dbg_do_msg(typ, fmt, ...) do { \ - if (ubifs_msg_flags & typ) \ - dbg_msg(fmt, ##__VA_ARGS__); \ -} while (0) +#define dbg_dump_stack() dump_stack() #define dbg_err(fmt, ...) do { \ spin_lock(&dbg_lock); \ @@ -137,77 +127,40 @@ const char *dbg_key_str1(const struct ubifs_info *c, #define DBGKEY(key) dbg_key_str0(c, (key)) #define DBGKEY1(key) dbg_key_str1(c, (key)) -/* General messages */ -#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__) +#define ubifs_dbg_msg(type, fmt, ...) do { \ + spin_lock(&dbg_lock); \ + pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__); \ + spin_unlock(&dbg_lock); \ +} while (0) +/* Just a debugging messages not related to any specific UBIFS subsystem */ +#define dbg_msg(fmt, ...) ubifs_dbg_msg("msg", fmt, ##__VA_ARGS__) +/* General messages */ +#define dbg_gen(fmt, ...) ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__) /* Additional journal messages */ -#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__) - +#define dbg_jnl(fmt, ...) ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__) /* Additional TNC messages */ -#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__) - +#define dbg_tnc(fmt, ...) ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__) /* Additional lprops messages */ -#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__) - +#define dbg_lp(fmt, ...) ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__) /* Additional LEB find messages */ -#define dbg_find(fmt, ...) dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__) - +#define dbg_find(fmt, ...) ubifs_dbg_msg("find", fmt, ##__VA_ARGS__) /* Additional mount messages */ -#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__) - +#define dbg_mnt(fmt, ...) ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__) /* Additional I/O messages */ -#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__) - +#define dbg_io(fmt, ...) ubifs_dbg_msg("io", fmt, ##__VA_ARGS__) /* Additional commit messages */ -#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__) - +#define dbg_cmt(fmt, ...) ubifs_dbg_msg("cmt", fmt, ##__VA_ARGS__) /* Additional budgeting messages */ -#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__) - +#define dbg_budg(fmt, ...) ubifs_dbg_msg("budg", fmt, ##__VA_ARGS__) /* Additional log messages */ -#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__) - +#define dbg_log(fmt, ...) ubifs_dbg_msg("log", fmt, ##__VA_ARGS__) /* Additional gc messages */ -#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__) - +#define dbg_gc(fmt, ...) ubifs_dbg_msg("gc", fmt, ##__VA_ARGS__) /* Additional scan messages */ -#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__) - +#define dbg_scan(fmt, ...) ubifs_dbg_msg("scan", fmt, ##__VA_ARGS__) /* Additional recovery messages */ -#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) - -/* - * Debugging message type flags. - * - * UBIFS_MSG_GEN: general messages - * UBIFS_MSG_JNL: journal messages - * UBIFS_MSG_MNT: mount messages - * UBIFS_MSG_CMT: commit messages - * UBIFS_MSG_FIND: LEB find messages - * UBIFS_MSG_BUDG: budgeting messages - * UBIFS_MSG_GC: garbage collection messages - * UBIFS_MSG_TNC: TNC messages - * UBIFS_MSG_LP: lprops messages - * UBIFS_MSG_IO: I/O messages - * UBIFS_MSG_LOG: log messages - * UBIFS_MSG_SCAN: scan messages - * UBIFS_MSG_RCVRY: recovery messages - */ -enum { - UBIFS_MSG_GEN = 0x1, - UBIFS_MSG_JNL = 0x2, - UBIFS_MSG_MNT = 0x4, - UBIFS_MSG_CMT = 0x8, - UBIFS_MSG_FIND = 0x10, - UBIFS_MSG_BUDG = 0x20, - UBIFS_MSG_GC = 0x40, - UBIFS_MSG_TNC = 0x80, - UBIFS_MSG_LP = 0x100, - UBIFS_MSG_IO = 0x200, - UBIFS_MSG_LOG = 0x400, - UBIFS_MSG_SCAN = 0x800, - UBIFS_MSG_RCVRY = 0x1000, -}; +#define dbg_rcvry(fmt, ...) ubifs_dbg_msg("rcvry", fmt, ##__VA_ARGS__) /* * Debugging check flags. @@ -233,11 +186,9 @@ enum { /* * Special testing flags. * - * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method * UBIFS_TST_RCVRY: failure mode for recovery testing */ enum { - UBIFS_TST_FORCE_IN_THE_GAPS = 0x2, UBIFS_TST_RCVRY = 0x4, }; @@ -262,7 +213,7 @@ void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum, int offs); void dbg_dump_budget_req(const struct ubifs_budget_req *req); void dbg_dump_lstats(const struct ubifs_lp_stats *lst); -void dbg_dump_budg(struct ubifs_info *c); +void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi); void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); void dbg_dump_lprops(struct ubifs_info *c); void dbg_dump_lpt_info(struct ubifs_info *c); @@ -304,18 +255,16 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head); int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head); /* Force the use of in-the-gaps method for testing */ - -#define dbg_force_in_the_gaps_enabled \ - (ubifs_tst_flags & UBIFS_TST_FORCE_IN_THE_GAPS) - +static inline int dbg_force_in_the_gaps_enabled(void) +{ + return ubifs_chk_flags & UBIFS_CHK_GEN; +} int dbg_force_in_the_gaps(void); /* Failure mode for recovery testing */ - #define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY) #ifndef UBIFS_DBG_PRESERVE_UBI - #define ubi_leb_read dbg_leb_read #define ubi_leb_write dbg_leb_write #define ubi_leb_change dbg_leb_change @@ -323,7 +272,6 @@ int dbg_force_in_the_gaps(void); #define ubi_leb_unmap dbg_leb_unmap #define ubi_is_mapped dbg_is_mapped #define ubi_leb_map dbg_leb_map - #endif int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, @@ -370,33 +318,33 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c); __func__, __LINE__, current->pid); \ } while (0) -#define dbg_err(fmt, ...) do { \ - if (0) \ - ubifs_err(fmt, ##__VA_ARGS__); \ +#define dbg_err(fmt, ...) do { \ + if (0) \ + ubifs_err(fmt, ##__VA_ARGS__); \ } while (0) -#define dbg_msg(fmt, ...) do { \ - if (0) \ - printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", \ - current->pid, __func__, ##__VA_ARGS__); \ +#define ubifs_dbg_msg(fmt, ...) do { \ + if (0) \ + pr_debug(fmt "\n", ##__VA_ARGS__); \ } while (0) #define dbg_dump_stack() #define ubifs_assert_cmt_locked(c) -#define dbg_gen(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_jnl(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_tnc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_lp(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_find(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_mnt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_io(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_cmt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_budg(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_log(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_gc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_scan(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) -#define dbg_rcvry(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_msg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_gen(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_jnl(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_tnc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_lp(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_find(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_mnt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_io(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_cmt(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_budg(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_log(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_gc(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_scan(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) +#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__) #define DBGKEY(key) ((char *)(key)) #define DBGKEY1(key) ((char *)(key)) @@ -420,7 +368,9 @@ static inline void dbg_dump_budget_req(const struct ubifs_budget_req *req) { return; } static inline void dbg_dump_lstats(const struct ubifs_lp_stats *lst) { return; } -static inline void dbg_dump_budg(struct ubifs_info *c) { return; } +static inline void +dbg_dump_budg(struct ubifs_info *c, + const struct ubifs_budg_info *bi) { return; } static inline void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) { return; } static inline void dbg_dump_lprops(struct ubifs_info *c) { return; } @@ -482,8 +432,8 @@ dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) { return 0; } static inline int dbg_force_in_the_gaps(void) { return 0; } -#define dbg_force_in_the_gaps_enabled 0 -#define dbg_failure_mode 0 +#define dbg_force_in_the_gaps_enabled() 0 +#define dbg_failure_mode 0 static inline int dbg_debugfs_init(void) { return 0; } static inline void dbg_debugfs_exit(void) { return; } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 7217d67a80a..ef5abd38f0b 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -603,7 +603,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) ubifs_release_budget(c, &req); else { /* We've deleted something - clean the "no space" flags */ - c->nospace = c->nospace_rp = 0; + c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); } return 0; @@ -693,7 +693,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) ubifs_release_budget(c, &req); else { /* We've deleted something - clean the "no space" flags */ - c->nospace = c->nospace_rp = 0; + c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); } return 0; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index b286db79c68..5e7fccfc4b2 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -212,7 +212,7 @@ static void release_new_page_budget(struct ubifs_info *c) */ static void release_existing_page_budget(struct ubifs_info *c) { - struct ubifs_budget_req req = { .dd_growth = c->page_budget}; + struct ubifs_budget_req req = { .dd_growth = c->bi.page_budget}; ubifs_release_budget(c, &req); } @@ -971,11 +971,11 @@ static int do_writepage(struct page *page, int len) * the page locked, and it locks @ui_mutex. However, write-back does take inode * @i_mutex, which means other VFS operations may be run on this inode at the * same time. And the problematic one is truncation to smaller size, from where - * we have to call 'truncate_setsize()', which first changes @inode->i_size, then - * drops the truncated pages. And while dropping the pages, it takes the page - * lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' with - * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This - * means that @inode->i_size is changed while @ui_mutex is unlocked. + * we have to call 'truncate_setsize()', which first changes @inode->i_size, + * then drops the truncated pages. And while dropping the pages, it takes the + * page lock. This means that 'do_truncation()' cannot call 'truncate_setsize()' + * with @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. + * This means that @inode->i_size is changed while @ui_mutex is unlocked. * * XXX(truncate): with the new truncate sequence this is not true anymore, * and the calls to truncate_setsize can be move around freely. They should @@ -1189,7 +1189,7 @@ out_budg: if (budgeted) ubifs_release_budget(c, &req); else { - c->nospace = c->nospace_rp = 0; + c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); } return err; @@ -1312,7 +1312,11 @@ int ubifs_fsync(struct file *file, int datasync) dbg_gen("syncing inode %lu", inode->i_ino); - if (inode->i_sb->s_flags & MS_RDONLY) + if (c->ro_mount) + /* + * For some really strange reasons VFS does not filter out + * 'fsync()' for R/O mounted file-systems as per 2.6.39. + */ return 0; /* @@ -1432,10 +1436,11 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) } /* - * mmap()d file has taken write protection fault and is being made - * writable. UBIFS must ensure page is budgeted for. + * mmap()d file has taken write protection fault and is being made writable. + * UBIFS must ensure page is budgeted for. */ -static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = vma->vm_file->f_path.dentry->d_inode; @@ -1536,7 +1541,6 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) { int err; - /* 'generic_file_mmap()' takes care of NOMMU case */ err = generic_file_mmap(file, vma); if (err) return err; diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 1d54383d126..2559d174e00 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -252,8 +252,8 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, * But if the index takes fewer LEBs than it is reserved for it, * this function must avoid picking those reserved LEBs. */ - if (c->min_idx_lebs >= c->lst.idx_lebs) { - rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; + if (c->bi.min_idx_lebs >= c->lst.idx_lebs) { + rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs; exclude_index = 1; } spin_unlock(&c->space_lock); @@ -276,7 +276,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, pick_free = 0; } else { spin_lock(&c->space_lock); - exclude_index = (c->min_idx_lebs >= c->lst.idx_lebs); + exclude_index = (c->bi.min_idx_lebs >= c->lst.idx_lebs); spin_unlock(&c->space_lock); } @@ -501,8 +501,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, /* Check if there are enough empty LEBs for commit */ spin_lock(&c->space_lock); - if (c->min_idx_lebs > c->lst.idx_lebs) - rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; + if (c->bi.min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = c->bi.min_idx_lebs - c->lst.idx_lebs; else rsvd_idx_lebs = 0; lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 151f1088282..ded29f6224c 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -100,6 +100,10 @@ static int switch_gc_head(struct ubifs_info *c) if (err) return err; + err = ubifs_wbuf_sync_nolock(wbuf); + if (err) + return err; + err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0); if (err) return err; @@ -118,7 +122,7 @@ static int switch_gc_head(struct ubifs_info *c) * This function compares data nodes @a and @b. Returns %1 if @a has greater * inode or block number, and %-1 otherwise. */ -int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) +static int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) { ino_t inuma, inumb; struct ubifs_info *c = priv; @@ -161,7 +165,8 @@ int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) * first and sorted by length in descending order. Directory entry nodes go * after inode nodes and are sorted in ascending hash valuer order. */ -int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) +static int nondata_nodes_cmp(void *priv, struct list_head *a, + struct list_head *b) { ino_t inuma, inumb; struct ubifs_info *c = priv; @@ -473,6 +478,37 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) ubifs_assert(c->gc_lnum != lnum); ubifs_assert(wbuf->lnum != lnum); + if (lp->free + lp->dirty == c->leb_size) { + /* Special case - a free LEB */ + dbg_gc("LEB %d is free, return it", lp->lnum); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + + if (lp->free != c->leb_size) { + /* + * Write buffers must be sync'd before unmapping + * freeable LEBs, because one of them may contain data + * which obsoletes something in 'lp->pnum'. + */ + err = gc_sync_wbufs(c); + if (err) + return err; + err = ubifs_change_one_lp(c, lp->lnum, c->leb_size, + 0, 0, 0, 0); + if (err) + return err; + } + err = ubifs_leb_unmap(c, lp->lnum); + if (err) + return err; + + if (c->gc_lnum == -1) { + c->gc_lnum = lnum; + return LEB_RETAINED; + } + + return LEB_FREED; + } + /* * We scan the entire LEB even though we only really need to scan up to * (c->leb_size - lp->free). @@ -682,37 +718,6 @@ int ubifs_garbage_collect(struct ubifs_info *c, int anyway) "(min. space %d)", lp.lnum, lp.free, lp.dirty, lp.free + lp.dirty, min_space); - if (lp.free + lp.dirty == c->leb_size) { - /* An empty LEB was returned */ - dbg_gc("LEB %d is free, return it", lp.lnum); - /* - * ubifs_find_dirty_leb() doesn't return freeable index - * LEBs. - */ - ubifs_assert(!(lp.flags & LPROPS_INDEX)); - if (lp.free != c->leb_size) { - /* - * Write buffers must be sync'd before - * unmapping freeable LEBs, because one of them - * may contain data which obsoletes something - * in 'lp.pnum'. - */ - ret = gc_sync_wbufs(c); - if (ret) - goto out; - ret = ubifs_change_one_lp(c, lp.lnum, - c->leb_size, 0, 0, 0, - 0); - if (ret) - goto out; - } - ret = ubifs_leb_unmap(c, lp.lnum); - if (ret) - goto out; - ret = lp.lnum; - break; - } - space_before = c->leb_size - wbuf->offs - wbuf->used; if (wbuf->lnum == -1) space_before = 0; diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index dfd168b7807..166951e0dcd 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -393,7 +393,7 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) ubifs_assert(wbuf->size % c->min_io_size == 0); ubifs_assert(!c->ro_media && !c->ro_mount); if (c->leb_size - wbuf->offs >= c->max_write_size) - ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size )); + ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size)); if (c->ro_error) return -EROFS; @@ -452,8 +452,8 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) * @dtype: data type * * This function targets the write-buffer to logical eraseblock @lnum:@offs. - * The write-buffer is synchronized if it is not empty. Returns zero in case of - * success and a negative error code in case of failure. + * The write-buffer has to be empty. Returns zero in case of success and a + * negative error code in case of failure. */ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, int dtype) @@ -465,13 +465,7 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, ubifs_assert(offs >= 0 && offs <= c->leb_size); ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7)); ubifs_assert(lnum != wbuf->lnum); - - if (wbuf->used > 0) { - int err = ubifs_wbuf_sync_nolock(wbuf); - - if (err) - return err; - } + ubifs_assert(wbuf->used == 0); spin_lock(&wbuf->lock); wbuf->lnum = lnum; @@ -573,7 +567,7 @@ out_timers: int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) { struct ubifs_info *c = wbuf->c; - int err, written, n, aligned_len = ALIGN(len, 8), offs; + int err, written, n, aligned_len = ALIGN(len, 8); dbg_io("%d bytes (%s) to jhead %s wbuf at LEB %d:%d", len, dbg_ntype(((struct ubifs_ch *)buf)->node_type), @@ -588,7 +582,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); ubifs_assert(!c->ro_media && !c->ro_mount); if (c->leb_size - wbuf->offs >= c->max_write_size) - ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size )); + ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size)); if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { err = -ENOSPC; @@ -636,7 +630,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) goto exit; } - offs = wbuf->offs; written = 0; if (wbuf->used) { @@ -653,7 +646,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) if (err) goto out; - offs += wbuf->size; + wbuf->offs += wbuf->size; len -= wbuf->avail; aligned_len -= wbuf->avail; written += wbuf->avail; @@ -672,7 +665,7 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) if (err) goto out; - offs += wbuf->size; + wbuf->offs += wbuf->size; len -= wbuf->size; aligned_len -= wbuf->size; written += wbuf->size; @@ -687,12 +680,13 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) n = aligned_len >> c->max_write_shift; if (n) { n <<= c->max_write_shift; - dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs); - err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n, - wbuf->dtype); + dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, + wbuf->offs); + err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, + wbuf->offs, n, wbuf->dtype); if (err) goto out; - offs += n; + wbuf->offs += n; aligned_len -= n; len -= n; written += n; @@ -707,7 +701,6 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len) */ memcpy(wbuf->buf, buf + written, len); - wbuf->offs = offs; if (c->leb_size - wbuf->offs >= c->max_write_size) wbuf->size = c->max_write_size; else diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index aed25e86422..34b1679e6e3 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -141,14 +141,8 @@ again: * LEB with some empty space. */ lnum = ubifs_find_free_space(c, len, &offs, squeeze); - if (lnum >= 0) { - /* Found an LEB, add it to the journal head */ - err = ubifs_add_bud_to_log(c, jhead, lnum, offs); - if (err) - goto out_return; - /* A new bud was successfully allocated and added to the log */ + if (lnum >= 0) goto out; - } err = lnum; if (err != -ENOSPC) @@ -203,12 +197,23 @@ again: return 0; } - err = ubifs_add_bud_to_log(c, jhead, lnum, 0); - if (err) - goto out_return; offs = 0; out: + /* + * Make sure we synchronize the write-buffer before we add the new bud + * to the log. Otherwise we may have a power cut after the log + * reference node for the last bud (@lnum) is written but before the + * write-buffer data are written to the next-to-last bud + * (@wbuf->lnum). And the effect would be that the recovery would see + * that there is corruption in the next-to-last bud. + */ + err = ubifs_wbuf_sync_nolock(wbuf); + if (err) + goto out_return; + err = ubifs_add_bud_to_log(c, jhead, lnum, offs); + if (err) + goto out_return; err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype); if (err) goto out_unlock; @@ -380,10 +385,8 @@ out: if (err == -ENOSPC) { /* This are some budgeting problems, print useful information */ down_write(&c->commit_sem); - spin_lock(&c->space_lock); dbg_dump_stack(); - dbg_dump_budg(c); - spin_unlock(&c->space_lock); + dbg_dump_budg(c, &c->bi); dbg_dump_lprops(c); cmt_retries = dbg_check_lprops(c); up_write(&c->commit_sem); diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index 40fa780ebea..affea9494ae 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -100,20 +100,6 @@ struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum) } /** - * next_log_lnum - switch to the next log LEB. - * @c: UBIFS file-system description object - * @lnum: current log LEB - */ -static inline int next_log_lnum(const struct ubifs_info *c, int lnum) -{ - lnum += 1; - if (lnum > c->log_last) - lnum = UBIFS_LOG_LNUM; - - return lnum; -} - -/** * empty_log_bytes - calculate amount of empty space in the log. * @c: UBIFS file-system description object */ @@ -257,7 +243,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs) ref->jhead = cpu_to_le32(jhead); if (c->lhead_offs > c->leb_size - c->ref_node_alsz) { - c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); + c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); c->lhead_offs = 0; } @@ -425,7 +411,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) /* Switch to the next log LEB */ if (c->lhead_offs) { - c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); + c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); c->lhead_offs = 0; } @@ -446,7 +432,7 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum) c->lhead_offs += len; if (c->lhead_offs == c->leb_size) { - c->lhead_lnum = next_log_lnum(c, c->lhead_lnum); + c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum); c->lhead_offs = 0; } @@ -533,7 +519,7 @@ int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum) } mutex_lock(&c->log_mutex); for (lnum = old_ltail_lnum; lnum != c->ltail_lnum; - lnum = next_log_lnum(c, lnum)) { + lnum = ubifs_next_log_lnum(c, lnum)) { dbg_log("unmap log LEB %d", lnum); err = ubifs_leb_unmap(c, lnum); if (err) @@ -642,7 +628,7 @@ static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs, err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM); if (err) return err; - *lnum = next_log_lnum(c, *lnum); + *lnum = ubifs_next_log_lnum(c, *lnum); *offs = 0; } memcpy(buf + *offs, node, len); @@ -712,7 +698,7 @@ int ubifs_consolidate_log(struct ubifs_info *c) ubifs_scan_destroy(sleb); if (lnum == c->lhead_lnum) break; - lnum = next_log_lnum(c, lnum); + lnum = ubifs_next_log_lnum(c, lnum); } if (offs) { int sz = ALIGN(offs, c->min_io_size); @@ -732,7 +718,7 @@ int ubifs_consolidate_log(struct ubifs_info *c) /* Unmap remaining LEBs */ lnum = write_lnum; do { - lnum = next_log_lnum(c, lnum); + lnum = ubifs_next_log_lnum(c, lnum); err = ubifs_leb_unmap(c, lnum); if (err) return err; diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 0ee0847f242..667884f4a61 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -1007,21 +1007,11 @@ out: } /** - * struct scan_check_data - data provided to scan callback function. - * @lst: LEB properties statistics - * @err: error code - */ -struct scan_check_data { - struct ubifs_lp_stats lst; - int err; -}; - -/** * scan_check_cb - scan callback. * @c: the UBIFS file-system description object * @lp: LEB properties to scan * @in_tree: whether the LEB properties are in main memory - * @data: information passed to and from the caller of the scan + * @lst: lprops statistics to update * * This function returns a code that indicates whether the scan should continue * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree @@ -1030,11 +1020,10 @@ struct scan_check_data { */ static int scan_check_cb(struct ubifs_info *c, const struct ubifs_lprops *lp, int in_tree, - struct scan_check_data *data) + struct ubifs_lp_stats *lst) { struct ubifs_scan_leb *sleb; struct ubifs_scan_node *snod; - struct ubifs_lp_stats *lst = &data->lst; int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret; void *buf = NULL; @@ -1044,7 +1033,7 @@ static int scan_check_cb(struct ubifs_info *c, if (cat != (lp->flags & LPROPS_CAT_MASK)) { ubifs_err("bad LEB category %d expected %d", (lp->flags & LPROPS_CAT_MASK), cat); - goto out; + return -EINVAL; } } @@ -1078,7 +1067,7 @@ static int scan_check_cb(struct ubifs_info *c, } if (!found) { ubifs_err("bad LPT list (category %d)", cat); - goto out; + return -EINVAL; } } } @@ -1090,45 +1079,40 @@ static int scan_check_cb(struct ubifs_info *c, if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) || lp != heap->arr[lp->hpos]) { ubifs_err("bad LPT heap (category %d)", cat); - goto out; + return -EINVAL; } } buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); - if (!buf) { - ubifs_err("cannot allocate memory to scan LEB %d", lnum); - goto out; + if (!buf) + return -ENOMEM; + + /* + * After an unclean unmount, empty and freeable LEBs + * may contain garbage - do not scan them. + */ + if (lp->free == c->leb_size) { + lst->empty_lebs += 1; + lst->total_free += c->leb_size; + lst->total_dark += ubifs_calc_dark(c, c->leb_size); + return LPT_SCAN_CONTINUE; + } + if (lp->free + lp->dirty == c->leb_size && + !(lp->flags & LPROPS_INDEX)) { + lst->total_free += lp->free; + lst->total_dirty += lp->dirty; + lst->total_dark += ubifs_calc_dark(c, c->leb_size); + return LPT_SCAN_CONTINUE; } sleb = ubifs_scan(c, lnum, 0, buf, 0); if (IS_ERR(sleb)) { - /* - * After an unclean unmount, empty and freeable LEBs - * may contain garbage. - */ - if (lp->free == c->leb_size) { - ubifs_err("scan errors were in empty LEB " - "- continuing checking"); - lst->empty_lebs += 1; - lst->total_free += c->leb_size; - lst->total_dark += ubifs_calc_dark(c, c->leb_size); - ret = LPT_SCAN_CONTINUE; - goto exit; - } - - if (lp->free + lp->dirty == c->leb_size && - !(lp->flags & LPROPS_INDEX)) { - ubifs_err("scan errors were in freeable LEB " - "- continuing checking"); - lst->total_free += lp->free; - lst->total_dirty += lp->dirty; - lst->total_dark += ubifs_calc_dark(c, c->leb_size); - ret = LPT_SCAN_CONTINUE; - goto exit; + ret = PTR_ERR(sleb); + if (ret == -EUCLEAN) { + dbg_dump_lprops(c); + dbg_dump_budg(c, &c->bi); } - data->err = PTR_ERR(sleb); - ret = LPT_SCAN_STOP; - goto exit; + goto out; } is_idx = -1; @@ -1246,10 +1230,8 @@ static int scan_check_cb(struct ubifs_info *c, } ubifs_scan_destroy(sleb); - ret = LPT_SCAN_CONTINUE; -exit: vfree(buf); - return ret; + return LPT_SCAN_CONTINUE; out_print: ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " @@ -1258,10 +1240,10 @@ out_print: dbg_dump_leb(c, lnum); out_destroy: ubifs_scan_destroy(sleb); + ret = -EINVAL; out: vfree(buf); - data->err = -EINVAL; - return LPT_SCAN_STOP; + return ret; } /** @@ -1278,8 +1260,7 @@ out: int dbg_check_lprops(struct ubifs_info *c) { int i, err; - struct scan_check_data data; - struct ubifs_lp_stats *lst = &data.lst; + struct ubifs_lp_stats lst; if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) return 0; @@ -1294,29 +1275,23 @@ int dbg_check_lprops(struct ubifs_info *c) return err; } - memset(lst, 0, sizeof(struct ubifs_lp_stats)); - - data.err = 0; + memset(&lst, 0, sizeof(struct ubifs_lp_stats)); err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1, (ubifs_lpt_scan_callback)scan_check_cb, - &data); + &lst); if (err && err != -ENOSPC) goto out; - if (data.err) { - err = data.err; - goto out; - } - if (lst->empty_lebs != c->lst.empty_lebs || - lst->idx_lebs != c->lst.idx_lebs || - lst->total_free != c->lst.total_free || - lst->total_dirty != c->lst.total_dirty || - lst->total_used != c->lst.total_used) { + if (lst.empty_lebs != c->lst.empty_lebs || + lst.idx_lebs != c->lst.idx_lebs || + lst.total_free != c->lst.total_free || + lst.total_dirty != c->lst.total_dirty || + lst.total_used != c->lst.total_used) { ubifs_err("bad overall accounting"); ubifs_err("calculated: empty_lebs %d, idx_lebs %d, " "total_free %lld, total_dirty %lld, total_used %lld", - lst->empty_lebs, lst->idx_lebs, lst->total_free, - lst->total_dirty, lst->total_used); + lst.empty_lebs, lst.idx_lebs, lst.total_free, + lst.total_dirty, lst.total_used); ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, " "total_free %lld, total_dirty %lld, total_used %lld", c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free, @@ -1325,11 +1300,11 @@ int dbg_check_lprops(struct ubifs_info *c) goto out; } - if (lst->total_dead != c->lst.total_dead || - lst->total_dark != c->lst.total_dark) { + if (lst.total_dead != c->lst.total_dead || + lst.total_dark != c->lst.total_dark) { ubifs_err("bad dead/dark space accounting"); ubifs_err("calculated: total_dead %lld, total_dark %lld", - lst->total_dead, lst->total_dark); + lst.total_dead, lst.total_dark); ubifs_err("read from lprops: total_dead %lld, total_dark %lld", c->lst.total_dead, c->lst.total_dark); err = -EINVAL; diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index 0c9c69bd983..dfcb5748a7d 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -29,6 +29,12 @@ #include <linux/slab.h> #include "ubifs.h" +#ifdef CONFIG_UBIFS_FS_DEBUG +static int dbg_populate_lsave(struct ubifs_info *c); +#else +#define dbg_populate_lsave(c) 0 +#endif + /** * first_dirty_cnode - find first dirty cnode. * @c: UBIFS file-system description object @@ -586,7 +592,7 @@ static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c, if (nnode->nbranch[iip].lnum) break; } - } while (iip >= UBIFS_LPT_FANOUT); + } while (iip >= UBIFS_LPT_FANOUT); /* Go right */ nnode = ubifs_get_nnode(c, nnode, iip); @@ -815,6 +821,10 @@ static void populate_lsave(struct ubifs_info *c) c->lpt_drty_flgs |= LSAVE_DIRTY; ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz); } + + if (dbg_populate_lsave(c)) + return; + list_for_each_entry(lprops, &c->empty_list, list) { c->lsave[cnt++] = lprops->lnum; if (cnt >= c->lsave_cnt) @@ -1994,4 +2004,47 @@ void dbg_dump_lpt_lebs(const struct ubifs_info *c) current->pid); } +/** + * dbg_populate_lsave - debugging version of 'populate_lsave()' + * @c: UBIFS file-system description object + * + * This is a debugging version for 'populate_lsave()' which populates lsave + * with random LEBs instead of useful LEBs, which is good for test coverage. + * Returns zero if lsave has not been populated (this debugging feature is + * disabled) an non-zero if lsave has been populated. + */ +static int dbg_populate_lsave(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + int i; + + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + return 0; + if (random32() & 3) + return 0; + + for (i = 0; i < c->lsave_cnt; i++) + c->lsave[i] = c->main_first; + + list_for_each_entry(lprops, &c->empty_list, list) + c->lsave[random32() % c->lsave_cnt] = lprops->lnum; + list_for_each_entry(lprops, &c->freeable_list, list) + c->lsave[random32() % c->lsave_cnt] = lprops->lnum; + list_for_each_entry(lprops, &c->frdi_idx_list, list) + c->lsave[random32() % c->lsave_cnt] = lprops->lnum; + + heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + for (i = 0; i < heap->cnt; i++) + c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + for (i = 0; i < heap->cnt; i++) + c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; + heap = &c->lpt_heap[LPROPS_FREE - 1]; + for (i = 0; i < heap->cnt; i++) + c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum; + + return 1; +} + #endif /* CONFIG_UBIFS_FS_DEBUG */ diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index 21f47afdacf..278c2382e8c 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -148,7 +148,7 @@ static int validate_master(const struct ubifs_info *c) } main_sz = (long long)c->main_lebs * c->leb_size; - if (c->old_idx_sz & 7 || c->old_idx_sz >= main_sz) { + if (c->bi.old_idx_sz & 7 || c->bi.old_idx_sz >= main_sz) { err = 9; goto out; } @@ -218,7 +218,7 @@ static int validate_master(const struct ubifs_info *c) } if (c->lst.total_dead + c->lst.total_dark + - c->lst.total_used + c->old_idx_sz > main_sz) { + c->lst.total_used + c->bi.old_idx_sz > main_sz) { err = 21; goto out; } @@ -286,7 +286,7 @@ int ubifs_read_master(struct ubifs_info *c) c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum); c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum); c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs); - c->old_idx_sz = le64_to_cpu(c->mst_node->index_size); + c->bi.old_idx_sz = le64_to_cpu(c->mst_node->index_size); c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum); c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs); c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum); @@ -305,7 +305,7 @@ int ubifs_read_master(struct ubifs_info *c) c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead); c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark); - c->calc_idx_sz = c->old_idx_sz; + c->calc_idx_sz = c->bi.old_idx_sz; if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS)) c->no_orphs = 1; diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index c3de04dc952..0b5296a9a4c 100644 --- a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h @@ -340,4 +340,21 @@ static inline void ubifs_release_lprops(struct ubifs_info *c) mutex_unlock(&c->lp_mutex); } +/** + * ubifs_next_log_lnum - switch to the next log LEB. + * @c: UBIFS file-system description object + * @lnum: current log LEB + * + * This helper function returns the log LEB number which goes next after LEB + * 'lnum'. + */ +static inline int ubifs_next_log_lnum(const struct ubifs_info *c, int lnum) +{ + lnum += 1; + if (lnum > c->log_last) + lnum = UBIFS_LOG_LNUM; + + return lnum; +} + #endif /* __UBIFS_MISC_H__ */ diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 09df318e368..bd644bf587a 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -673,7 +673,8 @@ static int kill_orphans(struct ubifs_info *c) sleb = ubifs_scan(c, lnum, 0, c->sbuf, 1); if (IS_ERR(sleb)) { if (PTR_ERR(sleb) == -EUCLEAN) - sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0); + sleb = ubifs_recover_leb(c, lnum, 0, + c->sbuf, 0); if (IS_ERR(sleb)) { err = PTR_ERR(sleb); break; diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 3dbad6fbd1e..731d9e2e7b5 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -564,13 +564,16 @@ static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, } /** - * drop_incomplete_group - drop nodes from an incomplete group. + * drop_last_node - drop the last node or group of nodes. * @sleb: scanned LEB information * @offs: offset of dropped nodes is returned here + * @grouped: non-zero if whole group of nodes have to be dropped * - * This function returns %1 if nodes are dropped and %0 otherwise. + * This is a helper function for 'ubifs_recover_leb()' which drops the last + * node of the scanned LEB or the last group of nodes if @grouped is not zero. + * This function returns %1 if a node was dropped and %0 otherwise. */ -static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs) +static int drop_last_node(struct ubifs_scan_leb *sleb, int *offs, int grouped) { int dropped = 0; @@ -589,6 +592,8 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs) kfree(snod); sleb->nodes_cnt -= 1; dropped = 1; + if (!grouped) + break; } return dropped; } @@ -609,8 +614,7 @@ static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs) struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf, int grouped) { - int err, len = c->leb_size - offs, need_clean = 0, quiet = 1; - int empty_chkd = 0, start = offs; + int ret = 0, err, len = c->leb_size - offs, start = offs, min_io_unit; struct ubifs_scan_leb *sleb; void *buf = sbuf + offs; @@ -620,12 +624,8 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, if (IS_ERR(sleb)) return sleb; - if (sleb->ecc) - need_clean = 1; - + ubifs_assert(len >= 8); while (len >= 8) { - int ret; - dbg_scan("look at LEB %d:%d (%d bytes left)", lnum, offs, len); @@ -635,8 +635,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, * Scan quietly until there is an error from which we cannot * recover */ - ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet); - + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0); if (ret == SCANNED_A_NODE) { /* A valid node, and not a padding node */ struct ubifs_ch *ch = buf; @@ -649,70 +648,32 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, offs += node_len; buf += node_len; len -= node_len; - continue; - } - - if (ret > 0) { + } else if (ret > 0) { /* Padding bytes or a valid padding node */ offs += ret; buf += ret; len -= ret; - continue; - } - - if (ret == SCANNED_EMPTY_SPACE) { - if (!is_empty(buf, len)) { - if (!is_last_write(c, buf, offs)) - break; - clean_buf(c, &buf, lnum, &offs, &len); - need_clean = 1; - } - empty_chkd = 1; + } else if (ret == SCANNED_EMPTY_SPACE || + ret == SCANNED_GARBAGE || + ret == SCANNED_A_BAD_PAD_NODE || + ret == SCANNED_A_CORRUPT_NODE) { + dbg_rcvry("found corruption - %d", ret); break; - } - - if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) - if (is_last_write(c, buf, offs)) { - clean_buf(c, &buf, lnum, &offs, &len); - need_clean = 1; - empty_chkd = 1; - break; - } - - if (ret == SCANNED_A_CORRUPT_NODE) - if (no_more_nodes(c, buf, len, lnum, offs)) { - clean_buf(c, &buf, lnum, &offs, &len); - need_clean = 1; - empty_chkd = 1; - break; - } - - if (quiet) { - /* Redo the last scan but noisily */ - quiet = 0; - continue; - } - - switch (ret) { - case SCANNED_GARBAGE: - dbg_err("garbage"); - goto corrupted; - case SCANNED_A_CORRUPT_NODE: - case SCANNED_A_BAD_PAD_NODE: - dbg_err("bad node"); - goto corrupted; - default: - dbg_err("unknown"); + } else { + dbg_err("unexpected return value %d", ret); err = -EINVAL; goto error; } } - if (!empty_chkd && !is_empty(buf, len)) { - if (is_last_write(c, buf, offs)) { - clean_buf(c, &buf, lnum, &offs, &len); - need_clean = 1; - } else { + if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) { + if (!is_last_write(c, buf, offs)) + goto corrupted_rescan; + } else if (ret == SCANNED_A_CORRUPT_NODE) { + if (!no_more_nodes(c, buf, len, lnum, offs)) + goto corrupted_rescan; + } else if (!is_empty(buf, len)) { + if (!is_last_write(c, buf, offs)) { int corruption = first_non_ff(buf, len); /* @@ -728,29 +689,82 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, } } - /* Drop nodes from incomplete group */ - if (grouped && drop_incomplete_group(sleb, &offs)) { - buf = sbuf + offs; - len = c->leb_size - offs; - clean_buf(c, &buf, lnum, &offs, &len); - need_clean = 1; - } + min_io_unit = round_down(offs, c->min_io_size); + if (grouped) + /* + * If nodes are grouped, always drop the incomplete group at + * the end. + */ + drop_last_node(sleb, &offs, 1); - if (offs % c->min_io_size) { - clean_buf(c, &buf, lnum, &offs, &len); - need_clean = 1; - } + /* + * While we are in the middle of the same min. I/O unit keep dropping + * nodes. So basically, what we want is to make sure that the last min. + * I/O unit where we saw the corruption is dropped completely with all + * the uncorrupted node which may possibly sit there. + * + * In other words, let's name the min. I/O unit where the corruption + * starts B, and the previous min. I/O unit A. The below code tries to + * deal with a situation when half of B contains valid nodes or the end + * of a valid node, and the second half of B contains corrupted data or + * garbage. This means that UBIFS had been writing to B just before the + * power cut happened. I do not know how realistic is this scenario + * that half of the min. I/O unit had been written successfully and the + * other half not, but this is possible in our 'failure mode emulation' + * infrastructure at least. + * + * So what is the problem, why we need to drop those nodes? Whey can't + * we just clean-up the second half of B by putting a padding node + * there? We can, and this works fine with one exception which was + * reproduced with power cut emulation testing and happens extremely + * rarely. The description follows, but it is worth noting that that is + * only about the GC head, so we could do this trick only if the bud + * belongs to the GC head, but it does not seem to be worth an + * additional "if" statement. + * + * So, imagine the file-system is full, we run GC which is moving valid + * nodes from LEB X to LEB Y (obviously, LEB Y is the current GC head + * LEB). The @c->gc_lnum is -1, which means that GC will retain LEB X + * and will try to continue. Imagine that LEB X is currently the + * dirtiest LEB, and the amount of used space in LEB Y is exactly the + * same as amount of free space in LEB X. + * + * And a power cut happens when nodes are moved from LEB X to LEB Y. We + * are here trying to recover LEB Y which is the GC head LEB. We find + * the min. I/O unit B as described above. Then we clean-up LEB Y by + * padding min. I/O unit. And later 'ubifs_rcvry_gc_commit()' function + * fails, because it cannot find a dirty LEB which could be GC'd into + * LEB Y! Even LEB X does not match because the amount of valid nodes + * there does not fit the free space in LEB Y any more! And this is + * because of the padding node which we added to LEB Y. The + * user-visible effect of this which I once observed and analysed is + * that we cannot mount the file-system with -ENOSPC error. + * + * So obviously, to make sure that situation does not happen we should + * free min. I/O unit B in LEB Y completely and the last used min. I/O + * unit in LEB Y should be A. This is basically what the below code + * tries to do. + */ + while (min_io_unit == round_down(offs, c->min_io_size) && + min_io_unit != offs && + drop_last_node(sleb, &offs, grouped)); + + buf = sbuf + offs; + len = c->leb_size - offs; + clean_buf(c, &buf, lnum, &offs, &len); ubifs_end_scan(c, sleb, lnum, offs); - if (need_clean) { - err = fix_unclean_leb(c, sleb, start); - if (err) - goto error; - } + err = fix_unclean_leb(c, sleb, start); + if (err) + goto error; return sleb; +corrupted_rescan: + /* Re-scan the corrupted data with verbose messages */ + dbg_err("corruptio %d", ret); + ubifs_scan_a_node(c, buf, len, lnum, offs, 1); corrupted: ubifs_scanned_corruption(c, lnum, offs, buf); err = -EUCLEAN; @@ -1070,6 +1084,53 @@ int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf) } /** + * grab_empty_leb - grab an empty LEB to use as GC LEB and run commit. + * @c: UBIFS file-system description object + * + * This is a helper function for 'ubifs_rcvry_gc_commit()' which grabs an empty + * LEB to be used as GC LEB (@c->gc_lnum), and then runs the commit. Returns + * zero in case of success and a negative error code in case of failure. + */ +static int grab_empty_leb(struct ubifs_info *c) +{ + int lnum, err; + + /* + * Note, it is very important to first search for an empty LEB and then + * run the commit, not vice-versa. The reason is that there might be + * only one empty LEB at the moment, the one which has been the + * @c->gc_lnum just before the power cut happened. During the regular + * UBIFS operation (not now) @c->gc_lnum is marked as "taken", so no + * one but GC can grab it. But at this moment this single empty LEB is + * not marked as taken, so if we run commit - what happens? Right, the + * commit will grab it and write the index there. Remember that the + * index always expands as long as there is free space, and it only + * starts consolidating when we run out of space. + * + * IOW, if we run commit now, we might not be able to find a free LEB + * after this. + */ + lnum = ubifs_find_free_leb_for_idx(c); + if (lnum < 0) { + dbg_err("could not find an empty LEB"); + dbg_dump_lprops(c); + dbg_dump_budg(c, &c->bi); + return lnum; + } + + /* Reset the index flag */ + err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_INDEX, 0); + if (err) + return err; + + c->gc_lnum = lnum; + dbg_rcvry("found empty LEB %d, run commit", lnum); + + return ubifs_run_commit(c); +} + +/** * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit. * @c: UBIFS file-system description object * @@ -1091,71 +1152,26 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c) { struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; struct ubifs_lprops lp; - int lnum, err; + int err; + + dbg_rcvry("GC head LEB %d, offs %d", wbuf->lnum, wbuf->offs); c->gc_lnum = -1; - if (wbuf->lnum == -1) { - dbg_rcvry("no GC head LEB"); - goto find_free; - } - /* - * See whether the used space in the dirtiest LEB fits in the GC head - * LEB. - */ - if (wbuf->offs == c->leb_size) { - dbg_rcvry("no room in GC head LEB"); - goto find_free; - } + if (wbuf->lnum == -1 || wbuf->offs == c->leb_size) + return grab_empty_leb(c); + err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2); if (err) { - /* - * There are no dirty or empty LEBs subject to here being - * enough for the index. Try to use - * 'ubifs_find_free_leb_for_idx()', which will return any empty - * LEBs (ignoring index requirements). If the index then - * doesn't have enough LEBs the recovery commit will fail - - * which is the same result anyway i.e. recovery fails. So - * there is no problem ignoring index requirements and just - * grabbing a free LEB since we have already established there - * is not a dirty LEB we could have used instead. - */ - if (err == -ENOSPC) { - dbg_rcvry("could not find a dirty LEB"); - goto find_free; - } - return err; - } - ubifs_assert(!(lp.flags & LPROPS_INDEX)); - lnum = lp.lnum; - if (lp.free + lp.dirty == c->leb_size) { - /* An empty LEB was returned */ - if (lp.free != c->leb_size) { - err = ubifs_change_one_lp(c, lnum, c->leb_size, - 0, 0, 0, 0); - if (err) - return err; - } - err = ubifs_leb_unmap(c, lnum); - if (err) + if (err != -ENOSPC) return err; - c->gc_lnum = lnum; - dbg_rcvry("allocated LEB %d for GC", lnum); - /* Run the commit */ - dbg_rcvry("committing"); - return ubifs_run_commit(c); - } - /* - * There was no empty LEB so the used space in the dirtiest LEB must fit - * in the GC head LEB. - */ - if (lp.free + lp.dirty < wbuf->offs) { - dbg_rcvry("LEB %d doesn't fit in GC head LEB %d:%d", - lnum, wbuf->lnum, wbuf->offs); - err = ubifs_return_leb(c, lnum); - if (err) - return err; - goto find_free; + + dbg_rcvry("could not find a dirty LEB"); + return grab_empty_leb(c); } + + ubifs_assert(!(lp.flags & LPROPS_INDEX)); + ubifs_assert(lp.free + lp.dirty >= wbuf->offs); + /* * We run the commit before garbage collection otherwise subsequent * mounts will see the GC and orphan deletion in a different order. @@ -1164,11 +1180,8 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c) err = ubifs_run_commit(c); if (err) return err; - /* - * The data in the dirtiest LEB fits in the GC head LEB, so do the GC - * - use locking to keep 'ubifs_assert()' happy. - */ - dbg_rcvry("GC'ing LEB %d", lnum); + + dbg_rcvry("GC'ing LEB %d", lp.lnum); mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); err = ubifs_garbage_collect_leb(c, &lp); if (err >= 0) { @@ -1184,37 +1197,17 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c) err = -EINVAL; return err; } - if (err != LEB_RETAINED) { - dbg_err("GC returned %d", err); + + ubifs_assert(err == LEB_RETAINED); + if (err != LEB_RETAINED) return -EINVAL; - } + err = ubifs_leb_unmap(c, c->gc_lnum); if (err) return err; - dbg_rcvry("allocated LEB %d for GC", lnum); - return 0; -find_free: - /* - * There is no GC head LEB or the free space in the GC head LEB is too - * small, or there are not dirty LEBs. Allocate gc_lnum by calling - * 'ubifs_find_free_leb_for_idx()' so GC is not run. - */ - lnum = ubifs_find_free_leb_for_idx(c); - if (lnum < 0) { - dbg_err("could not find an empty LEB"); - return lnum; - } - /* And reset the index flag */ - err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, - LPROPS_INDEX, 0); - if (err) - return err; - c->gc_lnum = lnum; - dbg_rcvry("allocated LEB %d for GC", lnum); - /* Run the commit */ - dbg_rcvry("committing"); - return ubifs_run_commit(c); + dbg_rcvry("allocated LEB %d for GC", lp.lnum); + return 0; } /** @@ -1456,7 +1449,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e) err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); if (err) goto out; - dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ", + dbg_rcvry("inode %lu at %d:%d size %lld -> %lld", (unsigned long)e->inum, lnum, offs, i_size, e->d_size); return 0; @@ -1505,20 +1498,27 @@ int ubifs_recover_size(struct ubifs_info *c) e->i_size = le64_to_cpu(ino->size); } } + if (e->exists && e->i_size < e->d_size) { - if (!e->inode && c->ro_mount) { + if (c->ro_mount) { /* Fix the inode size and pin it in memory */ struct inode *inode; + struct ubifs_inode *ui; + + ubifs_assert(!e->inode); inode = ubifs_iget(c->vfs_sb, e->inum); if (IS_ERR(inode)) return PTR_ERR(inode); + + ui = ubifs_inode(inode); if (inode->i_size < e->d_size) { dbg_rcvry("ino %lu size %lld -> %lld", (unsigned long)e->inum, - e->d_size, inode->i_size); + inode->i_size, e->d_size); inode->i_size = e->d_size; - ubifs_inode(inode)->ui_size = e->d_size; + ui->ui_size = e->d_size; + ui->synced_i_size = e->d_size; e->inode = inode; this = rb_next(this); continue; @@ -1533,9 +1533,11 @@ int ubifs_recover_size(struct ubifs_info *c) iput(e->inode); } } + this = rb_next(this); rb_erase(&e->rb, &c->size_tree); kfree(e); } + return 0; } diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index d3d6d365bfc..6617280d167 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -33,44 +33,32 @@ */ #include "ubifs.h" - -/* - * Replay flags. - * - * REPLAY_DELETION: node was deleted - * REPLAY_REF: node is a reference node - */ -enum { - REPLAY_DELETION = 1, - REPLAY_REF = 2, -}; +#include <linux/list_sort.h> /** - * struct replay_entry - replay tree entry. + * struct replay_entry - replay list entry. * @lnum: logical eraseblock number of the node * @offs: node offset * @len: node length + * @deletion: non-zero if this entry corresponds to a node deletion * @sqnum: node sequence number - * @flags: replay flags - * @rb: links the replay tree + * @list: links the replay list * @key: node key * @nm: directory entry name * @old_size: truncation old size * @new_size: truncation new size - * @free: amount of free space in a bud - * @dirty: amount of dirty space in a bud from padding and deletion nodes - * @jhead: journal head number of the bud * - * UBIFS journal replay must compare node sequence numbers, which means it must - * build a tree of node information to insert into the TNC. + * The replay process first scans all buds and builds the replay list, then + * sorts the replay list in nodes sequence number order, and then inserts all + * the replay entries to the TNC. */ struct replay_entry { int lnum; int offs; int len; + unsigned int deletion:1; unsigned long long sqnum; - int flags; - struct rb_node rb; + struct list_head list; union ubifs_key key; union { struct qstr nm; @@ -78,11 +66,6 @@ struct replay_entry { loff_t old_size; loff_t new_size; }; - struct { - int free; - int dirty; - int jhead; - }; }; }; @@ -90,57 +73,64 @@ struct replay_entry { * struct bud_entry - entry in the list of buds to replay. * @list: next bud in the list * @bud: bud description object - * @free: free bytes in the bud * @sqnum: reference node sequence number + * @free: free bytes in the bud + * @dirty: dirty bytes in the bud */ struct bud_entry { struct list_head list; struct ubifs_bud *bud; - int free; unsigned long long sqnum; + int free; + int dirty; }; /** * set_bud_lprops - set free and dirty space used by a bud. * @c: UBIFS file-system description object - * @r: replay entry of bud + * @b: bud entry which describes the bud + * + * This function makes sure the LEB properties of bud @b are set correctly + * after the replay. Returns zero in case of success and a negative error code + * in case of failure. */ -static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r) +static int set_bud_lprops(struct ubifs_info *c, struct bud_entry *b) { const struct ubifs_lprops *lp; int err = 0, dirty; ubifs_get_lprops(c); - lp = ubifs_lpt_lookup_dirty(c, r->lnum); + lp = ubifs_lpt_lookup_dirty(c, b->bud->lnum); if (IS_ERR(lp)) { err = PTR_ERR(lp); goto out; } dirty = lp->dirty; - if (r->offs == 0 && (lp->free != c->leb_size || lp->dirty != 0)) { + if (b->bud->start == 0 && (lp->free != c->leb_size || lp->dirty != 0)) { /* * The LEB was added to the journal with a starting offset of * zero which means the LEB must have been empty. The LEB - * property values should be lp->free == c->leb_size and - * lp->dirty == 0, but that is not the case. The reason is that - * the LEB was garbage collected. The garbage collector resets - * the free and dirty space without recording it anywhere except - * lprops, so if there is not a commit then lprops does not have - * that information next time the file system is mounted. + * property values should be @lp->free == @c->leb_size and + * @lp->dirty == 0, but that is not the case. The reason is that + * the LEB had been garbage collected before it became the bud, + * and there was not commit inbetween. The garbage collector + * resets the free and dirty space without recording it + * anywhere except lprops, so if there was no commit then + * lprops does not have that information. * * We do not need to adjust free space because the scan has told * us the exact value which is recorded in the replay entry as - * r->free. + * @b->free. * * However we do need to subtract from the dirty space the * amount of space that the garbage collector reclaimed, which * is the whole LEB minus the amount of space that was free. */ - dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum, + dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum, lp->free, lp->dirty); - dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum, + dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", b->bud->lnum, lp->free, lp->dirty); dirty -= c->leb_size - lp->free; /* @@ -152,10 +142,10 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r) */ if (dirty != 0) dbg_msg("LEB %d lp: %d free %d dirty " - "replay: %d free %d dirty", r->lnum, lp->free, - lp->dirty, r->free, r->dirty); + "replay: %d free %d dirty", b->bud->lnum, + lp->free, lp->dirty, b->free, b->dirty); } - lp = ubifs_change_lp(c, lp, r->free, dirty + r->dirty, + lp = ubifs_change_lp(c, lp, b->free, dirty + b->dirty, lp->flags | LPROPS_TAKEN, 0); if (IS_ERR(lp)) { err = PTR_ERR(lp); @@ -163,8 +153,9 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r) } /* Make sure the journal head points to the latest bud */ - err = ubifs_wbuf_seek_nolock(&c->jheads[r->jhead].wbuf, r->lnum, - c->leb_size - r->free, UBI_SHORTTERM); + err = ubifs_wbuf_seek_nolock(&c->jheads[b->bud->jhead].wbuf, + b->bud->lnum, c->leb_size - b->free, + UBI_SHORTTERM); out: ubifs_release_lprops(c); @@ -172,6 +163,27 @@ out: } /** + * set_buds_lprops - set free and dirty space for all replayed buds. + * @c: UBIFS file-system description object + * + * This function sets LEB properties for all replayed buds. Returns zero in + * case of success and a negative error code in case of failure. + */ +static int set_buds_lprops(struct ubifs_info *c) +{ + struct bud_entry *b; + int err; + + list_for_each_entry(b, &c->replay_buds, list) { + err = set_bud_lprops(c, b); + if (err) + return err; + } + + return 0; +} + +/** * trun_remove_range - apply a replay entry for a truncation to the TNC. * @c: UBIFS file-system description object * @r: replay entry of truncation @@ -207,24 +219,22 @@ static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r) */ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) { - int err, deletion = ((r->flags & REPLAY_DELETION) != 0); + int err; - dbg_mnt("LEB %d:%d len %d flgs %d sqnum %llu %s", r->lnum, - r->offs, r->len, r->flags, r->sqnum, DBGKEY(&r->key)); + dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum, + r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key)); /* Set c->replay_sqnum to help deal with dangling branches. */ c->replay_sqnum = r->sqnum; - if (r->flags & REPLAY_REF) - err = set_bud_lprops(c, r); - else if (is_hash_key(c, &r->key)) { - if (deletion) + if (is_hash_key(c, &r->key)) { + if (r->deletion) err = ubifs_tnc_remove_nm(c, &r->key, &r->nm); else err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs, r->len, &r->nm); } else { - if (deletion) + if (r->deletion) switch (key_type(c, &r->key)) { case UBIFS_INO_KEY: { @@ -247,7 +257,7 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) return err; if (c->need_recovery) - err = ubifs_recover_size_accum(c, &r->key, deletion, + err = ubifs_recover_size_accum(c, &r->key, r->deletion, r->new_size); } @@ -255,68 +265,77 @@ static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r) } /** - * destroy_replay_tree - destroy the replay. - * @c: UBIFS file-system description object + * replay_entries_cmp - compare 2 replay entries. + * @priv: UBIFS file-system description object + * @a: first replay entry + * @a: second replay entry * - * Destroy the replay tree. + * This is a comparios function for 'list_sort()' which compares 2 replay + * entries @a and @b by comparing their sequence numer. Returns %1 if @a has + * greater sequence number and %-1 otherwise. */ -static void destroy_replay_tree(struct ubifs_info *c) +static int replay_entries_cmp(void *priv, struct list_head *a, + struct list_head *b) { - struct rb_node *this = c->replay_tree.rb_node; - struct replay_entry *r; - - while (this) { - if (this->rb_left) { - this = this->rb_left; - continue; - } else if (this->rb_right) { - this = this->rb_right; - continue; - } - r = rb_entry(this, struct replay_entry, rb); - this = rb_parent(this); - if (this) { - if (this->rb_left == &r->rb) - this->rb_left = NULL; - else - this->rb_right = NULL; - } - if (is_hash_key(c, &r->key)) - kfree(r->nm.name); - kfree(r); - } - c->replay_tree = RB_ROOT; + struct replay_entry *ra, *rb; + + cond_resched(); + if (a == b) + return 0; + + ra = list_entry(a, struct replay_entry, list); + rb = list_entry(b, struct replay_entry, list); + ubifs_assert(ra->sqnum != rb->sqnum); + if (ra->sqnum > rb->sqnum) + return 1; + return -1; } /** - * apply_replay_tree - apply the replay tree to the TNC. + * apply_replay_list - apply the replay list to the TNC. * @c: UBIFS file-system description object * - * Apply the replay tree. - * Returns zero in case of success and a negative error code in case of - * failure. + * Apply all entries in the replay list to the TNC. Returns zero in case of + * success and a negative error code in case of failure. */ -static int apply_replay_tree(struct ubifs_info *c) +static int apply_replay_list(struct ubifs_info *c) { - struct rb_node *this = rb_first(&c->replay_tree); + struct replay_entry *r; + int err; - while (this) { - struct replay_entry *r; - int err; + list_sort(c, &c->replay_list, &replay_entries_cmp); + list_for_each_entry(r, &c->replay_list, list) { cond_resched(); - r = rb_entry(this, struct replay_entry, rb); err = apply_replay_entry(c, r); if (err) return err; - this = rb_next(this); } + return 0; } /** - * insert_node - insert a node to the replay tree. + * destroy_replay_list - destroy the replay. + * @c: UBIFS file-system description object + * + * Destroy the replay list. + */ +static void destroy_replay_list(struct ubifs_info *c) +{ + struct replay_entry *r, *tmp; + + list_for_each_entry_safe(r, tmp, &c->replay_list, list) { + if (is_hash_key(c, &r->key)) + kfree(r->nm.name); + list_del(&r->list); + kfree(r); + } +} + +/** + * insert_node - insert a node to the replay list * @c: UBIFS file-system description object * @lnum: node logical eraseblock number * @offs: node offset @@ -328,39 +347,25 @@ static int apply_replay_tree(struct ubifs_info *c) * @old_size: truncation old size * @new_size: truncation new size * - * This function inserts a scanned non-direntry node to the replay tree. The - * replay tree is an RB-tree containing @struct replay_entry elements which are - * indexed by the sequence number. The replay tree is applied at the very end - * of the replay process. Since the tree is sorted in sequence number order, - * the older modifications are applied first. This function returns zero in - * case of success and a negative error code in case of failure. + * This function inserts a scanned non-direntry node to the replay list. The + * replay list contains @struct replay_entry elements, and we sort this list in + * sequence number order before applying it. The replay list is applied at the + * very end of the replay process. Since the list is sorted in sequence number + * order, the older modifications are applied first. This function returns zero + * in case of success and a negative error code in case of failure. */ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, union ubifs_key *key, unsigned long long sqnum, int deletion, int *used, loff_t old_size, loff_t new_size) { - struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; struct replay_entry *r; + dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); + if (key_inum(c, key) >= c->highest_inum) c->highest_inum = key_inum(c, key); - dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); - while (*p) { - parent = *p; - r = rb_entry(parent, struct replay_entry, rb); - if (sqnum < r->sqnum) { - p = &(*p)->rb_left; - continue; - } else if (sqnum > r->sqnum) { - p = &(*p)->rb_right; - continue; - } - ubifs_err("duplicate sqnum in replay"); - return -EINVAL; - } - r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); if (!r) return -ENOMEM; @@ -370,19 +375,18 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, r->lnum = lnum; r->offs = offs; r->len = len; + r->deletion = !!deletion; r->sqnum = sqnum; - r->flags = (deletion ? REPLAY_DELETION : 0); + key_copy(c, key, &r->key); r->old_size = old_size; r->new_size = new_size; - key_copy(c, key, &r->key); - rb_link_node(&r->rb, parent, p); - rb_insert_color(&r->rb, &c->replay_tree); + list_add_tail(&r->list, &c->replay_list); return 0; } /** - * insert_dent - insert a directory entry node into the replay tree. + * insert_dent - insert a directory entry node into the replay list. * @c: UBIFS file-system description object * @lnum: node logical eraseblock number * @offs: node offset @@ -394,43 +398,25 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len, * @deletion: non-zero if this is a deletion * @used: number of bytes in use in a LEB * - * This function inserts a scanned directory entry node to the replay tree. - * Returns zero in case of success and a negative error code in case of - * failure. - * - * This function is also used for extended attribute entries because they are - * implemented as directory entry nodes. + * This function inserts a scanned directory entry node or an extended + * attribute entry to the replay list. Returns zero in case of success and a + * negative error code in case of failure. */ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, union ubifs_key *key, const char *name, int nlen, unsigned long long sqnum, int deletion, int *used) { - struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; struct replay_entry *r; char *nbuf; + dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); if (key_inum(c, key) >= c->highest_inum) c->highest_inum = key_inum(c, key); - dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key)); - while (*p) { - parent = *p; - r = rb_entry(parent, struct replay_entry, rb); - if (sqnum < r->sqnum) { - p = &(*p)->rb_left; - continue; - } - if (sqnum > r->sqnum) { - p = &(*p)->rb_right; - continue; - } - ubifs_err("duplicate sqnum in replay"); - return -EINVAL; - } - r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); if (!r) return -ENOMEM; + nbuf = kmalloc(nlen + 1, GFP_KERNEL); if (!nbuf) { kfree(r); @@ -442,17 +428,15 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, r->lnum = lnum; r->offs = offs; r->len = len; + r->deletion = !!deletion; r->sqnum = sqnum; + key_copy(c, key, &r->key); r->nm.len = nlen; memcpy(nbuf, name, nlen); nbuf[nlen] = '\0'; r->nm.name = nbuf; - r->flags = (deletion ? REPLAY_DELETION : 0); - key_copy(c, key, &r->key); - ubifs_assert(!*p); - rb_link_node(&r->rb, parent, p); - rb_insert_color(&r->rb, &c->replay_tree); + list_add_tail(&r->list, &c->replay_list); return 0; } @@ -489,29 +473,92 @@ int ubifs_validate_entry(struct ubifs_info *c, } /** + * is_last_bud - check if the bud is the last in the journal head. + * @c: UBIFS file-system description object + * @bud: bud description object + * + * This function checks if bud @bud is the last bud in its journal head. This + * information is then used by 'replay_bud()' to decide whether the bud can + * have corruptions or not. Indeed, only last buds can be corrupted by power + * cuts. Returns %1 if this is the last bud, and %0 if not. + */ +static int is_last_bud(struct ubifs_info *c, struct ubifs_bud *bud) +{ + struct ubifs_jhead *jh = &c->jheads[bud->jhead]; + struct ubifs_bud *next; + uint32_t data; + int err; + + if (list_is_last(&bud->list, &jh->buds_list)) + return 1; + + /* + * The following is a quirk to make sure we work correctly with UBIFS + * images used with older UBIFS. + * + * Normally, the last bud will be the last in the journal head's list + * of bud. However, there is one exception if the UBIFS image belongs + * to older UBIFS. This is fairly unlikely: one would need to use old + * UBIFS, then have a power cut exactly at the right point, and then + * try to mount this image with new UBIFS. + * + * The exception is: it is possible to have 2 buds A and B, A goes + * before B, and B is the last, bud B is contains no data, and bud A is + * corrupted at the end. The reason is that in older versions when the + * journal code switched the next bud (from A to B), it first added a + * log reference node for the new bud (B), and only after this it + * synchronized the write-buffer of current bud (A). But later this was + * changed and UBIFS started to always synchronize the write-buffer of + * the bud (A) before writing the log reference for the new bud (B). + * + * But because older UBIFS always synchronized A's write-buffer before + * writing to B, we can recognize this exceptional situation but + * checking the contents of bud B - if it is empty, then A can be + * treated as the last and we can recover it. + * + * TODO: remove this piece of code in a couple of years (today it is + * 16.05.2011). + */ + next = list_entry(bud->list.next, struct ubifs_bud, list); + if (!list_is_last(&next->list, &jh->buds_list)) + return 0; + + err = ubi_read(c->ubi, next->lnum, (char *)&data, + next->start, 4); + if (err) + return 0; + + return data == 0xFFFFFFFF; +} + +/** * replay_bud - replay a bud logical eraseblock. * @c: UBIFS file-system description object - * @lnum: bud logical eraseblock number to replay - * @offs: bud start offset - * @jhead: journal head to which this bud belongs - * @free: amount of free space in the bud is returned here - * @dirty: amount of dirty space from padding and deletion nodes is returned - * here + * @b: bud entry which describes the bud * - * This function returns zero in case of success and a negative error code in - * case of failure. + * This function replays bud @bud, recovers it if needed, and adds all nodes + * from this bud to the replay list. Returns zero in case of success and a + * negative error code in case of failure. */ -static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, - int *free, int *dirty) +static int replay_bud(struct ubifs_info *c, struct bud_entry *b) { - int err = 0, used = 0; + int is_last = is_last_bud(c, b->bud); + int err = 0, used = 0, lnum = b->bud->lnum, offs = b->bud->start; struct ubifs_scan_leb *sleb; struct ubifs_scan_node *snod; - struct ubifs_bud *bud; - dbg_mnt("replay bud LEB %d, head %d", lnum, jhead); - if (c->need_recovery) - sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD); + dbg_mnt("replay bud LEB %d, head %d, offs %d, is_last %d", + lnum, b->bud->jhead, offs, is_last); + + if (c->need_recovery && is_last) + /* + * Recover only last LEBs in the journal heads, because power + * cuts may cause corruptions only in these LEBs, because only + * these LEBs could possibly be written to at the power cut + * time. + */ + sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, + b->bud->jhead != GCHD); else sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0); if (IS_ERR(sleb)) @@ -627,15 +674,13 @@ static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, goto out; } - bud = ubifs_search_bud(c, lnum); - if (!bud) - BUG(); - + ubifs_assert(ubifs_search_bud(c, lnum)); ubifs_assert(sleb->endpt - offs >= used); ubifs_assert(sleb->endpt % c->min_io_size == 0); - *dirty = sleb->endpt - offs - used; - *free = c->leb_size - sleb->endpt; + b->dirty = sleb->endpt - offs - used; + b->free = c->leb_size - sleb->endpt; + dbg_mnt("bud LEB %d replied: dirty %d, free %d", lnum, b->dirty, b->free); out: ubifs_scan_destroy(sleb); @@ -649,58 +694,6 @@ out_dump: } /** - * insert_ref_node - insert a reference node to the replay tree. - * @c: UBIFS file-system description object - * @lnum: node logical eraseblock number - * @offs: node offset - * @sqnum: sequence number - * @free: amount of free space in bud - * @dirty: amount of dirty space from padding and deletion nodes - * @jhead: journal head number for the bud - * - * This function inserts a reference node to the replay tree and returns zero - * in case of success or a negative error code in case of failure. - */ -static int insert_ref_node(struct ubifs_info *c, int lnum, int offs, - unsigned long long sqnum, int free, int dirty, - int jhead) -{ - struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL; - struct replay_entry *r; - - dbg_mnt("add ref LEB %d:%d", lnum, offs); - while (*p) { - parent = *p; - r = rb_entry(parent, struct replay_entry, rb); - if (sqnum < r->sqnum) { - p = &(*p)->rb_left; - continue; - } else if (sqnum > r->sqnum) { - p = &(*p)->rb_right; - continue; - } - ubifs_err("duplicate sqnum in replay tree"); - return -EINVAL; - } - - r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL); - if (!r) - return -ENOMEM; - - r->lnum = lnum; - r->offs = offs; - r->sqnum = sqnum; - r->flags = REPLAY_REF; - r->free = free; - r->dirty = dirty; - r->jhead = jhead; - - rb_link_node(&r->rb, parent, p); - rb_insert_color(&r->rb, &c->replay_tree); - return 0; -} - -/** * replay_buds - replay all buds. * @c: UBIFS file-system description object * @@ -710,17 +703,16 @@ static int insert_ref_node(struct ubifs_info *c, int lnum, int offs, static int replay_buds(struct ubifs_info *c) { struct bud_entry *b; - int err, uninitialized_var(free), uninitialized_var(dirty); + int err; + unsigned long long prev_sqnum = 0; list_for_each_entry(b, &c->replay_buds, list) { - err = replay_bud(c, b->bud->lnum, b->bud->start, b->bud->jhead, - &free, &dirty); - if (err) - return err; - err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum, - free, dirty, b->bud->jhead); + err = replay_bud(c, b); if (err) return err; + + ubifs_assert(b->sqnum > prev_sqnum); + prev_sqnum = b->sqnum; } return 0; @@ -1060,25 +1052,29 @@ int ubifs_replay_journal(struct ubifs_info *c) if (err) goto out; - err = apply_replay_tree(c); + err = apply_replay_list(c); + if (err) + goto out; + + err = set_buds_lprops(c); if (err) goto out; /* - * UBIFS budgeting calculations use @c->budg_uncommitted_idx variable - * to roughly estimate index growth. Things like @c->min_idx_lebs + * UBIFS budgeting calculations use @c->bi.uncommitted_idx variable + * to roughly estimate index growth. Things like @c->bi.min_idx_lebs * depend on it. This means we have to initialize it to make sure * budgeting works properly. */ - c->budg_uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt); - c->budg_uncommitted_idx *= c->max_idx_node_sz; + c->bi.uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt); + c->bi.uncommitted_idx *= c->max_idx_node_sz; ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery); dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, " "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum, (unsigned long)c->highest_inum); out: - destroy_replay_tree(c); + destroy_replay_list(c); destroy_bud_list(c); c->replaying = 0; return err; diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index bf31b4729e5..c606f010e8d 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -475,7 +475,8 @@ failed: * @c: UBIFS file-system description object * * This function returns a pointer to the superblock node or a negative error - * code. + * code. Note, the user of this function is responsible of kfree()'ing the + * returned superblock buffer. */ struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c) { @@ -616,6 +617,7 @@ int ubifs_read_superblock(struct ubifs_info *c) c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran); memcpy(&c->uuid, &sup->uuid, 16); c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); + c->space_fixup = !!(sup_flags & UBIFS_FLG_SPACE_FIXUP); /* Automatically increase file system size to the maximum size */ c->old_leb_cnt = c->leb_cnt; @@ -650,3 +652,152 @@ out: kfree(sup); return err; } + +/** + * fixup_leb - fixup/unmap an LEB containing free space. + * @c: UBIFS file-system description object + * @lnum: the LEB number to fix up + * @len: number of used bytes in LEB (starting at offset 0) + * + * This function reads the contents of the given LEB number @lnum, then fixes + * it up, so that empty min. I/O units in the end of LEB are actually erased on + * flash (rather than being just all-0xff real data). If the LEB is completely + * empty, it is simply unmapped. + */ +static int fixup_leb(struct ubifs_info *c, int lnum, int len) +{ + int err; + + ubifs_assert(len >= 0); + ubifs_assert(len % c->min_io_size == 0); + ubifs_assert(len < c->leb_size); + + if (len == 0) { + dbg_mnt("unmap empty LEB %d", lnum); + return ubi_leb_unmap(c->ubi, lnum); + } + + dbg_mnt("fixup LEB %d, data len %d", lnum, len); + err = ubi_read(c->ubi, lnum, c->sbuf, 0, len); + if (err) + return err; + + return ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); +} + +/** + * fixup_free_space - find & remap all LEBs containing free space. + * @c: UBIFS file-system description object + * + * This function walks through all LEBs in the filesystem and fiexes up those + * containing free/empty space. + */ +static int fixup_free_space(struct ubifs_info *c) +{ + int lnum, err = 0; + struct ubifs_lprops *lprops; + + ubifs_get_lprops(c); + + /* Fixup LEBs in the master area */ + for (lnum = UBIFS_MST_LNUM; lnum < UBIFS_LOG_LNUM; lnum++) { + err = fixup_leb(c, lnum, c->mst_offs + c->mst_node_alsz); + if (err) + goto out; + } + + /* Unmap unused log LEBs */ + lnum = ubifs_next_log_lnum(c, c->lhead_lnum); + while (lnum != c->ltail_lnum) { + err = fixup_leb(c, lnum, 0); + if (err) + goto out; + lnum = ubifs_next_log_lnum(c, lnum); + } + + /* Fixup the current log head */ + err = fixup_leb(c, c->lhead_lnum, c->lhead_offs); + if (err) + goto out; + + /* Fixup LEBs in the LPT area */ + for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) { + int free = c->ltab[lnum - c->lpt_first].free; + + if (free > 0) { + err = fixup_leb(c, lnum, c->leb_size - free); + if (err) + goto out; + } + } + + /* Unmap LEBs in the orphans area */ + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + err = fixup_leb(c, lnum, 0); + if (err) + goto out; + } + + /* Fixup LEBs in the main area */ + for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) { + lprops = ubifs_lpt_lookup(c, lnum); + if (IS_ERR(lprops)) { + err = PTR_ERR(lprops); + goto out; + } + + if (lprops->free > 0) { + err = fixup_leb(c, lnum, c->leb_size - lprops->free); + if (err) + goto out; + } + } + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_fixup_free_space - find & fix all LEBs with free space. + * @c: UBIFS file-system description object + * + * This function fixes up LEBs containing free space on first mount, if the + * appropriate flag was set when the FS was created. Each LEB with one or more + * empty min. I/O unit (i.e. free-space-count > 0) is re-written, to make sure + * the free space is actually erased. E.g., this is necessary for some NAND + * chips, since the free space may have been programmed like real "0xff" data + * (generating a non-0xff ECC), causing future writes to the not-really-erased + * NAND pages to behave badly. After the space is fixed up, the superblock flag + * is cleared, so that this is skipped for all future mounts. + */ +int ubifs_fixup_free_space(struct ubifs_info *c) +{ + int err; + struct ubifs_sb_node *sup; + + ubifs_assert(c->space_fixup); + ubifs_assert(!c->ro_mount); + + ubifs_msg("start fixing up free space"); + + err = fixup_free_space(c); + if (err) + return err; + + sup = ubifs_read_sb_node(c); + if (IS_ERR(sup)) + return PTR_ERR(sup); + + /* Free-space fixup is no longer required */ + c->space_fixup = 0; + sup->flags &= cpu_to_le32(~UBIFS_FLG_SPACE_FIXUP); + + err = ubifs_write_sb_node(c, sup); + kfree(sup); + if (err) + return err; + + ubifs_msg("free space fixup complete"); + return err; +} diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 04ad07f4fcc..6db0bdaa9f7 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -375,7 +375,7 @@ out: ubifs_release_dirty_inode_budget(c, ui); else { /* We've deleted something - clean the "no space" flags */ - c->nospace = c->nospace_rp = 0; + c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); } done: @@ -694,11 +694,11 @@ static int init_constants_sb(struct ubifs_info *c) * be compressed and direntries are of the maximum size. * * Note, data, which may be stored in inodes is budgeted separately, so - * it is not included into 'c->inode_budget'. + * it is not included into 'c->bi.inode_budget'. */ - c->page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE; - c->inode_budget = UBIFS_INO_NODE_SZ; - c->dent_budget = UBIFS_MAX_DENT_NODE_SZ; + c->bi.page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE; + c->bi.inode_budget = UBIFS_INO_NODE_SZ; + c->bi.dent_budget = UBIFS_MAX_DENT_NODE_SZ; /* * When the amount of flash space used by buds becomes @@ -742,7 +742,7 @@ static void init_constants_master(struct ubifs_info *c) { long long tmp64; - c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); c->report_rp_size = ubifs_reported_space(c, c->rp_size); /* @@ -1144,8 +1144,8 @@ static int check_free_space(struct ubifs_info *c) { ubifs_assert(c->dark_wm > 0); if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) { - ubifs_err("insufficient free space to mount in read/write mode"); - dbg_dump_budg(c); + ubifs_err("insufficient free space to mount in R/W mode"); + dbg_dump_budg(c, &c->bi); dbg_dump_lprops(c); return -ENOSPC; } @@ -1304,7 +1304,7 @@ static int mount_ubifs(struct ubifs_info *c) if (err) goto out_lpt; - err = dbg_check_idx_size(c, c->old_idx_sz); + err = dbg_check_idx_size(c, c->bi.old_idx_sz); if (err) goto out_lpt; @@ -1313,7 +1313,7 @@ static int mount_ubifs(struct ubifs_info *c) goto out_journal; /* Calculate 'min_idx_lebs' after journal replay */ - c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); err = ubifs_mount_orphans(c, c->need_recovery, c->ro_mount); if (err) @@ -1396,6 +1396,12 @@ static int mount_ubifs(struct ubifs_info *c) } else ubifs_assert(c->lst.taken_empty_lebs > 0); + if (!c->ro_mount && c->space_fixup) { + err = ubifs_fixup_free_space(c); + if (err) + goto out_infos; + } + err = dbg_check_filesystem(c); if (err) goto out_infos; @@ -1442,7 +1448,8 @@ static int mount_ubifs(struct ubifs_info *c) c->main_lebs, c->main_first, c->leb_cnt - 1); dbg_msg("index LEBs: %d", c->lst.idx_lebs); dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)", - c->old_idx_sz, c->old_idx_sz >> 10, c->old_idx_sz >> 20); + c->bi.old_idx_sz, c->bi.old_idx_sz >> 10, + c->bi.old_idx_sz >> 20); dbg_msg("key hash type: %d", c->key_hash_type); dbg_msg("tree fanout: %d", c->fanout); dbg_msg("reserved GC LEB: %d", c->gc_lnum); @@ -1456,7 +1463,7 @@ static int mount_ubifs(struct ubifs_info *c) dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu", UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d", - UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, + UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout)); dbg_msg("dead watermark: %d", c->dead_wm); dbg_msg("dark watermark: %d", c->dark_wm); @@ -1584,6 +1591,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) } sup->leb_cnt = cpu_to_le32(c->leb_cnt); err = ubifs_write_sb_node(c, sup); + kfree(sup); if (err) goto out; } @@ -1684,6 +1692,13 @@ static int ubifs_remount_rw(struct ubifs_info *c) */ err = dbg_check_space_info(c); } + + if (c->space_fixup) { + err = ubifs_fixup_free_space(c); + if (err) + goto out; + } + mutex_unlock(&c->umount_mutex); return err; @@ -1766,10 +1781,9 @@ static void ubifs_put_super(struct super_block *sb) * to write them back because of I/O errors. */ if (!c->ro_error) { - ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0); - ubifs_assert(c->budg_idx_growth == 0); - ubifs_assert(c->budg_dd_growth == 0); - ubifs_assert(c->budg_data_growth == 0); + ubifs_assert(c->bi.idx_growth == 0); + ubifs_assert(c->bi.dd_growth == 0); + ubifs_assert(c->bi.data_growth == 0); } /* diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index de485979ca3..8119b1fd8d9 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -2557,11 +2557,11 @@ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, if (err) { /* Ensure the znode is dirtied */ if (znode->cnext || !ubifs_zn_dirty(znode)) { - znode = dirty_cow_bottom_up(c, znode); - if (IS_ERR(znode)) { - err = PTR_ERR(znode); - goto out_unlock; - } + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } } err = tnc_delete(c, znode, n); } diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 53288e5d604..41920f357bb 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -377,15 +377,13 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt) c->gap_lebs = NULL; return err; } - if (!dbg_force_in_the_gaps_enabled) { + if (dbg_force_in_the_gaps_enabled()) { /* * Do not print scary warnings if the debugging * option which forces in-the-gaps is enabled. */ - ubifs_err("out of space"); - spin_lock(&c->space_lock); - dbg_dump_budg(c); - spin_unlock(&c->space_lock); + ubifs_warn("out of space"); + dbg_dump_budg(c, &c->bi); dbg_dump_lprops(c); } /* Try to commit anyway */ @@ -796,16 +794,16 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot) spin_lock(&c->space_lock); /* * Although we have not finished committing yet, update size of the - * committed index ('c->old_idx_sz') and zero out the index growth + * committed index ('c->bi.old_idx_sz') and zero out the index growth * budget. It is OK to do this now, because we've reserved all the * space which is needed to commit the index, and it is save for the * budgeting subsystem to assume the index is already committed, * even though it is not. */ - ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c)); - c->old_idx_sz = c->calc_idx_sz; - c->budg_uncommitted_idx = 0; - c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + ubifs_assert(c->bi.min_idx_lebs == ubifs_calc_min_idx_lebs(c)); + c->bi.old_idx_sz = c->calc_idx_sz; + c->bi.uncommitted_idx = 0; + c->bi.min_idx_lebs = ubifs_calc_min_idx_lebs(c); spin_unlock(&c->space_lock); mutex_unlock(&c->tnc_mutex); diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index 191ca7863fe..e24380cf46e 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -408,9 +408,11 @@ enum { * Superblock flags. * * UBIFS_FLG_BIGLPT: if "big" LPT model is used if set + * UBIFS_FLG_SPACE_FIXUP: first-mount "fixup" of free space within LEBs needed */ enum { UBIFS_FLG_BIGLPT = 0x02, + UBIFS_FLG_SPACE_FIXUP = 0x04, }; /** @@ -434,7 +436,7 @@ struct ubifs_ch { __u8 node_type; __u8 group_type; __u8 padding[2]; -} __attribute__ ((packed)); +} __packed; /** * union ubifs_dev_desc - device node descriptor. @@ -448,7 +450,7 @@ struct ubifs_ch { union ubifs_dev_desc { __le32 new; __le64 huge; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_ino_node - inode node. @@ -509,7 +511,7 @@ struct ubifs_ino_node { __le16 compr_type; __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */ __u8 data[]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_dent_node - directory entry node. @@ -534,7 +536,7 @@ struct ubifs_dent_node { __le16 nlen; __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */ __u8 name[]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_data_node - data node. @@ -555,7 +557,7 @@ struct ubifs_data_node { __le16 compr_type; __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */ __u8 data[]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_trun_node - truncation node. @@ -575,7 +577,7 @@ struct ubifs_trun_node { __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! */ __le64 old_size; __le64 new_size; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_pad_node - padding node. @@ -586,7 +588,7 @@ struct ubifs_trun_node { struct ubifs_pad_node { struct ubifs_ch ch; __le32 pad_len; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_sb_node - superblock node. @@ -644,7 +646,7 @@ struct ubifs_sb_node { __u8 uuid[16]; __le32 ro_compat_version; __u8 padding2[3968]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_mst_node - master node. @@ -711,7 +713,7 @@ struct ubifs_mst_node { __le32 idx_lebs; __le32 leb_cnt; __u8 padding[344]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_ref_node - logical eraseblock reference node. @@ -727,7 +729,7 @@ struct ubifs_ref_node { __le32 offs; __le32 jhead; __u8 padding[28]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_branch - key/reference/length branch @@ -741,7 +743,7 @@ struct ubifs_branch { __le32 offs; __le32 len; __u8 key[]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_idx_node - indexing node. @@ -755,7 +757,7 @@ struct ubifs_idx_node { __le16 child_cnt; __le16 level; __u8 branches[]; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_cs_node - commit start node. @@ -765,7 +767,7 @@ struct ubifs_idx_node { struct ubifs_cs_node { struct ubifs_ch ch; __le64 cmt_no; -} __attribute__ ((packed)); +} __packed; /** * struct ubifs_orph_node - orphan node. @@ -777,6 +779,6 @@ struct ubifs_orph_node { struct ubifs_ch ch; __le64 cmt_no; __le64 inos[]; -} __attribute__ ((packed)); +} __packed; #endif /* __UBIFS_MEDIA_H__ */ diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 8c40ad3c672..93d1412a06f 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -389,9 +389,9 @@ struct ubifs_gced_idx_leb { * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot * make sure @inode->i_size is always changed under @ui_mutex, because it - * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would deadlock - * with 'ubifs_writepage()' (see file.c). All the other inode fields are - * changed under @ui_mutex, so they do not need "shadow" fields. Note, one + * cannot call 'truncate_setsize()' with @ui_mutex locked, because it would + * deadlock with 'ubifs_writepage()' (see file.c). All the other inode fields + * are changed under @ui_mutex, so they do not need "shadow" fields. Note, one * could consider to rework locking and base it on "shadow" fields. */ struct ubifs_inode { @@ -937,6 +937,40 @@ struct ubifs_mount_opts { unsigned int compr_type:2; }; +/** + * struct ubifs_budg_info - UBIFS budgeting information. + * @idx_growth: amount of bytes budgeted for index growth + * @data_growth: amount of bytes budgeted for cached data + * @dd_growth: amount of bytes budgeted for cached data that will make + * other data dirty + * @uncommitted_idx: amount of bytes were budgeted for growth of the index, but + * which still have to be taken into account because the index + * has not been committed so far + * @old_idx_sz: size of index on flash + * @min_idx_lebs: minimum number of LEBs required for the index + * @nospace: non-zero if the file-system does not have flash space (used as + * optimization) + * @nospace_rp: the same as @nospace, but additionally means that even reserved + * pool is full + * @page_budget: budget for a page (constant, nenver changed after mount) + * @inode_budget: budget for an inode (constant, nenver changed after mount) + * @dent_budget: budget for a directory entry (constant, nenver changed after + * mount) + */ +struct ubifs_budg_info { + long long idx_growth; + long long data_growth; + long long dd_growth; + long long uncommitted_idx; + unsigned long long old_idx_sz; + int min_idx_lebs; + unsigned int nospace:1; + unsigned int nospace_rp:1; + int page_budget; + int inode_budget; + int dent_budget; +}; + struct ubifs_debug_info; /** @@ -980,6 +1014,7 @@ struct ubifs_debug_info; * @cmt_wq: wait queue to sleep on if the log is full and a commit is running * * @big_lpt: flag that LPT is too big to write whole during commit + * @space_fixup: flag indicating that free space in LEBs needs to be cleaned up * @no_chk_data_crc: do not check CRCs when reading data nodes (except during * recovery) * @bulk_read: enable bulk-reads @@ -1057,32 +1092,14 @@ struct ubifs_debug_info; * @dirty_zn_cnt: number of dirty znodes * @clean_zn_cnt: number of clean znodes * - * @budg_idx_growth: amount of bytes budgeted for index growth - * @budg_data_growth: amount of bytes budgeted for cached data - * @budg_dd_growth: amount of bytes budgeted for cached data that will make - * other data dirty - * @budg_uncommitted_idx: amount of bytes were budgeted for growth of the index, - * but which still have to be taken into account because - * the index has not been committed so far - * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth, - * @budg_uncommited_idx, @min_idx_lebs, @old_idx_sz, @lst, - * @nospace, and @nospace_rp; - * @min_idx_lebs: minimum number of LEBs required for the index - * @old_idx_sz: size of index on flash + * @space_lock: protects @bi and @lst + * @lst: lprops statistics + * @bi: budgeting information * @calc_idx_sz: temporary variable which is used to calculate new index size * (contains accurate new index size at end of TNC commit start) - * @lst: lprops statistics - * @nospace: non-zero if the file-system does not have flash space (used as - * optimization) - * @nospace_rp: the same as @nospace, but additionally means that even reserved - * pool is full - * - * @page_budget: budget for a page - * @inode_budget: budget for an inode - * @dent_budget: budget for a directory entry * * @ref_node_alsz: size of the LEB reference node aligned to the min. flash - * I/O unit + * I/O unit * @mst_node_alsz: master node aligned size * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary @@ -1189,7 +1206,6 @@ struct ubifs_debug_info; * @replaying: %1 during journal replay * @mounting: %1 while mounting * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode - * @replay_tree: temporary tree used during journal replay * @replay_list: temporary list used during journal replay * @replay_buds: list of buds to replay * @cs_sqnum: sequence number of first node in the log (commit start node) @@ -1238,6 +1254,7 @@ struct ubifs_info { wait_queue_head_t cmt_wq; unsigned int big_lpt:1; + unsigned int space_fixup:1; unsigned int no_chk_data_crc:1; unsigned int bulk_read:1; unsigned int default_compr:2; @@ -1308,21 +1325,10 @@ struct ubifs_info { atomic_long_t dirty_zn_cnt; atomic_long_t clean_zn_cnt; - long long budg_idx_growth; - long long budg_data_growth; - long long budg_dd_growth; - long long budg_uncommitted_idx; spinlock_t space_lock; - int min_idx_lebs; - unsigned long long old_idx_sz; - unsigned long long calc_idx_sz; struct ubifs_lp_stats lst; - unsigned int nospace:1; - unsigned int nospace_rp:1; - - int page_budget; - int inode_budget; - int dent_budget; + struct ubifs_budg_info bi; + unsigned long long calc_idx_sz; int ref_node_alsz; int mst_node_alsz; @@ -1430,7 +1436,6 @@ struct ubifs_info { unsigned int replaying:1; unsigned int mounting:1; unsigned int remounting_rw:1; - struct rb_root replay_tree; struct list_head replay_list; struct list_head replay_buds; unsigned long long cs_sqnum; @@ -1628,6 +1633,7 @@ int ubifs_write_master(struct ubifs_info *c); int ubifs_read_superblock(struct ubifs_info *c); struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c); int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup); +int ubifs_fixup_free_space(struct ubifs_info *c); /* replay.c */ int ubifs_validate_entry(struct ubifs_info *c, diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 3299f469e71..16f19f55e63 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -80,8 +80,8 @@ enum { SECURITY_XATTR, }; -static const struct inode_operations none_inode_operations; -static const struct file_operations none_file_operations; +static const struct inode_operations empty_iops; +static const struct file_operations empty_fops; /** * create_xattr - create an extended attribute. @@ -131,8 +131,8 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, /* Re-define all operations to be "nothing" */ inode->i_mapping->a_ops = &empty_aops; - inode->i_op = &none_inode_operations; - inode->i_fop = &none_file_operations; + inode->i_op = &empty_iops; + inode->i_fop = &empty_fops; inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA; ui = ubifs_inode(inode); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index e765743cf9f..b4d791a8320 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -409,7 +409,7 @@ out: } /** - * ufs_getfrag_bloc() - `get_block_t' function, interface between UFS and + * ufs_getfrag_block() - `get_block_t' function, interface between UFS and * readpage, writepage and so on */ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 9ef9ed2cfe2..5e68099db2a 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -33,7 +33,6 @@ #include <linux/migrate.h> #include <linux/backing-dev.h> #include <linux/freezer.h> -#include <linux/list_sort.h> #include "xfs_sb.h" #include "xfs_inum.h" @@ -709,6 +708,27 @@ xfs_buf_get_empty( return bp; } +/* + * Return a buffer allocated as an empty buffer and associated to external + * memory via xfs_buf_associate_memory() back to it's empty state. + */ +void +xfs_buf_set_empty( + struct xfs_buf *bp, + size_t len) +{ + if (bp->b_pages) + _xfs_buf_free_pages(bp); + + bp->b_pages = NULL; + bp->b_page_count = 0; + bp->b_addr = NULL; + bp->b_file_offset = 0; + bp->b_buffer_length = bp->b_count_desired = len; + bp->b_bn = XFS_BUF_DADDR_NULL; + bp->b_flags &= ~XBF_MAPPED; +} + static inline struct page * mem_to_page( void *addr) @@ -1402,12 +1422,12 @@ restart: int xfs_buftarg_shrink( struct shrinker *shrink, - int nr_to_scan, - gfp_t mask) + struct shrink_control *sc) { struct xfs_buftarg *btp = container_of(shrink, struct xfs_buftarg, bt_shrinker); struct xfs_buf *bp; + int nr_to_scan = sc->nr_to_scan; LIST_HEAD(dispose); if (!nr_to_scan) diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index a9a1c451264..50a7d5fb3b7 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -178,6 +178,7 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, xfs_buf_flags_t); extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); +extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len); extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); extern void xfs_buf_hold(xfs_buf_t *); diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index b3486dfa552..54e623bfbb8 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -586,7 +586,8 @@ xfs_file_compat_ioctl( case XFS_IOC_RESVSP_32: case XFS_IOC_UNRESVSP_32: case XFS_IOC_RESVSP64_32: - case XFS_IOC_UNRESVSP64_32: { + case XFS_IOC_UNRESVSP64_32: + case XFS_IOC_ZERO_RANGE_32: { struct xfs_flock64 bf; if (xfs_compat_flock64_copyin(&bf, arg)) diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h index 08b605792a9..80f4060e897 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.h +++ b/fs/xfs/linux-2.6/xfs_ioctl32.h @@ -184,6 +184,7 @@ typedef struct compat_xfs_flock64 { #define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64) #define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64) #define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64) +#define XFS_IOC_ZERO_RANGE_32 _IOW('X', 57, struct compat_xfs_flock64) typedef struct compat_xfs_fsop_geom_v1 { __u32 blocksize; /* filesystem (data) block size */ diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 244be9cbfe7..8633521b3b2 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -70,6 +70,7 @@ #include <linux/ctype.h> #include <linux/writeback.h> #include <linux/capability.h> +#include <linux/list_sort.h> #include <asm/page.h> #include <asm/div64.h> diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c index 9f76cceb678..bd672def95a 100644 --- a/fs/xfs/linux-2.6/xfs_message.c +++ b/fs/xfs/linux-2.6/xfs_message.c @@ -41,23 +41,6 @@ __xfs_printk( printk("%sXFS: %pV\n", level, vaf); } -void xfs_printk( - const char *level, - const struct xfs_mount *mp, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - __xfs_printk(level, mp, &vaf); - va_end(args); -} - #define define_xfs_printk_level(func, kern_level) \ void func(const struct xfs_mount *mp, const char *fmt, ...) \ { \ @@ -95,8 +78,7 @@ xfs_alert_tag( int do_panic = 0; if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) { - xfs_printk(KERN_ALERT, mp, - "XFS: Transforming an alert into a BUG."); + xfs_alert(mp, "Transforming an alert into a BUG."); do_panic = 1; } diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h index f1b3fc1b6c4..7fb7ea00767 100644 --- a/fs/xfs/linux-2.6/xfs_message.h +++ b/fs/xfs/linux-2.6/xfs_message.h @@ -3,9 +3,6 @@ struct xfs_mount; -extern void xfs_printk(const char *level, const struct xfs_mount *mp, - const char *fmt, ...) - __attribute__ ((format (printf, 3, 4))); extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...) @@ -28,7 +25,9 @@ extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...) extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); #else -static inline void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) +static inline void +__attribute__ ((format (printf, 2, 3))) +xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) { } #endif diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index b38e58d0229..b0aa59e51fd 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1787,10 +1787,6 @@ init_xfs_fs(void) if (error) goto out_cleanup_procfs; - error = xfs_init_workqueues(); - if (error) - goto out_sysctl_unregister; - vfs_initquota(); error = register_filesystem(&xfs_fs_type); diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 3e898a48122..8ecad5ff9f9 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -267,6 +267,16 @@ xfs_sync_inode_attr( error = xfs_iflush(ip, flags); + /* + * We don't want to try again on non-blocking flushes that can't run + * again immediately. If an inode really must be written, then that's + * what the SYNC_WAIT flag is for. + */ + if (error == EAGAIN) { + ASSERT(!(flags & SYNC_WAIT)); + error = 0; + } + out_unlock: xfs_iunlock(ip, XFS_ILOCK_SHARED); return error; @@ -1022,13 +1032,14 @@ xfs_reclaim_inodes( static int xfs_reclaim_inode_shrink( struct shrinker *shrink, - int nr_to_scan, - gfp_t gfp_mask) + struct shrink_control *sc) { struct xfs_mount *mp; struct xfs_perag *pag; xfs_agnumber_t ag; int reclaimable; + int nr_to_scan = sc->nr_to_scan; + gfp_t gfp_mask = sc->gfp_mask; mp = container_of(shrink, struct xfs_mount, m_inode_shrink); if (nr_to_scan) { diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index 2d0bcb47907..d48b7a579ae 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -1151,44 +1151,7 @@ TRACE_EVENT(xfs_bunmap, ); -#define XFS_BUSY_SYNC \ - { 0, "async" }, \ - { 1, "sync" } - -TRACE_EVENT(xfs_alloc_busy, - TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno, - xfs_agblock_t agbno, xfs_extlen_t len, int sync), - TP_ARGS(trans, agno, agbno, len, sync), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(struct xfs_trans *, tp) - __field(int, tid) - __field(xfs_agnumber_t, agno) - __field(xfs_agblock_t, agbno) - __field(xfs_extlen_t, len) - __field(int, sync) - ), - TP_fast_assign( - __entry->dev = trans->t_mountp->m_super->s_dev; - __entry->tp = trans; - __entry->tid = trans->t_ticket->t_tid; - __entry->agno = agno; - __entry->agbno = agbno; - __entry->len = len; - __entry->sync = sync; - ), - TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->tp, - __entry->tid, - __entry->agno, - __entry->agbno, - __entry->len, - __print_symbolic(__entry->sync, XFS_BUSY_SYNC)) - -); - -TRACE_EVENT(xfs_alloc_unbusy, +DECLARE_EVENT_CLASS(xfs_busy_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t len), TP_ARGS(mp, agno, agbno, len), @@ -1210,35 +1173,45 @@ TRACE_EVENT(xfs_alloc_unbusy, __entry->agbno, __entry->len) ); +#define DEFINE_BUSY_EVENT(name) \ +DEFINE_EVENT(xfs_busy_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(mp, agno, agbno, len)) +DEFINE_BUSY_EVENT(xfs_alloc_busy); +DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem); +DEFINE_BUSY_EVENT(xfs_alloc_busy_force); +DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse); +DEFINE_BUSY_EVENT(xfs_alloc_busy_clear); -#define XFS_BUSY_STATES \ - { 0, "missing" }, \ - { 1, "found" } - -TRACE_EVENT(xfs_alloc_busysearch, +TRACE_EVENT(xfs_alloc_busy_trim, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, - xfs_agblock_t agbno, xfs_extlen_t len, int found), - TP_ARGS(mp, agno, agbno, len, found), + xfs_agblock_t agbno, xfs_extlen_t len, + xfs_agblock_t tbno, xfs_extlen_t tlen), + TP_ARGS(mp, agno, agbno, len, tbno, tlen), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(int, found) + __field(xfs_agblock_t, tbno) + __field(xfs_extlen_t, tlen) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; __entry->agbno = agbno; __entry->len = len; - __entry->found = found; + __entry->tbno = tbno; + __entry->tlen = tlen; ), - TP_printk("dev %d:%d agno %u agbno %u len %u %s", + TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, __entry->len, - __print_symbolic(__entry->found, XFS_BUSY_STATES)) + __entry->tbno, + __entry->tlen) ); TRACE_EVENT(xfs_trans_commit_lsn, @@ -1418,7 +1391,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __entry->wasfromfl, __entry->isfl, __entry->userdata, - __entry->firstblock) + (unsigned long long)__entry->firstblock) ) #define DEFINE_ALLOC_EVENT(name) \ @@ -1433,11 +1406,14 @@ DEFINE_ALLOC_EVENT(xfs_alloc_near_first); DEFINE_ALLOC_EVENT(xfs_alloc_near_greater); DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser); DEFINE_ALLOC_EVENT(xfs_alloc_near_error); +DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry); +DEFINE_ALLOC_EVENT(xfs_alloc_near_busy); DEFINE_ALLOC_EVENT(xfs_alloc_size_neither); DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry); DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft); DEFINE_ALLOC_EVENT(xfs_alloc_size_done); DEFINE_ALLOC_EVENT(xfs_alloc_size_error); +DEFINE_ALLOC_EVENT(xfs_alloc_size_busy); DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist); DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough); DEFINE_ALLOC_EVENT(xfs_alloc_small_done); diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 69228aa8605..b94dace4e78 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -60,7 +60,7 @@ STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); -STATIC int xfs_qm_shake(struct shrinker *, int, gfp_t); +STATIC int xfs_qm_shake(struct shrinker *, struct shrink_control *); static struct shrinker xfs_qm_shaker = { .shrink = xfs_qm_shake, @@ -2009,10 +2009,10 @@ xfs_qm_shake_freelist( STATIC int xfs_qm_shake( struct shrinker *shrink, - int nr_to_scan, - gfp_t gfp_mask) + struct shrink_control *sc) { int ndqused, nfree, n; + gfp_t gfp_mask = sc->gfp_mask; if (!kmem_shake_allow(gfp_mask)) return 0; diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 58632cc17f2..da0a561ffba 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -187,7 +187,6 @@ struct xfs_busy_extent { xfs_agnumber_t agno; xfs_agblock_t bno; xfs_extlen_t length; - xlog_tid_t tid; /* transaction that created this */ }; /* diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 27d64d752ea..acdced86413 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -41,19 +41,13 @@ #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 -/* - * Prototypes for per-ag allocation routines - */ - STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *, - xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); - -/* - * Internal functions. - */ + xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *); +STATIC void xfs_alloc_busy_trim(struct xfs_alloc_arg *, + xfs_agblock_t, xfs_extlen_t, xfs_agblock_t *, xfs_extlen_t *); /* * Lookup the record equal to [bno, len] in the btree given by cur. @@ -154,19 +148,21 @@ xfs_alloc_compute_aligned( xfs_extlen_t *reslen) /* result length */ { xfs_agblock_t bno; - xfs_extlen_t diff; xfs_extlen_t len; - if (args->alignment > 1 && foundlen >= args->minlen) { - bno = roundup(foundbno, args->alignment); - diff = bno - foundbno; - len = diff >= foundlen ? 0 : foundlen - diff; + /* Trim busy sections out of found extent */ + xfs_alloc_busy_trim(args, foundbno, foundlen, &bno, &len); + + if (args->alignment > 1 && len >= args->minlen) { + xfs_agblock_t aligned_bno = roundup(bno, args->alignment); + xfs_extlen_t diff = aligned_bno - bno; + + *resbno = aligned_bno; + *reslen = diff >= len ? 0 : len - diff; } else { - bno = foundbno; - len = foundlen; + *resbno = bno; + *reslen = len; } - *resbno = bno; - *reslen = len; } /* @@ -280,7 +276,6 @@ xfs_alloc_fix_minleft( return 1; agf = XFS_BUF_TO_AGF(args->agbp); diff = be32_to_cpu(agf->agf_freeblks) - + be32_to_cpu(agf->agf_flcount) - args->len - args->minleft; if (diff >= 0) return 1; @@ -541,16 +536,8 @@ xfs_alloc_ag_vextent( if (error) return error; - /* - * Search the busylist for these blocks and mark the - * transaction as synchronous if blocks are found. This - * avoids the need to block due to a synchronous log - * force to ensure correct ordering as the synchronous - * transaction will guarantee that for us. - */ - if (xfs_alloc_busy_search(args->mp, args->agno, - args->agbno, args->len)) - xfs_trans_set_sync(args->tp); + ASSERT(!xfs_alloc_busy_search(args->mp, args->agno, + args->agbno, args->len)); } if (!args->isfl) { @@ -577,14 +564,14 @@ xfs_alloc_ag_vextent_exact( { xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ - xfs_agblock_t end; /* end of allocated extent */ int error; xfs_agblock_t fbno; /* start block of found extent */ - xfs_agblock_t fend; /* end block of found extent */ xfs_extlen_t flen; /* length of found extent */ + xfs_agblock_t tbno; /* start block of trimmed extent */ + xfs_extlen_t tlen; /* length of trimmed extent */ + xfs_agblock_t tend; /* end block of trimmed extent */ + xfs_agblock_t end; /* end of allocated extent */ int i; /* success/failure of operation */ - xfs_agblock_t maxend; /* end of maximal extent */ - xfs_agblock_t minend; /* end of minimal extent */ xfs_extlen_t rlen; /* length of returned extent */ ASSERT(args->alignment == 1); @@ -614,14 +601,22 @@ xfs_alloc_ag_vextent_exact( goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); ASSERT(fbno <= args->agbno); - minend = args->agbno + args->minlen; - maxend = args->agbno + args->maxlen; - fend = fbno + flen; /* - * Give up if the freespace isn't long enough for the minimum request. + * Check for overlapping busy extents. + */ + xfs_alloc_busy_trim(args, fbno, flen, &tbno, &tlen); + + /* + * Give up if the start of the extent is busy, or the freespace isn't + * long enough for the minimum request. */ - if (fend < minend) + if (tbno > args->agbno) + goto not_found; + if (tlen < args->minlen) + goto not_found; + tend = tbno + tlen; + if (tend < args->agbno + args->minlen) goto not_found; /* @@ -630,14 +625,14 @@ xfs_alloc_ag_vextent_exact( * * Fix the length according to mod and prod if given. */ - end = XFS_AGBLOCK_MIN(fend, maxend); + end = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen); args->len = end - args->agbno; xfs_alloc_fix_len(args); if (!xfs_alloc_fix_minleft(args)) goto not_found; rlen = args->len; - ASSERT(args->agbno + rlen <= fend); + ASSERT(args->agbno + rlen <= tend); end = args->agbno + rlen; /* @@ -686,11 +681,11 @@ xfs_alloc_find_best_extent( struct xfs_btree_cur **scur, /* searching cursor */ xfs_agblock_t gdiff, /* difference for search comparison */ xfs_agblock_t *sbno, /* extent found by search */ - xfs_extlen_t *slen, - xfs_extlen_t *slena, /* aligned length */ + xfs_extlen_t *slen, /* extent length */ + xfs_agblock_t *sbnoa, /* aligned extent found by search */ + xfs_extlen_t *slena, /* aligned extent length */ int dir) /* 0 = search right, 1 = search left */ { - xfs_agblock_t bno; xfs_agblock_t new; xfs_agblock_t sdiff; int error; @@ -708,16 +703,16 @@ xfs_alloc_find_best_extent( if (error) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - xfs_alloc_compute_aligned(args, *sbno, *slen, &bno, slena); + xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena); /* * The good extent is closer than this one. */ if (!dir) { - if (bno >= args->agbno + gdiff) + if (*sbnoa >= args->agbno + gdiff) goto out_use_good; } else { - if (bno <= args->agbno - gdiff) + if (*sbnoa <= args->agbno - gdiff) goto out_use_good; } @@ -729,8 +724,8 @@ xfs_alloc_find_best_extent( xfs_alloc_fix_len(args); sdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, *sbno, - *slen, &new); + args->alignment, *sbnoa, + *slena, &new); /* * Choose closer size and invalidate other cursor. @@ -780,7 +775,7 @@ xfs_alloc_ag_vextent_near( xfs_agblock_t gtbnoa; /* aligned ... */ xfs_extlen_t gtdiff; /* difference to right side entry */ xfs_extlen_t gtlen; /* length of right side entry */ - xfs_extlen_t gtlena = 0; /* aligned ... */ + xfs_extlen_t gtlena; /* aligned ... */ xfs_agblock_t gtnew; /* useful start bno of right side */ int error; /* error code */ int i; /* result code, temporary */ @@ -789,9 +784,10 @@ xfs_alloc_ag_vextent_near( xfs_agblock_t ltbnoa; /* aligned ... */ xfs_extlen_t ltdiff; /* difference to left side entry */ xfs_extlen_t ltlen; /* length of left side entry */ - xfs_extlen_t ltlena = 0; /* aligned ... */ + xfs_extlen_t ltlena; /* aligned ... */ xfs_agblock_t ltnew; /* useful start bno of left side */ xfs_extlen_t rlen; /* length of returned extent */ + int forced = 0; #if defined(DEBUG) && defined(__KERNEL__) /* * Randomly don't execute the first algorithm. @@ -800,13 +796,20 @@ xfs_alloc_ag_vextent_near( dofirst = random32() & 1; #endif + +restart: + bno_cur_lt = NULL; + bno_cur_gt = NULL; + ltlen = 0; + gtlena = 0; + ltlena = 0; + /* * Get a cursor for the by-size btree. */ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, args->agno, XFS_BTNUM_CNT); - ltlen = 0; - bno_cur_lt = bno_cur_gt = NULL; + /* * See if there are any free extents as big as maxlen. */ @@ -822,11 +825,13 @@ xfs_alloc_ag_vextent_near( goto error0; if (i == 0 || ltlen == 0) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_near_noentry(args); return 0; } ASSERT(i == 1); } args->wasfromfl = 0; + /* * First algorithm. * If the requested extent is large wrt the freespaces available @@ -890,7 +895,7 @@ xfs_alloc_ag_vextent_near( if (args->len < blen) continue; ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, ltbno, ltlen, <new); + args->alignment, ltbnoa, ltlena, <new); if (ltnew != NULLAGBLOCK && (args->len > blen || ltdiff < bdiff)) { bdiff = ltdiff; @@ -1042,11 +1047,12 @@ xfs_alloc_ag_vextent_near( args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); ltdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, ltbno, ltlen, <new); + args->alignment, ltbnoa, ltlena, <new); error = xfs_alloc_find_best_extent(args, &bno_cur_lt, &bno_cur_gt, - ltdiff, >bno, >len, >lena, + ltdiff, >bno, >len, + >bnoa, >lena, 0 /* search right */); } else { ASSERT(gtlena >= args->minlen); @@ -1057,11 +1063,12 @@ xfs_alloc_ag_vextent_near( args->len = XFS_EXTLEN_MIN(gtlena, args->maxlen); xfs_alloc_fix_len(args); gtdiff = xfs_alloc_compute_diff(args->agbno, args->len, - args->alignment, gtbno, gtlen, >new); + args->alignment, gtbnoa, gtlena, >new); error = xfs_alloc_find_best_extent(args, &bno_cur_gt, &bno_cur_lt, - gtdiff, <bno, <len, <lena, + gtdiff, <bno, <len, + <bnoa, <lena, 1 /* search left */); } @@ -1073,6 +1080,12 @@ xfs_alloc_ag_vextent_near( * If we couldn't get anything, give up. */ if (bno_cur_lt == NULL && bno_cur_gt == NULL) { + if (!forced++) { + trace_xfs_alloc_near_busy(args); + xfs_log_force(args->mp, XFS_LOG_SYNC); + goto restart; + } + trace_xfs_alloc_size_neither(args); args->agbno = NULLAGBLOCK; return 0; @@ -1107,12 +1120,13 @@ xfs_alloc_ag_vextent_near( return 0; } rlen = args->len; - (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, ltbno, - ltlen, <new); + (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, + ltbnoa, ltlena, <new); ASSERT(ltnew >= ltbno); - ASSERT(ltnew + rlen <= ltbno + ltlen); + ASSERT(ltnew + rlen <= ltbnoa + ltlena); ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); args->agbno = ltnew; + if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, ltnew, rlen, XFSA_FIXUP_BNO_OK))) goto error0; @@ -1155,26 +1169,35 @@ xfs_alloc_ag_vextent_size( int i; /* temp status variable */ xfs_agblock_t rbno; /* returned block number */ xfs_extlen_t rlen; /* length of returned extent */ + int forced = 0; +restart: /* * Allocate and initialize a cursor for the by-size btree. */ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, args->agno, XFS_BTNUM_CNT); bno_cur = NULL; + /* * Look for an entry >= maxlen+alignment-1 blocks. */ if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, args->maxlen + args->alignment - 1, &i))) goto error0; + /* - * If none, then pick up the last entry in the tree unless the - * tree is empty. + * If none or we have busy extents that we cannot allocate from, then + * we have to settle for a smaller extent. In the case that there are + * no large extents, this will return the last entry in the tree unless + * the tree is empty. In the case that there are only busy large + * extents, this will return the largest small extent unless there + * are no smaller extents available. */ - if (!i) { - if ((error = xfs_alloc_ag_vextent_small(args, cnt_cur, &fbno, - &flen, &i))) + if (!i || forced > 1) { + error = xfs_alloc_ag_vextent_small(args, cnt_cur, + &fbno, &flen, &i); + if (error) goto error0; if (i == 0 || flen == 0) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); @@ -1182,22 +1205,56 @@ xfs_alloc_ag_vextent_size( return 0; } ASSERT(i == 1); + xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); + } else { + /* + * Search for a non-busy extent that is large enough. + * If we are at low space, don't check, or if we fall of + * the end of the btree, turn off the busy check and + * restart. + */ + for (;;) { + error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); + if (error) + goto error0; + XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + + xfs_alloc_compute_aligned(args, fbno, flen, + &rbno, &rlen); + + if (rlen >= args->maxlen) + break; + + error = xfs_btree_increment(cnt_cur, 0, &i); + if (error) + goto error0; + if (i == 0) { + /* + * Our only valid extents must have been busy. + * Make it unbusy by forcing the log out and + * retrying. If we've been here before, forcing + * the log isn't making the extents available, + * which means they have probably been freed in + * this transaction. In that case, we have to + * give up on them and we'll attempt a minlen + * allocation the next time around. + */ + xfs_btree_del_cursor(cnt_cur, + XFS_BTREE_NOERROR); + trace_xfs_alloc_size_busy(args); + if (!forced++) + xfs_log_force(args->mp, XFS_LOG_SYNC); + goto restart; + } + } } - /* - * There's a freespace as big as maxlen+alignment-1, get it. - */ - else { - if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - } + /* * In the first case above, we got the last entry in the * by-size btree. Now we check to see if the space hits maxlen * once aligned; if not, we search left for something better. * This can't happen in the second case above. */ - xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); XFS_WANT_CORRUPTED_GOTO(rlen == 0 || (rlen <= flen && rbno + rlen <= fbno + flen), error0); @@ -1251,13 +1308,19 @@ xfs_alloc_ag_vextent_size( * Fix up the length. */ args->len = rlen; - xfs_alloc_fix_len(args); - if (rlen < args->minlen || !xfs_alloc_fix_minleft(args)) { - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - trace_xfs_alloc_size_nominleft(args); - args->agbno = NULLAGBLOCK; - return 0; + if (rlen < args->minlen) { + if (!forced++) { + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_size_busy(args); + xfs_log_force(args->mp, XFS_LOG_SYNC); + goto restart; + } + goto out_nominleft; } + xfs_alloc_fix_len(args); + + if (!xfs_alloc_fix_minleft(args)) + goto out_nominleft; rlen = args->len; XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0); /* @@ -1287,6 +1350,12 @@ error0: if (bno_cur) xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); return error; + +out_nominleft: + xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); + trace_xfs_alloc_size_nominleft(args); + args->agbno = NULLAGBLOCK; + return 0; } /* @@ -1326,6 +1395,9 @@ xfs_alloc_ag_vextent_small( if (error) goto error0; if (fbno != NULLAGBLOCK) { + xfs_alloc_busy_reuse(args->mp, args->agno, fbno, 1, + args->userdata); + if (args->userdata) { xfs_buf_t *bp; @@ -1617,18 +1689,6 @@ xfs_free_ag_extent( trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); - /* - * Since blocks move to the free list without the coordination - * used in xfs_bmap_finish, we can't allow block to be available - * for reallocation and non-transaction writing (user data) - * until we know that the transaction that moved it to the free - * list is permanently on disk. We track the blocks by declaring - * these blocks as "busy"; the busy list is maintained on a per-ag - * basis and each transaction records which entries should be removed - * when the iclog commits to disk. If a busy block is allocated, - * the iclog is pushed up to the LSN that freed the block. - */ - xfs_alloc_busy_insert(tp, agno, bno, len); return 0; error0: @@ -1923,21 +1983,6 @@ xfs_alloc_get_freelist( xfs_alloc_log_agf(tp, agbp, logflags); *bnop = bno; - /* - * As blocks are freed, they are added to the per-ag busy list and - * remain there until the freeing transaction is committed to disk. - * Now that we have allocated blocks, this list must be searched to see - * if a block is being reused. If one is, then the freeing transaction - * must be pushed to disk before this transaction. - * - * We do this by setting the current transaction to a sync transaction - * which guarantees that the freeing transaction is on disk before this - * transaction. This is done instead of a synchronous log force here so - * that we don't sit and wait with the AGF locked in the transaction - * during the log force. - */ - if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1)) - xfs_trans_set_sync(tp); return 0; } @@ -2423,105 +2468,13 @@ xfs_free_extent( } error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); + if (!error) + xfs_alloc_busy_insert(tp, args.agno, args.agbno, len); error0: xfs_perag_put(args.pag); return error; } - -/* - * AG Busy list management - * The busy list contains block ranges that have been freed but whose - * transactions have not yet hit disk. If any block listed in a busy - * list is reused, the transaction that freed it must be forced to disk - * before continuing to use the block. - * - * xfs_alloc_busy_insert - add to the per-ag busy list - * xfs_alloc_busy_clear - remove an item from the per-ag busy list - * xfs_alloc_busy_search - search for a busy extent - */ - -/* - * Insert a new extent into the busy tree. - * - * The busy extent tree is indexed by the start block of the busy extent. - * there can be multiple overlapping ranges in the busy extent tree but only - * ever one entry at a given start block. The reason for this is that - * multi-block extents can be freed, then smaller chunks of that extent - * allocated and freed again before the first transaction commit is on disk. - * If the exact same start block is freed a second time, we have to wait for - * that busy extent to pass out of the tree before the new extent is inserted. - * There are two main cases we have to handle here. - * - * The first case is a transaction that triggers a "free - allocate - free" - * cycle. This can occur during btree manipulations as a btree block is freed - * to the freelist, then allocated from the free list, then freed again. In - * this case, the second extxpnet free is what triggers the duplicate and as - * such the transaction IDs should match. Because the extent was allocated in - * this transaction, the transaction must be marked as synchronous. This is - * true for all cases where the free/alloc/free occurs in the one transaction, - * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case. - * This serves to catch violations of the second case quite effectively. - * - * The second case is where the free/alloc/free occur in different - * transactions. In this case, the thread freeing the extent the second time - * can't mark the extent busy immediately because it is already tracked in a - * transaction that may be committing. When the log commit for the existing - * busy extent completes, the busy extent will be removed from the tree. If we - * allow the second busy insert to continue using that busy extent structure, - * it can be freed before this transaction is safely in the log. Hence our - * only option in this case is to force the log to remove the existing busy - * extent from the list before we insert the new one with the current - * transaction ID. - * - * The problem we are trying to avoid in the free-alloc-free in separate - * transactions is most easily described with a timeline: - * - * Thread 1 Thread 2 Thread 3 xfslogd - * xact alloc - * free X - * mark busy - * commit xact - * free xact - * xact alloc - * alloc X - * busy search - * mark xact sync - * commit xact - * free xact - * force log - * checkpoint starts - * .... - * xact alloc - * free X - * mark busy - * finds match - * *** KABOOM! *** - * .... - * log IO completes - * unbusy X - * checkpoint completes - * - * By issuing a log force in thread 3 @ "KABOOM", the thread will block until - * the checkpoint completes, and the busy extent it matched will have been - * removed from the tree when it is woken. Hence it can then continue safely. - * - * However, to ensure this matching process is robust, we need to use the - * transaction ID for identifying transaction, as delayed logging results in - * the busy extent and transaction lifecycles being different. i.e. the busy - * extent is active for a lot longer than the transaction. Hence the - * transaction structure can be freed and reallocated, then mark the same - * extent busy again in the new transaction. In this case the new transaction - * will have a different tid but can have the same address, and hence we need - * to check against the tid. - * - * Future: for delayed logging, we could avoid the log force if the extent was - * first freed in the current checkpoint sequence. This, however, requires the - * ability to pin the current checkpoint in memory until this transaction - * commits to ensure that both the original free and the current one combine - * logically into the one checkpoint. If the checkpoint sequences are - * different, however, we still need to wait on a log force. - */ void xfs_alloc_busy_insert( struct xfs_trans *tp, @@ -2533,9 +2486,7 @@ xfs_alloc_busy_insert( struct xfs_busy_extent *busyp; struct xfs_perag *pag; struct rb_node **rbp; - struct rb_node *parent; - int match; - + struct rb_node *parent = NULL; new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL); if (!new) { @@ -2544,7 +2495,7 @@ xfs_alloc_busy_insert( * block, make this a synchronous transaction to insure that * the block is not reused before this transaction commits. */ - trace_xfs_alloc_busy(tp, agno, bno, len, 1); + trace_xfs_alloc_busy_enomem(tp->t_mountp, agno, bno, len); xfs_trans_set_sync(tp); return; } @@ -2552,66 +2503,28 @@ xfs_alloc_busy_insert( new->agno = agno; new->bno = bno; new->length = len; - new->tid = xfs_log_get_trans_ident(tp); - INIT_LIST_HEAD(&new->list); /* trace before insert to be able to see failed inserts */ - trace_xfs_alloc_busy(tp, agno, bno, len, 0); + trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len); pag = xfs_perag_get(tp->t_mountp, new->agno); -restart: spin_lock(&pag->pagb_lock); rbp = &pag->pagb_tree.rb_node; - parent = NULL; - busyp = NULL; - match = 0; - while (*rbp && match >= 0) { + while (*rbp) { parent = *rbp; busyp = rb_entry(parent, struct xfs_busy_extent, rb_node); if (new->bno < busyp->bno) { - /* may overlap, but exact start block is lower */ rbp = &(*rbp)->rb_left; - if (new->bno + new->length > busyp->bno) - match = busyp->tid == new->tid ? 1 : -1; + ASSERT(new->bno + new->length <= busyp->bno); } else if (new->bno > busyp->bno) { - /* may overlap, but exact start block is higher */ rbp = &(*rbp)->rb_right; - if (bno < busyp->bno + busyp->length) - match = busyp->tid == new->tid ? 1 : -1; + ASSERT(bno >= busyp->bno + busyp->length); } else { - match = busyp->tid == new->tid ? 1 : -1; - break; + ASSERT(0); } } - if (match < 0) { - /* overlap marked busy in different transaction */ - spin_unlock(&pag->pagb_lock); - xfs_log_force(tp->t_mountp, XFS_LOG_SYNC); - goto restart; - } - if (match > 0) { - /* - * overlap marked busy in same transaction. Update if exact - * start block match, otherwise combine the busy extents into - * a single range. - */ - if (busyp->bno == new->bno) { - busyp->length = max(busyp->length, new->length); - spin_unlock(&pag->pagb_lock); - ASSERT(tp->t_flags & XFS_TRANS_SYNC); - xfs_perag_put(pag); - kmem_free(new); - return; - } - rb_erase(&busyp->rb_node, &pag->pagb_tree); - new->length = max(busyp->bno + busyp->length, - new->bno + new->length) - - min(busyp->bno, new->bno); - new->bno = min(busyp->bno, new->bno); - } else - busyp = NULL; rb_link_node(&new->rb_node, parent, rbp); rb_insert_color(&new->rb_node, &pag->pagb_tree); @@ -2619,7 +2532,6 @@ restart: list_add(&new->list, &tp->t_busy); spin_unlock(&pag->pagb_lock); xfs_perag_put(pag); - kmem_free(busyp); } /* @@ -2668,31 +2580,443 @@ xfs_alloc_busy_search( } } spin_unlock(&pag->pagb_lock); - trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match); xfs_perag_put(pag); return match; } +/* + * The found free extent [fbno, fend] overlaps part or all of the given busy + * extent. If the overlap covers the beginning, the end, or all of the busy + * extent, the overlapping portion can be made unbusy and used for the + * allocation. We can't split a busy extent because we can't modify a + * transaction/CIL context busy list, but we can update an entries block + * number or length. + * + * Returns true if the extent can safely be reused, or false if the search + * needs to be restarted. + */ +STATIC bool +xfs_alloc_busy_update_extent( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_busy_extent *busyp, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) +{ + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + /* + * If there is a busy extent overlapping a user allocation, we have + * no choice but to force the log and retry the search. + * + * Fortunately this does not happen during normal operation, but + * only if the filesystem is very low on space and has to dip into + * the AGFL for normal allocations. + */ + if (userdata) + goto out_force_log; + + if (bbno < fbno && bend > fend) { + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + */ + + /* + * We would have to split the busy extent to be able to track + * it correct, which we cannot do because we would have to + * modify the list of busy extents attached to the transaction + * or CIL context, which is immutable. + * + * Force out the log to clear the busy extent and retry the + * search. + */ + goto out_force_log; + } else if (bbno >= fbno && bend <= fend) { + /* + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + */ + + /* + * The busy extent is fully covered by the extent we are + * allocating, and can simply be removed from the rbtree. + * However we cannot remove it from the immutable list + * tracking busy extents in the transaction or CIL context, + * so set the length to zero to mark it invalid. + * + * We also need to restart the busy extent search from the + * tree root, because erasing the node can rearrange the + * tree topology. + */ + rb_erase(&busyp->rb_node, &pag->pagb_tree); + busyp->length = 0; + return false; + } else if (fend < bend) { + /* + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + */ + busyp->bno = fend; + } else if (bbno < fbno) { + /* + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + */ + busyp->length = fbno - busyp->bno; + } else { + ASSERT(0); + } + + trace_xfs_alloc_busy_reuse(mp, pag->pag_agno, fbno, flen); + return true; + +out_force_log: + spin_unlock(&pag->pagb_lock); + xfs_log_force(mp, XFS_LOG_SYNC); + trace_xfs_alloc_busy_force(mp, pag->pag_agno, fbno, flen); + spin_lock(&pag->pagb_lock); + return false; +} + + +/* + * For a given extent [fbno, flen], make sure we can reuse it safely. + */ void -xfs_alloc_busy_clear( +xfs_alloc_busy_reuse( struct xfs_mount *mp, - struct xfs_busy_extent *busyp) + xfs_agnumber_t agno, + xfs_agblock_t fbno, + xfs_extlen_t flen, + bool userdata) { struct xfs_perag *pag; + struct rb_node *rbp; - trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno, - busyp->length); + ASSERT(flen > 0); - ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno, - busyp->length) == 1); + pag = xfs_perag_get(mp, agno); + spin_lock(&pag->pagb_lock); +restart: + rbp = pag->pagb_tree.rb_node; + while (rbp) { + struct xfs_busy_extent *busyp = + rb_entry(rbp, struct xfs_busy_extent, rb_node); + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; - list_del_init(&busyp->list); + if (fbno + flen <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } - pag = xfs_perag_get(mp, busyp->agno); - spin_lock(&pag->pagb_lock); - rb_erase(&busyp->rb_node, &pag->pagb_tree); + if (!xfs_alloc_busy_update_extent(mp, pag, busyp, fbno, flen, + userdata)) + goto restart; + } spin_unlock(&pag->pagb_lock); xfs_perag_put(pag); +} + +/* + * For a given extent [fbno, flen], search the busy extent list to find a + * subset of the extent that is not busy. If *rlen is smaller than + * args->minlen no suitable extent could be found, and the higher level + * code needs to force out the log and retry the allocation. + */ +STATIC void +xfs_alloc_busy_trim( + struct xfs_alloc_arg *args, + xfs_agblock_t bno, + xfs_extlen_t len, + xfs_agblock_t *rbno, + xfs_extlen_t *rlen) +{ + xfs_agblock_t fbno; + xfs_extlen_t flen; + struct rb_node *rbp; + + ASSERT(len > 0); + spin_lock(&args->pag->pagb_lock); +restart: + fbno = bno; + flen = len; + rbp = args->pag->pagb_tree.rb_node; + while (rbp && flen >= args->minlen) { + struct xfs_busy_extent *busyp = + rb_entry(rbp, struct xfs_busy_extent, rb_node); + xfs_agblock_t fend = fbno + flen; + xfs_agblock_t bbno = busyp->bno; + xfs_agblock_t bend = bbno + busyp->length; + + if (fend <= bbno) { + rbp = rbp->rb_left; + continue; + } else if (fbno >= bend) { + rbp = rbp->rb_right; + continue; + } + + /* + * If this is a metadata allocation, try to reuse the busy + * extent instead of trimming the allocation. + */ + if (!args->userdata) { + if (!xfs_alloc_busy_update_extent(args->mp, args->pag, + busyp, fbno, flen, + false)) + goto restart; + continue; + } + + if (bbno <= fbno) { + /* start overlap */ + + /* + * Case 1: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +---------+ + * fbno fend + * + * Case 2: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 3: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-------------+ + * fbno fend + * + * Case 4: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------+ + * fbno fend + * + * No unbusy region in extent, return failure. + */ + if (fend <= bend) + goto fail; + + /* + * Case 5: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +----------------------+ + * fbno fend + * + * Case 6: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fbno = bend; + } else if (bend >= fend) { + /* end overlap */ + + /* + * Case 7: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +------------------+ + * fbno fend + * + * Case 8: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +--------------------------+ + * fbno fend + * + * Needs to be trimmed to: + * +-------+ + * fbno fend + */ + fend = bbno; + } else { + /* middle overlap */ + + /* + * Case 9: + * bbno bend + * +BBBBBBBBBBBBBBBBB+ + * +-----------------------------------+ + * fbno fend + * + * Can be trimmed to: + * +-------+ OR +-------+ + * fbno fend fbno fend + * + * Backward allocation leads to significant + * fragmentation of directories, which degrades + * directory performance, therefore we always want to + * choose the option that produces forward allocation + * patterns. + * Preferring the lower bno extent will make the next + * request use "fend" as the start of the next + * allocation; if the segment is no longer busy at + * that point, we'll get a contiguous allocation, but + * even if it is still busy, we will get a forward + * allocation. + * We try to avoid choosing the segment at "bend", + * because that can lead to the next allocation + * taking the segment at "fbno", which would be a + * backward allocation. We only use the segment at + * "fbno" if it is much larger than the current + * requested size, because in that case there's a + * good chance subsequent allocations will be + * contiguous. + */ + if (bbno - fbno >= args->maxlen) { + /* left candidate fits perfect */ + fend = bbno; + } else if (fend - bend >= args->maxlen * 4) { + /* right candidate has enough free space */ + fbno = bend; + } else if (bbno - fbno >= args->minlen) { + /* left candidate fits minimum requirement */ + fend = bbno; + } else { + goto fail; + } + } + + flen = fend - fbno; + } + spin_unlock(&args->pag->pagb_lock); + + if (fbno != bno || flen != len) { + trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, + fbno, flen); + } + *rbno = fbno; + *rlen = flen; + return; +fail: + /* + * Return a zero extent length as failure indications. All callers + * re-check if the trimmed extent satisfies the minlen requirement. + */ + spin_unlock(&args->pag->pagb_lock); + trace_xfs_alloc_busy_trim(args->mp, args->agno, bno, len, fbno, 0); + *rbno = fbno; + *rlen = 0; +} + +static void +xfs_alloc_busy_clear_one( + struct xfs_mount *mp, + struct xfs_perag *pag, + struct xfs_busy_extent *busyp) +{ + if (busyp->length) { + trace_xfs_alloc_busy_clear(mp, busyp->agno, busyp->bno, + busyp->length); + rb_erase(&busyp->rb_node, &pag->pagb_tree); + } + + list_del_init(&busyp->list); kmem_free(busyp); } + +void +xfs_alloc_busy_clear( + struct xfs_mount *mp, + struct list_head *list) +{ + struct xfs_busy_extent *busyp, *n; + struct xfs_perag *pag = NULL; + xfs_agnumber_t agno = NULLAGNUMBER; + + list_for_each_entry_safe(busyp, n, list, list) { + if (busyp->agno != agno) { + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } + pag = xfs_perag_get(mp, busyp->agno); + spin_lock(&pag->pagb_lock); + agno = busyp->agno; + } + + xfs_alloc_busy_clear_one(mp, pag, busyp); + } + + if (pag) { + spin_unlock(&pag->pagb_lock); + xfs_perag_put(pag); + } +} + +/* + * Callback for list_sort to sort busy extents by the AG they reside in. + */ +int +xfs_busy_extent_ag_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + return container_of(a, struct xfs_busy_extent, list)->agno - + container_of(b, struct xfs_busy_extent, list)->agno; +} diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index d0b3bc72005..240ad288f2f 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -140,11 +140,24 @@ xfs_alloc_busy_insert(struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len); void -xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp); +xfs_alloc_busy_clear(struct xfs_mount *mp, struct list_head *list); int xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len); + +void +xfs_alloc_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata); + +int +xfs_busy_extent_ag_cmp(void *priv, struct list_head *a, struct list_head *b); + +static inline void xfs_alloc_busy_sort(struct list_head *list) +{ + list_sort(NULL, list, xfs_busy_extent_ag_cmp); +} + #endif /* __KERNEL__ */ /* diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index 3916925e258..8b469d53599 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c @@ -95,6 +95,8 @@ xfs_allocbt_alloc_block( return 0; } + xfs_alloc_busy_reuse(cur->bc_mp, cur->bc_private.a.agno, bno, 1, false); + xfs_trans_agbtree_delta(cur->bc_tp, 1); new->s = cpu_to_be32(bno); @@ -118,17 +120,6 @@ xfs_allocbt_free_block( if (error) return error; - /* - * Since blocks move to the free list without the coordination used in - * xfs_bmap_finish, we can't allow block to be available for - * reallocation and non-transaction writing (user data) until we know - * that the transaction that moved it to the free list is permanently - * on disk. We track the blocks by declaring these blocks as "busy"; - * the busy list is maintained on a per-ag basis and each transaction - * records which entries should be removed when the iclog commits to - * disk. If a busy block is allocated, the iclog is pushed up to the - * LSN that freed the block. - */ xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); xfs_trans_agbtree_delta(cur->bc_tp, -1); return 0; diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index be628677c28..9a84a85c03b 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -202,7 +202,7 @@ xfs_swap_extents( xfs_inode_t *tip, /* tmp inode */ xfs_swapext_t *sxp) { - xfs_mount_t *mp; + xfs_mount_t *mp = ip->i_mount; xfs_trans_t *tp; xfs_bstat_t *sbp = &sxp->sx_stat; xfs_ifork_t *tempifp, *ifp, *tifp; @@ -212,16 +212,12 @@ xfs_swap_extents( int taforkblks = 0; __uint64_t tmp; - mp = ip->i_mount; - tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); if (!tempifp) { error = XFS_ERROR(ENOMEM); goto out; } - sbp = &sxp->sx_stat; - /* * we have to do two separate lock calls here to keep lockdep * happy. If we try to get all the locks in one call, lock will diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index a37480a6e02..c8e3349c287 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1354,7 +1354,7 @@ xfs_itruncate_start( return 0; } last_byte = xfs_file_last_byte(ip); - trace_xfs_itruncate_start(ip, flags, new_size, toss_start, last_byte); + trace_xfs_itruncate_start(ip, new_size, flags, toss_start, last_byte); if (last_byte > toss_start) { if (flags & XFS_ITRUNC_DEFINITE) { xfs_tosspages(ip, toss_start, @@ -1470,7 +1470,7 @@ xfs_itruncate_finish( * file but the log buffers containing the free and reallocation * don't, then we'd end up with garbage in the blocks being freed. * As long as we make the new_size permanent before actually - * freeing any blocks it doesn't matter if they get writtten to. + * freeing any blocks it doesn't matter if they get written to. * * The callers must signal into us whether or not the size * setting here must be synchronous. There are a few cases diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 576fdfe81d6..09983a3344a 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -970,7 +970,6 @@ xfs_iflush_abort( { xfs_inode_log_item_t *iip = ip->i_itemp; - iip = ip->i_itemp; if (iip) { struct xfs_ail *ailp = iip->ili_item.li_ailp; if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index b612ce4520a..211930246f2 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1449,6 +1449,13 @@ xlog_dealloc_log(xlog_t *log) xlog_cil_destroy(log); + /* + * always need to ensure that the extra buffer does not point to memory + * owned by another log buffer before we free it. + */ + xfs_buf_set_empty(log->l_xbuf, log->l_iclog_size); + xfs_buf_free(log->l_xbuf); + iclog = log->l_iclog; for (i=0; i<log->l_iclog_bufs; i++) { xfs_buf_free(iclog->ic_bp); @@ -1458,7 +1465,6 @@ xlog_dealloc_log(xlog_t *log) } spinlock_destroy(&log->l_icloglock); - xfs_buf_free(log->l_xbuf); log->l_mp->m_log = NULL; kmem_free(log); } /* xlog_dealloc_log */ @@ -3248,13 +3254,6 @@ xfs_log_ticket_get( return ticket; } -xlog_tid_t -xfs_log_get_trans_ident( - struct xfs_trans *tp) -{ - return tp->t_ticket->t_tid; -} - /* * Allocate and initialise a new log ticket. */ diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 3bd3291ef8d..78c9039994a 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -189,8 +189,6 @@ void xlog_iodone(struct xfs_buf *); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); -xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp); - void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_log_vec *log_vector, xfs_lsn_t *commit_lsn, int flags); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 9ca59be0897..7d56e88a3f0 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -361,13 +361,12 @@ xlog_cil_committed( int abort) { struct xfs_cil_ctx *ctx = args; - struct xfs_busy_extent *busyp, *n; xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain, ctx->start_lsn, abort); - list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list) - xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp); + xfs_alloc_busy_sort(&ctx->busy_extents); + xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, &ctx->busy_extents); spin_lock(&ctx->cil->xc_cil_lock); list_del(&ctx->committing); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 5864850e9e3..2d3b6a498d6 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -146,6 +146,8 @@ static inline uint xlog_get_client_id(__be32 i) shutdown */ #define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ +typedef __uint32_t xlog_tid_t; + #ifdef __KERNEL__ /* * Below are states for covering allocation transactions. diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 5cc464a17c9..04142caedb2 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -205,6 +205,35 @@ xlog_bread( } /* + * Read at an offset into the buffer. Returns with the buffer in it's original + * state regardless of the result of the read. + */ +STATIC int +xlog_bread_offset( + xlog_t *log, + xfs_daddr_t blk_no, /* block to read from */ + int nbblks, /* blocks to read */ + xfs_buf_t *bp, + xfs_caddr_t offset) +{ + xfs_caddr_t orig_offset = XFS_BUF_PTR(bp); + int orig_len = bp->b_buffer_length; + int error, error2; + + error = XFS_BUF_SET_PTR(bp, offset, BBTOB(nbblks)); + if (error) + return error; + + error = xlog_bread_noalign(log, blk_no, nbblks, bp); + + /* must reset buffer pointer even on error */ + error2 = XFS_BUF_SET_PTR(bp, orig_offset, orig_len); + if (error) + return error; + return error2; +} + +/* * Write out the buffer at the given block for the given number of blocks. * The buffer is kept locked across the write and is returned locked. * This can only be used for synchronous log writes. @@ -1229,20 +1258,12 @@ xlog_write_log_records( */ ealign = round_down(end_block, sectbb); if (j == 0 && (start_block + endcount > ealign)) { - offset = XFS_BUF_PTR(bp); - balign = BBTOB(ealign - start_block); - error = XFS_BUF_SET_PTR(bp, offset + balign, - BBTOB(sectbb)); + offset = XFS_BUF_PTR(bp) + BBTOB(ealign - start_block); + error = xlog_bread_offset(log, ealign, sectbb, + bp, offset); if (error) break; - error = xlog_bread_noalign(log, ealign, sectbb, bp); - if (error) - break; - - error = XFS_BUF_SET_PTR(bp, offset, bufblks); - if (error) - break; } offset = xlog_align(log, start_block, endcount, bp); @@ -3448,19 +3469,9 @@ xlog_do_recovery_pass( * - order is important. */ wrapped_hblks = hblks - split_hblks; - error = XFS_BUF_SET_PTR(hbp, - offset + BBTOB(split_hblks), - BBTOB(hblks - split_hblks)); - if (error) - goto bread_err2; - - error = xlog_bread_noalign(log, 0, - wrapped_hblks, hbp); - if (error) - goto bread_err2; - - error = XFS_BUF_SET_PTR(hbp, offset, - BBTOB(hblks)); + error = xlog_bread_offset(log, 0, + wrapped_hblks, hbp, + offset + BBTOB(split_hblks)); if (error) goto bread_err2; } @@ -3511,19 +3522,9 @@ xlog_do_recovery_pass( * _first_, then the log start (LR header end) * - order is important. */ - error = XFS_BUF_SET_PTR(dbp, - offset + BBTOB(split_bblks), - BBTOB(bblks - split_bblks)); - if (error) - goto bread_err2; - - error = xlog_bread_noalign(log, wrapped_hblks, - bblks - split_bblks, - dbp); - if (error) - goto bread_err2; - - error = XFS_BUF_SET_PTR(dbp, offset, h_size); + error = xlog_bread_offset(log, 0, + bblks - split_bblks, hbp, + offset + BBTOB(split_bblks)); if (error) goto bread_err2; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bb3f9a7b24e..b49b82363d2 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1900,7 +1900,7 @@ xfs_mod_incore_sb_batch( uint nmsb, int rsvd) { - xfs_mod_sb_t *msbp = &msb[0]; + xfs_mod_sb_t *msbp; int error = 0; /* @@ -1910,7 +1910,7 @@ xfs_mod_incore_sb_batch( * changes will be atomic. */ spin_lock(&mp->m_sb_lock); - for (msbp = &msbp[0]; msbp < (msb + nmsb); msbp++) { + for (msbp = msb; msbp < (msb + nmsb); msbp++) { ASSERT(msbp->msb_field < XFS_SBS_ICOUNT || msbp->msb_field > XFS_SBS_FDBLOCKS); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 76922793f64..d1f24858ccc 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -608,10 +608,8 @@ STATIC void xfs_trans_free( struct xfs_trans *tp) { - struct xfs_busy_extent *busyp, *n; - - list_for_each_entry_safe(busyp, n, &tp->t_busy, list) - xfs_alloc_busy_clear(tp->t_mountp, busyp); + xfs_alloc_busy_sort(&tp->t_busy); + xfs_alloc_busy_clear(tp->t_mountp, &tp->t_busy); atomic_dec(&tp->t_mountp->m_active_trans); xfs_trans_free_dqinfo(tp); diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 26d1867d815..65584b55607 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h @@ -73,8 +73,6 @@ typedef __int32_t xfs_tid_t; /* transaction identifier */ typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ -typedef __uint32_t xlog_tid_t; /* transaction ID type */ - /* * These types are 64 bits on disk but are either 32 or 64 bits in memory. * Disk based types: |