Diffstat (limited to 'fs/super.c')
-rw-r--r--  fs/super.c | 893
1 file changed, 555 insertions(+), 338 deletions(-)
diff --git a/fs/super.c b/fs/super.c index ca696155cd9..d20d5b11ded 100644 --- a/fs/super.c +++ b/fs/super.c @@ -20,7 +20,7 @@ * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000 */ -#include <linux/module.h> +#include <linux/export.h> #include <linux/slab.h> #include <linux/acct.h> #include <linux/blkdev.h> @@ -30,89 +30,107 @@ #include <linux/idr.h> #include <linux/mutex.h> #include <linux/backing-dev.h> +#include <linux/rculist_bl.h> +#include <linux/cleancache.h> +#include <linux/fsnotify.h> +#include <linux/lockdep.h> #include "internal.h" LIST_HEAD(super_blocks); DEFINE_SPINLOCK(sb_lock); -/** - * alloc_super - create new superblock - * @type: filesystem type superblock should belong to - * - * Allocates and initializes a new &struct super_block. alloc_super() - * returns a pointer new superblock or %NULL if allocation had failed. +static char *sb_writers_name[SB_FREEZE_LEVELS] = { + "sb_writers", + "sb_pagefaults", + "sb_internal", +}; + +/* + * One thing we have to be careful of with a per-sb shrinker is that we don't + * drop the last active reference to the superblock from within the shrinker. + * If that happens we could trigger unregistering the shrinker from within the + * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we + * take a passive reference to the superblock to avoid this from occurring. */ -static struct super_block *alloc_super(struct file_system_type *type) +static unsigned long super_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) { - struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); - static const struct super_operations default_op; + struct super_block *sb; + long fs_objects = 0; + long total_objects; + long freed = 0; + long dentries; + long inodes; - if (s) { - if (security_sb_alloc(s)) { - kfree(s); - s = NULL; - goto out; - } -#ifdef CONFIG_SMP - s->s_files = alloc_percpu(struct list_head); - if (!s->s_files) { - security_sb_free(s); - kfree(s); - s = NULL; - goto out; - } else { - int i; + sb = container_of(shrink, struct super_block, s_shrink); - for_each_possible_cpu(i) - INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i)); - } -#else - INIT_LIST_HEAD(&s->s_files); -#endif - INIT_LIST_HEAD(&s->s_instances); - INIT_HLIST_HEAD(&s->s_anon); - INIT_LIST_HEAD(&s->s_inodes); - INIT_LIST_HEAD(&s->s_dentry_lru); - init_rwsem(&s->s_umount); - mutex_init(&s->s_lock); - lockdep_set_class(&s->s_umount, &type->s_umount_key); - /* - * The locking rules for s_lock are up to the - * filesystem. For example ext3fs has different - * lock ordering than usbfs: - */ - lockdep_set_class(&s->s_lock, &type->s_lock_key); - /* - * sget() can have s_umount recursion. - * - * When it cannot find a suitable sb, it allocates a new - * one (this one), and tries again to find a suitable old - * one. - * - * In case that succeeds, it will acquire the s_umount - * lock of the old one. Since these are clearly distrinct - * locks, and this object isn't exposed yet, there's no - * risk of deadlocks. - * - * Annotate this by putting this lock in a different - * subclass. 
- */ - down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); - s->s_count = 1; - atomic_set(&s->s_active, 1); - mutex_init(&s->s_vfs_rename_mutex); - lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); - mutex_init(&s->s_dquot.dqio_mutex); - mutex_init(&s->s_dquot.dqonoff_mutex); - init_rwsem(&s->s_dquot.dqptr_sem); - init_waitqueue_head(&s->s_wait_unfrozen); - s->s_maxbytes = MAX_NON_LFS; - s->s_op = &default_op; - s->s_time_gran = 1000000000; + /* + * Deadlock avoidance. We may hold various FS locks, and we don't want + * to recurse into the FS that called us in clear_inode() and friends.. + */ + if (!(sc->gfp_mask & __GFP_FS)) + return SHRINK_STOP; + + if (!grab_super_passive(sb)) + return SHRINK_STOP; + + if (sb->s_op->nr_cached_objects) + fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid); + + inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid); + dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid); + total_objects = dentries + inodes + fs_objects + 1; + + /* proportion the scan between the caches */ + dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); + inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); + + /* + * prune the dcache first as the icache is pinned by it, then + * prune the icache, followed by the filesystem specific caches + */ + freed = prune_dcache_sb(sb, dentries, sc->nid); + freed += prune_icache_sb(sb, inodes, sc->nid); + + if (fs_objects) { + fs_objects = mult_frac(sc->nr_to_scan, fs_objects, + total_objects); + freed += sb->s_op->free_cached_objects(sb, fs_objects, + sc->nid); } -out: - return s; + + drop_super(sb); + return freed; +} + +static unsigned long super_cache_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct super_block *sb; + long total_objects = 0; + + sb = container_of(shrink, struct super_block, s_shrink); + + /* + * Don't call grab_super_passive as it is a potential + * scalability bottleneck. The counts could get updated + * between super_cache_count and super_cache_scan anyway. + * Call to super_cache_count with shrinker_rwsem held + * ensures the safety of call to list_lru_count_node() and + * s_op->nr_cached_objects(). + */ + if (sb->s_op && sb->s_op->nr_cached_objects) + total_objects = sb->s_op->nr_cached_objects(sb, + sc->nid); + + total_objects += list_lru_count_node(&sb->s_dentry_lru, + sc->nid); + total_objects += list_lru_count_node(&sb->s_inode_lru, + sc->nid); + + total_objects = vfs_pressure_ratio(total_objects); + return total_objects; } /** @@ -121,15 +139,101 @@ out: * * Frees a superblock. */ -static inline void destroy_super(struct super_block *s) +static void destroy_super(struct super_block *s) { -#ifdef CONFIG_SMP - free_percpu(s->s_files); -#endif + int i; + list_lru_destroy(&s->s_dentry_lru); + list_lru_destroy(&s->s_inode_lru); + for (i = 0; i < SB_FREEZE_LEVELS; i++) + percpu_counter_destroy(&s->s_writers.counter[i]); security_sb_free(s); + WARN_ON(!list_empty(&s->s_mounts)); kfree(s->s_subtype); kfree(s->s_options); - kfree(s); + kfree_rcu(s, rcu); +} + +/** + * alloc_super - create new superblock + * @type: filesystem type superblock should belong to + * @flags: the mount flags + * + * Allocates and initializes a new &struct super_block. alloc_super() + * returns a pointer new superblock or %NULL if allocation had failed. 
+ */ +static struct super_block *alloc_super(struct file_system_type *type, int flags) +{ + struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); + static const struct super_operations default_op; + int i; + + if (!s) + return NULL; + + INIT_LIST_HEAD(&s->s_mounts); + + if (security_sb_alloc(s)) + goto fail; + + for (i = 0; i < SB_FREEZE_LEVELS; i++) { + if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0) + goto fail; + lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], + &type->s_writers_key[i], 0); + } + init_waitqueue_head(&s->s_writers.wait); + init_waitqueue_head(&s->s_writers.wait_unfrozen); + s->s_flags = flags; + s->s_bdi = &default_backing_dev_info; + INIT_HLIST_NODE(&s->s_instances); + INIT_HLIST_BL_HEAD(&s->s_anon); + INIT_LIST_HEAD(&s->s_inodes); + + if (list_lru_init(&s->s_dentry_lru)) + goto fail; + if (list_lru_init(&s->s_inode_lru)) + goto fail; + + init_rwsem(&s->s_umount); + lockdep_set_class(&s->s_umount, &type->s_umount_key); + /* + * sget() can have s_umount recursion. + * + * When it cannot find a suitable sb, it allocates a new + * one (this one), and tries again to find a suitable old + * one. + * + * In case that succeeds, it will acquire the s_umount + * lock of the old one. Since these are clearly distrinct + * locks, and this object isn't exposed yet, there's no + * risk of deadlocks. + * + * Annotate this by putting this lock in a different + * subclass. + */ + down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); + s->s_count = 1; + atomic_set(&s->s_active, 1); + mutex_init(&s->s_vfs_rename_mutex); + lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); + mutex_init(&s->s_dquot.dqio_mutex); + mutex_init(&s->s_dquot.dqonoff_mutex); + init_rwsem(&s->s_dquot.dqptr_sem); + s->s_maxbytes = MAX_NON_LFS; + s->s_op = &default_op; + s->s_time_gran = 1000000000; + s->cleancache_poolid = -1; + + s->s_shrink.seeks = DEFAULT_SEEKS; + s->s_shrink.scan_objects = super_cache_scan; + s->s_shrink.count_objects = super_cache_count; + s->s_shrink.batch = 1024; + s->s_shrink.flags = SHRINKER_NUMA_AWARE; + return s; + +fail: + destroy_super(s); + return NULL; } /* Superblock refcounting */ @@ -137,7 +241,7 @@ static inline void destroy_super(struct super_block *s) /* * Drop a superblock's refcount. The caller must hold sb_lock. */ -void __put_super(struct super_block *sb) +static void __put_super(struct super_block *sb) { if (!--sb->s_count) { list_del_init(&sb->s_list); @@ -152,7 +256,7 @@ void __put_super(struct super_block *sb) * Drops a temporary reference, frees superblock if there's no * references left. */ -void put_super(struct super_block *sb) +static void put_super(struct super_block *sb) { spin_lock(&sb_lock); __put_super(sb); @@ -175,7 +279,10 @@ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { + cleancache_invalidate_fs(s); + unregister_shrinker(&s->s_shrink); fs->kill_sb(s); + put_filesystem(fs); put_super(s); } else { @@ -212,41 +319,56 @@ EXPORT_SYMBOL(deactivate_super); * and want to turn it into a full-blown active reference. grab_super() * is called with sb_lock held and drops it. Returns 1 in case of * success, 0 if we had failed (superblock contents was already dead or - * dying when grab_super() had been called). + * dying when grab_super() had been called). Note that this is only + * called for superblocks not in rundown mode (== ones still on ->fs_supers + * of their type), so increment of ->s_count is OK here. 
*/ static int grab_super(struct super_block *s) __releases(sb_lock) { - if (atomic_inc_not_zero(&s->s_active)) { - spin_unlock(&sb_lock); - return 1; - } - /* it's going away */ s->s_count++; spin_unlock(&sb_lock); - /* wait for it to die */ down_write(&s->s_umount); + if ((s->s_flags & MS_BORN) && atomic_inc_not_zero(&s->s_active)) { + put_super(s); + return 1; + } up_write(&s->s_umount); put_super(s); return 0; } /* - * Superblock locking. We really ought to get rid of these two. + * grab_super_passive - acquire a passive reference + * @sb: reference we are trying to grab + * + * Tries to acquire a passive reference. This is used in places where we + * cannot take an active reference but we need to ensure that the + * superblock does not go away while we are working on it. It returns + * false if a reference was not gained, and returns true with the s_umount + * lock held in read mode if a reference is gained. On successful return, + * the caller must drop the s_umount lock and the passive reference when + * done. */ -void lock_super(struct super_block * sb) +bool grab_super_passive(struct super_block *sb) { - get_fs_excl(); - mutex_lock(&sb->s_lock); -} + spin_lock(&sb_lock); + if (hlist_unhashed(&sb->s_instances)) { + spin_unlock(&sb_lock); + return false; + } -void unlock_super(struct super_block * sb) -{ - put_fs_excl(); - mutex_unlock(&sb->s_lock); -} + sb->s_count++; + spin_unlock(&sb_lock); + + if (down_read_trylock(&sb->s_umount)) { + if (sb->s_root && (sb->s_flags & MS_BORN)) + return true; + up_read(&sb->s_umount); + } -EXPORT_SYMBOL(lock_super); -EXPORT_SYMBOL(unlock_super); + put_super(sb); + return false; +} /** * generic_shutdown_super - common helper for ->kill_sb() @@ -266,17 +388,20 @@ void generic_shutdown_super(struct super_block *sb) { const struct super_operations *sop = sb->s_op; - if (sb->s_root) { shrink_dcache_for_umount(sb); sync_filesystem(sb); - get_fs_excl(); sb->s_flags &= ~MS_ACTIVE; fsnotify_unmount_inodes(&sb->s_inodes); evict_inodes(sb); + if (sb->s_dio_done_wq) { + destroy_workqueue(sb->s_dio_done_wq); + sb->s_dio_done_wq = NULL; + } + if (sop->put_super) sop->put_super(sb); @@ -285,11 +410,10 @@ void generic_shutdown_super(struct super_block *sb) "Self-destruct in 5 seconds. 
Have a nice day...\n", sb->s_id); } - put_fs_excl(); } spin_lock(&sb_lock); /* should be initialized for __put_super_and_need_restart() */ - list_del_init(&sb->s_instances); + hlist_del_init(&sb->s_instances); spin_unlock(&sb_lock); up_write(&sb->s_umount); } @@ -301,11 +425,13 @@ EXPORT_SYMBOL(generic_shutdown_super); * @type: filesystem type superblock should belong to * @test: comparison callback * @set: setup callback + * @flags: mount flags * @data: argument to each of them */ struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), + int flags, void *data) { struct super_block *s = NULL; @@ -315,7 +441,7 @@ struct super_block *sget(struct file_system_type *type, retry: spin_lock(&sb_lock); if (test) { - list_for_each_entry(old, &type->fs_supers, s_instances) { + hlist_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; if (!grab_super(old)) @@ -325,17 +451,12 @@ retry: destroy_super(s); s = NULL; } - down_write(&old->s_umount); - if (unlikely(!(old->s_flags & MS_BORN))) { - deactivate_locked_super(old); - goto retry; - } return old; } } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(type); + s = alloc_super(type, flags); if (!s) return ERR_PTR(-ENOMEM); goto retry; @@ -351,9 +472,10 @@ retry: s->s_type = type; strlcpy(s->s_id, type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); - list_add(&s->s_instances, &type->fs_supers); + hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); + register_shrinker(&s->s_shrink); return s; } @@ -368,39 +490,33 @@ void drop_super(struct super_block *sb) EXPORT_SYMBOL(drop_super); /** - * sync_supers - helper for periodic superblock writeback - * - * Call the write_super method if present on all dirty superblocks in - * the system. This is for the periodic writeback used by most older - * filesystems. For data integrity superblock writeback use - * sync_filesystems() instead. + * iterate_supers - call function for all active superblocks + * @f: function to call + * @arg: argument to pass to it * - * Note: check the dirty flag before waiting, so we don't - * hold up the sync while mounting a device. (The newly - * mounted device won't need syncing.) + * Scans the superblock list and calls given function, passing it + * locked superblock and given argument. 
*/ -void sync_supers(void) +void iterate_supers(void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (list_empty(&sb->s_instances)) + if (hlist_unhashed(&sb->s_instances)) continue; - if (sb->s_op->write_super && sb->s_dirt) { - sb->s_count++; - spin_unlock(&sb_lock); + sb->s_count++; + spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root && sb->s_dirt) - sb->s_op->write_super(sb); - up_read(&sb->s_umount); + down_read(&sb->s_umount); + if (sb->s_root && (sb->s_flags & MS_BORN)) + f(sb, arg); + up_read(&sb->s_umount); - spin_lock(&sb_lock); - if (p) - __put_super(p); - p = sb; - } + spin_lock(&sb_lock); + if (p) + __put_super(p); + p = sb; } if (p) __put_super(p); @@ -408,26 +524,26 @@ void sync_supers(void) } /** - * iterate_supers - call function for all active superblocks + * iterate_supers_type - call function for superblocks of given type + * @type: fs type * @f: function to call * @arg: argument to pass to it * * Scans the superblock list and calls given function, passing it * locked superblock and given argument. */ -void iterate_supers(void (*f)(struct super_block *, void *), void *arg) +void iterate_supers_type(struct file_system_type *type, + void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (list_empty(&sb->s_instances)) - continue; + hlist_for_each_entry(sb, &type->fs_supers, s_instances) { sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); - if (sb->s_root) + if (sb->s_root && (sb->s_flags & MS_BORN)) f(sb, arg); up_read(&sb->s_umount); @@ -441,6 +557,8 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) spin_unlock(&sb_lock); } +EXPORT_SYMBOL(iterate_supers_type); + /** * get_super - get the superblock of a device * @bdev: device to get the superblock for @@ -459,14 +577,14 @@ struct super_block *get_super(struct block_device *bdev) spin_lock(&sb_lock); rescan: list_for_each_entry(sb, &super_blocks, s_list) { - if (list_empty(&sb->s_instances)) + if (hlist_unhashed(&sb->s_instances)) continue; if (sb->s_bdev == bdev) { sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); /* still alive? */ - if (sb->s_root) + if (sb->s_root && (sb->s_flags & MS_BORN)) return sb; up_read(&sb->s_umount); /* nope, got unmounted */ @@ -482,6 +600,29 @@ rescan: EXPORT_SYMBOL(get_super); /** + * get_super_thawed - get thawed superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device. The superblock is returned once it is thawed + * (or immediately if it was not frozen). %NULL is returned if no match + * is found. 
+ */ +struct super_block *get_super_thawed(struct block_device *bdev) +{ + while (1) { + struct super_block *s = get_super(bdev); + if (!s || s->s_writers.frozen == SB_UNFROZEN) + return s; + up_read(&s->s_umount); + wait_event(s->s_writers.wait_unfrozen, + s->s_writers.frozen == SB_UNFROZEN); + put_super(s); + } +} +EXPORT_SYMBOL(get_super_thawed); + +/** * get_active_super - get an active reference to the superblock of a device * @bdev: device to get the superblock for * @@ -499,13 +640,13 @@ struct super_block *get_active_super(struct block_device *bdev) restart: spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (list_empty(&sb->s_instances)) + if (hlist_unhashed(&sb->s_instances)) continue; if (sb->s_bdev == bdev) { - if (grab_super(sb)) /* drops sb_lock */ - return sb; - else + if (!grab_super(sb)) goto restart; + up_write(&sb->s_umount); + return sb; } } spin_unlock(&sb_lock); @@ -519,14 +660,14 @@ struct super_block *user_get_super(dev_t dev) spin_lock(&sb_lock); rescan: list_for_each_entry(sb, &super_blocks, s_list) { - if (list_empty(&sb->s_instances)) + if (hlist_unhashed(&sb->s_instances)) continue; if (sb->s_dev == dev) { sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); /* still alive? */ - if (sb->s_root) + if (sb->s_root && (sb->s_flags & MS_BORN)) return sb; up_read(&sb->s_umount); /* nope, got unmounted */ @@ -553,7 +694,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) int retval; int remount_ro; - if (sb->s_frozen != SB_UNFROZEN) + if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; #ifdef CONFIG_BLOCK @@ -564,25 +705,36 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) if (flags & MS_RDONLY) acct_auto_close(sb); shrink_dcache_sb(sb); - sync_filesystem(sb); remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); /* If we are remounting RDONLY and current sb is read/write, make sure there are no rw files opened */ if (remount_ro) { - if (force) - mark_files_ro(sb); - else if (!fs_may_remount_ro(sb)) - return -EBUSY; + if (force) { + sb->s_readonly_remount = 1; + smp_wmb(); + } else { + retval = sb_prepare_remount_readonly(sb); + if (retval) + return retval; + } } if (sb->s_op->remount_fs) { retval = sb->s_op->remount_fs(sb, &flags, data); - if (retval) - return retval; + if (retval) { + if (!force) + goto cancel_readonly; + /* If forced remount, go ahead despite any errors */ + WARN(1, "forced remount of a %s fs returned %i\n", + sb->s_type->name, retval); + } } sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); + /* Needs to be ordered wrt mnt_is_readonly() */ + smp_wmb(); + sb->s_readonly_remount = 0; /* * Some filesystems modify their metadata via some other path than the @@ -595,6 +747,10 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) if (remount_ro && sb->s_bdev) invalidate_bdev(sb->s_bdev); return 0; + +cancel_readonly: + sb->s_readonly_remount = 0; + return retval; } static void do_emergency_remount(struct work_struct *work) @@ -603,12 +759,13 @@ static void do_emergency_remount(struct work_struct *work) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (list_empty(&sb->s_instances)) + if (hlist_unhashed(&sb->s_instances)) continue; sb->s_count++; spin_unlock(&sb_lock); down_write(&sb->s_umount); - if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) { + if (sb->s_root && sb->s_bdev && (sb->s_flags & MS_BORN) && + !(sb->s_flags & MS_RDONLY)) { /* * What lock protects 
sb->s_flags?? */ @@ -645,9 +802,12 @@ void emergency_remount(void) static DEFINE_IDA(unnamed_dev_ida); static DEFINE_SPINLOCK(unnamed_dev_lock);/* protects the above */ -static int unnamed_dev_start = 0; /* don't bother trying below it */ +/* Many userspace utilities consider an FSID of 0 invalid. + * Always return at least 1 from get_anon_bdev. + */ +static int unnamed_dev_start = 1; -int set_anon_super(struct super_block *s, void *data) +int get_anon_bdev(dev_t *p) { int dev; int error; @@ -666,7 +826,7 @@ int set_anon_super(struct super_block *s, void *data) else if (error) return -EAGAIN; - if ((dev & MAX_ID_MASK) == (1 << MINORBITS)) { + if (dev == (1 << MINORBITS)) { spin_lock(&unnamed_dev_lock); ida_remove(&unnamed_dev_ida, dev); if (unnamed_dev_start > dev) @@ -674,24 +834,38 @@ int set_anon_super(struct super_block *s, void *data) spin_unlock(&unnamed_dev_lock); return -EMFILE; } - s->s_dev = MKDEV(0, dev & MINORMASK); - s->s_bdi = &noop_backing_dev_info; + *p = MKDEV(0, dev & MINORMASK); return 0; } +EXPORT_SYMBOL(get_anon_bdev); -EXPORT_SYMBOL(set_anon_super); - -void kill_anon_super(struct super_block *sb) +void free_anon_bdev(dev_t dev) { - int slot = MINOR(sb->s_dev); - - generic_shutdown_super(sb); + int slot = MINOR(dev); spin_lock(&unnamed_dev_lock); ida_remove(&unnamed_dev_ida, slot); if (slot < unnamed_dev_start) unnamed_dev_start = slot; spin_unlock(&unnamed_dev_lock); } +EXPORT_SYMBOL(free_anon_bdev); + +int set_anon_super(struct super_block *s, void *data) +{ + int error = get_anon_bdev(&s->s_dev); + if (!error) + s->s_bdi = &noop_backing_dev_info; + return error; +} + +EXPORT_SYMBOL(set_anon_super); + +void kill_anon_super(struct super_block *sb) +{ + dev_t dev = sb->s_dev; + generic_shutdown_super(sb); + free_anon_bdev(dev); +} EXPORT_SYMBOL(kill_anon_super); @@ -720,13 +894,12 @@ struct dentry *mount_ns(struct file_system_type *fs_type, int flags, { struct super_block *sb; - sb = sget(fs_type, ns_test_super, ns_set_super, data); + sb = sget(fs_type, ns_test_super, ns_set_super, flags, data); if (IS_ERR(sb)) return ERR_CAST(sb); if (!sb->s_root) { int err; - sb->s_flags = flags; err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0); if (err) { deactivate_locked_super(sb); @@ -766,13 +939,13 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, { struct block_device *bdev; struct super_block *s; - fmode_t mode = FMODE_READ; + fmode_t mode = FMODE_READ | FMODE_EXCL; int error = 0; if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE; - bdev = open_bdev_exclusive(dev_name, mode, fs_type); + bdev = blkdev_get_by_path(dev_name, mode, fs_type); if (IS_ERR(bdev)) return ERR_CAST(bdev); @@ -787,7 +960,8 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, error = -EBUSY; goto error_bdev; } - s = sget(fs_type, test_bdev_super, set_bdev_super, bdev); + s = sget(fs_type, test_bdev_super, set_bdev_super, flags | MS_NOSEC, + bdev); mutex_unlock(&bdev->bd_fsfreeze_mutex); if (IS_ERR(s)) goto error_s; @@ -801,18 +975,17 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, /* * s_umount nests inside bd_mutex during - * __invalidate_device(). close_bdev_exclusive() - * acquires bd_mutex and can't be called under - * s_umount. Drop s_umount temporarily. This is safe - * as we're holding an active reference. + * __invalidate_device(). blkdev_put() acquires + * bd_mutex and can't be called under s_umount. Drop + * s_umount temporarily. This is safe as we're + * holding an active reference. 
*/ up_write(&s->s_umount); - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode); down_write(&s->s_umount); } else { char b[BDEVNAME_SIZE]; - s->s_flags = flags; s->s_mode = mode; strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); sb_set_blocksize(s, block_size(bdev)); @@ -831,29 +1004,12 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, error_s: error = PTR_ERR(s); error_bdev: - close_bdev_exclusive(bdev, mode); + blkdev_put(bdev, mode); error: return ERR_PTR(error); } EXPORT_SYMBOL(mount_bdev); -int get_sb_bdev(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int), - struct vfsmount *mnt) -{ - struct dentry *root; - - root = mount_bdev(fs_type, flags, dev_name, data, fill_super); - if (IS_ERR(root)) - return PTR_ERR(root); - mnt->mnt_root = root; - mnt->mnt_sb = root->d_sb; - return 0; -} - -EXPORT_SYMBOL(get_sb_bdev); - void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; @@ -862,7 +1018,8 @@ void kill_block_super(struct super_block *sb) bdev->bd_super = NULL; generic_shutdown_super(sb); sync_blockdev(bdev); - close_bdev_exclusive(bdev, mode); + WARN_ON_ONCE(!(mode & FMODE_EXCL)); + blkdev_put(bdev, mode | FMODE_EXCL); } EXPORT_SYMBOL(kill_block_super); @@ -873,13 +1030,11 @@ struct dentry *mount_nodev(struct file_system_type *fs_type, int (*fill_super)(struct super_block *, void *, int)) { int error; - struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); + struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); - s->s_flags = flags; - error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); @@ -890,22 +1045,6 @@ struct dentry *mount_nodev(struct file_system_type *fs_type, } EXPORT_SYMBOL(mount_nodev); -int get_sb_nodev(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int), - struct vfsmount *mnt) -{ - struct dentry *root; - - root = mount_nodev(fs_type, flags, data, fill_super); - if (IS_ERR(root)) - return PTR_ERR(root); - mnt->mnt_root = root; - mnt->mnt_sb = root->d_sb; - return 0; -} -EXPORT_SYMBOL(get_sb_nodev); - static int compare_single(struct super_block *s, void *p) { return 1; @@ -918,11 +1057,10 @@ struct dentry *mount_single(struct file_system_type *fs_type, struct super_block *s; int error; - s = sget(fs_type, compare_single, set_anon_super, NULL); + s = sget(fs_type, compare_single, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); if (!s->s_root) { - s->s_flags = flags; error = fill_super(s, data, flags & MS_SILENT ? 
1 : 0); if (error) { deactivate_locked_super(s); @@ -936,69 +1074,36 @@ struct dentry *mount_single(struct file_system_type *fs_type, } EXPORT_SYMBOL(mount_single); -int get_sb_single(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int), - struct vfsmount *mnt) +struct dentry * +mount_fs(struct file_system_type *type, int flags, const char *name, void *data) { struct dentry *root; - root = mount_single(fs_type, flags, data, fill_super); - if (IS_ERR(root)) - return PTR_ERR(root); - mnt->mnt_root = root; - mnt->mnt_sb = root->d_sb; - return 0; -} - -EXPORT_SYMBOL(get_sb_single); - -struct vfsmount * -vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) -{ - struct vfsmount *mnt; - struct dentry *root; + struct super_block *sb; char *secdata = NULL; - int error; - - if (!type) - return ERR_PTR(-ENODEV); - - error = -ENOMEM; - mnt = alloc_vfsmnt(name); - if (!mnt) - goto out; - - if (flags & MS_KERNMOUNT) - mnt->mnt_flags = MNT_INTERNAL; + int error = -ENOMEM; if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) { secdata = alloc_secdata(); if (!secdata) - goto out_mnt; + goto out; error = security_sb_copy_data(data, secdata); if (error) goto out_free_secdata; } - if (type->mount) { - root = type->mount(type, flags, name, data); - if (IS_ERR(root)) { - error = PTR_ERR(root); - goto out_free_secdata; - } - mnt->mnt_root = root; - mnt->mnt_sb = root->d_sb; - } else { - error = type->get_sb(type, flags, name, data, mnt); - if (error < 0) - goto out_free_secdata; + root = type->mount(type, flags, name, data); + if (IS_ERR(root)) { + error = PTR_ERR(root); + goto out_free_secdata; } - BUG_ON(!mnt->mnt_sb); - WARN_ON(!mnt->mnt_sb->s_bdi); - mnt->mnt_sb->s_flags |= MS_BORN; + sb = root->d_sb; + BUG_ON(!sb); + WARN_ON(!sb->s_bdi); + WARN_ON(sb->s_bdi == &default_backing_dev_info); + sb->s_flags |= MS_BORN; - error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); + error = security_sb_kern_mount(sb, flags, secdata); if (error) goto out_sb; @@ -1006,29 +1111,136 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE * but s_maxbytes was an unsigned long long for many releases. Throw * this warning for a little while to try and catch filesystems that - * violate this rule. This warning should be either removed or - * converted to a BUG() in 2.6.34. + * violate this rule. */ - WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " - "negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes); + WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " + "negative value (%lld)\n", type->name, sb->s_maxbytes); - mnt->mnt_mountpoint = mnt->mnt_root; - mnt->mnt_parent = mnt; - up_write(&mnt->mnt_sb->s_umount); + up_write(&sb->s_umount); free_secdata(secdata); - return mnt; + return root; out_sb: - dput(mnt->mnt_root); - deactivate_locked_super(mnt->mnt_sb); + dput(root); + deactivate_locked_super(sb); out_free_secdata: free_secdata(secdata); -out_mnt: - free_vfsmnt(mnt); out: return ERR_PTR(error); } -EXPORT_SYMBOL_GPL(vfs_kern_mount); +/* + * This is an internal function, please use sb_end_{write,pagefault,intwrite} + * instead. + */ +void __sb_end_write(struct super_block *sb, int level) +{ + percpu_counter_dec(&sb->s_writers.counter[level-1]); + /* + * Make sure s_writers are updated before we wake up waiters in + * freeze_super(). 
+ */ + smp_mb(); + if (waitqueue_active(&sb->s_writers.wait)) + wake_up(&sb->s_writers.wait); + rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_); +} +EXPORT_SYMBOL(__sb_end_write); + +#ifdef CONFIG_LOCKDEP +/* + * We want lockdep to tell us about possible deadlocks with freezing but + * it's it bit tricky to properly instrument it. Getting a freeze protection + * works as getting a read lock but there are subtle problems. XFS for example + * gets freeze protection on internal level twice in some cases, which is OK + * only because we already hold a freeze protection also on higher level. Due + * to these cases we have to tell lockdep we are doing trylock when we + * already hold a freeze protection for a higher freeze level. + */ +static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock, + unsigned long ip) +{ + int i; + + if (!trylock) { + for (i = 0; i < level - 1; i++) + if (lock_is_held(&sb->s_writers.lock_map[i])) { + trylock = true; + break; + } + } + rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip); +} +#endif + +/* + * This is an internal function, please use sb_start_{write,pagefault,intwrite} + * instead. + */ +int __sb_start_write(struct super_block *sb, int level, bool wait) +{ +retry: + if (unlikely(sb->s_writers.frozen >= level)) { + if (!wait) + return 0; + wait_event(sb->s_writers.wait_unfrozen, + sb->s_writers.frozen < level); + } + +#ifdef CONFIG_LOCKDEP + acquire_freeze_lock(sb, level, !wait, _RET_IP_); +#endif + percpu_counter_inc(&sb->s_writers.counter[level-1]); + /* + * Make sure counter is updated before we check for frozen. + * freeze_super() first sets frozen and then checks the counter. + */ + smp_mb(); + if (unlikely(sb->s_writers.frozen >= level)) { + __sb_end_write(sb, level); + goto retry; + } + return 1; +} +EXPORT_SYMBOL(__sb_start_write); + +/** + * sb_wait_write - wait until all writers to given file system finish + * @sb: the super for which we wait + * @level: type of writers we wait for (normal vs page fault) + * + * This function waits until there are no writers of given type to given file + * system. Caller of this function should make sure there can be no new writers + * of type @level before calling this function. Otherwise this function can + * livelock. + */ +static void sb_wait_write(struct super_block *sb, int level) +{ + s64 writers; + + /* + * We just cycle-through lockdep here so that it does not complain + * about returning with lock to userspace + */ + rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_); + rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_); + + do { + DEFINE_WAIT(wait); + + /* + * We use a barrier in prepare_to_wait() to separate setting + * of frozen and checking of the counter + */ + prepare_to_wait(&sb->s_writers.wait, &wait, + TASK_UNINTERRUPTIBLE); + + writers = percpu_counter_sum(&sb->s_writers.counter[level-1]); + if (writers) + schedule(); + + finish_wait(&sb->s_writers.wait, &wait); + } while (writers); +} /** * freeze_super - lock the filesystem and force it into a consistent state @@ -1037,6 +1249,31 @@ EXPORT_SYMBOL_GPL(vfs_kern_mount); * Syncs the super to make sure the filesystem is consistent and calls the fs's * freeze_fs. Subsequent calls to this without first thawing the fs will return * -EBUSY. + * + * During this function, sb->s_writers.frozen goes through these values: + * + * SB_UNFROZEN: File system is normal, all writes progress as usual. + * + * SB_FREEZE_WRITE: The file system is in the process of being frozen. 
New + * writes should be blocked, though page faults are still allowed. We wait for + * all writes to complete and then proceed to the next stage. + * + * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked + * but internal fs threads can still modify the filesystem (although they + * should not dirty new pages or inodes), writeback can run etc. After waiting + * for all running page faults we sync the filesystem which will clean all + * dirty pages and inodes (no new dirty pages or inodes can be created when + * sync is running). + * + * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs + * modification are blocked (e.g. XFS preallocation truncation on inode + * reclaim). This is usually implemented by blocking new transactions for + * filesystems that have them and need this additional guard. After all + * internal writers are finished we call ->freeze_fs() to finish filesystem + * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is + * mostly auxiliary for filesystems to verify they do not modify frozen fs. + * + * sb->s_writers.frozen is protected by sb->s_umount. */ int freeze_super(struct super_block *sb) { @@ -1044,37 +1281,64 @@ int freeze_super(struct super_block *sb) atomic_inc(&sb->s_active); down_write(&sb->s_umount); - if (sb->s_frozen) { + if (sb->s_writers.frozen != SB_UNFROZEN) { deactivate_locked_super(sb); return -EBUSY; } + if (!(sb->s_flags & MS_BORN)) { + up_write(&sb->s_umount); + return 0; /* sic - it's "nothing to do" */ + } + if (sb->s_flags & MS_RDONLY) { - sb->s_frozen = SB_FREEZE_TRANS; - smp_wmb(); + /* Nothing to do really... */ + sb->s_writers.frozen = SB_FREEZE_COMPLETE; up_write(&sb->s_umount); return 0; } - sb->s_frozen = SB_FREEZE_WRITE; + /* From now on, no new normal writers can start */ + sb->s_writers.frozen = SB_FREEZE_WRITE; smp_wmb(); + /* Release s_umount to preserve sb_start_write -> s_umount ordering */ + up_write(&sb->s_umount); + + sb_wait_write(sb, SB_FREEZE_WRITE); + + /* Now we go and block page faults... */ + down_write(&sb->s_umount); + sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; + smp_wmb(); + + sb_wait_write(sb, SB_FREEZE_PAGEFAULT); + + /* All writers are done so after syncing there won't be dirty data */ sync_filesystem(sb); - sb->s_frozen = SB_FREEZE_TRANS; + /* Now wait for internal filesystem counter */ + sb->s_writers.frozen = SB_FREEZE_FS; smp_wmb(); + sb_wait_write(sb, SB_FREEZE_FS); - sync_blockdev(sb->s_bdev); if (sb->s_op->freeze_fs) { ret = sb->s_op->freeze_fs(sb); if (ret) { printk(KERN_ERR "VFS:Filesystem freeze failed\n"); - sb->s_frozen = SB_UNFROZEN; + sb->s_writers.frozen = SB_UNFROZEN; + smp_wmb(); + wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return ret; } } + /* + * This is just for debugging purposes so that fs can warn if it + * sees write activity when frozen is set to SB_FREEZE_COMPLETE. 
+ */ + sb->s_writers.frozen = SB_FREEZE_COMPLETE; up_write(&sb->s_umount); return 0; } @@ -1091,7 +1355,7 @@ int thaw_super(struct super_block *sb) int error; down_write(&sb->s_umount); - if (sb->s_frozen == SB_UNFROZEN) { + if (sb->s_writers.frozen == SB_UNFROZEN) { up_write(&sb->s_umount); return -EINVAL; } @@ -1104,64 +1368,17 @@ int thaw_super(struct super_block *sb) if (error) { printk(KERN_ERR "VFS:Filesystem thaw failed\n"); - sb->s_frozen = SB_FREEZE_TRANS; up_write(&sb->s_umount); return error; } } out: - sb->s_frozen = SB_UNFROZEN; + sb->s_writers.frozen = SB_UNFROZEN; smp_wmb(); - wake_up(&sb->s_wait_unfrozen); + wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return 0; } EXPORT_SYMBOL(thaw_super); - -static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) -{ - int err; - const char *subtype = strchr(fstype, '.'); - if (subtype) { - subtype++; - err = -EINVAL; - if (!subtype[0]) - goto err; - } else - subtype = ""; - - mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL); - err = -ENOMEM; - if (!mnt->mnt_sb->s_subtype) - goto err; - return mnt; - - err: - mntput(mnt); - return ERR_PTR(err); -} - -struct vfsmount * -do_kern_mount(const char *fstype, int flags, const char *name, void *data) -{ - struct file_system_type *type = get_fs_type(fstype); - struct vfsmount *mnt; - if (!type) - return ERR_PTR(-ENODEV); - mnt = vfs_kern_mount(type, flags, name, data); - if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) && - !mnt->mnt_sb->s_subtype) - mnt = fs_set_subtype(mnt, fstype); - put_filesystem(type); - return mnt; -} -EXPORT_SYMBOL_GPL(do_kern_mount); - -struct vfsmount *kern_mount_data(struct file_system_type *type, void *data) -{ - return vfs_kern_mount(type, MS_KERNMOUNT, type->name, data); -} - -EXPORT_SYMBOL_GPL(kern_mount_data); |
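
For context on the sget() interface change in the hunks above: sget() now takes the mount flags and stores them in s_flags during allocation, which is why the per-caller "s->s_flags = flags;" assignments in mount_ns(), mount_bdev(), mount_nodev() and mount_single() are removed. Below is a minimal sketch of a mount path written against the new calling convention; the filesystem name "examplefs", the magic value and the fill_super body are hypothetical illustrations, not part of this patch.

#include <linux/fs.h>
#include <linux/module.h>

/* Hypothetical fill_super, for illustration only. */
static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
{
	struct inode *root;

	sb->s_maxbytes = MAX_LFS_FILESIZE;
	sb->s_blocksize = PAGE_SIZE;
	sb->s_blocksize_bits = PAGE_SHIFT;
	sb->s_magic = 0x4578616d;		/* arbitrary example value */
	sb->s_op = &simple_super_operations;	/* exported by fs/libfs.c */

	root = new_inode(sb);
	if (!root)
		return -ENOMEM;
	root->i_ino = 1;
	root->i_mode = S_IFDIR | 0755;
	root->i_op = &simple_dir_inode_operations;
	root->i_fop = &simple_dir_operations;
	sb->s_root = d_make_root(root);
	return sb->s_root ? 0 : -ENOMEM;
}

static struct dentry *examplefs_mount(struct file_system_type *fs_type,
				      int flags, const char *dev_name,
				      void *data)
{
	/*
	 * mount_nodev() forwards @flags to sget(), which now sets
	 * sb->s_flags itself; callers no longer assign it after allocation.
	 */
	return mount_nodev(fs_type, flags, data, examplefs_fill_super);
}

static struct file_system_type examplefs_type = {
	.owner		= THIS_MODULE,
	.name		= "examplefs",
	.mount		= examplefs_mount,
	.kill_sb	= kill_anon_super,
};

static int __init examplefs_init(void)
{
	return register_filesystem(&examplefs_type);
}

static void __exit examplefs_exit(void)
{
	unregister_filesystem(&examplefs_type);
}

module_init(examplefs_init);
module_exit(examplefs_exit);
MODULE_LICENSE("GPL");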
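
The s_writers machinery introduced above backs the sb_start_{write,pagefault,intwrite}() / sb_end_{write,pagefault,intwrite}() helpers that the new comments refer to. A hedged sketch of taking SB_FREEZE_WRITE protection around a metadata update, so that freeze_super() can wait the writer out in sb_wait_write(); the inode-timestamp body is only a stand-in for "something that dirties the filesystem".

#include <linux/fs.h>

/*
 * Illustrative only: code that dirties the filesystem brackets itself with
 * sb_start_write()/sb_end_write().  freeze_super() sets s_writers.frozen to
 * SB_FREEZE_WRITE and then waits in sb_wait_write() until all such sections
 * have finished; new writers block in __sb_start_write() until the
 * filesystem is thawed.
 */
static void examplefs_touch_inode(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	sb_start_write(sb);			/* may sleep while frozen */
	inode->i_mtime = inode->i_ctime = current_fs_time(sb);
	mark_inode_dirty(inode);
	sb_end_write(sb);
}

Page-fault paths use the SB_FREEZE_PAGEFAULT level (sb_start_pagefault()) and filesystem-internal writers use SB_FREEZE_FS (sb_start_intwrite()), matching the three stages freeze_super() walks through in the hunk above.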
