diff options
Diffstat (limited to 'fs/hugetlbfs/inode.c')
| -rw-r--r-- | fs/hugetlbfs/inode.c | 316 |
1 files changed, 196 insertions, 120 deletions
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0aa424a0234..1e2872b2534 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1,11 +1,13 @@ /* * hugetlbpage-backed filesystem. Based on ramfs. * - * William Irwin, 2002 + * Nadia Yvette Chambers, 2002 * * Copyright (C) 2002 Linus Torvalds. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/thread_info.h> #include <asm/current.h> @@ -41,6 +43,25 @@ const struct file_operations hugetlbfs_file_operations; static const struct inode_operations hugetlbfs_dir_inode_operations; static const struct inode_operations hugetlbfs_inode_operations; +struct hugetlbfs_config { + kuid_t uid; + kgid_t gid; + umode_t mode; + long nr_blocks; + long nr_inodes; + struct hstate *hstate; +}; + +struct hugetlbfs_inode_info { + struct shared_policy policy; + struct inode vfs_inode; +}; + +static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) +{ + return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); +} + static struct backing_dev_info hugetlbfs_backing_dev_info = { .name = "hugetlbfs", .ra_pages = 0, /* No readahead */ @@ -78,7 +99,7 @@ static void huge_pagevec_release(struct pagevec *pvec) static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { - struct inode *inode = file->f_path.dentry->d_inode; + struct inode *inode = file_inode(file); loff_t len, vma_len; int ret; struct hstate *h = hstate_file(file); @@ -91,7 +112,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap_pgoff unwinds (may be important on powerpc * and ia64). */ - vma->vm_flags |= VM_HUGETLB | VM_RESERVED; + vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; vma->vm_ops = &hugetlb_vm_ops; if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) @@ -132,8 +153,8 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long start_addr; struct hstate *h = hstate_file(file); + struct vm_unmapped_area_info info; if (len & ~huge_page_mask(h)) return -EINVAL; @@ -154,32 +175,13 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return addr; } - start_addr = mm->free_area_cache; - - if (len <= mm->cached_hole_size) - start_addr = TASK_UNMAPPED_BASE; - -full_search: - addr = ALIGN(start_addr, huge_page_size(h)); - - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (TASK_SIZE - len < addr) { - /* - * Start a new search - just in case we missed - * some holes. - */ - if (start_addr != TASK_UNMAPPED_BASE) { - start_addr = TASK_UNMAPPED_BASE; - goto full_search; - } - return -ENOMEM; - } - - if (!vma || addr + len <= vma->vm_start) - return addr; - addr = ALIGN(vma->vm_end, huge_page_size(h)); - } + info.flags = 0; + info.length = len; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + info.align_mask = PAGE_MASK & ~huge_page_mask(h); + info.align_offset = 0; + return vm_unmapped_area(&info); } #endif @@ -366,22 +368,27 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart) static void hugetlbfs_evict_inode(struct inode *inode) { + struct resv_map *resv_map; + truncate_hugepages(inode, 0); - end_writeback(inode); + resv_map = (struct resv_map *)inode->i_mapping->private_data; + /* root inode doesn't have the resv_map, so we should check it */ + if (resv_map) + resv_map_release(&resv_map->refs); + clear_inode(inode); } static inline void -hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) +hugetlb_vmtruncate_list(struct rb_root *root, pgoff_t pgoff) { struct vm_area_struct *vma; - struct prio_tree_iter iter; - vma_prio_tree_foreach(vma, &iter, root, pgoff, ULONG_MAX) { + vma_interval_tree_foreach(vma, root, pgoff, ULONG_MAX) { unsigned long v_offset; /* * Can the expression below overflow on 32-bit arches? - * No, because the prio_tree returns us only those vmas + * No, because the interval tree returns us only those vmas * which overlap the truncated area starting at pgoff, * and no vma on a 32-bit arch can span beyond the 4GB. */ @@ -390,8 +397,8 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) else v_offset = 0; - __unmap_hugepage_range(vma, - vma->vm_start + v_offset, vma->vm_end, NULL); + unmap_hugepage_range(vma, vma->vm_start + v_offset, + vma->vm_end, NULL); } } @@ -406,7 +413,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) i_size_write(inode, offset); mutex_lock(&mapping->i_mmap_mutex); - if (!prio_tree_empty(&mapping->i_mmap)) + if (!RB_EMPTY_ROOT(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); mutex_unlock(&mapping->i_mmap_mutex); truncate_hugepages(inode, offset); @@ -440,22 +447,60 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) return 0; } -static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, - gid_t gid, int mode, dev_t dev) +static struct inode *hugetlbfs_get_root(struct super_block *sb, + struct hugetlbfs_config *config) +{ + struct inode *inode; + + inode = new_inode(sb); + if (inode) { + struct hugetlbfs_inode_info *info; + inode->i_ino = get_next_ino(); + inode->i_mode = S_IFDIR | config->mode; + inode->i_uid = config->uid; + inode->i_gid = config->gid; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + info = HUGETLBFS_I(inode); + mpol_shared_policy_init(&info->policy, NULL); + inode->i_op = &hugetlbfs_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + /* directory inodes start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + lockdep_annotate_inode_mutex_key(inode); + } + return inode; +} + +/* + * Hugetlbfs is not reclaimable; therefore its i_mmap_mutex will never + * be taken from reclaim -- unlike regular filesystems. This needs an + * annotation because huge_pmd_share() does an allocation under + * i_mmap_mutex. + */ +static struct lock_class_key hugetlbfs_i_mmap_mutex_key; + +static struct inode *hugetlbfs_get_inode(struct super_block *sb, + struct inode *dir, + umode_t mode, dev_t dev) { struct inode *inode; + struct resv_map *resv_map; + + resv_map = resv_map_alloc(); + if (!resv_map) + return NULL; inode = new_inode(sb); if (inode) { struct hugetlbfs_inode_info *info; inode->i_ino = get_next_ino(); - inode->i_mode = mode; - inode->i_uid = uid; - inode->i_gid = gid; + inode_init_owner(inode, dir, mode); + lockdep_set_class(&inode->i_mapping->i_mmap_mutex, + &hugetlbfs_i_mmap_mutex_key); inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - INIT_LIST_HEAD(&inode->i_mapping->private_list); + inode->i_mapping->private_data = resv_map; info = HUGETLBFS_I(inode); /* * The policy is initialized here even if we are creating a @@ -485,7 +530,9 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, break; } lockdep_annotate_inode_mutex_key(inode); - } + } else + kref_put(&resv_map->refs, resv_map_release); + return inode; } @@ -493,20 +540,12 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, * File creation. Allocate an inode, and we're done.. */ static int hugetlbfs_mknod(struct inode *dir, - struct dentry *dentry, int mode, dev_t dev) + struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; int error = -ENOSPC; - gid_t gid; - - if (dir->i_mode & S_ISGID) { - gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else { - gid = current_fsgid(); - } - inode = hugetlbfs_get_inode(dir->i_sb, current_fsuid(), gid, mode, dev); + + inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); if (inode) { dir->i_ctime = dir->i_mtime = CURRENT_TIME; d_instantiate(dentry, inode); @@ -516,7 +555,7 @@ static int hugetlbfs_mknod(struct inode *dir, return error; } -static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0); if (!retval) @@ -524,7 +563,7 @@ static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) return retval; } -static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) +static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0); } @@ -534,15 +573,8 @@ static int hugetlbfs_symlink(struct inode *dir, { struct inode *inode; int error = -ENOSPC; - gid_t gid; - if (dir->i_mode & S_ISGID) - gid = dir->i_gid; - else - gid = current_fsgid(); - - inode = hugetlbfs_get_inode(dir->i_sb, current_fsuid(), - gid, S_IFLNK|S_IRWXUGO, 0); + inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); if (inode) { int l = strlen(symname)+1; error = page_symlink(inode, symname, l); @@ -575,11 +607,11 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, int rc; rc = migrate_huge_page_move_mapping(mapping, newpage, page); - if (rc) + if (rc != MIGRATEPAGE_SUCCESS) return rc; migrate_page_copy(newpage, page); - return 0; + return MIGRATEPAGE_SUCCESS; } static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -670,7 +702,6 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) static void hugetlbfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); } @@ -752,13 +783,17 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) case Opt_uid: if (match_int(&args[0], &option)) goto bad_val; - pconfig->uid = option; + pconfig->uid = make_kuid(current_user_ns(), option); + if (!uid_valid(pconfig->uid)) + goto bad_val; break; case Opt_gid: if (match_int(&args[0], &option)) goto bad_val; - pconfig->gid = option; + pconfig->gid = make_kgid(current_user_ns(), option); + if (!gid_valid(pconfig->gid)) + goto bad_val; break; case Opt_mode: @@ -790,8 +825,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) ps = memparse(args[0].from, &rest); pconfig->hstate = size_to_hstate(ps); if (!pconfig->hstate) { - printk(KERN_ERR - "hugetlbfs: Unsupported page size %lu MB\n", + pr_err("Unsupported page size %lu MB\n", ps >> 20); return -EINVAL; } @@ -799,8 +833,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) } default: - printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n", - p); + pr_err("Bad mount option: \"%s\"\n", p); return -EINVAL; break; } @@ -820,16 +853,13 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) return 0; bad_val: - printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n", - args[0].from, p); + pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p); return -EINVAL; } static int hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) { - struct inode * inode; - struct dentry * root; int ret; struct hugetlbfs_config config; struct hugetlbfs_sb_info *sbinfo; @@ -866,21 +896,12 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = HUGETLBFS_MAGIC; sb->s_op = &hugetlbfs_ops; sb->s_time_gran = 1; - inode = hugetlbfs_get_inode(sb, config.uid, config.gid, - S_IFDIR | config.mode, 0); - if (!inode) - goto out_free; - - root = d_alloc_root(inode); - if (!root) { - iput(inode); + sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config)); + if (!sb->s_root) goto out_free; - } - sb->s_root = root; return 0; out_free: - if (sbinfo->spool) - kfree(sbinfo->spool); + kfree(sbinfo->spool); kfree(sbinfo); return -ENOMEM; } @@ -896,55 +917,82 @@ static struct file_system_type hugetlbfs_fs_type = { .mount = hugetlbfs_mount, .kill_sb = kill_litter_super, }; +MODULE_ALIAS_FS("hugetlbfs"); -static struct vfsmount *hugetlbfs_vfsmount; +static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; static int can_do_hugetlb_shm(void) { - return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); + kgid_t shm_group; + shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group); + return capable(CAP_IPC_LOCK) || in_group_p(shm_group); +} + +static int get_hstate_idx(int page_size_log) +{ + struct hstate *h = hstate_sizelog(page_size_log); + + if (!h) + return -1; + return h - hstates; } +static const struct dentry_operations anon_ops = { + .d_dname = simple_dname +}; + +/* + * Note that size should be aligned to proper hugepage size in caller side, + * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. + */ struct file *hugetlb_file_setup(const char *name, size_t size, - vm_flags_t acctflag, - struct user_struct **user, int creat_flags) + vm_flags_t acctflag, struct user_struct **user, + int creat_flags, int page_size_log) { - int error = -ENOMEM; - struct file *file; + struct file *file = ERR_PTR(-ENOMEM); struct inode *inode; struct path path; - struct dentry *root; + struct super_block *sb; struct qstr quick_string; + int hstate_idx; + + hstate_idx = get_hstate_idx(page_size_log); + if (hstate_idx < 0) + return ERR_PTR(-ENODEV); *user = NULL; - if (!hugetlbfs_vfsmount) + if (!hugetlbfs_vfsmount[hstate_idx]) return ERR_PTR(-ENOENT); if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { *user = current_user(); if (user_shm_lock(size, *user)) { - printk_once(KERN_WARNING "Using mlock ulimits for SHM_HUGETLB is deprecated\n"); + task_lock(current); + pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", + current->comm, current->pid); + task_unlock(current); } else { *user = NULL; return ERR_PTR(-EPERM); } } - root = hugetlbfs_vfsmount->mnt_root; + sb = hugetlbfs_vfsmount[hstate_idx]->mnt_sb; quick_string.name = name; quick_string.len = strlen(quick_string.name); quick_string.hash = 0; - path.dentry = d_alloc(root, &quick_string); + path.dentry = d_alloc_pseudo(sb, &quick_string); if (!path.dentry) goto out_shm_unlock; - path.mnt = mntget(hugetlbfs_vfsmount); - error = -ENOSPC; - inode = hugetlbfs_get_inode(root->d_sb, current_fsuid(), - current_fsgid(), S_IFREG | S_IRWXUGO, 0); + d_set_d_op(path.dentry, &anon_ops); + path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]); + file = ERR_PTR(-ENOSPC); + inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0); if (!inode) goto out_dentry; - error = -ENOMEM; + file = ERR_PTR(-ENOMEM); if (hugetlb_reserve_pages(inode, 0, size >> huge_page_shift(hstate_inode(inode)), NULL, acctflag)) @@ -954,10 +1002,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size, inode->i_size = size; clear_nlink(inode); - error = -ENFILE; file = alloc_file(&path, FMODE_WRITE | FMODE_READ, &hugetlbfs_file_operations); - if (!file) + if (IS_ERR(file)) goto out_dentry; /* inode is already attached */ return file; @@ -971,18 +1018,25 @@ out_shm_unlock: user_shm_unlock(size, *user); *user = NULL; } - return ERR_PTR(error); + return file; } static int __init init_hugetlbfs_fs(void) { + struct hstate *h; int error; - struct vfsmount *vfsmount; + int i; + + if (!hugepages_supported()) { + pr_info("disabling because there are no supported hugepage sizes\n"); + return -ENOTSUPP; + } error = bdi_init(&hugetlbfs_backing_dev_info); if (error) return error; + error = -ENOMEM; hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", sizeof(struct hugetlbfs_inode_info), 0, 0, init_once); @@ -993,18 +1047,29 @@ static int __init init_hugetlbfs_fs(void) if (error) goto out; - vfsmount = kern_mount(&hugetlbfs_fs_type); + i = 0; + for_each_hstate(h) { + char buf[50]; + unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10); - if (!IS_ERR(vfsmount)) { - hugetlbfs_vfsmount = vfsmount; - return 0; - } + snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb); + hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type, + buf); - error = PTR_ERR(vfsmount); + if (IS_ERR(hugetlbfs_vfsmount[i])) { + pr_err("Cannot mount internal hugetlbfs for " + "page size %uK", ps_kb); + error = PTR_ERR(hugetlbfs_vfsmount[i]); + hugetlbfs_vfsmount[i] = NULL; + } + i++; + } + /* Non default hstates are optional */ + if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx])) + return 0; out: - if (error) - kmem_cache_destroy(hugetlbfs_inode_cachep); + kmem_cache_destroy(hugetlbfs_inode_cachep); out2: bdi_destroy(&hugetlbfs_backing_dev_info); return error; @@ -1012,8 +1077,19 @@ static int __init init_hugetlbfs_fs(void) static void __exit exit_hugetlbfs_fs(void) { + struct hstate *h; + int i; + + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(hugetlbfs_inode_cachep); - kern_unmount(hugetlbfs_vfsmount); + i = 0; + for_each_hstate(h) + kern_unmount(hugetlbfs_vfsmount[i++]); unregister_filesystem(&hugetlbfs_fs_type); bdi_destroy(&hugetlbfs_backing_dev_info); } |
