diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/block_dev.c | 3 | ||||
-rw-r--r-- | fs/btrfs/disk-io.c | 5 | ||||
-rw-r--r-- | fs/direct-io.c | 57 | ||||
-rw-r--r-- | fs/eventpoll.c | 234 | ||||
-rw-r--r-- | fs/hugetlbfs/inode.c | 3 | ||||
-rw-r--r-- | fs/nfs/internal.h | 2 | ||||
-rw-r--r-- | fs/nfs/write.c | 4 | ||||
-rw-r--r-- | fs/pipe.c | 2 | ||||
-rw-r--r-- | fs/proc/array.c | 7 | ||||
-rw-r--r-- | fs/proc/base.c | 2 |
10 files changed, 269 insertions, 50 deletions
diff --git a/fs/block_dev.c b/fs/block_dev.c index afe74dda632..0e575d1304b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1139,6 +1139,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { bdev->bd_disk = disk; + bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; if (!partno) { struct backing_dev_info *bdi; @@ -1159,6 +1160,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; + bdev->bd_queue = NULL; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); put_disk(disk); @@ -1232,6 +1234,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_disk = NULL; bdev->bd_part = NULL; + bdev->bd_queue = NULL; bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info); if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f99a099a774..d8525662ca7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -872,7 +872,8 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, #ifdef CONFIG_MIGRATION static int btree_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, + enum migrate_mode mode) { /* * we can't safely write a btree page from here, @@ -887,7 +888,7 @@ static int btree_migratepage(struct address_space *mapping, if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, mode); } #endif diff --git a/fs/direct-io.c b/fs/direct-io.c index d740ab67ff6..4a588dbd11b 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -36,6 +36,7 @@ #include <linux/rwsem.h> #include <linux/uio.h> #include <linux/atomic.h> +#include <linux/prefetch.h> /* * How many user pages to map in one call to get_user_pages(). This determines @@ -580,9 +581,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, { int ret; sector_t fs_startblk; /* Into file, in filesystem-sized blocks */ + sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ unsigned long fs_count; /* Number of filesystem-sized blocks */ - unsigned long dio_count;/* Number of dio_block-sized blocks */ - unsigned long blkmask; int create; /* @@ -593,11 +593,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, if (ret == 0) { BUG_ON(sdio->block_in_file >= sdio->final_block_in_request); fs_startblk = sdio->block_in_file >> sdio->blkfactor; - dio_count = sdio->final_block_in_request - sdio->block_in_file; - fs_count = dio_count >> sdio->blkfactor; - blkmask = (1 << sdio->blkfactor) - 1; - if (dio_count & blkmask) - fs_count++; + fs_endblk = (sdio->final_block_in_request - 1) >> + sdio->blkfactor; + fs_count = fs_endblk - fs_startblk + 1; map_bh->b_state = 0; map_bh->b_size = fs_count << dio->inode->i_blkbits; @@ -1090,8 +1088,8 @@ static inline int drop_refcount(struct dio *dio) * individual fields and will generate much worse code. This is important * for the whole file. */ -ssize_t -__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, +static inline ssize_t +do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) @@ -1100,7 +1098,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, size_t size; unsigned long addr; unsigned blkbits = inode->i_blkbits; - unsigned bdev_blkbits = 0; unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; loff_t end = offset; @@ -1113,12 +1110,14 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (rw & WRITE) rw = WRITE_ODIRECT; - if (bdev) - bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); + /* + * Avoid references to bdev if not absolutely needed to give + * the early prefetch in the caller enough time. + */ if (offset & blocksize_mask) { if (bdev) - blkbits = bdev_blkbits; + blkbits = blksize_bits(bdev_logical_block_size(bdev)); blocksize_mask = (1 << blkbits) - 1; if (offset & blocksize_mask) goto out; @@ -1129,11 +1128,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, addr = (unsigned long)iov[seg].iov_base; size = iov[seg].iov_len; end += size; - if ((addr & blocksize_mask) || (size & blocksize_mask)) { + if (unlikely((addr & blocksize_mask) || + (size & blocksize_mask))) { if (bdev) - blkbits = bdev_blkbits; + blkbits = blksize_bits( + bdev_logical_block_size(bdev)); blocksize_mask = (1 << blkbits) - 1; - if ((addr & blocksize_mask) || (size & blocksize_mask)) + if ((addr & blocksize_mask) || (size & blocksize_mask)) goto out; } } @@ -1316,6 +1317,30 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, out: return retval; } + +ssize_t +__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, const struct iovec *iov, loff_t offset, + unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int flags) +{ + /* + * The block device state is needed in the end to finally + * submit everything. Since it's likely to be cache cold + * prefetch it here as first thing to hide some of the + * latency. + * + * Attempt to prefetch the pieces we likely need later. + */ + prefetch(&bdev->bd_disk->part_tbl); + prefetch(bdev->bd_queue); + prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES); + + return do_blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_block, end_io, + submit_io, flags); +} + EXPORT_SYMBOL(__blockdev_direct_IO); static __init int dio_init(void) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 828e750af23..aabdfc38cf2 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -197,6 +197,12 @@ struct eventpoll { /* The user that created the eventpoll descriptor */ struct user_struct *user; + + struct file *file; + + /* used to optimize loop detection check */ + int visited; + struct list_head visited_list_link; }; /* Wait structure used by the poll hooks */ @@ -255,6 +261,15 @@ static struct kmem_cache *epi_cache __read_mostly; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __read_mostly; +/* Visited nodes during ep_loop_check(), so we can unset them when we finish */ +static LIST_HEAD(visited_list); + +/* + * List of files with newly added links, where we may need to limit the number + * of emanating paths. Protected by the epmutex. + */ +static LIST_HEAD(tfile_check_list); + #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> @@ -276,6 +291,12 @@ ctl_table epoll_table[] = { }; #endif /* CONFIG_SYSCTL */ +static const struct file_operations eventpoll_fops; + +static inline int is_file_epoll(struct file *f) +{ + return f->f_op == &eventpoll_fops; +} /* Setup the structure that is used as key for the RB tree */ static inline void ep_set_ffd(struct epoll_filefd *ffd, @@ -711,12 +732,6 @@ static const struct file_operations eventpoll_fops = { .llseek = noop_llseek, }; -/* Fast test to see if the file is an eventpoll file */ -static inline int is_file_epoll(struct file *f) -{ - return f->f_op == &eventpoll_fops; -} - /* * This is called from eventpoll_release() to unlink files from the eventpoll * interface. We need to have this facility to cleanup correctly files that are @@ -926,6 +941,99 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) rb_insert_color(&epi->rbn, &ep->rbr); } + + +#define PATH_ARR_SIZE 5 +/* + * These are the number paths of length 1 to 5, that we are allowing to emanate + * from a single file of interest. For example, we allow 1000 paths of length + * 1, to emanate from each file of interest. This essentially represents the + * potential wakeup paths, which need to be limited in order to avoid massive + * uncontrolled wakeup storms. The common use case should be a single ep which + * is connected to n file sources. In this case each file source has 1 path + * of length 1. Thus, the numbers below should be more than sufficient. These + * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify + * and delete can't add additional paths. Protected by the epmutex. + */ +static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; +static int path_count[PATH_ARR_SIZE]; + +static int path_count_inc(int nests) +{ + if (++path_count[nests] > path_limits[nests]) + return -1; + return 0; +} + +static void path_count_init(void) +{ + int i; + + for (i = 0; i < PATH_ARR_SIZE; i++) + path_count[i] = 0; +} + +static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) +{ + int error = 0; + struct file *file = priv; + struct file *child_file; + struct epitem *epi; + + list_for_each_entry(epi, &file->f_ep_links, fllink) { + child_file = epi->ep->file; + if (is_file_epoll(child_file)) { + if (list_empty(&child_file->f_ep_links)) { + if (path_count_inc(call_nests)) { + error = -1; + break; + } + } else { + error = ep_call_nested(&poll_loop_ncalls, + EP_MAX_NESTS, + reverse_path_check_proc, + child_file, child_file, + current); + } + if (error != 0) + break; + } else { + printk(KERN_ERR "reverse_path_check_proc: " + "file is not an ep!\n"); + } + } + return error; +} + +/** + * reverse_path_check - The tfile_check_list is list of file *, which have + * links that are proposed to be newly added. We need to + * make sure that those added links don't add too many + * paths such that we will spend all our time waking up + * eventpoll objects. + * + * Returns: Returns zero if the proposed links don't create too many paths, + * -1 otherwise. + */ +static int reverse_path_check(void) +{ + int length = 0; + int error = 0; + struct file *current_file; + + /* let's call this for all tfiles */ + list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { + length++; + path_count_init(); + error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + reverse_path_check_proc, current_file, + current_file, current); + if (error) + break; + } + return error; +} + /* * Must be called with "mtx" held. */ @@ -987,6 +1095,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, */ ep_rbtree_insert(ep, epi); + /* now check if we've created too many backpaths */ + error = -EINVAL; + if (reverse_path_check()) + goto error_remove_epi; + /* We have to drop the new item inside our item list to keep track of it */ spin_lock_irqsave(&ep->lock, flags); @@ -1011,6 +1124,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, return 0; +error_remove_epi: + spin_lock(&tfile->f_lock); + if (ep_is_linked(&epi->fllink)) + list_del_init(&epi->fllink); + spin_unlock(&tfile->f_lock); + + rb_erase(&epi->rbn, &ep->rbr); + error_unregister: ep_unregister_pollwait(ep, epi); @@ -1275,18 +1396,36 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) int error = 0; struct file *file = priv; struct eventpoll *ep = file->private_data; + struct eventpoll *ep_tovisit; struct rb_node *rbp; struct epitem *epi; mutex_lock_nested(&ep->mtx, call_nests + 1); + ep->visited = 1; + list_add(&ep->visited_list_link, &visited_list); for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { + ep_tovisit = epi->ffd.file->private_data; + if (ep_tovisit->visited) + continue; error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, - ep_loop_check_proc, epi->ffd.file, - epi->ffd.file->private_data, current); + ep_loop_check_proc, epi->ffd.file, + ep_tovisit, current); if (error != 0) break; + } else { + /* + * If we've reached a file that is not associated with + * an ep, then we need to check if the newly added + * links are going to add too many wakeup paths. We do + * this by adding it to the tfile_check_list, if it's + * not already there, and calling reverse_path_check() + * during ep_insert(). + */ + if (list_empty(&epi->ffd.file->f_tfile_llink)) + list_add(&epi->ffd.file->f_tfile_llink, + &tfile_check_list); } } mutex_unlock(&ep->mtx); @@ -1307,8 +1446,31 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) */ static int ep_loop_check(struct eventpoll *ep, struct file *file) { - return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + int ret; + struct eventpoll *ep_cur, *ep_next; + + ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, ep_loop_check_proc, file, ep, current); + /* clear visited list */ + list_for_each_entry_safe(ep_cur, ep_next, &visited_list, + visited_list_link) { + ep_cur->visited = 0; + list_del(&ep_cur->visited_list_link); + } + return ret; +} + +static void clear_tfile_check_list(void) +{ + struct file *file; + + /* first clear the tfile_check_list */ + while (!list_empty(&tfile_check_list)) { + file = list_first_entry(&tfile_check_list, struct file, + f_tfile_llink); + list_del_init(&file->f_tfile_llink); + } + INIT_LIST_HEAD(&tfile_check_list); } /* @@ -1316,8 +1478,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file) */ SYSCALL_DEFINE1(epoll_create1, int, flags) { - int error; + int error, fd; struct eventpoll *ep = NULL; + struct file *file; /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); @@ -1334,11 +1497,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. */ - error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, + fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); + if (fd < 0) { + error = fd; + goto out_free_ep; + } + file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, O_RDWR | (flags & O_CLOEXEC)); - if (error < 0) - ep_free(ep); - + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto out_free_fd; + } + fd_install(fd, file); + ep->file = file; + return fd; + +out_free_fd: + put_unused_fd(fd); +out_free_ep: + ep_free(ep); return error; } @@ -1404,21 +1581,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, /* * When we insert an epoll file descriptor, inside another epoll file * descriptor, there is the change of creating closed loops, which are - * better be handled here, than in more critical paths. + * better be handled here, than in more critical paths. While we are + * checking for loops we also determine the list of files reachable + * and hang them on the tfile_check_list, so we can check that we + * haven't created too many possible wakeup paths. * - * We hold epmutex across the loop check and the insert in this case, in - * order to prevent two separate inserts from racing and each doing the - * insert "at the same time" such that ep_loop_check passes on both - * before either one does the insert, thereby creating a cycle. + * We need to hold the epmutex across both ep_insert and ep_remove + * b/c we want to make sure we are looking at a coherent view of + * epoll network. */ - if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { + if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) { mutex_lock(&epmutex); did_lock_epmutex = 1; - error = -ELOOP; - if (ep_loop_check(ep, tfile) != 0) - goto error_tgt_fput; } - + if (op == EPOLL_CTL_ADD) { + if (is_file_epoll(tfile)) { + error = -ELOOP; + if (ep_loop_check(ep, tfile) != 0) + goto error_tgt_fput; + } else + list_add(&tfile->f_tfile_llink, &tfile_check_list); + } mutex_lock_nested(&ep->mtx, 0); @@ -1437,6 +1620,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, error = ep_insert(ep, &epds, tfile, fd); } else error = -EEXIST; + clear_tfile_check_list(); break; case EPOLL_CTL_DEL: if (epi) @@ -1455,7 +1639,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, mutex_unlock(&ep->mtx); error_tgt_fput: - if (unlikely(did_lock_epmutex)) + if (did_lock_epmutex) mutex_unlock(&epmutex); fput(tfile); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index e425ad9d049..1e85a7ac021 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -583,7 +583,8 @@ static int hugetlbfs_set_page_dirty(struct page *page) } static int hugetlbfs_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page) + struct page *newpage, struct page *page, + enum migrate_mode mode) { int rc; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5ee92538b06..8102db9b926 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -332,7 +332,7 @@ void nfs_commit_release_pages(struct nfs_write_data *data); #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, - struct page *, struct page *); + struct page *, struct page *, enum migrate_mode); #else #define nfs_migrate_page NULL #endif diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0c3885255f9..834f0fe96f8 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1688,7 +1688,7 @@ out_error: #ifdef CONFIG_MIGRATION int nfs_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page) + struct page *page, enum migrate_mode mode) { /* * If PagePrivate is set, then the page is currently associated with @@ -1703,7 +1703,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage, nfs_fscache_release_page(page, GFP_KERNEL); - return migrate_page(mapping, newpage, page); + return migrate_page(mapping, newpage, page, mode); } #endif diff --git a/fs/pipe.c b/fs/pipe.c index f0e485d54e6..a932ced92a1 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1137,7 +1137,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) if (nr_pages < pipe->nrbufs) return -EBUSY; - bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL); + bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN); if (unlikely(!bufs)) return -ENOMEM; diff --git a/fs/proc/array.c b/fs/proc/array.c index 8c344f037bd..9252ee3b71e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -464,7 +464,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ -%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n", +%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n", pid_nr_ns(pid, ns), tcomm, state, @@ -511,7 +511,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, task->policy, (unsigned long long)delayacct_blkio_ticks(task), cputime_to_clock_t(gtime), - cputime_to_clock_t(cgtime)); + cputime_to_clock_t(cgtime), + (mm && permitted) ? mm->start_data : 0, + (mm && permitted) ? mm->end_data : 0, + (mm && permitted) ? mm->start_brk : 0); if (mm) mmput(mm); return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index 8173dfd89cb..5485a5388ec 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -654,6 +654,8 @@ static int proc_pid_permission(struct inode *inode, int mask) bool has_perms; task = get_proc_task(inode); + if (!task) + return -ESRCH; has_perms = has_pid_permissions(pid, task, 1); put_task_struct(task); |