aboutsummaryrefslogtreecommitdiff
path: root/fs/splice.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/splice.c')
-rw-r--r--fs/splice.c1439
1 files changed, 864 insertions, 575 deletions
diff --git a/fs/splice.c b/fs/splice.c
index a861bb318ac..f5cb9ba8451 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -21,14 +21,19 @@
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
+#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
-#include <linux/buffer_head.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/uio.h>
#include <linux/security.h>
+#include <linux/gfp.h>
+#include <linux/socket.h>
+#include <linux/compat.h>
+#include <linux/aio.h>
+#include "internal.h"
/*
* Attempt to steal a page from a pipe buffer. This should perhaps go into
@@ -58,8 +63,9 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
*/
wait_on_page_writeback(page);
- if (PagePrivate(page))
- try_to_release_page(page, GFP_KERNEL);
+ if (page_has_private(page) &&
+ !try_to_release_page(page, GFP_KERNEL))
+ goto out_unlock;
/*
* If we succeeded in removing the mapping, set LRU flag
@@ -75,6 +81,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
* Raced with truncate or failed to remove page from current
* address space, unlock and return failure.
*/
+out_unlock:
unlock_page(page);
return 1;
}
@@ -128,10 +135,8 @@ error:
return err;
}
-static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
+const struct pipe_buf_operations page_cache_pipe_buf_ops = {
.can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = page_cache_pipe_buf_confirm,
.release = page_cache_pipe_buf_release,
.steal = page_cache_pipe_buf_steal,
@@ -150,14 +155,20 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
static const struct pipe_buf_operations user_page_pipe_buf_ops = {
.can_merge = 0,
- .map = generic_pipe_buf_map,
- .unmap = generic_pipe_buf_unmap,
.confirm = generic_pipe_buf_confirm,
.release = page_cache_pipe_buf_release,
.steal = user_page_pipe_buf_steal,
.get = generic_pipe_buf_get,
};
+static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
+{
+ smp_mb();
+ if (waitqueue_active(&pipe->wait))
+ wake_up_interruptible(&pipe->wait);
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+}
+
/**
* splice_to_pipe - fill passed data into a pipe
* @pipe: pipe to fill
@@ -179,8 +190,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
do_wakeup = 0;
page_nr = 0;
- if (pipe->inode)
- mutex_lock(&pipe->inode->i_mutex);
+ pipe_lock(pipe);
for (;;) {
if (!pipe->readers) {
@@ -190,8 +200,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
break;
}
- if (pipe->nrbufs < PIPE_BUFFERS) {
- int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
+ if (pipe->nrbufs < pipe->buffers) {
+ int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
struct pipe_buffer *buf = pipe->bufs + newbuf;
buf->page = spd->pages[page_nr];
@@ -206,12 +216,12 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
page_nr++;
ret += buf->len;
- if (pipe->inode)
+ if (pipe->files)
do_wakeup = 1;
if (!--spd->nr_pages)
break;
- if (pipe->nrbufs < PIPE_BUFFERS)
+ if (pipe->nrbufs < pipe->buffers)
continue;
break;
@@ -242,16 +252,10 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
pipe->waiting_writers--;
}
- if (pipe->inode) {
- mutex_unlock(&pipe->inode->i_mutex);
+ pipe_unlock(pipe);
- if (do_wakeup) {
- smp_mb();
- if (waitqueue_active(&pipe->wait))
- wake_up_interruptible(&pipe->wait);
- kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
- }
- }
+ if (do_wakeup)
+ wakeup_pipe_readers(pipe);
while (page_nr < spd_pages)
spd->spd_release(spd, page_nr++);
@@ -259,11 +263,43 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
return ret;
}
-static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
+void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
{
page_cache_release(spd->pages[i]);
}
+/*
+ * Check if we need to grow the arrays holding pages and partial page
+ * descriptions.
+ */
+int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
+{
+ unsigned int buffers = ACCESS_ONCE(pipe->buffers);
+
+ spd->nr_pages_max = buffers;
+ if (buffers <= PIPE_DEF_BUFFERS)
+ return 0;
+
+ spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL);
+ spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL);
+
+ if (spd->pages && spd->partial)
+ return 0;
+
+ kfree(spd->pages);
+ kfree(spd->partial);
+ return -ENOMEM;
+}
+
+void splice_shrink_spd(struct splice_pipe_desc *spd)
+{
+ if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
+ return;
+
+ kfree(spd->pages);
+ kfree(spd->partial);
+}
+
static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
@@ -271,8 +307,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
{
struct address_space *mapping = in->f_mapping;
unsigned int loff, nr_pages, req_pages;
- struct page *pages[PIPE_BUFFERS];
- struct partial_page partial[PIPE_BUFFERS];
+ struct page *pages[PIPE_DEF_BUFFERS];
+ struct partial_page partial[PIPE_DEF_BUFFERS];
struct page *page;
pgoff_t index, end_index;
loff_t isize;
@@ -280,20 +316,24 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
struct splice_pipe_desc spd = {
.pages = pages,
.partial = partial,
+ .nr_pages_max = PIPE_DEF_BUFFERS,
.flags = flags,
.ops = &page_cache_pipe_buf_ops,
.spd_release = spd_release_page,
};
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
index = *ppos >> PAGE_CACHE_SHIFT;
loff = *ppos & ~PAGE_CACHE_MASK;
req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);
+ nr_pages = min(req_pages, spd.nr_pages_max);
/*
* Lookup the (hopefully) full range of pages we need.
*/
- spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
+ spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
index += spd.nr_pages;
/*
@@ -320,7 +360,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
break;
error = add_to_page_cache_lru(page, mapping, index,
- mapping_gfp_mask(mapping));
+ GFP_KERNEL);
if (unlikely(error)) {
page_cache_release(page);
if (error == -EEXIST)
@@ -334,7 +374,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
unlock_page(page);
}
- pages[spd.nr_pages++] = page;
+ spd.pages[spd.nr_pages++] = page;
index++;
}
@@ -355,7 +395,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
* this_len is the max we'll use from this page
*/
this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
- page = pages[page_nr];
+ page = spd.pages[page_nr];
if (PageReadahead(page))
page_cache_async_readahead(mapping, &in->f_ra, in,
@@ -365,24 +405,25 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
* If the page isn't uptodate, we may need to start io on it
*/
if (!PageUptodate(page)) {
- /*
- * If in nonblock mode then dont block on waiting
- * for an in-flight io page
- */
- if (flags & SPLICE_F_NONBLOCK) {
- if (TestSetPageLocked(page))
- break;
- } else
- lock_page(page);
+ lock_page(page);
/*
- * page was truncated, stop here. if this isn't the
- * first page, we'll just complete what we already
- * added
+ * Page was truncated, or invalidated by the
+ * filesystem. Redo the find/create, but this time the
+ * page is kept locked, so there's no chance of another
+ * race with truncate/invalidate.
*/
if (!page->mapping) {
unlock_page(page);
- break;
+ page = find_or_create_page(mapping, index,
+ mapping_gfp_mask(mapping));
+
+ if (!page) {
+ error = -ENOMEM;
+ break;
+ }
+ page_cache_release(spd.pages[page_nr]);
+ spd.pages[page_nr] = page;
}
/*
* page was already under io and is now done, great
@@ -439,8 +480,8 @@ fill_it:
len = this_len;
}
- partial[page_nr].offset = loff;
- partial[page_nr].len = this_len;
+ spd.partial[page_nr].offset = loff;
+ spd.partial[page_nr].len = this_len;
len -= this_len;
loff = 0;
spd.nr_pages++;
@@ -452,12 +493,13 @@ fill_it:
* we got, 'nr_pages' is how many pages are in the map.
*/
while (page_nr < nr_pages)
- page_cache_release(pages[page_nr++]);
+ page_cache_release(spd.pages[page_nr++]);
in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
if (spd.nr_pages)
- return splice_to_pipe(pipe, &spd);
+ error = splice_to_pipe(pipe, &spd);
+ splice_shrink_spd(&spd);
return error;
}
@@ -479,9 +521,8 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
{
- ssize_t spliced;
- int ret;
loff_t isize, left;
+ int ret;
isize = i_size_read(in->f_mapping->host);
if (unlikely(*ppos >= isize))
@@ -491,224 +532,362 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
if (unlikely(left < len))
len = left;
- ret = 0;
- spliced = 0;
- while (len && !spliced) {
- ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
-
- if (ret < 0)
- break;
- else if (!ret) {
- if (spliced)
- break;
- if (flags & SPLICE_F_NONBLOCK) {
- ret = -EAGAIN;
- break;
- }
- }
-
+ ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
+ if (ret > 0) {
*ppos += ret;
- len -= ret;
- spliced += ret;
+ file_accessed(in);
}
- if (spliced)
- return spliced;
-
return ret;
}
-
EXPORT_SYMBOL(generic_file_splice_read);
-/*
- * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
- * using sendpage(). Return the number of bytes sent.
- */
-static int pipe_to_sendpage(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf, struct splice_desc *sd)
+static const struct pipe_buf_operations default_pipe_buf_ops = {
+ .can_merge = 0,
+ .confirm = generic_pipe_buf_confirm,
+ .release = generic_pipe_buf_release,
+ .steal = generic_pipe_buf_steal,
+ .get = generic_pipe_buf_get,
+};
+
+static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf)
{
- struct file *file = sd->u.file;
- loff_t pos = sd->pos;
- int ret, more;
+ return 1;
+}
+
+/* Pipe buffer operations for a socket and similar. */
+const struct pipe_buf_operations nosteal_pipe_buf_ops = {
+ .can_merge = 0,
+ .confirm = generic_pipe_buf_confirm,
+ .release = generic_pipe_buf_release,
+ .steal = generic_pipe_buf_nosteal,
+ .get = generic_pipe_buf_get,
+};
+EXPORT_SYMBOL(nosteal_pipe_buf_ops);
+
+static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
+ unsigned long vlen, loff_t offset)
+{
+ mm_segment_t old_fs;
+ loff_t pos = offset;
+ ssize_t res;
+
+ old_fs = get_fs();
+ set_fs(get_ds());
+ /* The cast to a user pointer is valid due to the set_fs() */
+ res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
+ set_fs(old_fs);
+
+ return res;
+}
+
+ssize_t kernel_write(struct file *file, const char *buf, size_t count,
+ loff_t pos)
+{
+ mm_segment_t old_fs;
+ ssize_t res;
+
+ old_fs = get_fs();
+ set_fs(get_ds());
+ /* The cast to a user pointer is valid due to the set_fs() */
+ res = vfs_write(file, (__force const char __user *)buf, count, &pos);
+ set_fs(old_fs);
+
+ return res;
+}
+EXPORT_SYMBOL(kernel_write);
+
+ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ unsigned int nr_pages;
+ unsigned int nr_freed;
+ size_t offset;
+ struct page *pages[PIPE_DEF_BUFFERS];
+ struct partial_page partial[PIPE_DEF_BUFFERS];
+ struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
+ ssize_t res;
+ size_t this_len;
+ int error;
+ int i;
+ struct splice_pipe_desc spd = {
+ .pages = pages,
+ .partial = partial,
+ .nr_pages_max = PIPE_DEF_BUFFERS,
+ .flags = flags,
+ .ops = &default_pipe_buf_ops,
+ .spd_release = spd_release_page,
+ };
- ret = buf->ops->confirm(pipe, buf);
- if (!ret) {
- more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
- ret = file->f_op->sendpage(file, buf->page, buf->offset,
- sd->len, &pos, more);
+ res = -ENOMEM;
+ vec = __vec;
+ if (spd.nr_pages_max > PIPE_DEF_BUFFERS) {
+ vec = kmalloc(spd.nr_pages_max * sizeof(struct iovec), GFP_KERNEL);
+ if (!vec)
+ goto shrink_ret;
}
- return ret;
+ offset = *ppos & ~PAGE_CACHE_MASK;
+ nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+ for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) {
+ struct page *page;
+
+ page = alloc_page(GFP_USER);
+ error = -ENOMEM;
+ if (!page)
+ goto err;
+
+ this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
+ vec[i].iov_base = (void __user *) page_address(page);
+ vec[i].iov_len = this_len;
+ spd.pages[i] = page;
+ spd.nr_pages++;
+ len -= this_len;
+ offset = 0;
+ }
+
+ res = kernel_readv(in, vec, spd.nr_pages, *ppos);
+ if (res < 0) {
+ error = res;
+ goto err;
+ }
+
+ error = 0;
+ if (!res)
+ goto err;
+
+ nr_freed = 0;
+ for (i = 0; i < spd.nr_pages; i++) {
+ this_len = min_t(size_t, vec[i].iov_len, res);
+ spd.partial[i].offset = 0;
+ spd.partial[i].len = this_len;
+ if (!this_len) {
+ __free_page(spd.pages[i]);
+ spd.pages[i] = NULL;
+ nr_freed++;
+ }
+ res -= this_len;
+ }
+ spd.nr_pages -= nr_freed;
+
+ res = splice_to_pipe(pipe, &spd);
+ if (res > 0)
+ *ppos += res;
+
+shrink_ret:
+ if (vec != __vec)
+ kfree(vec);
+ splice_shrink_spd(&spd);
+ return res;
+
+err:
+ for (i = 0; i < spd.nr_pages; i++)
+ __free_page(spd.pages[i]);
+
+ res = error;
+ goto shrink_ret;
}
+EXPORT_SYMBOL(default_file_splice_read);
/*
- * This is a little more tricky than the file -> pipe splicing. There are
- * basically three cases:
- *
- * - Destination page already exists in the address space and there
- * are users of it. For that case we have no other option that
- * copying the data. Tough luck.
- * - Destination page already exists in the address space, but there
- * are no users of it. Make sure it's uptodate, then drop it. Fall
- * through to last case.
- * - Destination page does not exist, we can add the pipe page to
- * the page cache and avoid the copy.
- *
- * If asked to move pages to the output file (SPLICE_F_MOVE is set in
- * sd->flags), we attempt to migrate pages from the pipe to the output
- * file address space page cache. This is possible if no one else has
- * the pipe page referenced outside of the pipe and page cache. If
- * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
- * a new page in the output file page cache and fill/dirty that.
+ * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
+ * using sendpage(). Return the number of bytes sent.
*/
-static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
- struct splice_desc *sd)
+static int pipe_to_sendpage(struct pipe_inode_info *pipe,
+ struct pipe_buffer *buf, struct splice_desc *sd)
{
struct file *file = sd->u.file;
- struct address_space *mapping = file->f_mapping;
- unsigned int offset, this_len;
- struct page *page;
- void *fsdata;
- int ret;
-
- /*
- * make sure the data in this buffer is uptodate
- */
- ret = buf->ops->confirm(pipe, buf);
- if (unlikely(ret))
- return ret;
+ loff_t pos = sd->pos;
+ int more;
- offset = sd->pos & ~PAGE_CACHE_MASK;
+ if (!likely(file->f_op->sendpage))
+ return -EINVAL;
- this_len = sd->len;
- if (this_len + offset > PAGE_CACHE_SIZE)
- this_len = PAGE_CACHE_SIZE - offset;
+ more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
- ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
- AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
- if (unlikely(ret))
- goto out;
+ if (sd->len < sd->total_len && pipe->nrbufs > 1)
+ more |= MSG_SENDPAGE_NOTLAST;
- if (buf->page != page) {
- /*
- * Careful, ->map() uses KM_USER0!
- */
- char *src = buf->ops->map(pipe, buf, 1);
- char *dst = kmap_atomic(page, KM_USER1);
+ return file->f_op->sendpage(file, buf->page, buf->offset,
+ sd->len, &pos, more);
+}
- memcpy(dst + offset, src + buf->offset, this_len);
- flush_dcache_page(page);
- kunmap_atomic(dst, KM_USER1);
- buf->ops->unmap(pipe, buf, src);
- }
- ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
- page, fsdata);
-out:
- return ret;
+static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
+{
+ smp_mb();
+ if (waitqueue_active(&pipe->wait))
+ wake_up_interruptible(&pipe->wait);
+ kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
/**
- * __splice_from_pipe - splice data from a pipe to given actor
+ * splice_from_pipe_feed - feed available data from a pipe to a file
* @pipe: pipe to splice from
* @sd: information to @actor
* @actor: handler that splices the data
*
* Description:
- * This function does little more than loop over the pipe and call
- * @actor to do the actual moving of a single struct pipe_buffer to
- * the desired destination. See pipe_to_file, pipe_to_sendpage, or
- * pipe_to_user.
+ * This function loops over the pipe and calls @actor to do the
+ * actual moving of a single struct pipe_buffer to the desired
+ * destination. It returns when there's no more buffers left in
+ * the pipe or if the requested number of bytes (@sd->total_len)
+ * have been copied. It returns a positive number (one) if the
+ * pipe needs to be filled with more data, zero if the required
+ * number of bytes have been copied and -errno on error.
*
+ * This, together with splice_from_pipe_{begin,end,next}, may be
+ * used to implement the functionality of __splice_from_pipe() when
+ * locking is required around copying the pipe buffers to the
+ * destination.
*/
-ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
- splice_actor *actor)
+static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
+ splice_actor *actor)
{
- int ret, do_wakeup, err;
+ int ret;
- ret = 0;
- do_wakeup = 0;
+ while (pipe->nrbufs) {
+ struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
+ const struct pipe_buf_operations *ops = buf->ops;
- for (;;) {
- if (pipe->nrbufs) {
- struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
- const struct pipe_buf_operations *ops = buf->ops;
+ sd->len = buf->len;
+ if (sd->len > sd->total_len)
+ sd->len = sd->total_len;
- sd->len = buf->len;
- if (sd->len > sd->total_len)
- sd->len = sd->total_len;
+ ret = buf->ops->confirm(pipe, buf);
+ if (unlikely(ret)) {
+ if (ret == -ENODATA)
+ ret = 0;
+ return ret;
+ }
- err = actor(pipe, buf, sd);
- if (err <= 0) {
- if (!ret && err != -ENODATA)
- ret = err;
+ ret = actor(pipe, buf, sd);
+ if (ret <= 0)
+ return ret;
- break;
- }
+ buf->offset += ret;
+ buf->len -= ret;
- ret += err;
- buf->offset += err;
- buf->len -= err;
+ sd->num_spliced += ret;
+ sd->len -= ret;
+ sd->pos += ret;
+ sd->total_len -= ret;
- sd->len -= err;
- sd->pos += err;
- sd->total_len -= err;
- if (sd->len)
- continue;
+ if (!buf->len) {
+ buf->ops = NULL;
+ ops->release(pipe, buf);
+ pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
+ pipe->nrbufs--;
+ if (pipe->files)
+ sd->need_wakeup = true;
+ }
- if (!buf->len) {
- buf->ops = NULL;
- ops->release(pipe, buf);
- pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
- pipe->nrbufs--;
- if (pipe->inode)
- do_wakeup = 1;
- }
+ if (!sd->total_len)
+ return 0;
+ }
- if (!sd->total_len)
- break;
- }
+ return 1;
+}
- if (pipe->nrbufs)
- continue;
+/**
+ * splice_from_pipe_next - wait for some data to splice from
+ * @pipe: pipe to splice from
+ * @sd: information about the splice operation
+ *
+ * Description:
+ * This function will wait for some data and return a positive
+ * value (one) if pipe buffers are available. It will return zero
+ * or -errno if no more data needs to be spliced.
+ */
+static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
+{
+ while (!pipe->nrbufs) {
if (!pipe->writers)
- break;
- if (!pipe->waiting_writers) {
- if (ret)
- break;
- }
+ return 0;
- if (sd->flags & SPLICE_F_NONBLOCK) {
- if (!ret)
- ret = -EAGAIN;
- break;
- }
+ if (!pipe->waiting_writers && sd->num_spliced)
+ return 0;
- if (signal_pending(current)) {
- if (!ret)
- ret = -ERESTARTSYS;
- break;
- }
+ if (sd->flags & SPLICE_F_NONBLOCK)
+ return -EAGAIN;
- if (do_wakeup) {
- smp_mb();
- if (waitqueue_active(&pipe->wait))
- wake_up_interruptible_sync(&pipe->wait);
- kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
- do_wakeup = 0;
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+
+ if (sd->need_wakeup) {
+ wakeup_pipe_writers(pipe);
+ sd->need_wakeup = false;
}
pipe_wait(pipe);
}
- if (do_wakeup) {
- smp_mb();
- if (waitqueue_active(&pipe->wait))
- wake_up_interruptible(&pipe->wait);
- kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
- }
+ return 1;
+}
- return ret;
+/**
+ * splice_from_pipe_begin - start splicing from pipe
+ * @sd: information about the splice operation
+ *
+ * Description:
+ * This function should be called before a loop containing
+ * splice_from_pipe_next() and splice_from_pipe_feed() to
+ * initialize the necessary fields of @sd.
+ */
+static void splice_from_pipe_begin(struct splice_desc *sd)
+{
+ sd->num_spliced = 0;
+ sd->need_wakeup = false;
+}
+
+/**
+ * splice_from_pipe_end - finish splicing from pipe
+ * @pipe: pipe to splice from
+ * @sd: information about the splice operation
+ *
+ * Description:
+ * This function will wake up pipe writers if necessary. It should
+ * be called after a loop containing splice_from_pipe_next() and
+ * splice_from_pipe_feed().
+ */
+static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
+{
+ if (sd->need_wakeup)
+ wakeup_pipe_writers(pipe);
+}
+
+/**
+ * __splice_from_pipe - splice data from a pipe to given actor
+ * @pipe: pipe to splice from
+ * @sd: information to @actor
+ * @actor: handler that splices the data
+ *
+ * Description:
+ * This function does little more than loop over the pipe and call
+ * @actor to do the actual moving of a single struct pipe_buffer to
+ * the desired destination. See pipe_to_file, pipe_to_sendpage, or
+ * pipe_to_user.
+ *
+ */
+ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
+ splice_actor *actor)
+{
+ int ret;
+
+ splice_from_pipe_begin(sd);
+ do {
+ ret = splice_from_pipe_next(pipe, sd);
+ if (ret > 0)
+ ret = splice_from_pipe_feed(pipe, sd, actor);
+ } while (ret > 0);
+ splice_from_pipe_end(pipe, sd);
+
+ return sd->num_spliced ? sd->num_spliced : ret;
}
EXPORT_SYMBOL(__splice_from_pipe);
@@ -722,7 +901,7 @@ EXPORT_SYMBOL(__splice_from_pipe);
* @actor: handler that splices the data
*
* Description:
- * See __splice_from_pipe. This function locks the input and output inodes,
+ * See __splice_from_pipe. This function locks the pipe inode,
* otherwise it's identical to __splice_from_pipe().
*
*/
@@ -731,7 +910,6 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
splice_actor *actor)
{
ssize_t ret;
- struct inode *inode = out->f_mapping->host;
struct splice_desc sd = {
.total_len = len,
.flags = flags,
@@ -739,21 +917,15 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
.u.file = out,
};
- /*
- * The actor worker might be calling ->prepare_write and
- * ->commit_write. Most of the time, these expect i_mutex to
- * be held. Since this may result in an ABBA deadlock with
- * pipe->inode, we have to order lock acquiry here.
- */
- inode_double_lock(inode, pipe->inode);
+ pipe_lock(pipe);
ret = __splice_from_pipe(pipe, &sd, actor);
- inode_double_unlock(inode, pipe->inode);
+ pipe_unlock(pipe);
return ret;
}
/**
- * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
+ * iter_file_splice_write - splice data from a pipe to a file
* @pipe: pipe info
* @out: file to write to
* @ppos: position in @out
@@ -762,119 +934,162 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
*
* Description:
* Will either move or copy pages (determined by @flags options) from
- * the given pipe inode to the given file. The caller is responsible
- * for acquiring i_mutex on both inodes.
+ * the given pipe inode to the given file.
+ * This one is ->write_iter-based.
*
*/
ssize_t
-generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
- loff_t *ppos, size_t len, unsigned int flags)
+iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
+ loff_t *ppos, size_t len, unsigned int flags)
{
- struct address_space *mapping = out->f_mapping;
- struct inode *inode = mapping->host;
struct splice_desc sd = {
.total_len = len,
.flags = flags,
.pos = *ppos,
.u.file = out,
};
+ int nbufs = pipe->buffers;
+ struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
+ GFP_KERNEL);
ssize_t ret;
- int err;
- err = remove_suid(out->f_path.dentry);
- if (unlikely(err))
- return err;
+ if (unlikely(!array))
+ return -ENOMEM;
- ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
- if (ret > 0) {
- unsigned long nr_pages;
+ pipe_lock(pipe);
- *ppos += ret;
- nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ splice_from_pipe_begin(&sd);
+ while (sd.total_len) {
+ struct iov_iter from;
+ struct kiocb kiocb;
+ size_t left;
+ int n, idx;
- /*
- * If file or inode is SYNC and we actually wrote some data,
- * sync it.
- */
- if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
- err = generic_osync_inode(inode, mapping,
- OSYNC_METADATA|OSYNC_DATA);
+ ret = splice_from_pipe_next(pipe, &sd);
+ if (ret <= 0)
+ break;
- if (err)
- ret = err;
+ if (unlikely(nbufs < pipe->buffers)) {
+ kfree(array);
+ nbufs = pipe->buffers;
+ array = kcalloc(nbufs, sizeof(struct bio_vec),
+ GFP_KERNEL);
+ if (!array) {
+ ret = -ENOMEM;
+ break;
+ }
+ }
+
+ /* build the vector */
+ left = sd.total_len;
+ for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) {
+ struct pipe_buffer *buf = pipe->bufs + idx;
+ size_t this_len = buf->len;
+
+ if (this_len > left)
+ this_len = left;
+
+ if (idx == pipe->buffers - 1)
+ idx = -1;
+
+ ret = buf->ops->confirm(pipe, buf);
+ if (unlikely(ret)) {
+ if (ret == -ENODATA)
+ ret = 0;
+ goto done;
+ }
+
+ array[n].bv_page = buf->page;
+ array[n].bv_len = this_len;
+ array[n].bv_offset = buf->offset;
+ left -= this_len;
+ }
+
+ /* ... iov_iter */
+ from.type = ITER_BVEC | WRITE;
+ from.bvec = array;
+ from.nr_segs = n;
+ from.count = sd.total_len - left;
+ from.iov_offset = 0;
+
+ /* ... and iocb */
+ init_sync_kiocb(&kiocb, out);
+ kiocb.ki_pos = sd.pos;
+ kiocb.ki_nbytes = sd.total_len - left;
+
+ /* now, send it */
+ ret = out->f_op->write_iter(&kiocb, &from);
+ if (-EIOCBQUEUED == ret)
+ ret = wait_on_sync_kiocb(&kiocb);
+
+ if (ret <= 0)
+ break;
+
+ sd.num_spliced += ret;
+ sd.total_len -= ret;
+ *ppos = sd.pos = kiocb.ki_pos;
+
+ /* dismiss the fully eaten buffers, adjust the partial one */
+ while (ret) {
+ struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
+ if (ret >= buf->len) {
+ const struct pipe_buf_operations *ops = buf->ops;
+ ret -= buf->len;
+ buf->len = 0;
+ buf->ops = NULL;
+ ops->release(pipe, buf);
+ pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
+ pipe->nrbufs--;
+ if (pipe->files)
+ sd.need_wakeup = true;
+ } else {
+ buf->offset += ret;
+ buf->len -= ret;
+ ret = 0;
+ }
}
- balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
}
+done:
+ kfree(array);
+ splice_from_pipe_end(pipe, &sd);
+
+ pipe_unlock(pipe);
+
+ if (sd.num_spliced)
+ ret = sd.num_spliced;
return ret;
}
-EXPORT_SYMBOL(generic_file_splice_write_nolock);
+EXPORT_SYMBOL(iter_file_splice_write);
-/**
- * generic_file_splice_write - splice data from a pipe to a file
- * @pipe: pipe info
- * @out: file to write to
- * @ppos: position in @out
- * @len: number of bytes to splice
- * @flags: splice modifier flags
- *
- * Description:
- * Will either move or copy pages (determined by @flags options) from
- * the given pipe inode to the given file.
- *
- */
-ssize_t
-generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
- loff_t *ppos, size_t len, unsigned int flags)
+static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+ struct splice_desc *sd)
{
- struct address_space *mapping = out->f_mapping;
- struct inode *inode = mapping->host;
- int killsuid, killpriv;
- ssize_t ret;
- int err = 0;
-
- killpriv = security_inode_need_killpriv(out->f_path.dentry);
- killsuid = should_remove_suid(out->f_path.dentry);
- if (unlikely(killsuid || killpriv)) {
- mutex_lock(&inode->i_mutex);
- if (killpriv)
- err = security_inode_killpriv(out->f_path.dentry);
- if (!err && killsuid)
- err = __remove_suid(out->f_path.dentry, killsuid);
- mutex_unlock(&inode->i_mutex);
- if (err)
- return err;
- }
+ int ret;
+ void *data;
+ loff_t tmp = sd->pos;
- ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
- if (ret > 0) {
- unsigned long nr_pages;
+ data = kmap(buf->page);
+ ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
+ kunmap(buf->page);
- *ppos += ret;
- nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+ return ret;
+}
- /*
- * If file or inode is SYNC and we actually wrote some data,
- * sync it.
- */
- if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
- mutex_lock(&inode->i_mutex);
- err = generic_osync_inode(inode, mapping,
- OSYNC_METADATA|OSYNC_DATA);
- mutex_unlock(&inode->i_mutex);
-
- if (err)
- ret = err;
- }
- balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
- }
+static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
+ struct file *out, loff_t *ppos,
+ size_t len, unsigned int flags)
+{
+ ssize_t ret;
+
+ ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
+ if (ret > 0)
+ *ppos += ret;
return ret;
}
-EXPORT_SYMBOL(generic_file_splice_write);
-
/**
* generic_splice_sendpage - splice data from a pipe to a socket
* @pipe: pipe to splice from
@@ -902,19 +1117,15 @@ EXPORT_SYMBOL(generic_splice_sendpage);
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
loff_t *ppos, size_t len, unsigned int flags)
{
- int ret;
-
- if (unlikely(!out->f_op || !out->f_op->splice_write))
- return -EINVAL;
-
- if (unlikely(!(out->f_mode & FMODE_WRITE)))
- return -EBADF;
+ ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
+ loff_t *, size_t, unsigned int);
- ret = rw_verify_area(WRITE, out, ppos, len);
- if (unlikely(ret < 0))
- return ret;
+ if (out->f_op->splice_write)
+ splice_write = out->f_op->splice_write;
+ else
+ splice_write = default_file_splice_write;
- return out->f_op->splice_write(pipe, out, ppos, len, flags);
+ return splice_write(pipe, out, ppos, len, flags);
}
/*
@@ -924,11 +1135,10 @@ static long do_splice_to(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
{
+ ssize_t (*splice_read)(struct file *, loff_t *,
+ struct pipe_inode_info *, size_t, unsigned int);
int ret;
- if (unlikely(!in->f_op || !in->f_op->splice_read))
- return -EINVAL;
-
if (unlikely(!(in->f_mode & FMODE_READ)))
return -EBADF;
@@ -936,7 +1146,12 @@ static long do_splice_to(struct file *in, loff_t *ppos,
if (unlikely(ret < 0))
return ret;
- return in->f_op->splice_read(in, ppos, pipe, len, flags);
+ if (in->f_op->splice_read)
+ splice_read = in->f_op->splice_read;
+ else
+ splice_read = default_file_splice_read;
+
+ return splice_read(in, ppos, pipe, len, flags);
}
/**
@@ -966,7 +1181,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
* randomly drop data for eg socket -> socket splicing. Use the
* piped splicing for that!
*/
- i_mode = in->f_path.dentry->d_inode->i_mode;
+ i_mode = file_inode(in)->i_mode;
if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
return -EINVAL;
@@ -976,7 +1191,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
*/
pipe = current->splice_pipe;
if (unlikely(!pipe)) {
- pipe = alloc_pipe_info(NULL);
+ pipe = alloc_pipe_info();
if (!pipe)
return -ENOMEM;
@@ -1005,7 +1220,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
while (len) {
size_t read_len;
- loff_t pos = sd->pos;
+ loff_t pos = sd->pos, prev_pos = pos;
ret = do_splice_to(in, &pos, pipe, len, flags);
if (unlikely(ret <= 0))
@@ -1020,15 +1235,19 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
* could get stuck data in the internal pipe:
*/
ret = actor(pipe, sd);
- if (unlikely(ret <= 0))
+ if (unlikely(ret <= 0)) {
+ sd->pos = prev_pos;
goto out_release;
+ }
bytes += ret;
len -= ret;
sd->pos = pos;
- if (ret < read_len)
+ if (ret < read_len) {
+ sd->pos = prev_pos + ret;
goto out_release;
+ }
}
done:
@@ -1041,7 +1260,7 @@ out_release:
* If we did an incomplete transfer we must release
* the pipe buffers in question:
*/
- for (i = 0; i < PIPE_BUFFERS; i++) {
+ for (i = 0; i < pipe->buffers; i++) {
struct pipe_buffer *buf = pipe->bufs + i;
if (buf->ops) {
@@ -1062,7 +1281,8 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
{
struct file *file = sd->u.file;
- return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
+ return do_splice_from(pipe, file, sd->opos, sd->total_len,
+ sd->flags);
}
/**
@@ -1070,6 +1290,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
* @in: file to splice from
* @ppos: input file offset
* @out: file to splice to
+ * @opos: output file offset
* @len: number of bytes to splice
* @flags: splice modifier flags
*
@@ -1081,7 +1302,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
*
*/
long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
- size_t len, unsigned int flags)
+ loff_t *opos, size_t len, unsigned int flags)
{
struct splice_desc sd = {
.len = len,
@@ -1089,28 +1310,30 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
.flags = flags,
.pos = *ppos,
.u.file = out,
+ .opos = opos,
};
long ret;
+ if (unlikely(!(out->f_mode & FMODE_WRITE)))
+ return -EBADF;
+
+ if (unlikely(out->f_flags & O_APPEND))
+ return -EINVAL;
+
+ ret = rw_verify_area(WRITE, out, opos, len);
+ if (unlikely(ret < 0))
+ return ret;
+
ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
if (ret > 0)
- *ppos += ret;
+ *ppos = sd.pos;
return ret;
}
-/*
- * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
- * location, so checking ->i_pipe is not enough to verify that this is a
- * pipe.
- */
-static inline struct pipe_inode_info *pipe_info(struct inode *inode)
-{
- if (S_ISFIFO(inode->i_mode))
- return inode->i_pipe;
-
- return NULL;
-}
+static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
+ struct pipe_inode_info *opipe,
+ size_t len, unsigned int flags);
/*
* Determine where to splice to/from.
@@ -1119,47 +1342,82 @@ static long do_splice(struct file *in, loff_t __user *off_in,
struct file *out, loff_t __user *off_out,
size_t len, unsigned int flags)
{
- struct pipe_inode_info *pipe;
- loff_t offset, *off;
+ struct pipe_inode_info *ipipe;
+ struct pipe_inode_info *opipe;
+ loff_t offset;
long ret;
- pipe = pipe_info(in->f_path.dentry->d_inode);
- if (pipe) {
+ ipipe = get_pipe_info(in);
+ opipe = get_pipe_info(out);
+
+ if (ipipe && opipe) {
+ if (off_in || off_out)
+ return -ESPIPE;
+
+ if (!(in->f_mode & FMODE_READ))
+ return -EBADF;
+
+ if (!(out->f_mode & FMODE_WRITE))
+ return -EBADF;
+
+ /* Splicing to self would be fun, but... */
+ if (ipipe == opipe)
+ return -EINVAL;
+
+ return splice_pipe_to_pipe(ipipe, opipe, len, flags);
+ }
+
+ if (ipipe) {
if (off_in)
return -ESPIPE;
if (off_out) {
- if (out->f_op->llseek == no_llseek)
+ if (!(out->f_mode & FMODE_PWRITE))
return -EINVAL;
if (copy_from_user(&offset, off_out, sizeof(loff_t)))
return -EFAULT;
- off = &offset;
- } else
- off = &out->f_pos;
+ } else {
+ offset = out->f_pos;
+ }
+
+ if (unlikely(!(out->f_mode & FMODE_WRITE)))
+ return -EBADF;
- ret = do_splice_from(pipe, out, off, len, flags);
+ if (unlikely(out->f_flags & O_APPEND))
+ return -EINVAL;
- if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
+ ret = rw_verify_area(WRITE, out, &offset, len);
+ if (unlikely(ret < 0))
+ return ret;
+
+ file_start_write(out);
+ ret = do_splice_from(ipipe, out, &offset, len, flags);
+ file_end_write(out);
+
+ if (!off_out)
+ out->f_pos = offset;
+ else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
ret = -EFAULT;
return ret;
}
- pipe = pipe_info(out->f_path.dentry->d_inode);
- if (pipe) {
+ if (opipe) {
if (off_out)
return -ESPIPE;
if (off_in) {
- if (in->f_op->llseek == no_llseek)
+ if (!(in->f_mode & FMODE_PREAD))
return -EINVAL;
if (copy_from_user(&offset, off_in, sizeof(loff_t)))
return -EFAULT;
- off = &offset;
- } else
- off = &in->f_pos;
+ } else {
+ offset = in->f_pos;
+ }
- ret = do_splice_to(in, off, pipe, len, flags);
+ ret = do_splice_to(in, &offset, opipe, len, flags);
- if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
+ if (!off_in)
+ in->f_pos = offset;
+ else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
ret = -EFAULT;
return ret;
@@ -1169,36 +1427,6 @@ static long do_splice(struct file *in, loff_t __user *off_in,
}
/*
- * Do a copy-from-user while holding the mmap_semaphore for reading, in a
- * manner safe from deadlocking with simultaneous mmap() (grabbing mmap_sem
- * for writing) and page faulting on the user memory pointed to by src.
- * This assumes that we will very rarely hit the partial != 0 path, or this
- * will not be a win.
- */
-static int copy_from_user_mmap_sem(void *dst, const void __user *src, size_t n)
-{
- int partial;
-
- if (!access_ok(VERIFY_READ, src, n))
- return -EFAULT;
-
- pagefault_disable();
- partial = __copy_from_user_inatomic(dst, src, n);
- pagefault_enable();
-
- /*
- * Didn't copy everything, drop the mmap_sem and do a faulting copy
- */
- if (unlikely(partial)) {
- up_read(&current->mm->mmap_sem);
- partial = copy_from_user(dst, src, n);
- down_read(&current->mm->mmap_sem);
- }
-
- return partial;
-}
-
-/*
* Map an iov into an array of pages and offset/length tupples. With the
* partial_page structure, we can map several non-contiguous ranges into
* our ones pages[] map instead of splitting that operation into pieces.
@@ -1207,12 +1435,11 @@ static int copy_from_user_mmap_sem(void *dst, const void __user *src, size_t n)
*/
static int get_iovec_page_array(const struct iovec __user *iov,
unsigned int nr_vecs, struct page **pages,
- struct partial_page *partial, int aligned)
+ struct partial_page *partial, bool aligned,
+ unsigned int pipe_buffers)
{
int buffers = 0, error = 0;
- down_read(&current->mm->mmap_sem);
-
while (nr_vecs) {
unsigned long off, npages;
struct iovec entry;
@@ -1221,7 +1448,7 @@ static int get_iovec_page_array(const struct iovec __user *iov,
int i;
error = -EFAULT;
- if (copy_from_user_mmap_sem(&entry, iov, sizeof(entry)))
+ if (copy_from_user(&entry, iov, sizeof(entry)))
break;
base = entry.iov_base;
@@ -1252,12 +1479,11 @@ static int get_iovec_page_array(const struct iovec __user *iov,
break;
npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (npages > PIPE_BUFFERS - buffers)
- npages = PIPE_BUFFERS - buffers;
+ if (npages > pipe_buffers - buffers)
+ npages = pipe_buffers - buffers;
- error = get_user_pages(current, current->mm,
- (unsigned long) base, npages, 0, 0,
- &pages[buffers], NULL);
+ error = get_user_pages_fast((unsigned long)base, npages,
+ 0, &pages[buffers]);
if (unlikely(error <= 0))
break;
@@ -1289,15 +1515,13 @@ static int get_iovec_page_array(const struct iovec __user *iov,
* or if we mapped the max number of pages that we have
* room for.
*/
- if (error < npages || buffers == PIPE_BUFFERS)
+ if (error < npages || buffers == pipe_buffers)
break;
nr_vecs--;
iov++;
}
- up_read(&current->mm->mmap_sem);
-
if (buffers)
return buffers;
@@ -1307,122 +1531,50 @@ static int get_iovec_page_array(const struct iovec __user *iov,
static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
struct splice_desc *sd)
{
- char *src;
- int ret;
-
- ret = buf->ops->confirm(pipe, buf);
- if (unlikely(ret))
- return ret;
-
- /*
- * See if we can use the atomic maps, by prefaulting in the
- * pages and doing an atomic copy
- */
- if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
- src = buf->ops->map(pipe, buf, 1);
- ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
- sd->len);
- buf->ops->unmap(pipe, buf, src);
- if (!ret) {
- ret = sd->len;
- goto out;
- }
- }
-
- /*
- * No dice, use slow non-atomic map and copy
- */
- src = buf->ops->map(pipe, buf, 0);
-
- ret = sd->len;
- if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
- ret = -EFAULT;
-
- buf->ops->unmap(pipe, buf, src);
-out:
- if (ret > 0)
- sd->u.userptr += ret;
- return ret;
+ int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
+ return n == sd->len ? n : -EFAULT;
}
/*
* For lack of a better implementation, implement vmsplice() to userspace
* as a simple copy of the pipes pages to the user iov.
*/
-static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
+static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
unsigned long nr_segs, unsigned int flags)
{
struct pipe_inode_info *pipe;
struct splice_desc sd;
- ssize_t size;
- int error;
long ret;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ ssize_t count;
- pipe = pipe_info(file->f_path.dentry->d_inode);
+ pipe = get_pipe_info(file);
if (!pipe)
return -EBADF;
- if (pipe->inode)
- mutex_lock(&pipe->inode->i_mutex);
-
- error = ret = 0;
- while (nr_segs) {
- void __user *base;
- size_t len;
-
- /*
- * Get user address base and length for this iovec.
- */
- error = get_user(base, &iov->iov_base);
- if (unlikely(error))
- break;
- error = get_user(len, &iov->iov_len);
- if (unlikely(error))
- break;
-
- /*
- * Sanity check this iovec. 0 read succeeds.
- */
- if (unlikely(!len))
- break;
- if (unlikely(!base)) {
- error = -EFAULT;
- break;
- }
-
- if (unlikely(!access_ok(VERIFY_WRITE, base, len))) {
- error = -EFAULT;
- break;
- }
-
- sd.len = 0;
- sd.total_len = len;
- sd.flags = flags;
- sd.u.userptr = base;
- sd.pos = 0;
-
- size = __splice_from_pipe(pipe, &sd, pipe_to_user);
- if (size < 0) {
- if (!ret)
- ret = size;
-
- break;
- }
-
- ret += size;
+ ret = rw_copy_check_uvector(READ, uiov, nr_segs,
+ ARRAY_SIZE(iovstack), iovstack, &iov);
+ if (ret <= 0)
+ goto out;
- if (size < len)
- break;
+ count = ret;
+ iov_iter_init(&iter, READ, iov, nr_segs, count);
- nr_segs--;
- iov++;
- }
+ sd.len = 0;
+ sd.total_len = count;
+ sd.flags = flags;
+ sd.u.data = &iter;
+ sd.pos = 0;
- if (pipe->inode)
- mutex_unlock(&pipe->inode->i_mutex);
+ pipe_lock(pipe);
+ ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
+ pipe_unlock(pipe);
- if (!ret)
- ret = error;
+out:
+ if (iov != iovstack)
+ kfree(iov);
return ret;
}
@@ -1436,26 +1588,35 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
unsigned long nr_segs, unsigned int flags)
{
struct pipe_inode_info *pipe;
- struct page *pages[PIPE_BUFFERS];
- struct partial_page partial[PIPE_BUFFERS];
+ struct page *pages[PIPE_DEF_BUFFERS];
+ struct partial_page partial[PIPE_DEF_BUFFERS];
struct splice_pipe_desc spd = {
.pages = pages,
.partial = partial,
+ .nr_pages_max = PIPE_DEF_BUFFERS,
.flags = flags,
.ops = &user_page_pipe_buf_ops,
.spd_release = spd_release_page,
};
+ long ret;
- pipe = pipe_info(file->f_path.dentry->d_inode);
+ pipe = get_pipe_info(file);
if (!pipe)
return -EBADF;
- spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
- flags & SPLICE_F_GIFT);
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
+ spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
+ spd.partial, false,
+ spd.nr_pages_max);
if (spd.nr_pages <= 0)
- return spd.nr_pages;
+ ret = spd.nr_pages;
+ else
+ ret = splice_to_pipe(pipe, &spd);
- return splice_to_pipe(pipe, &spd);
+ splice_shrink_spd(&spd);
+ return ret;
}
/*
@@ -1474,12 +1635,11 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
* Currently we punt and implement it as a normal copy, see pipe_to_user().
*
*/
-asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
- unsigned long nr_segs, unsigned int flags)
+SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
+ unsigned long, nr_segs, unsigned int, flags)
{
- struct file *file;
+ struct fd f;
long error;
- int fput;
if (unlikely(nr_segs > UIO_MAXIOV))
return -EINVAL;
@@ -1487,47 +1647,65 @@ asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
return 0;
error = -EBADF;
- file = fget_light(fd, &fput);
- if (file) {
- if (file->f_mode & FMODE_WRITE)
- error = vmsplice_to_pipe(file, iov, nr_segs, flags);
- else if (file->f_mode & FMODE_READ)
- error = vmsplice_to_user(file, iov, nr_segs, flags);
-
- fput_light(file, fput);
+ f = fdget(fd);
+ if (f.file) {
+ if (f.file->f_mode & FMODE_WRITE)
+ error = vmsplice_to_pipe(f.file, iov, nr_segs, flags);
+ else if (f.file->f_mode & FMODE_READ)
+ error = vmsplice_to_user(f.file, iov, nr_segs, flags);
+
+ fdput(f);
}
return error;
}
-asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
- int fd_out, loff_t __user *off_out,
- size_t len, unsigned int flags)
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
+ unsigned int, nr_segs, unsigned int, flags)
+{
+ unsigned i;
+ struct iovec __user *iov;
+ if (nr_segs > UIO_MAXIOV)
+ return -EINVAL;
+ iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));
+ for (i = 0; i < nr_segs; i++) {
+ struct compat_iovec v;
+ if (get_user(v.iov_base, &iov32[i].iov_base) ||
+ get_user(v.iov_len, &iov32[i].iov_len) ||
+ put_user(compat_ptr(v.iov_base), &iov[i].iov_base) ||
+ put_user(v.iov_len, &iov[i].iov_len))
+ return -EFAULT;
+ }
+ return sys_vmsplice(fd, iov, nr_segs, flags);
+}
+#endif
+
+SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
+ int, fd_out, loff_t __user *, off_out,
+ size_t, len, unsigned int, flags)
{
+ struct fd in, out;
long error;
- struct file *in, *out;
- int fput_in, fput_out;
if (unlikely(!len))
return 0;
error = -EBADF;
- in = fget_light(fd_in, &fput_in);
- if (in) {
- if (in->f_mode & FMODE_READ) {
- out = fget_light(fd_out, &fput_out);
- if (out) {
- if (out->f_mode & FMODE_WRITE)
- error = do_splice(in, off_in,
- out, off_out,
+ in = fdget(fd_in);
+ if (in.file) {
+ if (in.file->f_mode & FMODE_READ) {
+ out = fdget(fd_out);
+ if (out.file) {
+ if (out.file->f_mode & FMODE_WRITE)
+ error = do_splice(in.file, off_in,
+ out.file, off_out,
len, flags);
- fput_light(out, fput_out);
+ fdput(out);
}
}
-
- fput_light(in, fput_in);
+ fdput(in);
}
-
return error;
}
@@ -1535,7 +1713,7 @@ asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
* Make sure there's data to read. Wait for input if we can, otherwise
* return an appropriate error.
*/
-static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
+static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
int ret;
@@ -1547,7 +1725,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
return 0;
ret = 0;
- mutex_lock(&pipe->inode->i_mutex);
+ pipe_lock(pipe);
while (!pipe->nrbufs) {
if (signal_pending(current)) {
@@ -1565,7 +1743,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
pipe_wait(pipe);
}
- mutex_unlock(&pipe->inode->i_mutex);
+ pipe_unlock(pipe);
return ret;
}
@@ -1573,7 +1751,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
* Make sure there's writeable room. Wait for room if we can, otherwise
* return an appropriate error.
*/
-static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
+static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
int ret;
@@ -1581,13 +1759,13 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
* Check ->nrbufs without the inode lock first. This function
* is speculative anyways, so missing one is ok.
*/
- if (pipe->nrbufs < PIPE_BUFFERS)
+ if (pipe->nrbufs < pipe->buffers)
return 0;
ret = 0;
- mutex_lock(&pipe->inode->i_mutex);
+ pipe_lock(pipe);
- while (pipe->nrbufs >= PIPE_BUFFERS) {
+ while (pipe->nrbufs >= pipe->buffers) {
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
@@ -1606,7 +1784,122 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
pipe->waiting_writers--;
}
- mutex_unlock(&pipe->inode->i_mutex);
+ pipe_unlock(pipe);
+ return ret;
+}
+
+/*
+ * Splice contents of ipipe to opipe.
+ */
+static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
+ struct pipe_inode_info *opipe,
+ size_t len, unsigned int flags)
+{
+ struct pipe_buffer *ibuf, *obuf;
+ int ret = 0, nbuf;
+ bool input_wakeup = false;
+
+
+retry:
+ ret = ipipe_prep(ipipe, flags);
+ if (ret)
+ return ret;
+
+ ret = opipe_prep(opipe, flags);
+ if (ret)
+ return ret;
+
+ /*
+ * Potential ABBA deadlock, work around it by ordering lock
+ * grabbing by pipe info address. Otherwise two different processes
+ * could deadlock (one doing tee from A -> B, the other from B -> A).
+ */
+ pipe_double_lock(ipipe, opipe);
+
+ do {
+ if (!opipe->readers) {
+ send_sig(SIGPIPE, current, 0);
+ if (!ret)
+ ret = -EPIPE;
+ break;
+ }
+
+ if (!ipipe->nrbufs && !ipipe->writers)
+ break;
+
+ /*
+ * Cannot make any progress, because either the input
+ * pipe is empty or the output pipe is full.
+ */
+ if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
+ /* Already processed some buffers, break */
+ if (ret)
+ break;
+
+ if (flags & SPLICE_F_NONBLOCK) {
+ ret = -EAGAIN;
+ break;
+ }
+
+ /*
+ * We raced with another reader/writer and haven't
+ * managed to process any buffers. A zero return
+ * value means EOF, so retry instead.
+ */
+ pipe_unlock(ipipe);
+ pipe_unlock(opipe);
+ goto retry;
+ }
+
+ ibuf = ipipe->bufs + ipipe->curbuf;
+ nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
+ obuf = opipe->bufs + nbuf;
+
+ if (len >= ibuf->len) {
+ /*
+ * Simply move the whole buffer from ipipe to opipe
+ */
+ *obuf = *ibuf;
+ ibuf->ops = NULL;
+ opipe->nrbufs++;
+ ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
+ ipipe->nrbufs--;
+ input_wakeup = true;
+ } else {
+ /*
+ * Get a reference to this pipe buffer,
+ * so we can copy the contents over.
+ */
+ ibuf->ops->get(ipipe, ibuf);
+ *obuf = *ibuf;
+
+ /*
+ * Don't inherit the gift flag, we need to
+ * prevent multiple steals of this page.
+ */
+ obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+
+ obuf->len = len;
+ opipe->nrbufs++;
+ ibuf->offset += obuf->len;
+ ibuf->len -= obuf->len;
+ }
+ ret += obuf->len;
+ len -= obuf->len;
+ } while (len);
+
+ pipe_unlock(ipipe);
+ pipe_unlock(opipe);
+
+ /*
+ * If we put data in the output pipe, wakeup any potential readers.
+ */
+ if (ret > 0)
+ wakeup_pipe_readers(opipe);
+
+ if (input_wakeup)
+ wakeup_pipe_writers(ipipe);
+
return ret;
}
@@ -1622,10 +1915,10 @@ static int link_pipe(struct pipe_inode_info *ipipe,
/*
* Potential ABBA deadlock, work around it by ordering lock
- * grabbing by inode address. Otherwise two different processes
+ * grabbing by pipe info address. Otherwise two different processes
* could deadlock (one doing tee from A -> B, the other from B -> A).
*/
- inode_double_lock(ipipe->inode, opipe->inode);
+ pipe_double_lock(ipipe, opipe);
do {
if (!opipe->readers) {
@@ -1639,11 +1932,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
* If we have iterated all input buffers or ran out of
* output room, break.
*/
- if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
+ if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
break;
- ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
- nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
+ ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
+ nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
/*
* Get a reference to this pipe buffer,
@@ -1676,17 +1969,14 @@ static int link_pipe(struct pipe_inode_info *ipipe,
if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
ret = -EAGAIN;
- inode_double_unlock(ipipe->inode, opipe->inode);
+ pipe_unlock(ipipe);
+ pipe_unlock(opipe);
/*
* If we put data in the output pipe, wakeup any potential readers.
*/
- if (ret > 0) {
- smp_mb();
- if (waitqueue_active(&opipe->wait))
- wake_up_interruptible(&opipe->wait);
- kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
- }
+ if (ret > 0)
+ wakeup_pipe_readers(opipe);
return ret;
}
@@ -1700,8 +1990,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
static long do_tee(struct file *in, struct file *out, size_t len,
unsigned int flags)
{
- struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
- struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
+ struct pipe_inode_info *ipipe = get_pipe_info(in);
+ struct pipe_inode_info *opipe = get_pipe_info(out);
int ret = -EINVAL;
/*
@@ -1713,9 +2003,9 @@ static long do_tee(struct file *in, struct file *out, size_t len,
* Keep going, unless we encounter an error. The ipipe/opipe
* ordering doesn't really matter.
*/
- ret = link_ipipe_prep(ipipe, flags);
+ ret = ipipe_prep(ipipe, flags);
if (!ret) {
- ret = link_opipe_prep(opipe, flags);
+ ret = opipe_prep(opipe, flags);
if (!ret)
ret = link_pipe(ipipe, opipe, len, flags);
}
@@ -1724,28 +2014,27 @@ static long do_tee(struct file *in, struct file *out, size_t len,
return ret;
}
-asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
+SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
{
- struct file *in;
- int error, fput_in;
+ struct fd in;
+ int error;
if (unlikely(!len))
return 0;
error = -EBADF;
- in = fget_light(fdin, &fput_in);
- if (in) {
- if (in->f_mode & FMODE_READ) {
- int fput_out;
- struct file *out = fget_light(fdout, &fput_out);
-
- if (out) {
- if (out->f_mode & FMODE_WRITE)
- error = do_tee(in, out, len, flags);
- fput_light(out, fput_out);
+ in = fdget(fdin);
+ if (in.file) {
+ if (in.file->f_mode & FMODE_READ) {
+ struct fd out = fdget(fdout);
+ if (out.file) {
+ if (out.file->f_mode & FMODE_WRITE)
+ error = do_tee(in.file, out.file,
+ len, flags);
+ fdput(out);
}
}
- fput_light(in, fput_in);
+ fdput(in);
}
return error;