From 72f465033702ebfe20db8f50edaad59f0f38b0f5 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 23 Aug 2010 13:35:09 +0200 Subject: bio-integrity.c: remove dependency on __GFP_NOFAIL The kmalloc() in bio_integrity_prep() is failable, so remove __GFP_NOFAIL from its mask. Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 612a5c38d3c..a8f4cc67998 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -413,7 +413,7 @@ int bio_integrity_prep(struct bio *bio) /* Allocate kernel buffer for protection data */ len = sectors * blk_integrity_tuple_size(bi); - buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp); + buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); return -EIO; -- cgit v1.2.3-18-g5258 From 220eb7fd984bfc7e6b4005fdf32efe9cd8af7cf2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 23 Aug 2010 13:36:20 +0200 Subject: fs/bio-integrity.c: return -ENOMEM on kmalloc failure Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index a8f4cc67998..4d0ff5ee27b 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -416,7 +416,7 @@ int bio_integrity_prep(struct bio *bio) buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); - return -EIO; + return -ENOMEM; } end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; -- cgit v1.2.3-18-g5258 From b76b4014f9d988d2412b873e4d4c13c7f9afc4e4 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 28 Aug 2010 08:52:10 +0200 Subject: writeback: Fix lost wake-up shutting down writeback thread Setting the task state here may cause us to miss the wake up from kthread_stop(), so we need to recheck kthread_should_stop() or risk sleeping forever in the following schedule(). Symptom was an indefinite hang on an NFSv4 mount. (NFSv4 may create multiple mounts in a temporary namespace while traversing the mount path, and since the temporary namespace is immediately destroyed, it may end up destroying a mount very soon after it was created, possibly making this race more likely.) INFO: task mount.nfs4:4314 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. mount.nfs4 D 0000000000000000 2880 4314 4313 0x00000000 ffff88001ed6da28 0000000000000046 ffff88001ed6dfd8 ffff88001ed6dfd8 ffff88001ed6c000 ffff88001ed6c000 ffff88001ed6c000 ffff88001e5003a0 ffff88001ed6dfd8 ffff88001e5003a8 ffff88001ed6c000 ffff88001ed6dfd8 Call Trace: [] schedule_timeout+0x1cd/0x2e0 [] ? mark_held_locks+0x6c/0xa0 [] ? _raw_spin_unlock_irq+0x30/0x60 [] ? trace_hardirqs_on_caller+0x14d/0x190 [] ? sub_preempt_count+0xe/0xd0 [] wait_for_common+0x120/0x190 [] ? default_wake_function+0x0/0x20 [] wait_for_completion+0x1d/0x20 [] kthread_stop+0x4a/0x150 [] ? thaw_process+0x70/0x80 [] bdi_unregister+0x10a/0x1a0 [] nfs_put_super+0x19/0x20 [] generic_shutdown_super+0x54/0xe0 [] kill_anon_super+0x16/0x60 [] nfs4_kill_super+0x39/0x90 [] deactivate_locked_super+0x45/0x60 [] deactivate_super+0x49/0x70 [] mntput_no_expire+0x84/0xe0 [] release_mounts+0x9f/0xc0 [] put_mnt_ns+0x65/0x80 [] nfs_follow_remote_path+0x1e6/0x420 [] nfs4_try_mount+0x6f/0xd0 [] nfs4_get_sb+0xa2/0x360 [] vfs_kern_mount+0x88/0x1f0 [] do_kern_mount+0x52/0x130 [] ? _lock_kernel+0x6a/0x170 [] do_mount+0x26e/0x7f0 [] ? copy_mount_options+0xea/0x190 [] sys_mount+0x98/0xf0 [] system_call_fastpath+0x16/0x1b 1 lock held by mount.nfs4/4314: #0: (&type->s_umount_key#24){+.+...}, at: [] deactivate_super+0x41/0x70 Signed-off-by: J. Bruce Fields Signed-off-by: Jens Axboe Acked-by: Artem Bityutskiy --- fs/fs-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 7d9d06ba184..81e086d8aa5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -808,7 +808,7 @@ int bdi_writeback_thread(void *data) wb->last_active = jiffies; set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&bdi->work_list)) { + if (!list_empty(&bdi->work_list) || kthread_should_stop()) { __set_current_state(TASK_RUNNING); continue; } -- cgit v1.2.3-18-g5258 From 4afc31345e5f543e5d89a47aeadaaad1d91a5bc8 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sun, 29 Aug 2010 01:55:38 +0900 Subject: nilfs2: fix leak of shadow dat inode in error path of load_nilfs If load_nilfs() gets an error while doing recovery, it will fail to free the shadow inode of dat (nilfs->ns_gc_dat). This fixes the leak issue. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/the_nilfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 4317f177ea7..ba7c10c917f 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -446,6 +446,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) nilfs_mdt_destroy(nilfs->ns_cpfile); nilfs_mdt_destroy(nilfs->ns_sufile); nilfs_mdt_destroy(nilfs->ns_dat); + nilfs_mdt_destroy(nilfs->ns_gc_dat); failed: nilfs_clear_recovery_info(&ri); -- cgit v1.2.3-18-g5258 From 8f587df479c3cea14ba1a9b9d58f34fd2fd6d58b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 4 Aug 2010 16:27:45 +0000 Subject: 9p: potential ERR_PTR() dereference p9_client_walk() can return error values if we run out of space or there is a problem with the network. Signed-off-by: Dan Carpenter Signed-off-by: Eric Van Hensbergen --- fs/9p/fid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 35856368906..6406f896bf9 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -242,7 +242,8 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) } kfree(wnames); fid_out: - v9fs_fid_add(dentry, fid); + if (!IS_ERR(fid)) + v9fs_fid_add(dentry, fid); err_out: up_read(&v9ses->rename_sem); return fid; -- cgit v1.2.3-18-g5258 From 9bc08a45fb117c696e4940cfa1208cb1cc7a2f25 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Sep 2010 15:14:38 +1000 Subject: xfs: improve buffer cache hash scalability When doing large parallel file creates on a 16p machines, large amounts of time is being spent in _xfs_buf_find(). A system wide profile with perf top shows this: 1134740.00 19.3% _xfs_buf_find 733142.00 12.5% __ticket_spin_lock The problem is that the hash contains 45,000 buffers, and the hash table width is only 256 buffers. That means we've got around 200 buffers per chain, and searching it is quite expensive. The hash table size needs to increase. Secondly, every time we do a lookup, we promote the buffer we find to the head of the hash chain. This is causing cachelines to be dirtied and causes invalidation of cachelines across all CPUs that may have walked the hash chain recently. hence every walk of the hash chain is effectively a cold cache walk. Remove the promotion to avoid this invalidation. The results are: 1045043.00 21.2% __ticket_spin_lock 326184.00 6.6% _xfs_buf_find A 70% drop in the CPU usage when looking up buffers. Unfortunately that does not result in an increase in performance underthis workload as contention on the inode_lock soaks up most of the reduction in CPU usage. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_buf.c | 8 +------- fs/xfs/linux-2.6/xfs_buf.h | 1 - 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index ea79072f521..d72cf2bb054 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -440,12 +440,7 @@ _xfs_buf_find( ASSERT(btp == bp->b_target); if (bp->b_file_offset == range_base && bp->b_buffer_length == range_length) { - /* - * If we look at something, bring it to the - * front of the list for next time. - */ atomic_inc(&bp->b_hold); - list_move(&bp->b_hash_list, &hash->bh_list); goto found; } } @@ -1443,8 +1438,7 @@ xfs_alloc_bufhash( { unsigned int i; - btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ - btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; + btp->bt_hashshift = external ? 3 : 12; /* 8 or 4096 buckets */ btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t)); for (i = 0; i < (1 << btp->bt_hashshift); i++) { diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index d072e5ff923..2a05614f0b9 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -137,7 +137,6 @@ typedef struct xfs_buftarg { size_t bt_smask; /* per device buffer hash table */ - uint bt_hashmask; uint bt_hashshift; xfs_bufhash_t *bt_hash; -- cgit v1.2.3-18-g5258 From 23963e54ce187ca6e907c83176c15508b0f6e60d Mon Sep 17 00:00:00 2001 From: Arkadiusz Mi?kiewicz Date: Thu, 26 Aug 2010 10:19:43 +0000 Subject: xfs: Disallow 32bit project quota id Currently on-disk structure is able to keep only 16bit project quota id, so disallow 32bit ones. This fixes a problem where parts of kernel structures holding project quota id are 32bit while parts (on-disk) are 16bit variables which causes project quota member files to be inaccessible for some operations (like mv/rm). Signed-off-by: Arkadiusz Mi?kiewicz Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_ioctl.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 237f5ffb2ee..4fec427b83e 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -906,6 +906,13 @@ xfs_ioctl_setattr( if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); + /* + * Disallow 32bit project ids because on-disk structure + * is 16bit only. + */ + if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1)) + return XFS_ERROR(EINVAL); + /* * If disk quotas is on, we make sure that the dquots do exist on disk, * before we start any other transactions. Trying to do this later -- cgit v1.2.3-18-g5258 From 8f34a430ac16d5fbd9d6b383184d35e152f5a963 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 2 Sep 2010 15:23:16 -0400 Subject: nfsd4: mask out non-access bits in nfs4_access_to_omode This fixes an unnecessary BUG(). Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3dfef062396..cf0d2ffb3c8 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -440,7 +440,7 @@ test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) { static int nfs4_access_to_omode(u32 access) { - switch (access) { + switch (access & NFS4_SHARE_ACCESS_BOTH) { case NFS4_SHARE_ACCESS_READ: return O_RDONLY; case NFS4_SHARE_ACCESS_WRITE: -- cgit v1.2.3-18-g5258 From 72656c46f50b8dfe50e15793692982e636e3df20 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 3 Sep 2010 12:19:33 +1000 Subject: xfs: prevent 32bit overflow in space reservation If we attempt to preallocate more than 2^32 blocks of space in a single syscall, the transaction block reservation will overflow leading to a hangs in the superblock block accounting code. This is trivially reproduced with xfs_io. Fix the problem by capping the allocation reservation to the maximum number of blocks a single xfs_bmapi() call can allocate (2^21 blocks). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_vnodeops.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 66d585c6917..4c7c7bfb2b2 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2299,15 +2299,22 @@ xfs_alloc_file_space( e = allocatesize_fsb; } + /* + * The transaction reservation is limited to a 32-bit block + * count, hence we need to limit the number of blocks we are + * trying to reserve to avoid an overflow. We can't allocate + * more than @nimaps extents, and an extent is limited on disk + * to MAXEXTLEN (21 bits), so use that to enforce the limit. + */ + resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps)); if (unlikely(rt)) { - resrtextents = qblocks = (uint)(e - s); + resrtextents = qblocks = resblks; resrtextents /= mp->m_sb.sb_rextsize; resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); quota_flag = XFS_QMOPT_RES_RTBLKS; } else { resrtextents = 0; - resblks = qblocks = \ - XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s)); + resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks); quota_flag = XFS_QMOPT_RES_REGBLKS; } -- cgit v1.2.3-18-g5258 From 9af25465081480a75824fd7a16a37a5cfebeede9 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 30 Aug 2010 02:44:03 +0000 Subject: xfs: Make fiemap work with sparse files In xfs_vn_fiemap, we set bvm_count to fi_extent_max + 1 and want to return fi_extent_max extents, but actually it won't work for a sparse file. The reason is that in xfs_getbmap we will calculate holes and set it in 'out', while out is malloced by bmv_count(fi_extent_max+1) which didn't consider holes. So in the worst case, if 'out' vector looks like [hole, extent, hole, extent, hole, ... hole, extent, hole], we will only return half of fi_extent_max extents. This patch add a new parameter BMV_IF_NO_HOLES for bvm_iflags. So with this flags, we don't use our 'out' in xfs_getbmap for a hole. The solution is a bit ugly by just don't increasing index of 'out' vector. I felt that it is not easy to skip it at the very beginning since we have the complicated check and some function like xfs_getbmapx_fix_eof_hole to adjust 'out'. Cc: Dave Chinner Signed-off-by: Tao Ma Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_iops.c | 2 +- fs/xfs/xfs_bmap.c | 14 +++++++++++++- fs/xfs/xfs_fs.h | 4 +++- 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 68be25dcd30..b1fc2a6bfe8 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -664,7 +664,7 @@ xfs_vn_fiemap( fieinfo->fi_extents_max + 1; bm.bmv_count = min_t(__s32, bm.bmv_count, (PAGE_SIZE * 16 / sizeof(struct getbmapx))); - bm.bmv_iflags = BMV_IF_PREALLOC; + bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) bm.bmv_iflags |= BMV_IF_ATTRFORK; if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 23f14e595c1..f90dadd5a96 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5533,12 +5533,24 @@ xfs_getbmap( map[i].br_startblock)) goto out_free_map; - nexleft--; bmv->bmv_offset = out[cur_ext].bmv_offset + out[cur_ext].bmv_length; bmv->bmv_length = max_t(__int64_t, 0, bmvend - bmv->bmv_offset); + + /* + * In case we don't want to return the hole, + * don't increase cur_ext so that we can reuse + * it in the next loop. + */ + if ((iflags & BMV_IF_NO_HOLES) && + map[i].br_startblock == HOLESTARTBLOCK) { + memset(&out[cur_ext], 0, sizeof(out[cur_ext])); + continue; + } + + nexleft--; bmv->bmv_entries++; cur_ext++; } diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 7cf7220e7d5..87c2e9d0228 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -114,8 +114,10 @@ struct getbmapx { #define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */ #define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ #define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */ +#define BMV_IF_NO_HOLES 0x10 /* Do not return holes */ #define BMV_IF_VALID \ - (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC) + (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \ + BMV_IF_DELALLOC|BMV_IF_NO_HOLES) /* bmv_oflags values - returned for each non-header segment */ #define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ -- cgit v1.2.3-18-g5258 From 57f9bdac2510cd7fda58e4a111d250861eb1ebeb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Aug 2010 09:12:29 +0200 Subject: sysfs: checking for NULL instead of ERR_PTR d_path() returns an ERR_PTR and it doesn't return NULL. Signed-off-by: Dan Carpenter Cc: stable Reviewed-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1b27b5688f6..da3fefe91a8 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -340,7 +340,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file) char *p; p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file)); - if (p) + if (!IS_ERR(p)) memmove(last_sysfs_file, p, strlen(p) + 1); /* need attr_sd for attr and ops, its parent for kobj */ -- cgit v1.2.3-18-g5258 From 595afaf9e6ee1b48e13ec4b8bcc8c7dee888161a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 7 Sep 2010 13:42:41 +0200 Subject: fuse: flush background queue on connection close David Bartly reported that fuse can hang in fuse_get_req_nofail() when the connection to the filesystem server is no longer active. If bg_queue is not empty then flush_bg_queue() called from request_end() can put more requests on to the pending queue. If this happens while ending requests on the processing queue then those background requests will be queued to the pending list and never ended. Another problem is that fuse_dev_release() didn't wake up processes sleeping on blocked_waitq. Solve this by: a) flushing the background queue before calling end_requests() on the pending and processing queues b) setting blocked = 0 and waking up processes waiting on blocked_waitq() Thanks to David for an excellent bug report. Reported-by: David Bartley Signed-off-by: Miklos Szeredi CC: stable@kernel.org --- fs/fuse/dev.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 69ad053ffd7..b4fc47ff4bc 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1769,6 +1769,14 @@ __acquires(&fc->lock) } } +static void end_queued_requests(struct fuse_conn *fc) +{ + fc->max_background = UINT_MAX; + flush_bg_queue(fc); + end_requests(fc, &fc->pending); + end_requests(fc, &fc->processing); +} + /* * Abort all requests. * @@ -1795,8 +1803,7 @@ void fuse_abort_conn(struct fuse_conn *fc) fc->connected = 0; fc->blocked = 0; end_io_requests(fc); - end_requests(fc, &fc->pending); - end_requests(fc, &fc->processing); + end_queued_requests(fc); wake_up_all(&fc->waitq); wake_up_all(&fc->blocked_waitq); kill_fasync(&fc->fasync, SIGIO, POLL_IN); @@ -1811,8 +1818,9 @@ int fuse_dev_release(struct inode *inode, struct file *file) if (fc) { spin_lock(&fc->lock); fc->connected = 0; - end_requests(fc, &fc->pending); - end_requests(fc, &fc->processing); + fc->blocked = 0; + end_queued_requests(fc); + wake_up_all(&fc->blocked_waitq); spin_unlock(&fc->lock); fuse_conn_put(fc); } -- cgit v1.2.3-18-g5258 From b9ca67b2ddf021491a3741d9555e8ff59b2175ba Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 7 Sep 2010 13:42:41 +0200 Subject: fuse: fix lock annotations Sparse doesn't understand lock annotations of the form __releases(&foo->lock). Change them to __releases(foo->lock). Same for __acquires(). Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 26 ++++++++++++++------------ fs/fuse/file.c | 8 ++++---- 2 files changed, 18 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b4fc47ff4bc..d367af1514e 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -276,7 +276,7 @@ static void flush_bg_queue(struct fuse_conn *fc) * Called with fc->lock, unlocks it */ static void request_end(struct fuse_conn *fc, struct fuse_req *req) -__releases(&fc->lock) +__releases(fc->lock) { void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; req->end = NULL; @@ -306,8 +306,8 @@ __releases(&fc->lock) static void wait_answer_interruptible(struct fuse_conn *fc, struct fuse_req *req) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { if (signal_pending(current)) return; @@ -325,8 +325,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req) } static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { if (!fc->no_interrupt) { /* Any signal may interrupt this */ @@ -905,8 +905,8 @@ static int request_pending(struct fuse_conn *fc) /* Wait until a request is available on the pending list */ static void request_wait(struct fuse_conn *fc) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { DECLARE_WAITQUEUE(wait, current); @@ -934,7 +934,7 @@ __acquires(&fc->lock) */ static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs, size_t nbytes, struct fuse_req *req) -__releases(&fc->lock) +__releases(fc->lock) { struct fuse_in_header ih; struct fuse_interrupt_in arg; @@ -1720,8 +1720,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait) * This function releases and reacquires fc->lock */ static void end_requests(struct fuse_conn *fc, struct list_head *head) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { while (!list_empty(head)) { struct fuse_req *req; @@ -1744,8 +1744,8 @@ __acquires(&fc->lock) * locked). */ static void end_io_requests(struct fuse_conn *fc) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { while (!list_empty(&fc->io)) { struct fuse_req *req = @@ -1770,6 +1770,8 @@ __acquires(&fc->lock) } static void end_queued_requests(struct fuse_conn *fc) +__releases(fc->lock) +__acquires(fc->lock) { fc->max_background = UINT_MAX; flush_bg_queue(fc); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 147c1f71bdb..c8224587123 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1144,8 +1144,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) /* Called under fc->lock, may release and reacquire it */ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { struct fuse_inode *fi = get_fuse_inode(req->inode); loff_t size = i_size_read(req->inode); @@ -1183,8 +1183,8 @@ __acquires(&fc->lock) * Called with fc->lock */ void fuse_flush_writepages(struct inode *inode) -__releases(&fc->lock) -__acquires(&fc->lock) +__releases(fc->lock) +__acquires(fc->lock) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); -- cgit v1.2.3-18-g5258 From 7a2e8a8faab76386d8eaae9ded739ee5615be174 Mon Sep 17 00:00:00 2001 From: Valerie Aurora Date: Thu, 26 Aug 2010 11:07:22 -0700 Subject: VFS: Sanity check mount flags passed to change_mnt_propagation() Sanity check the flags passed to change_mnt_propagation(). Exactly one flag should be set. Return EINVAL otherwise. Userspace can pass in arbitrary combinations of MS_* flags to mount(). do_change_type() is called if any of MS_SHARED, MS_PRIVATE, MS_SLAVE, or MS_UNBINDABLE is set. do_change_type() clears MS_REC and then calls change_mnt_propagation() with the rest of the user-supplied flags. change_mnt_propagation() clearly assumes only one flag is set but do_change_type() does not check that this is true. For example, mount() with flags MS_SHARED | MS_RDONLY does not actually make the mount shared or read-only but does clear MNT_UNBINDABLE. Signed-off-by: Valerie Aurora Signed-off-by: Linus Torvalds --- fs/namespace.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index de402eb6eaf..a72eaabfe8f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1483,6 +1483,23 @@ out_unlock: return err; } +/* + * Sanity check the flags to change_mnt_propagation. + */ + +static int flags_to_propagation_type(int flags) +{ + int type = flags & ~MS_REC; + + /* Fail if any non-propagation flags are set */ + if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) + return 0; + /* Only one propagation flag should be set */ + if (!is_power_of_2(type)) + return 0; + return type; +} + /* * recursively change the type of the mountpoint. */ @@ -1490,7 +1507,7 @@ static int do_change_type(struct path *path, int flag) { struct vfsmount *m, *mnt = path->mnt; int recurse = flag & MS_REC; - int type = flag & ~MS_REC; + int type; int err = 0; if (!capable(CAP_SYS_ADMIN)) @@ -1499,6 +1516,10 @@ static int do_change_type(struct path *path, int flag) if (path->dentry != path->mnt->mnt_root) return -EINVAL; + type = flags_to_propagation_type(flag); + if (!type) + return -EINVAL; + down_write(&namespace_sem); if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); -- cgit v1.2.3-18-g5258 From dc696aced9f09f05b1f927b93f5a7918017a3e49 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 12 Aug 2010 16:24:25 -0700 Subject: ocfs2: Fix metaecc error messages Like tools, the checksum validate function now prints the values in hex. Signed-off-by: Sunil Mushran Singed-off-by: Tao Ma --- fs/ocfs2/blockcheck.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index ec6d1233959..c7ee03c2222 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -439,7 +439,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize, ocfs2_blockcheck_inc_failure(stats); mlog(ML_ERROR, - "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", + "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n", (unsigned int)check.bc_crc32e, (unsigned int)crc); /* Ok, try ECC fixups */ @@ -453,7 +453,7 @@ int ocfs2_block_check_validate(void *data, size_t blocksize, goto out; } - mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", + mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n", (unsigned int)check.bc_crc32e, (unsigned int)crc); rc = -EIO; -- cgit v1.2.3-18-g5258 From f5ce5a08a40f2086435858ddc80cb40394b082eb Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 12 Aug 2010 16:24:26 -0700 Subject: ocfs2: Fix incorrect checksum validation error For local mounts, ocfs2_read_locked_inode() calls ocfs2_read_blocks_sync() to read the inode off the disk. The latter first checks to see if that block is cached in the journal, and, if so, returns that block. That is ok. But ocfs2_read_locked_inode() goes wrong when it tries to validate the checksum of such blocks. Blocks that are cached in the journal may not have had their checksum computed as yet. We should not validate the checksums of such blocks. Fixes ossbz#1282 http://oss.oracle.com/bugzilla/show_bug.cgi?id=1282 Signed-off-by: Sunil Mushran Cc: stable@kernel.org Singed-off-by: Tao Ma --- fs/ocfs2/inode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 0492464916b..eece3e05d9d 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -488,7 +488,11 @@ static int ocfs2_read_locked_inode(struct inode *inode, OCFS2_BH_IGNORE_CACHE); } else { status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); - if (!status) + /* + * If buffer is in jbd, then its checksum may not have been + * computed as yet. + */ + if (!status && !buffer_jbd(bh)) status = ocfs2_validate_inode_block(osb->sb, bh); } if (status < 0) { -- cgit v1.2.3-18-g5258 From f63afdb2c32db850fa1bfccf84643a8885cbeb61 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Sat, 17 Jul 2010 21:45:49 +0800 Subject: ocfs2: make __ocfs2_page_mkwrite handle file end properly. __ocfs2_page_mkwrite now is broken in handling file end. 1. the last page should be the page contains i_size - 1. 2. the len in the last page is also calculated wrong. So change them accordingly. Acked-by: Mark Fasheh Signed-off-by: Tao Ma --- fs/ocfs2/mmap.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index af2b8fe1f13..4c18f4ad93b 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -74,9 +74,11 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, /* * Another node might have truncated while we were waiting on * cluster locks. + * We don't check size == 0 before the shift. This is borrowed + * from do_generic_file_read. */ - last_index = size >> PAGE_CACHE_SHIFT; - if (page->index > last_index) { + last_index = (size - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!size || page->index > last_index)) { ret = -EINVAL; goto out; } @@ -107,7 +109,7 @@ static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, * because the "write" would invalidate their data. */ if (page->index == last_index) - len = size & ~PAGE_CACHE_MASK; + len = ((size - 1) & ~PAGE_CACHE_MASK) + 1; ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, &fsdata, di_bh, page); -- cgit v1.2.3-18-g5258 From 04eda1a18019bb387dc7e97ee99979dd88dc608a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 5 Aug 2010 20:32:45 +0200 Subject: ocfs2: Flush drive's caches on fdatasync When 'barrier' mount option is specified, we have to issue a cache flush during fdatasync(2). We have to do this even if inode doesn't have I_DIRTY_DATASYNC set because we still have to get written *data* to disk so that they are not lost in case of crash. Acked-by: Tao Ma Signed-off-by: Jan Kara Singed-off-by: Tao Ma --- fs/ocfs2/file.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 81296b4e364..6b2be0f2eac 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -36,6 +36,7 @@ #include #include #include +#include #define MLOG_MASK_PREFIX ML_INODE #include @@ -190,8 +191,16 @@ static int ocfs2_sync_file(struct file *file, int datasync) if (err) goto bail; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { + /* + * We still have to flush drive's caches to get data to the + * platter + */ + if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) + blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, + NULL, BLKDEV_IFL_WAIT); goto bail; + } journal = osb->journal->j_journal; err = jbd2_journal_force_commit(journal); -- cgit v1.2.3-18-g5258 From 889f004a8c83d515f275078687f859bc0d5ede9d Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 2 Sep 2010 13:10:10 +0800 Subject: ocfs2: Use the right group in nfs sync check. We have added discontig block group now, and now an inode can be allocated in an discontig block group. So get it in ocfs2_get_suballoc_slot_bit. The old ocfs2_test_suballoc_bit gets group block no from the allocation inode which is wrong. Fix it by passing the right group. Acked-by: Mark Fasheh Signed-off-by: Tao Ma --- fs/ocfs2/suballoc.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index a8e6a95a353..8a009ee1f7f 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2567,7 +2567,8 @@ out: * suballoc_bit. */ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, - u16 *suballoc_slot, u16 *suballoc_bit) + u16 *suballoc_slot, u64 *group_blkno, + u16 *suballoc_bit) { int status; struct buffer_head *inode_bh = NULL; @@ -2604,6 +2605,8 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno, *suballoc_slot = le16_to_cpu(inode_fe->i_suballoc_slot); if (suballoc_bit) *suballoc_bit = le16_to_cpu(inode_fe->i_suballoc_bit); + if (group_blkno) + *group_blkno = le64_to_cpu(inode_fe->i_suballoc_loc); bail: brelse(inode_bh); @@ -2621,7 +2624,8 @@ bail: */ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, struct inode *suballoc, - struct buffer_head *alloc_bh, u64 blkno, + struct buffer_head *alloc_bh, + u64 group_blkno, u64 blkno, u16 bit, int *res) { struct ocfs2_dinode *alloc_di; @@ -2642,10 +2646,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb, goto bail; } - if (alloc_di->i_suballoc_loc) - bg_blkno = le64_to_cpu(alloc_di->i_suballoc_loc); - else - bg_blkno = ocfs2_which_suballoc_group(blkno, bit); + bg_blkno = group_blkno ? group_blkno : + ocfs2_which_suballoc_group(blkno, bit); status = ocfs2_read_group_descriptor(suballoc, alloc_di, bg_blkno, &group_bh); if (status < 0) { @@ -2680,6 +2682,7 @@ bail: int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) { int status; + u64 group_blkno = 0; u16 suballoc_bit = 0, suballoc_slot = 0; struct inode *inode_alloc_inode; struct buffer_head *alloc_bh = NULL; @@ -2687,7 +2690,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) mlog_entry("blkno: %llu", (unsigned long long)blkno); status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, - &suballoc_bit); + &group_blkno, &suballoc_bit); if (status < 0) { mlog(ML_ERROR, "get alloc slot and bit failed %d\n", status); goto bail; @@ -2715,7 +2718,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) } status = ocfs2_test_suballoc_bit(osb, inode_alloc_inode, alloc_bh, - blkno, suballoc_bit, res); + group_blkno, blkno, suballoc_bit, res); if (status < 0) mlog(ML_ERROR, "test suballoc bit failed %d\n", status); -- cgit v1.2.3-18-g5258 From b2b6ebf5f740e015b2155343958f067e594323ea Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 26 Aug 2010 13:06:50 -0700 Subject: ocfs2: properly set and use inode group alloc hint We were setting ac->ac_last_group in ocfs2_claim_suballoc_bits from res->sr_bg_blkno. Unfortunately, res->sr_bg_blkno is going to be zero under normal (non-fragmented) circumstances. The discontig block group patches effectively turned off that feature. Fix this by correctly calculating what the next group hint should be. Acked-by: Tao Ma Signed-off-by: Mark Fasheh Tested-by: Goldwyn Rodrigues Signed-off-by: Tao Ma --- fs/ocfs2/suballoc.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 8a009ee1f7f..b93d7e72175 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -62,6 +62,17 @@ struct ocfs2_suballoc_result { unsigned int sr_bits; /* How many bits we claimed */ }; +static u64 ocfs2_group_from_res(struct ocfs2_suballoc_result *res) +{ + if (res->sr_blkno == 0) + return 0; + + if (res->sr_bg_blkno) + return res->sr_bg_blkno; + + return ocfs2_which_suballoc_group(res->sr_blkno, res->sr_bit_offset); +} + static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); @@ -1845,6 +1856,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, int status; u16 victim, i; u16 bits_left = 0; + u64 hint = ac->ac_last_group; struct ocfs2_chain_list *cl; struct ocfs2_dinode *fe; @@ -1872,7 +1884,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, goto bail; } - res->sr_bg_blkno = ac->ac_last_group; + res->sr_bg_blkno = hint; if (res->sr_bg_blkno) { /* Attempt to short-circuit the usual search mechanism * by jumping straight to the most recently used @@ -1896,8 +1908,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); - if (!status) + if (!status) { + hint = ocfs2_group_from_res(res); goto set_hint; + } if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; @@ -1920,8 +1934,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, ac->ac_chain = i; status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); - if (!status) + if (!status) { + hint = ocfs2_group_from_res(res); break; + } if (status < 0 && status != -ENOSPC) { mlog_errno(status); goto bail; @@ -1936,7 +1952,7 @@ set_hint: if (bits_left < min_bits) ac->ac_last_group = 0; else - ac->ac_last_group = res->sr_bg_blkno; + ac->ac_last_group = hint; } bail: -- cgit v1.2.3-18-g5258 From 9b4c0ff32ccd87ab52d4c5bd0a0536febce11370 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 24 Aug 2010 14:28:03 +0200 Subject: ocfs2: Fix deadlock when allocating page We cannot call grab_cache_page() when holding filesystem locks or with a transaction started as grab_cache_page() calls page allocation with GFP_KERNEL flag and thus page reclaim can recurse back into the filesystem causing deadlocks or various assertion failures. We have to use find_or_create_page() instead and pass it GFP_NOFS as we do with other allocations. Acked-by: Mark Fasheh Signed-off-by: Jan Kara Signed-off-by: Tao Ma --- fs/ocfs2/alloc.c | 2 +- fs/ocfs2/file.c | 2 +- fs/ocfs2/refcounttree.c | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 215e12ce1d8..592fae5007d 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6672,7 +6672,7 @@ int ocfs2_grab_pages(struct inode *inode, loff_t start, loff_t end, last_page_bytes = PAGE_ALIGN(end); index = start >> PAGE_CACHE_SHIFT; do { - pages[numpages] = grab_cache_page(mapping, index); + pages[numpages] = find_or_create_page(mapping, index, GFP_NOFS); if (!pages[numpages]) { ret = -ENOMEM; mlog_errno(ret); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6b2be0f2eac..2caa3a7a1a3 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -783,7 +783,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); BUG_ON(abs_from & (inode->i_blkbits - 1)); - page = grab_cache_page(mapping, index); + page = find_or_create_page(mapping, index, GFP_NOFS); if (!page) { ret = -ENOMEM; mlog_errno(ret); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 73a11ccfd4c..0afeda83120 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2960,7 +2960,7 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle, if (map_end & (PAGE_CACHE_SIZE - 1)) to = map_end & (PAGE_CACHE_SIZE - 1); - page = grab_cache_page(mapping, page_index); + page = find_or_create_page(mapping, page_index, GFP_NOFS); /* * In case PAGE_CACHE_SIZE <= CLUSTER_SIZE, This page @@ -3179,7 +3179,8 @@ static int ocfs2_cow_sync_writeback(struct super_block *sb, if (map_end > end) map_end = end; - page = grab_cache_page(context->inode->i_mapping, page_index); + page = find_or_create_page(context->inode->i_mapping, + page_index, GFP_NOFS); BUG_ON(!page); wait_on_page_writeback(page); -- cgit v1.2.3-18-g5258 From 81c8c82b5a39f9127e8b239e9b406a6c3a41b228 Mon Sep 17 00:00:00 2001 From: Tristan Ye Date: Thu, 19 Aug 2010 15:15:00 +0800 Subject: Ocfs2: Fix a regression bug from mainline commit(6b933c8e6f1a2f3118082c455eef25f9b1ac7b45). The patch is to fix the regression bug brought from commit 6b933c8...( 'ocfs2: Avoid direct write if we fall back to buffered I/O'): http://oss.oracle.com/bugzilla/show_bug.cgi?id=1285 The commit 6b933c8e6f1a2f3118082c455eef25f9b1ac7b45 changed __generic_file_aio_write to generic_file_buffered_write, which didn't call filemap_{write,wait}_range to flush the pagecaches when we were falling O_DIRECT writes back to buffered ones. it did hurt the O_DIRECT semantics somehow in extented odirect writes. This patch tries to guarantee O_DIRECT writes of 'fall back to buffered' to be correctly flushed. Signed-off-by: Tristan Ye Signed-off-by: Tao Ma --- fs/ocfs2/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 2caa3a7a1a3..9a03c151b5c 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2338,7 +2338,7 @@ out_dio: BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || - ((file->f_flags & O_DIRECT) && has_refcount)) { + ((file->f_flags & O_DIRECT) && !direct_io)) { ret = filemap_fdatawrite_range(file->f_mapping, pos, pos + count - 1); if (ret < 0) -- cgit v1.2.3-18-g5258 From 021960cab320ae3cc4e9aba9cca42f9f5ce785f3 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 13 Aug 2010 15:15:15 -0700 Subject: ocfs2: split out inode alloc code from ocfs2_mknod_locked Do this by splitting the bulk of the function away from the inode allocation code at the very tom of ocfs2_mknod_locked(). Existing callers don't need to change and won't see any difference. The new function created, __ocfs2_mknod_locked() will be used shortly. Signed-off-by: Mark Fasheh Signed-off-by: Tao Ma --- fs/ocfs2/namei.c | 55 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index f171b51a74f..2aa66b695fa 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -472,32 +472,23 @@ leave: return status; } -static int ocfs2_mknod_locked(struct ocfs2_super *osb, - struct inode *dir, - struct inode *inode, - dev_t dev, - struct buffer_head **new_fe_bh, - struct buffer_head *parent_fe_bh, - handle_t *handle, - struct ocfs2_alloc_context *inode_ac) +static int __ocfs2_mknod_locked(struct inode *dir, + struct inode *inode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *inode_ac, + u64 fe_blkno, u64 suballoc_loc, u16 suballoc_bit) { int status = 0; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_dinode *fe = NULL; struct ocfs2_extent_list *fel; - u64 suballoc_loc, fe_blkno = 0; - u16 suballoc_bit; u16 feat; *new_fe_bh = NULL; - status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh, - inode_ac, &suballoc_loc, - &suballoc_bit, &fe_blkno); - if (status < 0) { - mlog_errno(status); - goto leave; - } - /* populate as many fields early on as possible - many of * these are used by the support functions here and in * callers. */ @@ -591,6 +582,34 @@ leave: return status; } +static int ocfs2_mknod_locked(struct ocfs2_super *osb, + struct inode *dir, + struct inode *inode, + dev_t dev, + struct buffer_head **new_fe_bh, + struct buffer_head *parent_fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *inode_ac) +{ + int status = 0; + u64 suballoc_loc, fe_blkno = 0; + u16 suballoc_bit; + + *new_fe_bh = NULL; + + status = ocfs2_claim_new_inode(handle, dir, parent_fe_bh, + inode_ac, &suballoc_loc, + &suballoc_bit, &fe_blkno); + if (status < 0) { + mlog_errno(status); + return status; + } + + return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, + parent_fe_bh, handle, inode_ac, + fe_blkno, suballoc_loc, suballoc_bit); +} + static int ocfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) -- cgit v1.2.3-18-g5258 From d51349829c378c06ba4aa7d4b16ca23739858608 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 13 Aug 2010 15:15:16 -0700 Subject: ocfs2: use ocfs2_alloc_dinode_update_counts() instead of open coding ocfs2_search_chain() makes the same updates as ocfs2_alloc_dinode_update_counts to the alloc inode. Instead of open coding the bitmap update, use our helper function. Signed-off-by: Mark Fasheh Signed-off-by: Tao Ma --- fs/ocfs2/suballoc.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index b93d7e72175..e7edda8c6a1 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1719,7 +1719,6 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, { int status; u16 chain; - u32 tmp_used; u64 next_group; struct inode *alloc_inode = ac->ac_inode; struct buffer_head *group_bh = NULL; @@ -1807,22 +1806,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, } } - /* Ok, claim our bits now: set the info on dinode, chainlist - * and then the group */ - status = ocfs2_journal_access_di(handle, - INODE_CACHE(alloc_inode), - ac->ac_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { + status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, + ac->ac_bh, res->sr_bits, + chain); + if (status) { mlog_errno(status); goto bail; } - tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used); - fe->id1.bitmap1.i_used = cpu_to_le32(res->sr_bits + tmp_used); - le32_add_cpu(&cl->cl_recs[chain].c_free, -res->sr_bits); - ocfs2_journal_dirty(handle, ac->ac_bh); - status = ocfs2_block_group_set_bits(handle, alloc_inode, bg, -- cgit v1.2.3-18-g5258 From e49e27674d1dd2717ad90b21ece8f83102153315 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 13 Aug 2010 15:15:17 -0700 Subject: ocfs2: allow return of new inode block location before allocation of the inode This allows code which needs to know the eventual block number of an inode but can't allocate it yet due to transaction or lock ordering. For example, ocfs2_create_inode_in_orphan() currently gives a junk blkno for preparation of the orphan dir because it can't yet know where the actual inode is placed - that code is actually in ocfs2_mknod_locked. This is a problem when the orphan dirs are indexed as the junk inode number will create an index entry which goes unused (and fails the later removal from the orphan dir). Now with these interfaces, ocfs2_create_inode_in_orphan() can run the block group search (and get back the inode block number) *before* any actual allocation occurs. Signed-off-by: Mark Fasheh Signed-off-by: Tao Ma --- fs/ocfs2/suballoc.c | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/suballoc.h | 21 +++++++ 2 files changed, 180 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index e7edda8c6a1..8a286f54dca 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -57,6 +57,12 @@ struct ocfs2_suballoc_result { u64 sr_bg_blkno; /* The bg we allocated from. Set to 0 when a block group is contiguous. */ + u64 sr_bg_stable_blkno; /* + * Doesn't change, always + * set to target block + * group descriptor + * block. + */ u64 sr_blkno; /* The first allocated block */ unsigned int sr_bit_offset; /* The bit in the bg */ unsigned int sr_bits; /* How many bits we claimed */ @@ -149,6 +155,10 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) brelse(ac->ac_bh); ac->ac_bh = NULL; ac->ac_resv = NULL; + if (ac->ac_find_loc_priv) { + kfree(ac->ac_find_loc_priv); + ac->ac_find_loc_priv = NULL; + } } void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) @@ -1689,6 +1699,15 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, if (!ret) ocfs2_bg_discontig_fix_result(ac, gd, res); + /* + * sr_bg_blkno might have been changed by + * ocfs2_bg_discontig_fix_result + */ + res->sr_bg_stable_blkno = group_bh->b_blocknr; + + if (ac->ac_find_loc_only) + goto out_loc_only; + ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, res->sr_bits, le16_to_cpu(gd->bg_chain)); @@ -1702,6 +1721,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, if (ret < 0) mlog_errno(ret); +out_loc_only: *bits_left = le16_to_cpu(gd->bg_free_bits_count); out: @@ -1780,6 +1800,11 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, if (!status) ocfs2_bg_discontig_fix_result(ac, bg, res); + /* + * sr_bg_blkno might have been changed by + * ocfs2_bg_discontig_fix_result + */ + res->sr_bg_stable_blkno = group_bh->b_blocknr; /* * Keep track of previous block descriptor read. When @@ -1806,6 +1831,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, } } + if (ac->ac_find_loc_only) + goto out_loc_only; + status = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh, res->sr_bits, chain); @@ -1828,6 +1856,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits, (unsigned long long)le64_to_cpu(fe->i_blkno)); +out_loc_only: *bits_left = le16_to_cpu(bg->bg_free_bits_count); bail: brelse(group_bh); @@ -2023,6 +2052,136 @@ static inline void ocfs2_save_inode_ac_group(struct inode *dir, OCFS2_I(dir)->ip_last_used_slot = ac->ac_alloc_slot; } +int ocfs2_find_new_inode_loc(struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *fe_blkno) +{ + int ret; + handle_t *handle = NULL; + struct ocfs2_suballoc_result *res; + + BUG_ON(!ac); + BUG_ON(ac->ac_bits_given != 0); + BUG_ON(ac->ac_bits_wanted != 1); + BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE); + + res = kzalloc(sizeof(*res), GFP_NOFS); + if (res == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ocfs2_init_inode_ac_group(dir, parent_fe_bh, ac); + + /* + * The handle started here is for chain relink. Alternatively, + * we could just disable relink for these calls. + */ + handle = ocfs2_start_trans(OCFS2_SB(dir->i_sb), OCFS2_SUBALLOC_ALLOC); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + /* + * This will instruct ocfs2_claim_suballoc_bits and + * ocfs2_search_one_group to search but save actual allocation + * for later. + */ + ac->ac_find_loc_only = 1; + + ret = ocfs2_claim_suballoc_bits(ac, handle, 1, 1, res); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ac->ac_find_loc_priv = res; + *fe_blkno = res->sr_blkno; + +out: + if (handle) + ocfs2_commit_trans(OCFS2_SB(dir->i_sb), handle); + + if (ret) + kfree(res); + + return ret; +} + +int ocfs2_claim_new_inode_at_loc(handle_t *handle, + struct inode *dir, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 di_blkno) +{ + int ret; + u16 chain; + struct ocfs2_suballoc_result *res = ac->ac_find_loc_priv; + struct buffer_head *bg_bh = NULL; + struct ocfs2_group_desc *bg; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) ac->ac_bh->b_data; + + /* + * Since di_blkno is being passed back in, we check for any + * inconsistencies which may have happened between + * calls. These are code bugs as di_blkno is not expected to + * change once returned from ocfs2_find_new_inode_loc() + */ + BUG_ON(res->sr_blkno != di_blkno); + + ret = ocfs2_read_group_descriptor(ac->ac_inode, di, + res->sr_bg_stable_blkno, &bg_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + bg = (struct ocfs2_group_desc *) bg_bh->b_data; + chain = le16_to_cpu(bg->bg_chain); + + ret = ocfs2_alloc_dinode_update_counts(ac->ac_inode, handle, + ac->ac_bh, res->sr_bits, + chain); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_block_group_set_bits(handle, + ac->ac_inode, + bg, + bg_bh, + res->sr_bit_offset, + res->sr_bits); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits, + (unsigned long long)di_blkno); + + atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); + + BUG_ON(res->sr_bits != 1); + + *suballoc_loc = res->sr_bg_blkno; + *suballoc_bit = res->sr_bit_offset; + ac->ac_bits_given++; + ocfs2_save_inode_ac_group(dir, ac); + +out: + brelse(bg_bh); + + return ret; +} + int ocfs2_claim_new_inode(handle_t *handle, struct inode *dir, struct buffer_head *parent_fe_bh, diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index a017dd3ee7d..b8afabfeede 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -56,6 +56,9 @@ struct ocfs2_alloc_context { u64 ac_max_block; /* Highest block number to allocate. 0 is is the same as ~0 - unlimited */ + int ac_find_loc_only; /* hack for reflink operation ordering */ + struct ocfs2_suballoc_result *ac_find_loc_priv; /* */ + struct ocfs2_alloc_reservation *ac_resv; }; @@ -197,4 +200,22 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et, struct ocfs2_alloc_context **meta_ac); int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res); + + + +/* + * The following two interfaces are for ocfs2_create_inode_in_orphan(). + */ +int ocfs2_find_new_inode_loc(struct inode *dir, + struct buffer_head *parent_fe_bh, + struct ocfs2_alloc_context *ac, + u64 *fe_blkno); + +int ocfs2_claim_new_inode_at_loc(handle_t *handle, + struct inode *dir, + struct ocfs2_alloc_context *ac, + u64 *suballoc_loc, + u16 *suballoc_bit, + u64 di_blkno); + #endif /* _CHAINALLOC_H_ */ -- cgit v1.2.3-18-g5258 From dd43bcde23c527f64897eef41aa1fed2c9905ea9 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 13 Aug 2010 15:15:18 -0700 Subject: ocfs2: split out ocfs2_prepare_orphan_dir() into locking and prep functions We do this because ocfs2_create_inode_in_orphan() wants to order locking of the orphan dir with respect to locking of the inode allocator *before* making any changes to the directory. Signed-off-by: Mark Fasheh Signed-off-by: Tao Ma --- fs/ocfs2/namei.c | 120 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 88 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 2aa66b695fa..54c62985535 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1871,61 +1871,117 @@ bail: return status; } -static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, - struct inode **ret_orphan_dir, - u64 blkno, - char *name, - struct ocfs2_dir_lookup_result *lookup) +static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb, + struct inode **ret_orphan_dir, + struct buffer_head **ret_orphan_dir_bh) { struct inode *orphan_dir_inode; struct buffer_head *orphan_dir_bh = NULL; - int status = 0; - - status = ocfs2_blkno_stringify(blkno, name); - if (status < 0) { - mlog_errno(status); - return status; - } + int ret = 0; orphan_dir_inode = ocfs2_get_system_file_inode(osb, ORPHAN_DIR_SYSTEM_INODE, osb->slot_num); if (!orphan_dir_inode) { - status = -ENOENT; - mlog_errno(status); - return status; + ret = -ENOENT; + mlog_errno(ret); + return ret; } mutex_lock(&orphan_dir_inode->i_mutex); - status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); - if (status < 0) { - mlog_errno(status); - goto leave; + ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (ret < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); + iput(orphan_dir_inode); + + mlog_errno(ret); + return ret; } - status = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, - orphan_dir_bh, name, - OCFS2_ORPHAN_NAMELEN, lookup); - if (status < 0) { - ocfs2_inode_unlock(orphan_dir_inode, 1); + *ret_orphan_dir = orphan_dir_inode; + *ret_orphan_dir_bh = orphan_dir_bh; - mlog_errno(status); - goto leave; + return 0; +} + +static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode, + struct buffer_head *orphan_dir_bh, + u64 blkno, + char *name, + struct ocfs2_dir_lookup_result *lookup) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb); + + ret = ocfs2_blkno_stringify(blkno, name); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, + orphan_dir_bh, name, + OCFS2_ORPHAN_NAMELEN, lookup); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + return 0; +} + +/** + * ocfs2_prepare_orphan_dir() - Prepare an orphan directory for + * insertion of an orphan. + * @osb: ocfs2 file system + * @ret_orphan_dir: Orphan dir inode - returned locked! + * @blkno: Actual block number of the inode to be inserted into orphan dir. + * @lookup: dir lookup result, to be passed back into functions like + * ocfs2_orphan_add + * + * Returns zero on success and the ret_orphan_dir, name and lookup + * fields will be populated. + * + * Returns non-zero on failure. + */ +static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, + struct inode **ret_orphan_dir, + u64 blkno, + char *name, + struct ocfs2_dir_lookup_result *lookup) +{ + struct inode *orphan_dir_inode = NULL; + struct buffer_head *orphan_dir_bh = NULL; + int ret = 0; + + ret = ocfs2_lookup_lock_orphan_dir(osb, &orphan_dir_inode, + &orphan_dir_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh, + blkno, name, lookup); + if (ret < 0) { + mlog_errno(ret); + goto out; } *ret_orphan_dir = orphan_dir_inode; -leave: - if (status) { +out: + brelse(orphan_dir_bh); + + if (ret) { + ocfs2_inode_unlock(orphan_dir_inode, 1); mutex_unlock(&orphan_dir_inode->i_mutex); iput(orphan_dir_inode); } - brelse(orphan_dir_bh); - - mlog_exit(status); - return status; + mlog_exit(ret); + return ret; } static int ocfs2_orphan_add(struct ocfs2_super *osb, -- cgit v1.2.3-18-g5258 From 97b8f4a9dfd932997677136e11980eb2fafea91d Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 13 Aug 2010 15:15:19 -0700 Subject: ocfs2: Fix orphan add in ocfs2_create_inode_in_orphan ocfs2_create_inode_in_orphan() is used by reflink to create the newly reflinked inode simultaneously in the orphan dir. This allows us to easily handle partially-reflinked files during recovery cleanup. We have a problem though - the orphan dir stringifies inode # to determine a unique name under which the orphan entry dirent can be created. Since ocfs2_create_inode_in_orphan() needs the space allocated in the orphan dir before it can allocate the inode, we currently call into the orphan code: /* * We give the orphan dir the root blkno to fake an orphan name, * and allocate enough space for our insertion. */ status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, osb->root_blkno, orphan_name, &orphan_insert); Using osb->root_blkno might work fine on unindexed directories, but the orphan dir can have an index. When it has that index, the above code fails to allocate the proper index entry. Later, when we try to remove the file from the orphan dir (using the actual inode #), the reflink operation will fail. To fix this, I created a function ocfs2_alloc_orphaned_file() which uses the newly split out orphan and inode alloc code to figure out what the inode block number will be (once allocated) and then prepare the orphan dir from that data. Signed-off-by: Mark Fasheh Signed-off-by: Tao Ma --- fs/ocfs2/namei.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 107 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 54c62985535..a00dda2e4f1 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2128,6 +2128,99 @@ leave: return status; } +/** + * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to recieve a newly + * allocated file. This is different from the typical 'add to orphan dir' + * operation in that the inode does not yet exist. This is a problem because + * the orphan dir stringifies the inode block number to come up with it's + * dirent. Obviously if the inode does not yet exist we have a chicken and egg + * problem. This function works around it by calling deeper into the orphan + * and suballoc code than other callers. Use this only by necessity. + * @dir: The directory which this inode will ultimately wind up under - not the + * orphan dir! + * @dir_bh: buffer_head the @dir inode block + * @orphan_name: string of length (CFS2_ORPHAN_NAMELEN + 1). Will be filled + * with the string to be used for orphan dirent. Pass back to the orphan dir + * code. + * @ret_orphan_dir: orphan dir inode returned to be passed back into orphan + * dir code. + * @ret_di_blkno: block number where the new inode will be allocated. + * @orphan_insert: Dir insert context to be passed back into orphan dir code. + * @ret_inode_ac: Inode alloc context to be passed back to the allocator. + * + * Returns zero on success and the ret_orphan_dir, name and lookup + * fields will be populated. + * + * Returns non-zero on failure. + */ +static int ocfs2_prep_new_orphaned_file(struct inode *dir, + struct buffer_head *dir_bh, + char *orphan_name, + struct inode **ret_orphan_dir, + u64 *ret_di_blkno, + struct ocfs2_dir_lookup_result *orphan_insert, + struct ocfs2_alloc_context **ret_inode_ac) +{ + int ret; + u64 di_blkno; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + struct i