diff options
Diffstat (limited to 'fs/exofs')
| -rw-r--r-- | fs/exofs/Kbuild | 6 | ||||
| -rw-r--r-- | fs/exofs/Kconfig.ore | 14 | ||||
| -rw-r--r-- | fs/exofs/common.h | 22 | ||||
| -rw-r--r-- | fs/exofs/dir.c | 77 | ||||
| -rw-r--r-- | fs/exofs/exofs.h | 189 | ||||
| -rw-r--r-- | fs/exofs/file.c | 34 | ||||
| -rw-r--r-- | fs/exofs/inode.c | 475 | ||||
| -rw-r--r-- | fs/exofs/ios.c | 803 | ||||
| -rw-r--r-- | fs/exofs/namei.c | 30 | ||||
| -rw-r--r-- | fs/exofs/ore.c | 1164 | ||||
| -rw-r--r-- | fs/exofs/ore_raid.c | 721 | ||||
| -rw-r--r-- | fs/exofs/ore_raid.h | 62 | ||||
| -rw-r--r-- | fs/exofs/pnfs.h | 45 | ||||
| -rw-r--r-- | fs/exofs/super.c | 538 | ||||
| -rw-r--r-- | fs/exofs/sys.c | 205 |
15 files changed, 3008 insertions, 1377 deletions
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index 2d0f757fda3..389ba8312d5 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild @@ -12,5 +12,9 @@ # Kbuild - Gets included from the Kernels Makefile and build system # -exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o +# ore module library +libore-y := ore.o ore_raid.o +obj-$(CONFIG_ORE) += libore.o + +exofs-y := inode.o file.o symlink.o namei.o dir.o super.o sys.o obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/Kconfig.ore b/fs/exofs/Kconfig.ore new file mode 100644 index 00000000000..2daf2329c28 --- /dev/null +++ b/fs/exofs/Kconfig.ore @@ -0,0 +1,14 @@ +# ORE - Objects Raid Engine (libore.ko) +# +# Note ORE needs to "select ASYNC_XOR". So Not to force multiple selects +# for every ORE user we do it like this. Any user should add itself here +# at the "depends on EXOFS_FS || ..." with an ||. The dependencies are +# selected here, and we default to "ON". So in effect it is like been +# selected by any of the users. +config ORE + tristate + depends on EXOFS_FS || PNFS_OBJLAYOUT + select ASYNC_XOR + select RAID6_PQ + select ASYNC_PQ + default SCSI_OSD_ULD diff --git a/fs/exofs/common.h b/fs/exofs/common.h index f0d520312d8..3bbd46956d7 100644 --- a/fs/exofs/common.h +++ b/fs/exofs/common.h @@ -53,10 +53,14 @@ #define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ /* exofs Application specific page/attribute */ +/* Inode attrs */ # define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) # define EXOFS_ATTR_INODE_DATA 1 # define EXOFS_ATTR_INODE_FILE_LAYOUT 2 # define EXOFS_ATTR_INODE_DIR_LAYOUT 3 +/* Partition attrs */ +# define EXOFS_APAGE_SB_DATA (0xF0000000U + 3) +# define EXOFS_ATTR_SB_STATS 1 /* * The maximum number of files we can have is limited by the size of the @@ -86,8 +90,8 @@ enum { */ enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1}; struct exofs_fscb { - __le64 s_nextid; /* Highest object ID used */ - __le64 s_numfiles; /* Number of files on fs */ + __le64 s_nextid; /* Only used after mkfs */ + __le64 s_numfiles; /* Only used after mkfs */ __le32 s_version; /* == EXOFS_FSCB_VER */ __le16 s_magic; /* Magic signature */ __le16 s_newfs; /* Non-zero if this is a new fs */ @@ -98,10 +102,20 @@ struct exofs_fscb { } __packed; /* + * This struct is set on the FS partition's attributes. + * [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together + * with the create command, to atomically persist the sb writeable information. + */ +struct exofs_sb_stats { + __le64 s_nextid; /* Highest object ID used */ + __le64 s_numfiles; /* Number of files on fs */ +} __packed; + +/* * Describes the raid used in the FS. It is part of the device table. * This here is taken from the pNFS-objects definition. In exofs we * use one raid policy through-out the filesystem. (NOTE: the funny - * alignment at begining. We take care of it at exofs_device_table. + * alignment at beginning. We take care of it at exofs_device_table. */ struct exofs_dt_data_map { __le32 cb_num_comps; @@ -122,7 +136,7 @@ struct exofs_dt_device_info { u8 systemid[OSD_SYSTEMID_LEN]; __le64 long_name_offset; /* If !0 then offset-in-file */ __le32 osdname_len; /* */ - u8 osdname[44]; /* Embbeded, Ususally an asci uuid */ + u8 osdname[44]; /* Embbeded, Usually an asci uuid */ } __packed; /* diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index dcc941d82d6..49f51ab4caa 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -124,7 +124,7 @@ out: Ebadsize: EXOFS_ERR("ERROR [exofs_check_page]: " - "size of directory #%lu is not a multiple of chunk size", + "size of directory(0x%lx) is not a multiple of chunk size\n", dir->i_ino ); goto fail; @@ -142,8 +142,8 @@ Espan: goto bad_entry; bad_entry: EXOFS_ERR( - "ERROR [exofs_check_page]: bad entry in directory #%lu: %s - " - "offset=%lu, inode=%llu, rec_len=%d, name_len=%d", + "ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - " + "offset=%lu, inode=0x%llu, rec_len=%d, name_len=%d\n", dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, _LLU(le64_to_cpu(p->inode_no)), rec_len, p->name_len); @@ -151,8 +151,8 @@ bad_entry: Eend: p = (struct exofs_dir_entry *)(kaddr + offs); EXOFS_ERR("ERROR [exofs_check_page]: " - "entry in directory #%lu spans the page boundary" - "offset=%lu, inode=%llu", + "entry in directory(0x%lx) spans the page boundary" + "offset=%lu, inode=0x%llx\n", dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs, _LLU(le64_to_cpu(p->inode_no))); fail: @@ -234,37 +234,33 @@ static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = { static inline void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode) { - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } static int -exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) +exofs_readdir(struct file *file, struct dir_context *ctx) { - loff_t pos = filp->f_pos; - struct inode *inode = filp->f_path.dentry->d_inode; + loff_t pos = ctx->pos; + struct inode *inode = file_inode(file); unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(exofs_chunk_size(inode)-1); - unsigned char *types = NULL; - int need_revalidate = (filp->f_version != inode->i_version); + int need_revalidate = (file->f_version != inode->i_version); if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1)) return 0; - types = exofs_filetype_table; - for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; struct exofs_dir_entry *de; struct page *page = exofs_get_page(inode, n); if (IS_ERR(page)) { - EXOFS_ERR("ERROR: " - "bad page in #%lu", - inode->i_ino); - filp->f_pos += PAGE_CACHE_SIZE - offset; + EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n", + inode->i_ino); + ctx->pos += PAGE_CACHE_SIZE - offset; return PTR_ERR(page); } kaddr = page_address(page); @@ -272,9 +268,9 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (offset) { offset = exofs_validate_entry(kaddr, offset, chunk_mask); - filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; + ctx->pos = (n<<PAGE_CACHE_SHIFT) + offset; } - filp->f_version = inode->i_version; + file->f_version = inode->i_version; need_revalidate = 0; } de = (struct exofs_dir_entry *)(kaddr + offset); @@ -283,32 +279,30 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir) for (; (char *)de <= limit; de = exofs_next_entry(de)) { if (de->rec_len == 0) { EXOFS_ERR("ERROR: " - "zero-length directory entry"); + "zero-length entry in directory(0x%lx)\n", + inode->i_ino); exofs_put_page(page); return -EIO; } if (de->inode_no) { - int over; - unsigned char d_type = DT_UNKNOWN; + unsigned char t; - if (types && de->file_type < EXOFS_FT_MAX) - d_type = types[de->file_type]; + if (de->file_type < EXOFS_FT_MAX) + t = exofs_filetype_table[de->file_type]; + else + t = DT_UNKNOWN; - offset = (char *)de - kaddr; - over = filldir(dirent, de->name, de->name_len, - (n<<PAGE_CACHE_SHIFT) | offset, + if (!dir_emit(ctx, de->name, de->name_len, le64_to_cpu(de->inode_no), - d_type); - if (over) { + t)) { exofs_put_page(page); return 0; } } - filp->f_pos += le16_to_cpu(de->rec_len); + ctx->pos += le16_to_cpu(de->rec_len); } exofs_put_page(page); } - return 0; } @@ -342,9 +336,9 @@ struct exofs_dir_entry *exofs_find_entry(struct inode *dir, kaddr += exofs_last_byte(dir, n) - reclen; while ((char *) de <= kaddr) { if (de->rec_len == 0) { - EXOFS_ERR( - "ERROR: exofs_find_entry: " - "zero-length directory entry"); + EXOFS_ERR("ERROR: zero-length entry in " + "directory(0x%lx)\n", + dir->i_ino); exofs_put_page(page); goto out; } @@ -472,7 +466,8 @@ int exofs_add_link(struct dentry *dentry, struct inode *inode) } if (de->rec_len == 0) { EXOFS_ERR("ERROR: exofs_add_link: " - "zero-length directory entry"); + "zero-length entry in directory(0x%lx)\n", + inode->i_ino); err = -EIO; goto out_unlock; } @@ -491,7 +486,8 @@ int exofs_add_link(struct dentry *dentry, struct inode *inode) exofs_put_page(page); } - EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=%p", dentry, inode); + EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=0x%lx\n", + dentry, inode->i_ino); return -EINVAL; got_it: @@ -542,7 +538,8 @@ int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page) while (de < dir) { if (de->rec_len == 0) { EXOFS_ERR("ERROR: exofs_delete_entry:" - "zero-length directory entry"); + "zero-length entry in directory(0x%lx)\n", + inode->i_ino); err = -EIO; goto out; } @@ -594,7 +591,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent) goto fail; } - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); de = (struct exofs_dir_entry *)kaddr; de->name_len = 1; de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1)); @@ -608,7 +605,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent) de->inode_no = cpu_to_le64(parent->i_ino); memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR)); exofs_set_de_type(de, inode); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); err = exofs_commit_chunk(page, 0, chunk_size); fail: page_cache_release(page); @@ -666,5 +663,5 @@ not_empty: const struct file_operations exofs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, - .readdir = exofs_readdir, + .iterate = exofs_readdir, }; diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 2dc925fa101..fffe86fd7a4 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -36,12 +36,9 @@ #include <linux/fs.h> #include <linux/time.h> #include <linux/backing-dev.h> -#include "common.h" +#include <scsi/osd_ore.h> -/* FIXME: Remove once pnfs hits mainline - * #include <linux/exportfs/pnfs_osd_xdr.h> - */ -#include "pnfs.h" +#include "common.h" #define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) @@ -56,45 +53,30 @@ /* u64 has problems with printk this will cast it to unsigned long long */ #define _LLU(x) (unsigned long long)(x) -struct exofs_layout { - osd_id s_pid; /* partition ID of file system*/ - - /* Our way of looking at the data_map */ - unsigned stripe_unit; - unsigned mirrors_p1; - - unsigned group_width; - u64 group_depth; - unsigned group_count; - - enum exofs_inode_layout_gen_functions lay_func; - - unsigned s_numdevs; /* Num of devices in array */ - struct osd_dev *s_ods[0]; /* Variable length */ +struct exofs_dev { + struct ore_dev ored; + unsigned did; + unsigned urilen; + uint8_t *uri; + struct kobject ed_kobj; }; - /* * our extension to the in-memory superblock */ struct exofs_sb_info { - struct exofs_fscb s_fscb; /* Written often, pre-allocate*/ + struct backing_dev_info bdi; /* register our bdi with VFS */ + struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/ int s_timeout; /* timeout for OSD operations */ uint64_t s_nextid; /* highest object ID used */ uint32_t s_numfiles; /* number of files on fs */ spinlock_t s_next_gen_lock; /* spinlock for gen # update */ u32 s_next_generation; /* next gen # to use */ atomic_t s_curr_pending; /* number of pending commands */ - uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ - struct backing_dev_info bdi; /* register our bdi with VFS */ - - struct pnfs_osd_data_map data_map; /* Default raid to use - * FIXME: Needed ? - */ -/* struct exofs_layout dir_layout;*/ /* Default dir layout */ - struct exofs_layout layout; /* Default files layout, - * contains the variable osd_dev - * array. Keep last */ - struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ + + struct ore_layout layout; /* Default files layout */ + struct ore_comp one_comp; /* id & cred of partition id=0*/ + struct ore_components oc; /* comps for the partition */ + struct kobject s_kobj; /* holds per-sbi kobject */ }; /* @@ -107,7 +89,8 @@ struct exofs_i_info { uint32_t i_data[EXOFS_IDATA];/*short symlink names and device #s*/ uint32_t i_dir_start_lookup; /* which page to start lookup */ uint64_t i_commit_size; /* the object's written length */ - uint8_t i_cred[OSD_CAP_LEN];/* all-powerful credential */ + struct ore_comp one_comp; /* same component for all devices */ + struct ore_components oc; /* inode view of the device table */ }; static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) @@ -115,52 +98,6 @@ static inline osd_id exofs_oi_objno(struct exofs_i_info *oi) return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF; } -struct exofs_io_state; -typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private); - -struct exofs_io_state { - struct kref kref; - - void *private; - exofs_io_done_fn done; - - struct exofs_layout *layout; - struct osd_obj_id obj; - u8 *cred; - - /* Global read/write IO*/ - loff_t offset; - unsigned long length; - void *kern_buff; - - struct page **pages; - unsigned nr_pages; - unsigned pgbase; - unsigned pages_consumed; - - /* Attributes */ - unsigned in_attr_len; - struct osd_attr *in_attr; - unsigned out_attr_len; - struct osd_attr *out_attr; - - /* Variable array of size numdevs */ - unsigned numdevs; - struct exofs_per_dev_state { - struct osd_request *or; - struct bio *bio; - loff_t offset; - unsigned length; - unsigned dev; - } per_dev[]; -}; - -static inline unsigned exofs_io_state_size(unsigned numdevs) -{ - return sizeof(struct exofs_io_state) + - sizeof(struct exofs_per_dev_state) * numdevs; -} - /* * our inode flags */ @@ -205,12 +142,6 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode) } /* - * Given a layout, object_number and stripe_index return the associated global - * dev_index - */ -unsigned exofs_layout_od_id(struct exofs_layout *layout, - osd_id obj_no, unsigned layout_index); -/* * Maximum count of links to a file */ #define EXOFS_LINK_MAX 32000 @@ -219,49 +150,15 @@ unsigned exofs_layout_od_id(struct exofs_layout *layout, * function declarations * *************************/ -/* ios.c */ -void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], - const struct osd_obj_id *obj); -int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, - u64 offset, void *p, unsigned length); - -int exofs_get_io_state(struct exofs_layout *layout, - struct exofs_io_state **ios); -void exofs_put_io_state(struct exofs_io_state *ios); - -int exofs_check_io(struct exofs_io_state *ios, u64 *resid); - -int exofs_sbi_create(struct exofs_io_state *ios); -int exofs_sbi_remove(struct exofs_io_state *ios); -int exofs_sbi_write(struct exofs_io_state *ios); -int exofs_sbi_read(struct exofs_io_state *ios); - -int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr); - -int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len); -static inline int exofs_oi_write(struct exofs_i_info *oi, - struct exofs_io_state *ios) -{ - ios->obj.id = exofs_oi_objno(oi); - ios->cred = oi->i_cred; - return exofs_sbi_write(ios); -} - -static inline int exofs_oi_read(struct exofs_i_info *oi, - struct exofs_io_state *ios) -{ - ios->obj.id = exofs_oi_objno(oi); - ios->cred = oi->i_cred; - return exofs_sbi_read(ios); -} - /* inode.c */ +unsigned exofs_max_io_pages(struct ore_layout *layout, + unsigned expected_pages); int exofs_setattr(struct dentry *, struct iattr *); int exofs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); extern struct inode *exofs_iget(struct super_block *, unsigned long); -struct inode *exofs_new_inode(struct inode *, int); +struct inode *exofs_new_inode(struct inode *, umode_t); extern int exofs_write_inode(struct inode *, struct writeback_control *wbc); extern void exofs_evict_inode(struct inode *); @@ -279,7 +176,19 @@ int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *, struct inode *); /* super.c */ -int exofs_sync_fs(struct super_block *sb, int wait); +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], + const struct osd_obj_id *obj); +int exofs_sbi_write_stats(struct exofs_sb_info *sbi); + +/* sys.c */ +int exofs_sysfs_init(void); +void exofs_sysfs_uninit(void); +int exofs_sysfs_sb_add(struct exofs_sb_info *sbi, + struct exofs_dt_device_info *dt_dev); +void exofs_sysfs_sb_del(struct exofs_sb_info *sbi); +int exofs_sysfs_odev_add(struct exofs_dev *edev, + struct exofs_sb_info *sbi); +void exofs_sysfs_dbg_print(void); /********************* * operation vectors * @@ -293,7 +202,6 @@ extern const struct file_operations exofs_file_operations; /* inode.c */ extern const struct address_space_operations exofs_aops; -extern const struct osd_attr g_attr_logical_length; /* namei.c */ extern const struct inode_operations exofs_dir_inode_operations; @@ -303,4 +211,35 @@ extern const struct inode_operations exofs_special_inode_operations; extern const struct inode_operations exofs_symlink_inode_operations; extern const struct inode_operations exofs_fast_symlink_inode_operations; +/* exofs_init_comps will initialize an ore_components device array + * pointing to a single ore_comp struct, and a round-robin view + * of the device table. + * The first device of each inode is the [inode->ino % num_devices] + * and the rest of the devices sequentially following where the + * first device is after the last device. + * It is assumed that the global device array at @sbi is twice + * bigger and that the device table repeats twice. + * See: exofs_read_lookup_dev_table() + */ +static inline void exofs_init_comps(struct ore_components *oc, + struct ore_comp *one_comp, + struct exofs_sb_info *sbi, osd_id oid) +{ + unsigned dev_mod = (unsigned)oid, first_dev; + + one_comp->obj.partition = sbi->one_comp.obj.partition; + one_comp->obj.id = oid; + exofs_make_credential(one_comp->cred, &one_comp->obj); + + oc->first_dev = 0; + oc->numdevs = sbi->layout.group_width * sbi->layout.mirrors_p1 * + sbi->layout.group_count; + oc->single_comp = EC_SINGLE_COMP; + oc->comps = one_comp; + + /* Round robin device view of the table */ + first_dev = (dev_mod * sbi->layout.mirrors_p1) % sbi->oc.numdevs; + oc->ods = &sbi->oc.ods[first_dev]; +} + #endif diff --git a/fs/exofs/file.c b/fs/exofs/file.c index b905c79b4f0..71bf8e4fb5d 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -42,25 +42,19 @@ static int exofs_release_file(struct inode *inode, struct file *filp) * Note, in exofs all metadata is written as part of inode, regardless. * The writeout is synchronous */ -static int exofs_file_fsync(struct file *filp, int datasync) +static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) { - int ret; struct inode *inode = filp->f_mapping->host; - struct super_block *sb; - - if (!(inode->i_state & I_DIRTY)) - return 0; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) - return 0; - - ret = sync_inode_metadata(inode, 1); + int ret; - /* This is a good place to write the sb */ - /* TODO: Sechedule an sb-sync on create */ - sb = inode->i_sb; - if (sb->s_dirt) - exofs_sync_fs(sb, 1); + ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret) + return ret; + mutex_lock(&inode->i_mutex); + ret = sync_inode_metadata(filp->f_mapping->host, 1); + mutex_unlock(&inode->i_mutex); return ret; } @@ -73,17 +67,17 @@ static int exofs_flush(struct file *file, fl_owner_t id) const struct file_operations exofs_file_operations = { .llseek = generic_file_llseek, - .read = do_sync_read, - .write = do_sync_write, - .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .open = generic_file_open, .release = exofs_release_file, .fsync = exofs_file_fsync, .flush = exofs_flush, .splice_read = generic_file_splice_read, - .splice_write = generic_file_splice_write, + .splice_write = iter_file_splice_write, }; const struct inode_operations exofs_file_inode_operations = { diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index a7555238c41..3f9cafd7393 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -37,17 +37,20 @@ #define EXOFS_DBGMSG2(M...) do {} while (0) -enum { BIO_MAX_PAGES_KMALLOC = - (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec), - MAX_PAGES_KMALLOC = - PAGE_SIZE / sizeof(struct page *), -}; +unsigned exofs_max_io_pages(struct ore_layout *layout, + unsigned expected_pages) +{ + unsigned pages = min_t(unsigned, expected_pages, + layout->max_io_length / PAGE_SIZE); + + return pages; +} struct page_collect { struct exofs_sb_info *sbi; struct inode *inode; unsigned expected_pages; - struct exofs_io_state *ios; + struct ore_io_state *ios; struct page **pages; unsigned alloc_pages; @@ -57,6 +60,7 @@ struct page_collect { bool read_4_write; /* This means two things: that the read is sync * And the pages should not be unlocked. */ + struct page *that_locked_page; }; static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, @@ -75,6 +79,7 @@ static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, pcol->length = 0; pcol->pg_first = -1; pcol->read_4_write = false; + pcol->that_locked_page = NULL; } static void _pcol_reset(struct page_collect *pcol) @@ -87,29 +92,22 @@ static void _pcol_reset(struct page_collect *pcol) pcol->length = 0; pcol->pg_first = -1; pcol->ios = NULL; + pcol->that_locked_page = NULL; /* this is probably the end of the loop but in writes * it might not end here. don't be left with nothing */ if (!pcol->expected_pages) - pcol->expected_pages = MAX_PAGES_KMALLOC; + pcol->expected_pages = + exofs_max_io_pages(&pcol->sbi->layout, ~0); } static int pcol_try_alloc(struct page_collect *pcol) { - unsigned pages = min_t(unsigned, pcol->expected_pages, - MAX_PAGES_KMALLOC); - - if (!pcol->ios) { /* First time allocate io_state */ - int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios); - - if (ret) - return ret; - } + unsigned pages; /* TODO: easily support bio chaining */ - pages = min_t(unsigned, pages, - pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC); + pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages); for (; pages; pages >>= 1) { pcol->pages = kmalloc(pages * sizeof(struct page *), @@ -131,7 +129,7 @@ static void pcol_free(struct page_collect *pcol) pcol->pages = NULL; if (pcol->ios) { - exofs_put_io_state(pcol->ios); + ore_put_io_state(pcol->ios); pcol->ios = NULL; } } @@ -147,14 +145,17 @@ static int pcol_add_page(struct page_collect *pcol, struct page *page, return 0; } +enum {PAGE_WAS_NOT_IN_IO = 17}; static int update_read_page(struct page *page, int ret) { - if (ret == 0) { + switch (ret) { + case 0: /* Everything is OK */ SetPageUptodate(page); if (PageError(page)) ClearPageError(page); - } else if (ret == -EFAULT) { + break; + case -EFAULT: /* In this case we were trying to read something that wasn't on * disk yet - return a page full of zeroes. This should be OK, * because the object should be empty (if there was a write @@ -165,16 +166,22 @@ static int update_read_page(struct page *page, int ret) SetPageUptodate(page); if (PageError(page)) ClearPageError(page); - ret = 0; /* recovered error */ EXOFS_DBGMSG("recovered read error\n"); - } else /* Error */ + /* fall through */ + case PAGE_WAS_NOT_IN_IO: + ret = 0; /* recovered error */ + break; + default: SetPageError(page); - + } return ret; } static void update_write_page(struct page *page, int ret) { + if (unlikely(ret == PAGE_WAS_NOT_IN_IO)) + return; /* don't pass start don't collect $200 */ + if (ret) { mapping_set_error(page->mapping, ret); SetPageError(page); @@ -188,15 +195,16 @@ static void update_write_page(struct page *page, int ret) static int __readpages_done(struct page_collect *pcol) { int i; - u64 resid; u64 good_bytes; u64 length = 0; - int ret = exofs_check_io(pcol->ios, &resid); + int ret = ore_check_io(pcol->ios, NULL); - if (likely(!ret)) + if (likely(!ret)) { good_bytes = pcol->length; - else - good_bytes = pcol->length - resid; + ret = PAGE_WAS_NOT_IN_IO; + } else { + good_bytes = 0; + } EXOFS_DBGMSG2("readpages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", @@ -232,7 +240,7 @@ static int __readpages_done(struct page_collect *pcol) } /* callback of async reads */ -static void readpages_done(struct exofs_io_state *ios, void *p) +static void readpages_done(struct ore_io_state *ios, void *p) { struct page_collect *pcol = p; @@ -257,23 +265,70 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) } } +static int _maybe_not_all_in_one_io(struct ore_io_state *ios, + struct page_collect *pcol_src, struct page_collect *pcol) +{ + /* length was wrong or offset was not page aligned */ + BUG_ON(pcol_src->nr_pages < ios->nr_pages); + + if (pcol_src->nr_pages > ios->nr_pages) { + struct page **src_page; + unsigned pages_less = pcol_src->nr_pages - ios->nr_pages; + unsigned long len_less = pcol_src->length - ios->length; + unsigned i; + int ret; + + /* This IO was trimmed */ + pcol_src->nr_pages = ios->nr_pages; + pcol_src->length = ios->length; + + /* Left over pages are passed to the next io */ + pcol->expected_pages += pages_less; + pcol->nr_pages = pages_less; + pcol->length = len_less; + src_page = pcol_src->pages + pcol_src->nr_pages; + pcol->pg_first = (*src_page)->index; + + ret = pcol_try_alloc(pcol); + if (unlikely(ret)) + return ret; + + for (i = 0; i < pages_less; ++i) + pcol->pages[i] = *src_page++; + + EXOFS_DBGMSG("Length was adjusted nr_pages=0x%x " + "pages_less=0x%x expected_pages=0x%x " + "next_offset=0x%llx next_len=0x%lx\n", + pcol_src->nr_pages, pages_less, pcol->expected_pages, + pcol->pg_first * PAGE_SIZE, pcol->length); + } + return 0; +} + static int read_exec(struct page_collect *pcol) { struct exofs_i_info *oi = exofs_i(pcol->inode); - struct exofs_io_state *ios = pcol->ios; + struct ore_io_state *ios; struct page_collect *pcol_copy = NULL; int ret; if (!pcol->pages) return 0; + if (!pcol->ios) { + int ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, true, + pcol->pg_first << PAGE_CACHE_SHIFT, + pcol->length, &pcol->ios); + + if (ret) + return ret; + } + + ios = pcol->ios; ios->pages = pcol->pages; - ios->nr_pages = pcol->nr_pages; - ios->length = pcol->length; - ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT; if (pcol->read_4_write) { - exofs_oi_read(oi, pcol->ios); + ore_read(pcol->ios); return __readpages_done(pcol); } @@ -286,26 +341,32 @@ static int read_exec(struct page_collect *pcol) *pcol_copy = *pcol; ios->done = readpages_done; ios->private = pcol_copy; - ret = exofs_oi_read(oi, ios); + + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + + ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); if (unlikely(ret)) goto err; - atomic_inc(&pcol->sbi->s_curr_pending); + EXOFS_DBGMSG2("read_exec(0x%lx) offset=0x%llx length=0x%llx\n", + pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); + + ret = ore_read(ios); + if (unlikely(ret)) + goto err; - EXOFS_DBGMSG2("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", - ios->obj.id, _LLU(ios->offset), pcol->length); + atomic_inc(&pcol->sbi->s_curr_pending); - /* pages ownership was passed to pcol_copy */ - _pcol_reset(pcol); return 0; err: - if (!pcol->read_4_write) - _unlock_pcol_pages(pcol, ret, READ); - - pcol_free(pcol); - + if (!pcol_copy) /* Failed before ownership transfer */ + pcol_copy = pcol; + _unlock_pcol_pages(pcol_copy, ret, READ); + pcol_free(pcol_copy); kfree(pcol_copy); + return ret; } @@ -326,11 +387,15 @@ static int readpage_strip(void *data, struct page *page) size_t len; int ret; + BUG_ON(!PageLocked(page)); + /* FIXME: Just for debugging, will be removed */ if (PageUptodate(page)) EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino, page->index); + pcol->that_locked_page = page; + if (page->index < end_index) len = PAGE_CACHE_SIZE; else if (page->index == end_index) @@ -350,8 +415,10 @@ static int readpage_strip(void *data, struct page *page) if (!pcol->read_4_write) unlock_page(page); - EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," - " splitting\n", inode->i_ino, page->index); + EXOFS_DBGMSG("readpage_strip(0x%lx) empty page len=%zx " + "read_4_write=%d index=0x%lx end_index=0x%lx " + "splitting\n", inode->i_ino, len, + pcol->read_4_write, page->index, end_index); return read_exec(pcol); } @@ -417,6 +484,10 @@ static int exofs_readpages(struct file *file, struct address_space *mapping, return ret; } + ret = read_exec(&pcol); + if (unlikely(ret)) + return ret; + return read_exec(&pcol); } @@ -446,21 +517,22 @@ static int exofs_readpage(struct file *file, struct page *page) } /* Callback for osd_write. All writes are asynchronous */ -static void writepages_done(struct exofs_io_state *ios, void *p) +static void writepages_done(struct ore_io_state *ios, void *p) { struct page_collect *pcol = p; int i; - u64 resid; u64 good_bytes; u64 length = 0; - int ret = exofs_check_io(ios, &resid); + int ret = ore_check_io(ios, NULL); atomic_dec(&pcol->sbi->s_curr_pending); - if (likely(!ret)) + if (likely(!ret)) { good_bytes = pcol->length; - else - good_bytes = pcol->length - resid; + ret = PAGE_WAS_NOT_IN_IO; + } else { + good_bytes = 0; + } EXOFS_DBGMSG2("writepages_done(0x%lx) good_bytes=0x%llx" " length=0x%lx nr_pages=%u\n", @@ -493,16 +565,82 @@ static void writepages_done(struct exofs_io_state *ios, void *p) EXOFS_DBGMSG2("writepages_done END\n"); } +static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) +{ + struct page_collect *pcol = priv; + pgoff_t index = offset / PAGE_SIZE; + + if (!pcol->that_locked_page || + (pcol->that_locked_page->index != index)) { + struct page *page; + loff_t i_size = i_size_read(pcol->inode); + + if (offset >= i_size) { + *uptodate = true; + EXOFS_DBGMSG2("offset >= i_size index=0x%lx\n", index); + return ZERO_PAGE(0); + } + + page = find_get_page(pcol->inode->i_mapping, index); + if (!page) { + page = find_or_create_page(pcol->inode->i_mapping, + index, GFP_NOFS); + if (unlikely(!page)) { + EXOFS_DBGMSG("grab_cache_page Failed " + "index=0x%llx\n", _LLU(index)); + return NULL; + } + unlock_page(page); + } + if (PageDirty(page) || PageWriteback(page)) + *uptodate = true; + else + *uptodate = PageUptodate(page); + EXOFS_DBGMSG2("index=0x%lx uptodate=%d\n", index, *uptodate); + return page; + } else { + EXOFS_DBGMSG2("YES that_locked_page index=0x%lx\n", + pcol->that_locked_page->index); + *uptodate = true; + return pcol->that_locked_page; + } +} + +static void __r4w_put_page(void *priv, struct page *page) +{ + struct page_collect *pcol = priv; + + if ((pcol->that_locked_page != page) && (ZERO_PAGE(0) != page)) { + EXOFS_DBGMSG2("index=0x%lx\n", page->index); + page_cache_release(page); + return; + } + EXOFS_DBGMSG2("that_locked_page index=0x%lx\n", + ZERO_PAGE(0) == page ? -1 : page->index); +} + +static const struct _ore_r4w_op _r4w_op = { + .get_page = &__r4w_get_page, + .put_page = &__r4w_put_page, +}; + static int write_exec(struct page_collect *pcol) { struct exofs_i_info *oi = exofs_i(pcol->inode); - struct exofs_io_state *ios = pcol->ios; + struct ore_io_state *ios; struct page_collect *pcol_copy = NULL; int ret; if (!pcol->pages) return 0; + BUG_ON(pcol->ios); + ret = ore_get_rw_state(&pcol->sbi->layout, &oi->oc, false, + pcol->pg_first << PAGE_CACHE_SHIFT, + pcol->length, &pcol->ios); + if (unlikely(ret)) + goto err; + pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); if (!pcol_copy) { EXOFS_ERR("write_exec: Failed to kmalloc(pcol)\n"); @@ -512,30 +650,36 @@ static int write_exec(struct page_collect *pcol) *pcol_copy = *pcol; + ios = pcol->ios; ios->pages = pcol_copy->pages; - ios->nr_pages = pcol_copy->nr_pages; - ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT; - ios->length = pcol_copy->length; ios->done = writepages_done; + ios->r4w = &_r4w_op; ios->private = pcol_copy; - ret = exofs_oi_write(oi, ios); + /* pages ownership was passed to pcol_copy */ + _pcol_reset(pcol); + + ret = _maybe_not_all_in_one_io(ios, pcol_copy, pcol); + if (unlikely(ret)) + goto err; + + EXOFS_DBGMSG2("write_exec(0x%lx) offset=0x%llx length=0x%llx\n", + pcol->inode->i_ino, _LLU(ios->offset), _LLU(ios->length)); + + ret = ore_write(ios); if (unlikely(ret)) { - EXOFS_ERR("write_exec: exofs_oi_write() Failed\n"); + EXOFS_ERR("write_exec: ore_write() Failed\n"); goto err; } atomic_inc(&pcol->sbi->s_curr_pending); - EXOFS_DBGMSG2("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", - pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset), - pcol->length); - /* pages ownership was passed to pcol_copy */ - _pcol_reset(pcol); return 0; err: - _unlock_pcol_pages(pcol, ret, WRITE); - pcol_free(pcol); + if (!pcol_copy) /* Failed before ownership transfer */ + pcol_copy = pcol; + _unlock_pcol_pages(pcol_copy, ret, WRITE); + pcol_free(pcol_copy); kfree(pcol_copy); return ret; @@ -670,14 +814,33 @@ static int exofs_writepages(struct address_space *mapping, _pcol_init(&pcol, expected_pages, mapping->host); ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol); - if (ret) { + if (unlikely(ret)) { EXOFS_ERR("write_cache_pages => %d\n", ret); return ret; } - return write_exec(&pcol); + ret = write_exec(&pcol); + if (unlikely(ret)) + return ret; + + if (wbc->sync_mode == WB_SYNC_ALL) { + return write_exec(&pcol); /* pump the last reminder */ + } else if (pcol.nr_pages) { + /* not SYNC let the reminder join the next writeout */ + unsigned i; + + for (i = 0; i < pcol.nr_pages; i++) { + struct page *page = pcol.pages[i]; + + end_page_writeback(page); + set_page_dirty(page); + unlock_page(page); + } + } + return 0; } +/* static int exofs_writepage(struct page *page, struct writeback_control *wbc) { struct page_collect pcol; @@ -693,12 +856,12 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc) return write_exec(&pcol); } - +*/ /* i_mutex held using inode->i_size directly */ static void _write_failed(struct inode *inode, loff_t to) { if (to > inode->i_size) - truncate_pagecache(inode, to, inode->i_size); + truncate_pagecache(inode, inode->i_size); } int exofs_write_begin(struct file *file, struct address_space *mapping, @@ -722,11 +885,28 @@ int exofs_write_begin(struct file *file, struct address_space *mapping, /* read modify write */ if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) { + loff_t i_size = i_size_read(mapping->host); + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + size_t rlen; + + if (page->index < end_index) + rlen = PAGE_CACHE_SIZE; + else if (page->index == end_index) + rlen = i_size & ~PAGE_CACHE_MASK; + else + rlen = 0; + + if (!rlen) { + clear_highpage(page); + SetPageUptodate(page); + goto out; + } + ret = _readpage(page, true); if (ret) { /*SetPageError was done by _readpage. Is it ok?*/ unlock_page(page); - EXOFS_DBGMSG("__readpage_filler failed\n"); + EXOFS_DBGMSG("__readpage failed\n"); } } out: @@ -773,16 +953,26 @@ static int exofs_releasepage(struct page *page, gfp_t gfp) return 0; } -static void exofs_invalidatepage(struct page *page, unsigned long offset) +static void exofs_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) { - EXOFS_DBGMSG("page 0x%lx offset 0x%lx\n", page->index, offset); + EXOFS_DBGMSG("page 0x%lx offset 0x%x length 0x%x\n", + page->index, offset, length); WARN_ON(1); } + + /* TODO: Should be easy enough to do proprly */ +static ssize_t exofs_direct_IO(int rw, struct kiocb *iocb, + struct iov_iter *iter, loff_t offset) +{ + return 0; +} + const struct address_space_operations exofs_aops = { .readpage = exofs_readpage, .readpages = exofs_readpages, - .writepage = exofs_writepage, + .writepage = NULL, .writepages = exofs_writepages, .write_begin = exofs_write_begin_export, .write_end = exofs_write_end, @@ -792,10 +982,9 @@ const struct address_space_operations exofs_aops = { /* Not implemented Yet */ .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */ - .direct_IO = NULL, /* TODO: Should be trivial to do */ + .direct_IO = exofs_direct_IO, /* With these NULL has special meaning or default is not exported */ - .sync_page = NULL, .get_xip_mem = NULL, .migratepage = NULL, .launder_page = NULL, @@ -817,21 +1006,19 @@ static inline int exofs_inode_is_fast_symlink(struct inode *inode) return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0); } -const struct osd_attr g_attr_logical_length = ATTR_DEF( - OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); - static int _do_truncate(struct inode *inode, loff_t newsize) { struct exofs_i_info *oi = exofs_i(inode); + struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; int ret; inode->i_mtime = inode->i_ctime = CURRENT_TIME; - ret = exofs_oi_truncate(oi, (u64)newsize); + ret = ore_truncate(&sbi->layout, &oi->oc, (u64)newsize); if (likely(!ret)) truncate_setsize(inode, newsize); - EXOFS_DBGMSG("(0x%lx) size=0x%llx ret=>%d\n", + EXOFS_DBGMSG2("(0x%lx) size=0x%llx ret=>%d\n", inode->i_ino, newsize, ret); return ret; } @@ -890,43 +1077,38 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, [1] = g_attr_inode_file_layout, [2] = g_attr_inode_dir_layout, }; - struct exofs_io_state *ios; + struct ore_io_state *ios; struct exofs_on_disk_inode_layout *layout; int ret; - ret = exofs_get_io_state(&sbi->layout, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { - EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); return ret; } - ios->obj.id = exofs_oi_objno(oi); - exofs_make_credential(oi->i_cred, &ios->obj); - ios->cred = oi->i_cred; - - attrs[1].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); - attrs[2].len = exofs_on_disk_inode_layout_size(sbi->layout.s_numdevs); + attrs[1].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); + attrs[2].len = exofs_on_disk_inode_layout_size(sbi->oc.numdevs); ios->in_attr = attrs; ios->in_attr_len = ARRAY_SIZE(attrs); - ret = exofs_sbi_read(ios); + ret = ore_read(ios); if (unlikely(ret)) { EXOFS_ERR("object(0x%llx) corrupted, return empty file=>%d\n", - _LLU(ios->obj.id), ret); + _LLU(oi->one_comp.obj.id), ret); memset(inode, 0, sizeof(*inode)); inode->i_mode = 0040000 | (0777 & ~022); /* If object is lost on target we might as well enable it's * delete. */ - if ((ret == -ENOENT) || (ret == -EINVAL)) - ret = 0; + ret = 0; goto out; } ret = extract_attr_from_ios(ios, &attrs[0]); if (ret) { - EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + EXOFS_ERR("%s: extract_attr 0 of inode failed\n", __func__); goto out; } WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE); @@ -934,7 +1116,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, ret = extract_attr_from_ios(ios, &attrs[1]); if (ret) { - EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + EXOFS_ERR("%s: extract_attr 1 of inode failed\n", __func__); goto out; } if (attrs[1].len) { @@ -949,7 +1131,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, ret = extract_attr_from_ios(ios, &attrs[2]); if (ret) { - EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__); + EXOFS_ERR("%s: extract_attr 2 of inode failed\n", __func__); goto out; } if (attrs[2].len) { @@ -963,7 +1145,7 @@ static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, } out: - exofs_put_io_state(ios); + ore_put_io_state(ios); return ret; } @@ -989,6 +1171,8 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) return inode; oi = exofs_i(inode); __oi_init(oi); + exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, + exofs_oi_objno(oi)); /* read the inode from the osd */ ret = exofs_get_inode(sb, oi, &fcb); @@ -999,9 +1183,9 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) /* copy stuff from on-disk struct to in-memory struct */ inode->i_mode = le16_to_cpu(fcb.i_mode); - inode->i_uid = le32_to_cpu(fcb.i_uid); - inode->i_gid = le32_to_cpu(fcb.i_gid); - inode->i_nlink = le16_to_cpu(fcb.i_links_count); + i_uid_write(inode, le32_to_cpu(fcb.i_uid)); + i_gid_write(inode, le32_to_cpu(fcb.i_gid)); + set_nlink(inode, le16_to_cpu(fcb.i_links_count)); inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime); inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime); inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime); @@ -1030,6 +1214,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data)); } + inode->i_mapping->backing_dev_info = sb->s_bdi; if (S_ISREG(inode->i_mode)) { inode->i_op = &exofs_file_inode_operations; inode->i_fop = &exofs_file_operations; @@ -1073,26 +1258,28 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi) } return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; } + /* * Callback function from exofs_new_inode(). The important thing is that we * set the obj_created flag so that other methods know that the object exists on * the OSD. */ -static void create_done(struct exofs_io_state *ios, void *p) +static void create_done(struct ore_io_state *ios, void *p) { struct inode *inode = p; struct exofs_i_info *oi = exofs_i(inode); struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; int ret; - ret = exofs_check_io(ios, NULL); - exofs_put_io_state(ios); + ret = ore_check_io(ios, NULL); + ore_put_io_state(ios); atomic_dec(&sbi->s_curr_pending); if (unlikely(ret)) { EXOFS_ERR("object=0x%llx creation failed in pid=0x%llx", - _LLU(exofs_oi_objno(oi)), _LLU(sbi->layout.s_pid)); + _LLU(exofs_oi_objno(oi)), + _LLU(oi->one_comp.obj.partition)); /*TODO: When FS is corrupted creation can fail, object already * exist. Get rid of this asynchronous creation, if exist * increment the obj counter and try the next object. Until we @@ -1109,16 +1296,15 @@ static void create_done(struct exofs_io_state *ios, void *p) /* * Set up a new inode and create an object for it on the OSD */ -struct inode *exofs_new_inode(struct inode *dir, int mode) +struct inode *exofs_new_inode(struct inode *dir, umode_t mode) { - struct super_block *sb; + struct super_block *sb = dir->i_sb; + struct exofs_sb_info *sbi = sb->s_fs_info; struct inode *inode; struct exofs_i_info *oi; - struct exofs_sb_info *sbi; - struct exofs_io_state *ios; + struct ore_io_state *ios; int ret; - sb = dir->i_sb; inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -1128,9 +1314,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) set_obj_2bcreated(oi); - sbi = sb->s_fs_info; - - sb->s_dirt = 1; + inode->i_mapping->backing_dev_info = sb->s_bdi; inode_init_owner(inode, dir, mode); inode->i_ino = sbi->s_nextid++; inode->i_blkbits = EXOFS_BLKSHIFT; @@ -1141,23 +1325,24 @@ struct inode *exofs_new_inode(struct inode *dir, int mode) spin_unlock(&sbi->s_next_gen_lock); insert_inode_hash(inode); + exofs_init_comps(&oi->oc, &oi->one_comp, sb->s_fs_info, + exofs_oi_objno(oi)); + exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */ + mark_inode_dirty(inode); - ret = exofs_get_io_state(&sbi->layout, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { - EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n"); + EXOFS_ERR("exofs_new_inode: ore_get_io_state failed\n"); return ERR_PTR(ret); } - ios->obj.id = exofs_oi_objno(oi); - exofs_make_credential(oi->i_cred, &ios->obj); - ios->done = create_done; ios->private = inode; - ios->cred = oi->i_cred; - ret = exofs_sbi_create(ios); + + ret = ore_create(ios); if (ret) { - exofs_put_io_state(ios); + ore_put_io_state(ios); return ERR_PTR(ret); } atomic_inc(&sbi->s_curr_pending); @@ -1176,11 +1361,11 @@ struct updatei_args { /* * Callback function from exofs_update_inode(). */ -static void updatei_done(struct exofs_io_state *ios, void *p) +static void updatei_done(struct ore_io_state *ios, void *p) { struct updatei_args *args = p; - exofs_put_io_state(ios); + ore_put_io_state(ios); atomic_dec(&args->sbi->s_curr_pending); @@ -1196,7 +1381,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync) struct exofs_i_info *oi = exofs_i(inode); struct super_block *sb = inode->i_sb; struct exofs_sb_info *sbi = sb->s_fs_info; - struct exofs_io_state *ios; + struct ore_io_state *ios; struct osd_attr attr; struct exofs_fcb *fcb; struct updatei_args *args; @@ -1211,8 +1396,8 @@ static int exofs_update_inode(struct inode *inode, int do_sync) fcb = &args->fcb; fcb->i_mode = cpu_to_le16(inode->i_mode); - fcb->i_uid = cpu_to_le32(inode->i_uid); - fcb->i_gid = cpu_to_le32(inode->i_gid); + fcb->i_uid = cpu_to_le32(i_uid_read(inode)); + fcb->i_gid = cpu_to_le32(i_gid_read(inode)); fcb->i_links_count = cpu_to_le16(inode->i_nlink); fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec); @@ -1235,9 +1420,9 @@ static int exofs_update_inode(struct inode *inode, int do_sync) } else memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); - ret = exofs_get_io_state(&sbi->layout, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { - EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__); + EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); goto free_args; } @@ -1254,13 +1439,13 @@ static int exofs_update_inode(struct inode *inode, int do_sync) ios->private = args; } - ret = exofs_oi_write(oi, ios); + ret = ore_write(ios); if (!do_sync && !ret) { atomic_inc(&sbi->s_curr_pending); goto out; /* deallocation in updatei_done */ } - exofs_put_io_state(ios); + ore_put_io_state(ios); free_args: kfree(args); out: @@ -1271,18 +1456,19 @@ out: int exofs_write_inode(struct inode *inode, struct writeback_control *wbc) { - return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); + /* FIXME: fix fsync and use wbc->sync_mode == WB_SYNC_ALL */ + return exofs_update_inode(inode, 1); } /* * Callback function from exofs_delete_inode() - don't have much cleaning up to * do. */ -static void delete_done(struct exofs_io_state *ios, void *p) +static void delete_done(struct ore_io_state *ios, void *p) { struct exofs_sb_info *sbi = p; - exofs_put_io_state(ios); + ore_put_io_state(ios); atomic_dec(&sbi->s_curr_pending); } @@ -1297,17 +1483,17 @@ void exofs_evict_inode(struct inode *inode) struct exofs_i_info *oi = exofs_i(inode); struct super_block *sb = inode->i_sb; struct exofs_sb_info *sbi = sb->s_fs_info; - struct exofs_io_state *ios; + struct ore_io_state *ios; int ret; - truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages_final(&inode->i_data); /* TODO: should do better here */ if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; inode->i_size = 0; - end_writeback(inode); + clear_inode(inode); /* if we are deleting an obj that hasn't been created yet, wait. * This also makes sure that create_done cannot be called with an @@ -1317,20 +1503,19 @@ void exofs_evict_inode(struct inode *inode) /* ignore the error, attempt a remove anyway */ /* Now Remove the OSD objects */ - ret = exofs_get_io_state(&sbi->layout, &ios); + ret = ore_get_io_state(&sbi->layout, &oi->oc, &ios); if (unlikely(ret)) { - EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__); + EXOFS_ERR("%s: ore_get_io_state failed\n", __func__); return; } - ios->obj.id = exofs_oi_objno(oi); ios->done = delete_done; ios->private = sbi; - ios->cred = oi->i_cred; - ret = exofs_sbi_remove(ios); + + ret = ore_remove(ios); if (ret) { - EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__); - exofs_put_io_state(ios); + EXOFS_ERR("%s: ore_remove failed\n", __func__); + ore_put_io_state(ios); return; } atomic_inc(&sbi->s_curr_pending); @@ -1338,5 +1523,5 @@ void exofs_evict_inode(struct inode *inode) return; no_delete: - end_writeback(inode); + clear_inode(inode); } diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c deleted file mode 100644 index f74a2ec027a..00000000000 --- a/fs/exofs/ios.c +++ /dev/null @@ -1,803 +0,0 @@ -/* - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <linux/slab.h> -#include <scsi/scsi_device.h> -#include <asm/div64.h> - -#include "exofs.h" - -#define EXOFS_DBGMSG2(M...) do {} while (0) -/* #define EXOFS_DBGMSG2 EXOFS_DBGMSG */ - -void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) -{ - osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); -} - -int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, - u64 offset, void *p, unsigned length) -{ - struct osd_request *or = osd_start_request(od, GFP_KERNEL); -/* struct osd_sense_info osi = {.key = 0};*/ - int ret; - - if (unlikely(!or)) { - EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__); - return -ENOMEM; - } - ret = osd_req_read_kern(or, obj, offset, p, length); - if (unlikely(ret)) { - EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__); - goto out; - } - - ret = osd_finalize_request(or, 0, cred, NULL); - if (unlikely(ret)) { - EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret); - goto out; - } - - ret = osd_execute_request(or); - if (unlikely(ret)) - EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); - /* osd_req_decode_sense(or, ret); */ - -out: - osd_end_request(or); - return ret; -} - -int exofs_get_io_state(struct exofs_layout *layout, - struct exofs_io_state **pios) -{ - struct exofs_io_state *ios; - - /*TODO: Maybe use kmem_cach per sbi of size - * exofs_io_state_size(layout->s_numdevs) - */ - ios = kzalloc(exofs_io_state_size(layout->s_numdevs), GFP_KERNEL); - if (unlikely(!ios)) { - EXOFS_DBGMSG("Failed kzalloc bytes=%d\n", - exofs_io_state_size(layout->s_numdevs)); - *pios = NULL; - return -ENOMEM; - } - - ios->layout = layout; - ios->obj.partition = layout->s_pid; - *pios = ios; - return 0; -} - -void exofs_put_io_state(struct exofs_io_state *ios) -{ - if (ios) { - unsigned i; - - for (i = 0; i < ios->numdevs; i++) { - struct exofs_per_dev_state *per_dev = &ios->per_dev[i]; - - if (per_dev->or) - osd_end_request(per_dev->or); - if (per_dev->bio) - bio_put(per_dev->bio); - } - - kfree(ios); - } -} - -unsigned exofs_layout_od_id(struct exofs_layout *layout, - osd_id obj_no, unsigned layout_index) -{ -/* switch (layout->lay_func) { - case LAYOUT_MOVING_WINDOW: - {*/ - unsigned dev_mod = obj_no; - - return (layout_index + dev_mod * layout->mirrors_p1) % - layout->s_numdevs; -/* } - case LAYOUT_FUNC_IMPLICT: - return layout->devs[layout_index]; - }*/ -} - -static inline struct osd_dev *exofs_ios_od(struct exofs_io_state *ios, - unsigned layout_index) -{ - return ios->layout->s_ods[ - exofs_layout_od_id(ios->layout, ios->obj.id, layout_index)]; -} - -static void _sync_done(struct exofs_io_state *ios, void *p) -{ - struct completion *waiting = p; - - complete(waiting); -} - -static void _last_io(struct kref *kref) -{ - struct exofs_io_state *ios = container_of( - kref, struct exofs_io_state, kref); - - ios->done(ios, ios->private); -} - -static void _done_io(struct osd_request *or, void *p) -{ - struct exofs_io_state *ios = p; - - kref_put(&ios->kref, _last_io); -} - -static int exofs_io_execute(struct exofs_io_state *ios) -{ - DECLARE_COMPLETION_ONSTACK(wait); - bool sync = (ios->done == NULL); - int i, ret; - - if (sync) { - ios->done = _sync_done; - ios->private = &wait; - } - - for (i = 0; i < ios->numdevs; i++) { - struct osd_request *or = ios->per_dev[i].or; - if (unlikely(!or)) - continue; - - ret = osd_finalize_request(or, 0, ios->cred, NULL); - if (unlikely(ret)) { - EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", - ret); - return ret; - } - } - - kref_init(&ios->kref); - - for (i = 0; i < ios->numdevs; i++) { - struct osd_request *or = ios->per_dev[i].or; - if (unlikely(!or)) - continue; - - kref_get(&ios->kref); - osd_execute_request_async(or, _done_io, ios); - } - - kref_put(&ios->kref, _last_io); - ret = 0; - - if (sync) { - wait_for_completion(&wait); - ret = exofs_check_io(ios, NULL); - } - return ret; -} - -static void _clear_bio(struct bio *bio) -{ - struct bio_vec *bv; - unsigned i; - - __bio_for_each_segment(bv, bio, i, 0) { - unsigned this_count = bv->bv_len; - - if (likely(PAGE_SIZE == this_count)) - clear_highpage(bv->bv_page); - else - zero_user(bv->bv_page, bv->bv_offset, this_count); - } -} - -int exofs_check_io(struct exofs_io_state *ios, u64 *resid) -{ - enum osd_err_priority acumulated_osd_err = 0; - int acumulated_lin_err = 0; - int i; - - for (i = 0; i < ios->numdevs; i++) { - struct osd_sense_info osi; - struct osd_request *or = ios->per_dev[i].or; - int ret; - - if (unlikely(!or)) - continue; - - ret = osd_req_decode_sense(or, &osi); - if (likely(!ret)) - continue; - - if (OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) { - /* start read offset passed endof file */ - _clear_bio(ios->per_dev[i].bio); - EXOFS_DBGMSG("start read offset passed end of file " - "offset=0x%llx, length=0x%llx\n", - _LLU(ios->per_dev[i].offset), - _LLU(ios->per_dev[i].length)); - - continue; /* we recovered */ - } - - if (osi.osd_err_pri >= acumulated_osd_err) { - acumulated_osd_err = osi.osd_err_pri; - acumulated_lin_err = ret; - } - } - - /* TODO: raid specific residual calculations */ - if (resid) { - if (likely(!acumulated_lin_err)) - *resid = 0; - else - *resid = ios->length; - } - - return acumulated_lin_err; -} - -/* - * L - logical offset into the file - * - * U - The number of bytes in a stripe within a group - * - * U = stripe_unit * group_width - * - * T - The number of bytes striped within a group of component objects - * (before advancing to the next group) - * - * T = stripe_unit * group_width * group_depth - * - * S - The number of bytes striped across all component objects - * before the pattern repeats - * - * S = stripe_unit * group_width * group_depth * group_count - * - * M - The "major" (i.e., across all components) stripe number - * - * M = L / S - * - * G - Counts the groups from the beginning of the major stripe - * - * G = (L - (M * S)) / T [or (L % S) / T] - * - * H - The byte offset within the group - * - * H = (L - (M * S)) % T [or (L % S) % T] - * - * N - The "minor" (i.e., across the group) stripe number - * - * N = H / U - * - * C - The component index coresponding to L - * - * C = (H - (N * U)) / stripe_unit + G * group_width - * [or (L % U) / stripe_unit + G * group_width] - * - * O - The component offset coresponding to L - * - * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit - */ -struct _striping_info { - u64 obj_offset; - u64 group_length; - unsigned dev; - unsigned unit_off; -}; - -static void _calc_stripe_info(struct exofs_io_state *ios, u64 file_offset, - struct _striping_info *si) -{ - u32 stripe_unit = ios->layout->stripe_unit; - u32 group_width = ios->layout->group_width; - u64 group_depth = ios->layout->group_depth; - - u32 U = stripe_unit * group_width; - u64 T = U * group_depth; - u64 S = T * ios->layout->group_count; - u64 M = div64_u64(file_offset, S); - - /* - G = (L - (M * S)) / T - H = (L - (M * S)) % T - */ - u64 LmodS = file_offset - M * S; - u32 G = div64_u64(LmodS, T); - u64 H = LmodS - G * T; - - u32 N = div_u64(H, U); - - /* "H - (N * U)" is just "H % U" so it's bound to u32 */ - si->dev = (u32)(H - (N * U)) / stripe_unit + G * group_width; - si->dev *= ios->layout->mirrors_p1; - - div_u64_rem(file_offset, stripe_unit, &si->unit_off); - - si->obj_offset = si->unit_off + (N * stripe_unit) + - (M * group_depth * stripe_unit); - - si->group_length = T - H; -} - -static int _add_stripe_unit(struct exofs_io_state *ios, unsigned *cur_pg, - unsigned pgbase, struct exofs_per_dev_state *per_dev, - int cur_len) -{ - unsigned pg = *cur_pg; - struct request_queue *q = - osd_request_queue(exofs_ios_od(ios, per_dev->dev)); - - per_dev->length += cur_len; - - if (per_dev->bio == NULL) { - unsigned pages_in_stripe = ios->layout->group_width * - (ios->layout->stripe_unit / PAGE_SIZE); - unsigned bio_size = (ios->nr_pages + pages_in_stripe) / - ios->layout->group_width; - - per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); - if (unlikely(!per_dev->bio)) { - EXOFS_DBGMSG("Failed to allocate BIO size=%u\n", - bio_size); - return -ENOMEM; - } - } - - while (cur_len > 0) { - unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); - unsigned added_len; - - BUG_ON(ios->nr_pages <= pg); - cur_len -= pglen; - - added_len = bio_add_pc_page(q, per_dev->bio, ios->pages[pg], - pglen, pgbase); - if (unlikely(pglen != added_len)) - return -ENOMEM; - pgbase = 0; - ++pg; - } - BUG_ON(cur_len); - - *cur_pg = pg; - return 0; -} - -static int _prepare_one_group(struct exofs_io_state *ios, u64 length, - struct _striping_info *si) -{ - unsigned stripe_unit = ios->layout->stripe_unit; - unsigned mirrors_p1 = ios->layout->mirrors_p1; - unsigned devs_in_group = ios->layout->group_width * mirrors_p1; - unsigned dev = si->dev; - unsigned first_dev = dev - (dev % devs_in_group); - unsigned max_comp = ios->numdevs ? ios->numdevs - mirrors_p1 : 0; - unsigned cur_pg = ios->pages_consumed; - int ret = 0; - - while (length) { - struct exofs_per_dev_state *per_dev = &ios->per_dev[dev]; - unsigned cur_len, page_off = 0; - - if (!per_dev->length) { - per_dev->dev = dev; - if (dev < si->dev) { - per_dev->offset = si->obj_offset + stripe_unit - - si->unit_off; - cur_len = stripe_unit; - } else if (dev == si->dev) { - per_dev->offset = si->obj_offset; - cur_len = stripe_unit - si->unit_off; - page_off = si->unit_off & ~PAGE_MASK; - BUG_ON(page_off && (page_off != ios->pgbase)); - } else { /* dev > si->dev */ - per_dev->offset = si->obj_offset - si->unit_off; - cur_len = stripe_unit; - } - - if (max_comp < dev) - max_comp = dev; - } else { - cur_len = stripe_unit; - } - if (cur_len >= length) - cur_len = length; - - ret = _add_stripe_unit(ios, &cur_pg, page_off , per_dev, - cur_len); - if (unlikely(ret)) - goto out; - - dev += mirrors_p1; - dev = (dev % devs_in_group) + first_dev; - - length -= cur_len; - } -out: - ios->numdevs = max_comp + mirrors_p1; - ios->pages_consumed = cur_pg; - return ret; -} - -static int _prepare_for_striping(struct exofs_io_state *ios) -{ - u64 length = ios->length; - u64 offset = ios->offset; - struct _striping_info si; - int ret = 0; - - if (!ios->pages) { - if (ios->kern_buff) { - struct exofs_per_dev_state *per_dev = &ios->per_dev[0]; - - _calc_stripe_info(ios, ios->offset, &si); - per_dev->offset = si.obj_offset; - per_dev->dev = si.dev; - - /* no cross device without page array */ - BUG_ON((ios->layout->group_width > 1) && - (si.unit_off + ios->length > - ios->layout->stripe_unit)); - } - ios->numdevs = ios->layout->mirrors_p1; - return 0; - } - - while (length) { - _calc_stripe_info(ios, offset, &si); - - if (length < si.group_length) - si.group_length = length; - - ret = _prepare_one_group(ios, si.group_length, &si); - if (unlikely(ret)) - goto out; - - offset += si.group_length; - length -= si.group_length; - } - -out: - return ret; -} - -int exofs_sbi_create(struct exofs_io_state *ios) -{ - int i, ret; - - for (i = 0; i < ios->layout->s_numdevs; i++) { - struct osd_request *or; - - or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("%s: osd_start_request failed\n", __func__); - ret = -ENOMEM; - goto out; - } - ios->per_dev[i].or = or; - ios->numdevs++; - - osd_req_create_object(or, &ios->obj); - } - ret = exofs_io_execute(ios); - -out: - return ret; -} - -int exofs_sbi_remove(struct exofs_io_state *ios) -{ - int i, ret; - - for (i = 0; i < ios->layout->s_numdevs; i++) { - struct osd_request *or; - - or = osd_start_request(exofs_ios_od(ios, i), GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("%s: osd_start_request failed\n", __func__); - ret = -ENOMEM; - goto out; - } - ios->per_dev[i].or = or; - ios->numdevs++; - - osd_req_remove_object(or, &ios->obj); - } - ret = exofs_io_execute(ios); - -out: - return ret; -} - -static int _sbi_write_mirror(struct exofs_io_state *ios, int cur_comp) -{ - struct exofs_per_dev_state *master_dev = &ios->per_dev[cur_comp]; - unsigned dev = ios->per_dev[cur_comp].dev; - unsigned last_comp = cur_comp + ios->layout->mirrors_p1; - int ret = 0; - - if (ios->pages && !master_dev->length) - return 0; /* Just an empty slot */ - - for (; cur_comp < last_comp; ++cur_comp, ++dev) { - struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; - struct osd_request *or; - - or = osd_start_request(exofs_ios_od(ios, dev), GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("%s: osd_start_request failed\n", __func__); - ret = -ENOMEM; - goto out; - } - per_dev->or = or; - per_dev->offset = master_dev->offset; - - if (ios->pages) { - struct bio *bio; - - if (per_dev != master_dev) { - bio = bio_kmalloc(GFP_KERNEL, - master_dev->bio->bi_max_vecs); - if (unlikely(!bio)) { - EXOFS_DBGMSG( - "Failed to allocate BIO size=%u\n", - master_dev->bio->bi_max_vecs); - ret = -ENOMEM; - goto out; - } - - __bio_clone(bio, master_dev->bio); - bio->bi_bdev = NULL; - bio->bi_next = NULL; - per_dev->length = master_dev->length; - per_dev->bio = bio; - per_dev->dev = dev; - } else { - bio = master_dev->bio; - /* FIXME: bio_set_dir() */ - bio->bi_rw |= REQ_WRITE; - } - - osd_req_write(or, &ios->obj, per_dev->offset, bio, - per_dev->length); - EXOFS_DBGMSG("write(0x%llx) offset=0x%llx " - "length=0x%llx dev=%d\n", - _LLU(ios->obj.id), _LLU(per_dev->offset), - _LLU(per_dev->length), dev); - } else if (ios->kern_buff) { - ret = osd_req_write_kern(or, &ios->obj, per_dev->offset, - ios->kern_buff, ios->length); - if (unlikely(ret)) - goto out; - EXOFS_DBGMSG2("write_kern(0x%llx) offset=0x%llx " - "length=0x%llx dev=%d\n", - _LLU(ios->obj.id), _LLU(per_dev->offset), - _LLU(ios->length), dev); - } else { - osd_req_set_attributes(or, &ios->obj); - EXOFS_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", - _LLU(ios->obj.id), ios->out_attr_len, dev); - } - - if (ios->out_attr) - osd_req_add_set_attr_list(or, ios->out_attr, - ios->out_attr_len); - - if (ios->in_attr) - osd_req_add_get_attr_list(or, ios->in_attr, - ios->in_attr_len); - } - -out: - return ret; -} - -int exofs_sbi_write(struct exofs_io_state *ios) -{ - int i; - int ret; - - ret = _prepare_for_striping(ios); - if (unlikely(ret)) - return ret; - - for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - ret = _sbi_write_mirror(ios, i); - if (unlikely(ret)) - return ret; - } - - ret = exofs_io_execute(ios); - return ret; -} - -static int _sbi_read_mirror(struct exofs_io_state *ios, unsigned cur_comp) -{ - struct osd_request *or; - struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; - unsigned first_dev = (unsigned)ios->obj.id; - - if (ios->pages && !per_dev->length) - return 0; /* Just an empty slot */ - - first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; - or = osd_start_request(exofs_ios_od(ios, first_dev), GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("%s: osd_start_request failed\n", __func__); - return -ENOMEM; - } - per_dev->or = or; - - if (ios->pages) { - osd_req_read(or, &ios->obj, per_dev->offset, - per_dev->bio, per_dev->length); - EXOFS_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" - " dev=%d\n", _LLU(ios->obj.id), - _LLU(per_dev->offset), _LLU(per_dev->length), - first_dev); - } else if (ios->kern_buff) { - int ret = osd_req_read_kern(or, &ios->obj, per_dev->offset, - ios->kern_buff, ios->length); - EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " - "length=0x%llx dev=%d ret=>%d\n", - _LLU(ios->obj.id), _LLU(per_dev->offset), - _LLU(ios->length), first_dev, ret); - if (unlikely(ret)) - return ret; - } else { - osd_req_get_attributes(or, &ios->obj); - EXOFS_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", - _LLU(ios->obj.id), ios->in_attr_len, first_dev); - } - if (ios->out_attr) - osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); - - if (ios->in_attr) - osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); - - return 0; -} - -int exofs_sbi_read(struct exofs_io_state *ios) -{ - int i; - int ret; - - ret = _prepare_for_striping(ios); - if (unlikely(ret)) - return ret; - - for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { - ret = _sbi_read_mirror(ios, i); - if (unlikely(ret)) - return ret; - } - - ret = exofs_io_execute(ios); - return ret; -} - -int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr) -{ - struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ - void *iter = NULL; - int nelem; - - do { - nelem = 1; - osd_req_decode_get_attr_list(ios->per_dev[0].or, - &cur_attr, &nelem, &iter); - if ((cur_attr.attr_page == attr->attr_page) && - (cur_attr.attr_id == attr->attr_id)) { - attr->len = cur_attr.len; - attr->val_ptr = cur_attr.val_ptr; - return 0; - } - } while (iter); - - return -EIO; -} - -static int _truncate_mirrors(struct exofs_io_state *ios, unsigned cur_comp, - struct osd_attr *attr) -{ - int last_comp = cur_comp + ios->layout->mirrors_p1; - - for (; cur_comp < last_comp; ++cur_comp) { - struct exofs_per_dev_state *per_dev = &ios->per_dev[cur_comp]; - struct osd_request *or; - - or = osd_start_request(exofs_ios_od(ios, cur_comp), GFP_KERNEL); - if (unlikely(!or)) { - EXOFS_ERR("%s: osd_start_request failed\n", __func__); - return -ENOMEM; - } - per_dev->or = or; - - osd_req_set_attributes(or, &ios->obj); - osd_req_add_set_attr_list(or, attr, 1); - } - - return 0; -} - -int exofs_oi_truncate(struct exofs_i_info *oi, u64 size) -{ - struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info; - struct exofs_io_state *ios; - struct exofs_trunc_attr { - struct osd_attr attr; - __be64 newsize; - } *size_attrs; - struct _striping_info si; - int i, ret; - - ret = exofs_get_io_state(&sbi->layout, &ios); - if (unlikely(ret)) - return ret; - - size_attrs = kcalloc(ios->layout->group_width, sizeof(*size_attrs), - GFP_KERNEL); - if (unlikely(!size_attrs)) { - ret = -ENOMEM; - goto out; - } - - ios->obj.id = exofs_oi_objno(oi); - ios->cred = oi->i_cred; - - ios->numdevs = ios->layout->s_numdevs; - _calc_stripe_info(ios, size, &si); - - for (i = 0; i < ios->layout->group_width; ++i) { - struct exofs_trunc_attr *size_attr = &size_attrs[i]; - u64 obj_size; - - if (i < si.dev) - obj_size = si.obj_offset + - ios->layout->stripe_unit - si.unit_off; - else if (i == si.dev) - obj_size = si.obj_offset; - else /* i > si.dev */ - obj_size = si.obj_offset - si.unit_off; - - size_attr->newsize = cpu_to_be64(obj_size); - size_attr->attr = g_attr_logical_length; - size_attr->attr.val_ptr = &size_attr->newsize; - - ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, - &size_attr->attr); - if (unlikely(ret)) - goto out; - } - ret = exofs_io_execute(ios); - -out: - kfree(size_attrs); - exofs_put_io_state(ios); - return ret; -} diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 4d70db110cf..4731fd991ef 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -46,7 +46,7 @@ static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode) } static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) + unsigned int flags) { struct inode *inode; ino_t ino; @@ -55,17 +55,12 @@ static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ENAMETOOLONG); ino = exofs_inode_by_name(dir, dentry); - inode = NULL; - if (ino) { - inode = exofs_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); - } + inode = ino ? exofs_iget(dir->i_sb, ino) : NULL; return d_splice_alias(inode, dentry); } -static int exofs_create(struct inode *dir, struct dentry *dentry, int mode, - struct nameidata *nd) +static int exofs_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) { struct inode *inode = exofs_new_inode(dir, mode); int err = PTR_ERR(inode); @@ -79,7 +74,7 @@ static int exofs_create(struct inode *dir, struct dentry *dentry, int mode, return err; } -static int exofs_mknod(struct inode *dir, struct dentry *dentry, int mode, +static int exofs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; @@ -148,9 +143,6 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir, { struct inode *inode = old_dentry->d_inode; - if (inode->i_nlink >= EXOFS_LINK_MAX) - return -EMLINK; - inode->i_ctime = CURRENT_TIME; inode_inc_link_count(inode); ihold(inode); @@ -158,13 +150,10 @@ static int exofs_link(struct dentry *old_dentry, struct inode *dir, return exofs_add_nondir(dentry, inode); } -static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +static int exofs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; - int err = -EMLINK; - - if (dir->i_nlink >= EXOFS_LINK_MAX) - goto out; + int err; inode_inc_link_count(dir); @@ -280,11 +269,6 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto out_dir; } else { - if (dir_de) { - err = -EMLINK; - if (new_dir->i_nlink >= EXOFS_LINK_MAX) - goto out_dir; - } err = exofs_add_link(new_dentry, old_inode); if (err) goto out_dir; diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c new file mode 100644 index 00000000000..cfc0205d62c --- /dev/null +++ b/fs/exofs/ore.c @@ -0,0 +1,1164 @@ +/* + * Copyright (C) 2005, 2006 + * Avishay Traeger (avishay@gmail.com) + * Copyright (C) 2008, 2009 + * Boaz Harrosh <bharrosh@panasas.com> + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation. Since it is based on ext2, and the only + * valid version of GPL for the Linux kernel is version 2, the only valid + * version of GPL for exofs is version 2. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <linux/slab.h> +#include <linux/module.h> +#include <asm/div64.h> +#include <linux/lcm.h> + +#include "ore_raid.h" + +MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); +MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); +MODULE_LICENSE("GPL"); + +/* ore_verify_layout does a couple of things: + * 1. Given a minimum number of needed parameters fixes up the rest of the + * members to be operatonals for the ore. The needed parameters are those + * that are defined by the pnfs-objects layout STD. + * 2. Check to see if the current ore code actually supports these parameters + * for example stripe_unit must be a multple of the system PAGE_SIZE, + * and etc... + * 3. Cache some havily used calculations that will be needed by users. + */ + +enum { BIO_MAX_PAGES_KMALLOC = + (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),}; + +int ore_verify_layout(unsigned total_comps, struct ore_layout *layout) +{ + u64 stripe_length; + + switch (layout->raid_algorithm) { + case PNFS_OSD_RAID_0: + layout->parity = 0; + break; + case PNFS_OSD_RAID_5: + layout->parity = 1; + break; + case PNFS_OSD_RAID_PQ: + layout->parity = 2; + break; + case PNFS_OSD_RAID_4: + default: + ORE_ERR("Only RAID_0/5/6 for now received-enum=%d\n", + layout->raid_algorithm); + return -EINVAL; + } + if (0 != (layout->stripe_unit & ~PAGE_MASK)) { + ORE_ERR("Stripe Unit(0x%llx)" + " must be Multples of PAGE_SIZE(0x%lx)\n", + _LLU(layout->stripe_unit), PAGE_SIZE); + return -EINVAL; + } + if (layout->group_width) { + if (!layout->group_depth) { + ORE_ERR("group_depth == 0 && group_width != 0\n"); + return -EINVAL; + } + if (total_comps < (layout->group_width * layout->mirrors_p1)) { + ORE_ERR("Data Map wrong, " + "numdevs=%d < group_width=%d * mirrors=%d\n", + total_comps, layout->group_width, + layout->mirrors_p1); + return -EINVAL; + } + layout->group_count = total_comps / layout->mirrors_p1 / + layout->group_width; + } else { + if (layout->group_depth) { + printk(KERN_NOTICE "Warning: group_depth ignored " + "group_width == 0 && group_depth == %lld\n", + _LLU(layout->group_depth)); + } + layout->group_width = total_comps / layout->mirrors_p1; + layout->group_depth = -1; + layout->group_count = 1; + } + + stripe_length = (u64)layout->group_width * layout->stripe_unit; + if (stripe_length >= (1ULL << 32)) { + ORE_ERR("Stripe_length(0x%llx) >= 32bit is not supported\n", + _LLU(stripe_length)); + return -EINVAL; + } + + layout->max_io_length = + (BIO_MAX_PAGES_KMALLOC * PAGE_SIZE - layout->stripe_unit) * + (layout->group_width - layout->parity); + if (layout->parity) { + unsigned stripe_length = + (layout->group_width - layout->parity) * + layout->stripe_unit; + + layout->max_io_length /= stripe_length; + layout->max_io_length *= stripe_length; + } + ORE_DBGMSG("max_io_length=0x%lx\n", layout->max_io_length); + + return 0; +} +EXPORT_SYMBOL(ore_verify_layout); + +static u8 *_ios_cred(struct ore_io_state *ios, unsigned index) +{ + return ios->oc->comps[index & ios->oc->single_comp].cred; +} + +static struct osd_obj_id *_ios_obj(struct ore_io_state *ios, unsigned index) +{ + return &ios->oc->comps[index & ios->oc->single_comp].obj; +} + +static struct osd_dev *_ios_od(struct ore_io_state *ios, unsigned index) +{ + ORE_DBGMSG2("oc->first_dev=%d oc->numdevs=%d i=%d oc->ods=%p\n", + ios->oc->first_dev, ios->oc->numdevs, index, + ios->oc->ods); + + return ore_comp_dev(ios->oc, index); +} + +int _ore_get_io_state(struct ore_layout *layout, + struct ore_components *oc, unsigned numdevs, + unsigned sgs_per_dev, unsigned num_par_pages, + struct ore_io_state **pios) +{ + struct ore_io_state *ios; + struct page **pages; + struct osd_sg_entry *sgilist; + struct __alloc_all_io_state { + struct ore_io_state ios; + struct ore_per_dev_state per_dev[numdevs]; + union { + struct osd_sg_entry sglist[sgs_per_dev * numdevs]; + struct page *pages[num_par_pages]; + }; + } *_aios; + + if (likely(sizeof(*_aios) <= PAGE_SIZE)) { + _aios = kzalloc(sizeof(*_aios), GFP_KERNEL); + if (unlikely(!_aios)) { + ORE_DBGMSG("Failed kzalloc bytes=%zd\n", + sizeof(*_aios)); + *pios = NULL; + return -ENOMEM; + } + pages = num_par_pages ? _aios->pages : NULL; + sgilist = sgs_per_dev ? _aios->sglist : NULL; + ios = &_aios->ios; + } else { + struct __alloc_small_io_state { + struct ore_io_state ios; + struct ore_per_dev_state per_dev[numdevs]; + } *_aio_small; + union __extra_part { + struct osd_sg_entry sglist[sgs_per_dev * numdevs]; + struct page *pages[num_par_pages]; + } *extra_part; + + _aio_small = kzalloc(sizeof(*_aio_small), GFP_KERNEL); + if (unlikely(!_aio_small)) { + ORE_DBGMSG("Failed alloc first part bytes=%zd\n", + sizeof(*_aio_small)); + *pios = NULL; + return -ENOMEM; + } + extra_part = kzalloc(sizeof(*extra_part), GFP_KERNEL); + if (unlikely(!extra_part)) { + ORE_DBGMSG("Failed alloc second part bytes=%zd\n", + sizeof(*extra_part)); + kfree(_aio_small); + *pios = NULL; + return -ENOMEM; + } + + pages = num_par_pages ? extra_part->pages : NULL; + sgilist = sgs_per_dev ? extra_part->sglist : NULL; + /* In this case the per_dev[0].sgilist holds the pointer to + * be freed + */ + ios = &_aio_small->ios; + ios->extra_part_alloc = true; + } + + if (pages) { + ios->parity_pages = pages; + ios->max_par_pages = num_par_pages; + } + if (sgilist) { + unsigned d; + + for (d = 0; d < numdevs; ++d) { + ios->per_dev[d].sglist = sgilist; + sgilist += sgs_per_dev; + } + ios->sgs_per_dev = sgs_per_dev; + } + + ios->layout = layout; + ios->oc = oc; + *pios = ios; + return 0; +} + +/* Allocate an io_state for only a single group of devices + * + * If a user needs to call ore_read/write() this version must be used becase it + * allocates extra stuff for striping and raid. + * The ore might decide to only IO less then @length bytes do to alignmets + * and constrains as follows: + * - The IO cannot cross group boundary. + * - In raid5/6 The end of the IO must align at end of a stripe eg. + * (@offset + @length) % strip_size == 0. Or the complete range is within a + * single stripe. + * - Memory condition only permitted a shorter IO. (A user can use @length=~0 + * And check the returned ios->length for max_io_size.) + * + * The caller must check returned ios->length (and/or ios->nr_pages) and + * re-issue these pages that fall outside of ios->length + */ +int ore_get_rw_state(struct ore_layout *layout, struct ore_components *oc, + bool is_reading, u64 offset, u64 length, + struct ore_io_state **pios) +{ + struct ore_io_state *ios; + unsigned numdevs = layout->group_width * layout->mirrors_p1; + unsigned sgs_per_dev = 0, max_par_pages = 0; + int ret; + + if (layout->parity && length) { + unsigned data_devs = layout->group_width - layout->parity; + unsigned stripe_size = layout->stripe_unit * data_devs; + unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; + u32 remainder; + u64 num_stripes; + u64 num_raid_units; + + num_stripes = div_u64_rem(length, stripe_size, &remainder); + if (remainder) + ++num_stripes; + + num_raid_units = num_stripes * layout->parity; + + if (is_reading) { + /* For reads add per_dev sglist array */ + /* TODO: Raid 6 we need twice more. Actually: + * num_stripes / LCMdP(W,P); + * if (W%P != 0) num_stripes *= parity; + */ + + /* first/last seg is split */ + num_raid_units += layout->group_width; + sgs_per_dev = div_u64(num_raid_units, data_devs) + 2; + } else { + /* For Writes add parity pages array. */ + max_par_pages = num_raid_units * pages_in_unit * + sizeof(struct page *); + } + } + + ret = _ore_get_io_state(layout, oc, numdevs, sgs_per_dev, max_par_pages, + pios); + if (unlikely(ret)) + return ret; + + ios = *pios; + ios->reading = is_reading; + ios->offset = offset; + + if (length) { + ore_calc_stripe_info(layout, offset, length, &ios->si); + ios->length = ios->si.length; + ios->nr_pages = ((ios->offset & (PAGE_SIZE - 1)) + + ios->length + PAGE_SIZE - 1) / PAGE_SIZE; + if (layout->parity) + _ore_post_alloc_raid_stuff(ios); + } + + return 0; +} +EXPORT_SYMBOL(ore_get_rw_state); + +/* Allocate an io_state for all the devices in the comps array + * + * This version of io_state allocation is used mostly by create/remove + * and trunc where we currently need all the devices. The only wastful + * bit is the read/write_attributes with no IO. Those sites should + * be converted to use ore_get_rw_state() with length=0 + */ +int ore_get_io_state(struct ore_layout *layout, struct ore_components *oc, + struct ore_io_state **pios) +{ + return _ore_get_io_state(layout, oc, oc->numdevs, 0, 0, pios); +} +EXPORT_SYMBOL(ore_get_io_state); + +void ore_put_io_state(struct ore_io_state *ios) +{ + if (ios) { + unsigned i; + + for (i = 0; i < ios->numdevs; i++) { + struct ore_per_dev_state *per_dev = &ios->per_dev[i]; + + if (per_dev->or) + osd_end_request(per_dev->or); + if (per_dev->bio) + bio_put(per_dev->bio); + } + + _ore_free_raid_stuff(ios); + kfree(ios); + } +} +EXPORT_SYMBOL(ore_put_io_state); + +static void _sync_done(struct ore_io_state *ios, void *p) +{ + struct completion *waiting = p; + + complete(waiting); +} + +static void _last_io(struct kref *kref) +{ + struct ore_io_state *ios = container_of( + kref, struct ore_io_state, kref); + + ios->done(ios, ios->private); +} + +static void _done_io(struct osd_request *or, void *p) +{ + struct ore_io_state *ios = p; + + kref_put(&ios->kref, _last_io); +} + +int ore_io_execute(struct ore_io_state *ios) +{ + DECLARE_COMPLETION_ONSTACK(wait); + bool sync = (ios->done == NULL); + int i, ret; + + if (sync) { + ios->done = _sync_done; + ios->private = &wait; + } + + for (i = 0; i < ios->numdevs; i++) { + struct osd_request *or = ios->per_dev[i].or; + if (unlikely(!or)) + continue; + + ret = osd_finalize_request(or, 0, _ios_cred(ios, i), NULL); + if (unlikely(ret)) { + ORE_DBGMSG("Failed to osd_finalize_request() => %d\n", + ret); + return ret; + } + } + + kref_init(&ios->kref); + + for (i = 0; i < ios->numdevs; i++) { + struct osd_request *or = ios->per_dev[i].or; + if (unlikely(!or)) + continue; + + kref_get(&ios->kref); + osd_execute_request_async(or, _done_io, ios); + } + + kref_put(&ios->kref, _last_io); + ret = 0; + + if (sync) { + wait_for_completion(&wait); + ret = ore_check_io(ios, NULL); + } + return ret; +} + +static void _clear_bio(struct bio *bio) +{ + struct bio_vec *bv; + unsigned i; + + bio_for_each_segment_all(bv, bio, i) { + unsigned this_count = bv->bv_len; + + if (likely(PAGE_SIZE == this_count)) + clear_highpage(bv->bv_page); + else + zero_user(bv->bv_page, bv->bv_offset, this_count); + } +} + +int ore_check_io(struct ore_io_state *ios, ore_on_dev_error on_dev_error) +{ + enum osd_err_priority acumulated_osd_err = 0; + int acumulated_lin_err = 0; + int i; + + for (i = 0; i < ios->numdevs; i++) { + struct osd_sense_info osi; + struct ore_per_dev_state *per_dev = &ios->per_dev[i]; + struct osd_request *or = per_dev->or; + int ret; + + if (unlikely(!or)) + continue; + + ret = osd_req_decode_sense(or, &osi); + if (likely(!ret)) + continue; + + if ((OSD_ERR_PRI_CLEAR_PAGES == osi.osd_err_pri) && + per_dev->bio) { + /* start read offset passed endof file. + * Note: if we do not have bio it means read-attributes + * In this case we should return error to caller. + */ + _clear_bio(per_dev->bio); + ORE_DBGMSG("start read offset passed end of file " + "offset=0x%llx, length=0x%llx\n", + _LLU(per_dev->offset), + _LLU(per_dev->length)); + + continue; /* we recovered */ + } + + if (on_dev_error) { + u64 residual = ios->reading ? + or->in.residual : or->out.residual; + u64 offset = (ios->offset + ios->length) - residual; + unsigned dev = per_dev->dev - ios->oc->first_dev; + struct ore_dev *od = ios->oc->ods[dev]; + + on_dev_error(ios, od, dev, osi.osd_err_pri, + offset, residual); + } + if (osi.osd_err_pri >= acumulated_osd_err) { + acumulated_osd_err = osi.osd_err_pri; + acumulated_lin_err = ret; + } + } + + return acumulated_lin_err; +} +EXPORT_SYMBOL(ore_check_io); + +/* + * L - logical offset into the file + * + * D - number of Data devices + * D = group_width - parity + * + * U - The number of bytes in a stripe within a group + * U = stripe_unit * D + * + * T - The number of bytes striped within a group of component objects + * (before advancing to the next group) + * T = U * group_depth + * + * S - The number of bytes striped across all component objects + * before the pattern repeats + * S = T * group_count + * + * M - The "major" (i.e., across all components) cycle number + * M = L / S + * + * G - Counts the groups from the beginning of the major cycle + * G = (L - (M * S)) / T [or (L % S) / T] + * + * H - The byte offset within the group + * H = (L - (M * S)) % T [or (L % S) % T] + * + * N - The "minor" (i.e., across the group) stripe number + * N = H / U + * + * C - The component index coresponding to L + * + * C = (H - (N * U)) / stripe_unit + G * D + * [or (L % U) / stripe_unit + G * D] + * + * O - The component offset coresponding to L + * O = L % stripe_unit + N * stripe_unit + M * group_depth * stripe_unit + * + * LCMdP – Parity cycle: Lowest Common Multiple of group_width, parity + * divide by parity + * LCMdP = lcm(group_width, parity) / parity + * + * R - The parity Rotation stripe + * (Note parity cycle always starts at a group's boundary) + * R = N % LCMdP + * + * I = the first parity device index + * I = (group_width + group_width - R*parity - parity) % group_width + * + * Craid - The component index Rotated + * Craid = (group_width + C - R*parity) % group_width + * (We add the group_width to avoid negative numbers modulo math) + */ +void ore_calc_stripe_info(struct ore_layout *layout, u64 file_offset, + u64 length, struct ore_striping_info *si) +{ + u32 stripe_unit = layout->stripe_unit; + u32 group_width = layout->group_width; + u64 group_depth = layout->group_depth; + u32 parity = layout->parity; + + u32 D = group_width - parity; + u32 U = D * stripe_unit; + u64 T = U * group_depth; + u64 S = T * layout->group_count; + u64 M = div64_u64(file_offset, S); + + /* + G = (L - (M * S)) / T + H = (L - (M * S)) % T + */ + u64 LmodS = file_offset - M * S; + u32 G = div64_u64(LmodS, T); + u64 H = LmodS - G * T; + + u32 N = div_u64(H, U); + u32 Nlast; + + /* "H - (N * U)" is just "H % U" so it's bound to u32 */ + u32 C = (u32)(H - (N * U)) / stripe_unit + G * group_width; + u32 first_dev = C - C % group_width; + + div_u64_rem(file_offset, stripe_unit, &si->unit_off); + + si->obj_offset = si->unit_off + (N * stripe_unit) + + (M * group_depth * stripe_unit); + si->cur_comp = C - first_dev; + si->cur_pg = si->unit_off / PAGE_SIZE; + + if (parity) { + u32 LCMdP = lcm(group_width, parity) / parity; + /* R = N % LCMdP; */ + u32 RxP = (N % LCMdP) * parity; + + si->par_dev = (group_width + group_width - parity - RxP) % + group_width + first_dev; + si->dev = (group_width + group_width + C - RxP) % + group_width + first_dev; + si->bytes_in_stripe = U; + si->first_stripe_start = M * S + G * T + N * U; + } else { + /* Make the math correct see _prepare_one_group */ + si->par_dev = group_width; + si->dev = C; + } + + si->dev *= layout->mirrors_p1; + si->par_dev *= layout->mirrors_p1; + si->offset = file_offset; + si->length = T - H; + if (si->length > length) + si->length = length; + + Nlast = div_u64(H + si->length + U - 1, U); + si->maxdevUnits = Nlast - N; + + si->M = M; +} +EXPORT_SYMBOL(ore_calc_stripe_info); + +int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct page **pages, + struct ore_per_dev_state *per_dev, int cur_len) +{ + unsigned pg = *cur_pg; + struct request_queue *q = + osd_request_queue(_ios_od(ios, per_dev->dev)); + unsigned len = cur_len; + int ret; + + if (per_dev->bio == NULL) { + unsigned bio_size; + + if (!ios->reading) { + bio_size = ios->si.maxdevUnits; + } else { + bio_size = (ios->si.maxdevUnits + 1) * + (ios->layout->group_width - ios->layout->parity) / + ios->layout->group_width; + } + bio_size *= (ios->layout->stripe_unit / PAGE_SIZE); + + per_dev->bio = bio_kmalloc(GFP_KERNEL, bio_size); + if (unlikely(!per_dev->bio)) { + ORE_DBGMSG("Failed to allocate BIO size=%u\n", + bio_size); + ret = -ENOMEM; + goto out; + } + } + + while (cur_len > 0) { + unsigned pglen = min_t(unsigned, PAGE_SIZE - pgbase, cur_len); + unsigned added_len; + + cur_len -= pglen; + + added_len = bio_add_pc_page(q, per_dev->bio, pages[pg], + pglen, pgbase); + if (unlikely(pglen != added_len)) { + /* If bi_vcnt == bi_max then this is a SW BUG */ + ORE_DBGMSG("Failed bio_add_pc_page bi_vcnt=0x%x " + "bi_max=0x%x BIO_MAX=0x%x cur_len=0x%x\n", + per_dev->bio->bi_vcnt, + per_dev->bio->bi_max_vecs, + BIO_MAX_PAGES_KMALLOC, cur_len); + ret = -ENOMEM; + goto out; + } + _add_stripe_page(ios->sp2d, &ios->si, pages[pg]); + + pgbase = 0; + ++pg; + } + BUG_ON(cur_len); + + per_dev->length += len; + *cur_pg = pg; + ret = 0; +out: /* we fail the complete unit on an error eg don't advance + * per_dev->length and cur_pg. This means that we might have a bigger + * bio than the CDB requested length (per_dev->length). That's fine + * only the oposite is fatal. + */ + return ret; +} + +static int _add_parity_units(struct ore_io_state *ios, + struct ore_striping_info *si, + unsigned dev, unsigned first_dev, + unsigned mirrors_p1, unsigned devs_in_group, + unsigned cur_len) +{ + unsigned do_parity; + int ret = 0; + + for (do_parity = ios->layout->parity; do_parity; --do_parity) { + struct ore_per_dev_state *per_dev; + + per_dev = &ios->per_dev[dev - first_dev]; + if (!per_dev->length && !per_dev->offset) { + /* Only/always the parity unit of the first + * stripe will be empty. So this is a chance to + * initialize the per_dev info. + */ + per_dev->dev = dev; + per_dev->offset = si->obj_offset - si->unit_off; + } + + ret = _ore_add_parity_unit(ios, si, per_dev, cur_len, + do_parity == 1); + if (unlikely(ret)) + break; + + if (do_parity != 1) { + dev = ((dev + mirrors_p1) % devs_in_group) + first_dev; + si->cur_comp = (si->cur_comp + 1) % + ios->layout->group_width; + } + } + + return ret; +} + +static int _prepare_for_striping(struct ore_io_state *ios) +{ + struct ore_striping_info *si = &ios->si; + unsigned stripe_unit = ios->layout->stripe_unit; + unsigned mirrors_p1 = ios->layout->mirrors_p1; + unsigned group_width = ios->layout->group_width; + unsigned devs_in_group = group_width * mirrors_p1; + unsigned dev = si->dev; + unsigned first_dev = dev - (dev % devs_in_group); + unsigned cur_pg = ios->pages_consumed; + u64 length = ios->length; + int ret = 0; + + if (!ios->pages) { + ios->numdevs = ios->layout->mirrors_p1; + return 0; + } + + BUG_ON(length > si->length); + + while (length) { + struct ore_per_dev_state *per_dev = + &ios->per_dev[dev - first_dev]; + unsigned cur_len, page_off = 0; + + if (!per_dev->length && !per_dev->offset) { + /* First time initialize the per_dev info. */ + per_dev->dev = dev; + if (dev == si->dev) { + WARN_ON(dev == si->par_dev); + per_dev->offset = si->obj_offset; + cur_len = stripe_unit - si->unit_off; + page_off = si->unit_off & ~PAGE_MASK; + BUG_ON(page_off && (page_off != ios->pgbase)); + } else { + per_dev->offset = si->obj_offset - si->unit_off; + cur_len = stripe_unit; + } + } else { + cur_len = stripe_unit; + } + if (cur_len >= length) + cur_len = length; + + ret = _ore_add_stripe_unit(ios, &cur_pg, page_off, ios->pages, + per_dev, cur_len); + if (unlikely(ret)) + goto out; + + length -= cur_len; + + dev = ((dev + mirrors_p1) % devs_in_group) + first_dev; + si->cur_comp = (si->cur_comp + 1) % group_width; + if (unlikely((dev == si->par_dev) || (!length && ios->sp2d))) { + if (!length && ios->sp2d) { + /* If we are writing and this is the very last + * stripe. then operate on parity dev. + */ + dev = si->par_dev; + /* If last stripe operate on parity comp */ + si->cur_comp = group_width - ios->layout->parity; + } + + /* In writes cur_len just means if it's the + * last one. See _ore_add_parity_unit. + */ + ret = _add_parity_units(ios, si, dev, first_dev, + mirrors_p1, devs_in_group, + ios->sp2d ? length : cur_len); + if (unlikely(ret)) + goto out; + + /* Rotate next par_dev backwards with wraping */ + si->par_dev = (devs_in_group + si->par_dev - + ios->layout->parity * mirrors_p1) % + devs_in_group + first_dev; + /* Next stripe, start fresh */ + si->cur_comp = 0; + si->cur_pg = 0; + si->obj_offset += cur_len; + si->unit_off = 0; + } + } +out: + ios->numdevs = devs_in_group; + ios->pages_consumed = cur_pg; + return ret; +} + +int ore_create(struct ore_io_state *ios) +{ + int i, ret; + + for (i = 0; i < ios->oc->numdevs; i++) { + struct osd_request *or; + + or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); + if (unlikely(!or)) { + ORE_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + ios->per_dev[i].or = or; + ios->numdevs++; + + osd_req_create_object(or, _ios_obj(ios, i)); + } + ret = ore_io_execute(ios); + +out: + return ret; +} +EXPORT_SYMBOL(ore_create); + +int ore_remove(struct ore_io_state *ios) +{ + int i, ret; + + for (i = 0; i < ios->oc->numdevs; i++) { + struct osd_request *or; + + or = osd_start_request(_ios_od(ios, i), GFP_KERNEL); + if (unlikely(!or)) { + ORE_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + ios->per_dev[i].or = or; + ios->numdevs++; + + osd_req_remove_object(or, _ios_obj(ios, i)); + } + ret = ore_io_execute(ios); + +out: + return ret; +} +EXPORT_SYMBOL(ore_remove); + +static int _write_mirror(struct ore_io_state *ios, int cur_comp) +{ + struct ore_per_dev_state *master_dev = &ios->per_dev[cur_comp]; + unsigned dev = ios->per_dev[cur_comp].dev; + unsigned last_comp = cur_comp + ios->layout->mirrors_p1; + int ret = 0; + + if (ios->pages && !master_dev->length) + return 0; /* Just an empty slot */ + + for (; cur_comp < last_comp; ++cur_comp, ++dev) { + struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; + struct osd_request *or; + + or = osd_start_request(_ios_od(ios, dev), GFP_KERNEL); + if (unlikely(!or)) { + ORE_ERR("%s: osd_start_request failed\n", __func__); + ret = -ENOMEM; + goto out; + } + per_dev->or = or; + + if (ios->pages) { + struct bio *bio; + + if (per_dev != master_dev) { + bio = bio_clone_kmalloc(master_dev->bio, + GFP_KERNEL); + if (unlikely(!bio)) { + ORE_DBGMSG( + "Failed to allocate BIO size=%u\n", + master_dev->bio->bi_max_vecs); + ret = -ENOMEM; + goto out; + } + + bio->bi_bdev = NULL; + bio->bi_next = NULL; + per_dev->offset = master_dev->offset; + per_dev->length = master_dev->length; + per_dev->bio = bio; + per_dev->dev = dev; + } else { + bio = master_dev->bio; + /* FIXME: bio_set_dir() */ + bio->bi_rw |= REQ_WRITE; + } + + osd_req_write(or, _ios_obj(ios, cur_comp), + per_dev->offset, bio, per_dev->length); + ORE_DBGMSG("write(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d\n", + _LLU(_ios_obj(ios, cur_comp)->id), + _LLU(per_dev->offset), + _LLU(per_dev->length), dev); + } else if (ios->kern_buff) { + per_dev->offset = ios->si.obj_offset; + per_dev->dev = ios->si.dev + dev; + + /* no cross device without page array */ + BUG_ON((ios->layout->group_width > 1) && + (ios->si.unit_off + ios->length > + ios->layout->stripe_unit)); + + ret = osd_req_write_kern(or, _ios_obj(ios, cur_comp), + per_dev->offset, + ios->kern_buff, ios->length); + if (unlikely(ret)) + goto out; + ORE_DBGMSG2("write_kern(0x%llx) offset=0x%llx " + "length=0x%llx dev=%d\n", + _LLU(_ios_obj(ios, cur_comp)->id), + _LLU(per_dev->offset), + _LLU(ios->length), per_dev->dev); + } else { + osd_req_set_attributes(or, _ios_obj(ios, cur_comp)); + ORE_DBGMSG2("obj(0x%llx) set_attributes=%d dev=%d\n", + _LLU(_ios_obj(ios, cur_comp)->id), + ios->out_attr_len, dev); + } + + if (ios->out_attr) + osd_req_add_set_attr_list(or, ios->out_attr, + ios->out_attr_len); + + if (ios->in_attr) + osd_req_add_get_attr_list(or, ios->in_attr, + ios->in_attr_len); + } + +out: + return ret; +} + +int ore_write(struct ore_io_state *ios) +{ + int i; + int ret; + + if (unlikely(ios->sp2d && !ios->r4w)) { + /* A library is attempting a RAID-write without providing + * a pages lock interface. + */ + WARN_ON_ONCE(1); + return -ENOTSUPP; + } + + ret = _prepare_for_striping(ios); + if (unlikely(ret)) + return ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + ret = _write_mirror(ios, i); + if (unlikely(ret)) + return ret; + } + + ret = ore_io_execute(ios); + return ret; +} +EXPORT_SYMBOL(ore_write); + +int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp) +{ + struct osd_request *or; + struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; + struct osd_obj_id *obj = _ios_obj(ios, cur_comp); + unsigned first_dev = (unsigned)obj->id; + + if (ios->pages && !per_dev->length) + return 0; /* Just an empty slot */ + + first_dev = per_dev->dev + first_dev % ios->layout->mirrors_p1; + or = osd_start_request(_ios_od(ios, first_dev), GFP_KERNEL); + if (unlikely(!or)) { + ORE_ERR("%s: osd_start_request failed\n", __func__); + return -ENOMEM; + } + per_dev->or = or; + + if (ios->pages) { + if (per_dev->cur_sg) { + /* finalize the last sg_entry */ + _ore_add_sg_seg(per_dev, 0, false); + if (unlikely(!per_dev->cur_sg)) + return 0; /* Skip parity only device */ + + osd_req_read_sg(or, obj, per_dev->bio, + per_dev->sglist, per_dev->cur_sg); + } else { + /* The no raid case */ + osd_req_read(or, obj, per_dev->offset, + per_dev->bio, per_dev->length); + } + + ORE_DBGMSG("read(0x%llx) offset=0x%llx length=0x%llx" + " dev=%d sg_len=%d\n", _LLU(obj->id), + _LLU(per_dev->offset), _LLU(per_dev->length), + first_dev, per_dev->cur_sg); + } else { + BUG_ON(ios->kern_buff); + + osd_req_get_attributes(or, obj); + ORE_DBGMSG2("obj(0x%llx) get_attributes=%d dev=%d\n", + _LLU(obj->id), + ios->in_attr_len, first_dev); + } + if (ios->out_attr) + osd_req_add_set_attr_list(or, ios->out_attr, ios->out_attr_len); + + if (ios->in_attr) + osd_req_add_get_attr_list(or, ios->in_attr, ios->in_attr_len); + + return 0; +} + +int ore_read(struct ore_io_state *ios) +{ + int i; + int ret; + + ret = _prepare_for_striping(ios); + if (unlikely(ret)) + return ret; + + for (i = 0; i < ios->numdevs; i += ios->layout->mirrors_p1) { + ret = _ore_read_mirror(ios, i); + if (unlikely(ret)) + return ret; + } + + ret = ore_io_execute(ios); + return ret; +} +EXPORT_SYMBOL(ore_read); + +int extract_attr_from_ios(struct ore_io_state *ios, struct osd_attr *attr) +{ + struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ + void *iter = NULL; + int nelem; + + do { + nelem = 1; + osd_req_decode_get_attr_list(ios->per_dev[0].or, + &cur_attr, &nelem, &iter); + if ((cur_attr.attr_page == attr->attr_page) && + (cur_attr.attr_id == attr->attr_id)) { + attr->len = cur_attr.len; + attr->val_ptr = cur_attr.val_ptr; + return 0; + } + } while (iter); + + return -EIO; +} +EXPORT_SYMBOL(extract_attr_from_ios); + +static int _truncate_mirrors(struct ore_io_state *ios, unsigned cur_comp, + struct osd_attr *attr) +{ + int last_comp = cur_comp + ios->layout->mirrors_p1; + + for (; cur_comp < last_comp; ++cur_comp) { + struct ore_per_dev_state *per_dev = &ios->per_dev[cur_comp]; + struct osd_request *or; + + or = osd_start_request(_ios_od(ios, cur_comp), GFP_KERNEL); + if (unlikely(!or)) { + ORE_ERR("%s: osd_start_request failed\n", __func__); + return -ENOMEM; + } + per_dev->or = or; + + osd_req_set_attributes(or, _ios_obj(ios, cur_comp)); + osd_req_add_set_attr_list(or, attr, 1); + } + + return 0; +} + +struct _trunc_info { + struct ore_striping_info si; + u64 prev_group_obj_off; + u64 next_group_obj_off; + + unsigned first_group_dev; + unsigned nex_group_dev; +}; + +static void _calc_trunk_info(struct ore_layout *layout, u64 file_offset, + struct _trunc_info *ti) +{ + unsigned stripe_unit = layout->stripe_unit; + + ore_calc_stripe_info(layout, file_offset, 0, &ti->si); + + ti->prev_group_obj_off = ti->si.M * stripe_unit; + ti->next_group_obj_off = ti->si.M ? (ti->si.M - 1) * stripe_unit : 0; + + ti->first_group_dev = ti->si.dev - (ti->si.dev % layout->group_width); + ti->nex_group_dev = ti->first_group_dev + layout->group_width; +} + +int ore_truncate(struct ore_layout *layout, struct ore_components *oc, + u64 size) +{ + struct ore_io_state *ios; + struct exofs_trunc_attr { + struct osd_attr attr; + __be64 newsize; + } *size_attrs; + struct _trunc_info ti; + int i, ret; + + ret = ore_get_io_state(layout, oc, &ios); + if (unlikely(ret)) + return ret; + + _calc_trunk_info(ios->layout, size, &ti); + + size_attrs = kcalloc(ios->oc->numdevs, sizeof(*size_attrs), + GFP_KERNEL); + if (unlikely(!size_attrs)) { + ret = -ENOMEM; + goto out; + } + + ios->numdevs = ios->oc->numdevs; + + for (i = 0; i < ios->numdevs; ++i) { + struct exofs_trunc_attr *size_attr = &size_attrs[i]; + u64 obj_size; + + if (i < ti.first_group_dev) + obj_size = ti.prev_group_obj_off; + else if (i >= ti.nex_group_dev) + obj_size = ti.next_group_obj_off; + else if (i < ti.si.dev) /* dev within this group */ + obj_size = ti.si.obj_offset + + ios->layout->stripe_unit - ti.si.unit_off; + else if (i == ti.si.dev) + obj_size = ti.si.obj_offset; + else /* i > ti.dev */ + obj_size = ti.si.obj_offset - ti.si.unit_off; + + size_attr->newsize = cpu_to_be64(obj_size); + size_attr->attr = g_attr_logical_length; + size_attr->attr.val_ptr = &size_attr->newsize; + + ORE_DBGMSG2("trunc(0x%llx) obj_offset=0x%llx dev=%d\n", + _LLU(oc->comps->obj.id), _LLU(obj_size), i); + ret = _truncate_mirrors(ios, i * ios->layout->mirrors_p1, + &size_attr->attr); + if (unlikely(ret)) + goto out; + } + ret = ore_io_execute(ios); + +out: + kfree(size_attrs); + ore_put_io_state(ios); + return ret; +} +EXPORT_SYMBOL(ore_truncate); + +const struct osd_attr g_attr_logical_length = ATTR_DEF( + OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); +EXPORT_SYMBOL(g_attr_logical_length); diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c new file mode 100644 index 00000000000..7f20f25c232 --- /dev/null +++ b/fs/exofs/ore_raid.c @@ -0,0 +1,721 @@ +/* + * Copyright (C) 2011 + * Boaz Harrosh <bharrosh@panasas.com> + * + * This file is part of the objects raid engine (ore). + * + * It is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with "ore". If not, write to the Free Software Foundation, Inc: + * "Free Software Foundation <info@fsf.org>" + */ + +#include <linux/gfp.h> +#include <linux/async_tx.h> + +#include "ore_raid.h" + +#undef ORE_DBGMSG2 +#define ORE_DBGMSG2 ORE_DBGMSG + +static struct page *_raid_page_alloc(void) +{ + return alloc_page(GFP_KERNEL); +} + +static void _raid_page_free(struct page *p) +{ + __free_page(p); +} + +/* This struct is forward declare in ore_io_state, but is private to here. + * It is put on ios->sp2d for RAID5/6 writes only. See _gen_xor_unit. + * + * __stripe_pages_2d is a 2d array of pages, and it is also a corner turn. + * Ascending page index access is sp2d(p-minor, c-major). But storage is + * sp2d[p-minor][c-major], so it can be properlly presented to the async-xor + * API. + */ +struct __stripe_pages_2d { + /* Cache some hot path repeated calculations */ + unsigned parity; + unsigned data_devs; + unsigned pages_in_unit; + + bool needed ; + + /* Array size is pages_in_unit (layout->stripe_unit / PAGE_SIZE) */ + struct __1_page_stripe { + bool alloc; + unsigned write_count; + struct async_submit_ctl submit; + struct dma_async_tx_descriptor *tx; + + /* The size of this array is data_devs + parity */ + struct page **pages; + struct page **scribble; + /* bool array, size of this array is data_devs */ + char *page_is_read; + } _1p_stripes[]; +}; + +/* This can get bigger then a page. So support multiple page allocations + * _sp2d_free should be called even if _sp2d_alloc fails (by returning + * none-zero). + */ +static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, + unsigned parity, struct __stripe_pages_2d **psp2d) +{ + struct __stripe_pages_2d *sp2d; + unsigned data_devs = group_width - parity; + struct _alloc_all_bytes { + struct __alloc_stripe_pages_2d { + struct __stripe_pages_2d sp2d; + struct __1_page_stripe _1p_stripes[pages_in_unit]; + } __asp2d; + struct __alloc_1p_arrays { + struct page *pages[group_width]; + struct page *scribble[group_width]; + char page_is_read[data_devs]; + } __a1pa[pages_in_unit]; + } *_aab; + struct __alloc_1p_arrays *__a1pa; + struct __alloc_1p_arrays *__a1pa_end; + const unsigned sizeof__a1pa = sizeof(_aab->__a1pa[0]); + unsigned num_a1pa, alloc_size, i; + + /* FIXME: check these numbers in ore_verify_layout */ + BUG_ON(sizeof(_aab->__asp2d) > PAGE_SIZE); + BUG_ON(sizeof__a1pa > PAGE_SIZE); + + if (sizeof(*_aab) > PAGE_SIZE) { + num_a1pa = (PAGE_SIZE - sizeof(_aab->__asp2d)) / sizeof__a1pa; + alloc_size = sizeof(_aab->__asp2d) + sizeof__a1pa * num_a1pa; + } else { + num_a1pa = pages_in_unit; + alloc_size = sizeof(*_aab); + } + + _aab = kzalloc(alloc_size, GFP_KERNEL); + if (unlikely(!_aab)) { + ORE_DBGMSG("!! Failed to alloc sp2d size=%d\n", alloc_size); + return -ENOMEM; + } + + sp2d = &_aab->__asp2d.sp2d; + *psp2d = sp2d; /* From here Just call _sp2d_free */ + + __a1pa = _aab->__a1pa; + __a1pa_end = __a1pa + num_a1pa; + + for (i = 0; i < pages_in_unit; ++i) { + if (unlikely(__a1pa >= __a1pa_end)) { + num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, + pages_in_unit - i); + + __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL); + if (unlikely(!__a1pa)) { + ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", + num_a1pa); + return -ENOMEM; + } + __a1pa_end = __a1pa + num_a1pa; + /* First *pages is marked for kfree of the buffer */ + sp2d->_1p_stripes[i].alloc = true; + } + + sp2d->_1p_stripes[i].pages = __a1pa->pages; + sp2d->_1p_stripes[i].scribble = __a1pa->scribble ; + sp2d->_1p_stripes[i].page_is_read = __a1pa->page_is_read; + ++__a1pa; + } + + sp2d->parity = parity; + sp2d->data_devs = data_devs; + sp2d->pages_in_unit = pages_in_unit; + return 0; +} + +static void _sp2d_reset(struct __stripe_pages_2d *sp2d, + const struct _ore_r4w_op *r4w, void *priv) +{ + unsigned data_devs = sp2d->data_devs; + unsigned group_width = data_devs + sp2d->parity; + int p, c; + + if (!sp2d->needed) + return; + + for (c = data_devs - 1; c >= 0; --c) + for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->page_is_read[c]) { + struct page *page = _1ps->pages[c]; + + r4w->put_page(priv, page); + _1ps->page_is_read[c] = false; + } + } + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + memset(_1ps->pages, 0, group_width * sizeof(*_1ps->pages)); + _1ps->write_count = 0; + _1ps->tx = NULL; + } + + sp2d->needed = false; +} + +static void _sp2d_free(struct __stripe_pages_2d *sp2d) +{ + unsigned i; + + if (!sp2d) + return; + + for (i = 0; i < sp2d->pages_in_unit; ++i) { + if (sp2d->_1p_stripes[i].alloc) + kfree(sp2d->_1p_stripes[i].pages); + } + + kfree(sp2d); +} + +static unsigned _sp2d_min_pg(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count) + return p; + } + + return ~0; +} + +static unsigned _sp2d_max_pg(struct __stripe_pages_2d *sp2d) +{ + int p; + + for (p = sp2d->pages_in_unit - 1; p >= 0; --p) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (_1ps->write_count) + return p; + } + + return ~0; +} + +static void _gen_xor_unit(struct __stripe_pages_2d *sp2d) +{ + unsigned p; + unsigned tx_flags = ASYNC_TX_ACK; + + if (sp2d->parity == 1) + tx_flags |= ASYNC_TX_XOR_ZERO_DST; + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if (!_1ps->write_count) + continue; + + init_async_submit(&_1ps->submit, tx_flags, + NULL, NULL, NULL, (addr_conv_t *)_1ps->scribble); + + if (sp2d->parity == 1) + _1ps->tx = async_xor(_1ps->pages[sp2d->data_devs], + _1ps->pages, 0, sp2d->data_devs, + PAGE_SIZE, &_1ps->submit); + else /* parity == 2 */ + _1ps->tx = async_gen_syndrome(_1ps->pages, 0, + sp2d->data_devs + sp2d->parity, + PAGE_SIZE, &_1ps->submit); + } + + for (p = 0; p < sp2d->pages_in_unit; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + /* NOTE: We wait for HW synchronously (I don't have such HW + * to test with.) Is parallelism needed with today's multi + * cores? + */ + async_tx_issue_pending(_1ps->tx); + } +} + +void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page) +{ + struct __1_page_stripe *_1ps; + + sp2d->needed = true; + + _1ps = &sp2d->_1p_stripes[si->cur_pg]; + _1ps->pages[si->cur_comp] = page; + ++_1ps->write_count; + + si->cur_pg = (si->cur_pg + 1) % sp2d->pages_in_unit; + /* si->cur_comp is advanced outside at main loop */ +} + +void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, + bool not_last) +{ + struct osd_sg_entry *sge; + + ORE_DBGMSG("dev=%d cur_len=0x%x not_last=%d cur_sg=%d " + "offset=0x%llx length=0x%x last_sgs_total=0x%x\n", + per_dev->dev, cur_len, not_last, per_dev->cur_sg, + _LLU(per_dev->offset), per_dev->length, + per_dev->last_sgs_total); + + if (!per_dev->cur_sg) { + sge = per_dev->sglist; + + /* First time we prepare two entries */ + if (per_dev->length) { + ++per_dev->cur_sg; + sge->offset = per_dev->offset; + sge->len = per_dev->length; + } else { + /* Here the parity is the first unit of this object. + * This happens every time we reach a parity device on + * the same stripe as the per_dev->offset. We need to + * just skip this unit. + */ + per_dev->offset += cur_len; + return; + } + } else { + /* finalize the last one */ + sge = &per_dev->sglist[per_dev->cur_sg - 1]; + sge->len = per_dev->length - per_dev->last_sgs_total; + } + + if (not_last) { + /* Partly prepare the next one */ + struct osd_sg_entry *next_sge = sge + 1; + + ++per_dev->cur_sg; + next_sge->offset = sge->offset + sge->len + cur_len; + /* Save cur len so we know how mutch was added next time */ + per_dev->last_sgs_total = per_dev->length; + next_sge->len = 0; + } else if (!sge->len) { + /* Optimize for when the last unit is a parity */ + --per_dev->cur_sg; + } +} + +static int _alloc_read_4_write(struct ore_io_state *ios) +{ + struct ore_layout *layout = ios->layout; + int ret; + /* We want to only read those pages not in cache so worst case + * is a stripe populated with every other page + */ + unsigned sgs_per_dev = ios->sp2d->pages_in_unit + 2; + + ret = _ore_get_io_state(layout, ios->oc, + layout->group_width * layout->mirrors_p1, + sgs_per_dev, 0, &ios->ios_read_4_write); + return ret; +} + +/* @si contains info of the to-be-inserted page. Update of @si should be + * maintained by caller. Specificaly si->dev, si->obj_offset, ... + */ +static int _add_to_r4w(struct ore_io_state *ios, struct ore_striping_info *si, + struct page *page, unsigned pg_len) +{ + struct request_queue *q; + struct ore_per_dev_state *per_dev; + struct ore_io_state *read_ios; + unsigned first_dev = si->dev - (si->dev % + (ios->layout->group_width * ios->layout->mirrors_p1)); + unsigned comp = si->dev - first_dev; + unsigned added_len; + + if (!ios->ios_read_4_write) { + int ret = _alloc_read_4_write(ios); + + if (unlikely(ret)) + return ret; + } + + read_ios = ios->ios_read_4_write; + read_ios->numdevs = ios->layout->group_width * ios->layout->mirrors_p1; + + per_dev = &read_ios->per_dev[comp]; + if (!per_dev->length) { + per_dev->bio = bio_kmalloc(GFP_KERNEL, + ios->sp2d->pages_in_unit); + if (unlikely(!per_dev->bio)) { + ORE_DBGMSG("Failed to allocate BIO size=%u\n", + ios->sp2d->pages_in_unit); + return -ENOMEM; + } + per_dev->offset = si->obj_offset; + per_dev->dev = si->dev; + } else if (si->obj_offset != (per_dev->offset + per_dev->length)) { + u64 gap = si->obj_offset - (per_dev->offset + per_dev->length); + + _ore_add_sg_seg(per_dev, gap, true); + } + q = osd_request_queue(ore_comp_dev(read_ios->oc, per_dev->dev)); + added_len = bio_add_pc_page(q, per_dev->bio, page, pg_len, + si->obj_offset % PAGE_SIZE); + if (unlikely(added_len != pg_len)) { + ORE_DBGMSG("Failed to bio_add_pc_page bi_vcnt=%d\n", + per_dev->bio->bi_vcnt); + return -ENOMEM; + } + + per_dev->length += pg_len; + return 0; +} + +/* read the beginning of an unaligned first page */ +static int _add_to_r4w_first_page(struct ore_io_state *ios, struct page *page) +{ + struct ore_striping_info si; + unsigned pg_len; + + ore_calc_stripe_info(ios->layout, ios->offset, 0, &si); + + pg_len = si.obj_offset % PAGE_SIZE; + si.obj_offset -= pg_len; + + ORE_DBGMSG("offset=0x%llx len=0x%x index=0x%lx dev=%x\n", + _LLU(si.obj_offset), pg_len, page->index, si.dev); + + return _add_to_r4w(ios, &si, page, pg_len); +} + +/* read the end of an incomplete last page */ +static int _add_to_r4w_last_page(struct ore_io_state *ios, u64 *offset) +{ + struct ore_striping_info si; + struct page *page; + unsigned pg_len, p, c; + + ore_calc_stripe_info(ios->layout, *offset, 0, &si); + + p = si.cur_pg; + c = si.cur_comp; + page = ios->sp2d->_1p_stripes[p].pages[c]; + + pg_len = PAGE_SIZE - (si.unit_off % PAGE_SIZE); + *offset += pg_len; + + ORE_DBGMSG("p=%d, c=%d next-offset=0x%llx len=0x%x dev=%x par_dev=%d\n", + p, c, _LLU(*offset), pg_len, si.dev, si.par_dev); + + BUG_ON(!page); + + return _add_to_r4w(ios, &si, page, pg_len); +} + +static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) +{ + struct bio_vec *bv; + unsigned i, d; + + /* loop on all devices all pages */ + for (d = 0; d < ios->numdevs; d++) { + struct bio *bio = ios->per_dev[d].bio; + + if (!bio) + continue; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + + SetPageUptodate(page); + if (PageError(page)) + ClearPageError(page); + } + } +} + +/* read_4_write is hacked to read the start of the first stripe and/or + * the end of the last stripe. If needed, with an sg-gap at each device/page. + * It is assumed to be called after the to_be_written pages of the first stripe + * are populating ios->sp2d[][] + * + * NOTE: We call ios->r4w->lock_fn for all pages needed for parity calculations + * These pages are held at sp2d[p].pages[c] but with + * sp2d[p].page_is_read[c] = true. At _sp2d_reset these pages are + * ios->r4w->lock_fn(). The ios->r4w->lock_fn might signal that the page is + * @uptodate=true, so we don't need to read it, only unlock, after IO. + * + * TODO: The read_4_write should calc a need_to_read_pages_count, if bigger then + * to-be-written count, we should consider the xor-in-place mode. + * need_to_read_pages_count is the actual number of pages not present in cache. + * maybe "devs_in_group - ios->sp2d[p].write_count" is a good enough + * approximation? In this mode the read pages are put in the empty places of + * ios->sp2d[p][*], xor is calculated the same way. These pages are + * allocated/freed and don't go through cache + */ +static int _read_4_write_first_stripe(struct ore_io_state *ios) +{ + struct ore_striping_info read_si; + struct __stripe_pages_2d *sp2d = ios->sp2d; + u64 offset = ios->si.first_stripe_start; + unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; + + if (offset == ios->offset) /* Go to start collect $200 */ + goto read_last_stripe; + + min_p = _sp2d_min_pg(sp2d); + max_p = _sp2d_max_pg(sp2d); + + ORE_DBGMSG("stripe_start=0x%llx ios->offset=0x%llx min_p=%d max_p=%d\n", + offset, ios->offset, min_p, max_p); + + for (c = 0; ; c++) { + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + read_si.obj_offset += min_p * PAGE_SIZE; + offset += min_p * PAGE_SIZE; + for (p = min_p; p <= max_p; p++) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + struct page **pp = &_1ps->pages[c]; + bool uptodate; + + if (*pp) { + if (ios->offset % PAGE_SIZE) + /* Read the remainder of the page */ + _add_to_r4w_first_page(ios, *pp); + /* to-be-written pages start here */ + goto read_last_stripe; + } + + *pp = ios->r4w->get_page(ios->private, offset, + &uptodate); + if (unlikely(!*pp)) + return -ENOMEM; + + if (!uptodate) + _add_to_r4w(ios, &read_si, *pp, PAGE_SIZE); + + /* Mark read-pages to be cache_released */ + _1ps->page_is_read[c] = true; + read_si.obj_offset += PAGE_SIZE; + offset += PAGE_SIZE; + } + offset += (sp2d->pages_in_unit - p) * PAGE_SIZE; + } + +read_last_stripe: + return 0; +} + +static int _read_4_write_last_stripe(struct ore_io_state *ios) +{ + struct ore_striping_info read_si; + struct __stripe_pages_2d *sp2d = ios->sp2d; + u64 offset; + u64 last_stripe_end; + unsigned bytes_in_stripe = ios->si.bytes_in_stripe; + unsigned c, p, min_p = sp2d->pages_in_unit, max_p = -1; + + offset = ios->offset + ios->length; + if (offset % PAGE_SIZE) + _add_to_r4w_last_page(ios, &offset); + /* offset will be aligned to next page */ + + last_stripe_end = div_u64(offset + bytes_in_stripe - 1, bytes_in_stripe) + * bytes_in_stripe; + if (offset == last_stripe_end) /* Optimize for the aligned case */ + goto read_it; + + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + p = read_si.cur_pg; + c = read_si.cur_comp; + + if (min_p == sp2d->pages_in_unit) { + /* Didn't do it yet */ + min_p = _sp2d_min_pg(sp2d); + max_p = _sp2d_max_pg(sp2d); + } + + ORE_DBGMSG("offset=0x%llx stripe_end=0x%llx min_p=%d max_p=%d\n", + offset, last_stripe_end, min_p, max_p); + + while (offset < last_stripe_end) { + struct __1_page_stripe *_1ps = &sp2d->_1p_stripes[p]; + + if ((min_p <= p) && (p <= max_p)) { + struct page *page; + bool uptodate; + + BUG_ON(_1ps->pages[c]); + page = ios->r4w->get_page(ios->private, offset, + &uptodate); + if (unlikely(!page)) + return -ENOMEM; + + _1ps->pages[c] = page; + /* Mark read-pages to be cache_released */ + _1ps->page_is_read[c] = true; + if (!uptodate) + _add_to_r4w(ios, &read_si, page, PAGE_SIZE); + } + + offset += PAGE_SIZE; + if (p == (sp2d->pages_in_unit - 1)) { + ++c; + p = 0; + ore_calc_stripe_info(ios->layout, offset, 0, &read_si); + } else { + read_si.obj_offset += PAGE_SIZE; + ++p; + } + } + +read_it: + return 0; +} + +static int _read_4_write_execute(struct ore_io_state *ios) +{ + struct ore_io_state *ios_read; + unsigned i; + int ret; + + ios_read = ios->ios_read_4_write; + if (!ios_read) + return 0; + + /* FIXME: Ugly to signal _sbi_read_mirror that we have bio(s). Change + * to check for per_dev->bio + */ + ios_read->pages = ios->pages; + + /* Now read these devices */ + for (i = 0; i < ios_read->numdevs; i += ios_read->layout->mirrors_p1) { + ret = _ore_read_mirror(ios_read, i); + if (unlikely(ret)) + return ret; + } + + ret = ore_io_execute(ios_read); /* Synchronus execution */ + if (unlikely(ret)) { + ORE_DBGMSG("!! ore_io_execute => %d\n", ret); + return ret; + } + + _mark_read4write_pages_uptodate(ios_read, ret); + ore_put_io_state(ios_read); + ios->ios_read_4_write = NULL; /* Might need a reuse at last stripe */ + return 0; +} + +/* In writes @cur_len means length left. .i.e cur_len==0 is the last parity U */ +int _ore_add_parity_unit(struct ore_io_state *ios, + struct ore_striping_info *si, + struct ore_per_dev_state *per_dev, + unsigned cur_len, bool do_xor) +{ + if (ios->reading) { + if (per_dev->cur_sg >= ios->sgs_per_dev) { + ORE_DBGMSG("cur_sg(%d) >= sgs_per_dev(%d)\n" , + per_dev->cur_sg, ios->sgs_per_dev); + return -ENOMEM; + } + _ore_add_sg_seg(per_dev, cur_len, true); + } else { + struct __stripe_pages_2d *sp2d = ios->sp2d; + struct page **pages = ios->parity_pages + ios->cur_par_page; + unsigned num_pages; + unsigned array_start = 0; + unsigned i; + int ret; + + si->cur_pg = _sp2d_min_pg(sp2d); + num_pages = _sp2d_max_pg(sp2d) + 1 - si->cur_pg; + + if (!per_dev->length) { + per_dev->offset += si->cur_pg * PAGE_SIZE; + /* If first stripe, Read in all read4write pages + * (if needed) before we calculate the first parity. + */ + if (do_xor) + _read_4_write_first_stripe(ios); + } + if (!cur_len && do_xor) + /* If last stripe r4w pages of last stripe */ + _read_4_write_last_stripe(ios); + _read_4_write_execute(ios); + + for (i = 0; i < num_pages; i++) { + pages[i] = _raid_page_alloc(); + if (unlikely(!pages[i])) + return -ENOMEM; + + ++(ios->cur_par_page); + } + + BUG_ON(si->cur_comp < sp2d->data_devs); + BUG_ON(si->cur_pg + num_pages > sp2d->pages_in_unit); + + ret = _ore_add_stripe_unit(ios, &array_start, 0, pages, + per_dev, num_pages * PAGE_SIZE); + if (unlikely(ret)) + return ret; + + if (do_xor) { + _gen_xor_unit(sp2d); + _sp2d_reset(sp2d, ios->r4w, ios->private); + } + } + return 0; +} + +int _ore_post_alloc_raid_stuff(struct ore_io_state *ios) +{ + if (ios->parity_pages) { + struct ore_layout *layout = ios->layout; + unsigned pages_in_unit = layout->stripe_unit / PAGE_SIZE; + + if (_sp2d_alloc(pages_in_unit, layout->group_width, + layout->parity, &ios->sp2d)) { + return -ENOMEM; + } + } + return 0; +} + +void _ore_free_raid_stuff(struct ore_io_state *ios) +{ + if (ios->sp2d) { /* writing and raid */ + unsigned i; + + for (i = 0; i < ios->cur_par_page; i++) { + struct page *page = ios->parity_pages[i]; + + if (page) + _raid_page_free(page); + } + if (ios->extra_part_alloc) + kfree(ios->parity_pages); + /* If IO returned an error pages might need unlocking */ + _sp2d_reset(ios->sp2d, ios->r4w, ios->private); + _sp2d_free(ios->sp2d); + } else { + /* Will only be set if raid reading && sglist is big */ + if (ios->extra_part_alloc) + kfree(ios->per_dev[0].sglist); + } + if (ios->ios_read_4_write) + ore_put_io_state(ios->ios_read_4_write); +} diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h new file mode 100644 index 00000000000..cf6375d8212 --- /dev/null +++ b/fs/exofs/ore_raid.h @@ -0,0 +1,62 @@ +/* + * Copyright (C) from 2011 + * Boaz Harrosh <bharrosh@panasas.com> + * + * This file is part of the objects raid engine (ore). + * + * It is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with "ore". If not, write to the Free Software Foundation, Inc: + * "Free Software Foundation <info@fsf.org>" + */ + +#include <scsi/osd_ore.h> + +#define ORE_ERR(fmt, a...) printk(KERN_ERR "ore: " fmt, ##a) + +#ifdef CONFIG_EXOFS_DEBUG +#define ORE_DBGMSG(fmt, a...) \ + printk(KERN_NOTICE "ore @%s:%d: " fmt, __func__, __LINE__, ##a) +#else +#define ORE_DBGMSG(fmt, a...) \ + do { if (0) printk(fmt, ##a); } while (0) +#endif + +/* u64 has problems with printk this will cast it to unsigned long long */ +#define _LLU(x) (unsigned long long)(x) + +#define ORE_DBGMSG2(M...) do {} while (0) +/* #define ORE_DBGMSG2 ORE_DBGMSG */ + +/* ios_raid.c stuff needed by ios.c */ +int _ore_post_alloc_raid_stuff(struct ore_io_state *ios); +void _ore_free_raid_stuff(struct ore_io_state *ios); + +void _ore_add_sg_seg(struct ore_per_dev_state *per_dev, unsigned cur_len, + bool not_last); +int _ore_add_parity_unit(struct ore_io_state *ios, struct ore_striping_info *si, + struct ore_per_dev_state *per_dev, unsigned cur_len, + bool do_xor); +void _ore_add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page); +static inline void _add_stripe_page(struct __stripe_pages_2d *sp2d, + struct ore_striping_info *si, struct page *page) +{ + if (!sp2d) /* Inline the fast path */ + return; /* Hay no raid stuff */ + _ore_add_stripe_page(sp2d, si, page); +} + +/* ios.c stuff needed by ios_raid.c */ +int _ore_get_io_state(struct ore_layout *layout, + struct ore_components *oc, unsigned numdevs, + unsigned sgs_per_dev, unsigned num_par_pages, + struct ore_io_state **pios); +int _ore_add_stripe_unit(struct ore_io_state *ios, unsigned *cur_pg, + unsigned pgbase, struct page **pages, + struct ore_per_dev_state *per_dev, int cur_len); +int _ore_read_mirror(struct ore_io_state *ios, unsigned cur_comp); +int ore_io_execute(struct ore_io_state *ios); diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h deleted file mode 100644 index c52e9888b8a..00000000000 --- a/fs/exofs/pnfs.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify it under the - * terms of the GNU General Public License version 2 as published by the Free - * Software Foundation. - * - */ - -/* FIXME: Remove this file once pnfs hits mainline */ - -#ifndef __EXOFS_PNFS_H__ -#define __EXOFS_PNFS_H__ - -#if ! defined(__PNFS_OSD_XDR_H__) - -enum pnfs_iomode { - IOMODE_READ = 1, - IOMODE_RW = 2, - IOMODE_ANY = 3, -}; - -/* Layout Structure */ -enum pnfs_osd_raid_algorithm4 { - PNFS_OSD_RAID_0 = 1, - PNFS_OSD_RAID_4 = 2, - PNFS_OSD_RAID_5 = 3, - PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */ -}; - -struct pnfs_osd_data_map { - u32 odm_num_comps; - u64 odm_stripe_unit; - u32 odm_group_width; - u32 odm_group_depth; - u32 odm_mirror_cnt; - u32 odm_raid_algorithm; -}; - -#endif /* ! defined(__PNFS_OSD_XDR_H__) */ - -#endif /* __EXOFS_PNFS_H__ */ diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 8c6c4669b38..ed73ed8ebbe 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -35,11 +35,14 @@ #include <linux/parser.h> #include <linux/vfs.h> #include <linux/random.h> +#include <linux/module.h> #include <linux/exportfs.h> #include <linux/slab.h> #include "exofs.h" +#define EXOFS_DBGMSG2(M...) do {} while (0) + /****************************************************************************** * MOUNT OPTIONS *****************************************************************************/ @@ -48,6 +51,7 @@ * struct to hold what we get from mount options */ struct exofs_mountopt { + bool is_osdname; const char *dev_name; uint64_t pid; int timeout; @@ -56,7 +60,7 @@ struct exofs_mountopt { /* * exofs-specific mount-time options. */ -enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err }; +enum { Opt_name, Opt_pid, Opt_to, Opt_err }; /* * Our mount-time options. These should ideally be 64-bit unsigned, but the @@ -64,6 +68,7 @@ enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err }; * sufficient for most applications now. */ static match_table_t tokens = { + {Opt_name, "osdname=%s"}, {Opt_pid, "pid=%u"}, {Opt_to, "to=%u"}, {Opt_err, NULL} @@ -94,6 +99,14 @@ static int parse_options(char *options, struct exofs_mountopt *opts) token = match_token(p, tokens, args); switch (token) { + case Opt_name: + opts->dev_name = match_strdup(&args[0]); + if (unlikely(!opts->dev_name)) { + EXOFS_ERR("Error allocating dev_name"); + return -ENOMEM; + } + opts->is_osdname = true; + break; case Opt_pid: if (0 == match_strlcpy(str, &args[0], sizeof(str))) return -EINVAL; @@ -153,7 +166,6 @@ static struct inode *exofs_alloc_inode(struct super_block *sb) static void exofs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - INIT_LIST_HEAD(&inode->i_dentry); kmem_cache_free(exofs_inode_cachep, exofs_i(inode)); } @@ -194,10 +206,152 @@ static int init_inodecache(void) */ static void destroy_inodecache(void) { + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); kmem_cache_destroy(exofs_inode_cachep); } /****************************************************************************** + * Some osd helpers + *****************************************************************************/ +void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj) +{ + osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); +} + +static int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj, + u64 offset, void *p, unsigned length) +{ + struct osd_request *or = osd_start_request(od, GFP_KERNEL); +/* struct osd_sense_info osi = {.key = 0};*/ + int ret; + + if (unlikely(!or)) { + EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__); + return -ENOMEM; + } + ret = osd_req_read_kern(or, obj, offset, p, length); + if (unlikely(ret)) { + EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__); + goto out; + } + + ret = osd_finalize_request(or, 0, cred, NULL); + if (unlikely(ret)) { + EXOFS_DBGMSG("Failed to osd_finalize_request() => %d\n", ret); + goto out; + } + + ret = osd_execute_request(or); + if (unlikely(ret)) + EXOFS_DBGMSG("osd_execute_request() => %d\n", ret); + /* osd_req_decode_sense(or, ret); */ + +out: + osd_end_request(or); + EXOFS_DBGMSG2("read_kern(0x%llx) offset=0x%llx " + "length=0x%llx dev=%p ret=>%d\n", + _LLU(obj->id), _LLU(offset), _LLU(length), od, ret); + return ret; +} + +static const struct osd_attr g_attr_sb_stats = ATTR_DEF( + EXOFS_APAGE_SB_DATA, + EXOFS_ATTR_SB_STATS, + sizeof(struct exofs_sb_stats)); + +static int __sbi_read_stats(struct exofs_sb_info *sbi) +{ + struct osd_attr attrs[] = { + [0] = g_attr_sb_stats, + }; + struct ore_io_state *ios; + int ret; + + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); + return ret; + } + + ios->in_attr = attrs; + ios->in_attr_len = ARRAY_SIZE(attrs); + + ret = ore_read(ios); + if (unlikely(ret)) { + EXOFS_ERR("Error reading super_block stats => %d\n", ret); + goto out; + } + + ret = extract_attr_from_ios(ios, &attrs[0]); + if (ret) { + EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__); + goto out; + } + if (attrs[0].len) { + struct exofs_sb_stats *ess; + + if (unlikely(attrs[0].len != sizeof(*ess))) { + EXOFS_ERR("%s: Wrong version of exofs_sb_stats " + "size(%d) != expected(%zd)\n", + __func__, attrs[0].len, sizeof(*ess)); + goto out; + } + + ess = attrs[0].val_ptr; + sbi->s_nextid = le64_to_cpu(ess->s_nextid); + sbi->s_numfiles = le32_to_cpu(ess->s_numfiles); + } + +out: + ore_put_io_state(ios); + return ret; +} + +static void stats_done(struct ore_io_state *ios, void *p) +{ + ore_put_io_state(ios); + /* Good thanks nothing to do anymore */ +} + +/* Asynchronously write the stats attribute */ +int exofs_sbi_write_stats(struct exofs_sb_info *sbi) +{ + struct osd_attr attrs[] = { + [0] = g_attr_sb_stats, + }; + struct ore_io_state *ios; + int ret; + + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: ore_get_io_state failed.\n", __func__); + return ret; + } + + sbi->s_ess.s_nextid = cpu_to_le64(sbi->s_nextid); + sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles); + attrs[0].val_ptr = &sbi->s_ess; + + + ios->done = stats_done; + ios->private = sbi; + ios->out_attr = attrs; + ios->out_attr_len = ARRAY_SIZE(attrs); + + ret = ore_write(ios); + if (unlikely(ret)) { + EXOFS_ERR("%s: ore_write failed.\n", __func__); + ore_put_io_state(ios); + } + + return ret; +} + +/****************************************************************************** * SUPERBLOCK FUNCTIONS *****************************************************************************/ static const struct super_operations exofs_sops; @@ -206,60 +360,57 @@ static const struct export_operations exofs_export_ops; /* * Write the superblock to the OSD */ -int exofs_sync_fs(struct super_block *sb, int wait) +static int exofs_sync_fs(struct super_block *sb, int wait) { struct exofs_sb_info *sbi; struct exofs_fscb *fscb; - struct exofs_io_state *ios; + struct ore_comp one_comp; + struct ore_components oc; + struct ore_io_state *ios; int ret = -ENOMEM; - lock_super(sb); + fscb = kmalloc(sizeof(*fscb), GFP_KERNEL); + if (unlikely(!fscb)) + return -ENOMEM; + sbi = sb->s_fs_info; - fscb = &sbi->s_fscb; - ret = exofs_get_io_state(&sbi->layout, &ios); - if (ret) + /* NOTE: We no longer dirty the super_block anywhere in exofs. The + * reason we write the fscb here on unmount is so we can stay backwards + * compatible with fscb->s_version == 1. (What we are not compatible + * with is if a new version FS crashed and then we try to mount an old + * version). Otherwise the exofs_fscb is read-only from mkfs time. All + * the writeable info is set in exofs_sbi_write_stats() above. + */ + + exofs_init_comps(&oc, &one_comp, sbi, EXOFS_SUPER_ID); + + ret = ore_get_io_state(&sbi->layout, &oc, &ios); + if (unlikely(ret)) goto out; - /* Note: We only write the changing part of the fscb. .i.e upto the - * the fscb->s_dev_table_oid member. There is no read-modify-write - * here. - */ ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); memset(fscb, 0, ios->length); fscb->s_nextid = cpu_to_le64(sbi->s_nextid); - fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles); + fscb->s_numfiles = cpu_to_le64(sbi->s_numfiles); fscb->s_magic = cpu_to_le16(sb->s_magic); fscb->s_newfs = 0; fscb->s_version = EXOFS_FSCB_VER; - ios->obj.id = EXOFS_SUPER_ID; ios->offset = 0; ios->kern_buff = fscb; - ios->cred = sbi->s_cred; - ret = exofs_sbi_write(ios); - if (unlikely(ret)) { - EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); - goto out; - } - sb->s_dirt = 0; + ret = ore_write(ios); + if (unlikely(ret)) + EXOFS_ERR("%s: ore_write failed.\n", __func__); out: EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); - exofs_put_io_state(ios); - unlock_super(sb); + ore_put_io_state(ios); + kfree(fscb); return ret; } -static void exofs_write_super(struct super_block *sb) -{ - if (!(sb->s_flags & MS_RDONLY)) - exofs_sync_fs(sb, 1); - else - sb->s_dirt = 0; -} - static void _exofs_print_device(const char *msg, const char *dev_path, struct osd_dev *od, u64 pid) { @@ -269,17 +420,20 @@ static void _exofs_print_device(const char *msg, const char *dev_path, msg, dev_path ?: "", odi->osdname, _LLU(pid)); } -void exofs_free_sbi(struct exofs_sb_info *sbi) +static void exofs_free_sbi(struct exofs_sb_info *sbi) { - while (sbi->layout.s_numdevs) { - int i = --sbi->layout.s_numdevs; - struct osd_dev *od = sbi->layout.s_ods[i]; + unsigned numdevs = sbi->oc.numdevs; + + while (numdevs) { + unsigned i = --numdevs; + struct osd_dev *od = ore_comp_dev(&sbi->oc, i); if (od) { - sbi->layout.s_ods[i] = NULL; + ore_comp_set_dev(&sbi->oc, i, NULL); osduld_put_device(od); } } + kfree(sbi->oc.ods); kfree(sbi); } @@ -292,22 +446,24 @@ static void exofs_put_super(struct super_block *sb) int num_pend; struct exofs_sb_info *sbi = sb->s_fs_info; - if (sb->s_dirt) - exofs_write_super(sb); - /* make sure there are no pending commands */ for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; num_pend = atomic_read(&sbi->s_curr_pending)) { wait_queue_head_t wq; + + printk(KERN_NOTICE "%s: !!Pending operations in flight. " + "This is a BUG. please report to osd-dev@open-osd.org\n", + __func__); init_waitqueue_head(&wq); wait_event_timeout(wq, (atomic_read(&sbi->s_curr_pending) == 0), msecs_to_jiffies(100)); } - _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0], - sbi->layout.s_pid); + _exofs_print_device("Unmounting", NULL, ore_comp_dev(&sbi->oc, 0), + sbi->one_comp.obj.partition); + exofs_sysfs_sb_del(sbi); bdi_destroy(&sbi->bdi); exofs_free_sbi(sbi); sb->s_fs_info = NULL; @@ -316,78 +472,48 @@ static void exofs_put_super(struct super_block *sb) static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs, struct exofs_device_table *dt) { - u64 stripe_length; + int ret; - sbi->data_map.odm_num_comps = - le32_to_cpu(dt->dt_data_map.cb_num_comps); - sbi->data_map.odm_stripe_unit = + sbi->layout.stripe_unit = le64_to_cpu(dt->dt_data_map.cb_stripe_unit); - sbi->data_map.odm_group_width = + sbi->layout.group_width = le32_to_cpu(dt->dt_data_map.cb_group_width); - sbi->data_map.odm_group_depth = + sbi->layout.group_depth = le32_to_cpu(dt->dt_data_map.cb_group_depth); - sbi->data_map.odm_mirror_cnt = - le32_to_cpu(dt->dt_data_map.cb_mirror_cnt); - sbi->data_map.odm_raid_algorithm = + sbi->layout.mirrors_p1 = + le32_to_cpu(dt->dt_data_map.cb_mirror_cnt) + 1; + sbi->layout.raid_algorithm = le32_to_cpu(dt->dt_data_map.cb_raid_algorithm); -/* FIXME: Only raid0 for now. if not so, do not mount */ - if (sbi->data_map.odm_num_comps != numdevs) { - EXOFS_ERR("odm_num_comps(%u) != numdevs(%u)\n", - sbi->data_map.odm_num_comps, numdevs); - return -EINVAL; - } - if (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) { - EXOFS_ERR("Only RAID_0 for now\n"); - return -EINVAL; - } - if (0 != (numdevs % (sbi->data_map.odm_mirror_cnt + 1))) { - EXOFS_ERR("Data Map wrong, numdevs=%d mirrors=%d\n", - numdevs, sbi->data_map.odm_mirror_cnt); - return -EINVAL; - } - - if (0 != (sbi->data_map.odm_stripe_unit & ~PAGE_MASK)) { - EXOFS_ERR("Stripe Unit(0x%llx)" - " must be Multples of PAGE_SIZE(0x%lx)\n", - _LLU(sbi->data_map.odm_stripe_unit), PAGE_SIZE); - return -EINVAL; - } + ret = ore_verify_layout(numdevs, &sbi->layout); + + EXOFS_DBGMSG("exofs: layout: " + "num_comps=%u stripe_unit=0x%x group_width=%u " + "group_depth=0x%llx mirrors_p1=%u raid_algorithm=%u\n", + numdevs, + sbi->layout.stripe_unit, + sbi->layout.group_width, + _LLU(sbi->layout.group_depth), + sbi->layout.mirrors_p1, + sbi->layout.raid_algorithm); + return ret; +} - sbi->layout.stripe_unit = sbi->data_map.odm_stripe_unit; - sbi->layout.mirrors_p1 = sbi->data_map.odm_mirror_cnt + 1; +static unsigned __ra_pages(struct ore_layout *layout) +{ + const unsigned _MIN_RA = 32; /* min 128K read-ahead */ + unsigned ra_pages = layout->group_width * layout->stripe_unit / + PAGE_SIZE; + unsigned max_io_pages = exofs_max_io_pages(layout, ~0); - if (sbi->data_map.odm_group_width) { - sbi->layout.group_width = sbi->data_map.odm_group_width; - sbi->layout.group_depth = sbi->data_map.odm_group_depth; - if (!sbi->layout.group_depth) { - EXOFS_ERR("group_depth == 0 && group_width != 0\n"); - return -EINVAL; - } - sbi->layout.group_count = sbi->data_map.odm_num_comps / - sbi->layout.mirrors_p1 / - sbi->data_map.odm_group_width; - } else { - if (sbi->data_map.odm_group_depth) { - printk(KERN_NOTICE "Warning: group_depth ignored " - "group_width == 0 && group_depth == %d\n", - sbi->data_map.odm_group_depth); - sbi->data_map.odm_group_depth = 0; - } - sbi->layout.group_width = sbi->data_map.odm_num_comps / - sbi->layout.mirrors_p1; - sbi->layout.group_depth = -1; - sbi->layout.group_count = 1; - } + ra_pages *= 2; /* two stripes */ + if (ra_pages < _MIN_RA) + ra_pages = roundup(_MIN_RA, ra_pages / 2); - stripe_length = (u64)sbi->layout.group_width * sbi->layout.stripe_unit; - if (stripe_length >= (1ULL << 32)) { - EXOFS_ERR("Total Stripe length(0x%llx)" - " >= 32bit is not supported\n", _LLU(stripe_length)); - return -EINVAL; - } + if (ra_pages > max_io_pages) + ra_pages = max_io_pages; - return 0; + return ra_pages; } /* @odi is valid only as long as @fscb_dev is valid */ @@ -395,7 +521,8 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, struct osd_dev_info *odi) { odi->systemid_len = le32_to_cpu(dt_dev->systemid_len); - memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len); + if (likely(odi->systemid_len)) + memcpy(odi->systemid, dt_dev->systemid, OSD_SYSTEMID_LEN); odi->osdname_len = le32_to_cpu(dt_dev->osdname_len); odi->osdname = dt_dev->osdname; @@ -416,14 +543,40 @@ static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, return !(odi->systemid_len || odi->osdname_len); } -static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, +static int __alloc_dev_table(struct exofs_sb_info *sbi, unsigned numdevs, + struct exofs_dev **peds) +{ + struct __alloc_ore_devs_and_exofs_devs { + /* Twice bigger table: See exofs_init_comps() and comment at + * exofs_read_lookup_dev_table() + */ + struct ore_dev *oreds[numdevs * 2 - 1]; + struct exofs_dev eds[numdevs]; + } *aoded; + struct exofs_dev *eds; + unsigned i; + + aoded = kzalloc(sizeof(*aoded), GFP_KERNEL); + if (unlikely(!aoded)) { + EXOFS_ERR("ERROR: failed allocating Device array[%d]\n", + numdevs); + return -ENOMEM; + } + + sbi->oc.ods = aoded->oreds; + *peds = eds = aoded->eds; + for (i = 0; i < numdevs; ++i) + aoded->oreds[i] = &eds[i].ored; + return 0; +} + +static int exofs_read_lookup_dev_table(struct exofs_sb_info *sbi, + struct osd_dev *fscb_od, unsigned table_count) { - struct exofs_sb_info *sbi = *psbi; - struct osd_dev *fscb_od; - struct osd_obj_id obj = {.partition = sbi->layout.s_pid, - .id = EXOFS_DEVTABLE_ID}; + struct ore_comp comp; struct exofs_device_table *dt; + struct exofs_dev *eds; unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) + sizeof(*dt); unsigned numdevs, i; @@ -436,10 +589,14 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, return -ENOMEM; } - fscb_od = sbi->layout.s_ods[0]; - sbi->layout.s_ods[0] = NULL; - sbi->layout.s_numdevs = 0; - ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes); + sbi->oc.numdevs = 0; + + comp.obj.partition = sbi->one_comp.obj.partition; + comp.obj.id = EXOFS_DEVTABLE_ID; + exofs_make_credential(comp.cred, &comp.obj); + + ret = exofs_read_kern(fscb_od, comp.cred, &comp.obj, 0, dt, + table_bytes); if (unlikely(ret)) { EXOFS_ERR("ERROR: reading device table\n"); goto out; @@ -456,18 +613,22 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, if (unlikely(ret)) goto out; - if (likely(numdevs > 1)) { - unsigned size = numdevs * sizeof(sbi->layout.s_ods[0]); + ret = __alloc_dev_table(sbi, numdevs, &eds); + if (unlikely(ret)) + goto out; + /* exofs round-robins the device table view according to inode + * number. We hold a: twice bigger table hence inodes can point + * to any device and have a sequential view of the table + * starting at this device. See exofs_init_comps() + */ + memcpy(&sbi->oc.ods[numdevs], &sbi->oc.ods[0], + (numdevs - 1) * sizeof(sbi->oc.ods[0])); - sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL); - if (unlikely(!sbi)) { - ret = -ENOMEM; - goto out; - } - memset(&sbi->layout.s_ods[1], 0, - size - sizeof(sbi->layout.s_ods[0])); - *psbi = sbi; - } + /* create sysfs subdir under which we put the device table + * And cluster layout. A Superblock is identified by the string: + * "dev[0].osdname"_"pid" + */ + exofs_sysfs_sb_add(sbi, &dt->dt_dev_table[0]); for (i = 0; i < numdevs; i++) { struct exofs_fscb fscb; @@ -483,32 +644,36 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n", i, odi.osdname); + /* the exofs id is currently the table index */ + eds[i].did = i; + /* On all devices the device table is identical. The user can * specify any one of the participating devices on the command * line. We always keep them in device-table order. */ if (fscb_od && osduld_device_same(fscb_od, &odi)) { - sbi->layout.s_ods[i] = fscb_od; - ++sbi->layout.s_numdevs; + eds[i].ored.od = fscb_od; + ++sbi->oc.numdevs; fscb_od = NULL; + exofs_sysfs_odev_add(&eds[i], sbi); continue; } od = osduld_info_lookup(&odi); - if (unlikely(IS_ERR(od))) { + if (IS_ERR(od)) { ret = PTR_ERR(od); EXOFS_ERR("ERROR: device requested is not found " "osd_name-%s =>%d\n", odi.osdname, ret); goto out; } - sbi->layout.s_ods[i] = od; - ++sbi->layout.s_numdevs; + eds[i].ored.od = od; + ++sbi->oc.numdevs; /* Read the fscb of the other devices to make sure the FS * partition is there. */ - ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, + ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, sizeof(fscb)); if (unlikely(ret)) { EXOFS_ERR("ERROR: Malformed participating device " @@ -516,6 +681,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, odi.osdname); goto out; } + exofs_sysfs_odev_add(&eds[i], sbi); /* TODO: verify other information is correct and FS-uuid * matches. Benny what did you say about device table @@ -525,13 +691,11 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi, out: kfree(dt); - if (unlikely(!ret && fscb_od)) { - EXOFS_ERR( - "ERROR: Bad device-table container device not present\n"); - osduld_put_device(fscb_od); - ret = -EINVAL; + if (unlikely(fscb_od && !ret)) { + EXOFS_ERR("ERROR: Bad device-table container device not present\n"); + osduld_put_device(fscb_od); + return -EINVAL; } - return ret; } @@ -545,7 +709,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) struct exofs_sb_info *sbi; /*extended info */ struct osd_dev *od; /* Master device */ struct exofs_fscb fscb; /*on-disk superblock info */ - struct osd_obj_id obj; + struct ore_comp comp; unsigned table_count; int ret; @@ -553,14 +717,20 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) if (!sbi) return -ENOMEM; - ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); - if (ret) - goto free_bdi; - /* use mount options to fill superblock */ - od = osduld_path_lookup(opts->dev_name); + if (opts->is_osdname) { + struct osd_dev_info odi = {.systemid_len = 0}; + + odi.osdname_len = strlen(opts->dev_name); + odi.osdname = (u8 *)opts->dev_name; + od = osduld_info_lookup(&odi); + kfree(opts->dev_name); + opts->dev_name = NULL; + } else { + od = osduld_path_lookup(opts->dev_name); + } if (IS_ERR(od)) { - ret = PTR_ERR(od); + ret = -EINVAL; goto free_sbi; } @@ -570,30 +740,35 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) sbi->layout.group_width = 1; sbi->layout.group_depth = -1; sbi->layout.group_count = 1; - sbi->layout.s_ods[0] = od; - sbi->layout.s_numdevs = 1; - sbi->layout.s_pid = opts->pid; sbi->s_timeout = opts->timeout; + sbi->one_comp.obj.partition = opts->pid; + sbi->one_comp.obj.id = 0; + exofs_make_credential(sbi->one_comp.cred, &sbi->one_comp.obj); + sbi->oc.single_comp = EC_SINGLE_COMP; + sbi->oc.comps = &sbi->one_comp; + /* fill in some other data by hand */ memset(sb->s_id, 0, sizeof(sb->s_id)); strcpy(sb->s_id, "exofs"); sb->s_blocksize = EXOFS_BLKSIZE; sb->s_blocksize_bits = EXOFS_BLKSHIFT; sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_max_links = EXOFS_LINK_MAX; atomic_set(&sbi->s_curr_pending, 0); sb->s_bdev = NULL; sb->s_dev = 0; - obj.partition = sbi->layout.s_pid; - obj.id = EXOFS_SUPER_ID; - exofs_make_credential(sbi->s_cred, &obj); + comp.obj.partition = sbi->one_comp.obj.partition; + comp.obj.id = EXOFS_SUPER_ID; + exofs_make_credential(comp.cred, &comp.obj); - ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb)); + ret = exofs_read_kern(od, comp.cred, &comp.obj, 0, &fscb, sizeof(fscb)); if (unlikely(ret)) goto free_sbi; sb->s_magic = le16_to_cpu(fscb.s_magic); + /* NOTE: we read below to be backward compatible with old versions */ sbi->s_nextid = le64_to_cpu(fscb.s_nextid); sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles); @@ -604,7 +779,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) ret = -EINVAL; goto free_sbi; } - if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) { + if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) { EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n", EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version)); ret = -EINVAL; @@ -617,12 +792,24 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) table_count = le64_to_cpu(fscb.s_dev_table_count); if (table_count) { - ret = exofs_read_lookup_dev_table(&sbi, table_count); + ret = exofs_read_lookup_dev_table(sbi, od, table_count); + if (unlikely(ret)) + goto free_sbi; + } else { + struct exofs_dev *eds; + + ret = __alloc_dev_table(sbi, 1, &eds); if (unlikely(ret)) goto free_sbi; + + ore_comp_set_dev(&sbi->oc, 0, od); + sbi->oc.numdevs = 1; } + __sbi_read_stats(sbi); + /* set up operation vectors */ + sbi->bdi.ra_pages = __ra_pages(&sbi->layout); sb->s_bdi = &sbi->bdi; sb->s_fs_info = sbi; sb->s_op = &exofs_sops; @@ -633,9 +820,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) ret = PTR_ERR(root); goto free_sbi; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { - iput(root); EXOFS_ERR("ERROR: get root inode failed\n"); ret = -ENOMEM; goto free_sbi; @@ -650,15 +836,23 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0], - sbi->layout.s_pid); + ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); + if (ret) { + EXOFS_DBGMSG("Failed to bdi_setup_and_register\n"); + dput(sb->s_root); + sb->s_root = NULL; + goto free_sbi; + } + + exofs_sysfs_dbg_print(); + _exofs_print_device("Mounting", opts->dev_name, + ore_comp_dev(&sbi->oc, 0), + sbi->one_comp.obj.partition); return 0; free_sbi: - bdi_destroy(&sbi->bdi); -free_bdi: EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", - opts->dev_name, sbi->layout.s_pid, ret); + opts->dev_name, sbi->one_comp.obj.partition, ret); exofs_free_sbi(sbi); return ret; } @@ -677,7 +871,8 @@ static struct dentry *exofs_mount(struct file_system_type *type, if (ret) return ERR_PTR(ret); - opts.dev_name = dev_name; + if (!opts.dev_name) + opts.dev_name = dev_name; return mount_nodev(type, flags, &opts, exofs_fill_super); } @@ -689,7 +884,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct exofs_sb_info *sbi = sb->s_fs_info; - struct exofs_io_state *ios; + struct ore_io_state *ios; struct osd_attr attrs[] = { ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS, OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)), @@ -698,21 +893,18 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) }; uint64_t capacity = ULLONG_MAX; uint64_t used = ULLONG_MAX; - uint8_t cred_a[OSD_CAP_LEN]; int ret; - ret = exofs_get_io_state(&sbi->layout, &ios); + ret = ore_get_io_state(&sbi->layout, &sbi->oc, &ios); if (ret) { - EXOFS_DBGMSG("exofs_get_io_state failed.\n"); + EXOFS_DBGMSG("ore_get_io_state failed.\n"); return ret; } - exofs_make_credential(cred_a, &ios->obj); - ios->cred = sbi->s_cred; ios->in_attr = attrs; ios->in_attr_len = ARRAY_SIZE(attrs); - ret = exofs_sbi_read(ios); + ret = ore_read(ios); if (unlikely(ret)) goto out; @@ -741,7 +933,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = EXOFS_NAME_LEN; out: - exofs_put_io_state(ios); + ore_put_io_state(ios); return ret; } @@ -751,7 +943,6 @@ static const struct super_operations exofs_sops = { .write_inode = exofs_write_inode, .evict_inode = exofs_evict_inode, .put_super = exofs_put_super, - .write_super = exofs_write_super, .sync_fs = exofs_sync_fs, .statfs = exofs_statfs, }; @@ -760,12 +951,12 @@ static const struct super_operations exofs_sops = { * EXPORT OPERATIONS *****************************************************************************/ -struct dentry *exofs_get_parent(struct dentry *child) +static struct dentry *exofs_get_parent(struct dentry *child) { unsigned long ino = exofs_parent_ino(child); if (!ino) - return NULL; + return ERR_PTR(-ESTALE); return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino)); } @@ -819,6 +1010,7 @@ static struct file_system_type exofs_type = { .mount = exofs_mount, .kill_sb = generic_shutdown_super, }; +MODULE_ALIAS_FS("exofs"); static int __init init_exofs(void) { @@ -832,6 +1024,9 @@ static int __init init_exofs(void) if (err) goto out_d; + /* We don't fail if sysfs creation failed */ + exofs_sysfs_init(); + return 0; out_d: destroy_inodecache(); @@ -841,6 +1036,7 @@ out: static void __exit exit_exofs(void) { + exofs_sysfs_uninit(); unregister_filesystem(&exofs_type); destroy_inodecache(); } diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c new file mode 100644 index 00000000000..1b4f2f95fc3 --- /dev/null +++ b/fs/exofs/sys.c @@ -0,0 +1,205 @@ +/* + * Copyright (C) 2012 + * Sachin Bhamare <sbhamare@panasas.com> + * Boaz Harrosh <bharrosh@panasas.com> + * + * This file is part of exofs. + * + * exofs is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License 2 as published by + * the Free Software Foundation. + * + * exofs is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with exofs; if not, write to the: + * Free Software Foundation <licensing@fsf.org> + */ + +#include <linux/kobject.h> +#include <linux/device.h> + +#include "exofs.h" + +struct odev_attr { + struct attribute attr; + ssize_t (*show)(struct exofs_dev *, char *); + ssize_t (*store)(struct exofs_dev *, const char *, size_t); +}; + +static ssize_t odev_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj); + struct odev_attr *a = container_of(attr, struct odev_attr, attr); + + return a->show ? a->show(edp, buf) : 0; +} + +static ssize_t odev_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct exofs_dev *edp = container_of(kobj, struct exofs_dev, ed_kobj); + struct odev_attr *a = container_of(attr, struct odev_attr, attr); + + return a->store ? a->store(edp, buf, len) : len; +} + +static const struct sysfs_ops odev_attr_ops = { + .show = odev_attr_show, + .store = odev_attr_store, +}; + + +static struct kset *exofs_kset; + +static ssize_t osdname_show(struct exofs_dev *edp, char *buf) +{ + struct osd_dev *odev = edp->ored.od; + const struct osd_dev_info *odi = osduld_device_info(odev); + + return snprintf(buf, odi->osdname_len + 1, "%s", odi->osdname); +} + +static ssize_t systemid_show(struct exofs_dev *edp, char *buf) +{ + struct osd_dev *odev = edp->ored.od; + const struct osd_dev_info *odi = osduld_device_info(odev); + + memcpy(buf, odi->systemid, odi->systemid_len); + return odi->systemid_len; +} + +static ssize_t uri_show(struct exofs_dev *edp, char *buf) +{ + return snprintf(buf, edp->urilen, "%s", edp->uri); +} + +static ssize_t uri_store(struct exofs_dev *edp, const char *buf, size_t len) +{ + uint8_t *new_uri; + + edp->urilen = strlen(buf) + 1; + new_uri = krealloc(edp->uri, edp->urilen, GFP_KERNEL); + if (new_uri == NULL) + return -ENOMEM; + edp->uri = new_uri; + strncpy(edp->uri, buf, edp->urilen); + return edp->urilen; +} + +#define OSD_ATTR(name, mode, show, store) \ + static struct odev_attr odev_attr_##name = \ + __ATTR(name, mode, show, store) + +OSD_ATTR(osdname, S_IRUGO, osdname_show, NULL); +OSD_ATTR(systemid, S_IRUGO, systemid_show, NULL); +OSD_ATTR(uri, S_IRWXU, uri_show, uri_store); + +static struct attribute *odev_attrs[] = { + &odev_attr_osdname.attr, + &odev_attr_systemid.attr, + &odev_attr_uri.attr, + NULL, +}; + +static struct kobj_type odev_ktype = { + .default_attrs = odev_attrs, + .sysfs_ops = &odev_attr_ops, +}; + +static struct kobj_type uuid_ktype = { +}; + +void exofs_sysfs_dbg_print(void) +{ +#ifdef CONFIG_EXOFS_DEBUG + struct kobject *k_name, *k_tmp; + + list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) { + printk(KERN_INFO "%s: name %s ref %d\n", + __func__, kobject_name(k_name), + (int)atomic_read(&k_name->kref.refcount)); + } +#endif +} +/* + * This function removes all kobjects under exofs_kset + * At the end of it, exofs_kset kobject will have a refcount + * of 1 which gets decremented only on exofs module unload + */ +void exofs_sysfs_sb_del(struct exofs_sb_info *sbi) +{ + struct kobject *k_name, *k_tmp; + struct kobject *s_kobj = &sbi->s_kobj; + + list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) { + /* Remove all that are children of this SBI */ + if (k_name->parent == s_kobj) + kobject_put(k_name); + } + kobject_put(s_kobj); +} + +/* + * This function creates sysfs entries to hold the current exofs cluster + * instance (uniquely identified by osdname,pid tuple). + * This function gets called once per exofs mount instance. + */ +int exofs_sysfs_sb_add(struct exofs_sb_info *sbi, + struct exofs_dt_device_info *dt_dev) +{ + struct kobject *s_kobj; + int retval = 0; + uint64_t pid = sbi->one_comp.obj.partition; + + /* allocate new uuid dirent */ + s_kobj = &sbi->s_kobj; + s_kobj->kset = exofs_kset; + retval = kobject_init_and_add(s_kobj, &uuid_ktype, + &exofs_kset->kobj, "%s_%llx", dt_dev->osdname, pid); + if (retval) { + EXOFS_ERR("ERROR: Failed to create sysfs entry for " + "uuid-%s_%llx => %d\n", dt_dev->osdname, pid, retval); + return -ENOMEM; + } + return 0; +} + +int exofs_sysfs_odev_add(struct exofs_dev *edev, struct exofs_sb_info *sbi) +{ + struct kobject *d_kobj; + int retval = 0; + + /* create osd device group which contains following attributes + * osdname, systemid & uri + */ + d_kobj = &edev->ed_kobj; + d_kobj->kset = exofs_kset; + retval = kobject_init_and_add(d_kobj, &odev_ktype, + &sbi->s_kobj, "dev%u", edev->did); + if (retval) { + EXOFS_ERR("ERROR: Failed to create sysfs entry for " + "device dev%u\n", edev->did); + return retval; + } + return 0; +} + +int exofs_sysfs_init(void) +{ + exofs_kset = kset_create_and_add("exofs", NULL, fs_kobj); + if (!exofs_kset) { + EXOFS_ERR("ERROR: kset_create_and_add exofs failed\n"); + return -ENOMEM; + } + return 0; +} + +void exofs_sysfs_uninit(void) +{ + kset_unregister(exofs_kset); +} |
