aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2009-04-03 09:53:22 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2009-04-03 09:53:22 -0700
commit9b59f0316bc556a1b63518f0b1224cf9be48467b (patch)
treed6ffccbe5d9ce5f55c1b2efaf02220c701d4420a
parentac7c1a776dfe1a9c83ea7885f858f5f1a144d8af (diff)
parent0d8fe329a80714e0f729ae48cba8d64cbe5701cb (diff)
Merge branch 'for-linus' of git://git.open-osd.org/linux-open-osd
* 'for-linus' of git://git.open-osd.org/linux-open-osd: fs: Add exofs to Kernel build exofs: Documentation exofs: export_operations exofs: super_operations and file_system_type exofs: dir_inode and directory operations exofs: address_space_operations exofs: symlink_inode and fast_symlink_inode operations exofs: file and file_inode operations exofs: Kbuild, Headers and osd utils
-rw-r--r--Documentation/filesystems/exofs.txt176
-rw-r--r--fs/Kconfig2
-rw-r--r--fs/Makefile1
-rw-r--r--fs/exofs/BUGS3
-rw-r--r--fs/exofs/Kbuild16
-rw-r--r--fs/exofs/Kconfig13
-rw-r--r--fs/exofs/common.h184
-rw-r--r--fs/exofs/dir.c672
-rw-r--r--fs/exofs/exofs.h180
-rw-r--r--fs/exofs/file.c87
-rw-r--r--fs/exofs/inode.c1303
-rw-r--r--fs/exofs/namei.c342
-rw-r--r--fs/exofs/osd.c153
-rw-r--r--fs/exofs/super.c584
-rw-r--r--fs/exofs/symlink.c57
15 files changed, 3773 insertions, 0 deletions
diff --git a/Documentation/filesystems/exofs.txt b/Documentation/filesystems/exofs.txt
new file mode 100644
index 00000000000..0ced74c2f73
--- /dev/null
+++ b/Documentation/filesystems/exofs.txt
@@ -0,0 +1,176 @@
+===============================================================================
+WHAT IS EXOFS?
+===============================================================================
+
+exofs is a file system that uses an OSD and exports the API of a normal Linux
+file system. Users access exofs like any other local file system, and exofs
+will in turn issue commands to the local OSD initiator.
+
+OSD is a new T10 command set that views storage devices not as a large/flat
+array of sectors but as a container of objects, each having a length, quota,
+time attributes and more. Each object is addressed by a 64bit ID, and is
+contained in a 64bit ID partition. Each object has associated attributes
+attached to it, which are integral part of the object and provide metadata about
+the object. The standard defines some common obligatory attributes, but user
+attributes can be added as needed.
+
+===============================================================================
+ENVIRONMENT
+===============================================================================
+
+To use this file system, you need to have an object store to run it on. You
+may download a target from:
+http://open-osd.org
+
+See Documentation/scsi/osd.txt for how to setup a working osd environment.
+
+===============================================================================
+USAGE
+===============================================================================
+
+1. Download and compile exofs and open-osd initiator:
+ You need an external Kernel source tree or kernel headers from your
+ distribution. (anything based on 2.6.26 or later).
+
+ a. download open-osd including exofs source using:
+ [parent-directory]$ git clone git://git.open-osd.org/open-osd.git
+
+ b. Build the library module like this:
+ [parent-directory]$ make -C KSRC=$(KER_DIR) open-osd
+
+ This will build both the open-osd initiator as well as the exofs kernel
+ module. Use whatever parameters you compiled your Kernel with and
+ $(KER_DIR) above pointing to the Kernel you compile against. See the file
+ open-osd/top-level-Makefile for an example.
+
+2. Get the OSD initiator and target set up properly, and login to the target.
+ See Documentation/scsi/osd.txt for farther instructions. Also see ./do-osd
+ for example script that does all these steps.
+
+3. Insmod the exofs.ko module:
+ [exofs]$ insmod exofs.ko
+
+4. Make sure the directory where you want to mount exists. If not, create it.
+ (For example, mkdir /mnt/exofs)
+
+5. At first run you will need to invoke the mkfs.exofs application
+
+ As an example, this will create the file system on:
+ /dev/osd0 partition ID 65536
+
+ mkfs.exofs --pid=65536 --format /dev/osd0
+
+ The --format is optional if not specified no OSD_FORMAT will be
+ preformed and a clean file system will be created in the specified pid,
+ in the available space of the target. (Use --format=size_in_meg to limit
+ the total LUN space available)
+
+ If pid already exist it will be deleted and a new one will be created in it's
+ place. Be careful.
+
+ An exofs lives inside a single OSD partition. You can create multiple exofs
+ filesystems on the same device using multiple pids.
+
+ (run mkfs.exofs without any parameters for usage help message)
+
+6. Mount the file system.
+
+ For example, to mount /dev/osd0, partition ID 0x10000 on /mnt/exofs:
+
+ mount -t exofs -o pid=65536 /dev/osd0 /mnt/exofs/
+
+7. For reference (See do-exofs example script):
+ do-exofs start - an example of how to perform the above steps.
+ do-exofs stop - an example of how to unmount the file system.
+ do-exofs format - an example of how to format and mkfs a new exofs.
+
+8. Extra compilation flags (uncomment in fs/exofs/Kbuild):
+ CONFIG_EXOFS_DEBUG - for debug messages and extra checks.
+
+===============================================================================
+exofs mount options
+===============================================================================
+Similar to any mount command:
+ mount -t exofs -o exofs_options /dev/osdX mount_exofs_directory
+
+Where:
+ -t exofs: specifies the exofs file system
+
+ /dev/osdX: X is a decimal number. /dev/osdX was created after a successful
+ login into an OSD target.
+
+ mount_exofs_directory: The directory to mount the file system on
+
+ exofs specific options: Options are separated by commas (,)
+ pid=<integer> - The partition number to mount/create as
+ container of the filesystem.
+ This option is mandatory
+ to=<integer> - Timeout in ticks for a single command
+ default is (60 * HZ) [for debugging only]
+
+===============================================================================
+DESIGN
+===============================================================================
+
+* The file system control block (AKA on-disk superblock) resides in an object
+ with a special ID (defined in common.h).
+ Information included in the file system control block is used to fill the
+ in-memory superblock structure at mount time. This object is created before
+ the file system is used by mkexofs.c It contains information such as:
+ - The file system's magic number
+ - The next inode number to be allocated
+
+* Each file resides in its own object and contains the data (and it will be
+ possible to extend the file over multiple objects, though this has not been
+ implemented yet).
+
+* A directory is treated as a file, and essentially contains a list of <file
+ name, inode #> pairs for files that are found in that directory. The object
+ IDs correspond to the files' inode numbers and will be allocated according to
+ a bitmap (stored in a separate object). Now they are allocated using a
+ counter.
+
+* Each file's control block (AKA on-disk inode) is stored in its object's
+ attributes. This applies to both regular files and other types (directories,
+ device files, symlinks, etc.).
+
+* Credentials are generated per object (inode and superblock) when they is
+ created in memory (read off disk or created). The credential works for all
+ operations and is used as long as the object remains in memory.
+
+* Async OSD operations are used whenever possible, but the target may execute
+ them out of order. The operations that concern us are create, delete,
+ readpage, writepage, update_inode, and truncate. The following pairs of
+ operations should execute in the order written, and we need to prevent them
+ from executing in reverse order:
+ - The following are handled with the OBJ_CREATED and OBJ_2BCREATED
+ flags. OBJ_CREATED is set when we know the object exists on the OSD -
+ in create's callback function, and when we successfully do a read_inode.
+ OBJ_2BCREATED is set in the beginning of the create function, so we
+ know that we should wait.
+ - create/delete: delete should wait until the object is created
+ on the OSD.
+ - create/readpage: readpage should be able to return a page
+ full of zeroes in this case. If there was a write already
+ en-route (i.e. create, writepage, readpage) then the page
+ would be locked, and so it would really be the same as
+ create/writepage.
+ - create/writepage: if writepage is called for a sync write, it
+ should wait until the object is created on the OSD.
+ Otherwise, it should just return.
+ - create/truncate: truncate should wait until the object is
+ created on the OSD.
+ - create/update_inode: update_inode should wait until the
+ object is created on the OSD.
+ - Handled by VFS locks:
+ - readpage/delete: shouldn't happen because of page lock.
+ - writepage/delete: shouldn't happen because of page lock.
+ - readpage/writepage: shouldn't happen because of page lock.
+
+===============================================================================
+LICENSE/COPYRIGHT
+===============================================================================
+The exofs file system is based on ext2 v0.5b (distributed with the Linux kernel
+version 2.6.10). All files include the original copyrights, and the license
+is GPL version 2 (only version 2, as is true for the Linux kernel). The
+Linux kernel can be downloaded from www.kernel.org.
diff --git a/fs/Kconfig b/fs/Kconfig
index cef8b18ceaa..ae3b34a2ea6 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -169,6 +169,8 @@ source "fs/romfs/Kconfig"
source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
+source "fs/exofs/Kconfig"
+
endif # MISC_FILESYSTEMS
menuconfig NETWORK_FILESYSTEMS
diff --git a/fs/Makefile b/fs/Makefile
index b5cd8e18dd9..15f73014a20 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -120,3 +120,4 @@ obj-$(CONFIG_DEBUG_FS) += debugfs/
obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
+obj-$(CONFIG_EXOFS_FS) += exofs/
diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS
new file mode 100644
index 00000000000..1b2d4c63a57
--- /dev/null
+++ b/fs/exofs/BUGS
@@ -0,0 +1,3 @@
+- Out-of-space may cause a severe problem if the object (and directory entry)
+ were written, but the inode attributes failed. Then if the filesystem was
+ unmounted and mounted the kernel can get into an endless loop doing a readdir.
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
new file mode 100644
index 00000000000..cc2d22db119
--- /dev/null
+++ b/fs/exofs/Kbuild
@@ -0,0 +1,16 @@
+#
+# Kbuild for the EXOFS module
+#
+# Copyright (C) 2008 Panasas Inc. All rights reserved.
+#
+# Authors:
+# Boaz Harrosh <bharrosh@panasas.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2
+#
+# Kbuild - Gets included from the Kernels Makefile and build system
+#
+
+exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o
+obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
new file mode 100644
index 00000000000..86194b2f799
--- /dev/null
+++ b/fs/exofs/Kconfig
@@ -0,0 +1,13 @@
+config EXOFS_FS
+ tristate "exofs: OSD based file system support"
+ depends on SCSI_OSD_ULD
+ help
+ EXOFS is a file system that uses an OSD storage device,
+ as its backing storage.
+
+# Debugging-related stuff
+config EXOFS_DEBUG
+ bool "Enable debugging"
+ depends on EXOFS_FS
+ help
+ This option enables EXOFS debug prints.
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
new file mode 100644
index 00000000000..b1512c4bb8c
--- /dev/null
+++ b/fs/exofs/common.h
@@ -0,0 +1,184 @@
+/*
+ * common.h - Common definitions for both Kernel and user-mode utilities
+ *
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ * from
+ * linux/fs/minix/inode.c
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation. Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef __EXOFS_COM_H__
+#define __EXOFS_COM_H__
+
+#include <linux/types.h>
+
+#include <scsi/osd_attributes.h>
+#include <scsi/osd_initiator.h>
+#include <scsi/osd_sec.h>
+
+/****************************************************************************
+ * Object ID related defines
+ * NOTE: inode# = object ID - EXOFS_OBJ_OFF
+ ****************************************************************************/
+#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */
+#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */
+#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */
+#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
+
+/* exofs Application specific page/attribute */
+# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
+# define EXOFS_ATTR_INODE_DATA 1
+
+/*
+ * The maximum number of files we can have is limited by the size of the
+ * inode number. This is the largest object ID that the file system supports.
+ * Object IDs 0, 1, and 2 are always in use (see above defines).
+ */
+enum {
+ EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX :
+ (1ULL << (sizeof(ino_t) * 8ULL - 1ULL)),
+ EXOFS_MAX_ID = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF),
+};
+
+/****************************************************************************
+ * Misc.
+ ****************************************************************************/
+#define EXOFS_BLKSHIFT 12
+#define EXOFS_BLKSIZE (1UL << EXOFS_BLKSHIFT)
+
+/****************************************************************************
+ * superblock-related things
+ ****************************************************************************/
+#define EXOFS_SUPER_MAGIC 0x5DF5
+
+/*
+ * The file system control block - stored in an object's data (mainly, the one
+ * with ID EXOFS_SUPER_ID). This is where the in-memory superblock is stored
+ * on disk. Right now it just has a magic value, which is basically a sanity
+ * check on our ability to communicate with the object store.
+ */
+struct exofs_fscb {
+ __le64 s_nextid; /* Highest object ID used */
+ __le32 s_numfiles; /* Number of files on fs */
+ __le16 s_magic; /* Magic signature */
+ __le16 s_newfs; /* Non-zero if this is a new fs */
+};
+
+/****************************************************************************
+ * inode-related things
+ ****************************************************************************/
+#define EXOFS_IDATA 5
+
+/*
+ * The file control block - stored in an object's attributes. This is where
+ * the in-memory inode is stored on disk.
+ */
+struct exofs_fcb {
+ __le64 i_size; /* Size of the file */
+ __le16 i_mode; /* File mode */
+ __le16 i_links_count; /* Links count */
+ __le32 i_uid; /* Owner Uid */
+ __le32 i_gid; /* Group Id */
+ __le32 i_atime; /* Access time */
+ __le32 i_ctime; /* Creation time */
+ __le32 i_mtime; /* Modification time */
+ __le32 i_flags; /* File flags (unused for now)*/
+ __le32 i_generation; /* File version (for NFS) */
+ __le32 i_data[EXOFS_IDATA]; /* Short symlink names and device #s */
+};
+
+#define EXOFS_INO_ATTR_SIZE sizeof(struct exofs_fcb)
+
+/* This is the Attribute the fcb is stored in */
+static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF(
+ EXOFS_APAGE_FS_DATA,
+ EXOFS_ATTR_INODE_DATA,
+ EXOFS_INO_ATTR_SIZE);
+
+/****************************************************************************
+ * dentry-related things
+ ****************************************************************************/
+#define EXOFS_NAME_LEN 255
+
+/*
+ * The on-disk directory entry
+ */
+struct exofs_dir_entry {
+ __le64 inode_no; /* inode number */
+ __le16 rec_len; /* directory entry length */
+ u8 name_len; /* name length */
+ u8 file_type; /* umm...file type */
+ char name[EXOFS_NAME_LEN]; /* file name */
+};
+
+enum {
+ EXOFS_FT_UNKNOWN,
+ EXOFS_FT_REG_FILE,
+ EXOFS_FT_DIR,
+ EXOFS_FT_CHRDEV,
+ EXOFS_FT_BLKDEV,
+ EXOFS_FT_FIFO,
+ EXOFS_FT_SOCK,
+ EXOFS_FT_SYMLINK,
+ EXOFS_FT_MAX
+};
+
+#define EXOFS_DIR_PAD 4
+#define EXOFS_DIR_ROUND (EXOFS_DIR_PAD - 1)
+#define EXOFS_DIR_REC_LEN(name_len) \
+ (((name_len) + offsetof(struct exofs_dir_entry, name) + \
+ EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
+
+/*************************
+ * function declarations *
+ *************************/
+/* osd.c */
+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
+ const struct osd_obj_id *obj);
+
+int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid);
+static inline int exofs_check_ok(struct osd_request *or)
+{
+ return exofs_check_ok_resid(or, NULL, NULL);
+}
+int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred);
+int exofs_async_op(struct osd_request *or,
+ osd_req_done_fn *async_done, void *caller_context, u8 *cred);
+
+int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
+
+int osd_req_read_kern(struct osd_request *or,
+ const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
+
+int osd_req_write_kern(struct osd_request *or,
+ const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
+
+#endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
new file mode 100644
index 00000000000..65b0c8c776a
--- /dev/null
+++ b/fs/exofs/dir.c
@@ -0,0 +1,672 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ * from
+ * linux/fs/minix/inode.c
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation. Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "exofs.h"
+
+static inline unsigned exofs_chunk_size(struct inode *inode)
+{
+ return inode->i_sb->s_blocksize;
+}
+
+static inline void exofs_put_page(struct page *page)
+{
+ kunmap(page);
+ page_cache_release(page);
+}
+
+/* Accesses dir's inode->i_size must be called under inode lock */
+static inline unsigned long dir_pages(struct inode *inode)
+{
+ return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+}
+
+static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
+{
+ loff_t last_byte = inode->i_size;
+
+ last_byte -= page_nr << PAGE_CACHE_SHIFT;
+ if (last_byte > PAGE_CACHE_SIZE)
+ last_byte = PAGE_CACHE_SIZE;
+ return last_byte;
+}
+
+static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *dir = mapping->host;
+ int err = 0;
+
+ dir->i_version++;
+
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
+
+ if (pos+len > dir->i_size) {
+ i_size_write(dir, pos+len);
+ mark_inode_dirty(dir);
+ }
+ set_page_dirty(page);
+
+ if (IS_DIRSYNC(dir))
+ err = write_one_page(page, 1);
+ else
+ unlock_page(page);
+
+ return err;
+}
+
+static void exofs_check_page(struct page *page)
+{
+ struct inode *dir = page->mapping->host;
+ unsigned chunk_size = exofs_chunk_size(dir);
+ char *kaddr = page_address(page);
+ unsigned offs, rec_len;
+ unsigned limit = PAGE_CACHE_SIZE;
+ struct exofs_dir_entry *p;
+ char *error;
+
+ /* if the page is the last one in the directory */
+ if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
+ limit = dir->i_size & ~PAGE_CACHE_MASK;
+ if (limit & (chunk_size - 1))
+ goto Ebadsize;
+ if (!limit)
+ goto out;
+ }
+ for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) {
+ p = (struct exofs_dir_entry *)(kaddr + offs);
+ rec_len = le16_to_cpu(p->rec_len);
+
+ if (rec_len < EXOFS_DIR_REC_LEN(1))
+ goto Eshort;
+ if (rec_len & 3)
+ goto Ealign;
+ if (rec_len < EXOFS_DIR_REC_LEN(p->name_len))
+ goto Enamelen;
+ if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
+ goto Espan;
+ }
+ if (offs != limit)
+ goto Eend;
+out:
+ SetPageChecked(page);
+ return;
+
+Ebadsize:
+ EXOFS_ERR("ERROR [exofs_check_page]: "
+ "size of directory #%lu is not a multiple of chunk size",
+ dir->i_ino
+ );
+ goto fail;
+Eshort:
+ error = "rec_len is smaller than minimal";
+ goto bad_entry;
+Ealign:
+ error = "unaligned directory entry";
+ goto bad_entry;
+Enamelen:
+ error = "rec_len is too small for name_len";
+ goto bad_entry;
+Espan:
+ error = "directory entry across blocks";
+ goto bad_entry;
+bad_entry:
+ EXOFS_ERR(
+ "ERROR [exofs_check_page]: bad entry in directory #%lu: %s - "
+ "offset=%lu, inode=%llu, rec_len=%d, name_len=%d",
+ dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+ _LLU(le64_to_cpu(p->inode_no)),
+ rec_len, p->name_len);
+ goto fail;
+Eend:
+ p = (struct exofs_dir_entry *)(kaddr + offs);
+ EXOFS_ERR("ERROR [exofs_check_page]: "
+ "entry in directory #%lu spans the page boundary"
+ "offset=%lu, inode=%llu",
+ dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
+ _LLU(le64_to_cpu(p->inode_no)));
+fail:
+ SetPageChecked(page);
+ SetPageError(page);
+}
+
+static struct page *exofs_get_page(struct inode *dir, unsigned long n)
+{
+ struct address_space *mapping = dir->i_mapping;
+ struct page *page = read_mapping_page(mapping, n, NULL);
+
+ if (!IS_ERR(page)) {
+ kmap(page);
+ if (!PageChecked(page))
+ exofs_check_page(page);
+ if (PageError(page))
+ goto fail;
+ }
+ return page;
+
+fail:
+ exofs_put_page(page);
+ return ERR_PTR(-EIO);
+}
+
+static inline int exofs_match(int len, const unsigned char *name,
+ struct exofs_dir_entry *de)
+{
+ if (len != de->name_len)
+ return 0;
+ if (!de->inode_no)
+ return 0;
+ return !memcmp(name, de->name, len);
+}
+
+static inline
+struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p)
+{
+ return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
+}
+
+static inline unsigned
+exofs_validate_entry(char *base, unsigned offset, unsigned mask)
+{
+ struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset);
+ struct exofs_dir_entry *p =
+ (struct exofs_dir_entry *)(base + (offset&mask));
+ while ((char *)p < (char *)de) {
+ if (p->rec_len == 0)
+ break;
+ p = exofs_next_entry(p);
+ }
+ return (char *)p - base;
+}
+
+static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = {
+ [EXOFS_FT_UNKNOWN] = DT_UNKNOWN,
+ [EXOFS_FT_REG_FILE] = DT_REG,
+ [EXOFS_FT_DIR] = DT_DIR,
+ [EXOFS_FT_CHRDEV] = DT_CHR,
+ [EXOFS_FT_BLKDEV] = DT_BLK,
+ [EXOFS_FT_FIFO] = DT_FIFO,
+ [EXOFS_FT_SOCK] = DT_SOCK,
+ [EXOFS_FT_SYMLINK] = DT_LNK,
+};
+
+#define S_SHIFT 12
+static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = {
+ [S_IFREG >> S_SHIFT] = EXOFS_FT_REG_FILE,
+ [S_IFDIR >> S_SHIFT] = EXOFS_FT_DIR,
+ [S_IFCHR >> S_SHIFT] = EXOFS_FT_CHRDEV,
+ [S_IFBLK >> S_SHIFT] = EXOFS_FT_BLKDEV,
+ [S_IFIFO >> S_SHIFT] = EXOFS_FT_FIFO,
+ [S_IFSOCK >> S_SHIFT] = EXOFS_FT_SOCK,
+ [S_IFLNK >> S_SHIFT] = EXOFS_FT_SYMLINK,
+};
+
+static inline
+void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
+{
+ mode_t mode = inode->i_mode;
+ de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
+}
+
+static int
+exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ loff_t pos = filp->f_pos;
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ unsigned int offset = pos & ~PAGE_CACHE_MASK;
+ unsigned long n = pos >> PAGE_CACHE_SHIFT;
+ unsigned long npages = dir_pages(inode);
+ unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
+ unsigned char *types = NULL;
+ int need_revalidate = (filp->f_version != inode->i_version);
+
+ if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
+ return 0;
+
+ types = exofs_filetype_table;
+
+ for ( ; n < npages; n++, offset = 0) {
+ char *kaddr, *limit;
+ struct exofs_dir_entry *de;
+ struct page *page = exofs_get_page(inode, n);
+
+ if (IS_ERR(page)) {
+ EXOFS_ERR("ERROR: "
+ "bad page in #%lu",
+ inode->i_ino);
+ filp->f_pos += PAGE_CACHE_SIZE - offset;
+ return PTR_ERR(page);
+ }
+ kaddr = page_address(page);
+ if (unlikely(need_revalidate)) {
+ if (offset) {
+ offset = exofs_validate_entry(kaddr, offset,
+ chunk_mask);
+ filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+ }
+ filp->f_version = inode->i_version;
+ need_revalidate = 0;
+ }
+ de = (struct exofs_dir_entry *)(kaddr + offset);
+ limit = kaddr + exofs_last_byte(inode, n) -
+ EXOFS_DIR_REC_LEN(1);
+ for (; (char *)de <= limit; de = exofs_next_entry(de)) {
+ if (de->rec_len == 0) {
+ EXOFS_ERR("ERROR: "
+ "zero-length directory entry");
+ exofs_put_page(page);
+ return -EIO;
+ }
+ if (de->inode_no) {
+ int over;
+ unsigned char d_type = DT_UNKNOWN;
+
+ if (types && de->file_type < EXOFS_FT_MAX)
+ d_type = types[de->file_type];
+
+ offset = (char *)de - kaddr;
+ over = filldir(dirent, de->name, de->name_len,
+ (n<<PAGE_CACHE_SHIFT) | offset,
+ le64_to_cpu(de->inode_no),
+ d_type);
+ if (over) {
+ exofs_put_page(page);
+ return 0;
+ }
+ }
+ filp->f_pos += le16_to_cpu(de->rec_len);
+ }
+ exofs_put_page(page);
+ }
+
+ return 0;
+}
+
+struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
+ struct dentry *dentry, struct page **res_page)
+{
+ const unsigned char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
+ unsigned long start, n;
+ unsigned long npages = dir_pages(dir);
+ struct page *page = NULL;
+ struct exofs_i_info *oi = exofs_i(dir);
+ struct exofs_dir_entry *de;
+
+ if (npages == 0)
+ goto out;
+
+ *res_page = NULL;
+
+ start = oi->i_dir_start_lookup;
+ if (start >= npages)
+ start = 0;
+ n = start;
+ do {
+ char *kaddr;
+ page = exofs_get_page(dir, n);
+ if (!IS_ERR(page)) {
+ kaddr = page_address(page);
+ de = (struct exofs_dir_entry *) kaddr;
+ kaddr += exofs_last_byte(dir, n) - reclen;
+ while ((char *) de <= kaddr) {
+ if (de->rec_len == 0) {
+ EXOFS_ERR(
+ "ERROR: exofs_find_entry: "
+ "zero-length directory entry");
+ exofs_put_page(page);
+ goto out;
+ }
+ if (exofs_match(namelen, name, de))
+ goto found;
+ de = exofs_next_entry(de);
+ }
+ exofs_put_page(page);
+ }
+ if (++n >= npages)
+ n = 0;
+ } while (n != start);
+out:
+ return NULL;
+
+found:
+ *res_page = page;
+ oi->i_dir_start_lookup = n;
+ return de;
+}
+
+struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p)
+{
+ struct page *page = exofs_get_page(dir, 0);
+ struct exofs_dir_entry *de = NULL;
+
+ if (!IS_ERR(page)) {
+ de = exofs_next_entry(
+ (struct exofs_dir_entry *)page_address(page));
+ *p = page;
+ }
+ return de;
+}
+
+ino_t exofs_parent_ino(struct dentry *child)
+{
+ struct page *page;
+ struct exofs_dir_entry *de;
+ ino_t ino;
+
+ de = exofs_dotdot(child->d_inode, &page);
+ if (!de)
+ return 0;
+
+ ino = le64_to_cpu(de->inode_no);
+ exofs_put_page(page);
+ return ino;
+}
+
+ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry)
+{
+ ino_t res = 0;
+ struct exofs_dir_entry *de;
+ struct page *page;
+
+ de = exofs_find_entry(dir, dentry, &page);
+ if (de) {
+ res = le64_to_cpu(de->inode_no);
+ exofs_put_page(page);
+ }
+ return res;
+}
+
+int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
+ struct page *page, struct inode *inode)
+{
+ loff_t pos = page_offset(page) +
+ (char *) de - (char *) page_address(page);
+ unsigned len = le16_to_cpu(de->rec_len);
+ int err;
+
+ lock_page(page);
+ err = exofs_write_begin(NULL, page->mapping, pos, len,
+ AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+ if (err)
+ EXOFS_ERR("exofs_set_link: exofs_write_begin FAILD => %d\n",
+ err);
+
+ de->inode_no = cpu_to_le64(inode->i_ino);
+ exofs_set_de_type(de, inode);
+ if (likely(!err))
+ err = exofs_commit_chunk(page, pos, len);
+ exofs_put_page(page);
+ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ mark_inode_dirty(dir);
+ return err;
+}
+
+int exofs_add_link(struct dentry *dentry, struct inode *inode)
+{
+ struct inode *dir = dentry->d_parent->d_inode;
+ const unsigned char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ unsigned chunk_size = exofs_chunk_size(dir);
+ unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
+ unsigned short rec_len, name_len;
+ struct page *page = NULL;
+ struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+ struct exofs_dir_entry *de;
+ unsigned long npages = dir_pages(dir);
+ unsigned long n;
+ char *kaddr;
+ loff_t pos;
+ int err;
+
+ for (n = 0; n <= npages; n++) {
+ char *dir_end;
+
+ page = exofs_get_page(dir, n);
+ err = PTR_ERR(page);
+ if (IS_ERR(page))
+ goto out;
+ lock_page(page);
+ kaddr = page_address(page);
+ dir_end = kaddr + exofs_last_byte(dir, n);
+ de = (struct exofs_dir_entry *)kaddr;
+ kaddr += PAGE_CACHE_SIZE - reclen;
+ while ((char *)de <= kaddr) {
+ if ((char *)de == dir_end) {
+ name_len = 0;
+ rec_len = chunk_size;
+ de->rec_len = cpu_to_le16(chunk_size);
+ de->inode_no = 0;
+ goto got_it;
+ }
+ if (de->rec_len == 0) {
+ EXOFS_ERR("ERROR: exofs_add_link: "
+ "zero-length directory entry");
+ err = -EIO;
+ goto out_unlock;
+ }
+ err = -EEXIST;
+ if (exofs_match(namelen, name, de))
+ goto out_unlock;
+ name_len = EXOFS_DIR_REC_LEN(de->name_len);
+ rec_len = le16_to_cpu(de->rec_len);
+ if (!de->inode_no && rec_len >= reclen)
+ goto got_it;
+ if (rec_len >= name_len + reclen)
+ goto got_it;
+ de = (struct exofs_dir_entry *) ((char *) de + rec_len);
+ }
+ unlock_page(page);
+ exofs_put_page(page);
+ }
+
+ EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=%p", dentry, inode);
+ return -EINVAL;
+
+got_it:
+ pos = page_offset(page) +
+ (char *)de - (char *)page_address(page);
+ err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0,
+ &page, NULL);
+ if (err)
+ goto out_unlock;
+ if (de->inode_no) {
+ struct exofs_dir_entry *de1 =
+ (struct exofs_dir_entry *)((char *)de + name_len);
+ de1->rec_len = cpu_to_le16(r