aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/ext4.txt9
-rw-r--r--Documentation/kernel-parameters.txt3
-rw-r--r--Documentation/prctl/seccomp_filter.txt74
-rw-r--r--Documentation/security/keys.txt17
-rw-r--r--arch/sh/mm/Kconfig1
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/include/asm/pgtable.h17
-rw-r--r--arch/x86/include/asm/pgtable_types.h20
-rw-r--r--arch/x86/kernel/vsyscall_64.c110
-rw-r--r--arch/x86/mm/pgtable.c8
-rw-r--r--drivers/bus/Kconfig1
-rw-r--r--drivers/char/tpm/tpm_ibmvtpm.c81
-rw-r--r--drivers/char/tpm/tpm_ibmvtpm.h5
-rw-r--r--drivers/input/keyboard/Kconfig1
-rw-r--r--drivers/usb/phy/Kconfig1
-rw-r--r--drivers/video/omap2/Kconfig4
-rw-r--r--drivers/w1/masters/Kconfig1
-rw-r--r--drivers/xen/swiotlb-xen.c25
-rw-r--r--fs/Kconfig4
-rw-r--r--fs/cifs/cifsacl.c12
-rw-r--r--fs/ext4/Kconfig15
-rw-r--r--fs/ext4/Makefile4
-rw-r--r--fs/ext4/acl.c6
-rw-r--r--fs/ext4/dir.c41
-rw-r--r--fs/ext4/ext4.h165
-rw-r--r--fs/ext4/ext4_extents.h40
-rw-r--r--fs/ext4/ext4_jbd2.h7
-rw-r--r--fs/ext4/extents.c480
-rw-r--r--fs/ext4/extents_status.c500
-rw-r--r--fs/ext4/extents_status.h45
-rw-r--r--fs/ext4/file.c336
-rw-r--r--fs/ext4/fsync.c6
-rw-r--r--fs/ext4/ialloc.c6
-rw-r--r--fs/ext4/indirect.c5
-rw-r--r--fs/ext4/inline.c1884
-rw-r--r--fs/ext4/inode.c629
-rw-r--r--fs/ext4/mballoc.c60
-rw-r--r--fs/ext4/migrate.c1
-rw-r--r--fs/ext4/move_extent.c1
-rw-r--r--fs/ext4/namei.c531
-rw-r--r--fs/ext4/page-io.c3
-rw-r--r--fs/ext4/resize.c17
-rw-r--r--fs/ext4/super.c57
-rw-r--r--fs/ext4/symlink.c4
-rw-r--r--fs/ext4/xattr.c110
-rw-r--r--fs/ext4/xattr.h158
-rw-r--r--fs/jbd2/journal.c1
-rw-r--r--fs/jbd2/transaction.c11
-rw-r--r--fs/nfs/idmap.c12
-rw-r--r--include/asm-generic/pgtable.h110
-rw-r--r--include/linux/cred.h17
-rw-r--r--include/linux/huge_mm.h16
-rw-r--r--include/linux/hugetlb.h8
-rw-r--r--include/linux/jbd2.h9
-rw-r--r--include/linux/key.h1
-rw-r--r--include/linux/mempolicy.h8
-rw-r--r--include/linux/migrate.h46
-rw-r--r--include/linux/mm.h39
-rw-r--r--include/linux/mm_types.h31
-rw-r--r--include/linux/mmzone.h13
-rw-r--r--include/linux/rmap.h33
-rw-r--r--include/linux/sched.h27
-rw-r--r--include/linux/swiotlb.h20
-rw-r--r--include/linux/vm_event_item.h12
-rw-r--r--include/linux/vmstat.h8
-rw-r--r--include/trace/events/ext4.h136
-rw-r--r--include/trace/events/migrate.h51
-rw-r--r--include/uapi/linux/mempolicy.h15
-rw-r--r--init/Kconfig44
-rw-r--r--kernel/cred.c127
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/sched/core.c71
-rw-r--r--kernel/sched/fair.c227
-rw-r--r--kernel/sched/features.h11
-rw-r--r--kernel/sched/sched.h12
-rw-r--r--kernel/seccomp.c13
-rw-r--r--kernel/sysctl.c45
-rw-r--r--lib/swiotlb.c269
-rw-r--r--mm/compaction.c15
-rw-r--r--mm/huge_memory.c108
-rw-r--r--mm/hugetlb.c10
-rw-r--r--mm/internal.h7
-rw-r--r--mm/ksm.c6
-rw-r--r--mm/memcontrol.c7
-rw-r--r--mm/memory-failure.c7
-rw-r--r--mm/memory.c198
-rw-r--r--mm/memory_hotplug.c3
-rw-r--r--mm/mempolicy.c283
-rw-r--r--mm/migrate.c337
-rw-r--r--mm/mmap.c10
-rw-r--r--mm/mprotect.c135
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/page_alloc.c10
-rw-r--r--mm/pgtable-generic.c9
-rw-r--r--mm/rmap.c66
-rw-r--r--mm/vmstat.c16
-rw-r--r--net/dns_resolver/dns_key.c15
-rw-r--r--security/keys/key.c6
-rw-r--r--security/keys/keyctl.c15
-rw-r--r--security/keys/keyring.c10
-rw-r--r--security/keys/process_keys.c92
-rw-r--r--security/keys/request_key.c21
-rw-r--r--security/smack/Kconfig6
-rw-r--r--security/smack/smackfs.c17
-rw-r--r--security/yama/yama_lsm.c88
105 files changed, 6653 insertions, 1799 deletions
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 104322bf378..34ea4f1fa6e 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -200,12 +200,9 @@ inode_readahead_blks=n This tuning parameter controls the maximum
table readahead algorithm will pre-read into
the buffer cache. The default value is 32 blocks.
-nouser_xattr Disables Extended User Attributes. If you have extended
- attribute support enabled in the kernel configuration
- (CONFIG_EXT4_FS_XATTR), extended attribute support
- is enabled by default on mount. See the attr(5) manual
- page and http://acl.bestbits.at/ for more information
- about extended attributes.
+nouser_xattr Disables Extended User Attributes. See the
+ attr(5) manual page and http://acl.bestbits.at/
+ for more information about extended attributes.
noacl This option disables POSIX Access Control List
support. If ACL support is enabled in the kernel
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 20e248cc03a..ea8e5b48557 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2032,6 +2032,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
nr_uarts= [SERIAL] maximum number of UARTs to be registered.
+ numa_balancing= [KNL,X86] Enable or disable automatic NUMA balancing.
+ Allowed values are enable and disable
+
numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA.
one of ['zone', 'node', 'default'] can be specified
This can be set from sysctl after boot.
diff --git a/Documentation/prctl/seccomp_filter.txt b/Documentation/prctl/seccomp_filter.txt
index 597c3c58137..1e469ef7577 100644
--- a/Documentation/prctl/seccomp_filter.txt
+++ b/Documentation/prctl/seccomp_filter.txt
@@ -95,12 +95,15 @@ SECCOMP_RET_KILL:
SECCOMP_RET_TRAP:
Results in the kernel sending a SIGSYS signal to the triggering
- task without executing the system call. The kernel will
- rollback the register state to just before the system call
- entry such that a signal handler in the task will be able to
- inspect the ucontext_t->uc_mcontext registers and emulate
- system call success or failure upon return from the signal
- handler.
+ task without executing the system call. siginfo->si_call_addr
+ will show the address of the system call instruction, and
+ siginfo->si_syscall and siginfo->si_arch will indicate which
+ syscall was attempted. The program counter will be as though
+ the syscall happened (i.e. it will not point to the syscall
+ instruction). The return value register will contain an arch-
+ dependent value -- if resuming execution, set it to something
+ sensible. (The architecture dependency is because replacing
+ it with -ENOSYS could overwrite some useful information.)
The SECCOMP_RET_DATA portion of the return value will be passed
as si_errno.
@@ -123,6 +126,18 @@ SECCOMP_RET_TRACE:
the BPF program return value will be available to the tracer
via PTRACE_GETEVENTMSG.
+ The tracer can skip the system call by changing the syscall number
+ to -1. Alternatively, the tracer can change the system call
+ requested by changing the system call to a valid syscall number. If
+ the tracer asks to skip the system call, then the system call will
+ appear to return the value that the tracer puts in the return value
+ register.
+
+ The seccomp check will not be run again after the tracer is
+ notified. (This means that seccomp-based sandboxes MUST NOT
+ allow use of ptrace, even of other sandboxed processes, without
+ extreme care; ptracers can use this mechanism to escape.)
+
SECCOMP_RET_ALLOW:
Results in the system call being executed.
@@ -161,3 +176,50 @@ architecture supports both ptrace_event and seccomp, it will be able to
support seccomp filter with minor fixup: SIGSYS support and seccomp return
value checking. Then it must just add CONFIG_HAVE_ARCH_SECCOMP_FILTER
to its arch-specific Kconfig.
+
+
+
+Caveats
+-------
+
+The vDSO can cause some system calls to run entirely in userspace,
+leading to surprises when you run programs on different machines that
+fall back to real syscalls. To minimize these surprises on x86, make
+sure you test with
+/sys/devices/system/clocksource/clocksource0/current_clocksource set to
+something like acpi_pm.
+
+On x86-64, vsyscall emulation is enabled by default. (vsyscalls are
+legacy variants on vDSO calls.) Currently, emulated vsyscalls will honor seccomp, with a few oddities:
+
+- A return value of SECCOMP_RET_TRAP will set a si_call_addr pointing to
+ the vsyscall entry for the given call and not the address after the
+ 'syscall' instruction. Any code which wants to restart the call
+ should be aware that (a) a ret instruction has been emulated and (b)
+ trying to resume the syscall will again trigger the standard vsyscall
+ emulation security checks, making resuming the syscall mostly
+ pointless.
+
+- A return value of SECCOMP_RET_TRACE will signal the tracer as usual,
+ but the syscall may not be changed to another system call using the
+ orig_rax register. It may only be changed to -1 order to skip the
+ currently emulated call. Any other change MAY terminate the process.
+ The rip value seen by the tracer will be the syscall entry address;
+ this is different from normal behavior. The tracer MUST NOT modify
+ rip or rsp. (Do not rely on other changes terminating the process.
+ They might work. For example, on some kernels, choosing a syscall
+ that only exists in future kernels will be correctly emulated (by
+ returning -ENOSYS).
+
+To detect this quirky behavior, check for addr & ~0x0C00 ==
+0xFFFFFFFFFF600000. (For SECCOMP_RET_TRACE, use rip. For
+SECCOMP_RET_TRAP, use siginfo->si_call_addr.) Do not check any other
+condition: future kernels may improve vsyscall emulation and current
+kernels in vsyscall=native mode will behave differently, but the
+instructions at 0xF...F600{0,4,8,C}00 will not be system calls in these
+cases.
+
+Note that modern systems are unlikely to use vsyscalls at all -- they
+are a legacy feature and they are considerably slower than standard
+syscalls. New code will use the vDSO, and vDSO-issued system calls
+are indistinguishable from normal system calls.
diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt
index 7d9ca92022d..7b4145d0045 100644
--- a/Documentation/security/keys.txt
+++ b/Documentation/security/keys.txt
@@ -994,6 +994,23 @@ payload contents" for more information.
reference pointer if successful.
+(*) A keyring can be created by:
+
+ struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
+ const struct cred *cred,
+ key_perm_t perm,
+ unsigned long flags,
+ struct key *dest);
+
+ This creates a keyring with the given attributes and returns it. If dest
+ is not NULL, the new keyring will be linked into the keyring to which it
+ points. No permission checks are made upon the destination keyring.
+
+ Error EDQUOT can be returned if the keyring would overload the quota (pass
+ KEY_ALLOC_NOT_IN_QUOTA in flags if the keyring shouldn't be accounted
+ towards the user's quota). Error ENOMEM can also be returned.
+
+
(*) To check the validity of a key, this function can be called:
int validate_key(struct key *key);
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index cb8f9920f4d..0f7c852f355 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -111,6 +111,7 @@ config VSYSCALL
config NUMA
bool "Non Uniform Memory Access (NUMA) Support"
depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL
+ select ARCH_WANT_NUMA_VARIABLE_LOCALITY
default n
help
Some SH systems have many various memories scattered around
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 65a872bf72f..97f8c5ad8c2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -22,6 +22,8 @@ config X86
def_bool y
select HAVE_AOUT if X86_32
select HAVE_UNSTABLE_SCHED_CLOCK
+ select ARCH_SUPPORTS_NUMA_BALANCING
+ select ARCH_WANTS_PROT_NUMA_PROT_NONE
select HAVE_IDE
select HAVE_OPROFILE
select HAVE_PCSPKR_PLATFORM
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a1f780d45f7..5199db2923d 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -404,7 +404,14 @@ static inline int pte_same(pte_t a, pte_t b)
static inline int pte_present(pte_t a)
{
- return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
+ return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE |
+ _PAGE_NUMA);
+}
+
+#define pte_accessible pte_accessible
+static inline int pte_accessible(pte_t a)
+{
+ return pte_flags(a) & _PAGE_PRESENT;
}
static inline int pte_hidden(pte_t pte)
@@ -420,7 +427,8 @@ static inline int pmd_present(pmd_t pmd)
* the _PAGE_PSE flag will remain set at all times while the
* _PAGE_PRESENT bit is clear).
*/
- return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
+ return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE |
+ _PAGE_NUMA);
}
static inline int pmd_none(pmd_t pmd)
@@ -479,6 +487,11 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
static inline int pmd_bad(pmd_t pmd)
{
+#ifdef CONFIG_NUMA_BALANCING
+ /* pmd_numa check */
+ if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)
+ return 0;
+#endif
return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
}
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index ec8a1fc9505..3c32db8c539 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -64,6 +64,26 @@
#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+/*
+ * _PAGE_NUMA indicates that this page will trigger a numa hinting
+ * minor page fault to gather numa placement statistics (see
+ * pte_numa()). The bit picked (8) is within the range between
+ * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't
+ * require changes to the swp entry format because that bit is always
+ * zero when the pte is not present.
+ *
+ * The bit picked must be always zero when the pmd is present and not
+ * present, so that we don't lose information when we set it while
+ * atomically clearing the present bit.
+ *
+ * Because we shared the same bit (8) with _PAGE_PROTNONE this can be
+ * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE
+ * couldn't reach, like handle_mm_fault() (see access_error in
+ * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for
+ * handle_mm_fault() to be invoked).
+ */
+#define _PAGE_NUMA _PAGE_PROTNONE
+
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_DIRTY)
#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 3a3e8c9e280..9a907a67be8 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -145,19 +145,6 @@ static int addr_to_vsyscall_nr(unsigned long addr)
return nr;
}
-#ifdef CONFIG_SECCOMP
-static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr)
-{
- if (!seccomp_mode(&tsk->seccomp))
- return 0;
- task_pt_regs(tsk)->orig_ax = syscall_nr;
- task_pt_regs(tsk)->ax = syscall_nr;
- return __secure_computing(syscall_nr);
-}
-#else
-#define vsyscall_seccomp(_tsk, _nr) 0
-#endif
-
static bool write_ok_or_segv(unsigned long ptr, size_t size)
{
/*
@@ -190,10 +177,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
{
struct task_struct *tsk;
unsigned long caller;
- int vsyscall_nr;
+ int vsyscall_nr, syscall_nr, tmp;
int prev_sig_on_uaccess_error;
long ret;
- int skip;
/*
* No point in checking CS -- the only way to get here is a user mode
@@ -225,56 +211,84 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
}
tsk = current;
- /*
- * With a real vsyscall, page faults cause SIGSEGV. We want to
- * preserve that behavior to make writing exploits harder.
- */
- prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
- current_thread_info()->sig_on_uaccess_error = 1;
/*
+ * Check for access_ok violations and find the syscall nr.
+ *
* NULL is a valid user pointer (in the access_ok sense) on 32-bit and
* 64-bit, so we don't need to special-case it here. For all the
* vsyscalls, NULL means "don't write anything" not "write it at
* address 0".
*/
- ret = -EFAULT;
- skip = 0;
switch (vsyscall_nr) {
case 0:
- skip = vsyscall_seccomp(tsk, __NR_gettimeofday);
- if (skip)
- break;
-
if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
- !write_ok_or_segv(regs->si, sizeof(struct timezone)))
- break;
+ !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
+ ret = -EFAULT;
+ goto check_fault;
+ }
+
+ syscall_nr = __NR_gettimeofday;
+ break;
+
+ case 1:
+ if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
+ ret = -EFAULT;
+ goto check_fault;
+ }
+
+ syscall_nr = __NR_time;
+ break;
+
+ case 2:
+ if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
+ !write_ok_or_segv(regs->si, sizeof(unsigned))) {
+ ret = -EFAULT;
+ goto check_fault;
+ }
+
+ syscall_nr = __NR_getcpu;
+ break;
+ }
+
+ /*
+ * Handle seccomp. regs->ip must be the original value.
+ * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt.
+ *
+ * We could optimize the seccomp disabled case, but performance
+ * here doesn't matter.
+ */
+ regs->orig_ax = syscall_nr;
+ regs->ax = -ENOSYS;
+ tmp = secure_computing(syscall_nr);
+ if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
+ warn_bad_vsyscall(KERN_DEBUG, regs,
+ "seccomp tried to change syscall nr or ip");
+ do_exit(SIGSYS);
+ }
+ if (tmp)
+ goto do_ret; /* skip requested */
+ /*
+ * With a real vsyscall, page faults cause SIGSEGV. We want to
+ * preserve that behavior to make writing exploits harder.
+ */
+ prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
+ current_thread_info()->sig_on_uaccess_error = 1;
+
+ ret = -EFAULT;
+ switch (vsyscall_nr) {
+ case 0:
ret = sys_gettimeofday(
(struct timeval __user *)regs->di,
(struct timezone __user *)regs->si);
break;
case 1:
- skip = vsyscall_seccomp(tsk, __NR_time);
- if (skip)
- break;
-
- if (!write_ok_or_segv(regs->di, sizeof(time_t)))
- break;
-
ret = sys_time((time_t __user *)regs->di);
break;
case 2:
- skip = vsyscall_seccomp(tsk, __NR_getcpu);
- if (skip)
- break;
-
- if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
- !write_ok_or_segv(regs->si, sizeof(unsigned)))
- break;
-
ret = sys_getcpu((unsigned __user *)regs->di,
(unsigned __user *)regs->si,
NULL);
@@ -283,12 +297,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
- if (skip) {
- if ((long)regs->ax <= 0L) /* seccomp errno emulation */
- goto do_ret;
- goto done; /* seccomp trace/trap */
- }
-
+check_fault:
if (ret == -EFAULT) {
/* Bad news -- userspace fed a bad pointer to a vsyscall. */
warn_bad_vsyscall(KERN_INFO, regs,
@@ -311,7 +320,6 @@ do_ret:
/* Emulate a ret instruction. */
regs->ip = caller;
regs->sp += 8;
-done:
return true;
sigsegv:
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 217eb705fac..e27fbf887f3 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -301,6 +301,13 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
free_page((unsigned long)pgd);
}
+/*
+ * Used to set accessed or dirty bits in the page table entries
+ * on other architectures. On x86, the accessed and dirty bits
+ * are tracked by hardware. However, do_wp_page calls this function
+ * to also make the pte writeable at the same time the dirty bit is
+ * set. In that case we do actually need to write the PTE.
+ */
int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty)
@@ -310,7 +317,6 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
if (changed && dirty) {
*ptep = entry;
pte_update_defer(vma->vm_mm, address, ptep);
- flush_tlb_page(vma, address);
}
return changed;
diff --git a/drivers/bus/Kconfig b/drivers/bus/Kconfig
index bbec35d21fe..0f51ed687dc 100644
--- a/drivers/bus/Kconfig
+++ b/drivers/bus/Kconfig
@@ -6,6 +6,7 @@ menu "Bus devices"
config OMAP_OCP2SCP
tristate "OMAP OCP2SCP DRIVER"
+ depends on ARCH_OMAP2PLUS
help
Driver to enable ocp2scp module which transforms ocp interface
protocol to scp protocol. In OMAP4, USB PHY is connected via
diff --git a/drivers/char/tpm/tpm_ibmvtpm.c b/drivers/char/tpm/tpm_ibmvtpm.c
index 7da840d487d..9978609d93b 100644
--- a/drivers/char/tpm/tpm_ibmvtpm.c
+++ b/drivers/char/tpm/tpm_ibmvtpm.c
@@ -38,8 +38,6 @@ static struct vio_device_id tpm_ibmvtpm_device_table[] = {
};
MODULE_DEVICE_TABLE(vio, tpm_ibmvtpm_device_table);
-DECLARE_WAIT_QUEUE_HEAD(wq);
-
/**
* ibmvtpm_send_crq - Send a CRQ request
* @vdev: vio device struct
@@ -83,6 +81,7 @@ static int tpm_ibmvtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count)
{
struct ibmvtpm_dev *ibmvtpm;
u16 len;
+ int sig;
ibmvtpm = (struct ibmvtpm_dev *)chip->vendor.data;
@@ -91,22 +90,23 @@ static int tpm_ibmvtpm_recv(struct tpm_chip *chip, u8 *buf, size_t count)
return 0;
}
- wait_event_interruptible(wq, ibmvtpm->crq_res.len != 0);
+ sig = wait_event_interruptible(ibmvtpm->wq, ibmvtpm->res_len != 0);
+ if (sig)
+ return -EINTR;
+
+ len = ibmvtpm->res_len;
- if (count < ibmvtpm->crq_res.len) {
+ if (count < len) {
dev_err(ibmvtpm->dev,
"Invalid size in recv: count=%ld, crq_size=%d\n",
- count, ibmvtpm->crq_res.len);
+ count, len);
return -EIO;
}
spin_lock(&ibmvtpm->rtce_lock);
- memcpy((void *)buf, (void *)ibmvtpm->rtce_buf, ibmvtpm->crq_res.len);
- memset(ibmvtpm->rtce_buf, 0, ibmvtpm->crq_res.len);
- ibmvtpm->crq_res.valid = 0;
- ibmvtpm->crq_res.msg = 0;
- len = ibmvtpm->crq_res.len;
- ibmvtpm->crq_res.len = 0;
+ memcpy((void *)buf, (void *)ibmvtpm->rtce_buf, len);
+ memset(ibmvtpm->rtce_buf, 0, len);
+ ibmvtpm->res_len = 0;
spin_unlock(&ibmvtpm->rtce_lock);
return len;
}
@@ -273,7 +273,6 @@ static int tpm_ibmvtpm_remove(struct vio_dev *vdev)
int rc = 0;
free_irq(vdev->irq, ibmvtpm);
- tasklet_kill(&ibmvtpm->tasklet);
do {
if (rc)
@@ -372,7 +371,6 @@ static int ibmvtpm_reset_crq(struct ibmvtpm_dev *ibmvtpm)
static int tpm_ibmvtpm_resume(struct device *dev)
{
struct ibmvtpm_dev *ibmvtpm = ibmvtpm_get_data(dev);
- unsigned long flags;
int rc = 0;
do {
@@ -387,10 +385,11 @@ static int tpm_ibmvtpm_resume(struct device *dev)
return rc;
}
- spin_lock_irqsave(&ibmvtpm->lock, flags);
- vio_disable_interrupts(ibmvtpm->vdev);
- tasklet_schedule(&ibmvtpm->tasklet);
- spin_unlock_irqrestore(&ibmvtpm->lock, flags);
+ rc = vio_enable_interrupts(ibmvtpm->vdev);
+ if (rc) {
+ dev_err(dev, "Error vio_enable_interrupts rc=%d\n", rc);
+ return rc;
+ }
rc = ibmvtpm_crq_send_init(ibmvtpm);
if (rc)
@@ -467,7 +466,7 @@ static struct ibmvtpm_crq *ibmvtpm_crq_get_next(struct ibmvtpm_dev *ibmvtpm)
if (crq->valid & VTPM_MSG_RES) {
if (++crq_q->index == crq_q->num_entry)
crq_q->index = 0;
- rmb();
+ smp_rmb();
} else
crq = NULL;
return crq;
@@ -535,11 +534,9 @@ static void ibmvtpm_crq_process(struct ibmvtpm_crq *crq,
ibmvtpm->vtpm_version = crq->data;
return;
case VTPM_TPM_COMMAND_RES:
- ibmvtpm->crq_res.valid = crq->valid;
- ibmvtpm->crq_res.msg = crq->msg;
- ibmvtpm->crq_res.len = crq->len;
- ibmvtpm->crq_res.data = crq->data;
- wake_up_interruptible(&wq);
+ /* len of the data in rtce buffer */
+ ibmvtpm->res_len = crq->len;
+ wake_up_interruptible(&ibmvtpm->wq);
return;
default:
return;
@@ -559,38 +556,19 @@ static void ibmvtpm_crq_process(struct ibmvtpm_crq *crq,
static irqreturn_t ibmvtpm_interrupt(int irq, void *vtpm_instance)
{
struct ibmvtpm_dev *ibmvtpm = (struct ibmvtpm_dev *) vtpm_instance;
- unsigned long flags;
-
- spin_lock_irqsave(&ibmvtpm->lock, flags);
- vio_disable_interrupts(ibmvtpm->vdev);
- tasklet_schedule(&ibmvtpm->tasklet);
- spin_unlock_irqrestore(&ibmvtpm->lock, flags);
-
- return IRQ_HANDLED;
-}
-
-/**
- * ibmvtpm_tasklet - Interrupt handler tasklet
- * @data: ibm vtpm device struct
- *
- * Returns:
- * Nothing
- **/
-static void ibmvtpm_tasklet(void *data)
-{
- struct ibmvtpm_dev *ibmvtpm = data;
struct ibmvtpm_crq *crq;
- unsigned long flags;
- spin_lock_irqsave(&ibmvtpm->lock, flags);
+ /* while loop is needed for initial setup (get version and
+ * get rtce_size). There should be only one tpm request at any
+ * given time.
+ */
while ((crq = ibmvtpm_crq_get_next(ibmvtpm)) != NULL) {
ibmvtpm_crq_process(crq, ibmvtpm);
crq->valid = 0;
- wmb();
+ smp_wmb();
}
- vio_enable_interrupts(ibmvtpm->vdev);
- spin_unlock_irqrestore(&ibmvtpm->lock, flags);
+ return IRQ_HANDLED;
}
/**
@@ -650,9 +628,6 @@ static int tpm_ibmvtpm_probe(struct vio_dev *vio_dev,
goto reg_crq_cleanup;
}
- tasklet_init(&ibmvtpm->tasklet, (void *)ibmvtpm_tasklet,
- (unsigned long)ibmvtpm);
-
rc = request_irq(vio_dev->irq, ibmvtpm_interrupt, 0,
tpm_ibmvtpm_driver_name, ibmvtpm);
if (rc) {
@@ -666,13 +641,14 @@ static int tpm_ibmvtpm_probe(struct vio_dev *vio_dev,
goto init_irq_cleanup;
}
+ init_waitqueue_head(&ibmvtpm->wq);
+
crq_q->index = 0;
ibmvtpm->dev = dev;
ibmvtpm->vdev = vio_dev;
chip->vendor.data = (void *)ibmvtpm;
- spin_lock_init(&ibmvtpm->lock);
spin_lock_init(&ibmvtpm->rtce_lock);
rc = ibmvtpm_crq_send_init(ibmvtpm);
@@ -689,7 +665,6 @@ static int tpm_ibmvtpm_probe(struct vio_dev *vio_dev,
return rc;
init_irq_cleanup:
- tasklet_kill(&ibmvtpm->tasklet);
do {
rc1 = plpar_hcall_norets(H_FREE_CRQ, vio_dev->unit_address);
} while (rc1 == H_BUSY || H_IS_LONG_BUSY(rc1));
diff --git a/drivers/char/tpm/tpm_ibmvtpm.h b/drivers/char/tpm/tpm_ibmvtpm.h
index 4296eb4b4d8..bd82a791f99 100644
--- a/drivers/char/tpm/tpm_ibmvtpm.h
+++ b/drivers/char/tpm/tpm_ibmvtpm.h
@@ -38,13 +38,12 @@ struct ibmvtpm_dev {
struct vio_dev *vdev;
struct ibmvtpm_crq_queue crq_queue;
dma_addr_t crq_dma_handle;
- spinlock_t lock;
- struct tasklet_struct tasklet;
u32 rtce_size;
void __iomem *rtce_buf;
dma_addr_t rtce_dma_handle;
spinlock_t rtce_lock;
- struct ibmvtpm_crq crq_res;
+ wait_queue_head_t wq;
+ u16 res_len;
u32 vtpm_version;
};
diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig
index 77629d33f03..febead4bf8a 100644
--- a/drivers/input/keyboard/Kconfig
+++ b/drivers/input/keyboard/Kconfig
@@ -544,6 +544,7 @@ config KEYBOARD_OMAP
config KEYBOARD_OMAP4
tristate "TI OMAP4+ keypad support"
+ depends on ARCH_OMAP2PLUS
select INPUT_MATRIXKMAP
help
Say Y here if you want to use the OMAP4+ keypad.
diff --git a/drivers/usb/phy/Kconfig b/drivers/usb/phy/Kconfig
index 7eb73c561bd..5de6e7f39f9 100644
--- a/drivers/usb/phy/Kconfig
+++ b/drivers/usb/phy/Kconfig
@@ -6,6 +6,7 @@ comment "USB Physical Layer drivers"
config OMAP_USB2
tristate "OMAP USB2 PHY Driver"
+ depends on ARCH_OMAP2PLUS
select USB_OTG_UTILS
help
Enable this to support the transceiver that is part of SOC. This
diff --git a/drivers/video/omap2/Kconfig b/drivers/video/omap2/Kconfig
index 346d67d6cf4..b07b2b042e7 100644
--- a/drivers/video/omap2/Kconfig
+++ b/drivers/video/omap2/Kconfig
@@ -1,6 +1,10 @@
config OMAP2_VRFB
bool
+if ARCH_OMAP2PLUS
+
source "drivers/video/omap2/dss/Kconfig"
source "drivers/video/omap2/omapfb/Kconfig"
source "drivers/video/omap2/displays/Kconfig"
+
+endif
diff --git a/drivers/w1/masters/Kconfig b/drivers/w1/masters/Kconfig
index c433a746e3f..e8ca63a82b9 100644
--- a/drivers/w1/masters/Kconfig
+++ b/drivers/w1/masters/Kconfig
@@ -60,6 +60,7 @@ config W1_MASTER_GPIO
config HDQ_MASTER_OMAP
tristate "OMAP HDQ driver"
+ depends on ARCH_OMAP
help
Say Y here if you want support for the 1-wire or HDQ Interface
on an OMAP processor.
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 58db6df866e..af47e759446 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -338,9 +338,8 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
enum dma_data_direction dir,
struct dma_attrs *attrs)
{
- phys_addr_t phys = page_to_phys(page) + offset;
+ phys_addr_t map, phys = page_to_phys(page) + offset;
dma_addr_t dev_addr = xen_phys_to_bus(phys);
- void *map;
BUG_ON(dir == DMA_NONE);
/*
@@ -356,10 +355,10 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
* Oh well, have to allocate and map a bounce buffer.
*/
map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir);
- if (!map)
+ if (map == SWIOTLB_MAP_ERROR)
return DMA_ERROR_CODE;
- dev_addr = xen_virt_to_bus(map);
+ dev_addr = xen_phys_to_bus(map);
/*
* Ensure that the address returned is DMA'ble
@@ -389,7 +388,7 @@ static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
/* NOTE: We use dev_addr here, not paddr! */
if (is_xen_swiotlb_buffer(dev_addr)) {
- swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir);
+ swiotlb_tbl_unmap_single(hwdev, paddr, size, dir);
return;
}
@@ -434,8 +433,7 @@ xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
/* NOTE: We use dev_addr here, not paddr! */
if (is_xen_swiotlb_buffer(dev_addr)) {
- swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir,
- target);
+ swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target);
return;
}
@@ -494,11 +492,12 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
if (swiotlb_force ||
!dma_capable(hwdev, dev_addr, sg->length) ||
range_straddles_page_boundary(paddr, sg->length)) {
- void *map = swiotlb_tbl_map_single(hwdev,
- start_dma_addr,
- sg_phys(sg),
- sg->length, dir);
- if (!map) {
+ phys_addr_t map = swiotlb_tbl_map_single(hwdev,
+ start_dma_addr,
+ sg_phys(sg),
+ sg->length,
+ dir);
+ if (map == SWIOTLB_MAP_ERROR) {
/* Don't panic here, we expect map_sg users
to do proper error handling. */
xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir,
@@ -506,7 +505,7 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
sgl[0].dma_length = 0;
return DMA_ERROR_CODE;
}
- sg->dma_address = xen_virt_to_bus(map);
+ sg->dma_address = xen_phys_to_bus(map);
} else
sg->dma_address = dev_addr;
sg->dma_length = sg->length;
diff --git a/fs/Kconfig b/fs/Kconfig
index f95ae3a027f..eaff24a1950 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -28,8 +28,8 @@ config FS_MBCACHE
tristate
default y if EXT2_FS=y && EXT2_FS_XATTR
default y if EXT3_FS=y && EXT3_FS_XATTR
- default y if EXT4_FS=y && EXT4_FS_XATTR
- default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR
+ default y if EXT4_FS=y
+ default m if EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS
source "fs/reiserfs/Kconfig"
source "fs/jfs/Kconfig"
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 75c1ee69914..5cbd00e7406 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -346,19 +346,15 @@ init_cifs_idmap(void)
if (!cred)
return -ENOMEM;
- keyring = key_alloc(&key_type_keyring, ".cifs_idmap", 0, 0, cred,
- (KEY_POS_ALL & ~KEY_POS_SETATTR) |
- KEY_USR_VIEW | KEY_USR_READ,
- KEY_ALLOC_NOT_IN_QUOTA);
+ keyring = keyring_alloc(".cifs_idmap", 0, 0, cred,
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW | KEY_USR_READ,
+ KEY_ALLOC_NOT_IN_QUOTA, NULL);
if (IS_ERR(keyring)) {
ret = PTR_ERR(keyring);
goto failed_put_cred;
}
- ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
- if (ret < 0)
- goto failed_put_key;
-
ret = register_key_type(&cifs_idmap_key_type);
if (ret < 0)
goto failed_put_key;
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index c22f17021b6..0a475c88185 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -39,22 +39,8 @@ config EXT4_USE_FOR_EXT23
compiled kernel size by using one file system driver for
ext2, ext3, and ext4 file systems.
-config EXT4_FS_XATTR
- bool "Ext4 extended attributes"
- depends on EXT4_FS
- default y
- help
- Extended attributes are name:value pairs associated with inodes by
- the kernel or by users (see the attr(5) manual page, or visit
- <http://acl.bestbits.at/> for details).
-
- If unsure, say N.
-
- You need this for POSIX ACL support on ext4.
-
config EXT4_FS_POSIX_ACL
bool "Ext4 POSIX Access Control Lists"
- depends on EXT4_FS_XATTR
select FS_POSIX_ACL
help
POSIX Access Control Lists (ACLs) support permissions for users and
@@ -67,7 +53,6 @@ config EXT4_FS_POSIX_ACL
config EXT4_FS_SECURITY
bool "Ext4 Security Labels"
- depends on EXT4_FS_XATTR
help
Security labels support alternative access control models
implemented by security modules like SELinux. This option
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 56fd8f86593..0310fec2ee3 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -7,8 +7,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
- mmp.o indirect.o
+ mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
+ xattr_trusted.o inline.o
-ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index d3c5b88fd89..e6e0d988439 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -423,8 +423,10 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
retry:
handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
+ goto release_and_out;
+ }
error = ext4_set_acl(handle, inode, type, acl);
ext4_journal_stop(handle);
if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 8e07d2a5a13..b8d877f6c1f 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -27,23 +27,11 @@
#include <linux/slab.h>
#include <linux/rbtree.h>
#include "ext4.h"
-
-static unsigned char ext4_filetype_table[] = {
- DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
-};
+#include "xattr.h"
static int ext4_dx_readdir(struct file *filp,
void *dirent, filldir_t filldir);
-static unsigned char get_dtype(struct super_block *sb, int filetype)
-{
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
- (filetype >= EXT4_FT_MAX))
- return DT_UNKNOWN;
-
- return (ext4_filetype_table[filetype]);
-}
-
/**
* Check if the given dir-inode refers to an htree-indexed directory
* (or a directory which chould potentially get coverted to use htree
@@ -68,11 +56,14 @@ static int is_dx_dir(struct inode *inode)
* Return 0 if the directory entry is OK, and 1 if there is a problem
*
* Note: this is the opposite of what ext2 and ext3 historically returned...
+ *
+ * bh passed here can be an inode block or a dir data block, depending
+ * on the inode inline data flag.
*/
int __ext4_check_dir_entry(const char *function, unsigned int line,
struct inode *dir, struct file *filp,
struct ext4_dir_entry_2 *de,
- struct buffer_head *bh,
+ struct buffer_head *bh, char *buf, int size,
unsigned int offset)
{
const char *error_msg = NULL;
@@ -85,9 +76,8 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
error_msg = "rec_len % 4 != 0";
else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
error_msg = "rec_len is too small for name_len";
- else if (unlikely(((char *) de - bh->b_data) + rlen >
- dir->i_sb->s_blocksize))
- error_msg = "directory entry across blocks";
+ else if (unlikely(((char *) de - buf) + rlen > size))
+ error_msg = "directory entry across range";
else if (unlikely(le32_to_cpu(de->inode) >
le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
error_msg = "inode out of bounds";
@@ -98,14 +88,14 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
ext4_error_file(filp, function, line, bh->b_blocknr,
"bad entry in directory: %s - offset=%u(%u), "
"inode=%u, rec_len=%d, name_len=%d",
- error_msg, (unsigned) (offset % bh->b_size),
+ error_msg, (unsigned) (offset % size),
offset, le32_to_cpu(de->inode),
rlen, de->name_len);
else
ext4_error_inode(dir, function, line, bh->b_blocknr,
"bad entry in directory: %s - offset=%u(%u), "
"inode=%u, rec_len=%d, name_len=%d",
- error_msg, (unsigned) (offset % bh->b_size),
+ error_msg, (unsigned) (offset % size),
offset, le32_to_cpu(de->inode),
rlen, de->name_len);
@@ -125,6 +115,14 @@ static int ext4_readdir(struct file *filp,
int ret = 0;
int dir_has_error = 0;
+ if (ext4_has_inline_data(inode)) {
+ int has_inline_data = 1;
+ ret = ext4_read_inline_dir(filp, dirent, filldir,
+ &has_inline_data);
+ if (has_inline_data)
+ return ret;
+ }
+
if (is_dx_dir(inode)) {
err = ext4_dx_readdir(filp, dirent, filldir);
if (err != ERR_BAD_DX_DIR) {
@@ -221,8 +219,9 @@ revalidate:
while (!error && filp->f_pos < inode->i_size
&& offset < sb->s_blocksize) {
de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
- if (ext4_check_dir_entry(inode, filp, de,
- bh, offset)) {
+ if (ext4_check_dir_entry(inode, filp, de, bh,
+ bh->b_data, bh->b_size,
+ offset)) {
/*
* On error, skip the f_pos to the next block
*/
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index df163da388c..8462eb3c33a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -57,6 +57,16 @@
#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
+/*
+ * Turn on EXT_DEBUG to get lots of info about extents operations.
+ */
+#define EXT_DEBUG__
+#ifdef EXT_DEBUG
+#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__)
+#else
+#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
+#endif
+
#define EXT4_ERROR_INODE(inode, fmt, a...) \
ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)
@@ -392,6 +402,7 @@ struct flex_groups {
#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
+#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
@@ -448,28 +459,26 @@ enum {
EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
+ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */
EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
};
-#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
-#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
- printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
- EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
-
-/*
- * Since it's pretty easy to mix up bit numbers and hex values, and we
- * can't do a compile-time test for ENUM values, we use a run-time
- * test to make sure that EXT4_XXX_FL is consistent with respect to
- * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop
- * out so it won't cost any extra space in the compiled kernel image.
- * But it's important that these values are the same, since we are
- * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
- * must be consistent with the values of FS_XXX_FL defined in
- * include/linux/fs.h and the on-disk values found in ext2, ext3, and
- * ext4 filesystems, and of course the values defined in e2fsprogs.
+/*
+ * Since it's pretty easy to mix up bit numbers and hex values, we use a
+ * build-time check to make sure that EXT4_XXX_FL is consistent with respect to
+ * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost
+ * any extra space in the compiled kernel image, otherwise, the build will fail.
+ * It's important that these values are the same, since we are using
+ * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent
+ * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk
+ * values found in ext2, ext3 and ext4 filesystems, and of course the values
+ * defined in e2fsprogs.
*
* It's not paranoia if the Murphy's Law really *is* out to get you. :-)
*/
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
+#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG))
+
static inline void ext4_check_flag_values(void)
{
CHECK_FLAG_VALUE(SECRM);
@@ -494,6 +503,7 @@ static inline void ext4_check_flag_values(void)
CHECK_FLAG_VALUE(EXTENTS);
CHECK_FLAG_VALUE(EA_INODE);
CHECK_FLAG_VALUE(EOFBLOCKS);
+ CHECK_FLAG_VALUE(INLINE_DATA);
CHECK_FLAG_VALUE(RESERVED);
}
@@ -811,6 +821,8 @@ struct ext4_ext_cache {
__u32 ec_len; /* must be 32bit to return holes */
};
+#include "extents_status.h"
+
/*
* fourth extended file system inode data in memory
*/
@@ -833,7 +845,6 @@ struct ext4_inode_info {
#endif
unsigned long i_flags;
-#ifdef CONFIG_EXT4_FS_XATTR
/*
* Extended attributes can be read independently of the main file
* data. Taking i_mutex even when reading would cause contention
@@ -842,7 +853,6 @@ struct ext4_inode_info {
* EAs.
*/
struct rw_semaphore xattr_sem;
-#endif
struct list_head i_orphan; /* unlinked but open inodes */
@@ -888,6 +898,10 @@ struct ext4_inode_info {
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;
+ /* extents status tree */
+ struct ext4_es_tree i_es_tree;
+ rwlock_t i_es_lock;
+
/* ialloc */
ext4_group_t i_last_alloc_group;
@@ -902,6 +916,10 @@ struct ext4_inode_info {
/* on-disk additional length */
__u16 i_extra_isize;
+ /* Indicate the inline data space. */
+ u16 i_inline_off;
+ u16 i_inline_size;
+
#ifdef CONFIG_QUOTA
/* quota space reservation, managed internally by quota code */
qsize_t i_reserved_quota;
@@ -1360,6 +1378,7 @@ enum {
EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */
EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read
nolocking */
+ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -1481,7 +1500,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */
#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */
-#define EXT4_FEATURE_INCOMPAT_INLINEDATA 0x8000 /* data in inode */
+#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */
#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1505,7 +1524,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
EXT4_FEATURE_INCOMPAT_EXTENTS| \
EXT4_FEATURE_INCOMPAT_64BIT| \
EXT4_FEATURE_INCOMPAT_FLEX_BG| \
- EXT4_FEATURE_INCOMPAT_MMP)
+ EXT4_FEATURE_INCOMPAT_MMP | \
+ EXT4_FEATURE_INCOMPAT_INLINE_DATA)
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1592,6 +1612,11 @@ struct ext4_dir_entry_tail {
__le32 det_checksum; /* crc32c(uuid+inum+dirblock) */
};
+#define EXT4_DIRENT_TAIL(block, blocksize) \
+ ((struct ext4_dir_entry_tail *)(((void *)(block)) + \
+ ((blocksize) - \
+ sizeof(struct ext4_dir_entry_tail))))
+
/*
* Ext4 directory file types. Only the low 3 bits are used. The
* other bits are reserved for now.
@@ -1936,14 +1961,42 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
struct file *,
struct ext4_dir_entry_2 *,
- struct buffer_head *, unsigned int);
-#define ext4_check_dir_entry(dir, filp, de, bh, offset) \
+ struct buffer_head *, char *, int,
+ unsigned int);
+#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \
unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
- (de), (bh), (offset)))
+ (de), (bh), (buf), (size), (offset)))
extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
__u32 minor_hash,
struct ext4_dir_entry_2 *dirent);
extern void ext4_htree_free_dir_info(struct dir_private_info *p);
+extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
+ struct buffer_head *bh,
+ void *buf, int buf_size,
+ const char *name, int namelen,
+ struct ext4_dir_entry_2 **dest_de);
+void ext4_insert_dentry(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int buf_size,
+ const char *name, int namelen);
+static inline void ext4_update_dx_flag(struct inode *inode)
+{
+ if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_COMPAT_DIR_INDEX))
+ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+}
+static unsigned char ext4_filetype_table[] = {
+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+};
+
+static inline unsigned char get_dtype(struct super_block *sb, int filetype)
+{
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
+ (filetype >= EXT4_FT_MAX))
+ return DT_UNKNOWN;
+
+ return ext4_filetype_table[filetype];
+}
/* fsync.c */
extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
@@ -1994,8 +2047,23 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *,
ext4_lblk_t, int, int *);
struct buffer_head *ext4_bread(handle_t *, struct inode *,
ext4_lblk_t, int, int *);
+int ext4_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create);
+int ext4_walk_page_buffers(handle_t *handle,
+ struct buffer_head *head,
+ unsigned from,
+ unsigned to,
+ int *partial,
+ int (*fn)(handle_t *handle,
+ struct buffer_head *bh));
+int do_journal_get_write_access(handle_t *handle,
+ struct buffer_head *bh);
+#define FALL_BACK_TO_NONDELALLOC 1
+#define CONVERT_INLINE_DATA 2
extern struct inode *ext4_iget(struct super_block *, unsigned long);
extern int ext4_write_inode(struct inode *, struct writeback_control *);
@@ -2050,6 +2118,20 @@ extern int ext4_orphan_add(handle_t *, struct inode *);
extern int ext4_orphan_del(handle_t *, struct inode *);
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
__u32 start_minor_hash, __u32 *next_hash);
+extern int search_dir(struct buffer_head *bh,
+ char *search_buf,
+ int buf_size,
+ struct inode *dir,
+ const struct qstr *d_name,
+ unsigned int offset,
+ struct ext4_dir_entry_2 **res_dir);
+extern int ext4_generic_delete_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh,
+ void *entry_buf,
+ int buf_size,
+ int csum_size);
/* resize.c */
extern int ext4_group_add(struct super_block *sb,
@@ -2376,6 +2458,15 @@ extern void ext4_unwritten_wait(struct inode *inode);
extern const struct inode_operations ext4_dir_inode_operations;
extern const struct inode_operations ext4_special_inode_operations;
extern struct dentry *ext4_get_parent(struct dentry *child);
+extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int blocksize, int csum_size,
+ unsigned int parent_ino, int dotdot_real_len);
+extern void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
+ unsigned int blocksize);
+extern int ext4_handle_dirty_dirent_node(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh);
/* symlink.c */
extern const struct inode_operations ext4_symlink_inode_operations;
@@ -2393,6 +2484,9 @@ extern int ext4_check_blockref(const char *, unsigned int,
struct inode *, __le32 *, unsigned int);
/* extents.c */
+struct ext4_ext_path;
+struct ext4_extent;
+
extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
@@ -2410,8 +2504,27 @@ extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
ssize_t len);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
+extern int ext4_ext_calc_metadata_amount(struct inode *inode,
+ ext4_lblk_t lblocks);
+extern int ext4_extent_tree_init(handle_t *, struct inode *);
+extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
+ int num,
+ struct ext4_ext_path *path);
+extern int ext4_can_extents_be_merged(struct inode *inode,
+ struct ext4_extent *ex1,
+ struct ext4_extent *ex2);
+extern int ext4_ext_insert_extent(handle_t *, struct inode *,
+ struct ext4_ext_path *,
+ struct ext4_extent *, int);
+extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
+ struct ext4_ext_path *);
+extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern int ext4_ext_check_inode(struct inode *inode);
+extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
+
+
/* move_extent.c */
extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
__u64 start_orig, __u64 start_donor,
@@ -2445,14 +2558,10 @@ enum ext4_state_bits {
* never, ever appear in a buffer_head's state
* flag. See EXT4_MAP_FROM_CLUSTER to see where
* this is used. */
- BH_Da_Mapped, /* Delayed allocated block that now has a mapping. This
- * flag is set when ext4_map_blocks is called on a
- * delayed allocated block to get its real mapping. */
};
BUFFER_FNS(Uninit, uninit)
TAS_BUFFER_FNS(Uninit, uninit)
-BUFFER_FNS(Da_Mapped, da_mapped)
/*
* Add new method to test whether block and inode bitmaps are properly
@@ -2503,6 +2612,4 @@ extern void ext4_resize_end(struct super_block *sb);
#endif /* __KERNEL__ */
-#include "ext4_extents.h"
-
#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index cb1b2c91996..487fda12bc0 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -43,16 +43,6 @@
#define CHECK_BINSEARCH__
/*
- * Turn on EXT_DEBUG to get lots of info about extents operations.
- */
-#define EXT_DEBUG__
-#ifdef EXT_DEBUG
-#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__)
-#else
-#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
-#endif
-
-/*
* If EXT_STATS is defined then stats numbers are collected.
* These number will be displayed at umount time.
*/
@@ -144,20 +134,6 @@ struct ext4_ext_path {
*/
/*
- * to be called by ext4_ext_walk_space()
- * negative retcode - error
- * positive retcode - signal for ext4_ext_walk_space(), see below
- * callback must return valid extent (passed or newly created)
- */
-typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t,
- struct ext4_ext_cache *,
- struct ext4_extent *, void *);
-
-#define EXT_CONTINUE 0
-#define EXT_BREAK 1
-#define EXT_REPEAT 2
-
-/*
* Maximum number of logical blocks in a file; ext4_extent's ee_block is
* __le32.
*/
@@ -300,21 +276,5 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
0xffff);
}
-extern int ext4_ext_calc_metadata_amount(struct inode *inode,
- ext4_lblk_t lblocks);
-extern int ext4_extent_tree_init(handle_t *, struct inode *);
-extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
- int num,
- struct ext4_ext_path *path);
-extern int ext4_can_extents_be_merged(struct inode *inode,
- struct ext4_extent *ex1,
- struct ext4_extent *ex2);
-extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
-extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
- struct ext4_ext_path *);
-extern void ext4_ext_drop_refs(struct ext4_ext_path *);
-extern int ext4_ext_check_inode(struct inode *inode);
-extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
- int search_hint_reverse);
#endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 56d258c1830..7177f9b21cb 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -254,13 +254,6 @@ static inline void ext4_handle_sync(handle_t *handle)
handle->h_sync = 1;
}
-static inline void ext4_handle_release_buffer(handle_t *handle,
- struct buffer_head *bh)
-{
- if (ext4_handle_valid(handle))
- jbd2_journal_release_buffer(handle, bh);
-}
-
static inline int ext4_handle_is_aborted(handle_t *handle)
{
if (ext4_handle_valid(handle))
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7011ac96720..26af22832a8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -41,6 +41,8 @@
#include <asm/uaccess.h>
#include <linux/fiemap.h>
#include "ext4_jbd2.h"
+#include "ext4_extents.h"
+#include "xattr.h"
#include <trace/events/ext4.h>
@@ -109,6 +111,9 @@ static int ext4_split_extent_at(handle_t *handle,
int split_flag,
int flags);
+static int ext4_find_delayed_extent(struct inode *inode,
+ struct ext4_ext_cache *newex);
+
static int ext4_ext_truncate_extend_restart(handle_t *handle,
struct inode *inode,
int needed)
@@ -1959,27 +1964,33 @@ cleanup:
return err;
}
-static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
- ext4_lblk_t num, ext_prepare_callback func,
- void *cbdata)
+static int ext4_fill_fiemap_extents(struct inode *inode,
+ ext4_lblk_t block, ext4_lblk_t num,
+ struct fiemap_extent_info *fieinfo)
{
struct ext4_ext_path *path = NULL;
- struct ext4_ext_cache cbex;
+ struct ext4_ext_cache newex;
struct ext4_extent *ex;
- ext4_lblk_t next, start = 0, end = 0;
+ ext4_lblk_t next, next_del, start = 0, end = 0;
ext4_lblk_t last = block + num;
- int depth, exists, err = 0;
-
- BUG_ON(func == NULL);
- BUG_ON(inode == NULL);
+ int exists, depth = 0, err = 0;
+ unsigned int flags = 0;
+ unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
while (block < last && block != EXT_MAX_BLOCKS) {
num = last - block;
/* find extent for this block */
down_read(&EXT4_I(inode)->i_data_sem);
+
+ if (path && ext_depth(inode) != depth) {
+ /* depth was changed. we have to realloc path */
+ kfree(path);
+ path = NULL;
+ }
+
path = ext4_ext_find_extent(inode, block, path);
- up_read(&EXT4_I(inode)->i_data_sem);
if (IS_ERR(path)) {
+ up_read(&EXT4_I(inode)->i_data_sem);
err = PTR_ERR(path);
path = NULL;
break;
@@ -1987,13 +1998,16 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
depth = ext_depth(inode);
if (unlikely(path[depth].p_hdr == NULL)) {
+ up_read(&EXT4_I(inode)->i_data_sem);
EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
err = -EIO;
break;
}
ex = path[depth].p_ext;
next = ext4_ext_next_allocated_block(path);
+ ext4_ext_drop_refs(path);
+ flags = 0;
exists = 0;
if (!ex) {
/* there is no extent yet, so try to allocate
@@ -2030,40 +2044,64 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
BUG_ON(end <= start);
if (!exists) {
- cbex.ec_block = start;
- cbex.ec_len = end - start;
- cbex.ec_start = 0;
+ newex.ec_block = start;
+ newex.ec_len = end - start;
+ newex.ec_start = 0;
} else {
- cbex.ec_block = le32_to_cpu(ex->ee_block);
- cbex.ec_len = ext4_ext_get_actual_len(ex);
- cbex.ec_start = ext4_ext_pblock(ex);
+ newex.ec_block = le32_to_cpu(ex->ee_block);
+ newex.ec_len = ext4_ext_get_actual_len(ex);
+ newex.ec_start = ext4_ext_pblock(ex);
+ if (ext4_ext_is_uninitialized(ex))
+ flags |= FIEMAP_EXTENT_UNWRITTEN;
}
- if (unlikely(cbex.ec_len == 0)) {
- EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
- err = -EIO;
- break;
+ /*
+ * Find delayed extent and update newex accordingly. We call
+ * it even in !exists case to find out whether newex is the
+ * last existing extent or not.
+ */
+ next_del = ext4_find_delayed_extent(inode, &newex);
+ if (!exists && next_del) {
+ exists = 1;
+ flags |= FIEMAP_EXTENT_DELALLOC;
}
- err = func(inode, next, &cbex, ex, cbdata);
- ext4_ext_drop_refs(path);
+ up_read(&EXT4_I(inode)->i_data_sem);
- if (err < 0)
+ if (unlikely(newex.ec_len == 0)) {
+ EXT4_ERROR_INODE(inode, "newex.ec_len == 0");
+ err = -EIO;
break;
+ }
- if (err == EXT_REPEAT)
- continue;
- else if (err == EXT_BREAK) {
- err = 0;
- break;
+ /* This is possible iff next == next_del == EXT_MAX_BLOCKS */
+ if (next == next_del) {
+ flags |= FIEMAP_EXTENT_LAST;
+ if (unlikely(next_del != EXT_MAX_BLOCKS ||
+ next != EXT_MAX_BLOCKS)) {
+ EXT4_ERROR_INODE(inode,
+ "next extent == %u, next "
+ "delalloc extent = %u",
+ next, next_del);
+ err = -EIO;
+ break;
+ }
}
- if (ext_depth(inode) != depth) {
- /* depth was changed. we have to realloc path */
- kfree(path);
- path = NULL;
+ if (exists) {
+ err = fiemap_fill_next_extent(fieinfo,
+ (__u64)newex.ec_block << blksize_bits,
+ (__u64)newex.ec_start << blksize_bits,
+ (__u64)newex.ec_len << blksize_bits,
+ flags);
+ if (err < 0)
+ break;
+ if (err == 1) {
+ err = 0;
+ break;
+ }
}
- block = cbex.ec_block + cbex.ec_len;
+ block = newex.ec_block + newex.ec_len;
}
if (path) {
@@ -2156,7 +2194,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
struct ext4_extent *ex)
{
struct ext4_ext_cache *cex;
- struct ext4_sb_info *sbi;
int ret = 0;
/*
@@ -2164,7 +2201,6 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
*/
spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
cex = &EXT4_I(inode)->i_cached_extent;
- sbi = EXT4_SB(inode->i_sb);
/* has cache valid data? */
if (cex->ec_len == 0)
@@ -2273,7 +2309,13 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
{
int index;
- int depth = ext_depth(inode);
+ int depth;
+
+ /* If we are converting the inline data, only one is needed here. */
+ if (ext4_has_inline_data(inode))
+ return 1;
+
+ depth = ext_depth(inode);
if (chunk)
index = depth * 2;
@@ -3461,115 +3503,34 @@ out:
/**
* ext4_find_delalloc_range: find delayed allocated block in the given range.
*
- * Goes through the buffer heads in the range [lblk_start, lblk_end] and returns
- * whether there are any buffers marked for delayed allocation. It returns '1'
- * on the first delalloc'ed buffer head found. If no buffer head in the given
- * range is marked for delalloc, it returns 0.
- * lblk_start should always be <= lblk_end.
- * search_hint_reverse is to indicate that searching in reverse from lblk_end to
- * lblk_start might be more efficient (i.e., we will likely hit the delalloc'ed
- * block sooner). This is useful when blocks are truncated sequentially from
- * lblk_start towards lblk_end.
+ * Return 1 if there is a delalloc block in the range, otherwise 0.
*/
static int ext4_find_delalloc_range(struct inode *inode,
ext4_lblk_t lblk_start,
- ext4_lblk_t lblk_end,
- int search_hint_reverse)
+ ext4_lblk_t lblk_end)
{
- struct address_space *mapping = inode->i_mapping;
- struct buffer_head *head, *bh = NULL;
- struct page *page;
- ext4_lblk_t i, pg_lblk;
- pgoff_t index;
-
- if (!test_opt(inode->i_sb, DELALLOC))
- return 0;
-
- /* reverse search wont work if fs block size is less than page size */
- if (inode->i_blkbits < PAGE_CACHE_SHIFT)
- search_hint_reverse = 0;
+ struct extent_status es;
- if (search_hint_reverse)
- i = lblk_end;
+ es.start = lblk_start;
+ ext4_es_find_extent(inode, &es);
+ if (es.len == 0)
+ return 0; /* there is no delay extent in this tree */
+ else if (es.start <= lblk_start && lblk_start < es.start + es.len)
+ return 1;
+ else if (lblk_start <= es.start && es.start <= lblk_end)
+ return 1;
else
- i = lblk_start;
-
- index = i >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
- while ((i >= lblk_start) && (i <= lblk_end)) {
- page = find_get_page(mapping, index);
- if (!page)
- goto nextpage;
-
- if (!page_has_buffers(page))
- goto nextpage;
-
- head = page_buffers(page);
- if (!head)
- goto nextpage;
-
- bh = head;
- pg_lblk = index << (PAGE_CACHE_SHIFT -
- inode->i_blkbits);
- do {
- if (unlikely(pg_lblk < lblk_start)) {
- /*
- * This is possible when fs block size is less
- * than page size and our cluster starts/ends in
- * middle of the page. So we need to skip the
- * initial few blocks till we reach the 'lblk'
- */
- pg_lblk++;
- continue;
- }
-
- /* Check if the buffer is delayed allocated and that it
- * is not yet mapped. (when da-buffers are mapped during
- * their writeout, their da_mapped bit is set.)
- */
- if (buffer_delay(bh) && !buffer_da_mapped(bh)) {
- page_cache_release(page);
- trace_ext4_find_delalloc_range(inode,
- lblk_start, lblk_end,
- search_hint_reverse,
- 1, i);
- return 1;
- }
- if (search_hint_reverse)
- i--;
- else
- i++;
- } while ((i >= lblk_start) && (i <= lblk_end) &&
- ((bh = bh->b_this_page) != head));
-nextpage:
- if (page)
- page_cache_release(page);
- /*
- * Move to next page. 'i' will be the first lblk in the next
- * page.
- */
- if (search_hint_reverse)
- index--;
- else
- index++;
- i = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- }
-
- trace_ext4_find_delalloc_range(inode, lblk_start, lblk_end,
- search_hint_reverse, 0, 0);
- return 0;
+ return 0;
}
-int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk,
- int search_hint_reverse)
+int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_lblk_t lblk_start, lblk_end;
lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
- return ext4_find_delalloc_range(inode, lblk_start, lblk_end,
- search_hint_reverse);
+ return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
}
/**
@@ -3630,7 +3591,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
lblk_to = lblk_from + c_offset - 1;
- if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
+ if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
allocated_clusters--;
}
@@ -3640,7 +3601,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
lblk_from = lblk_start + num_blks;
lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
- if (ext4_find_delalloc_range(inode, lblk_from, lblk_to, 0))
+ if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
allocated_clusters--;
}
@@ -3663,8 +3624,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
flags, allocated);
ext4_ext_show_leaf(inode, path);
- trace_ext4_ext_handle_uninitialized_extents(inode, map, allocated,
- newblock);
+ trace_ext4_ext_handle_uninitialized_extents(inode, map, flags,
+ allocated, newblock);
/* get_block() before submit the IO, split the extent */
if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
@@ -3911,7 +3872,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_extent newex, *ex, *ex2;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_fsblk_t newblock = 0;
- int free_on_err = 0, err = 0, depth, ret;
+ int free_on_err = 0, err = 0, depth;
unsigned int allocated = 0, offset = 0;
unsigned int allocated_clusters = 0;
struct ext4_allocation_request ar;
@@ -3927,7 +3888,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
if (!newex.ee_start_lo && !newex.ee_start_hi) {
if ((sbi->s_cluster_ratio > 1) &&
- ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
+ ext4_find_delalloc_cluster(inode, map->m_lblk))
map->m_flags |= EXT4_MAP_FROM_CLUSTER;
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -4007,15 +3968,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ee_len, ee_start);
goto out;
}
- ret = ext4_ext_handle_uninitialized_extents(
+ allocated = ext4_ext_handle_uninitialized_extents(
handle, inode, map, path, flags,
allocated, newblock);
- return ret;
+ goto out3;
}
}
if ((sbi->s_cluster_ratio > 1) &&
- ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
+ ext4_find_delalloc_cluster(inode, map->m_lblk))
map->m_flags |= EXT4_MAP_FROM_CLUSTER;
/*
@@ -4284,8 +4245,8 @@ out2:
kfree(path);
}
- trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
- newblock, map->m_len, err ? err : allocated);
+out3:
+ trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated);
return err ? err : allocated;
}
@@ -4344,6 +4305,8 @@ void ext4_ext_truncate(struct inode *inode)
last_block = (inode->i_size + sb->s_blocksize - 1)
>> EXT4_BLOCK_SIZE_BITS(sb);
+ err = ext4_es_remove_extent(inode, last_block,
+ EXT_MAX_BLOCKS - last_block);
err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
/* In a multi-transaction truncate, we only make the final
@@ -4434,6 +4397,10 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (mode & FALLOC_FL_PUNCH_HOLE)
return ext4_punch_hole(file, offset, len);
+ ret = ext4_convert_inline_data(inode);
+ if (ret)
+ return ret;
+
trace_ext4_fallocate_enter(inode, offset, len, mode);
map.m_lblk = offset >> blkbits;
/*
@@ -4572,206 +4539,43 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
}
/*
- * Callback function called for each extent to gather FIEMAP information.
+ * If newex is not existing extent (newex->ec_start equals zero) find
+ * delayed extent at start of newex and update newex accordingly and
+ * return start of the next delayed extent.
+ *
+ * If newex is existing extent (newex->ec_start is not equal zero)
+ * return start of next delayed extent or EXT_MAX_BLOCKS if no delayed
+ * extent found. Leave newex unmodified.
*/
-static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
- struct ext4_ext_cache *newex, struct ext4_extent *ex,
- void *data)
+static int ext4_find_delayed_extent(struct inode *inode,
+ struct ext4_ext_cache *newex)
{
- __u64 logical;
- __u64 physical;
- __u64 length;
- __u32 flags = 0;
- int ret = 0;
- struct fiemap_extent_info *fieinfo = data;
- unsigned char blksize_bits;
+ struct extent_status es;
+ ext4_lblk_t next_del;
- blksize_bits = inode->i_sb->s_blocksize_bits;
- logical = (__u64)newex->ec_block << blksize_bits;
+ es.start = newex->ec_block;
+ next_del = ext4_es_find_extent(inode, &es);
if (newex->ec_start == 0) {
/*
* No extent in extent-tree contains block @newex->ec_start,
* then the block may stay in 1)a hole or 2)delayed-extent.
- *
- * Holes or delayed-extents are processed as follows.
- * 1. lookup dirty pages with specified range in pagecache.
- * If no page is got, then there is no delayed-extent and
- * return with EXT_CONTINUE.
- * 2. find the 1st mapped buffer,
- * 3. check if the mapped buffer is both in the request range
- * and a delayed buffer. If not, there is no delayed-extent,
- * then return.
- * 4. a delayed-extent is found, the extent will be collected.
*/
- ext4_lblk_t end = 0;
- pgoff_t last_offset;
- pgoff_t offset;
- pgoff_t index;
- pgoff_t start_index = 0;
- struct page **pages = NULL;
- struct buffer_head *bh = NULL;
- struct buffer_head *head = NULL;
- unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
-
- pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (pages == NULL)
- return -ENOMEM;
-
- offset = logical >> PAGE_SHIFT;
-repeat:
- last_offset = offset;
- head = NULL;
- ret = find_get_pages_tag(inode->i_mapping, &offset,
- PAGECACHE_TAG_DIRTY, nr_pages, pages);
-
- if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
- /* First time, try to find a mapped buffer. */
- if (ret == 0) {
-out:
- for (index = 0; index < ret; index++)
- page_cache_release(pages[index]);
- /* just a hole. */
- kfree(pages);
- return EXT_CONTINUE;
- }
- index = 0;
-
-next_page:
- /* Try to find the 1st mapped buffer. */
- end = ((__u64)pages[index]->index << PAGE_SHIFT) >>
- blksize_bits;
- if (!page_has_buffers(pages[index]))
- goto out;
- head = page_buffers(pages[index]);
- if (!head)
- goto out;
-
- index++;
- bh = head;
- do {
- if (end >= newex->ec_block +
- newex->ec_len)
- /* The buffer is out of
- * the request range.
- */
- goto out;
-
- if (buffer_mapped(bh) &&
- end >= newex->ec_block) {
- start_index = index - 1;
- /* get the 1st mapped buffer. */
- goto found_mapped_buffer;
- }
-
- bh = bh->b_this_page;
- end++;
- } while (bh != head);
-
- /* No mapped buffer in the range found in this page,
- * We need to look up next page.
- */
- if (index >= ret) {
- /* There is no page left, but we need to limit
- * newex->ec_len.
- */
- newex->ec_len = end - newex->ec_block;
- goto out;
- }
- goto next_page;
- } else {
- /*Find contiguous delayed buffers. */
- if (ret > 0 && pages[0]->index == last_offset)
- head = page_buffers(pages[0]);
- bh = head;
- index = 1;
- start_index = 0;
- }
-
-found_mapped_buffer:
- if (bh != NULL && buffer_delay(bh)) {
- /* 1st or contiguous delayed buffer found. */
- if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
- /*
- * 1st delayed buffer found, record
- * the start of extent.
- */
- flags |= FIEMAP_EXTENT_DELALLOC;
- newex->ec_block = end;
- logical = (__u64)end << blksize_bits;
- }
- /* Find contiguous delayed buffers. */
- do {
- if (!buffer_delay(bh))
- goto found_delayed_extent;
- bh = bh->b_this_page;
- end++;
- } while (bh != head);
-
- for (; index < ret; index++) {
- if (!page_has_buffers(pages[index])) {
- bh = NULL;
- break;
- }
- head = page_buffers(pages[index]);
- if (!head) {
- bh = NULL;
- break;
- }
-
- if (pages[index]->index !=
- pages[start_index]->index + index
- - start_index) {
- /* Blocks are not contiguous. */
- bh = NULL;
- break;
- }
- bh = head;
- do {
- if (!buffer_delay(bh))
- /* Delayed-extent ends. */
- goto found_delayed_extent;
- bh = bh->b_this_page;
- end++;
- } while (bh != head);
- }
- } else if (!(flags & FIEMAP_EXTENT_DELALLOC))
- /* a hole found. */
- goto out;
+ if (es.len == 0)
+ /* A hole found. */
+ return 0;
-found_delayed_extent:
- newex->ec_len = min(end - newex->ec_block,
- (ext4_lblk_t)EXT_INIT_MAX_LEN);
- if (ret == nr_pages && bh != NULL &&
- newex->ec_len < EXT_INIT_MAX_LEN &&
- buffer_delay(bh)) {
- /* Have not collected an extent and continue. */
- for (index = 0; index < ret; index++)
- page_cache_release(pages[index]);
- goto repeat;
+ if (es.start > newex->ec_block) {
+ /* A hole found. */
+ newex->ec_len = min(es.start - newex->ec_block,
+ newex->ec_len);
+ return 0;
}
- for (index = 0; index < ret; index++)
- page_cache_release(pages[index]);
- kfree(pages);
+ newex->ec_len = es.start + es.len - newex->ec_block;
}
- physical = (__u64)newex->ec_start << blksize_bits;
- length = (__u64)newex->ec_len << blksize_bits;
-
- if (ex && ext4_ext_is_uninitialized(ex))
- flags |= FIEMAP_EXTENT_UNWRITTEN;
-
- if (next == EXT_MAX_BLOCKS)
- flags |= FIEMAP_EXTENT_LAST;
-
- ret = fiemap_fill_next_extent(fieinfo, logical, physical,
- length, flags);
- if (ret < 0)
- return ret;
- if (ret == 1)
- return EXT_BREAK;
- return EXT_CONTINUE;
+ return next_del;
}
/* fiemap flags we can handle specified here */
#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
@@ -4971,6 +4775,8 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
ext4_ext_invalidate_cache(inode);
ext4_discard_preallocations(inode);
+ err = ext4_es_remove_extent(inode, first_block,
+ stop_block - first_block);
err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
ext4_ext_invalidate_cache(inode);
@@ -4991,12 +4797,22 @@ out_mutex:
mutex_unlock(&inode->i_mutex);
return err;
}
+
int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
{
ext4_lblk_t start_blk;
int error = 0;
+ if (ext4_has_inline_data(inode)) {
+ int has_inline = 1;
+
+ error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline);
+
+ if (has_inline)
+ return error;
+ }
+
/* fallback to generic here if not in extents fmt */
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return generic_block_fiemap(inode, fieinfo, start, len,
@@ -5018,11 +4834,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
/*
- * Walk the extent tree gathering extent information.
- * ext4_ext_fiemap_cb will push extents back to user.
+ * Walk the extent tree gathering extent information
+ * and pushing extents back to the user.
*/
- error = ext4_ext_walk_space(inode, start_blk, len_blks,
- ext4_ext_fiemap_cb, fieinfo);
+ error = ext4_fill_fiemap_extents(inode, start_blk,
+ len_blks, fieinfo);
}
return error;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
new file mode 100644
index 00000000000..564d981a2fc
--- /dev/null
+++ b/fs/ext4/extents_status.c
@@ -0,0 +1,500 @@
+/*
+ * fs/ext4/extents_status.c
+ *
+ * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
+ * Modified by
+ * Allison Henderson <achender@linux.vnet.ibm.com>
+ * Hugh Dickins <hughd@google.com>
+ * Zheng Liu <wenqing.lz@taobao.com>
+ *
+ * Ext4 extents status tree core functions.
+ */
+#include <linux/rbtree.h>
+#include "ext4.h"
+#include "extents_status.h"
+#include "ext4_extents.h"
+
+#include <trace/events/ext4.h>
+
+/*
+ * According to previous discussion in Ext4 Developer Workshop, we
+ * will introduce a new structure called io tree to track all extent
+ * status in order to solve some problems that we have met
+ * (e.g. Reservation space warning), and provide extent-level locking.
+ * Delay extent tree is the first step to achieve this goal. It is
+ * original built by Yongqiang Yang. At that time it is called delay
+ * extent tree, whose goal is only track delay extent in memory to
+ * simplify the implementation of fiemap and bigalloc, and introduce
+ * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called
+ * delay extent tree at the following comment. But for better
+ * understand what it does, it has been rename to extent status tree.
+ *
+ * Currently the first step has been done. All delay extents are
+ * tracked in the tree. It maintains the delay extent when a delay
+ * allocation is issued, and the delay extent is written out or
+ * invalidated. Therefore the implementation of fiemap and bigalloc
+ * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
+ *
+ * The following comment describes the implemenmtation of extent
+ * status tree and future works.
+ */
+
+/*
+ * extents status tree implementation for ext4.
+ *
+ *
+ * ==========================================================================
+ * Extents status encompass delayed extents and extent locks
+ *
+ * 1. Why delayed extent implementation ?
+ *
+ * Without delayed extent, ext4 identifies a delayed extent by looking
+ * up page cache, this has several deficiencies - complicated, buggy,
+ * and inefficient code.
+ *
+ * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need
+ * to know if a block or a range of blocks are belonged to a delayed
+ * extent.
+ *
+ * Let us have a look at how they do without delayed extents implementation.
+ * -- FIEMAP
+ * FIEMAP looks up page cache to identify delayed allocations from holes.
+ *
+ * -- SEEK_HOLE/DATA
+ * SEEK_HOLE/DATA has the same problem as FIEMAP.
+ *
+ * -- bigalloc
+ * bigalloc looks up page cache to figure out if a block is
+ * already under delayed allocation or not to determine whether
+ * quota reserving is needed for the cluster.
+ *
+ * -- punch hole
+ * punch hole looks up page cache to identify a delayed extent.
+ *
+ * -- writeout
+ * Writeout looks up whole page cache to see if a buffer is
+ * mapped, If there are not very many delayed buffers, then it is
+ * time comsuming.
+ *
+ * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA,
+ * bigalloc and writeout can figure out if a block or a range of
+ * blocks is under delayed allocation(belonged to a delayed extent) or
+ * not by searching the delayed extent tree.
+ *
+ *
+ * ==========================================================================
+ * 2. ext4 delayed extents impelmentation
+ *
+ * -- delayed extent
+ * A delayed extent is a range of blocks which are contiguous
+ * logically and under delayed allocation. Unlike extent in
+ * ext4, delayed extent in ext4 is a in-memory struct, there is
+ * no corresponding on-disk data. There is no limit on length of
+ * delayed extent, so a delayed extent can contain as many blocks
+ * as they are contiguous logically.
+ *
+ * -- delayed extent tree
+ * Every inode has a delayed extent tree and all under delayed
+ * allocation blocks are added to the tree as delayed extents.
+ * Delayed extents in the tree are ordered by logical block no.
+ *
+ * -- operations on a delayed extent tree
+ * There are three operations on a delayed extent tree: find next
+ * delayed extent, adding a space(a range of blocks) and removing
+ * a space.
+ *
+ * -- race on a delayed extent tree
+ * Delayed extent tree is protected inode->i_es_lock.
+ *
+ *
+ * ==========================================================================
+ * 3. performance analysis
+ * -- overhead
+ * 1. There is a cache extent for write access, so if writes are
+ * not very random, adding space operaions are in O(1) time.
+ *
+ * -- gain
+ * 2. Code is much simpler, more readable, more maintainable and
+ * more efficient.
+ *
+ *
+ * ==========================================================================
+ * 4. TODO list
+ * -- Track all extent status
+ *
+ * -- Improve get block process
+ *
+ * -- Extent-level locking
+ */
+
+static struct kmem_cache *ext4_es_cachep;
+
+int __init ext4_init_es(void)
+{
+ ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);
+ if (ext4_es_cachep == NULL)
+ return -ENOMEM;
+ return 0;
+}
+
+void ext4_exit_es(void)
+{
+ if (ext4_es_cachep)
+ kmem_cache_destroy(ext4_es_cachep);
+}
+
+void ext4_es_init_tree(struct ext4_es_tree *tree)
+{
+ tree->root = RB_ROOT;
+ tree->cache_es = NULL;
+}
+
+#ifdef ES_DEBUG__
+static void ext4_es_print_tree(struct inode *inode)
+{
+ struct ext4_es_tree *tree;
+ struct rb_node *node;
+
+ printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
+ tree = &EXT4_I(inode)->i_es_tree;
+ node = rb_first(&tree->root);
+ while (node) {
+ struct extent_status *es;
+ es = rb_entry(node, struct extent_status, rb_node);
+ printk(KERN_DEBUG " [%u/%u)", es->start, es->len);
+ node = rb_next(node);
+ }
+ printk(KERN_DEBUG "\n");
+}
+#else
+#define ext4_es_print_tree(inode)
+#endif
+
+static inline ext4_lblk_t extent_status_end(struct extent_status *es)
+{
+ BUG_ON(es->start + es->len < es->start);
+ return es->start + es->len - 1;
+}
+
+/*
+ * search through the tree for an delayed extent with a given offset. If
+ * it can't be found, try to find next extent.
+ */
+static struct extent_status *__es_tree_search(struct rb_root *root,
+ ext4_lblk_t offset)
+{
+ struct rb_node *node = root->rb_node;
+ struct extent_status *es = NULL;
+
+ while (node) {
+ es = rb_entry(node, struct extent_status, rb_node);
+ if (offset < es->start)
+ node = node->rb_left;
+ else if (offset > extent_status_end(es))
+ node = node->rb_right;
+ else
+ return es;
+ }
+
+ if (es && offset < es->start)
+ return es;
+
+ if (es && offset > extent_status_end(es)) {
+ node = rb_next(&es->rb_node);
+ return node ? rb_entry(node, struct extent_status, rb_node) :
+ NULL;
+ }
+
+ return NULL;
+}
+
+/*
+ * ext4_es_find_extent: find the 1st delayed extent covering @es->start
+ * if it exists, otherwise, the next extent after @es->start.
+ *
+ * @inode: the inode which owns delayed extents
+ * @es: delayed extent that we found
+ *
+ * Returns the first block of the next extent after es, otherwise
+ * EXT_MAX_BLOCKS if no delay extent is found.
+ * Delayed extent is returned via @es.
+ */
+ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es)
+{
+ struct ext4_es_tree *tree = NULL;
+ struct extent_status *es1 = NULL;
+ struct rb_node *node;
+ ext4_lblk_t ret = EXT_MAX_BLOCKS;
+
+ trace_ext4_es_find_extent_enter(inode, es->start);
+
+ read_lock(&EXT4_I(inode)->i_es_lock);
+ tree = &EXT4_I(inode)->i_es_tree;
+
+ /* find delay extent in cache firstly */
+ if (tree->cache_es) {
+ es1 = tree->cache_es;
+ if (in_range(es->start, es1->start, es1->len)) {
+ es_debug("%u cached by [%u/%u)\n",
+ es->start, es1->start, es1->len);
+ goto out;
+ }
+ }
+
+ es->len = 0;
+ es1 = __es_tree_search(&tree->root, es->start);
+
+out:
+ if (es1) {
+ tree->cache_es = es1;
+ es->start = es1->start;
+ es->len = es1->len;
+ node = rb_next(&es1->rb_node);
+ if (node) {
+ es1 = rb_entry(node, struct extent_status, rb_node);
+ ret = es1->start;
+ }
+ }
+
+ read_unlock(&EXT4_I(inode)->i_es_lock);
+
+ trace_ext4_es_find_extent_exit(inode, es, ret);
+ return ret;
+}
+
+static struct extent_status *
+ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len)
+{
+ struct extent_status *es;
+ es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
+ if (es == NULL)
+ return NULL;
+ es->start = start;
+ es->len = len;
+ return es;
+}
+
+static void ext4_es_free_extent(struct extent_status *es)
+{
+ kmem_cache_free(ext4_es_cachep, es);
+}
+
+static struct extent_status *
+ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es)
+{
+ struct extent_status *es1;
+ struct rb_node *node;
+
+ node = rb_prev(&es->rb_node);
+ if (!node)
+ return es;
+
+ es1 = rb_entry(node, struct extent_status, rb_node);
+ if (es->start == extent_status_end(es1) + 1) {
+ es1->len += es->len;
+ rb_erase(&es->rb_node, &tree->root);
+ ext4_es_free_extent(es);
+ es = es1;
+ }
+
+ return es;
+}
+
+static struct extent_status *
+ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es)
+{
+ struct extent_status *es1;
+ struct rb_node *node;
+
+ node = rb_next(&es->rb_node);
+ if (!node)
+ return es;
+
+ es1 = rb_entry(node, struct extent_status, rb_node);
+ if (es1->start == extent_status_end(es) + 1) {
+ es->len += es1->len;
+ rb_erase(node, &tree->root);
+ ext4_es_free_extent(es1);
+ }
+
+ return es;
+}
+
+static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset,
+ ext4_lblk_t len)
+{
+ struct rb_node **p = &tree->root.rb_node;
+ struct rb_node *parent = NULL;
+ struct extent_status *es;
+ ext4_lblk_t end = offset + len - 1;
+
+ BUG_ON(end < offset);
+ es = tree->cache_es;
+ if (es && offset == (extent_status_end(es) + 1)) {
+ es_debug("cached by [%u/%u)\n", es->start, es->len);
+ es->len += len;
+ es = ext4_es_try_to_merge_right(tree, es);
+ goto out;
+ } else if (es && es->start == end + 1) {
+ es_debug("cached by [%u/%u)\n", es->start, es->len);
+ es->start = offset;
+ es->len += len;
+ es = ext4_es_try_to_merge_left(tree, es);
+ goto out;
+ } else if (es && es->start <= offset &&
+ end <= extent_status_end(es)) {
+ es_debug("cached by [%u/%u)\n", es->start, es->len);
+ goto out;
+ }
+
+ while (*p) {
+ parent = *p;
+ es = rb_entry(parent, struct extent_status, rb_node);
+
+ if (offset < es->start) {
+ if (es->start == end + 1) {
+ es->start = offset;
+ es->len += len;
+ es = ext4_es_try_to_merge_left(tree, es);
+ goto out;
+ }
+ p = &(*p)->rb_left;
+ } else if (offset > extent_status_end(es)) {
+ if (offset == extent_status_end(es) + 1) {
+ es->len += len;
+ es = ext4_es_try_to_merge_right(tree, es);
+ goto out;
+ }
+ p = &(*p)->rb_right;
+ } else {
+ if (extent_status_end(es) <= end)
+ es->len = offset - es->start + len;
+ goto out;
+ }
+ }
+
+ es = ext4_es_alloc_extent(offset, len);
+ if (!es)
+ return -ENOMEM;
+ rb_link_node(&es->rb_node, parent, p);
+ rb_insert_color(&es->rb_node, &tree->root);
+
+out:
+ tree->cache_es = es;
+ return 0;
+}
+
+/*
+ * ext4_es_insert_extent() adds a space to a delayed extent tree.
+ * Caller holds inode->i_es_lock.
+ *
+ * ext4_es_insert_extent is called by ext4_da_write_begin and
+ * ext4_es_remove_extent.
+ *
+ * Return 0 on success, error code on failure.
+ */
+int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset,
+ ext4_lblk_t len)
+{
+ struct ext4_es_tree *tree;
+ int err = 0;
+
+ trace_ext4_es_insert_extent(inode, offset, len);
+ es_debug("add [%u/%u) to extent status tree of inode %lu\n",
+ offset, len, inode->i_ino);
+
+ write_lock(&EXT4_I(inode)->i_es_lock);
+ tree = &EXT4_I(inode)->i_es_tree;
+ err = __es_insert_extent(tree, offset, len);
+ write_unlock(&EXT4_I(inode)->i_es_lock);
+
+ ext4_es_print_tree(inode);
+
+ return err;
+}
+
+/*
+ * ext4_es_remove_extent() removes a space from a delayed extent tree.
+ * Caller holds inode->i_es_lock.
+ *
+ * Return 0 on success, error code on failure.
+ */
+int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset,
+ ext4_lblk_t len)
+{
+ struct rb_node *node;
+ struct ext4_es_tree *tree;
+ struct extent_status *es;
+ struct extent_status orig_es;
+ ext4_lblk_t len1, len2, end;
+ int err = 0;
+
+ trace_ext4_es_remove_extent(inode, offset, len);
+ es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
+ offset, len, inode->i_ino);
+
+ end = offset + len - 1;
+ BUG_ON(end < offset);
+ write_lock(&EXT4_I(inode)->i_es_lock);
+ tree = &EXT4_I(inode)->i_es_tree;
+ es = __es_tree_search(&tree->root, offset);
+ if (!es)
+ goto out;
+ if (es->start > end)
+ goto out;
+
+ /* Simply invalidate cache_es. */
+ tree->cache_es = NULL;
+
+ orig_es.start = es->start;
+ orig_es.len = es->len;
+ len1 = offset > es->start ? offset - es->start : 0;
+ len2 = extent_status_end(es) > end ?
+ extent_status_end(es) - end : 0;
+ if (len1 > 0)
+ es->len = len1;
+ if (len2 > 0) {
+ if (len1 > 0) {
+ err = __es_insert_extent(tree, end + 1, len2);
+ if (err) {
+ es->start = orig_es.start;
+ es->len = orig_es.len;
+ goto out;
+ }
+ } else {
+ es->start = end + 1;
+ es->len = len2;
+ }
+ goto out;
+ }
+
+ if (len1 > 0) {
+ node = rb_next(&es->rb_node);
+ if (node)
+ es = rb_entry(node, struct extent_status, rb_node);
+ else
+ es = NULL;
+ }
+
+ while (es && extent_status_end(es) <= end) {
+ node = rb_next(&es->rb_node);
+ rb_erase(&es->rb_node, &tree->root);
+ ext4_es_free_extent(es);
+ if (!node) {
+ es = NULL;
+ break;
+ }
+ es = rb_entry(node, struct extent_status, rb_node);
+ }
+
+ if (es && es->start < end + 1) {
+ len1 = extent_status_end(es) - end;
+ es->start = end + 1;
+ es->len = len1;
+ }
+
+out:
+ write_unlock(&EXT4_I(inode)->i_es_lock);
+ ext4_es_print_tree(inode);
+ return err;
+}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
new file mode 100644
index 00000000000..077f82db092
--- /dev/null
+++ b/fs/ext4/extents_status.h
@@ -0,0 +1,45 @@
+/*
+ * fs/ext4/extents_status.h
+ *
+ * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
+ * Modified by
+ * Allison Henderson <achender@linux.vnet.ibm.com>
+ * Zheng Liu <wenqing.lz@taobao.com>
+ *
+ */
+
+#ifndef _EXT4_EXTENTS_STATUS_H
+#define _EXT4_EXTENTS_STATUS_H
+
+/*
+ * Turn on ES_DEBUG__ to get lots of info about extent status operations.
+ */
+#ifdef ES_DEBUG__
+#define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__)
+#else
+#define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
+#endif
+
+struct extent_status {
+ struct rb_node rb_node;
+ ext4_lblk_t start; /* first block extent covers */
+ ext4_lblk_t len; /* length of extent in block */
+};
+
+struct ext4_es_tree {
+ struct rb_root root;
+ struct extent_status *cache_es; /* recently accessed extent */
+};
+
+extern int __init ext4_init_es(void);
+extern void ext4_exit_es(void);
+extern void ext4_es_init_tree(struct ext4_es_tree *tree);
+
+extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t len);
+extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t start,
+ ext4_lblk_t len);
+extern ext4_lblk_t ext4_es_find_extent(struct inode *inode,
+ struct extent_status *es);
+
+#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index bf3966bccd3..b64a60bf105 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -24,6 +24,7 @@
#include <linux/mount.h>
#include <linux/path.h>
#include <linux/quotaops.h>
+#include <linux/pagevec.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -286,6 +287,324 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
}
/*
+ * Here we use ext4_map_blocks() to get a block mapping for a extent-based
+ * file rather than ext4_ext_walk_space() because we can introduce
+ * SEEK_DATA/SEEK_HOLE for block-mapped and extent-mapped file at the same
+ * function. When extent status tree has been fully implemented, it will
+ * track all extent status for a file and we can directly use it to
+ * retrieve the offset for SEEK_DATA/SEEK_HOLE.
+ */
+
+/*
+ * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we would need to
+ * lookup page cache to check whether or not there has some data between
+ * [startoff, endoff] because, if this range contains an unwritten extent,
+ * we determine this extent as a data or a hole according to whether the
+ * page cache has data or not.
+ */
+static int ext4_find_unwritten_pgoff(struct inode *inode,
+ int origin,
+ struct ext4_map_blocks *map,
+ loff_t *offset)
+{
+ struct pagevec pvec;
+ unsigned int blkbits;
+ pgoff_t index;
+ pgoff_t end;
+ loff_t endoff;
+ loff_t startoff;
+ loff_t lastoff;
+ int found = 0;
+
+ blkbits = inode->i_sb->s_blocksize_bits;
+ startoff = *offset;
+ lastoff = startoff;
+ endoff = (map->m_lblk + map->m_len) << blkbits;
+
+ index = startoff >> PAGE_CACHE_SHIFT;
+ end = endoff >> PAGE_CACHE_SHIFT;
+
+ pagevec_init(&pvec, 0);
+ do {
+ int i, num;
+ unsigned long nr_pages;
+
+ num = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
+ nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
+ (pgoff_t)num);
+ if (nr_pages == 0) {
+ if (origin == SEEK_DATA)
+ break;
+
+ BUG_ON(origin != SEEK_HOLE);
+ /*
+ * If this is the first time to go into the loop and
+ * offset is not beyond the end offset, it will be a
+ * hole at this offset
+ */
+ if (lastoff == startoff || lastoff < endoff)
+ found = 1;
+ break;
+ }
+
+ /*
+ * If this is the first time to go into the loop and
+ * offset is smaller than the first page offset, it will be a
+ * hole at this offset.
+ */
+ if (lastoff == startoff && origin == SEEK_HOLE &&
+ lastoff < page_offset(pvec.pages[0])) {
+ found = 1;
+ break;
+ }
+
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+ struct buffer_head *bh, *head;
+
+ /*
+ * If the current offset is not beyond the end of given
+ * range, it will be a hole.
+ */
+ if (lastoff < endoff && origin == SEEK_HOLE &&
+ page->index > end) {
+ found = 1;
+ *offset = lastoff;
+ goto out;
+ }
+
+ lock_page(page);
+
+ if (unlikely(page->mapping != inode->i_mapping)) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (!page_has_buffers(page)) {
+ unlock_page(page);
+ continue;
+ }
+
+ if (page_has_buffers(page)) {
+ lastoff = page_offset(page);
+ bh = head = page_buffers(page);
+ do {
+ if (buffer_uptodate(bh) ||
+ buffer_unwritten(bh)) {
+ if (origin == SEEK_DATA)
+ found = 1;
+ } else {
+ if (origin == SEEK_HOLE)
+ found = 1;
+ }
+ if (found) {
+ *offset = max_t(loff_t,
+ startoff, lastoff);
+ unlock_page(page);
+ goto out;
+ }
+ lastoff += bh->b_size;
+ bh = bh->b_this_page;
+ } while (bh != head);
+ }
+
+ lastoff = page_offset(page) + PAGE_SIZE;
+ unlock_page(page);
+ }
+
+ /*
+ * The no. of pages is less than our desired, that would be a
+ * hole in there.
+ */
+ if (nr_pages < num && origin == SEEK_HOLE) {
+ found = 1;
+ *offset = lastoff;
+ break;
+ }
+
+ index = pvec.pages[i - 1]->index + 1;
+ pagevec_release(&pvec);
+ } while (index <= end);
+
+out:
+ pagevec_release(&pvec);
+ return found;
+}
+
+/*
+ * ext4_seek_data() retrieves the offset for SEEK_DATA.
+ */
+static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct ext4_map_blocks map;
+ struct extent_status es;
+ ext4_lblk_t start, last, end;
+ loff_t dataoff, isize;
+ int blkbits;
+ int ret = 0;
+
+ mutex_lock(&inode->i_mutex);
+
+ isize = i_size_read(inode);
+ if (offset >= isize) {
+ mutex_unlock(&inode->i_mutex);
+ return -ENXIO;
+ }
+
+ blkbits = inode->i_sb->s_blocksize_bits;
+ start = offset >> blkbits;
+ last = start;
+ end = isize >> blkbits;
+ dataoff = offset;
+
+ do {
+ map.m_lblk = last;
+ map.m_len = end - last + 1;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
+ if (last != start)
+ dataoff = last << blkbits;
+ break;
+ }
+
+ /*
+ * If there is a delay extent at this offset,
+ * it will be as a data.
+ */
+ es.start = last;
+ (void)ext4_es_find_extent(inode, &es);
+ if (last >= es.start &&
+ last < es.start + es.len) {
+ if (last != start)
+ dataoff = last << blkbits;
+ break;
+ }
+
+ /*
+ * If there is a unwritten extent at this offset,
+ * it will be as a data or a hole according to page
+ * cache that has data or not.
+ */
+ if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+ int unwritten;
+ unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
+ &map, &dataoff);
+ if (unwritten)
+ break;
+ }
+
+ last++;
+ dataoff = last << blkbits;
+ } while (last <= end);
+
+ mutex_unlock(&inode->i_mutex);
+
+ if (dataoff > isize)
+ return -ENXIO;
+
+ if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
+ return -EINVAL;
+ if (dataoff > maxsize)
+ return -EINVAL;
+
+ if (dataoff != file->f_pos) {
+ file->f_pos = dataoff;
+ file->f_version = 0;
+ }
+
+ return dataoff;
+}
+
+/*
+ * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
+ */
+static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct ext4_map_blocks map;
+ struct extent_status es;
+ ext4_lblk_t start, last, end;
+ loff_t holeoff, isize;
+ int blkbits;
+ int ret = 0;
+
+ mutex_lock(&inode->i_mutex);
+
+ isize = i_size_read(inode);
+ if (offset >= isize) {
+ mutex_unlock(&inode->i_mutex);
+ return -ENXIO;
+ }
+
+ blkbits = inode->i_sb->s_blocksize_bits;
+ start = offset >> blkbits;
+ last = start;
+ end = isize >> blkbits;
+ holeoff = offset;
+
+ do {
+ map.m_lblk = last;
+ map.m_len = end - last + 1;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
+ last += ret;
+ holeoff = last << blkbits;
+ continue;
+ }
+
+ /*
+ * If there is a delay extent at this offset,
+ * we will skip this extent.
+ */
+ es.start = last;
+ (void)ext4_es_find_extent(inode, &es);
+ if (last >= es.start &&
+ last < es.start + es.len) {
+ last = es.start + es.len;
+ holeoff = last << blkbits;
+ continue;
+ }
+
+ /*
+ * If there is a unwritten extent at this offset,
+ * it will be as a data or a hole according to page
+ * cache that has data or not.
+ */
+ if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+ int unwritten;
+ unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
+ &map, &holeoff);
+ if (!unwritten) {
+ last += ret;
+ holeoff = last << blkbits;
+ continue;
+ }
+ }
+
+ /* find a hole */
+ break;
+ } while (last <= end);
+
+ mutex_unlock(&inode->i_mutex);
+
+ if (holeoff > isize)
+ holeoff = isize;
+
+ if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
+ return -EINVAL;
+ if (holeoff > maxsize)
+ return -EINVAL;
+
+ if (holeoff != file->f_pos) {
+ file->f_pos = holeoff;
+ file->f_version = 0;
+ }
+
+ return holeoff;
+}
+
+/*
* ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
* by calling generic_file_llseek_size() with the appropriate maxbytes
* value for each.
@@ -300,8 +619,19 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int origin)
else
maxbytes = inode->i_sb->s_maxbytes;
- return generic_file_llseek_size(file, offset, origin,
- maxbytes, i_size_read(inode));
+ switch (origin) {
+ case SEEK_SET:
+ case SEEK_CUR:
+ case SEEK_END:
+ return generic_file_llseek_size(file, offset, origin,
+ maxbytes, i_size_read(inode));
+ case SEEK_DATA:
+ return ext4_seek_data(file, offset, maxbytes);
+ case SEEK_HOLE:
+ return ext4_seek_hole(file, offset, maxbytes);
+ }
+
+ return -EINVAL;
}
const struct file_operations ext4_file_operations = {
@@ -326,12 +656,10 @@ const struct file_operations ext4_file_operations = {
const struct inode_operations ext4_file_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_getattr,
-#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
-#endif
.get_acl = ext4_get_acl,
.fiemap = ext4_fiemap,
};
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index be1d89f385b..dfbc1fe9667 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -44,7 +44,6 @@
*/
static int ext4_sync_parent(struct inode *inode)
{
- struct writeback_control wbc;
struct dentry *dentry = NULL;
struct inode *next;
int ret = 0;
@@ -66,10 +65,7 @@ static int ext4_sync_parent(struct inode *inode)
ret = sync_mapping_buffers(inode->i_mapping);
if (ret)
break;
- memset(&wbc, 0, sizeof(wbc));
- wbc.sync_mode = WB_SYNC_ALL;
- wbc.nr_to_write = 0; /* only write out the inode */
- ret = sync_inode(inode, &wbc);
+ ret = sync_inode_metadata(inode, 1);
if (ret)
break;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 3a100e7a62a..3f32c801244 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -762,7 +762,6 @@ got:
BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
- brelse(block_bitmap_bh);
/* recheck and clear flag under lock if we still need to */
ext4_lock_group(sb, group);
@@ -775,6 +774,7 @@ got:
ext4_group_desc_csum_set(sb, group, gdp);
}
ext4_unlock_group(sb, group);
+ brelse(block_bitmap_bh);
if (err)
goto fail;
@@ -902,6 +902,10 @@ got:
ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
+ ei->i_inline_off = 0;
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA))
+ ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+
ret = inode;
dquot_initialize(inode);
err = dquot_alloc_inode(inode);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 792e388e7b4..20862f96e8a 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -22,6 +22,7 @@
#include "ext4_jbd2.h"
#include "truncate.h"
+#include "ext4_extents.h" /* Needed for EXT_MAX_BLOCKS */
#include <trace/events/ext4.h>
@@ -755,8 +756,7 @@ cleanup:
partial--;
}
out:
- trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
- map->m_pblk, map->m_len, err);
+ trace_ext4_ind_map_blocks_exit(inode, map, err);
return err;
}
@@ -1412,6 +1412,7 @@ void ext4_ind_truncate(struct inode *inode)
down_write(&ei->i_data_sem);
ext4_discard_preallocations(inode);
+ ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);
/*
* The orphan list entry will now protect us from any crash which
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
new file mode 100644
index 00000000000..387c47c6cda
--- /dev/null
+++ b/fs/ext4/inline.c
@@ -0,0 +1,1884 @@
+/*
+ * Copyright (c) 2012 Taobao.
+ * Written by Tao Ma <boyu.mt@taobao.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+#include "ext4_jbd2.h"
+#include "ext4.h"
+#include "xattr.h"
+#include "truncate.h"
+#include <linux/fiemap.h>
+
+#define EXT4_XATTR_SYSTEM_DATA "data"
+#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS))
+#define EXT4_INLINE_DOTDOT_SIZE 4
+
+int ext4_get_inline_size(struct inode *inode)
+{
+ if (EXT4_I(inode)->i_inline_off)
+ return EXT4_I(inode)->i_inline_size;
+
+ return 0;
+}
+
+static int get_max_inline_xattr_value_size(struct inode *inode,
+ struct ext4_iloc *iloc)
+{
+ struct ext4_xattr_ibody_header *header;
+ struct ext4_xattr_entry *entry;
+ struct ext4_inode *raw_inode;
+ int free, min_offs;
+
+ min_offs = EXT4_SB(inode->i_sb)->s_inode_size -
+ EXT4_GOOD_OLD_INODE_SIZE -
+ EXT4_I(inode)->i_extra_isize -
+ sizeof(struct ext4_xattr_ibody_header);
+
+ /*
+ * We need to subtract another sizeof(__u32) since an in-inode xattr
+ * needs an empty 4 bytes to indicate the gap between the xattr entry
+ * and the name/value pair.
+ */
+ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
+ return EXT4_XATTR_SIZE(min_offs -
+ EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA)) -
+ EXT4_XATTR_ROUND - sizeof(__u32));
+
+ raw_inode = ext4_raw_inode(iloc);
+ header = IHDR(inode, raw_inode);
+ entry = IFIRST(header);
+
+ /* Compute min_offs. */
+ for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
+ if (!entry->e_value_block && entry->e_value_size) {
+ size_t offs = le16_to_cpu(entry->e_value_offs);
+ if (offs < min_offs)
+ min_offs = offs;
+ }
+ }
+ free = min_offs -
+ ((void *)entry - (void *)IFIRST(header)) - sizeof(__u32);
+
+ if (EXT4_I(inode)->i_inline_off) {
+ entry = (struct ext4_xattr_entry *)
+ ((void *)raw_inode + EXT4_I(inode)->i_inline_off);
+
+ free += le32_to_cpu(entry->e_value_size);
+ goto out;
+ }
+
+ free -= EXT4_XATTR_LEN(strlen(EXT4_XATTR_SYSTEM_DATA));
+
+ if (free > EXT4_XATTR_ROUND)
+ free = EXT4_XATTR_SIZE(free - EXT4_XATTR_ROUND);
+ else
+ free = 0;
+
+out:
+ return free;
+}
+
+/*
+ * Get the maximum size we now can store in an inode.
+ * If we can't find the space for a xattr entry, don't use the space
+ * of the extents since we have no space to indicate the inline data.
+ */
+int ext4_get_max_inline_size(struct inode *inode)
+{
+ int error, max_inline_size;
+ struct ext4_iloc iloc;
+
+ if (EXT4_I(inode)->i_extra_isize == 0)
+ return 0;
+
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error) {
+ ext4_error_inode(inode, __func__, __LINE__, 0,
+ "can't get inode location %lu",
+ inode->i_ino);
+ return 0;
+ }
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ max_inline_size = get_max_inline_xattr_value_size(inode, &iloc);
+ up_read(&EXT4_I(inode)->xattr_sem);
+
+ brelse(iloc.bh);
+
+ if (!max_inline_size)
+ return 0;
+
+ return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE;
+}
+
+int ext4_has_inline_data(struct inode *inode)
+{
+ return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) &&
+ EXT4_I(inode)->i_inline_off;
+}
+
+/*
+ * this function does not take xattr_sem, which is OK because it is
+ * currently only used in a code path coming form ext4_iget, before
+ * the new inode has been unlocked
+ */
+int ext4_find_inline_data_nolock(struct inode *inode)
+{
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = -ENODATA, },
+ };
+ struct ext4_xattr_info i = {
+ .name_index = EXT4_XATTR_INDEX_SYSTEM,
+ .name = EXT4_XATTR_SYSTEM_DATA,
+ };
+ int error;
+
+ if (EXT4_I(inode)->i_extra_isize == 0)
+ return 0;
+
+ error = ext4_get_inode_loc(inode, &is.iloc);
+ if (error)
+ return error;
+
+ error = ext4_xattr_ibody_find(inode, &i, &is);
+ if (error)
+ goto out;
+
+ if (!is.s.not_found) {
+ EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+ (void *)ext4_raw_inode(&is.iloc));
+ EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
+ le32_to_cpu(is.s.here->e_value_size);
+ ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ }
+out:
+ brelse(is.iloc.bh);
+ return error;
+}
+
+static int ext4_read_inline_data(struct inode *inode, void *buffer,
+ unsigned int len,
+ struct ext4_iloc *iloc)
+{
+ struct ext4_xattr_entry *entry;
+ struct ext4_xattr_ibody_header *header;
+ int cp_len = 0;
+ struct ext4_inode *raw_inode;
+
+ if (!len)
+ return 0;
+
+ BUG_ON(len > EXT4_I(inode)->i_inline_size);
+
+ cp_len = len < EXT4_MIN_INLINE_DATA_SIZE ?
+ len : EXT4_MIN_INLINE_DATA_SIZE;
+
+ raw_inode = ext4_raw_inode(iloc);
+ memcpy(buffer, (void *)(raw_inode->i_block), cp_len);
+
+ len -= cp_len;
+ buffer += cp_len;
+
+ if (!len)
+ goto out;
+
+ header = IHDR(inode, raw_inode);
+ entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+ EXT4_I(inode)->i_inline_off);
+ len = min_t(unsigned int, len,
+ (unsigned int)le32_to_cpu(entry->e_value_size));
+
+ memcpy(buffer,
+ (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs), len);
+ cp_len += len;
+
+out:
+ return cp_len;
+}
+
+/*
+ * write the buffer to the inline inode.
+ * If 'create' is set, we don't need to do the extra copy in the xattr
+ * value since it is already handled by ext4_xattr_ibody_inline_set.
+ * That saves us one memcpy.
+ */
+void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
+ void *buffer, loff_t pos, unsigned int len)
+{
+ struct ext4_xattr_entry *entry;
+ struct ext4_xattr_ibody_header *header;
+ struct ext4_inode *raw_inode;
+ int cp_len = 0;
+
+ BUG_ON(!EXT4_I(inode)->i_inline_off);
+ BUG_ON(pos + len > EXT4_I(inode)->i_inline_size);
+
+ raw_inode = ext4_raw_inode(iloc);
+ buffer += pos;
+
+ if (pos < EXT4_MIN_INLINE_DATA_SIZE) {
+ cp_len = pos + len > EXT4_MIN_INLINE_DATA_SIZE ?
+ EXT4_MIN_INLINE_DATA_SIZE - pos : len;
+ memcpy((void *)raw_inode->i_block + pos, buffer, cp_len);
+
+ len -= cp_len;
+ buffer += cp_len;
+ pos += cp_len;
+ }
+
+ if (!len)
+ return;
+
+ pos -= EXT4_MIN_INLINE_DATA_SIZE;
+ header = IHDR(inode, raw_inode);
+ entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+ EXT4_I(inode)->i_inline_off);
+
+ memcpy((void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs) + pos,
+ buffer, len);
+}
+
+static int ext4_create_inline_data(handle_t *handle,
+ struct inode *inode, unsigned len)
+{
+ int error;
+ void *value = NULL;
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = -ENODATA, },
+ };
+ struct ext4_xattr_info i = {
+ .name_index = EXT4_XATTR_INDEX_SYSTEM,
+ .name = EXT4_XATTR_SYSTEM_DATA,
+ };
+
+ error = ext4_get_inode_loc(inode, &is.iloc);
+ if (error)
+ return error;
+
+ error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ if (error)
+ goto out;
+
+ if (len > EXT4_MIN_INLINE_DATA_SIZE) {
+ value = EXT4_ZERO_XATTR_VALUE;
+ len -= EXT4_MIN_INLINE_DATA_SIZE;
+ } else {
+ value = "";
+ len = 0;
+ }
+
+ /* Insert the the xttr entry. */
+ i.value = value;
+ i.value_len = len;
+
+ error = ext4_xattr_ibody_find(inode, &i, &is);
+ if (error)
+ goto out;
+
+ BUG_ON(!is.s.not_found);
+
+ error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ if (error) {
+ if (error == -ENOSPC)
+ ext4_clear_inode_state(inode,
+ EXT4_STATE_MAY_INLINE_DATA);
+ goto out;
+ }
+
+ memset((void *)ext4_raw_inode(&is.iloc)->i_block,
+ 0, EXT4_MIN_INLINE_DATA_SIZE);
+
+ EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+ (void *)ext4_raw_inode(&is.iloc));
+ EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE;
+ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
+ ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+ get_bh(is.iloc.bh);
+ error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+
+out:
+ brelse(is.iloc.bh);
+ return error;
+}
+
+static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
+ unsigned int len)
+{
+ int error;
+ void *value = NULL;
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = -ENODATA, },
+ };
+ struct ext4_xattr_info i = {
+ .name_index = EXT4_XATTR_INDEX_SYSTEM,
+ .name = EXT4_XATTR_SYSTEM_DATA,
+ };
+
+ /* If the old space is ok, write the data directly. */
+ if (len <= EXT4_I(inode)->i_inline_size)
+ return 0;
+
+ error = ext4_get_inode_loc(inode, &is.iloc);
+ if (error)
+ return error;
+
+ error = ext4_xattr_ibody_find(inode, &i, &is);
+ if (error)
+ goto out;
+
+ BUG_ON(is.s.not_found);
+
+ len -= EXT4_MIN_INLINE_DATA_SIZE;
+ value = kzalloc(len, GFP_NOFS);
+ if (!value)
+ goto out;
+
+ error = ext4_xattr_ibody_get(inode, i.name_index, i.name,
+ value, len);
+ if (error == -ENODATA)
+ goto out;
+
+ error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ if (error)
+ goto out;
+
+ /* Update the xttr entry. */
+ i.value = value;
+ i.value_len = len;
+
+ error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ if (error)
+ goto out;
+
+ EXT4_I(inode)->i_inline_off = (u16)((void *)is.s.here -
+ (void *)ext4_raw_inode(&is.iloc));
+ EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
+ le32_to_cpu(is.s.here->e_value_size);
+ ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ get_bh(is.iloc.bh);
+ error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+
+out:
+ kfree(value);
+ brelse(is.iloc.bh);
+ return error;
+}
+
+int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
+ unsigned int len)
+{
+ int ret, size;
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
+ return -ENOSPC;
+
+ size = ext4_get_max_inline_size(inode);
+ if (size < len)
+ return -ENOSPC;
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+
+ if (ei->i_inline_off)
+ ret = ext4_update_inline_data(handle, inode, len);
+ else
+ ret = ext4_create_inline_data(handle, inode, len);
+
+ up_write(&EXT4_I(inode)->xattr_sem);
+
+ return ret;
+}
+
+static int ext4_destroy_inline_data_nolock(handle_t *handle,
+ struct inode *inode)
+{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = 0, },
+ };
+ struct ext4_xattr_info i = {
+ .name_index = EXT4_XATTR_INDEX_SYSTEM,
+ .name = EXT4_XATTR_SYSTEM_DATA,
+ .value = NULL,
+ .value_len = 0,
+ };
+ int error;
+
+ if (!ei->i_inline_off)
+ return 0;
+
+ error = ext4_get_inode_loc(inode, &is.iloc);
+ if (error)
+ return error;
+
+ error = ext4_xattr_ibody_find(inode, &i, &is);
+ if (error)
+ goto out;
+
+ error = ext4_journal_get_write_access(handle, is.iloc.bh);
+ if (error)
+ goto out;
+
+ error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ if (error)
+ goto out;
+
+ memset((void *)ext4_raw_inode(&is.iloc)->i_block,
+ 0, EXT4_MIN_INLINE_DATA_SIZE);
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ if (S_ISDIR(inode->i_mode) ||
+ S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) {
+ ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
+ ext4_ext_tree_init(handle, inode);
+ }
+ }
+ ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA);
+
+ get_bh(is.iloc.bh);
+ error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
+
+ EXT4_I(inode)->i_inline_off = 0;
+ EXT4_I(inode)->i_inline_size = 0;
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+out:
+ brelse(is.iloc.bh);
+ if (error == -ENODATA)
+ error = 0;
+ return error;
+}
+
+static int ext4_read_inline_page(struct inode *inode, struct page *page)
+{
+ void *kaddr;
+ int ret = 0;
+ size_t len;
+ struct ext4_iloc iloc;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(!ext4_has_inline_data(inode));
+ BUG_ON(page->index);
+
+ if (!EXT4_I(inode)->i_inline_off) {
+ ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.",
+ inode->i_ino);
+ goto out;
+ }
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ goto out;
+
+ len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode));
+ kaddr = kmap_atomic(page);
+ ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr);
+ zero_user_segment(page, len, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
+ brelse(iloc.bh);
+
+out:
+ return ret;
+}
+
+int ext4_readpage_inline(struct inode *inode, struct page *page)
+{
+ int ret = 0;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ up_read(&EXT4_I(inode)->xattr_sem);
+ return -EAGAIN;
+ }
+
+ /*
+ * Current inline data can only exist in the 1st page,
+ * So for all the other pages, just set them uptodate.
+ */
+ if (!page->index)
+ ret = ext4_read_inline_page(inode, page);
+ else if (!PageUptodate(page)) {
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
+ }
+
+ up_read(&EXT4_I(inode)->xattr_sem);
+
+ unlock_page(page);
+ return ret >= 0 ? 0 : ret;
+}
+
+static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
+ struct inode *inode,
+ unsigned flags)
+{
+ int ret, needed_blocks;
+ handle_t *handle = NULL;
+ int retries = 0, sem_held = 0;
+ struct page *page = NULL;
+ unsigned from, to;
+ struct ext4_iloc iloc;
+
+ if (!ext4_has_inline_data(inode)) {
+ /*
+ * clear the flag so that no new write
+ * will trap here again.
+ */
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ return 0;
+ }
+
+ needed_blocks = ext4_writepage_trans_blocks(inode);
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+retry:
+ handle = ext4_journal_start(inode, needed_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ goto out;
+ }
+
+ /* We cannot recurse into the filesystem as the transaction is already
+ * started */
+ flags |= AOP_FLAG_NOFS;
+
+ page = grab_cache_page_write_begin(mapping, 0, flags);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+ sem_held = 1;
+ /* If some one has already done this for us, just exit. */
+ if (!ext4_has_inline_data(inode)) {
+ ret = 0;
+ goto out;
+ }
+
+ from = 0;
+ to = ext4_get_inline_size(inode);
+ if (!PageUptodate(page)) {
+ ret = ext4_read_inline_page(inode, page);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = ext4_destroy_inline_data_nolock(handle, inode);
+ if (ret)
+ goto out;
+
+ if (ext4_should_dioread_nolock(inode))
+ ret = __block_write_begin(page, from, to, ext4_get_block_write);
+ else
+ ret = __block_write_begin(page, from, to, ext4_get_block);
+
+ if (!ret && ext4_should_journal_data(inode)) {
+ ret = ext4_walk_page_buffers(handle, page_buffers(page),
+ from, to, NULL,
+ do_journal_get_write_access);
+ }
+
+ if (ret) {
+ unlock_page(page);
+ page_cache_release(page);
+ ext4_orphan_add(handle, inode);
+ up_write(&EXT4_I(inode)->xattr_sem);
+ sem_held = 0;
+ ext4_journal_stop(handle);
+ handle = NULL;
+ ext4_truncate_failed_write(inode);
+ /*
+ * If truncate failed early the inode might
+ * still be on the orphan list; we need to
+ * make sure the inode is removed from the
+ * orphan list in that case.
+ */
+ if (inode->i_nlink)
+ ext4_orphan_del(NULL, inode);
+ }
+
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+
+ block_commit_write(page, from, to);
+out:
+ if (page) {
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ if (sem_held)
+ up_write(&EXT4_I(inode)->xattr_sem);
+ if (handle)
+ ext4_journal_stop(handle);
+ brelse(iloc.bh);
+ return ret;
+}
+
+/*
+ * Try to write data in the inode.
+ * If the inode has inline data, check whether the new write can be
+ * in the inode also. If not, create the page the handle, move the data
+ * to the page make it update and let the later codes create extent for it.
+ */
+int ext4_try_to_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned flags,
+ struct page **pagep)
+{
+ int ret;
+ handle_t *handle;
+ struct page *page;
+ struct ext4_iloc iloc;
+
+ if (pos + len > ext4_get_max_inline_size(inode))
+ goto convert;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ /*
+ * The possible write could happen in the inode,
+ * so try to reserve the space in inode first.
+ */
+ handle = ext4_journal_start(inode, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ goto out;
+ }
+
+ ret = ext4_prepare_inline_data(handle, inode, pos + len);
+ if (ret && ret != -ENOSPC)
+ goto out;
+
+ /* We don't have space in inline inode, so convert it to extent. */
+ if (ret == -ENOSPC) {
+ ext4_journal_stop(handle);
+ brelse(iloc.bh);
+ goto convert;
+ }
+
+ flags |= AOP_FLAG_NOFS;
+
+ page = grab_cache_page_write_begin(mapping, 0, flags);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ *pagep = page;
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ ret = 0;
+ unlock_page(page);
+ page_cache_release(page);
+ goto out_up_read;
+ }
+
+ if (!PageUptodate(page)) {
+ ret = ext4_read_inline_page(inode, page);
+ if (ret < 0)
+ goto out_up_read;
+ }
+
+ ret = 1;
+ handle = NULL;
+out_up_read:
+ up_read(&EXT4_I(inode)->xattr_sem);
+out:
+ if (handle)
+ ext4_journal_stop(handle);
+ brelse(iloc.bh);
+ return ret;
+convert:
+ return ext4_convert_inline_data_to_extent(mapping,
+ inode, flags);
+}
+
+int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
+ unsigned copied, struct page *page)
+{
+ int ret;
+ void *kaddr;
+ struct ext4_iloc iloc;
+
+ if (unlikely(copied < len)) {
+ if (!PageUptodate(page)) {
+ copied = 0;
+ goto out;
+ }
+ }
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret) {
+ ext4_std_error(inode->i_sb, ret);
+ copied = 0;
+ goto out;
+ }
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+ BUG_ON(!ext4_has_inline_data(inode));
+
+ kaddr = kmap_atomic(page);
+ ext4_write_inline_data(inode, &iloc, kaddr, pos, len);
+ kunmap_atomic(kaddr);
+ SetPageUptodate(page);
+ /* clear page dirty so that writepages wouldn't work for us. */
+ ClearPageDirty(page);
+
+ up_write(&EXT4_I(inode)->xattr_sem);
+ brelse(iloc.bh);
+out:
+ return copied;
+}
+
+struct buffer_head *
+ext4_journalled_write_inline_data(struct inode *inode,
+ unsigned len,
+ struct page *page)
+{
+ int ret;
+ void *kaddr;
+ struct ext4_iloc iloc;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret) {
+ ext4_std_error(inode->i_sb, ret);
+ return NULL;
+ }
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+ kaddr = kmap_atomic(page);
+ ext4_write_inline_data(inode, &iloc, kaddr, 0, len);
+ kunmap_atomic(kaddr);
+ up_write(&EXT4_I(inode)->xattr_sem);
+
+ return iloc.bh;
+}
+
+/*
+ * Try to make the page cache and handle ready for the inline data case.
+ * We can call this function in 2 cases:
+ * 1. The inode is created and the first write exceeds inline size. We can
+ * clear the inode state safely.
+ * 2. The inode has inline data, then we need to read the data, make it
+ * update and dirty so that ext4_da_writepages can handle it. We don't
+ * need to start the journal since the file's metatdata isn't changed now.
+ */
+static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
+ struct inode *inode,
+ unsigned flags,
+ void **fsdata)
+{
+ int ret = 0, inline_size;
+ struct page *page;
+
+ page = grab_cache_page_write_begin(mapping, 0, flags);
+ if (!page)
+ return -ENOMEM;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ goto out;
+ }
+
+ inline_size = ext4_get_inline_size(inode);
+
+ if (!PageUptodate(page)) {
+ ret = ext4_read_inline_page(inode, page);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = __block_write_begin(page, 0, inline_size,
+ ext4_da_get_block_prep);
+ if (ret) {
+ ext4_truncate_failed_write(inode);
+ goto out;
+ }
+
+ SetPageDirty(page);
+ SetPageUptodate(page);
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ *fsdata = (void *)CONVERT_INLINE_DATA;
+
+out:
+ up_read(&EXT4_I(inode)->xattr_sem);
+ if (page) {
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ return ret;
+}
+
+/*
+ * Prepare the write for the inline data.
+ * If the the data can be written into the inode, we just read
+ * the page and make it uptodate, and start the journal.
+ * Otherwise read the page, makes it dirty so that it can be
+ * handle in writepages(the i_disksize update is left to the
+ * normal ext4_da_write_end).
+ */
+int ext4_da_write_inline_data_begin(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned flags,
+ struct page **pagep,
+ void **fsdata)
+{
+ int ret, inline_size;
+ handle_t *handle;
+ struct page *page;
+ struct ext4_iloc iloc;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ handle = ext4_journal_start(inode, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ handle = NULL;
+ goto out;
+ }
+
+ inline_size = ext4_get_max_inline_size(inode);
+
+ ret = -ENOSPC;
+ if (inline_size >= pos + len) {
+ ret = ext4_prepare_inline_data(handle, inode, pos + len);
+ if (ret && ret != -ENOSPC)
+ goto out;
+ }
+
+ if (ret == -ENOSPC) {
+ ret = ext4_da_convert_inline_data_to_extent(mapping,
+ inode,
+ flags,
+ fsdata);
+ goto out;
+ }
+
+ /*
+ * We cannot recurse into the filesystem as the transaction
+ * is already started.
+ */
+ flags |= AOP_FLAG_NOFS;
+
+ page = grab_cache_page_write_begin(mapping, 0, flags);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ ret = 0;
+ goto out_release_page;
+ }
+
+ if (!PageUptodate(page)) {
+ ret = ext4_read_inline_page(inode, page);
+ if (ret < 0)
+ goto out_release_page;
+ }
+
+ up_read(&EXT4_I(inode)->xattr_sem);
+ *pagep = page;
+ handle = NULL;
+ brelse(iloc.bh);
+ return 1;
+out_release_page:
+ up_read(&EXT4_I(inode)->xattr_sem);
+ unlock_page(page);
+ page_cache_release(page);
+out:
+ if (handle)
+ ext4_journal_stop(handle);
+ brelse(iloc.bh);
+ return ret;
+}
+
+int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
+ unsigned len, unsigned copied,
+ struct page *page)
+{
+ int i_size_changed = 0;
+
+ copied = ext4_write_inline_data_end(inode, pos, len, copied, page);
+
+ /*
+ * No need to use i_size_read() here, the i_size
+ * cannot change under us because we hold i_mutex.
+ *
+ * But it's important to update i_size while still holding page lock:
+ * page writeout could otherwise come in and zero beyond i_size.
+ */
+ if (pos+copied > inode->i_size) {
+ i_size_write(inode, pos+copied);
+ i_size_changed = 1;
+ }
+ unlock_page(page);
+ page_cache_release(page);
+
+ /*
+ * Don't mark the inode dirty under page lock. First, it unnecessarily
+ * makes the holding time of page lock longer. Second, it forces lock
+ * ordering of page lock and transaction start for journaling
+ * filesystems.
+ */
+ if (i_size_changed)
+ mark_inode_dirty(inode);
+
+ return copied;
+}
+
+#ifdef INLINE_DIR_DEBUG
+void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
+ void *inline_start, int inline_size)
+{
+ int offset;
+ unsigned short de_len;
+ struct ext4_dir_entry_2 *de = inline_start;
+ void *dlimit = inline_start + inline_size;
+
+ trace_printk("inode %lu\n", dir->i_ino);
+ offset = 0;
+ while ((void *)de < dlimit) {
+ de_len = ext4_rec_len_from_disk(de->rec_len, inline_size);
+ trace_printk("de: off %u rlen %u name %*.s nlen %u ino %u\n",
+ offset, de_len, de->name_len, de->name,
+ de->name_len, le32_to_cpu(de->inode));
+ if (ext4_check_dir_entry(dir, NULL, de, bh,
+ inline_start, inline_size, offset))
+ BUG();
+
+ offset += de_len;
+ de = (struct ext4_dir_entry_2 *) ((char *) de + de_len);
+ }
+}
+#else
+#define ext4_show_inline_dir(dir, bh, inline_start, inline_size)
+#endif
+
+/*
+ * Add a new entry into a inline dir.
+ * It will return -ENOSPC if no space is available, and -EIO
+ * and -EEXIST if directory entry already exists.
+ */
+static int ext4_add_dirent_to_inline(handle_t *handle,
+ struct dentry *dentry,
+ struct inode *inode,
+ struct ext4_iloc *iloc,
+ void *inline_start, int inline_size)
+{
+ struct inode *dir = dentry->d_parent->d_inode;
+ const char *name = dentry->d_name.name;
+ int namelen = dentry->d_name.len;
+ unsigned short reclen;
+ int err;
+ struct ext4_dir_entry_2 *de;
+
+ reclen = EXT4_DIR_REC_LEN(namelen);
+ err = ext4_find_dest_de(dir, inode, iloc->bh,
+ inline_start, inline_size,
+ name, namelen, &de);
+ if (err)
+ return err;
+
+ err = ext4_journal_get_write_access(handle, iloc->bh);
+ if (err)
+ return err;
+ ext4_insert_dentry(inode, de, inline_size, name, namelen);
+
+ ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
+
+ /*
+ * XXX shouldn't update any times until successful
+ * completion of syscall, but too many callers depend
+ * on this.
+ *
+ * XXX similarly, too many callers depend on
+ * ext4_new_inode() setting the times, but error
+ * recovery deletes the inode, so the worst that can
+ * happen is that the times are slightly out of date
+ * and/or different from the directory change time.
+ */
+ dir->i_mtime = dir->i_ctime = ext4_current_time(dir);
+ ext4_update_dx_flag(dir);
+ dir->i_version++;
+ ext4_mark_inode_dirty(handle, dir);
+ return 1;
+}
+
+static void *ext4_get_inline_xattr_pos(struct inode *inode,
+ struct ext4_iloc *iloc)
+{
+ struct ext4_xattr_entry *entry;
+ struct ext4_xattr_ibody_header *header;
+
+ BUG_ON(!EXT4_I(inode)->i_inline_off);
+
+ header = IHDR(inode, ext4_raw_inode(iloc));
+ entry = (struct ext4_xattr_entry *)((void *)ext4_raw_inode(iloc) +
+ EXT4_I(inode)->i_inline_off);
+
+ return (void *)IFIRST(header) + le16_to_cpu(entry->e_value_offs);
+}
+
+/* Set the final de to cover the whole block. */
+static void ext4_update_final_de(void *de_buf, int old_size, int new_size)
+{
+ struct ext4_dir_entry_2 *de, *prev_de;
+ void *limit;
+ int de_len;
+
+ de = (struct ext4_dir_entry_2 *)de_buf;
+ if (old_size) {
+ limit = de_buf + old_size;
+ do {
+ prev_de = de;
+ de_len = ext4_rec_len_from_disk(de->rec_len, old_size);
+ de_buf += de_len;
+ de = (struct ext4_dir_entry_2 *)de_buf;
+ } while (de_buf < limit);
+
+ prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size -
+ old_size, new_size);
+ } else {
+ /* this is just created, so create an empty entry. */
+ de->inode = 0;
+ de->rec_len = ext4_rec_len_to_disk(new_size, new_size);
+ }
+}
+
+static int ext4_update_inline_dir(handle_t *handle, struct inode *dir,
+ struct ext4_iloc *iloc)
+{
+ int ret;
+ int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE;
+ int new_size = get_max_inline_xattr_value_size(dir, iloc);
+
+ if (new_size - old_size <= EXT4_DIR_REC_LEN(1))
+ return -ENOSPC;
+
+ ret = ext4_update_inline_data(handle, dir,
+ new_size + EXT4_MIN_INLINE_DATA_SIZE);
+ if (ret)
+ return ret;
+
+ ext4_update_final_de(ext4_get_inline_xattr_pos(dir, iloc), old_size,
+ EXT4_I(dir)->i_inline_size -
+ EXT4_MIN_INLINE_DATA_SIZE);
+ dir->i_size = EXT4_I(dir)->i_disksize = EXT4_I(dir)->i_inline_size;
+ return 0;
+}
+
+static void ext4_restore_inline_data(handle_t *handle, struct inode *inode,
+ struct ext4_iloc *iloc,
+ void *buf, int inline_size)
+{
+ ext4_create_inline_data(handle, inode, inline_size);
+ ext4_write_inline_data(inode, iloc, buf, 0, inline_size);
+ ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+}
+
+static int ext4_finish_convert_inline_dir(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *dir_block,
+ void *buf,
+ int inline_size)
+{
+ int err, csum_size = 0, header_size = 0;
+ struct ext4_dir_entry_2 *de;
+ struct ext4_dir_entry_tail *t;
+ void *target = dir_block->b_data;
+
+ /*
+ * First create "." and ".." and then copy the dir information
+ * back to the block.
+ */
+ de = (struct ext4_dir_entry_2 *)target;
+ de = ext4_init_dot_dotdot(inode, de,
+ inode->i_sb->s_blocksize, csum_size,
+ le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1);
+ header_size = (void *)de - target;
+
+ memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE,
+ inline_size - EXT4_INLINE_DOTDOT_SIZE);
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
+
+ inode->i_size = inode->i_sb->s_blocksize;
+ i_size_write(inode, inode->i_sb->s_blocksize);
+ EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+ ext4_update_final_de(dir_block->b_data,
+ inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size,
+ inode->i_sb->s_blocksize - csum_size);
+
+ if (csum_size) {
+ t = EXT4_DIRENT_TAIL(dir_block->b_data,
+ inode->i_sb->s_blocksize);
+ initialize_dirent_tail(t, inode->i_sb->s_blocksize);
+ }
+ set_buffer_uptodate(dir_block);
+ err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
+ if (err)
+ goto out;
+ set_buffer_verified(dir_block);
+out:
+ return err;
+}
+
+static int ext4_convert_inline_data_nolock(handle_t *handle,
+ struct inode *inode,
+ struct ext4_iloc *iloc)
+{
+ int error;
+ void *buf = NULL;
+ struct buffer_head *data_bh = NULL;
+ struct ext4_map_blocks map;
+ int inline_size;
+
+ inline_size = ext4_get_inline_size(inode);
+ buf = kmalloc(inline_size, GFP_NOFS);
+ if (!buf) {
+ error = -ENOMEM;
+ goto out;
+ }
+
+ error = ext4_read_inline_data(inode, buf, inline_size, iloc);
+ if (error < 0)
+ goto out;
+
+ error = ext4_destroy_inline_data_nolock(handle, inode);
+ if (error)
+ goto out;
+
+ map.m_lblk = 0;
+ map.m_len = 1;
+ map.m_flags = 0;
+ error = ext4_map_blocks(handle, inode, &map, EXT4_GET_BLOCKS_CREATE);
+ if (error < 0)
+ goto out_restore;
+ if (!(map.m_flags & EXT4_MAP_MAPPED)) {
+ error = -EIO;
+ goto out_restore;
+ }
+
+ data_bh = sb_getblk(inode->i_sb, map.m_pblk);
+ if (!data_bh) {
+ error = -EIO;
+ goto out_restore;
+ }
+
+ lock_buffer(data_bh);
+ error = ext4_journal_get_create_access(handle, data_bh);
+ if (error) {
+ unlock_buffer(data_bh);
+ error = -EIO;
+ goto out_restore;
+ }
+ memset(data_bh->b_data, 0, inode->i_sb->s_blocksize);
+
+ if (!S_ISDIR(inode->i_mode)) {
+ memcpy(data_bh->b_data, buf, inline_size);
+ set_buffer_uptodate(data_bh);
+ error = ext4_handle_dirty_metadata(handle,
+ inode, data_bh);
+ } else {
+ error = ext4_finish_convert_inline_dir(handle, inode, data_bh,
+ buf, inline_size);
+ }
+
+ unlock_buffer(data_bh);
+out_restore:
+ if (error)
+ ext4_restore_inline_data(handle, inode, iloc, buf, inline_size);
+
+out:
+ brelse(data_bh);
+ kfree(buf);
+ return error;
+}
+
+/*
+ * Try to add the new entry to the inline data.
+ * If succeeds, return 0. If not, extended the inline dir and copied data to
+ * the new created block.
+ */
+int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+{
+ int ret, inline_size;
+ void *inline_start;
+ struct ext4_iloc iloc;
+ struct inode *dir = dentry->d_parent->d_inode;
+
+ ret = ext4_get_inode_loc(dir, &iloc);
+ if (ret)
+ return ret;
+
+ down_write(&EXT4_I(dir)->xattr_sem);
+ if (!ext4_has_inline_data(dir))
+ goto out;
+
+ inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
+ EXT4_INLINE_DOTDOT_SIZE;
+ inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
+
+ ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
+ inline_start, inline_size);
+ if (ret != -ENOSPC)
+ goto out;
+
+ /* check whether it can be inserted to inline xattr space. */
+ inline_size = EXT4_I(dir)->i_inline_size -
+ EXT4_MIN_INLINE_DATA_SIZE;
+ if (!inline_size) {
+ /* Try to use the xattr space.*/
+ ret = ext4_update_inline_dir(handle, dir, &iloc);
+ if (ret && ret != -ENOSPC)
+ goto out;
+
+ inline_size = EXT4_I(dir)->i_inline_size -
+ EXT4_MIN_INLINE_DATA_SIZE;
+ }
+
+ if (inline_size) {
+ inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
+
+ ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc,
+ inline_start, inline_size);
+
+ if (ret != -ENOSPC)
+ goto out;
+ }
+
+ /*
+ * The inline space is filled up, so create a new block for it.
+ * As the extent tree will be created, we have to save the inline
+ * dir first.
+ */
+ ret = ext4_convert_inline_data_nolock(handle, dir, &iloc);
+
+out:
+ ext4_mark_inode_dirty(handle, dir);
+ up_write(&EXT4_I(dir)->xattr_sem);
+ brelse(iloc.bh);
+ return ret;
+}
+
+int ext4_read_inline_dir(struct file *filp,
+ void *dirent, filldir_t filldir,
+ int *has_inline_data)
+{
+ int error = 0;
+ unsigned int offset, parent_ino;
+ int i, stored;
+ struct ext4_dir_entry_2 *de;
+ struct super_block *sb;
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ int ret, inline_size = 0;
+ struct ext4_iloc iloc;
+ void *dir_buf = NULL;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ up_read(&EXT4_I(inode)->xattr_sem);
+ *has_inline_data = 0;
+ goto out;
+ }
+
+ inline_size = ext4_get_inline_size(inode);
+ dir_buf = kmalloc(inline_size, GFP_NOFS);
+ if (!dir_buf) {
+ ret = -ENOMEM;
+ up_read(&EXT4_I(inode)->xattr_sem);
+ goto out;
+ }
+
+ ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
+ up_read(&EXT4_I(inode)->xattr_sem);
+ if (ret < 0)
+ goto out;
+
+ sb = inode->i_sb;
+ stored = 0;
+ parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
+
+ while (!error && !stored && filp->f_pos < inode->i_size) {
+revalidate:
+ /*
+ * If the version has changed since the last call to
+ * readdir(2), then we might be pointing to an invalid
+ * dirent right now. Scan from the start of the inline
+ * dir to make sure.
+ */
+ if (filp->f_version != inode->i_version) {
+ for (i = 0;
+ i < inode->i_size && i < offset;) {
+ if (!i) {
+ /* skip "." and ".." if needed. */
+ i += EXT4_INLINE_DOTDOT_SIZE;
+ continue;
+ }
+ de = (struct ext4_dir_entry_2 *)
+ (dir_buf + i);
+ /* It's too expensive to do a full
+ * dirent test each time round this
+ * loop, but we do have to test at
+ * least that it is non-zero. A
+ * failure will be detected in the
+ * dirent test below. */
+ if (ext4_rec_len_from_disk(de->rec_len,
+ inline_size) < EXT4_DIR_REC_LEN(1))
+ break;
+ i += ext4_rec_len_from_disk(de->rec_len,
+ inline_size);
+ }
+ offset = i;
+ filp->f_pos = offset;
+ filp->f_version = inode->i_version;
+ }
+
+ while (!error && filp->f_pos < inode->i_size) {
+ if (filp->f_pos == 0) {
+ error = filldir(dirent, ".", 1, 0, inode->i_ino,
+ DT_DIR);
+ if (error)
+ break;
+ stored++;
+
+ error = filldir(dirent, "..", 2, 0, parent_ino,
+ DT_DIR);
+ if (error)
+ break;
+ stored++;
+
+ filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE;
+ continue;
+ }
+
+ de = (struct ext4_dir_entry_2 *)(dir_buf + offset);
+ if (ext4_check_dir_entry(inode, filp, de,
+ iloc.bh, dir_buf,
+ inline_size, offset)) {
+ ret = stored;
+ goto out;
+ }
+ offset += ext4_rec_len_from_disk(de->rec_len,
+ inline_size);
+ if (le32_to_cpu(de->inode)) {
+ /* We might block in the next section
+ * if the data destination is
+ * currently swapped out. So, use a
+ * version stamp to detect whether or
+ * not the directory has been modified
+ * during the copy operation.
+ */
+ u64 version = filp->f_version;
+
+ error = filldir(dirent, de->name,
+ de->name_len,
+ filp->f_pos,
+ le32_to_cpu(de->inode),
+ get_dtype(sb, de->file_type));
+ if (error)
+ break;
+ if (version != filp->f_version)
+ goto revalidate;
+ stored++;
+ }
+ filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+ inline_size);
+ }
+ offset = 0;
+ }
+out:
+ kfree(dir_buf);
+ brelse(iloc.bh);
+ return ret;
+}
+
+struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
+ struct ext4_dir_entry_2 **parent_de,
+ int *retval)
+{
+ struct ext4_iloc iloc;
+
+ *retval = ext4_get_inode_loc(inode, &iloc);
+ if (*retval)
+ return NULL;
+
+ *parent_de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
+
+ return iloc.bh;
+}
+
+/*
+ * Try to create the inline data for the new dir.
+ * If it succeeds, return 0, otherwise return the error.
+ * In case of ENOSPC, the caller should create the normal disk layout dir.
+ */
+int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent,
+ struct inode *inode)
+{
+ int ret, inline_size = EXT4_MIN_INLINE_DATA_SIZE;
+ struct ext4_iloc iloc;
+ struct ext4_dir_entry_2 *de;
+
+ ret = ext4_get_inode_loc(inode, &iloc);
+ if (ret)
+ return ret;
+
+ ret = ext4_prepare_inline_data(handle, inode, inline_size);
+ if (ret)
+ goto out;
+
+ /*
+ * For inline dir, we only save the inode information for the ".."
+ * and create a fake dentry to cover the left space.
+ */
+ de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
+ de->inode = cpu_to_le32(parent->i_ino);
+ de = (struct ext4_dir_entry_2 *)((void *)de + EXT4_INLINE_DOTDOT_SIZE);
+ de->inode = 0;
+ de->rec_len = ext4_rec_len_to_disk(
+ inline_size - EXT4_INLINE_DOTDOT_SIZE,
+ inline_size);
+ set_nlink(inode, 2);
+ inode->i_size = EXT4_I(inode)->i_disksize = inline_size;
+out:
+ brelse(iloc.bh);
+ return ret;
+}
+
+struct buffer_head *ext4_find_inline_entry(struct inode *dir,
+ const struct qstr *d_name,
+ struct ext4_dir_entry_2 **res_dir,
+ int *has_inline_data)
+{
+ int ret;
+ struct ext4_iloc iloc;
+ void *inline_start;
+ int inline_size;
+
+ if (ext4_get_inode_loc(dir, &iloc))
+ return NULL;
+
+ down_read(&EXT4_I(dir)->xattr_sem);
+ if (!ext4_has_inline_data(dir)) {
+ *has_inline_data = 0;
+ goto out;
+ }
+
+ inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
+ EXT4_INLINE_DOTDOT_SIZE;
+ inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
+ ret = search_dir(iloc.bh, inline_start, inline_size,
+ dir, d_name, 0, res_dir);
+ if (ret == 1)
+ goto out_find;
+ if (ret < 0)
+ goto out;
+
+ if (ext4_get_inline_size(dir) == EXT4_MIN_INLINE_DATA_SIZE)
+ goto out;
+
+ inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
+ inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE;
+
+ ret = search_dir(iloc.bh, inline_start, inline_size,
+ dir, d_name, 0, res_dir);
+ if (ret == 1)
+ goto out_find;
+
+out:
+ brelse(iloc.bh);
+ iloc.bh = NULL;
+out_find:
+ up_read(&EXT4_I(dir)->xattr_sem);
+ return iloc.bh;
+}
+
+int ext4_delete_inline_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh,
+ int *has_inline_data)
+{
+ int err, inline_size;
+ struct ext4_iloc iloc;
+ void *inline_start;
+
+ err = ext4_get_inode_loc(dir, &iloc);
+ if (err)
+ return err;
+
+ down_write(&EXT4_I(dir)->xattr_sem);
+ if (!ext4_has_inline_data(dir)) {
+ *has_inline_data = 0;
+ goto out;
+ }
+
+ if ((void *)de_del - ((void *)ext4_raw_inode(&iloc)->i_block) <
+ EXT4_MIN_INLINE_DATA_SIZE) {
+ inline_start = (void *)ext4_raw_inode(&iloc)->i_block +
+ EXT4_INLINE_DOTDOT_SIZE;
+ inline_size = EXT4_MIN_INLINE_DATA_SIZE -
+ EXT4_INLINE_DOTDOT_SIZE;
+ } else {
+ inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
+ inline_size = ext4_get_inline_size(dir) -
+ EXT4_MIN_INLINE_DATA_SIZE;
+ }
+
+ err = ext4_journal_get_write_access(handle, bh);
+ if (err)
+ goto out;
+
+ err = ext4_generic_delete_entry(handle, dir, de_del, bh,
+ inline_start, inline_size, 0);
+ if (err)
+ goto out;
+
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_mark_inode_dirty(handle, dir);
+ if (unlikely(err))
+ goto out;
+
+ ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size);
+out:
+ up_write(&EXT4_I(dir)->xattr_sem);
+ brelse(iloc.bh);
+ if (err != -ENOENT)
+ ext4_std_error(dir->i_sb, err);
+ return err;
+}
+
+/*
+ * Get the inline dentry at offset.
+ */
+static inline struct ext4_dir_entry_2 *
+ext4_get_inline_entry(struct inode *inode,
+ struct ext4_iloc *iloc,
+ unsigned int offset,
+ void **inline_start,
+ int *inline_size)
+{
+ void *inline_pos;
+
+ BUG_ON(offset > ext4_get_inline_size(inode));
+
+ if (offset < EXT4_MIN_INLINE_DATA_SIZE) {
+ inline_pos = (void *)ext4_raw_inode(iloc)->i_block;
+ *inline_size = EXT4_MIN_INLINE_DATA_SIZE;
+ } else {
+ inline_pos = ext4_get_inline_xattr_pos(inode, iloc);
+ offset -= EXT4_MIN_INLINE_DATA_SIZE;
+ *inline_size = ext4_get_inline_size(inode) -
+ EXT4_MIN_INLINE_DATA_SIZE;
+ }
+
+ if (inline_start)
+ *inline_start = inline_pos;
+ return (struct ext4_dir_entry_2 *)(inline_pos + offset);
+}
+
+int empty_inline_dir(struct inode *dir, int *has_inline_data)
+{
+ int err, inline_size;
+ struct ext4_iloc iloc;
+ void *inline_pos;
+ unsigned int offset;
+ struct ext4_dir_entry_2 *de;
+ int ret = 1;
+
+ err = ext4_get_inode_loc(dir, &iloc);
+ if (err) {
+ EXT4_ERROR_INODE(dir, "error %d getting inode %lu block",
+ err, dir->i_ino);
+ return 1;
+ }
+
+ down_read(&EXT4_I(dir)->xattr_sem);
+ if (!ext4_has_inline_data(dir)) {
+ *has_inline_data = 0;
+ goto out;
+ }
+
+ de = (struct ext4_dir_entry_2 *)ext4_raw_inode(&iloc)->i_block;
+ if (!le32_to_cpu(de->inode)) {
+ ext4_warning(dir->i_sb,
+ "bad inline directory (dir #%lu) - no `..'",
+ dir->i_ino);
+ ret = 1;
+ goto out;
+ }
+
+ offset = EXT4_INLINE_DOTDOT_SIZE;
+ while (offset < dir->i_size) {
+ de = ext4_get_inline_entry(dir, &iloc, offset,
+ &inline_pos, &inline_size);
+ if (ext4_check_dir_entry(dir, NULL, de,
+ iloc.bh, inline_pos,
+ inline_size, offset)) {
+ ext4_warning(dir->i_sb,
+ "bad inline directory (dir #%lu) - "
+ "inode %u, rec_len %u, name_len %d"
+ "inline size %d\n",
+ dir->i_ino, le32_to_cpu(de->inode),
+ le16_to_cpu(de->rec_len), de->name_len,
+ inline_size);
+ ret = 1;
+ goto out;
+ }
+ if (le32_to_cpu(de->inode)) {
+ ret = 0;
+ goto out;
+ }
+ offset += ext4_rec_len_from_disk(de->rec_len, inline_size);
+ }
+
+out:
+ up_read(&EXT4_I(dir)->xattr_sem);
+ brelse(iloc.bh);
+ return ret;
+}
+
+int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
+{
+ int ret;
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+ ret = ext4_destroy_inline_data_nolock(handle, inode);
+ up_write(&EXT4_I(inode)->xattr_sem);
+
+ return ret;
+}
+
+int ext4_inline_data_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ int *has_inline)
+{
+ __u64 physical = 0;
+ __u64 length;
+ __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST;
+ int error = 0;
+ struct ext4_iloc iloc;
+
+ down_read(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ *has_inline = 0;
+ goto out;
+ }
+
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error)
+ goto out;
+
+ physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
+ physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
+ physical += offsetof(struct ext4_inode, i_block);
+ length = i_size_read(inode);
+
+ if (physical)
+ error = fiemap_fill_next_extent(fieinfo, 0, physical,
+ length, flags);
+ brelse(iloc.bh);
+out:
+ up_read(&EXT4_I(inode)->xattr_sem);
+ return (error < 0 ? error : 0);
+}
+
+/*
+ * Called during xattr set, and if we can sparse space 'needed',
+ * just create the extent tree evict the data to the outer block.
+ *
+ * We use jbd2 instead of page cache to move data to the 1st block
+ * so that the whole transaction can be committed as a whole and
+ * the data isn't lost because of the delayed page cache write.
+ */
+int ext4_try_to_evict_inline_data(handle_t *handle,
+ struct inode *inode,
+ int needed)
+{
+ int error;
+ struct ext4_xattr_entry *entry;
+ struct ext4_xattr_ibody_header *header;
+ struct ext4_inode *raw_inode;
+ struct ext4_iloc iloc;
+
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error)
+ return error;
+
+ raw_inode = ext4_raw_inode(&iloc);
+ header = IHDR(inode, raw_inode);
+ entry = (struct ext4_xattr_entry *)((void *)raw_inode +
+ EXT4_I(inode)->i_inline_off);
+ if (EXT4_XATTR_LEN(entry->e_name_len) +
+ EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size)) < needed) {
+ error = -ENOSPC;
+ goto out;
+ }
+
+ error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
+out:
+ brelse(iloc.bh);
+ return error;
+}
+
+void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
+{
+ handle_t *handle;
+ int inline_size, value_len, needed_blocks;
+ size_t i_size;
+ void *value = NULL;
+ struct ext4_xattr_ibody_find is = {
+ .s = { .not_found = -ENODATA, },
+ };
+ struct ext4_xattr_info i = {
+ .name_index = EXT4_XATTR_INDEX_SYSTEM,
+ .name = EXT4_XATTR_SYSTEM_DATA,
+ };
+
+
+ needed_blocks = ext4_writepage_trans_blocks(inode);
+ handle = ext4_journal_start(inode, needed_blocks);
+ if (IS_ERR(handle))
+ return;
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ *has_inline = 0;
+ ext4_journal_stop(handle);
+ return;
+ }
+
+ if (ext4_orphan_add(handle, inode))
+ goto out;
+
+ if (ext4_get_inode_loc(inode, &is.iloc))
+ goto out;
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ i_size = inode->i_size;
+ inline_size = ext4_get_inline_size(inode);
+ EXT4_I(inode)->i_disksize = i_size;
+
+ if (i_size < inline_size) {
+ /* Clear the content in the xattr space. */
+ if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) {
+ if (ext4_xattr_ibody_find(inode, &i, &is))
+ goto out_error;
+
+ BUG_ON(is.s.not_found);
+
+ value_len = le32_to_cpu(is.s.here->e_value_size);
+ value = kmalloc(value_len, GFP_NOFS);
+ if (!value)
+ goto out_error;
+
+ if (ext4_xattr_ibody_get(inode, i.name_index, i.name,
+ value, value_len))
+ goto out_error;
+
+ i.value = value;
+ i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ?
+ i_size - EXT4_MIN_INLINE_DATA_SIZE : 0;
+ if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is))
+ goto out_error;
+ }
+
+ /* Clear the content within i_blocks. */
+ if (i_size < EXT4_MIN_INLINE_DATA_SIZE)
+ memset(ext4_raw_inode(&is.iloc)->i_block + i_size, 0,
+ EXT4_MIN_INLINE_DATA_SIZE - i_size);
+
+ EXT4_I(inode)->i_inline_size = i_size <
+ EXT4_MIN_INLINE_DATA_SIZE ?
+ EXT4_MIN_INLINE_DATA_SIZE : i_size;
+ }
+
+out_error:
+ up_write(&EXT4_I(inode)->i_data_sem);
+out:
+ brelse(is.iloc.bh);
+ up_write(&EXT4_I(inode)->xattr_sem);
+ kfree(value);
+ if (inode->i_nlink)
+ ext4_orphan_del(handle, inode);
+
+ inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+ ext4_mark_inode_dirty(handle, inode);
+ if (IS_SYNC(inode))
+ ext4_handle_sync(handle);
+
+ ext4_journal_stop(handle);
+ return;
+}
+
+int ext4_convert_inline_data(struct inode *inode)
+{
+ int error, needed_blocks;
+ handle_t *handle;
+ struct ext4_iloc iloc;
+
+ if (!ext4_has_inline_data(inode)) {
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ return 0;
+ }
+
+ needed_blocks = ext4_writepage_trans_blocks(inode);
+
+ iloc.bh = NULL;
+ error = ext4_get_inode_loc(inode, &iloc);
+ if (error)
+ return error;
+
+ handle = ext4_journal_start(inode, needed_blocks);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
+ goto out_free;
+ }
+
+ down_write(&EXT4_I(inode)->xattr_sem);
+ if (!ext4_has_inline_data(inode)) {
+ up_write(&EXT4_I(inode)->xattr_sem);
+ goto out;
+ }
+
+ error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
+ up_write(&EXT4_I(inode)->xattr_sem);
+out:
+ ext4_journal_stop(handle);
+out_free:
+ brelse(iloc.bh);
+ return error;
+}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b3c243b9afa..cb1c1ab2720 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -484,49 +484,6 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
}
/*
- * Sets the BH_Da_Mapped bit on the buffer heads corresponding to the given map.
- */
-static void set_buffers_da_mapped(struct inode *inode,
- struct ext4_map_blocks *map)
-{
- struct address_space *mapping = inode->i_mapping;
- struct pagevec pvec;
- int i, nr_pages;
- pgoff_t index, end;
-
- index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
- end = (map->m_lblk + map->m_len - 1) >>
- (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
- pagevec_init(&pvec, 0);
- while (index <= end) {
- nr_pages = pagevec_lookup(&pvec, mapping, index,
- min(end - index + 1,
- (pgoff_t)PAGEVEC_SIZE));
- if (nr_pages == 0)
- break;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pvec.pages[i];
- struct buffer_head *bh, *head;
-
- if (unlikely(page->mapping != mapping) ||
- !PageDirty(page))
- break;
-
- if (page_has_buffers(page)) {
- bh = head = page_buffers(page);
- do {
- set_buffer_da_mapped(bh);
- bh = bh->b_this_page;
- } while (bh != head);
- }
- index++;
- }
- pagevec_release(&pvec);
- }
-}
-
-/*
* The ext4_map_blocks() function tries to look up the requested blocks,
* and returns if the blocks are already mapped.
*
@@ -574,7 +531,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
up_read((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
- int ret = check_block_validity(inode, map);
+ int ret;
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+ /* delayed alloc may be allocated by fallocate and
+ * coverted to initialized by directIO.
+ * we need to handle delayed extent here.
+ */
+ down_write((&EXT4_I(inode)->i_data_sem));
+ goto delayed_mapped;
+ }
+ ret = check_block_validity(inode, map);
if (ret != 0)
return ret;
}
@@ -652,12 +618,15 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED);
- /* If we have successfully mapped the delayed allocated blocks,
- * set the BH_Da_Mapped bit on them. Its important to do this
- * under the protection of i_data_sem.
- */
- if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
- set_buffers_da_mapped(inode, map);
+ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
+ int ret;
+delayed_mapped:
+ /* delayed allocation blocks has been allocated */
+ ret = ext4_es_remove_extent(inode, map->m_lblk,
+ map->m_len);
+ if (ret < 0)
+ retval = ret;
+ }
}
up_write((&EXT4_I(inode)->i_data_sem));
@@ -680,10 +649,13 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
int ret = 0, started = 0;
int dio_credits;
+ if (ext4_has_inline_data(inode))
+ return -ERANGE;
+
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
- if (flags && !handle) {
+ if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) {
/* Direct IO write... */
if (map.m_len > DIO_MAX_BLOCKS)
map.m_len = DIO_MAX_BLOCKS;
@@ -798,13 +770,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
return NULL;
}
-static int walk_page_buffers(handle_t *handle,
- struct buffer_head *head,
- unsigned from,
- unsigned to,
- int *partial,
- int (*fn)(handle_t *handle,
- struct buffer_head *bh))
+int ext4_walk_page_buffers(handle_t *handle,
+ struct buffer_head *head,
+ unsigned from,
+ unsigned to,
+ int *partial,
+ int (*fn)(handle_t *handle,
+ struct buffer_head *bh))
{
struct buffer_head *bh;
unsigned block_start, block_end;
@@ -854,8 +826,8 @@ static int walk_page_buffers(handle_t *handle,
* is elevated. We'll still have enough credits for the tiny quotafile
* write.
*/
-static int do_journal_get_write_access(handle_t *handle,
- struct buffer_head *bh)
+int do_journal_get_write_access(handle_t *handle,
+ struct buffer_head *bh)
{
int dirty = buffer_dirty(bh);
int ret;
@@ -878,7 +850,7 @@ static int do_journal_get_write_access(handle_t *handle,
return ret;
}
-static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
static int ext4_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
@@ -902,6 +874,17 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
+ flags, pagep);
+ if (ret < 0)
+ goto out;
+ if (ret == 1) {
+ ret = 0;
+ goto out;
+ }
+ }
+
retry:
handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
@@ -919,6 +902,7 @@ retry:
ret = -ENOMEM;
goto out;
}
+
*pagep = page;
if (ext4_should_dioread_nolock(inode))
@@ -927,8 +911,9 @@ retry:
ret = __block_write_begin(page, pos, len, ext4_get_block);
if (!ret && ext4_should_journal_data(inode)) {
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, do_journal_get_write_access);
+ ret = ext4_walk_page_buffers(handle, page_buffers(page),
+ from, to, NULL,
+ do_journal_get_write_access);
}
if (ret) {
@@ -983,7 +968,12 @@ static int ext4_generic_write_end(struct file *file,
struct inode *inode = mapping->host;
handle_t *handle = ext4_journal_current_handle();
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+ if (ext4_has_inline_data(inode))
+ copied = ext4_write_inline_data_end(inode, pos, len,
+ copied, page);
+ else
+ copied = block_write_end(file, mapping, pos,
+ len, copied, page, fsdata);
/*
* No need to use i_size_read() here, the i_size
@@ -1134,16 +1124,21 @@ static int ext4_journalled_write_end(struct file *file,
BUG_ON(!ext4_handle_valid(handle));
- if (copied < len) {
- if (!PageUptodate(page))
- copied = 0;
- page_zero_new_buffers(page, from+copied, to);
- }
+ if (ext4_has_inline_data(inode))
+ copied = ext4_write_inline_data_end(inode, pos, len,
+ copied, page);
+ else {
+ if (copied < len) {
+ if (!PageUptodate(page))
+ copied = 0;
+ page_zero_new_buffers(page, from+copied, to);
+ }
- ret = walk_page_buffers(handle, page_buffers(page), from,
- to, &partial, write_end_fn);
- if (!partial)
- SetPageUptodate(page);
+ ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
+ to, &partial, write_end_fn);
+ if (!partial)
+ SetPageUptodate(page);
+ }
new_i_size = pos + copied;
if (new_i_size > inode->i_size)
i_size_write(inode, pos+copied);
@@ -1301,6 +1296,7 @@ static void ext4_da_page_release_reservation(struct page *page,
struct inode *inode = page->mapping->host;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int num_clusters;
+ ext4_fsblk_t lblk;
head = page_buffers(page);
bh = head;
@@ -1310,20 +1306,23 @@ static void ext4_da_page_release_reservation(struct page *page,
if ((offset <= curr_off) && (buffer_delay(bh))) {
to_release++;
clear_buffer_delay(bh);
- clear_buffer_da_mapped(bh);
}
curr_off = next_off;
} while ((bh = bh->b_this_page) != head);
+ if (to_release) {
+ lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ ext4_es_remove_extent(inode, lblk, to_release);
+ }
+
/* If we have released all the blocks belonging to a cluster, then we
* need to release the reserved space for that cluster. */
num_clusters = EXT4_NUM_B2C(sbi, to_release);
while (num_clusters > 0) {
- ext4_fsblk_t lblk;
lblk = (page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits)) +
((num_clusters - 1) << sbi->s_cluster_bits);
if (sbi->s_cluster_ratio == 1 ||
- !ext4_find_delalloc_cluster(inode, lblk, 1))
+ !ext4_find_delalloc_cluster(inode, lblk))
ext4_da_release_space(inode, 1);
num_clusters--;
@@ -1429,8 +1428,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
clear_buffer_delay(bh);
bh->b_blocknr = pblock;
}
- if (buffer_da_mapped(bh))
- clear_buffer_da_mapped(bh);
if (buffer_unwritten(bh) ||
buffer_mapped(bh))
BUG_ON(bh->b_blocknr != pblock);
@@ -1500,9 +1497,16 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
struct pagevec pvec;
struct inode *inode = mpd->inode;
struct address_space *mapping = inode->i_mapping;
+ ext4_lblk_t start, last;
index = mpd->first_page;
end = mpd->next_page - 1;
+
+ start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ ext4_es_remove_extent(inode, start, last - start + 1);
+
+ pagevec_init(&pvec, 0);
while (index <= end) {
nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
if (nr_pages == 0)
@@ -1656,15 +1660,6 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
for (i = 0; i < map.m_len; i++)
unmap_underlying_metadata(bdev, map.m_pblk + i);
-
- if (ext4_should_order_data(mpd->inode)) {
- err = ext4_jbd2_file_inode(handle, mpd->inode);
- if (err) {
- /* Only if the journal is aborted */
- mpd->retval = err;
- goto submit_io;
- }
- }
}
/*
@@ -1795,7 +1790,19 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
* file system block.
*/
down_read((&EXT4_I(inode)->i_data_sem));
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ if (ext4_has_inline_data(inode)) {
+ /*
+ * We will soon create blocks for this page, and let
+ * us pretend as if the blocks aren't allocated yet.
+ * In case of clusters, we have to handle the work
+ * of mapping from cluster so that the reserved space
+ * is calculated properly.
+ */
+ if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) &&
+ ext4_find_delalloc_cluster(inode, map->m_lblk))
+ map->m_flags |= EXT4_MAP_FROM_CLUSTER;
+ retval = 0;
+ } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
retval = ext4_ext_map_blocks(NULL, inode, map, 0);
else
retval = ext4_ind_map_blocks(NULL, inode, map, 0);
@@ -1814,6 +1821,10 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
goto out_unlock;
}
+ retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len);
+ if (retval)
+ goto out_unlock;
+
/* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served
* and it should not appear on the bh->b_state.
*/
@@ -1842,8 +1853,8 @@ out_unlock:
* We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
* initialized properly.
*/
-static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create)
+int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
{
struct ext4_map_blocks map;
int ret = 0;
@@ -1917,15 +1928,29 @@ static int __ext4_journalled_writepage(struct page *page,
{
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
- struct buffer_head *page_bufs;
+ struct buffer_head *page_bufs = NULL;
handle_t *handle = NULL;
- int ret = 0;
- int err;
+ int ret = 0, err = 0;
+ int inline_data = ext4_has_inline_data(inode);
+ struct buffer_head *inode_bh = NULL;
ClearPageChecked(page);
- page_bufs = page_buffers(page);
- BUG_ON(!page_bufs);
- walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one);
+
+ if (inline_data) {
+ BUG_ON(page->index != 0);
+ BUG_ON(len > ext4_get_max_inline_size(inode));
+ inode_bh = ext4_journalled_write_inline_data(inode, len, page);
+ if (inode_bh == NULL)
+ goto out;
+ } else {
+ page_bufs = page_buffers(page);
+ if (!page_bufs) {
+ BUG();
+ goto out;
+ }
+ ext4_walk_page_buffers(handle, page_bufs, 0, len,
+ NULL, bget_one);
+ }
/* As soon as we unlock the page, it can go away, but we have
* references to buffers so we are safe */
unlock_page(page);
@@ -1938,11 +1963,18 @@ static int __ext4_journalled_writepage(struct page *page,
BUG_ON(!ext4_handle_valid(handle));
- ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
- do_journal_get_write_access);
+ if (inline_data) {
+ ret = ext4_journal_get_write_access(handle, inode_bh);
+
+ err = ext4_handle_dirty_metadata(handle, inode, inode_bh);
- err = walk_page_buffers(handle, page_bufs, 0, len, NULL,
- write_end_fn);
+ } else {
+ ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
+ do_journal_get_write_access);
+
+ err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL,
+ write_end_fn);
+ }
if (ret == 0)
ret = err;
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
@@ -1950,9 +1982,12 @@ static int __ext4_journalled_writepage(struct page *page,
if (!ret)
ret = err;
- walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one);
+ if (!ext4_has_inline_data(inode))
+ ext4_walk_page_buffers(handle, page_bufs, 0, len,
+ NULL, bput_one);
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
out:
+ brelse(inode_bh);
return ret;
}
@@ -2029,8 +2064,8 @@ static int ext4_writepage(struct page *page,
commit_write = 1;
}
page_bufs = page_buffers(page);
- if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
- ext4_bh_delay_or_unwritten)) {
+ if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_delay_or_unwritten)) {
/*
* We don't want to do block allocation, so redirty
* the page and return. We may reach here when we do
@@ -2096,7 +2131,8 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
* mpage_da_map_and_submit to map a single contiguous memory region
* and then write them.
*/
-static int write_cache_pages_da(struct address_space *mapping,
+static int write_cache_pages_da(handle_t *handle,
+ struct address_space *mapping,
struct writeback_control *wbc,
struct mpage_da_data *mpd,
pgoff_t *done_index)
@@ -2175,6 +2211,17 @@ static int write_cache_pages_da(struct address_space *mapping,
wait_on_page_writeback(page);
BUG_ON(PageWriteback(page));
+ /*
+ * If we have inline data and arrive here, it means that
+ * we will soon create the block for the 1st page, so
+ * we'd better clear the inline data here.
+ */
+ if (ext4_has_inline_data(inode)) {
+ BUG_ON(ext4_test_inode_state(inode,
+ EXT4_STATE_MAY_INLINE_DATA));
+ ext4_destroy_inline_data(handle, inode);
+ }
+
if (mpd->next_page != page->index)
mpd->first_page = page->index;
mpd->next_page = page->index + 1;
@@ -2381,7 +2428,8 @@ retry:
* contiguous region of logical blocks that need
* blocks to be allocated by ext4 and submit them.
*/
- ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
+ ret = write_cache_pages_da(handle, mapping,
+ wbc, &mpd, &done_index);
/*
* If we have a contiguous extent of pages and we
* haven't done the I/O yet, map the blocks and submit
@@ -2445,7 +2493,6 @@ out_writepages:
return ret;
}
-#define FALL_BACK_TO_NONDELALLOC 1
static int ext4_nonda_switch(struct super_block *sb)
{
s64 free_blocks, dirty_blocks;
@@ -2502,6 +2549,19 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
}
*fsdata = (void *)0;
trace_ext4_da_write_begin(inode, pos, len, flags);
+
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ ret = ext4_da_write_inline_data_begin(mapping, inode,
+ pos, len, flags,
+ pagep, fsdata);
+ if (ret < 0)
+ goto out;
+ if (ret == 1) {
+ ret = 0;
+ goto out;
+ }
+ }
+
retry:
/*
* With delayed allocation, we don't log the i_disksize update
@@ -2603,22 +2663,13 @@ static int ext4_da_write_end(struct file *file,
* changes. So let's piggyback the i_disksize mark_inode_dirty
* into that.
*/
-
new_i_size = pos + copied;
if (copied && new_i_size > EXT4_I(inode)->i_disksize) {
- if (ext4_da_should_update_i_disksize(page, end)) {
+ if (ext4_has_inline_data(inode) ||
+ ext4_da_should_update_i_disksize(page, end)) {
down_write(&EXT4_I(inode)->i_data_sem);
- if (new_i_size > EXT4_I(inode)->i_disksize) {
- /*
- * Updating i_disksize when extending file
- * without needing block allocation
- */
- if (ext4_should_order_data(inode))
- ret = ext4_jbd2_file_inode(handle,
- inode);
-
+ if (new_i_size > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = new_i_size;
- }
up_write(&EXT4_I(inode)->i_data_sem);
/* We need to mark inode dirty even if
* new_i_size is less that inode->i_size
@@ -2627,8 +2678,16 @@ static int ext4_da_write_end(struct file *file,
ext4_mark_inode_dirty(handle, inode);
}
}
- ret2 = generic_write_end(file, mapping, pos, len, copied,
+
+ if (write_mode != CONVERT_INLINE_DATA &&
+ ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
+ ext4_has_inline_data(inode))
+ ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied,
+ page);
+ else
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
+
copied = ret2;
if (ret2 < 0)
ret = ret2;
@@ -2721,6 +2780,12 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
journal_t *journal;
int err;
+ /*
+ * We can get here for an inline file via the FIBMAP ioctl
+ */
+ if (ext4_has_inline_data(inode))
+ return 0;
+
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
test_opt(inode->i_sb, DELALLOC)) {
/*
@@ -2766,14 +2831,30 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
static int ext4_readpage(struct file *file, struct page *page)
{
+ int ret = -EAGAIN;
+ struct inode *inode = page->mapping->host;
+
trace_ext4_readpage(page);
- return mpage_readpage(page, ext4_get_block);
+
+ if (ext4_has_inline_data(inode))
+ ret = ext4_readpage_inline(inode, page);
+
+ if (ret == -EAGAIN)
+ return mpage_readpage(page, ext4_get_block);
+
+ return ret;
}
static int
ext4_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
+ struct inode *inode = mapping->host;
+
+ /* If the file has inline data, no need to do readpages. */
+ if (ext4_has_inline_data(inode))
+ return 0;
+
return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
}
@@ -2840,7 +2921,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
* We allocate an uinitialized extent if blocks haven't been allocated.
* The extent will be converted to initialized after the IO is complete.
*/
-static int ext4_get_block_write(struct inode *inode, sector_t iblock,
+int ext4_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
@@ -2850,29 +2931,12 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
}
static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int flags)
+ struct buffer_head *bh_result, int create)
{
- handle_t *handle = ext4_journal_current_handle();
- struct ext4_map_blocks map;
- int ret = 0;
-
- ext4_debug("ext4_get_block_write_nolock: inode %lu, flag %d\n",
- inode->i_ino, flags);
-
- flags = EXT4_GET_BLOCKS_NO_LOCK;
-
- map.m_lblk = iblock;
- map.m_len = bh_result->b_size >> inode->i_blkbits;
-
- ret = ext4_map_blocks(handle, inode, &map, flags);
- if (ret > 0) {
- map_bh(bh_result, inode->i_sb, map.m_pblk);
- bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) |
- map.m_flags;
- bh_result->b_size = inode->i_sb->s_blocksize * map.m_len;
- ret = 0;
- }
- return ret;
+ ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ return _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_NO_LOCK);
}
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
@@ -2978,10 +3042,10 @@ retry:
* fall back to buffered IO.
*
* For holes, we fallocate those blocks, mark them as uninitialized
- * If those blocks were preallocated, we mark sure they are splited, but
+ * If those blocks were preallocated, we mark sure they are split, but
* still keep the range to write as uninitialized.
*
- * The unwrritten extents will be converted to written when DIO is completed.
+ * The unwritten extents will be converted to written when DIO is completed.
* For async direct IO, since the IO may still pending when return, we
* set up an end_io call back function, which will do the conversion
* when async direct IO completed.
@@ -2999,125 +3063,120 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
struct inode *inode = file->f_mapping->host;
ssize_t ret;
size_t count = iov_length(iov, nr_segs);
-
+ int overwrite = 0;
+ get_block_t *get_block_func = NULL;
+ int dio_flags = 0;
loff_t final_size = offset + count;
- if (rw == WRITE && final_size <= inode->i_size) {
- int overwrite = 0;
- BUG_ON(iocb->private == NULL);
+ /* Use the old path for reads and writes beyond i_size. */
+ if (rw != WRITE || final_size > inode->i_size)
+ return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
- /* If we do a overwrite dio, i_mutex locking can be released */
- overwrite = *((int *)iocb->private);
+ BUG_ON(iocb->private == NULL);
- if (overwrite) {
- atomic_inc(&inode->i_dio_count);
- down_read(&EXT4_I(inode)->i_data_sem);
- mutex_unlock(&inode->i_mutex);
- }
+ /* If we do a overwrite dio, i_mutex locking can be released */
+ overwrite = *((int *)iocb->private);
- /*
- * We could direct write to holes and fallocate.
- *
- * Allocated blocks to fill the hole are marked as uninitialized
- * to prevent parallel buffered read to expose the stale data
- * before DIO complete the data IO.
- *
- * As to previously fallocated extents, ext4 get_block
- * will just simply mark the buffer mapped but still
- * keep the extents uninitialized.
- *
- * for non AIO case, we will convert those unwritten extents
- * to written after return back from blockdev_direct_IO.
- *
- * for async DIO, the conversion needs to be defered when
- * the IO is completed. The ext4 end_io callback function
- * will be called to take care of the conversion work.
- * Here for async case, we allocate an io_end structure to
- * hook to the iocb.
- */
- iocb->private = NULL;
- ext4_inode_aio_set(inode, NULL);
- if (!is_sync_kiocb(iocb)) {
- ext4_io_end_t *io_end =
- ext4_init_io_end(inode, GFP_NOFS);
- if (!io_end) {
- ret = -ENOMEM;
- goto retake_lock;
- }
- io_end->flag |= EXT4_IO_END_DIRECT;
- iocb->private = io_end;
- /*
- * we save the io structure for current async
- * direct IO, so that later ext4_map_blocks()
- * could flag the io structure whether there
- * is a unwritten extents needs to be converted
- * when IO is completed.
- */
- ext4_inode_aio_set(inode, io_end);
- }
+ if (overwrite) {
+ atomic_inc(&inode->i_dio_count);
+ down_read(&EXT4_I(inode)->i_data_sem);
+ mutex_unlock(&inode->i_mutex);
+ }
- if (overwrite)
- ret = __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iov,
- offset, nr_segs,
- ext4_get_block_write_nolock,
- ext4_end_io_dio,
- NULL,
- 0);
- else
- ret = __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iov,
- offset, nr_segs,
- ext4_get_block_write,
- ext4_end_io_dio,
- NULL,
- DIO_LOCKING);
- if (iocb->private)
- ext4_inode_aio_set(inode, NULL);
+ /*
+ * We could direct write to holes and fallocate.
+ *
+ * Allocated blocks to fill the hole are marked as
+ * uninitialized to prevent parallel buffered read to expose
+ * the stale data before DIO complete the data IO.
+ *
+ * As to previously fallocated extents, ext4 get_block will
+ * just simply mark the buffer mapped but still keep the
+ * extents uninitialized.
+ *
+ * For non AIO case, we will convert those unwritten extents
+ * to written after return back from blockdev_direct_IO.
+ *
+ * For async DIO, the conversion needs to be deferred when the
+ * IO is completed. The ext4 end_io callback function will be
+ * called to take care of the conversion work. Here for async
+ * case, we allocate an io_end structure to hook to the iocb.
+ */
+ iocb->private = NULL;
+ ext4_inode_aio_set(inode, NULL);
+ if (!is_sync_kiocb(iocb)) {
+ ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);
+ if (!io_end) {
+ ret = -ENOMEM;
+ goto retake_lock;
+ }
+ io_end->flag |= EXT4_IO_END_DIRECT;
+ iocb->private = io_end;
/*
- * The io_end structure takes a reference to the inode,
- * that structure needs to be destroyed and the
- * reference to the inode need to be dropped, when IO is
- * complete, even with 0 byte write, or failed.
- *
- * In the successful AIO DIO case, the io_end structure will be
- * desctroyed and the reference to the inode will be dropped
- * after the end_io call back function is called.
- *
- * In the case there is 0 byte write, or error case, since
- * VFS direct IO won't invoke the end_io call back function,
- * we need to free the end_io structure here.
+ * we save the io structure for current async direct
+ * IO, so that later ext4_map_blocks() could flag the
+ * io structure whether there is a unwritten extents
+ * needs to be converted when IO is completed.
*/
- if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
- ext4_free_io_end(iocb->private);
- iocb->private = NULL;
- } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
- EXT4_STATE_DIO_UNWRITTEN)) {
- int err;
- /*
- * for non AIO case, since the IO is already
- * completed, we could do the conversion right here
- */
- err = ext4_convert_unwritten_extents(inode,
- offset, ret);
- if (err < 0)
- ret = err;
- ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
- }
+ ext4_inode_aio_set(inode, io_end);
+ }
- retake_lock:
- /* take i_mutex locking again if we do a ovewrite dio */
- if (overwrite) {
- inode_dio_done(inode);
- up_read(&EXT4_I(inode)->i_data_sem);
- mutex_lock(&inode->i_mutex);
- }
+ if (overwrite) {
+ get_block_func = ext4_get_block_write_nolock;
+ } else {
+ get_block_func = ext4_get_block_write;
+ dio_flags = DIO_LOCKING;
+ }
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ get_block_func,
+ ext4_end_io_dio,
+ NULL,
+ dio_flags);
+
+ if (iocb->private)
+ ext4_inode_aio_set(inode, NULL);
+ /*
+ * The io_end structure takes a reference to the inode, that
+ * structure needs to be destroyed and the reference to the
+ * inode need to be dropped, when IO is complete, even with 0
+ * byte write, or failed.
+ *
+ * In the successful AIO DIO case, the io_end structure will
+ * be destroyed and the reference to the inode will be dropped
+ * after the end_io call back function is called.
+ *
+ * In the case there is 0 byte write, or error case, since VFS
+ * direct IO won't invoke the end_io call back function, we
+ * need to free the end_io structure here.
+ */
+ if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
+ ext4_free_io_end(iocb->private);
+ iocb->private = NULL;
+ } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
+ EXT4_STATE_DIO_UNWRITTEN)) {
+ int err;
+ /*
+ * for non AIO case, since the IO is already
+ * completed, we could do the conversion right here
+ */
+ err = ext4_convert_unwritten_extents(inode,
+ offset, ret);
+ if (err < 0)
+ ret = err;
+ ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+ }
- return ret;
+retake_lock:
+ /* take i_mutex locking again if we do a ovewrite dio */
+ if (overwrite) {
+ inode_dio_done(inode);
+ up_read(&EXT4_I(inode)->i_data_sem);
+ mutex_lock(&inode->i_mutex);
}
- /* for write the the end of file case, we fall back to old way */
- return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+ return ret;
}
static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
@@ -3134,6 +3193,10 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
if (ext4_should_journal_data(inode))
return 0;
+ /* Let buffer I/O handle the inline data case. */
+ if (ext4_has_inline_data(inode))
+ return 0;
+
trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -3531,6 +3594,14 @@ void ext4_truncate(struct inode *inode)
if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
+ if (ext4_has_inline_data(inode)) {
+ int has_inline = 1;
+
+ ext4_inline_data_truncate(inode, &has_inline);
+ if (has_inline)
+ return;
+ }
+
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ext4_ext_truncate(inode);
else
@@ -3756,6 +3827,19 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
}
}
+static inline void ext4_iget_extra_inode(struct inode *inode,
+ struct ext4_inode *raw_inode,
+ struct ext4_inode_info *ei)
+{
+ __le32 *magic = (void *)raw_inode +
+ EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
+ if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+ ext4_find_inline_data_nolock(inode);
+ } else
+ EXT4_I(inode)->i_inline_off = 0;
+}
+
struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
{
struct ext4_iloc iloc;
@@ -3826,6 +3910,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
+ ei->i_inline_off = 0;
ei->i_dir_start_lookup = 0;
ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
/* We now have enough fields to check if the inode was active or not.
@@ -3898,11 +3983,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
} else {
- __le32 *magic = (void *)raw_inode +
- EXT4_GOOD_OLD_INODE_SIZE +
- ei->i_extra_isize;
- if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
- ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+ ext4_iget_extra_inode(inode, raw_inode, ei);
}
}
@@ -3925,17 +4006,19 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_file_acl);
ret = -EIO;
goto bad_inode;
- } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
- if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- (S_ISLNK(inode->i_mode) &&
- !ext4_inode_is_fast_symlink(inode)))
- /* Validate extent which is part of inode */
- ret = ext4_ext_check_inode(inode);
- } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- (S_ISLNK(inode->i_mode) &&
- !ext4_inode_is_fast_symlink(inode))) {
- /* Validate block references which are part of inode */
- ret = ext4_ind_check_inode(inode);
+ } else if (!ext4_has_inline_data(inode)) {
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
+ if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode))))
+ /* Validate extent which is part of inode */
+ ret = ext4_ext_check_inode(inode);
+ } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode))) {
+ /* Validate block references which are part of inode */
+ ret = ext4_ind_check_inode(inode);
+ }
}
if (ret)
goto bad_inode;
@@ -4122,9 +4205,10 @@ static int ext4_do_update_inode(handle_t *handle,
cpu_to_le32(new_encode_dev(inode->i_rdev));
raw_inode->i_block[2] = 0;
}
- } else
+ } else if (!ext4_has_inline_data(inode)) {
for (block = 0; block < EXT4_N_BLOCKS; block++)
raw_inode->i_block[block] = ei->i_data[block];
+ }
raw_inode->i_disk_version = cpu_to_le32(inode->i_version);
if (ei->i_extra_isize) {
@@ -4811,8 +4895,9 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
* journal_start/journal_stop which can block and take a long time
*/
if (page_has_buffers(page)) {
- if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
- ext4_bh_unmapped)) {
+ if (!ext4_walk_page_buffers(NULL, page_buffers(page),
+ 0, len, NULL,
+ ext4_bh_unmapped)) {
/* Wait so that we don't change page under IO */
wait_on_page_writeback(page);
ret = VM_FAULT_LOCKED;
@@ -4833,7 +4918,7 @@ retry_alloc:
}
ret = __block_page_mkwrite(vma, vmf, get_block);
if (!ret && ext4_should_journal_data(inode)) {
- if (walk_page_buffers(handle, page_buffers(page), 0,
+ if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
unlock_page(page);
ret = VM_FAULT_SIGBUS;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 526e5535860..1bf6fe785c4 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1373,7 +1373,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block,
ex->fe_start += next;
while (needed > ex->fe_len &&
- (buddy = mb_find_buddy(e4b, order, &max))) {
+ mb_find_buddy(e4b, order, &max)) {
if (block + 1 >= max)
break;
@@ -2607,9 +2607,17 @@ static void ext4_free_data_callback(struct super_block *sb,
mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
entry->efd_count, entry->efd_group, entry);
- if (test_opt(sb, DISCARD))
- ext4_issue_discard(sb, entry->efd_group,
- entry->efd_start_cluster, entry->efd_count);
+ if (test_opt(sb, DISCARD)) {
+ err = ext4_issue_discard(sb, entry->efd_group,
+ entry->efd_start_cluster,
+ entry->efd_count);
+ if (err && err != -EOPNOTSUPP)
+ ext4_msg(sb, KERN_WARNING, "discard request in"
+ " group:%d block:%d count:%d failed"
+ " with %d", entry->efd_group,
+ entry->efd_start_cluster,
+ entry->efd_count, err);
+ }
err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
/* we expect to find existing buddy because it's pinned */
@@ -4310,8 +4318,10 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
repeat:
/* allocate space in core */
*errp = ext4_mb_regular_allocator(ac);
- if (*errp)
+ if (*errp) {
+ ext4_discard_allocated_blocks(ac);
goto errout;
+ }
/* as we've just preallocated more space than
* user requested orinally, we store allocated
@@ -4333,10 +4343,10 @@ repeat:
ac->ac_b_ex.fe_len = 0;
ac->ac_status = AC_STATUS_CONTINUE;
goto repeat;
- } else if (*errp)
- errout:
+ } else if (*errp) {
ext4_discard_allocated_blocks(ac);
- else {
+ goto errout;
+ } else {
block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
ar->len = ac->ac_b_ex.fe_len;
}
@@ -4347,6 +4357,7 @@ repeat:
*errp = -ENOSPC;
}
+errout:
if (*errp) {
ac->ac_b_ex.fe_len = 0;
ar->len = 0;
@@ -4656,8 +4667,16 @@ do_more:
* with group lock held. generate_buddy look at
* them with group lock_held
*/
- if (test_opt(sb, DISCARD))
- ext4_issue_discard(sb, block_group, bit, count);
+ if (test_opt(sb, DISCARD)) {
+ err = ext4_issue_discard(sb, block_group, bit, count);
+ if (err && err != -EOPNOTSUPP)
+ ext4_msg(sb, KERN_WARNING, "discard request in"
+ " group:%d block:%d count:%lu failed"
+ " with %d", block_group, bit, count,
+ err);
+ }
+
+
ext4_lock_group(sb, block_group);
mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
mb_free_blocks(inode, &e4b, bit, count_clusters);
@@ -4851,10 +4870,11 @@ error_return:
* one will allocate those blocks, mark it as used in buddy bitmap. This must
* be called with under the group lock.
*/
-static void ext4_trim_extent(struct super_block *sb, int start, int count,
+static int ext4_trim_extent(struct super_block *sb, int start, int count,
ext4_group_t group, struct ext4_buddy *e4b)
{
struct ext4_free_extent ex;
+ int ret = 0;
trace_ext4_trim_extent(sb, group, start, count);
@@ -4870,9 +4890,10 @@ static void ext4_trim_extent(struct super_block *sb, int start, int count,
*/
mb_mark_used(e4b, &ex);
ext4_unlock_group(sb, group);
- ext4_issue_discard(sb, group, start, count);
+ ret = ext4_issue_discard(sb, group, start, count);
ext4_lock_group(sb, group);
mb_free_blocks(NULL, e4b, start, ex.fe_len);
+ return ret;
}
/**
@@ -4901,7 +4922,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
void *bitmap;
ext4_grpblk_t next, count = 0, free_count = 0;
struct ext4_buddy e4b;
- int ret;
+ int ret = 0;
trace_ext4_trim_all_free(sb, group, start, max);
@@ -4928,8 +4949,11 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
next = mb_find_next_bit(bitmap, max + 1, start);
if ((next - start) >= minblocks) {
- ext4_trim_extent(sb, start,
- next - start, group, &e4b);
+ ret = ext4_trim_extent(sb, start,
+ next - start, group, &e4b);
+ if (ret && ret != -EOPNOTSUPP)
+ break;
+ ret = 0;
count += next - start;
}
free_count += next - start;
@@ -4950,8 +4974,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
break;
}
- if (!ret)
+ if (!ret) {
+ ret = count;
EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
+ }
out:
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
@@ -4959,7 +4985,7 @@ out:
ext4_debug("trimmed %d blocks in the group %d\n",
count, group);
- return count;
+ return ret;
}
/**
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index f1bb32ec016..db8226d595f 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -14,6 +14,7 @@
#include <linux/slab.h>
#include "ext4_jbd2.h"
+#include "ext4_extents.h"
/*
* The contiguous blocks details which can be
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 292daeeed45..d9cc5ee42f5 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include "ext4_jbd2.h"
#include "ext4.h"
+#include "ext4_extents.h"
/**
* get_ext_path - Find an extent path for designated logical block number.
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6d600a69fc9..cac44828233 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -202,13 +202,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct inode *inode);
/* checksumming functions */
-#define EXT4_DIRENT_TAIL(block, blocksize) \
- ((struct ext4_dir_entry_tail *)(((void *)(block)) + \
- ((blocksize) - \
- sizeof(struct ext4_dir_entry_tail))))
-
-static void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
- unsigned int blocksize)
+void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
+ unsigned int blocksize)
{
memset(t, 0, sizeof(struct ext4_dir_entry_tail));
t->det_rec_len = ext4_rec_len_to_disk(
@@ -261,6 +256,12 @@ static __le32 ext4_dirent_csum(struct inode *inode,
return cpu_to_le32(csum);
}
+static void warn_no_space_for_csum(struct inode *inode)
+{
+ ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for "
+ "checksum. Please run e2fsck -D.", inode->i_ino);
+}
+
int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
{
struct ext4_dir_entry_tail *t;
@@ -271,8 +272,7 @@ int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
t = get_dirent_tail(inode, dirent);
if (!t) {
- EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir "
- "leaf for checksum. Please run e2fsck -D.");
+ warn_no_space_for_csum(inode);
return 0;
}
@@ -294,8 +294,7 @@ static void ext4_dirent_csum_set(struct inode *inode,
t = get_dirent_tail(inode, dirent);
if (!t) {
- EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir "
- "leaf for checksum. Please run e2fsck -D.");
+ warn_no_space_for_csum(inode);
return;
}
@@ -303,9 +302,9 @@ static void ext4_dirent_csum_set(struct inode *inode,
(void *)t - (void *)dirent);
}
-static inline int ext4_handle_dirty_dirent_node(handle_t *handle,
- struct inode *inode,
- struct buffer_head *bh)
+int ext4_handle_dirty_dirent_node(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh)
{
ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
return ext4_handle_dirty_metadata(handle, inode, bh);
@@ -377,8 +376,7 @@ static int ext4_dx_csum_verify(struct inode *inode,
count = le16_to_cpu(c->count);
if (count_offset + (limit * sizeof(struct dx_entry)) >
EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
- EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
- "tree checksum found. Run e2fsck -D.");
+ warn_no_space_for_csum(inode);
return 1;
}
t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
@@ -408,8 +406,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
count = le16_to_cpu(c->count);
if (count_offset + (limit * sizeof(struct dx_entry)) >
EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
- EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
- "tree checksum. Run e2fsck -D.");
+ warn_no_space_for_csum(inode);
return;
}
t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
@@ -890,6 +887,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
EXT4_DIR_REC_LEN(0));
for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
+ bh->b_data, bh->b_size,
(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+ ((char *)de - bh->b_data))) {
/* On error, skip the f_pos to the next block. */
@@ -1007,6 +1005,15 @@ errout:
return (err);
}
+static inline int search_dirblock(struct buffer_head *bh,
+ struct inode *dir,
+ const struct qstr *d_name,
+ unsigned int offset,
+ struct ext4_dir_entry_2 **res_dir)
+{
+ return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir,
+ d_name, offset, res_dir);
+}
/*
* Directory block splitting, compacting
@@ -1081,13 +1088,6 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
dx_set_count(entries, count + 1);
}
-static void ext4_update_dx_flag(struct inode *inode)
-{
- if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_COMPAT_DIR_INDEX))
- ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
-}
-
/*
* NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure.
*
@@ -1107,11 +1107,13 @@ static inline int ext4_match (int len, const char * const name,
/*
* Returns 0 if not found, -1 on failure, and 1 on success
*/
-static inline int search_dirblock(struct buffer_head *bh,
- struct inode *dir,
- const struct qstr *d_name,
- unsigned int offset,
- struct ext4_dir_entry_2 ** res_dir)
+int search_dir(struct buffer_head *bh,
+ char *search_buf,
+ int buf_size,
+ struct inode *dir,
+ const struct qstr *d_name,
+ unsigned int offset,
+ struct ext4_dir_entry_2 **res_dir)
{
struct ext4_dir_entry_2 * de;
char * dlimit;
@@ -1119,8 +1121,8 @@ static inline int search_dirblock(struct buffer_head *bh,
const char *name = d_name->name;
int namelen = d_name->len;
- de = (struct ext4_dir_entry_2 *) bh->b_data;
- dlimit = bh->b_data + dir->i_sb->s_blocksize;
+ de = (struct ext4_dir_entry_2 *)search_buf;
+ dlimit = search_buf + buf_size;
while ((char *) de < dlimit) {
/* this code is executed quadratically often */
/* do minimal checking `by hand' */
@@ -1128,7 +1130,8 @@ static inline int search_dirblock(struct buffer_head *bh,
if ((char *) de + namelen <= dlimit &&
ext4_match (namelen, name, de)) {
/* found a match - just to be sure, do a full check */
- if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
+ if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
+ bh->b_size, offset))
return -1;
*res_dir = de;
return 1;
@@ -1144,6 +1147,21 @@ static inline int search_dirblock(struct buffer_head *bh,
return 0;
}
+static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
+ struct ext4_dir_entry *de)
+{
+ struct super_block *sb = dir->i_sb;
+
+ if (!is_dx(dir))
+ return 0;
+ if (block == 0)
+ return 1;
+ if (de->inode == 0 &&
+ ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) ==
+ sb->s_blocksize)
+ return 1;
+ return 0;
+}
/*
* ext4_find_entry()
@@ -1158,7 +1176,8 @@ static inline int search_dirblock(struct buffer_head *bh,
*/
static struct buffer_head * ext4_find_entry (struct inode *dir,
const struct qstr *d_name,
- struct ext4_dir_entry_2 ** res_dir)
+ struct ext4_dir_entry_2 **res_dir,
+ int *inlined)
{
struct super_block *sb;
struct buffer_head *bh_use[NAMEI_RA_SIZE];
@@ -1179,6 +1198,18 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
namelen = d_name->len;
if (namelen > EXT4_NAME_LEN)
return NULL;
+
+ if (ext4_has_inline_data(dir)) {
+ int has_inline_data = 1;
+ ret = ext4_find_inline_entry(dir, d_name, res_dir,
+ &has_inline_data);
+ if (has_inline_data) {
+ if (inlined)
+ *inlined = 1;
+ return ret;
+ }
+ }
+
if ((namelen <= 2) && (name[0] == '.') &&
(name[1] == '.' || name[1] == '\0')) {
/*
@@ -1244,6 +1275,8 @@ restart:
goto next;
}
if (!buffer_verified(bh) &&
+ !is_dx_internal_node(dir, block,
+ (struct ext4_dir_entry *)bh->b_data) &&
!ext4_dirent_csum_verify(dir,
(struct ext4_dir_entry *)bh->b_data)) {
EXT4_ERROR_INODE(dir, "checksumming directory "
@@ -1361,7 +1394,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
if (dentry->d_name.len > EXT4_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
- bh = ext4_find_entry(dir, &dentry->d_name, &de);
+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
inode = NULL;
if (bh) {
__u32 ino = le32_to_cpu(de->inode);
@@ -1395,7 +1428,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
struct ext4_dir_entry_2 * de;
struct buffer_head *bh;
- bh = ext4_find_entry(child->d_inode, &dotdot, &de);
+ bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
if (!bh)
return ERR_PTR(-ENOENT);
ino = le32_to_cpu(de->inode);
@@ -1593,6 +1626,63 @@ errout:
return NULL;
}
+int ext4_find_dest_de(struct inode *dir, struct inode *inode,
+ struct buffer_head *bh,
+ void *buf, int buf_size,
+ const char *name, int namelen,
+ struct ext4_dir_entry_2 **dest_de)
+{
+ struct ext4_dir_entry_2 *de;
+ unsigned short reclen = EXT4_DIR_REC_LEN(namelen);
+ int nlen, rlen;
+ unsigned int offset = 0;
+ char *top;
+
+ de = (struct ext4_dir_entry_2 *)buf;
+ top = buf + buf_size - reclen;
+ while ((char *) de <= top) {
+ if (ext4_check_dir_entry(dir, NULL, de, bh,
+ buf, buf_size, offset))
+ return -EIO;
+ if (ext4_match(namelen, name, de))
+ return -EEXIST;
+ nlen = EXT4_DIR_REC_LEN(de->name_len);
+ rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+ if ((de->inode ? rlen - nlen : rlen) >= reclen)
+ break;
+ de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
+ offset += rlen;
+ }
+ if ((char *) de > top)
+ return -ENOSPC;
+
+ *dest_de = de;
+ return 0;
+}
+
+void ext4_insert_dentry(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int buf_size,
+ const char *name, int namelen)
+{
+
+ int nlen, rlen;
+
+ nlen = EXT4_DIR_REC_LEN(de->name_len);
+ rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+ if (de->inode) {
+ struct ext4_dir_entry_2 *de1 =
+ (struct ext4_dir_entry_2 *)((char *)de + nlen);
+ de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size);
+ de->rec_len = ext4_rec_len_to_disk(nlen, buf_size);
+ de = de1;
+ }
+ de->file_type = EXT4_FT_UNKNOWN;
+ de->inode = cpu_to_le32(inode->i_ino);
+ ext4_set_de_type(inode->i_sb, de, inode->i_mode);
+ de->name_len = namelen;
+ memcpy(de->name, name, namelen);
+}
/*
* Add a new entry into a directory (leaf) block. If de is non-NULL,
* it points to a directory entry which is guaranteed to be large
@@ -1608,12 +1698,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
struct inode *dir = dentry->d_parent->d_inode;
const char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
- unsigned int offset = 0;
unsigned int blocksize = dir->i_sb->s_blocksize;
unsigned short reclen;
- int nlen, rlen, err;
- char *top;
int csum_size = 0;
+ int err;
if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
@@ -1621,22 +1709,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
reclen = EXT4_DIR_REC_LEN(namelen);
if (!de) {
- de = (struct ext4_dir_entry_2 *)bh->b_data;
- top = bh->b_data + (blocksize - csum_size) - reclen;
- while ((char *) de <= top) {
- if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
- return -EIO;
- if (ext4_match(namelen, name, de))
- return -EEXIST;
- nlen = EXT4_DIR_REC_LEN(de->name_len);
- rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
- if ((de->inode? rlen - nlen: rlen) >= reclen)
- break;
- de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
- offset += rlen;
- }
- if ((char *) de > top)
- return -ENOSPC;
+ err = ext4_find_dest_de(dir, inode,
+ bh, bh->b_data, blocksize - csum_size,
+ name, namelen, &de);
+ if (err)
+ return err;
}
BUFFER_TRACE(bh, "get_write_access");
err = ext4_journal_get_write_access(handle, bh);
@@ -1646,19 +1723,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
}
/* By now the buffer is marked for journaling */
- nlen = EXT4_DIR_REC_LEN(de->name_len);
- rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
- if (de->inode) {
- struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
- de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
- de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
- de = de1;
- }
- de->file_type = EXT4_FT_UNKNOWN;
- de->inode = cpu_to_le32(inode->i_ino);
- ext4_set_de_type(dir->i_sb, de, inode->i_mode);
- de->name_len = namelen;
- memcpy(de->name, name, namelen);
+ ext4_insert_dentry(inode, de, blocksize, name, namelen);
+
/*
* XXX shouldn't update any times until successful
* completion of syscall, but too many callers depend
@@ -1831,6 +1897,17 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
blocksize = sb->s_blocksize;
if (!dentry->d_name.len)
return -EINVAL;
+
+ if (ext4_has_inline_data(dir)) {
+ retval = ext4_try_add_inline_entry(handle, dentry, inode);
+ if (retval < 0)
+ return retval;
+ if (retval == 1) {
+ retval = 0;
+ return retval;
+ }
+ }
+
if (is_dx(dir)) {
retval = ext4_dx_add_entry(handle, dentry, inode);
if (!retval || (retval != ERR_BAD_DX_DIR))
@@ -2036,36 +2113,29 @@ cleanup:
}
/*
- * ext4_delete_entry deletes a directory entry by merging it with the
- * previous entry
+ * ext4_generic_delete_entry deletes a directory entry by merging it
+ * with the previous entry
*/
-static int ext4_delete_entry(handle_t *handle,
- struct inode *dir,
- struct ext4_dir_entry_2 *de_del,
- struct buffer_head *bh)
+int ext4_generic_delete_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh,
+ void *entry_buf,
+ int buf_size,
+ int csum_size)
{
struct ext4_dir_entry_2 *de, *pde;
unsigned int blocksize = dir->i_sb->s_blocksize;
- int csum_size = 0;
- int i, err;
-
- if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
- csum_size = sizeof(struct ext4_dir_entry_tail);
+ int i;
i = 0;
pde = NULL;
- de = (struct ext4_dir_entry_2 *) bh->b_data;
- while (i < bh->b_size - csum_size) {
- if (ext4_check_dir_entry(dir, NULL, de, bh, i))
+ de = (struct ext4_dir_entry_2 *)entry_buf;
+ while (i < buf_size - csum_size) {
+ if (ext4_check_dir_entry(dir, NULL, de, bh,
+ bh->b_data, bh->b_size, i))
return -EIO;
if (de == de_del) {
- BUFFER_TRACE(bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, bh);
- if (unlikely(err)) {
- ext4_std_error(dir->i_sb, err);
- return err;
- }
if (pde)
pde->rec_len = ext4_rec_len_to_disk(
ext4_rec_len_from_disk(pde->rec_len,
@@ -2076,12 +2146,6 @@ static int ext4_delete_entry(handle_t *handle,
else
de->inode = 0;
dir->i_version++;
- BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_dirent_node(handle, dir, bh);
- if (unlikely(err)) {
- ext4_std_error(dir->i_sb, err);
- return err;
- }
return 0;
}
i += ext4_rec_len_from_disk(de->rec_len, blocksize);
@@ -2091,6 +2155,48 @@ static int ext4_delete_entry(handle_t *handle,
return -ENOENT;
}
+static int ext4_delete_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh)
+{
+ int err, csum_size = 0;
+
+ if (ext4_has_inline_data(dir)) {
+ int has_inline_data = 1;
+ err = ext4_delete_inline_entry(handle, dir, de_del, bh,
+ &has_inline_data);
+ if (has_inline_data)
+ return err;
+ }
+
+ if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ csum_size = sizeof(struct ext4_dir_entry_tail);
+
+ BUFFER_TRACE(bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, bh);
+ if (unlikely(err))
+ goto out;
+
+ err = ext4_generic_delete_entry(handle, dir, de_del,
+ bh, bh->b_data,
+ dir->i_sb->s_blocksize, csum_size);
+ if (err)
+ goto out;
+
+ BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_dirent_node(handle, dir, bh);
+ if (unlikely(err))
+ goto out;
+
+ return 0;
+out:
+ if (err != -ENOENT)
+ ext4_std_error(dir->i_sb, err);
+ return err;
+}
+
/*
* DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2,
* since this indicates that nlinks count was previously 1.
@@ -2211,21 +2317,95 @@ retry:
return err;
}
-static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int blocksize, int csum_size,
+ unsigned int parent_ino, int dotdot_real_len)
+{
+ de->inode = cpu_to_le32(inode->i_ino);
+ de->name_len = 1;
+ de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
+ blocksize);
+ strcpy(de->name, ".");
+ ext4_set_de_type(inode->i_sb, de, S_IFDIR);
+
+ de = ext4_next_entry(de, blocksize);
+ de->inode = cpu_to_le32(parent_ino);
+ de->name_len = 2;
+ if (!dotdot_real_len)
+ de->rec_len = ext4_rec_len_to_disk(blocksize -
+ (csum_size + EXT4_DIR_REC_LEN(1)),
+ blocksize);
+ else
+ de->rec_len = ext4_rec_len_to_disk(
+ EXT4_DIR_REC_LEN(de->name_len), blocksize);
+ strcpy(de->name, "..");
+ ext4_set_de_type(inode->i_sb, de, S_IFDIR);
+
+ return ext4_next_entry(de, blocksize);
+}
+
+static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
+ struct inode *inode)
{
- handle_t *handle;
- struct inode *inode;
struct buffer_head *dir_block = NULL;
struct ext4_dir_entry_2 *de;
struct ext4_dir_entry_tail *t;
unsigned int blocksize = dir->i_sb->s_blocksize;
int csum_size = 0;
- int err, retries = 0;
+ int err;
if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
csum_size = sizeof(struct ext4_dir_entry_tail);
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ err = ext4_try_create_inline_dir(handle, dir, inode);
+ if (err < 0 && err != -ENOSPC)
+ goto out;
+ if (!err)
+ goto out;
+ }
+
+ inode->i_size = EXT4_I(inode)->i_disksize = blocksize;
+ dir_block = ext4_bread(handle, inode, 0, 1, &err);
+ if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) {
+ if (!err) {
+ err = -EIO;
+ ext4_error(inode->i_sb,
+ "Directory hole detected on inode %lu\n",
+ inode->i_ino);
+ }
+ goto out;
+ }
+ BUFFER_TRACE(dir_block, "get_write_access");
+ err = ext4_journal_get_write_access(handle, dir_block);
+ if (err)
+ goto out;
+ de = (struct ext4_dir_entry_2 *)dir_block->b_data;
+ ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0);
+ set_nlink(inode, 2);
+ if (csum_size) {
+ t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
+ initialize_dirent_tail(t, blocksize);
+ }
+
+ BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
+ if (err)
+ goto out;
+ set_buffer_verified(dir_block);
+out:
+ brelse(dir_block);
+ return err;
+}
+
+static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ handle_t *handle;
+ struct inode *inode;
+ int err, retries = 0;
+
if (EXT4_DIR_LINK_MAX(dir))
return -EMLINK;
@@ -2249,47 +2429,9 @@ retry:
inode->i_op = &ext4_dir_inode_operations;
inode->i_fop = &ext4_dir_operations;
- inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize;
- if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) {
- if (!err) {
- err = -EIO;
- ext4_error(inode->i_sb,
- "Directory hole detected on inode %lu\n",
- inode->i_ino);
- }
- goto out_clear_inode;
- }
- BUFFER_TRACE(dir_block, "get_write_access");
- err = ext4_journal_get_write_access(handle, dir_block);
- if (err)
- goto out_clear_inode;
- de = (struct ext4_dir_entry_2 *) dir_block->b_data;
- de->inode = cpu_to_le32(inode->i_ino);
- de->name_len = 1;
- de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
- blocksize);
- strcpy(de->name, ".");
- ext4_set_de_type(dir->i_sb, de, S_IFDIR);
- de = ext4_next_entry(de, blocksize);
- de->inode = cpu_to_le32(dir->i_ino);
- de->rec_len = ext4_rec_len_to_disk(blocksize -
- (csum_size + EXT4_DIR_REC_LEN(1)),
- blocksize);
- de->name_len = 2;
- strcpy(de->name, "..");
- ext4_set_de_type(dir->i_sb, de, S_IFDIR);
- set_nlink(inode, 2);
-
- if (csum_size) {
- t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
- initialize_dirent_tail(t, blocksize);
- }
-
- BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
+ err = ext4_init_new_dir(handle, dir, inode);
if (err)
goto out_clear_inode;
- set_buffer_verified(dir_block);
err = ext4_mark_inode_dirty(handle, inode);
if (!err)
err = ext4_add_entry(handle, dentry, inode);
@@ -2309,7 +2451,6 @@ out_clear_inode:
unlock_new_inode(inode);
d_instantiate(dentry, inode);
out_stop:
- brelse(dir_block);
ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
@@ -2327,6 +2468,14 @@ static int empty_dir(struct inode *inode)
struct super_block *sb;
int err = 0;
+ if (ext4_has_inline_data(inode)) {
+ int has_inline_data = 1;
+
+ err = empty_inline_dir(inode, &has_inline_data);
+ if (has_inline_data)
+ return err;
+ }
+
sb = inode->i_sb;
if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
!(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
@@ -2393,7 +2542,8 @@ static int empty_dir(struct inode *inode)
set_buffer_verified(bh);
de = (struct ext4_dir_entry_2 *) bh->b_data;
}
- if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
+ if (ext4_check_dir_entry(inode, NULL, de, bh,
+ bh->b_data, bh->b_size, offset)) {
de = (struct ext4_dir_entry_2 *)(bh->b_data +
sb->s_blocksize);
offset = (offset | (sb->s_blocksize - 1)) + 1;
@@ -2579,7 +2729,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
return PTR_ERR(handle);
retval = -ENOENT;
- bh = ext4_find_entry(dir, &dentry->d_name, &de);
+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
if (!bh)
goto end_rmdir;
@@ -2644,7 +2794,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
ext4_handle_sync(handle);
retval = -ENOENT;
- bh = ext4_find_entry(dir, &dentry->d_name, &de);
+ bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
if (!bh)
goto end_unlink;
@@ -2826,8 +2976,39 @@ retry:
return err;
}
-#define PARENT_INO(buffer, size) \
- (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode)
+
+/*
+ * Try to find buffer head where contains the parent block.
+ * It should be the inode block if it is inlined or the 1st block
+ * if it is a normal dir.
+ */
+static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
+ struct inode *inode,
+ int *retval,
+ struct ext4_dir_entry_2 **parent_de,
+ int *inlined)
+{
+ struct buffer_head *bh;
+
+ if (!ext4_has_inline_data(inode)) {
+ if (!(bh = ext4_bread(handle, inode, 0, 0, retval))) {
+ if (!*retval) {
+ *retval = -EIO;
+ ext4_error(inode->i_sb,
+ "Directory hole detected on inode %lu\n",
+ inode->i_ino);
+ }
+ return NULL;
+ }
+ *parent_de = ext4_next_entry(
+ (struct ext4_dir_entry_2 *)bh->b_data,
+ inode->i_sb->s_blocksize);
+ return bh;
+ }
+
+ *inlined = 1;
+ return ext4_get_first_inline_block(inode, parent_de, retval);
+}
/*
* Anybody can rename anything with this: the permission checks are left to the
@@ -2841,6 +3022,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
struct buffer_head *old_bh, *new_bh, *dir_bh;
struct ext4_dir_entry_2 *old_de, *new_de;
int retval, force_da_alloc = 0;
+ int inlined = 0, new_inlined = 0;
+ struct ext4_dir_entry_2 *parent_de;
dquot_initialize(old_dir);
dquot_initialize(new_dir);
@@ -2860,7 +3043,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir))
ext4_handle_sync(handle);
- old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de);
+ old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de, NULL);
/*
* Check for inode number is _not_ due to possible IO errors.
* We might rmdir the source, keep it as pwd of some process
@@ -2873,7 +3056,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
goto end_rename;
new_inode = new_dentry->d_inode;
- new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de);
+ new_bh = ext4_find_entry(new_dir, &new_dentry->d_name,
+ &new_de, &new_inlined);
if (new_bh) {
if (!new_inode) {
brelse(new_bh);
@@ -2887,22 +3071,17 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
goto end_rename;
}
retval = -EIO;
- if (!(dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval))) {
- if (!retval) {
- retval = -EIO;
- ext4_error(old_inode->i_sb,
- "Directory hole detected on inode %lu\n",
- old_inode->i_ino);
- }
+ dir_bh = ext4_get_first_dir_block(handle, old_inode,
+ &retval, &parent_de,
+ &inlined);
+ if (!dir_bh)
goto end_rename;
- }
- if (!buffer_verified(dir_bh) &&
+ if (!inlined && !buffer_verified(dir_bh) &&
!ext4_dirent_csum_verify(old_inode,
(struct ext4_dir_entry *)dir_bh->b_data))
goto end_rename;
set_buffer_verified(dir_bh);
- if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
- old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
+ if (le32_to_cpu(parent_de->inode) != old_dir->i_ino)
goto end_rename;
retval = -EMLINK;
if (!new_inode && new_dir != old_dir &&
@@ -2931,10 +3110,13 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
ext4_current_time(new_dir);
ext4_mark_inode_dirty(handle, new_dir);
BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
- retval = ext4_handle_dirty_dirent_node(handle, new_dir, new_bh);
- if (unlikely(retval)) {
- ext4_std_error(new_dir->i_sb, retval);
- goto end_rename;
+ if (!new_inlined) {
+ retval = ext4_handle_dirty_dirent_node(handle,
+ new_dir, new_bh);
+ if (unlikely(retval)) {
+ ext4_std_error(new_dir->i_sb, retval);
+ goto end_rename;
+ }
}
brelse(new_bh);
new_bh = NULL;
@@ -2962,7 +3144,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
struct buffer_head *old_bh2;
struct ext4_dir_entry_2 *old_de2;
- old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2);
+ old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name,
+ &old_de2, NULL);
if (old_bh2) {
retval = ext4_delete_entry(handle, old_dir,
old_de2, old_bh2);
@@ -2982,17 +3165,19 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
ext4_update_dx_flag(old_dir);
if (dir_bh) {
- PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
- cpu_to_le32(new_dir->i_ino);
+ parent_de->inode = cpu_to_le32(new_dir->i_ino);
BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
- if (is_dx(old_inode)) {
- retval = ext4_handle_dirty_dx_node(handle,
- old_inode,
- dir_bh);
+ if (!inlined) {
+ if (is_dx(old_inode)) {
+ retval = ext4_handle_dirty_dx_node(handle,
+ old_inode,
+ dir_bh);
+ } else {
+ retval = ext4_handle_dirty_dirent_node(handle,
+ old_inode, dir_bh);
+ }
} else {
- retval = ext4_handle_dirty_dirent_node(handle,
- old_inode,
- dir_bh);
+ retval = ext4_mark_inode_dirty(handle, old_inode);
}
if (retval) {
ext4_std_error(old_dir->i_sb, retval);
@@ -3043,23 +3228,19 @@ const struct inode_operations ext4_dir_inode_operations = {
.mknod = ext4_mknod,
.rename = ext4_rename,
.setattr = ext4_setattr,
-#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
-#endif
.get_acl = ext4_get_acl,
.fiemap = ext4_fiemap,
};
const struct inode_operations ext4_special_inode_operations = {
.setattr = ext4_setattr,
-#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
-#endif
.get_acl = ext4_get_acl,
};
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 68e896e12a6..0016fbca2a4 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -27,7 +27,6 @@
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
-#include "ext4_extents.h"
static struct kmem_cache *io_page_cachep, *io_end_cachep;
@@ -111,7 +110,7 @@ static int ext4_end_io(ext4_io_end_t *io)
inode_dio_done(inode);
/* Wake up anyone waiting on unwritten extent conversion */
if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
- wake_up_all(ext4_ioend_wq(io->inode));
+ wake_up_all(ext4_ioend_wq(inode));
return ret;
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 47bf06a2765..d99387b89ed 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -783,7 +783,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
err = ext4_journal_get_write_access(handle, gdb_bh);
if (unlikely(err))
- goto exit_sbh;
+ goto exit_dind;
err = ext4_journal_get_write_access(handle, dind);
if (unlikely(err))
@@ -792,7 +792,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
/* ext4_reserve_inode_write() gets a reference on the iloc */
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (unlikely(err))
- goto exit_dindj;
+ goto exit_dind;
n_group_desc = ext4_kvmalloc((gdb_num + 1) *
sizeof(struct buffer_head *),
@@ -846,12 +846,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
exit_inode:
ext4_kvfree(n_group_desc);
- /* ext4_handle_release_buffer(handle, iloc.bh); */
brelse(iloc.bh);
-exit_dindj:
- /* ext4_handle_release_buffer(handle, dind); */
-exit_sbh:
- /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
exit_dind:
brelse(dind);
exit_bh:
@@ -969,14 +964,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
}
for (i = 0; i < reserved_gdb; i++) {
- if ((err = ext4_journal_get_write_access(handle, primary[i]))) {
- /*
- int j;
- for (j = 0; j < i; j++)
- ext4_handle_release_buffer(handle, primary[j]);
- */
+ if ((err = ext4_journal_get_write_access(handle, primary[i])))
goto exit_bh;
- }
}
if ((err = ext4_reserve_inode_write(handle, inode, &iloc)))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 80928f71685..3cdb0a2fc64 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -45,7 +45,7 @@
#include <linux/freezer.h>
#include "ext4.h"
-#include "ext4_extents.h"
+#include "ext4_extents.h" /* Needed for trace points definition */
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
@@ -939,10 +939,11 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
return NULL;
ei->vfs_inode.i_version = 1;
- ei->vfs_inode.i_data.writeback_index = 0;
memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
+ ext4_es_init_tree(&ei->i_es_tree);
+ rwlock_init(&ei->i_es_lock);
ei->i_reserved_data_blocks = 0;
ei->i_reserved_meta_blocks = 0;
ei->i_allocated_meta_blocks = 0;
@@ -996,9 +997,7 @@ static void init_once(void *foo)
struct ext4_inode_info *ei = (struct ext4_inode_info *) foo;
INIT_LIST_HEAD(&ei->i_orphan);
-#ifdef CONFIG_EXT4_FS_XATTR
init_rwsem(&ei->xattr_sem);
-#endif
init_rwsem(&ei->i_data_sem);
inode_init_once(&ei->vfs_inode);
}
@@ -1031,6 +1030,7 @@ void ext4_clear_inode(struct inode *inode)
clear_inode(inode);
dquot_drop(inode);
ext4_discard_preallocations(inode);
+ ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
if (EXT4_I(inode)->jinode) {
jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode),
EXT4_I(inode)->jinode);
@@ -1447,13 +1447,8 @@ static const struct mount_opts {
{Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ},
{Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ},
{Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ},
-#ifdef CONFIG_EXT4_FS_XATTR
{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
{Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
-#else
- {Opt_user_xattr, 0, MOPT_NOSUPPORT},
- {Opt_nouser_xattr, 0, MOPT_NOSUPPORT},
-#endif
#ifdef CONFIG_EXT4_FS_POSIX_ACL
{Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
{Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
@@ -3202,7 +3197,6 @@ int ext4_calculate_overhead(struct super_block *sb)
ext4_fsblk_t overhead = 0;
char *buf = (char *) get_zeroed_page(GFP_KERNEL);
- memset(buf, 0, PAGE_SIZE);
if (!buf)
return -ENOMEM;
@@ -3256,7 +3250,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
unsigned int i;
int needs_recovery, has_huge_files, has_bigalloc;
__u64 blocks_count;
- int err;
+ int err = 0;
unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
ext4_group_t first_not_zeroed;
@@ -3272,9 +3266,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
sb->s_fs_info = sbi;
sbi->s_sb = sb;
- sbi->s_mount_opt = 0;
- sbi->s_resuid = make_kuid(&init_user_ns, EXT4_DEF_RESUID);
- sbi->s_resgid = make_kgid(&init_user_ns, EXT4_DEF_RESGID);
sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
sbi->s_sb_block = sb_block;
if (sb->s_bdev->bd_part)
@@ -3285,6 +3276,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
for (cp = sb->s_id; (cp = strchr(cp, '/'));)
*cp = '!';
+ /* -EINVAL is default */
ret = -EINVAL;
blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
if (!blocksize) {
@@ -3369,9 +3361,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (def_mount_opts & EXT4_DEFM_UID16)
set_opt(sb, NO_UID32);
/* xattr user namespace & acls are now defaulted on */
-#ifdef CONFIG_EXT4_FS_XATTR
set_opt(sb, XATTR_USER);
-#endif
#ifdef CONFIG_EXT4_FS_POSIX_ACL
set_opt(sb, POSIX_ACL);
#endif
@@ -3662,7 +3652,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
" too large to mount safely on this system");
if (sizeof(sector_t) < 8)
ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
- ret = err;
goto failed_mount;
}
@@ -3770,7 +3759,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
if (err) {
ext4_msg(sb, KERN_ERR, "insufficient memory");
- ret = err;
goto failed_mount3;
}
@@ -3801,7 +3789,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
mutex_init(&sbi->s_orphan_lock);
- sbi->s_resize_flags = 0;
sb->s_root = NULL;
@@ -3897,8 +3884,8 @@ no_journal:
if (es->s_overhead_clusters)
sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
else {
- ret = ext4_calculate_overhead(sb);
- if (ret)
+ err = ext4_calculate_overhead(sb);
+ if (err)
goto failed_mount_wq;
}
@@ -3910,6 +3897,7 @@ no_journal:
alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
if (!EXT4_SB(sb)->dio_unwritten_wq) {
printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
+ ret = -ENOMEM;
goto failed_mount_wq;
}
@@ -4012,12 +4000,20 @@ no_journal:
/* Enable quota usage during mount. */
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
!(sb->s_flags & MS_RDONLY)) {
- ret = ext4_enable_quotas(sb);
- if (ret)
+ err = ext4_enable_quotas(sb);
+ if (err)
goto failed_mount7;
}
#endif /* CONFIG_QUOTA */
+ if (test_opt(sb, DISCARD)) {
+ struct request_queue *q = bdev_get_queue(sb->s_bdev);
+ if (!blk_queue_discard(q))
+ ext4_msg(sb, KERN_WARNING,
+ "mounting with \"discard\" option, but "
+ "the device does not support discard");
+ }
+
ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
"Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
*sbi->s_es->s_mount_opts ? "; " : "", orig_data);
@@ -4084,7 +4080,7 @@ out_fail:
kfree(sbi);
out_free_orig:
kfree(orig_data);
- return ret;
+ return err ? err : ret;
}
/*
@@ -4790,7 +4786,7 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = EXT4_SUPER_MAGIC;
buf->f_bsize = sb->s_blocksize;
- buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, sbi->s_overhead);
+ buf->f_blocks = ext4_blocks_count(es) - EXT4_C2B(sbi, overhead);
bfree = percpu_counter_sum_positive(&sbi->s_freeclusters_counter) -
percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
/* prevent underflow in case that few free space is available */
@@ -5282,6 +5278,7 @@ static int __init ext4_init_fs(void)
ext4_li_info = NULL;
mutex_init(&ext4_li_mtx);
+ /* Build-time check for flags consistency */
ext4_check_flag_values();
for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
@@ -5289,9 +5286,14 @@ static int __init ext4_init_fs(void)
init_waitqueue_head(&ext4__ioend_wq[i]);
}
- err = ext4_init_pageio();
+ err = ext4_init_es();
if (err)
return err;
+
+ err = ext4_init_pageio();
+ if (err)
+ goto out7;
+
err = ext4_init_system_zone();
if (err)
goto out6;
@@ -5341,6 +5343,9 @@ out5:
ext4_exit_system_zone();
out6:
ext4_exit_pageio();
+out7:
+ ext4_exit_es();
+
return err;
}
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index ed9354aff27..ff371193201 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -35,22 +35,18 @@ const struct inode_operations ext4_symlink_inode_operations = {
.follow_link = page_follow_link_light,
.put_link = page_put_link,
.setattr = ext4_setattr,
-#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
-#endif
};
const struct inode_operations ext4_fast_symlink_inode_operations = {
.readlink = generic_readlink,
.follow_link = ext4_follow_link,
.setattr = ext4_setattr,
-#ifdef CONFIG_EXT4_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
.listxattr = ext4_listxattr,
.removexattr = generic_removexattr,
-#endif
};
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 2cdb98d6298..3a91ebc2b66 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -61,11 +61,6 @@
#include "xattr.h"
#include "acl.h"
-#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
-#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
-#define BFIRST(bh) ENTRY(BHDR(bh)+1)
-#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
-
#ifdef EXT4_XATTR_DEBUG
# define ea_idebug(inode, f...) do { \
printk(KERN_DEBUG "inode %s:%lu: ", \
@@ -312,7 +307,7 @@ cleanup:
return error;
}
-static int
+int
ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
void *buffer, size_t buffer_size)
{
@@ -581,21 +576,6 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
return (*min_offs - ((void *)last - base) - sizeof(__u32));
}
-struct ext4_xattr_info {
- int name_index;
- const char *name;
- const void *value;
- size_t value_len;
-};
-
-struct ext4_xattr_search {
- struct ext4_xattr_entry *first;
- void *base;
- void *end;
- struct ext4_xattr_entry *here;
- int not_found;
-};
-
static int
ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
{
@@ -648,9 +628,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
size. Just replace. */
s->here->e_value_size =
cpu_to_le32(i->value_len);
- memset(val + size - EXT4_XATTR_PAD, 0,
- EXT4_XATTR_PAD); /* Clear pad bytes. */
- memcpy(val, i->value, i->value_len);
+ if (i->value == EXT4_ZERO_XATTR_VALUE) {
+ memset(val, 0, size);
+ } else {
+ /* Clear pad bytes first. */
+ memset(val + size - EXT4_XATTR_PAD, 0,
+ EXT4_XATTR_PAD);
+ memcpy(val, i->value, i->value_len);
+ }
return 0;
}
@@ -689,9 +674,14 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
size_t size = EXT4_XATTR_SIZE(i->value_len);
void *val = s->base + min_offs - size;
s->here->e_value_offs = cpu_to_le16(min_offs - size);
- memset(val + size - EXT4_XATTR_PAD, 0,
- EXT4_XATTR_PAD); /* Clear the pad bytes. */
- memcpy(val, i->value, i->value_len);
+ if (i->value == EXT4_ZERO_XATTR_VALUE) {
+ memset(val, 0, size);
+ } else {
+ /* Clear the pad bytes first. */
+ memset(val + size - EXT4_XATTR_PAD, 0,
+ EXT4_XATTR_PAD);
+ memcpy(val, i->value, i->value_len);
+ }
}
}
return 0;
@@ -794,7 +784,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
int offset = (char *)s->here - bs->bh->b_data;
unlock_buffer(bs->bh);
- ext4_handle_release_buffer(handle, bs->bh);
if (ce) {
mb_cache_entry_release(ce);
ce = NULL;
@@ -950,14 +939,8 @@ bad_block:
#undef header
}
-struct ext4_xattr_ibody_find {
- struct ext4_xattr_search s;
- struct ext4_iloc iloc;
-};
-
-static int
-ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
- struct ext4_xattr_ibody_find *is)
+int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is)
{
struct ext4_xattr_ibody_header *header;
struct ext4_inode *raw_inode;
@@ -985,10 +968,47 @@ ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
return 0;
}
-static int
-ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
- struct ext4_xattr_info *i,
- struct ext4_xattr_ibody_find *is)
+int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is)
+{
+ struct ext4_xattr_ibody_header *header;
+ struct ext4_xattr_search *s = &is->s;
+ int error;
+
+ if (EXT4_I(inode)->i_extra_isize == 0)
+ return -ENOSPC;
+ error = ext4_xattr_set_entry(i, s);
+ if (error) {
+ if (error == -ENOSPC &&
+ ext4_has_inline_data(inode)) {
+ error = ext4_try_to_evict_inline_data(handle, inode,
+ EXT4_XATTR_LEN(strlen(i->name) +
+ EXT4_XATTR_SIZE(i->value_len)));
+ if (error)
+ return error;
+ error = ext4_xattr_ibody_find(inode, i, is);
+ if (error)
+ return error;
+ error = ext4_xattr_set_entry(i, s);
+ }
+ if (error)
+ return error;
+ }
+ header = IHDR(inode, ext4_raw_inode(&is->iloc));
+ if (!IS_LAST_ENTRY(s->first)) {
+ header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
+ ext4_set_inode_state(inode, EXT4_STATE_XATTR);
+ } else {
+ header->h_magic = cpu_to_le32(0);
+ ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
+ }
+ return 0;
+}
+
+static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is)
{
struct ext4_xattr_ibody_header *header;
struct ext4_xattr_search *s = &is->s;
@@ -1144,9 +1164,17 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
{
handle_t *handle;
int error, retries = 0;
+ int credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb);
retry:
- handle = ext4_journal_start(inode, EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
+ /*
+ * In case of inline data, we may push out the data to a block,
+ * So reserve the journal space first.
+ */
+ if (ext4_has_inline_data(inode))
+ credits += ext4_writepage_trans_blocks(inode) + 1;
+
+ handle = ext4_journal_start(inode, credits);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
} else {
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 91f31ca7d9a..69eda787a96 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -21,6 +21,7 @@
#define EXT4_XATTR_INDEX_TRUSTED 4
#define EXT4_XATTR_INDEX_LUSTRE 5
#define EXT4_XATTR_INDEX_SECURITY 6
+#define EXT4_XATTR_INDEX_SYSTEM 7
struct ext4_xattr_header {
__le32 h_magic; /* magic number for identification */
@@ -65,7 +66,32 @@ struct ext4_xattr_entry {
EXT4_I(inode)->i_extra_isize))
#define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
-# ifdef CONFIG_EXT4_FS_XATTR
+#define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
+#define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
+#define BFIRST(bh) ENTRY(BHDR(bh)+1)
+#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
+
+#define EXT4_ZERO_XATTR_VALUE ((void *)-1)
+
+struct ext4_xattr_info {
+ int name_index;
+ const char *name;
+ const void *value;
+ size_t value_len;
+};
+
+struct ext4_xattr_search {
+ struct ext4_xattr_entry *first;
+ void *base;
+ void *end;
+ struct ext4_xattr_entry *here;
+ int not_found;
+};
+
+struct ext4_xattr_ibody_find {
+ struct ext4_xattr_search s;
+ struct ext4_iloc iloc;
+};
extern const struct xattr_handler ext4_xattr_user_handler;
extern const struct xattr_handler ext4_xattr_trusted_handler;
@@ -90,60 +116,82 @@ extern void ext4_exit_xattr(void);
extern const struct xattr_handler *ext4_xattr_handlers[];
-# else /* CONFIG_EXT4_FS_XATTR */
-
-static inline int
-ext4_xattr_get(struct inode *inode, int name_index, const char *name,
- void *buffer, size_t size, int flags)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int
-ext4_xattr_set(struct inode *inode, int name_index, const char *name,
- const void *value, size_t size, int flags)
-{
- return -EOPNOTSUPP;
-}
-
-static inline int
-ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
- const char *name, const void *value, size_t size, int flags)
-{
- return -EOPNOTSUPP;
-}
-
-static inline void
-ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
-{
-}
-
-static inline void
-ext4_xattr_put_super(struct super_block *sb)
-{
-}
-
-static __init inline int
-ext4_init_xattr(void)
-{
- return 0;
-}
-
-static inline void
-ext4_exit_xattr(void)
-{
-}
-
-static inline int
-ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
- struct ext4_inode *raw_inode, handle_t *handle)
-{
- return -EOPNOTSUPP;
-}
-
-#define ext4_xattr_handlers NULL
-
-# endif /* CONFIG_EXT4_FS_XATTR */
+extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is);
+extern int ext4_xattr_ibody_get(struct inode *inode, int name_index,
+ const char *name,
+ void *buffer, size_t buffer_size);
+extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is);
+
+extern int ext4_has_inline_data(struct inode *inode);
+extern int ext4_get_inline_size(struct inode *inode);
+extern int ext4_get_max_inline_size(struct inode *inode);
+extern int ext4_find_inline_data_nolock(struct inode *inode);
+extern void ext4_write_inline_data(struct inode *inode,
+ struct ext4_iloc *iloc,
+ void *buffer, loff_t pos,
+ unsigned int len);
+extern int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
+ unsigned int len);
+extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
+ unsigned int len);
+extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);
+
+extern int ext4_readpage_inline(struct inode *inode, struct page *page);
+extern int ext4_try_to_write_inline_data(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned flags,
+ struct page **pagep);
+extern int ext4_write_inline_data_end(struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned copied,
+ struct page *page);
+extern struct buffer_head *
+ext4_journalled_write_inline_data(struct inode *inode,
+ unsigned len,
+ struct page *page);
+extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
+ struct inode *inode,
+ loff_t pos, unsigned len,
+ unsigned flags,
+ struct page **pagep,
+ void **fsdata);
+extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
+ unsigned len, unsigned copied,
+ struct page *page);
+extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
+ struct inode *inode);
+extern int ext4_try_create_inline_dir(handle_t *handle,
+ struct inode *parent,
+ struct inode *inode);
+extern int ext4_read_inline_dir(struct file *filp,
+ void *dirent, filldir_t filldir,
+ int *has_inline_data);
+extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
+ const struct qstr *d_name,
+ struct ext4_dir_entry_2 **res_dir,
+ int *has_inline_data);
+extern int ext4_delete_inline_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh,
+ int *has_inline_data);
+extern int empty_inline_dir(struct inode *dir, int *has_inline_data);
+extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
+ struct ext4_dir_entry_2 **parent_de,
+ int *retval);
+extern int ext4_inline_data_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ int *has_inline);
+extern int ext4_try_to_evict_inline_data(handle_t *handle,
+ struct inode *inode,
+ int needed);
+extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline);
+
+extern int ext4_convert_inline_data(struct inode *inode);
#ifdef CONFIG_EXT4_FS_SECURITY
extern int ext4_init_security(handle_t *handle, struct inode *inode,
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 484b8d1c6cb..dbf41f9452d 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -60,7 +60,6 @@ EXPORT_SYMBOL(jbd2_journal_get_create_access);
EXPORT_SYMBOL(jbd2_journal_get_undo_access);
EXPORT_SYMBOL(jbd2_journal_set_triggers);
EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
-EXPORT_SYMBOL(jbd2_journal_release_buffer);
EXPORT_SYMBOL(jbd2_journal_forget);
#if 0
EXPORT_SYMBOL(journal_sync_buffer);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index d8da40e99d8..42f6615af0a 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1207,17 +1207,6 @@ out:
return ret;
}
-/*
- * jbd2_journal_release_buffer: undo a get_write_access without any buffer
- * updates, if the update decided in the end that it didn't need access.
- *
- */
-void
-jbd2_journal_release_buffer(handle_t *handle, struct buffer_head *bh)
-{
- BUFFER_TRACE(bh, "entry");
-}
-
/**
* void jbd2_journal_forget() - bforget() for potentially-journaled buffers.
* @handle: transaction handle
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 9cc4a3fbf4b..bc3968fa81e 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -193,19 +193,15 @@ static int nfs_idmap_init_keyring(void)
if (!cred)
return -ENOMEM;
- keyring = key_alloc(&key_type_keyring, ".id_resolver", 0, 0, cred,
- (KEY_POS_ALL & ~KEY_POS_SETATTR) |
- KEY_USR_VIEW | KEY_USR_READ,
- KEY_ALLOC_NOT_IN_QUOTA);
+ keyring = keyring_alloc(".id_resolver", 0, 0, cred,
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW | KEY_USR_READ,
+ KEY_ALLOC_NOT_IN_QUOTA, NULL);
if (IS_ERR(keyring)) {
ret = PTR_ERR(keyring);
goto failed_put_cred;
}
- ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
- if (ret < 0)
- goto failed_put_key;
-
ret = register_key_type(&key_type_id_resolver);
if (ret < 0)
goto failed_put_key;
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 284e80831d2..701beab27aa 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -219,6 +219,10 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
#define move_pte(pte, prot, old_addr, new_addr) (pte)
#endif
+#ifndef pte_accessible
+# define pte_accessible(pte) ((void)(pte),1)
+#endif
+
#ifndef flush_tlb_fix_spurious_fault
#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
#endif
@@ -580,6 +584,112 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
#endif
}
+#ifdef CONFIG_NUMA_BALANCING
+#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+/*
+ * _PAGE_NUMA works identical to _PAGE_PROTNONE (it's actually the
+ * same bit too). It's set only when _PAGE_PRESET is not set and it's
+ * never set if _PAGE_PRESENT is set.
+ *
+ * pte/pmd_present() returns true if pte/pmd_numa returns true. Page
+ * fault triggers on those regions if pte/pmd_numa returns true
+ * (because _PAGE_PRESENT is not set).
+ */
+#ifndef pte_numa
+static inline int pte_numa(pte_t pte)
+{
+ return (pte_flags(pte) &
+ (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
+}
+#endif
+
+#ifndef pmd_numa
+static inline int pmd_numa(pmd_t pmd)
+{
+ return (pmd_flags(pmd) &
+ (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
+}
+#endif
+
+/*
+ * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically
+ * because they're called by the NUMA hinting minor page fault. If we
+ * wouldn't set the _PAGE_ACCESSED bitflag here, the TLB miss handler
+ * would be forced to set it later while filling the TLB after we
+ * return to userland. That would trigger a second write to memory
+ * that we optimize away by setting _PAGE_ACCESSED here.
+ */
+#ifndef pte_mknonnuma
+static inline pte_t pte_mknonnuma(pte_t pte)
+{
+ pte = pte_clear_flags(pte, _PAGE_NUMA);
+ return pte_set_flags(pte, _PAGE_PRESENT|_PAGE_ACCESSED);
+}
+#endif
+
+#ifndef pmd_mknonnuma
+static inline pmd_t pmd_mknonnuma(pmd_t pmd)
+{
+ pmd = pmd_clear_flags(pmd, _PAGE_NUMA);
+ return pmd_set_flags(pmd, _PAGE_PRESENT|_PAGE_ACCESSED);
+}
+#endif
+
+#ifndef pte_mknuma
+static inline pte_t pte_mknuma(pte_t pte)
+{
+ pte = pte_set_flags(pte, _PAGE_NUMA);
+ return pte_clear_flags(pte, _PAGE_PRESENT);
+}
+#endif
+
+#ifndef pmd_mknuma
+static inline pmd_t pmd_mknuma(pmd_t pmd)
+{
+ pmd = pmd_set_flags(pmd, _PAGE_NUMA);
+ return pmd_clear_flags(pmd, _PAGE_PRESENT);
+}
+#endif
+#else
+extern int pte_numa(pte_t pte);
+extern int pmd_numa(pmd_t pmd);
+extern pte_t pte_mknonnuma(pte_t pte);
+extern pmd_t pmd_mknonnuma(pmd_t pmd);
+extern pte_t pte_mknuma(pte_t pte);
+extern pmd_t pmd_mknuma(pmd_t pmd);
+#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
+#else
+static inline int pmd_numa(pmd_t pmd)
+{
+ return 0;
+}
+
+static inline int pte_numa(pte_t pte)
+{
+ return 0;
+}
+
+static inline pte_t pte_mknonnuma(pte_t pte)
+{
+ return pte;
+}
+
+static inline pmd_t pmd_mknonnuma(pmd_t pmd)
+{
+ return pmd;
+}
+
+static inline pte_t pte_mknuma(pte_t pte)
+{
+ return pte;
+}
+
+static inline pmd_t pmd_mknuma(pmd_t pmd)
+{
+ return pmd;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
#endif /* CONFIG_MMU */
#endif /* !__ASSEMBLY__ */
diff --git a/include/linux/cred.h b/include/linux/cred.h
index ebbed2ce663..0142aacb70b 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -77,21 +77,6 @@ extern int in_group_p(kgid_t);
extern int in_egroup_p(kgid_t);
/*
- * The common credentials for a thread group
- * - shared by CLONE_THREAD
- */
-#ifdef CONFIG_KEYS
-struct thread_group_cred {
- atomic_t usage;
- pid_t tgid; /* thread group process ID */
- spinlock_t lock;
- struct key __rcu *session_keyring; /* keyring inherited over fork */
- struct key *process_keyring; /* keyring private to this process */
- struct rcu_head rcu; /* RCU deletion hook */
-};
-#endif
-
-/*
* The security context of a task
*
* The parts of the context break down into two categories:
@@ -139,6 +124,8 @@ struct cred {
#ifdef CONFIG_KEYS
unsigned char jit_keyring; /* default keyring to attach requested
* keys to */
+ struct key __rcu *session_keyring; /* keyring inherited over fork */
+ struct key *process_keyring; /* keyring private to this process */
struct key *thread_keyring; /* keyring private to this thread */
struct key *request_key_auth; /* assumed request_key authority */
struct thread_group_cred *tgcred; /* thread-group shared credentials */
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 092dc5305a3..1d76f8ca90f 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -31,7 +31,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd);
extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, pgprot_t newprot);
+ unsigned long addr, pgprot_t newprot,
+ int prot_numa);
enum transparent_hugepage_flag {
TRANSPARENT_HUGEPAGE_FLAG,
@@ -111,7 +112,7 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma,
#define wait_split_huge_page(__anon_vma, __pmd) \
do { \
pmd_t *____pmd = (__pmd); \
- anon_vma_lock(__anon_vma); \
+ anon_vma_lock_write(__anon_vma); \
anon_vma_unlock(__anon_vma); \
BUG_ON(pmd_trans_splitting(*____pmd) || \
pmd_trans_huge(*____pmd)); \
@@ -171,6 +172,10 @@ static inline struct page *compound_trans_head(struct page *page)
}
return page;
}
+
+extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd, pmd_t *pmdp);
+
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
@@ -209,6 +214,13 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd,
{
return 0;
}
+
+static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd, pmd_t *pmdp)
+{
+ return 0;
+}
+
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3e7fa1acf09..0c80d3f57a5 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -87,7 +87,7 @@ struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
pud_t *pud, int write);
int pmd_huge(pmd_t pmd);
int pud_huge(pud_t pmd);
-void hugetlb_change_protection(struct vm_area_struct *vma,
+unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot);
#else /* !CONFIG_HUGETLB_PAGE */
@@ -132,7 +132,11 @@ static inline void copy_huge_page(struct page *dst, struct page *src)
{
}
-#define hugetlb_change_protection(vma, address, end, newprot)
+static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
+ unsigned long address, unsigned long end, pgprot_t newprot)
+{
+ return 0;
+}
static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start,
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 3efc43f3f16..1be23d9fdac 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1096,7 +1096,6 @@ extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *);
void jbd2_journal_set_triggers(struct buffer_head *,
struct jbd2_buffer_trigger_type *type);
extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *);
-extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *);
extern int jbd2_journal_forget (handle_t *, struct buffer_head *);
extern void journal_sync_buffer (struct buffer_head *);
extern void jbd2_journal_invalidatepage(journal_t *,
@@ -1303,15 +1302,21 @@ static inline int jbd_space_needed(journal_t *journal)
extern int jbd_blocks_per_page(struct inode *inode);
+/* JBD uses a CRC32 checksum */
+#define JBD_MAX_CHECKSUM_SIZE 4
+
static inline u32 jbd2_chksum(journal_t *journal, u32 crc,
const void *address, unsigned int length)
{
struct {
struct shash_desc shash;
- char ctx[crypto_shash_descsize(journal->j_chksum_driver)];
+ char ctx[JBD_MAX_CHECKSUM_SIZE];
} desc;
int err;
+ BUG_ON(crypto_shash_descsize(journal->j_chksum_driver) >
+ JBD_MAX_CHECKSUM_SIZE);
+
desc.shash.tfm = journal->j_chksum_driver;
desc.shash.flags = 0;
*(u32 *)desc.ctx = crc;
diff --git a/include/linux/key.h b/include/linux/key.h
index 2393b1c040b..4dfde1161c5 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -265,6 +265,7 @@ extern int key_unlink(struct key *keyring,
extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid,
const struct cred *cred,
+ key_perm_t perm,
unsigned long flags,
struct key *dest);
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index dbd212723b7..9adc270de7e 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -188,6 +188,8 @@ static inline int vma_migratable(struct vm_area_struct *vma)
return 1;
}
+extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
+
#else
struct mempolicy {};
@@ -307,5 +309,11 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol,
return 0;
}
+static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
+ unsigned long address)
+{
+ return -1; /* no node preference */
+}
+
#endif /* CONFIG_NUMA */
#endif
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 0b5865c61ef..1e9f627967a 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -23,6 +23,15 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **);
#define MIGRATEPAGE_BALLOON_SUCCESS 1 /* special ret code for balloon page
* sucessful migration case.
*/
+enum migrate_reason {
+ MR_COMPACTION,
+ MR_MEMORY_FAILURE,
+ MR_MEMORY_HOTPLUG,
+ MR_SYSCALL, /* also applies to cpusets */
+ MR_MEMPOLICY_MBIND,
+ MR_NUMA_MISPLACED,
+ MR_CMA
+};
#ifdef CONFIG_MIGRATION
@@ -32,7 +41,7 @@ extern int migrate_page(struct address_space *,
struct page *, struct page *, enum migrate_mode);
extern int migrate_pages(struct list_head *l, new_page_t x,
unsigned long private, bool offlining,
- enum migrate_mode mode);
+ enum migrate_mode mode, int reason);
extern int migrate_huge_page(struct page *, new_page_t x,
unsigned long private, bool offlining,
enum migrate_mode mode);
@@ -54,7 +63,7 @@ static inline void putback_lru_pages(struct list_head *l) {}
static inline void putback_movable_pages(struct list_head *l) {}
static inline int migrate_pages(struct list_head *l, new_page_t x,
unsigned long private, bool offlining,
- enum migrate_mode mode) { return -ENOSYS; }
+ enum migrate_mode mode, int reason) { return -ENOSYS; }
static inline int migrate_huge_page(struct page *page, new_page_t x,
unsigned long private, bool offlining,
enum migrate_mode mode) { return -ENOSYS; }
@@ -83,4 +92,37 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
#define fail_migrate_page NULL
#endif /* CONFIG_MIGRATION */
+
+#ifdef CONFIG_NUMA_BALANCING
+extern int migrate_misplaced_page(struct page *page, int node);
+extern int migrate_misplaced_page(struct page *page, int node);
+extern bool migrate_ratelimited(int node);
+#else
+static inline int migrate_misplaced_page(struct page *page, int node)
+{
+ return -EAGAIN; /* can't migrate now */
+}
+static inline bool migrate_ratelimited(int node)
+{
+ return false;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+extern int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ pmd_t *pmd, pmd_t entry,
+ unsigned long address,
+ struct page *page, int node);
+#else
+static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ pmd_t *pmd, pmd_t entry,
+ unsigned long address,
+ struct page *page, int node)
+{
+ return -EAGAIN;
+}
+#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/
+
#endif /* _LINUX_MIGRATE_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4af4f0b1be4..7f4f906190b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -693,6 +693,36 @@ static inline int page_to_nid(const struct page *page)
}
#endif
+#ifdef CONFIG_NUMA_BALANCING
+static inline int page_xchg_last_nid(struct page *page, int nid)
+{
+ return xchg(&page->_last_nid, nid);
+}
+
+static inline int page_last_nid(struct page *page)
+{
+ return page->_last_nid;
+}
+static inline void reset_page_last_nid(struct page *page)
+{
+ page->_last_nid = -1;
+}
+#else
+static inline int page_xchg_last_nid(struct page *page, int nid)
+{
+ return page_to_nid(page);
+}
+
+static inline int page_last_nid(struct page *page)
+{
+ return page_to_nid(page);
+}
+
+static inline void reset_page_last_nid(struct page *page)
+{
+}
+#endif
+
static inline struct zone *page_zone(const struct page *page)
{
return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
@@ -1078,6 +1108,9 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
extern unsigned long do_mremap(unsigned long addr,
unsigned long old_len, unsigned long new_len,
unsigned long flags, unsigned long new_addr);
+extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, pgprot_t newprot,
+ int dirty_accountable, int prot_numa);
extern int mprotect_fixup(struct vm_area_struct *vma,
struct vm_area_struct **pprev, unsigned long start,
unsigned long end, unsigned long newflags);
@@ -1579,6 +1612,11 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
}
#endif
+#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+unsigned long change_prot_numa(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end);
+#endif
+
struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t);
@@ -1600,6 +1638,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
#define FOLL_MLOCK 0x40 /* mark page as mlocked */
#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */
#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
+#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */
typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
void *data);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7ade2731b5d..7d9ebb7cc98 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -175,6 +175,10 @@ struct page {
*/
void *shadow;
#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+ int _last_nid;
+#endif
}
/*
* The struct page can be forced to be double word aligned so that atomic ops
@@ -411,9 +415,36 @@ struct mm_struct {
#ifdef CONFIG_CPUMASK_OFFSTACK
struct cpumask cpumask_allocation;
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ /*
+ * numa_next_scan is the next time when the PTEs will me marked
+ * pte_numa to gather statistics and migrate pages to new nodes
+ * if necessary
+ */
+ unsigned long numa_next_scan;
+
+ /* numa_next_reset is when the PTE scanner period will be reset */
+ unsigned long numa_next_reset;
+
+ /* Restart point for scanning and setting pte_numa */
+ unsigned long numa_scan_offset;
+
+ /* numa_scan_seq prevents two threads setting pte_numa */
+ int numa_scan_seq;
+
+ /*
+ * The first node a task was scheduled on. If a task runs on
+ * a different node than Make PTE Scan Go Now.
+ */
+ int first_nid;
+#endif
struct uprobes_state uprobes_state;
};
+/* first nid will either be a valid NID or one of these values */
+#define NUMA_PTE_SCAN_INIT -1
+#define NUMA_PTE_SCAN_ACTIVE -2
+
static inline void mm_init_cpumask(struct mm_struct *mm)
{
#ifdef CONFIG_CPUMASK_OFFSTACK
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index cd55dad56aa..4bec5be82ca 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -735,6 +735,19 @@ typedef struct pglist_data {
struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */
int kswapd_max_order;
enum zone_type classzone_idx;
+#ifdef CONFIG_NUMA_BALANCING
+ /*
+ * Lock serializing the per destination node AutoNUMA memory
+ * migration rate limiting data.
+ */
+ spinlock_t numabalancing_migrate_lock;
+
+ /* Rate limiting time interval */
+ unsigned long numabalancing_migrate_next_window;
+
+ /* Number of pages migrated during the rate limiting time interval */
+ unsigned long numabalancing_migrate_nr_pages;
+#endif
} pg_data_t;
#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bfe1f478064..c20635c527a 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -7,7 +7,7 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
-#include <linux/mutex.h>
+#include <linux/rwsem.h>
#include <linux/memcontrol.h>
/*
@@ -25,8 +25,8 @@
* pointing to this anon_vma once its vma list is empty.
*/
struct anon_vma {
- struct anon_vma *root; /* Root of this anon_vma tree */
- struct mutex mutex; /* Serialize access to vma list */
+ struct anon_vma *root; /* Root of this anon_vma tree */
+ struct rw_semaphore rwsem; /* W: modification, R: walking the list */
/*
* The refcount is taken on an anon_vma when there is no
* guarantee that the vma of page tables will exist for
@@ -64,7 +64,7 @@ struct anon_vma_chain {
struct vm_area_struct *vma;
struct anon_vma *anon_vma;
struct list_head same_vma; /* locked by mmap_sem & page_table_lock */
- struct rb_node rb; /* locked by anon_vma->mutex */
+ struct rb_node rb; /* locked by anon_vma->rwsem */
unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
unsigned long cached_vma_start, cached_vma_last;
@@ -108,26 +108,37 @@ static inline void vma_lock_anon_vma(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
if (anon_vma)
- mutex_lock(&anon_vma->root->mutex);
+ down_write(&anon_vma->root->rwsem);
}
static inline void vma_unlock_anon_vma(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
if (anon_vma)
- mutex_unlock(&anon_vma->root->mutex);
+ up_write(&anon_vma->root->rwsem);
}
-static inline void anon_vma_lock(struct anon_vma *anon_vma)
+static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
- mutex_lock(&anon_vma->root->mutex);
+ down_write(&anon_vma->root->rwsem);
}
static inline void anon_vma_unlock(struct anon_vma *anon_vma)
{
- mutex_unlock(&anon_vma->root->mutex);
+ up_write(&anon_vma->root->rwsem);
}
+static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
+{
+ down_read(&anon_vma->root->rwsem);
+}
+
+static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
+{
+ up_read(&anon_vma->root->rwsem);
+}
+
+
/*
* anon_vma helper functions.
*/
@@ -220,8 +231,8 @@ int try_to_munlock(struct page *);
/*
* Called by memory-failure.c to kill processes.
*/
-struct anon_vma *page_lock_anon_vma(struct page *page);
-void page_unlock_anon_vma(struct anon_vma *anon_vma);
+struct anon_vma *page_lock_anon_vma_read(struct page *page);
+void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
/*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c2f3072bee..b089c92c609 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1527,6 +1527,14 @@ struct task_struct {
short il_next;
short pref_node_fork;
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ int numa_scan_seq;
+ int numa_migrate_seq;
+ unsigned int numa_scan_period;
+ u64 node_stamp; /* migration stamp */
+ struct callback_head numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
+
struct rcu_head rcu;
/*
@@ -1601,6 +1609,18 @@ struct task_struct {
/* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
+#ifdef CONFIG_NUMA_BALANCING
+extern void task_numa_fault(int node, int pages, bool migrated);
+extern void set_numabalancing_state(bool enabled);
+#else
+static inline void task_numa_fault(int node, int pages, bool migrated)
+{
+}
+static inline void set_numabalancing_state(bool enabled)
+{
+}
+#endif
+
/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
@@ -2030,6 +2050,13 @@ enum sched_tunable_scaling {
};
extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
+extern unsigned int sysctl_numa_balancing_scan_delay;
+extern unsigned int sysctl_numa_balancing_scan_period_min;
+extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_scan_period_reset;
+extern unsigned int sysctl_numa_balancing_scan_size;
+extern unsigned int sysctl_numa_balancing_settle_count;
+
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 8d08b3ed406..071d62c214a 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -34,21 +34,25 @@ enum dma_sync_target {
SYNC_FOR_CPU = 0,
SYNC_FOR_DEVICE = 1,
};
-extern void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr,
- phys_addr_t phys, size_t size,
- enum dma_data_direction dir);
-extern void swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr,
+/* define the last possible byte of physical address space as a mapping error */
+#define SWIOTLB_MAP_ERROR (~(phys_addr_t)0x0)
+
+extern phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
+ dma_addr_t tbl_dma_addr,
+ phys_addr_t phys, size_t size,
+ enum dma_data_direction dir);
+
+extern void swiotlb_tbl_unmap_single(struct device *hwdev,
+ phys_addr_t tlb_addr,
size_t size, enum dma_data_direction dir);
-extern void swiotlb_tbl_sync_single(struct device *hwdev, char *dma_addr,
+extern void swiotlb_tbl_sync_single(struct device *hwdev,
+ phys_addr_t tlb_addr,
size_t size, enum dma_data_direction dir,
enum dma_sync_target target);
/* Accessory functions. */
-extern void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
- enum dma_data_direction dir);
-
extern void
*swiotlb_alloc_coherent(struct device *hwdev, size_t size,
dma_addr_t *dma_handle, gfp_t flags);
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index fe786f07d2b..fce0a2799d4 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -38,8 +38,18 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY,
KSWAPD_SKIP_CONGESTION_WAIT,
PAGEOUTRUN, ALLOCSTALL, PGROTATED,
+#ifdef CONFIG_NUMA_BALANCING
+ NUMA_PTE_UPDATES,
+ NUMA_HINT_FAULTS,
+ NUMA_HINT_FAULTS_LOCAL,
+ NUMA_PAGE_MIGRATE,
+#endif
+#ifdef CONFIG_MIGRATION
+ PGMIGRATE_SUCCESS, PGMIGRATE_FAIL,
+#endif
#ifdef CONFIG_COMPACTION
- COMPACTBLOCKS, COMPACTPAGES, COMPACTPAGEFAILED,
+ COMPACTMIGRATE_SCANNED, COMPACTFREE_SCANNED,
+ COMPACTISOLATED,
COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS,
#endif
#ifdef CONFIG_HUGETLB_PAGE
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 92a86b2cce3..a13291f7da8 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -80,6 +80,14 @@ static inline void vm_events_fold_cpu(int cpu)
#endif /* CONFIG_VM_EVENT_COUNTERS */
+#ifdef CONFIG_NUMA_BALANCING
+#define count_vm_numa_event(x) count_vm_event(x)
+#define count_vm_numa_events(x, y) count_vm_events(x, y)
+#else
+#define count_vm_numa_event(x) do {} while (0)
+#define count_vm_numa_events(x, y) do {} while (0)
+#endif /* CONFIG_NUMA_BALANCING */
+
#define __count_zone_vm_events(item, zone, delta) \
__count_vm_events(item##_NORMAL - ZONE_NORMAL + \
zone_idx(zone), delta)
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index d49b285385e..f6372b01136 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -15,6 +15,7 @@ struct ext4_inode_info;
struct mpage_da_data;
struct ext4_map_blocks;
struct ext4_extent;
+struct extent_status;
#define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode))
@@ -1519,10 +1520,9 @@ DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter,
);
DECLARE_EVENT_CLASS(ext4__map_blocks_exit,
- TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
- ext4_fsblk_t pblk, unsigned int len, int ret),
+ TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret),
- TP_ARGS(inode, lblk, pblk, len, ret),
+ TP_ARGS(inode, map, ret),
TP_STRUCT__entry(
__field( dev_t, dev )
@@ -1530,37 +1530,37 @@ DECLARE_EVENT_CLASS(ext4__map_blocks_exit,
__field( ext4_fsblk_t, pblk )
__field( ext4_lblk_t, lblk )
__field( unsigned int, len )
+ __field( unsigned int, flags )
__field( int, ret )
),
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
- __entry->pblk = pblk;
- __entry->lblk = lblk;
- __entry->len = len;
+ __entry->pblk = map->m_pblk;
+ __entry->lblk = map->m_lblk;
+ __entry->len = map->m_len;
+ __entry->flags = map->m_flags;
__entry->ret = ret;
),
- TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u ret %d",
+ TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u flags %x ret %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
__entry->lblk, __entry->pblk,
- __entry->len, __entry->ret)
+ __entry->len, __entry->flags, __entry->ret)
);
DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit,
- TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
- ext4_fsblk_t pblk, unsigned len, int ret),
+ TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret),
- TP_ARGS(inode, lblk, pblk, len, ret)
+ TP_ARGS(inode, map, ret)
);
DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit,
- TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
- ext4_fsblk_t pblk, unsigned len, int ret),
+ TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int ret),
- TP_ARGS(inode, lblk, pblk, len, ret)
+ TP_ARGS(inode, map, ret)
);
TRACE_EVENT(ext4_ext_load_extent,
@@ -1680,10 +1680,10 @@ DEFINE_EVENT(ext4__trim, ext4_trim_all_free,
);
TRACE_EVENT(ext4_ext_handle_uninitialized_extents,
- TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
+ TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int flags,
unsigned int allocated, ext4_fsblk_t newblock),
- TP_ARGS(inode, map, allocated, newblock),
+ TP_ARGS(inode, map, flags, allocated, newblock),
TP_STRUCT__entry(
__field( dev_t, dev )
@@ -1699,7 +1699,7 @@ TRACE_EVENT(ext4_ext_handle_uninitialized_extents,
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
- __entry->flags = map->m_flags;
+ __entry->flags = flags;
__entry->lblk = map->m_lblk;
__entry->pblk = map->m_pblk;
__entry->len = map->m_len;
@@ -1707,7 +1707,7 @@ TRACE_EVENT(ext4_ext_handle_uninitialized_extents,
__entry->newblk = newblock;
),
- TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %d"
+ TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %x "
"allocated %d newblock %llu",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
@@ -2055,6 +2055,106 @@ TRACE_EVENT(ext4_ext_remove_space_done,
(unsigned short) __entry->eh_entries)
);
+TRACE_EVENT(ext4_es_insert_extent,
+ TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len),
+
+ TP_ARGS(inode, start, len),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( ino_t, ino )
+ __field( loff_t, start )
+ __field( loff_t, len )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->start = start;
+ __entry->len = len;
+ ),
+
+ TP_printk("dev %d,%d ino %lu es [%lld/%lld)",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long) __entry->ino,
+ __entry->start, __entry->len)
+);
+
+TRACE_EVENT(ext4_es_remove_extent,
+ TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len),
+
+ TP_ARGS(inode, start, len),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( ino_t, ino )
+ __field( loff_t, start )
+ __field( loff_t, len )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->start = start;
+ __entry->len = len;
+ ),
+
+ TP_printk("dev %d,%d ino %lu es [%lld/%lld)",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long) __entry->ino,
+ __entry->start, __entry->len)
+);
+
+TRACE_EVENT(ext4_es_find_extent_enter,
+ TP_PROTO(struct inode *inode, ext4_lblk_t start),
+
+ TP_ARGS(inode, start),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( ino_t, ino )
+ __field( ext4_lblk_t, start )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->start = start;
+ ),
+
+ TP_printk("dev %d,%d ino %lu start %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long) __entry->ino, __entry->start)
+);
+
+TRACE_EVENT(ext4_es_find_extent_exit,
+ TP_PROTO(struct inode *inode, struct extent_status *es,
+ ext4_lblk_t ret),
+
+ TP_ARGS(inode, es, ret),
+
+ TP_STRUCT__entry(
+ __field( dev_t, dev )
+ __field( ino_t, ino )
+ __field( ext4_lblk_t, start )
+ __field( ext4_lblk_t, len )
+ __field( ext4_lblk_t, ret )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->start = es->start;
+ __entry->len = es->len;
+ __entry->ret = ret;
+ ),
+
+ TP_printk("dev %d,%d ino %lu es [%u/%u) ret %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ (unsigned long) __entry->ino,
+ __entry->start, __entry->len, __entry->ret)
+);
+
#endif /* _TRACE_EXT4_H */
/* This part must be outside protection */
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
new file mode 100644
index 00000000000..ec2a6ccfd7e
--- /dev/null
+++ b/include/trace/events/migrate.h
@@ -0,0 +1,51 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM migrate
+
+#if !defined(_TRACE_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MIGRATE_H
+
+#define MIGRATE_MODE \
+ {MIGRATE_ASYNC, "MIGRATE_ASYNC"}, \
+ {MIGRATE_SYNC_LIGHT, "MIGRATE_SYNC_LIGHT"}, \
+ {MIGRATE_SYNC, "MIGRATE_SYNC"}
+
+#define MIGRATE_REASON \
+ {MR_COMPACTION, "compaction"}, \
+ {MR_MEMORY_FAILURE, "memory_failure"}, \
+ {MR_MEMORY_HOTPLUG, "memory_hotplug"}, \
+ {MR_SYSCALL, "syscall_or_cpuset"}, \
+ {MR_MEMPOLICY_MBIND, "mempolicy_mbind"}, \
+ {MR_CMA, "cma"}
+
+TRACE_EVENT(mm_migrate_pages,
+
+ TP_PROTO(unsigned long succeeded, unsigned long failed,
+ enum migrate_mode mode, int reason),
+
+ TP_ARGS(succeeded, failed, mode, reason),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, succeeded)
+ __field( unsigned long, failed)
+ __field( enum migrate_mode, mode)
+ __field( int, reason)
+ ),
+
+ TP_fast_assign(
+ __entry->succeeded = succeeded;
+ __entry->failed = failed;
+ __entry->mode = mode;
+ __entry->reason = reason;
+ ),
+
+ TP_printk("nr_succeeded=%lu nr_failed=%lu mode=%s reason=%s",
+ __entry->succeeded,
+ __entry->failed,
+ __print_symbolic(__entry->mode, MIGRATE_MODE),
+ __print_symbolic(__entry->reason, MIGRATE_REASON))
+);
+
+#endif /* _TRACE_MIGRATE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 23e62e0537e..0d11c3dcd3a 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -20,6 +20,7 @@ enum {
MPOL_PREFERRED,
MPOL_BIND,
MPOL_INTERLEAVE,
+ MPOL_LOCAL,
MPOL_MAX, /* always last member of enum */
};
@@ -47,9 +48,15 @@ enum mpol_rebind_step {
/* Flags for mbind */
#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
-#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */
-#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */
-#define MPOL_MF_INTERNAL (1<<3) /* Internal flags start here */
+#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform
+ to policy */
+#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */
+#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */
+#define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */
+
+#define MPOL_MF_VALID (MPOL_MF_STRICT | \
+ MPOL_MF_MOVE | \
+ MPOL_MF_MOVE_ALL)
/*
* Internal flags that share the struct mempolicy flags word with
@@ -59,6 +66,8 @@ enum mpol_rebind_step {
#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */
#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */
+#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
+#define MPOL_F_MORON (1 << 4) /* Migrate On pte_numa Reference On Node */
#endif /* _UAPI_LINUX_MEMPOLICY_H */
diff --git a/init/Kconfig b/init/Kconfig
index 2054e048bb9..1a207efca59 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -717,6 +717,50 @@ config LOG_BUF_SHIFT
config HAVE_UNSTABLE_SCHED_CLOCK
bool
+#
+# For architectures that want to enable the support for NUMA-affine scheduler
+# balancing logic:
+#
+config ARCH_SUPPORTS_NUMA_BALANCING
+ bool
+
+# For architectures that (ab)use NUMA to represent different memory regions
+# all cpu-local but of different latencies, such as SuperH.
+#
+config ARCH_WANT_NUMA_VARIABLE_LOCALITY
+ bool
+
+#
+# For architectures that are willing to define _PAGE_NUMA as _PAGE_PROTNONE
+config ARCH_WANTS_PROT_NUMA_PROT_NONE
+ bool
+
+config ARCH_USES_NUMA_PROT_NONE
+ bool
+ default y
+ depends on ARCH_WANTS_PROT_NUMA_PROT_NONE
+ depends on NUMA_BALANCING
+
+config NUMA_BALANCING_DEFAULT_ENABLED
+ bool "Automatically enable NUMA aware memory/task placement"
+ default y
+ depends on NUMA_BALANCING
+ help
+ If set, autonumic NUMA balancing will be enabled if running on a NUMA
+ machine.
+
+config NUMA_BALANCING
+ bool "Memory placement aware NUMA scheduler"
+ depends on ARCH_SUPPORTS_NUMA_BALANCING
+ depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY
+ depends on SMP && NUMA && MIGRATION
+ help
+ This option adds support for automatic NUMA aware memory/task placement.
+ The mechanism is quite primitive and is based on migrating memory when
+ it is references to the node the task is running on.
+
+ This system will be inactive on UMA systems.
+
menuconfig CGROUPS
boolean "Control Group support"
depends on EVENTFD
diff --git a/kernel/cred.c b/kernel/cred.c
index 48cea3da6d0..8888afb846e 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -30,17 +30,6 @@
static struct kmem_cache *cred_jar;
/*
- * The common credentials for the initial task's thread group
- */
-#ifdef CONFIG_KEYS
-static struct thread_group_cred init_tgcred = {
- .usage = ATOMIC_INIT(2),
- .tgid = 0,
- .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
-};
-#endif
-
-/*
* The initial credentials for the initial task
*/
struct cred init_cred = {
@@ -65,9 +54,6 @@ struct cred init_cred = {
.user = INIT_USER,
.user_ns = &init_user_ns,
.group_info = &init_groups,
-#ifdef CONFIG_KEYS
- .tgcred = &init_tgcred,
-#endif
};
static inline void set_cred_subscribers(struct cred *cred, int n)
@@ -96,36 +82,6 @@ static inline void alter_cred_subscribers(const struct cred *_cred, int n)
}
/*
- * Dispose of the shared task group credentials
- */
-#ifdef CONFIG_KEYS
-static void release_tgcred_rcu(struct rcu_head *rcu)
-{
- struct thread_group_cred *tgcred =
- container_of(rcu, struct thread_group_cred, rcu);
-
- BUG_ON(atomic_read(&tgcred->usage) != 0);
-
- key_put(tgcred->session_keyring);
- key_put(tgcred->process_keyring);
- kfree(tgcred);
-}
-#endif
-
-/*
- * Release a set of thread group credentials.
- */
-static void release_tgcred(struct cred *cred)
-{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred = cred->tgcred;
-
- if (atomic_dec_and_test(&tgcred->usage))
- call_rcu(&tgcred->rcu, release_tgcred_rcu);
-#endif
-}
-
-/*
* The RCU callback to actually dispose of a set of credentials
*/
static void put_cred_rcu(struct rcu_head *rcu)
@@ -150,9 +106,10 @@ static void put_cred_rcu(struct rcu_head *rcu)
#endif
security_cred_free(cred);
+ key_put(cred->session_keyring);
+ key_put(cred->process_keyring);
key_put(cred->thread_keyring);
key_put(cred->request_key_auth);
- release_tgcred(cred);
if (cred->group_info)
put_group_info(cred->group_info);
free_uid(cred->user);
@@ -246,15 +203,6 @@ struct cred *cred_alloc_blank(void)
if (!new)
return NULL;
-#ifdef CONFIG_KEYS
- new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
- if (!new->tgcred) {
- kmem_cache_free(cred_jar, new);
- return NULL;
- }
- atomic_set(&new->tgcred->usage, 1);
-#endif
-
atomic_set(&new->usage, 1);
#ifdef CONFIG_DEBUG_CREDENTIALS
new->magic = CRED_MAGIC;
@@ -308,9 +256,10 @@ struct cred *prepare_creds(void)
get_user_ns(new->user_ns);
#ifdef CONFIG_KEYS
+ key_get(new->session_keyring);
+ key_get(new->process_keyring);
key_get(new->thread_keyring);
key_get(new->request_key_auth);
- atomic_inc(&new->tgcred->usage);
#endif
#ifdef CONFIG_SECURITY
@@ -334,39 +283,20 @@ EXPORT_SYMBOL(prepare_creds);
*/
struct cred *prepare_exec_creds(void)
{
- struct thread_group_cred *tgcred = NULL;
struct cred *new;
-#ifdef CONFIG_KEYS
- tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
- if (!tgcred)
- return NULL;
-#endif
-
new = prepare_creds();
- if (!new) {
- kfree(tgcred);
+ if (!new)
return new;
- }
#ifdef CONFIG_KEYS
/* newly exec'd tasks don't get a thread keyring */
key_put(new->thread_keyring);
new->thread_keyring = NULL;
- /* create a new per-thread-group creds for all this set of threads to
- * share */
- memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred));
-
- atomic_set(&tgcred->usage, 1);
- spin_lock_init(&tgcred->lock);
-
/* inherit the session keyring; new process keyring */
- key_get(tgcred->session_keyring);
- tgcred->process_keyring = NULL;
-
- release_tgcred(new);
- new->tgcred = tgcred;
+ key_put(new->process_keyring);
+ new->process_keyring = NULL;
#endif
return new;
@@ -383,9 +313,6 @@ struct cred *prepare_exec_creds(void)
*/
int copy_creds(struct task_struct *p, unsigned long clone_flags)
{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred;
-#endif
struct cred *new;
int ret;
@@ -425,22 +352,12 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
install_thread_keyring_to_cred(new);
}
- /* we share the process and session keyrings between all the threads in
- * a process - this is slightly icky as we violate COW credentials a
- * bit */
+ /* The process keyring is only shared between the threads in a process;
+ * anything outside of those threads doesn't inherit.
+ */
if (!(clone_flags & CLONE_THREAD)) {
- tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
- if (!tgcred) {
- ret = -ENOMEM;
- goto error_put;
- }
- atomic_set(&tgcred->usage, 1);
- spin_lock_init(&tgcred->lock);
- tgcred->process_keyring = NULL;
- tgcred->session_keyring = key_get(new->tgcred->session_keyring);
-
- release_tgcred(new);
- new->tgcred = tgcred;
+ key_put(new->process_keyring);
+ new->process_keyring = NULL;
}
#endif
@@ -643,9 +560,6 @@ void __init cred_init(void)
*/
struct cred *prepare_kernel_cred(struct task_struct *daemon)
{
-#ifdef CONFIG_KEYS
- struct thread_group_cred *tgcred;
-#endif
const struct cred *old;
struct cred *new;
@@ -653,14 +567,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
if (!new)
return NULL;
-#ifdef CONFIG_KEYS
- tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
- if (!tgcred) {
- kmem_cache_free(cred_jar, new);
- return NULL;
- }
-#endif
-
kdebug("prepare_kernel_cred() alloc %p", new);
if (daemon)
@@ -678,13 +584,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
get_group_info(new->group_info);
#ifdef CONFIG_KEYS
- atomic_set(&tgcred->usage, 1);
- spin_lock_init(&tgcred->lock);
- tgcred->process_keyring = NULL;
- tgcred->session_keyring = NULL;
- new->tgcred = tgcred;
- new->request_key_auth = NULL;
+ new->session_keyring = NULL;
+ new->process_keyring = NULL;
new->thread_keyring = NULL;
+ new->request_key_auth = NULL;
new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 3c31e874afa..115d6c2e4cc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -823,6 +823,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
mm->pmd_huge_pte = NULL;
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ mm->first_nid = NUMA_PTE_SCAN_INIT;
+#endif
if (!mm_init(mm, tsk))
goto fail_nomem;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0533496b622..c1fb82104bf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -193,23 +193,10 @@ static void sched_feat_disable(int i) { };
static void sched_feat_enable(int i) { };
#endif /* HAVE_JUMP_LABEL */
-static ssize_t
-sched_feat_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+static int sched_feat_set(char *cmp)
{
- char buf[64];
- char *cmp;
- int neg = 0;
int i;
-
- if (cnt > 63)
- cnt = 63;
-
- if (copy_from_user(&buf, ubuf, cnt))
- return -EFAULT;
-
- buf[cnt] = 0;
- cmp = strstrip(buf);
+ int neg = 0;
if (strncmp(cmp, "NO_", 3) == 0) {
neg = 1;
@@ -229,6 +216,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
}
}
+ return i;
+}
+
+static ssize_t
+sched_feat_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ char *cmp;
+ int i;
+
+ if (cnt > 63)
+ cnt = 63;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+ cmp = strstrip(buf);
+
+ i = sched_feat_set(cmp);
if (i == __SCHED_FEAT_NR)
return -EINVAL;
@@ -1560,7 +1568,40 @@ static void __sched_fork(struct task_struct *p)
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+ if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+ p->mm->numa_next_scan = jiffies;
+ p->mm->numa_next_reset = jiffies;
+ p->mm->numa_scan_seq = 0;
+ }
+
+ p->node_stamp = 0ULL;
+ p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+ p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+ p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ p->numa_work.next = &p->numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
+}
+
+#ifdef CONFIG_NUMA_BALANCING
+#ifdef CONFIG_SCHED_DEBUG
+void set_numabalancing_state(bool enabled)
+{
+ if (enabled)
+ sched_feat_set("NUMA");
+ else
+ sched_feat_set("NO_NUMA");
+}
+#else
+__read_mostly bool numabalancing_enabled;
+
+void set_numabalancing_state(bool enabled)
+{
+ numabalancing_enabled = enabled;
}
+#endif /* CONFIG_SCHED_DEBUG */
+#endif /* CONFIG_NUMA_BALANCING */
/*
* fork()/clone()-time setup:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 756f9f9e854..9af5af979a1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,9 @@
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/migrate.h>
+#include <linux/task_work.h>
#include <trace/events/sched.h>
@@ -774,6 +777,227 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * numa task sample period in ms
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 100;
+unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
+unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
+
+/* Portion of address space to scan in MB */
+unsigned int sysctl_numa_balancing_scan_size = 256;
+
+/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
+unsigned int sysctl_numa_balancing_scan_delay = 1000;
+
+static void task_numa_placement(struct task_struct *p)
+{
+ int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+
+ if (p->numa_scan_seq == seq)
+ return;
+ p->numa_scan_seq = seq;
+
+ /* FIXME: Scheduling placement policy hints go here */
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int node, int pages, bool migrated)
+{
+ struct task_struct *p = current;
+
+ if (!sched_feat_numa(NUMA))
+ return;
+
+ /* FIXME: Allocate task-specific structure for placement policy here */
+
+ /*
+ * If pages are properly placed (did not migrate) then scan slower.
+ * This is reset periodically in case of phase changes
+ */
+ if (!migrated)
+ p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
+ p->numa_scan_period + jiffies_to_msecs(10));
+
+ task_numa_placement(p);
+}
+
+static void reset_ptenuma_scan(struct task_struct *p)
+{
+ ACCESS_ONCE(p->mm->numa_scan_seq)++;
+ p->mm->numa_scan_offset = 0;
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+ unsigned long migrate, next_scan, now = jiffies;
+ struct task_struct *p = current;
+ struct mm_struct *mm = p->mm;
+ struct vm_area_struct *vma;
+ unsigned long start, end;
+ long pages;
+
+ WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+ work->next = work; /* protect against double add */
+ /*
+ * Who cares about NUMA placement when they're dying.
+ *
+ * NOTE: make sure not to dereference p->mm before this check,
+ * exit_task_work() happens _after_ exit_mm() so we could be called
+ * without p->mm even though we still had it when we enqueued this
+ * work.
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ /*
+ * We do not care about task placement until a task runs on a node
+ * other than the first one used by the address space. This is
+ * largely because migrations are driven by what CPU the task
+ * is running on. If it's never scheduled on another node, it'll
+ * not migrate so why bother trapping the fault.
+ */
+ if (mm->first_nid == NUMA_PTE_SCAN_INIT)
+ mm->first_nid = numa_node_id();
+ if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
+ /* Are we running on a new node yet? */
+ if (numa_node_id() == mm->first_nid &&
+ !sched_feat_numa(NUMA_FORCE))
+ return;
+
+ mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
+ }
+
+ /*
+ * Reset the scan period if enough time has gone by. Objective is that
+ * scanning will be reduced if pages are properly placed. As tasks
+ * can enter different phases this needs to be re-examined. Lacking
+ * proper tracking of reference behaviour, this blunt hammer is used.
+ */
+ migrate = mm->numa_next_reset;
+ if (time_after(now, migrate)) {
+ p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+ next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
+ xchg(&mm->numa_next_reset, next_scan);
+ }
+
+ /*
+ * Enforce maximal scan/migration frequency..
+ */
+ migrate = mm->numa_next_scan;
+ if (time_before(now, migrate))
+ return;
+
+ if (p->numa_scan_period == 0)
+ p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+
+ next_scan = now + msecs_to_jiffies(p->numa_scan_period);
+ if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+ return;
+
+ /*
+ * Do not set pte_numa if the current running node is rate-limited.
+ * This loses statistics on the fault but if we are unwilling to
+ * migrate to this node, it is less likely we can do useful work
+ */
+ if (migrate_ratelimited(numa_node_id()))
+ return;
+
+ start = mm->numa_scan_offset;
+ pages = sysctl_numa_balancing_scan_size;
+ pages <<= 20 - PAGE_SHIFT; /* MB in pages */
+ if (!pages)
+ return;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, start);
+ if (!vma) {
+ reset_ptenuma_scan(p);
+ start = 0;
+ vma = mm->mmap;
+ }
+ for (; vma; vma = vma->vm_next) {
+ if (!vma_migratable(vma))
+ continue;
+
+ /* Skip small VMAs. They are not likely to be of relevance */
+ if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) < HPAGE_PMD_NR)
+ continue;
+
+ do {
+ start = max(start, vma->vm_start);
+ end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
+ end = min(end, vma->vm_end);
+ pages -= change_prot_numa(vma, start, end);
+
+ start = end;
+ if (pages <= 0)
+ goto out;
+ } while (end != vma->vm_end);
+ }
+
+out:
+ /*
+ * It is possible to reach the end of the VMA list but the last few VMAs are
+ * not guaranteed to the vma_migratable. If they are not, we would find the
+ * !migratable VMA on the next scan but not reset the scanner to the start
+ * so check it now.
+ */
+ if (vma)
+ mm->numa_scan_offset = start;
+ else
+ reset_ptenuma_scan(p);
+ up_read(&mm->mmap_sem);
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+ struct callback_head *work = &curr->numa_work;
+ u64 period, now;
+
+ /*
+ * We don't care about NUMA placement if we don't have memory.
+ */
+ if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+ return;
+
+ /*
+ * Using runtime rather than walltime has the dual advantage that
+ * we (mostly) drive the selection from busy threads and that the
+ * task needs to have done some actual work before we bother with
+ * NUMA placement.
+ */
+ now = curr->se.sum_exec_runtime;
+ period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+ if (now - curr->node_stamp > period) {
+ if (!curr->node_stamp)
+ curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+ curr->node_stamp = now;
+
+ if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+ init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+ task_work_add(curr, work, true);
+ }
+ }
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -5501,6 +5725,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
entity_tick(cfs_rq, se, queued);
}
+ if (sched_feat_numa(NUMA))
+ task_tick_numa(rq, curr);
+
update_rq_runnable_avg(rq, 1);
}
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index e68e69ab917..1ad1d2b5395 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -66,3 +66,14 @@ SCHED_FEAT(TTWU_QUEUE, true)
SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
+
+/*
+ * Apply the automatic NUMA scheduling policy. Enabled automatically
+ * at runtime if running on a NUMA machine. Can be controlled via
+ * numa_balancing=. Allow PTE scanning to be forced on UMA machines
+ * for debugging the core machinery.
+ */
+#ifdef CONFIG_NUMA_BALANCING
+SCHED_FEAT(NUMA, false)
+SCHED_FEAT(NUMA_FORCE, false)
+#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5eca173b563..fc886441436 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -663,6 +663,18 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
+#ifdef CONFIG_NUMA_BALANCING
+#define sched_feat_numa(x) sched_feat(x)
+#ifdef CONFIG_SCHED_DEBUG
+#define numabalancing_enabled sched_feat_numa(NUMA)
+#else
+extern bool numabalancing_enabled;
+#endif /* CONFIG_SCHED_DEBUG */
+#else
+#define sched_feat_numa(x) (0)
+#define numabalancing_enabled (0)
+#endif /* CONFIG_NUMA_BALANCING */
+
static inline u64 global_rt_period(void)
{
return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index ee376beedaf..5af44b59377 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -396,25 +396,29 @@ int __secure_computing(int this_syscall)
#ifdef CONFIG_SECCOMP_FILTER
case SECCOMP_MODE_FILTER: {
int data;
+ struct pt_regs *regs = task_pt_regs(current);
ret = seccomp_run_filters(this_syscall);
data = ret & SECCOMP_RET_DATA;
ret &= SECCOMP_RET_ACTION;
switch (ret) {
case SECCOMP_RET_ERRNO:
/* Set the low-order 16-bits as a errno. */
- syscall_set_return_value(current, task_pt_regs(current),
+ syscall_set_return_value(current, regs,
-data, 0);
goto skip;
case SECCOMP_RET_TRAP:
/* Show the handler the original registers. */
- syscall_rollback(current, task_pt_regs(current));
+ syscall_rollback(current, regs);
/* Let the filter pass back 16 bits of data. */
seccomp_send_sigsys(this_syscall, data);
goto skip;
case SECCOMP_RET_TRACE:
/* Skip these calls if there is no tracer. */
- if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP))
+ if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
+ syscall_set_return_value(current, regs,
+ -ENOSYS, 0);
goto skip;
+ }
/* Allow the BPF to provide the event message */
ptrace_event(PTRACE_EVENT_SECCOMP, data);
/*
@@ -425,6 +429,9 @@ int __secure_computing(int this_syscall)
*/
if (fatal_signal_pending(current))
break;
+ if (syscall_get_nr(current, regs) < 0)
+ goto skip; /* Explicit request to skip. */
+
return 0;
case SECCOMP_RET_ALLOW:
return 0;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 33f71f37267..c88878db491 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
static int min_wakeup_granularity_ns; /* 0 usecs */
static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
+#ifdef CONFIG_SMP
static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
#ifdef CONFIG_COMPACTION
static int min_extfrag_threshold;
@@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = {
.extra1 = &min_wakeup_granularity_ns,
.extra2 = &max_wakeup_granularity_ns,
},
+#ifdef CONFIG_SMP
{
.procname = "sched_tunable_scaling",
.data = &sysctl_sched_tunable_scaling,
@@ -347,7 +350,45 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
-#endif
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_NUMA_BALANCING
+ {
+ .procname = "numa_balancing_scan_delay_ms",
+ .data = &sysctl_numa_balancing_scan_delay,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "numa_balancing_scan_period_min_ms",
+ .data = &sysctl_numa_balancing_scan_period_min,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "numa_balancing_scan_period_reset",
+ .data = &sysctl_numa_balancing_scan_period_reset,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "numa_balancing_scan_period_max_ms",
+ .data = &sysctl_numa_balancing_scan_period_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "numa_balancing_scan_size_mb",
+ .data = &sysctl_numa_balancing_scan_size,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
{
.procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index f114bf6a8e1..196b06984de 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -57,7 +57,7 @@ int swiotlb_force;
* swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this
* API.
*/
-static char *io_tlb_start, *io_tlb_end;
+static phys_addr_t io_tlb_start, io_tlb_end;
/*
* The number of IO TLB blocks (in groups of 64) between io_tlb_start and
@@ -70,7 +70,7 @@ static unsigned long io_tlb_nslabs;
*/
static unsigned long io_tlb_overflow = 32*1024;
-static void *io_tlb_overflow_buffer;
+static phys_addr_t io_tlb_overflow_buffer;
/*
* This is a free list describing the number of free entries available from
@@ -125,27 +125,38 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
void swiotlb_print_info(void)
{
unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT;
- phys_addr_t pstart, pend;
+ unsigned char *vstart, *vend;
- pstart = virt_to_phys(io_tlb_start);
- pend = virt_to_phys(io_tlb_end);
+ vstart = phys_to_virt(io_tlb_start);
+ vend = phys_to_virt(io_tlb_end);
printk(KERN_INFO "software IO TLB [mem %#010llx-%#010llx] (%luMB) mapped at [%p-%p]\n",
- (unsigned long long)pstart, (unsigned long long)pend - 1,
- bytes >> 20, io_tlb_start, io_tlb_end - 1);
+ (unsigned long long)io_tlb_start,
+ (unsigned long long)io_tlb_end,
+ bytes >> 20, vstart, vend - 1);
}
void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
{
+ void *v_overflow_buffer;
unsigned long i, bytes;
bytes = nslabs << IO_TLB_SHIFT;
io_tlb_nslabs = nslabs;
- io_tlb_start = tlb;
+ io_tlb_start = __pa(tlb);
io_tlb_end = io_tlb_start + bytes;
/*
+ * Get the overflow emergency buffer
+ */
+ v_overflow_buffer = alloc_bootmem_low_pages(PAGE_ALIGN(io_tlb_overflow));
+ if (!v_overflow_buffer)
+ panic("Cannot allocate SWIOTLB overflow buffer!\n");
+
+ io_tlb_overflow_buffer = __pa(v_overflow_buffer);
+
+ /*
* Allocate and initialize the free list array. This array is used
* to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
* between io_tlb_start and io_tlb_end.
@@ -156,12 +167,6 @@ void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
io_tlb_index = 0;
io_tlb_orig_addr = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
- /*
- * Get the overflow emergency buffer
- */
- io_tlb_overflow_buffer = alloc_bootmem_low_pages(PAGE_ALIGN(io_tlb_overflow));
- if (!io_tlb_overflow_buffer)
- panic("Cannot allocate SWIOTLB overflow buffer!\n");
if (verbose)
swiotlb_print_info();
}
@@ -173,6 +178,7 @@ void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
static void __init
swiotlb_init_with_default_size(size_t default_size, int verbose)
{
+ unsigned char *vstart;
unsigned long bytes;
if (!io_tlb_nslabs) {
@@ -185,11 +191,11 @@ swiotlb_init_with_default_size(size_t default_size, int verbose)
/*
* Get IO TLB memory from the low pages
*/
- io_tlb_start = alloc_bootmem_low_pages(PAGE_ALIGN(bytes));
- if (!io_tlb_start)
+ vstart = alloc_bootmem_low_pages(PAGE_ALIGN(bytes));
+ if (!vstart)
panic("Cannot allocate SWIOTLB buffer");
- swiotlb_init_with_tbl(io_tlb_start, io_tlb_nslabs, verbose);
+ swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose);
}
void __init
@@ -207,6 +213,7 @@ int
swiotlb_late_init_with_default_size(size_t default_size)
{
unsigned long bytes, req_nslabs = io_tlb_nslabs;
+ unsigned char *vstart = NULL;
unsigned int order;
int rc = 0;
@@ -223,14 +230,14 @@ swiotlb_late_init_with_default_size(size_t default_size)
bytes = io_tlb_nslabs << IO_TLB_SHIFT;
while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) {
- io_tlb_start = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
- order);
- if (io_tlb_start)
+ vstart = (void *)__get_free_pages(GFP_DMA | __GFP_NOWARN,
+ order);
+ if (vstart)
break;
order--;
}
- if (!io_tlb_start) {
+ if (!vstart) {
io_tlb_nslabs = req_nslabs;
return -ENOMEM;
}
@@ -239,9 +246,9 @@ swiotlb_late_init_with_default_size(size_t default_size)
"for software IO TLB\n", (PAGE_SIZE << order) >> 20);
io_tlb_nslabs = SLABS_PER_PAGE << order;
}
- rc = swiotlb_late_init_with_tbl(io_tlb_start, io_tlb_nslabs);
+ rc = swiotlb_late_init_with_tbl(vstart, io_tlb_nslabs);
if (rc)
- free_pages((unsigned long)io_tlb_start, order);
+ free_pages((unsigned long)vstart, order);
return rc;
}
@@ -249,14 +256,25 @@ int
swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
{
unsigned long i, bytes;
+ unsigned char *v_overflow_buffer;
bytes = nslabs << IO_TLB_SHIFT;
io_tlb_nslabs = nslabs;
- io_tlb_start = tlb;
+ io_tlb_start = virt_to_phys(tlb);
io_tlb_end = io_tlb_start + bytes;
- memset(io_tlb_start, 0, bytes);
+ memset(tlb, 0, bytes);
+
+ /*
+ * Get the overflow emergency buffer
+ */
+ v_overflow_buffer = (void *)__get_free_pages(GFP_DMA,
+ get_order(io_tlb_overflow));
+ if (!v_overflow_buffer)
+ goto cleanup2;
+
+ io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer);
/*
* Allocate and initialize the free list array. This array is used
@@ -266,7 +284,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL,
get_order(io_tlb_nslabs * sizeof(int)));
if (!io_tlb_list)
- goto cleanup2;
+ goto cleanup3;
for (i = 0; i < io_tlb_nslabs; i++)
io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
@@ -277,18 +295,10 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
get_order(io_tlb_nslabs *
sizeof(phys_addr_t)));
if (!io_tlb_orig_addr)
- goto cleanup3;
+ goto cleanup4;
memset(io_tlb_orig_addr, 0, io_tlb_nslabs * sizeof(phys_addr_t));
- /*
- * Get the overflow emergency buffer
- */
- io_tlb_overflow_buffer = (void *)__get_free_pages(GFP_DMA,
- get_order(io_tlb_overflow));
- if (!io_tlb_overflow_buffer)
- goto cleanup4;
-
swiotlb_print_info();
late_alloc = 1;
@@ -296,42 +306,42 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
return 0;
cleanup4:
- free_pages((unsigned long)io_tlb_orig_addr,
- get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
- io_tlb_orig_addr = NULL;
-cleanup3:
free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
sizeof(int)));
io_tlb_list = NULL;
+cleanup3:
+ free_pages((unsigned long)v_overflow_buffer,
+ get_order(io_tlb_overflow));
+ io_tlb_overflow_buffer = 0;
cleanup2:
- io_tlb_end = NULL;
- io_tlb_start = NULL;
+ io_tlb_end = 0;
+ io_tlb_start = 0;
io_tlb_nslabs = 0;
return -ENOMEM;
}
void __init swiotlb_free(void)
{
- if (!io_tlb_overflow_buffer)
+ if (!io_tlb_orig_addr)
return;
if (late_alloc) {
- free_pages((unsigned long)io_tlb_overflow_buffer,
+ free_pages((unsigned long)phys_to_virt(io_tlb_overflow_buffer),
get_order(io_tlb_overflow));
free_pages((unsigned long)io_tlb_orig_addr,
get_order(io_tlb_nslabs * sizeof(phys_addr_t)));
free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs *
sizeof(int)));
- free_pages((unsigned long)io_tlb_start,
+ free_pages((unsigned long)phys_to_virt(io_tlb_start),
get_order(io_tlb_nslabs << IO_TLB_SHIFT));
} else {
- free_bootmem_late(__pa(io_tlb_overflow_buffer),
+ free_bootmem_late(io_tlb_overflow_buffer,
PAGE_ALIGN(io_tlb_overflow));
free_bootmem_late(__pa(io_tlb_orig_addr),
PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
free_bootmem_late(__pa(io_tlb_list),
PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
- free_bootmem_late(__pa(io_tlb_start),
+ free_bootmem_late(io_tlb_start,
PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
}
io_tlb_nslabs = 0;
@@ -339,21 +349,21 @@ void __init swiotlb_free(void)
static int is_swiotlb_buffer(phys_addr_t paddr)
{
- return paddr >= virt_to_phys(io_tlb_start) &&
- paddr < virt_to_phys(io_tlb_end);
+ return paddr >= io_tlb_start && paddr < io_tlb_end;
}
/*
* Bounce: copy the swiotlb buffer back to the original dma location
*/
-void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
- enum dma_data_direction dir)
+static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr,
+ size_t size, enum dma_data_direction dir)
{
- unsigned long pfn = PFN_DOWN(phys);
+ unsigned long pfn = PFN_DOWN(orig_addr);
+ unsigned char *vaddr = phys_to_virt(tlb_addr);
if (PageHighMem(pfn_to_page(pfn))) {
/* The buffer does not have a mapping. Map it in and copy */
- unsigned int offset = phys & ~PAGE_MASK;
+ unsigned int offset = orig_addr & ~PAGE_MASK;
char *buffer;
unsigned int sz = 0;
unsigned long flags;
@@ -364,32 +374,31 @@ void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
local_irq_save(flags);
buffer = kmap_atomic(pfn_to_page(pfn));
if (dir == DMA_TO_DEVICE)
- memcpy(dma_addr, buffer + offset, sz);
+ memcpy(vaddr, buffer + offset, sz);
else
- memcpy(buffer + offset, dma_addr, sz);
+ memcpy(buffer + offset, vaddr, sz);
kunmap_atomic(buffer);
local_irq_restore(flags);
size -= sz;
pfn++;
- dma_addr += sz;
+ vaddr += sz;
offset = 0;
}
+ } else if (dir == DMA_TO_DEVICE) {
+ memcpy(vaddr, phys_to_virt(orig_addr), size);
} else {
- if (dir == DMA_TO_DEVICE)
- memcpy(dma_addr, phys_to_virt(phys), size);
- else
- memcpy(phys_to_virt(phys), dma_addr, size);
+ memcpy(phys_to_virt(orig_addr), vaddr, size);
}
}
-EXPORT_SYMBOL_GPL(swiotlb_bounce);
-void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr,
- phys_addr_t phys, size_t size,
- enum dma_data_direction dir)
+phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
+ dma_addr_t tbl_dma_addr,
+ phys_addr_t orig_addr, size_t size,
+ enum dma_data_direction dir)
{
unsigned long flags;
- char *dma_addr;
+ phys_addr_t tlb_addr;
unsigned int nslots, stride, index, wrap;
int i;
unsigned long mask;
@@ -453,7 +462,7 @@ void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr,
io_tlb_list[i] = 0;
for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--)
io_tlb_list[i] = ++count;
- dma_addr = io_tlb_start + (index << IO_TLB_SHIFT);
+ tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT);
/*
* Update the indices to avoid searching in the next
@@ -471,7 +480,7 @@ void *swiotlb_tbl_map_single(struct device *hwdev, dma_addr_t tbl_dma_addr,
not_found:
spin_unlock_irqrestore(&io_tlb_lock, flags);
- return NULL;
+ return SWIOTLB_MAP_ERROR;
found:
spin_unlock_irqrestore(&io_tlb_lock, flags);
@@ -481,11 +490,11 @@ found:
* needed.
*/
for (i = 0; i < nslots; i++)
- io_tlb_orig_addr[index+i] = phys + (i << IO_TLB_SHIFT);
+ io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT);
if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)
- swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE);
+ swiotlb_bounce(orig_addr, tlb_addr, size, DMA_TO_DEVICE);
- return dma_addr;
+ return tlb_addr;
}
EXPORT_SYMBOL_GPL(swiotlb_tbl_map_single);
@@ -493,11 +502,10 @@ EXPORT_SYMBOL_GPL(swiotlb_tbl_map_single);
* Allocates bounce buffer and returns its kernel virtual address.
*/
-static void *
-map_single(struct device *hwdev, phys_addr_t phys, size_t size,
- enum dma_data_direction dir)
+phys_addr_t map_single(struct device *hwdev, phys_addr_t phys, size_t size,
+ enum dma_data_direction dir)
{
- dma_addr_t start_dma_addr = swiotlb_virt_to_bus(hwdev, io_tlb_start);
+ dma_addr_t start_dma_addr = phys_to_dma(hwdev, io_tlb_start);
return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, dir);
}
@@ -505,20 +513,19 @@ map_single(struct device *hwdev, phys_addr_t phys, size_t size,
/*
* dma_addr is the kernel virtual address of the bounce buffer to unmap.
*/
-void
-swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, size_t size,
- enum dma_data_direction dir)
+void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
+ size_t size, enum dma_data_direction dir)
{
unsigned long flags;
int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
- int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
- phys_addr_t phys = io_tlb_orig_addr[index];
+ int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
+ phys_addr_t orig_addr = io_tlb_orig_addr[index];
/*
* First, sync the memory before unmapping the entry
*/
- if (phys && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
- swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE);
+ if (orig_addr && ((dir == DMA_FROM_DEVICE) || (dir == DMA_BIDIRECTIONAL)))
+ swiotlb_bounce(orig_addr, tlb_addr, size, DMA_FROM_DEVICE);
/*
* Return the buffer to the free list by setting the corresponding
@@ -547,26 +554,27 @@ swiotlb_tbl_unmap_single(struct device *hwdev, char *dma_addr, size_t size,
}
EXPORT_SYMBOL_GPL(swiotlb_tbl_unmap_single);
-void
-swiotlb_tbl_sync_single(struct device *hwdev, char *dma_addr, size_t size,
- enum dma_data_direction dir,
- enum dma_sync_target target)
+void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr,
+ size_t size, enum dma_data_direction dir,
+ enum dma_sync_target target)
{
- int index = (dma_addr - io_tlb_start) >> IO_TLB_SHIFT;
- phys_addr_t phys = io_tlb_orig_addr[index];
+ int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT;
+ phys_addr_t orig_addr = io_tlb_orig_addr[index];
- phys += ((unsigned long)dma_addr & ((1 << IO_TLB_SHIFT) - 1));
+ orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1);
switch (target) {
case SYNC_FOR_CPU:
if (likely(dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(phys, dma_addr, size, DMA_FROM_DEVICE);
+ swiotlb_bounce(orig_addr, tlb_addr,
+ size, DMA_FROM_DEVICE);
else
BUG_ON(dir != DMA_TO_DEVICE);
break;
case SYNC_FOR_DEVICE:
if (likely(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
- swiotlb_bounce(phys, dma_addr, size, DMA_TO_DEVICE);
+ swiotlb_bounce(orig_addr, tlb_addr,
+ size, DMA_TO_DEVICE);
else
BUG_ON(dir != DMA_FROM_DEVICE);
break;
@@ -589,12 +597,15 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
dma_mask = hwdev->coherent_dma_mask;
ret = (void *)__get_free_pages(flags, order);
- if (ret && swiotlb_virt_to_bus(hwdev, ret) + size - 1 > dma_mask) {
- /*
- * The allocated memory isn't reachable by the device.
- */
- free_pages((unsigned long) ret, order);
- ret = NULL;
+ if (ret) {
+ dev_addr = swiotlb_virt_to_bus(hwdev, ret);
+ if (dev_addr + size - 1 > dma_mask) {
+ /*
+ * The allocated memory isn't reachable by the device.
+ */
+ free_pages((unsigned long) ret, order);
+ ret = NULL;
+ }
}
if (!ret) {
/*
@@ -602,25 +613,29 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
* GFP_DMA memory; fall back on map_single(), which
* will grab memory from the lowest available address range.
*/
- ret = map_single(hwdev, 0, size, DMA_FROM_DEVICE);
- if (!ret)
+ phys_addr_t paddr = map_single(hwdev, 0, size, DMA_FROM_DEVICE);
+ if (paddr == SWIOTLB_MAP_ERROR)
return NULL;
- }
- memset(ret, 0, size);
- dev_addr = swiotlb_virt_to_bus(hwdev, ret);
+ ret = phys_to_virt(paddr);
+ dev_addr = phys_to_dma(hwdev, paddr);
- /* Confirm address can be DMA'd by device */
- if (dev_addr + size - 1 > dma_mask) {
- printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n",
- (unsigned long long)dma_mask,
- (unsigned long long)dev_addr);
+ /* Confirm address can be DMA'd by device */
+ if (dev_addr + size - 1 > dma_mask) {
+ printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n",
+ (unsigned long long)dma_mask,
+ (unsigned long long)dev_addr);
- /* DMA_TO_DEVICE to avoid memcpy in unmap_single */
- swiotlb_tbl_unmap_single(hwdev, ret, size, DMA_TO_DEVICE);
- return NULL;
+ /* DMA_TO_DEVICE to avoid memcpy in unmap_single */
+ swiotlb_tbl_unmap_single(hwdev, paddr,
+ size, DMA_TO_DEVICE);
+ return NULL;
+ }
}
+
*dma_handle = dev_addr;
+ memset(ret, 0, size);
+
return ret;
}
EXPORT_SYMBOL(swiotlb_alloc_coherent);
@@ -636,7 +651,7 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
free_pages((unsigned long)vaddr, get_order(size));
else
/* DMA_TO_DEVICE to avoid memcpy in swiotlb_tbl_unmap_single */
- swiotlb_tbl_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE);
+ swiotlb_tbl_unmap_single(hwdev, paddr, size, DMA_TO_DEVICE);
}
EXPORT_SYMBOL(swiotlb_free_coherent);
@@ -677,9 +692,8 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
enum dma_data_direction dir,
struct dma_attrs *attrs)
{
- phys_addr_t phys = page_to_phys(page) + offset;
+ phys_addr_t map, phys = page_to_phys(page) + offset;
dma_addr_t dev_addr = phys_to_dma(dev, phys);
- void *map;
BUG_ON(dir == DMA_NONE);
/*
@@ -690,23 +704,19 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
if (dma_capable(dev, dev_addr, size) && !swiotlb_force)
return dev_addr;
- /*
- * Oh well, have to allocate and map a bounce buffer.
- */
+ /* Oh well, have to allocate and map a bounce buffer. */
map = map_single(dev, phys, size, dir);
- if (!map) {
+ if (map == SWIOTLB_MAP_ERROR) {
swiotlb_full(dev, size, dir, 1);
- map = io_tlb_overflow_buffer;
+ return phys_to_dma(dev, io_tlb_overflow_buffer);
}
- dev_addr = swiotlb_virt_to_bus(dev, map);
+ dev_addr = phys_to_dma(dev, map);
- /*
- * Ensure that the address returned is DMA'ble
- */
+ /* Ensure that the address returned is DMA'ble */
if (!dma_capable(dev, dev_addr, size)) {
swiotlb_tbl_unmap_single(dev, map, size, dir);
- dev_addr = swiotlb_virt_to_bus(dev, io_tlb_overflow_buffer);
+ return phys_to_dma(dev, io_tlb_overflow_buffer);
}
return dev_addr;
@@ -729,7 +739,7 @@ static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
BUG_ON(dir == DMA_NONE);
if (is_swiotlb_buffer(paddr)) {
- swiotlb_tbl_unmap_single(hwdev, phys_to_virt(paddr), size, dir);
+ swiotlb_tbl_unmap_single(hwdev, paddr, size, dir);
return;
}
@@ -773,8 +783,7 @@ swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
BUG_ON(dir == DMA_NONE);
if (is_swiotlb_buffer(paddr)) {
- swiotlb_tbl_sync_single(hwdev, phys_to_virt(paddr), size, dir,
- target);
+ swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target);
return;
}
@@ -831,9 +840,9 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
if (swiotlb_force ||
!dma_capable(hwdev, dev_addr, sg->length)) {
- void *map = map_single(hwdev, sg_phys(sg),
- sg->length, dir);
- if (!map) {
+ phys_addr_t map = map_single(hwdev, sg_phys(sg),
+ sg->length, dir);
+ if (map == SWIOTLB_MAP_ERROR) {
/* Don't panic here, we expect map_sg users
to do proper error handling. */
swiotlb_full(hwdev, sg->length, dir, 0);
@@ -842,7 +851,7 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems,
sgl[0].dma_length = 0;
return 0;
}
- sg->dma_address = swiotlb_virt_to_bus(hwdev, map);
+ sg->dma_address = phys_to_dma(hwdev, map);
} else
sg->dma_address = dev_addr;
sg->dma_length = sg->length;
@@ -925,7 +934,7 @@ EXPORT_SYMBOL(swiotlb_sync_sg_for_device);
int
swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr)
{
- return (dma_addr == swiotlb_virt_to_bus(hwdev, io_tlb_overflow_buffer));
+ return (dma_addr == phys_to_dma(hwdev, io_tlb_overflow_buffer));
}
EXPORT_SYMBOL(swiotlb_dma_mapping_error);
@@ -938,6 +947,6 @@ EXPORT_SYMBOL(swiotlb_dma_mapping_error);
int
swiotlb_dma_supported(struct device *hwdev, u64 mask)
{
- return swiotlb_virt_to_bus(hwdev, io_tlb_end - 1) <= mask;
+ return phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
}
EXPORT_SYMBOL(swiotlb_dma_supported);
diff --git a/mm/compaction.c b/mm/compaction.c
index 12979121822..5ad7f4f4d6f 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -303,6 +303,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
if (blockpfn == end_pfn)
update_pageblock_skip(cc, valid_page, total_isolated, false);
+ count_vm_events(COMPACTFREE_SCANNED, nr_scanned);
+ if (total_isolated)
+ count_vm_events(COMPACTISOLATED, total_isolated);
+
return total_isolated;
}
@@ -609,6 +613,10 @@ next_pageblock:
trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
+ count_vm_events(COMPACTMIGRATE_SCANNED, nr_scanned);
+ if (nr_isolated)
+ count_vm_events(COMPACTISOLATED, nr_isolated);
+
return low_pfn;
}
@@ -1015,14 +1023,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
nr_migrate = cc->nr_migratepages;
err = migrate_pages(&cc->migratepages, compaction_alloc,
(unsigned long)cc, false,
- cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
+ cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC,
+ MR_COMPACTION);
update_nr_listpages(cc);
nr_remaining = cc->nr_migratepages;
- count_vm_event(COMPACTBLOCKS);
- count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
- if (nr_remaining)
- count_vm_events(COMPACTPAGEFAILED, nr_remaining);
trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
nr_remaining);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 827d9c81305..d7ee1691fd2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -19,6 +19,7 @@
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
+#include <linux/migrate.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -690,7 +691,7 @@ out:
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
-static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
if (likely(vma->vm_flags & VM_WRITE))
pmd = pmd_mkwrite(pmd);
@@ -848,7 +849,8 @@ out:
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
- if (unlikely(__pte_alloc(mm, vma, pmd, address)))
+ if (unlikely(pmd_none(*pmd)) &&
+ unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
/* if an huge pmd materialized from under us just retry later */
if (unlikely(pmd_trans_huge(*pmd)))
@@ -1287,6 +1289,81 @@ out:
return page;
}
+/* NUMA hinting page fault entry point for trans huge pmds */
+int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t pmd, pmd_t *pmdp)
+{
+ struct page *page;
+ unsigned long haddr = addr & HPAGE_PMD_MASK;
+ int target_nid;
+ int current_nid = -1;
+ bool migrated;
+ bool page_locked = false;
+
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp)))
+ goto out_unlock;
+
+ page = pmd_page(pmd);
+ get_page(page);
+ current_nid = page_to_nid(page);
+ count_vm_numa_event(NUMA_HINT_FAULTS);
+ if (current_nid == numa_node_id())
+ count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+
+ target_nid = mpol_misplaced(page, vma, haddr);
+ if (target_nid == -1) {
+ put_page(page);
+ goto clear_pmdnuma;
+ }
+
+ /* Acquire the page lock to serialise THP migrations */
+ spin_unlock(&mm->page_table_lock);
+ lock_page(page);
+ page_locked = true;
+
+ /* Confirm the PTE did not while locked */
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp))) {
+ unlock_page(page);
+ put_page(page);
+ goto out_unlock;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ /* Migrate the THP to the requested node */
+ migrated = migrate_misplaced_transhuge_page(mm, vma,
+ pmdp, pmd, addr,
+ page, target_nid);
+ if (migrated)
+ current_nid = target_nid;
+ else {
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(pmd, *pmdp))) {
+ unlock_page(page);
+ goto out_unlock;
+ }
+ goto clear_pmdnuma;
+ }
+
+ task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
+ return 0;
+
+clear_pmdnuma:
+ pmd = pmd_mknonnuma(pmd);
+ set_pmd_at(mm, haddr, pmdp, pmd);
+ VM_BUG_ON(pmd_numa(*pmdp));
+ update_mmu_cache_pmd(vma, addr, pmdp);
+ if (page_locked)
+ unlock_page(page);
+
+out_unlock:
+ spin_unlock(&mm->page_table_lock);
+ if (current_nid != -1)
+ task_numa_fault(current_nid, HPAGE_PMD_NR, migrated);
+ return 0;
+}
+
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
@@ -1375,7 +1452,7 @@ out:
}
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, pgprot_t newprot)
+ unsigned long addr, pgprot_t newprot, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
int ret = 0;
@@ -1383,7 +1460,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
if (__pmd_trans_huge_lock(pmd, vma) == 1) {
pmd_t entry;
entry = pmdp_get_and_clear(mm, addr, pmd);
- entry = pmd_modify(entry, newprot);
+ if (!prot_numa)
+ entry = pmd_modify(entry, newprot);
+ else {
+ struct page *page = pmd_page(*pmd);
+
+ /* only check non-shared pages */
+ if (page_mapcount(page) == 1 &&
+ !pmd_numa(*pmd)) {
+ entry = pmd_mknuma(entry);
+ }
+ }
BUG_ON(pmd_write(entry));
set_pmd_at(mm, addr, pmd, entry);
spin_unlock(&vma->vm_mm->page_table_lock);
@@ -1474,7 +1561,7 @@ static int __split_huge_page_splitting(struct page *page,
* We can't temporarily set the pmd to null in order
* to split it, the pmd must remain marked huge at all
* times or the VM won't take the pmd_trans_huge paths
- * and it won't wait on the anon_vma->root->mutex to
+ * and it won't wait on the anon_vma->root->rwsem to
* serialize against split_huge_page*.
*/
pmdp_splitting_flush(vma, address, pmd);
@@ -1565,6 +1652,7 @@ static void __split_huge_page_refcount(struct page *page)
page_tail->mapping = page->mapping;
page_tail->index = page->index + i;
+ page_xchg_last_nid(page_tail, page_last_nid(page));
BUG_ON(!PageAnon(page_tail));
BUG_ON(!PageUptodate(page_tail));
@@ -1632,6 +1720,8 @@ static int __split_huge_page_map(struct page *page,
BUG_ON(page_mapcount(page) != 1);
if (!pmd_young(*pmd))
entry = pte_mkold(entry);
+ if (pmd_numa(*pmd))
+ entry = pte_mknuma(entry);
pte = pte_offset_map(&_pmd, haddr);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, haddr, pte, entry);
@@ -1674,7 +1764,7 @@ static int __split_huge_page_map(struct page *page,
return ret;
}
-/* must be called with anon_vma->root->mutex hold */
+/* must be called with anon_vma->root->rwsem held */
static void __split_huge_page(struct page *page,
struct anon_vma *anon_vma)
{
@@ -1729,7 +1819,7 @@ int split_huge_page(struct page *page)
BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
BUG_ON(!PageAnon(page));
- anon_vma = page_lock_anon_vma(page);
+ anon_vma = page_lock_anon_vma_read(page);
if (!anon_vma)
goto out;
ret = 0;
@@ -1742,7 +1832,7 @@ int split_huge_page(struct page *page)
BUG_ON(PageCompound(page));
out_unlock:
- page_unlock_anon_vma(anon_vma);
+ page_unlock_anon_vma_read(anon_vma);
out:
return ret;
}
@@ -2234,7 +2324,7 @@ static void collapse_huge_page(struct mm_struct *mm,
if (pmd_trans_huge(*pmd))
goto out;
- anon_vma_lock(vma->anon_vma);
+ anon_vma_lock_write(vma->anon_vma);
pte = pte_offset_map(pmd, address);
ptl = pte_lockptr(mm, pmd);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 88e7293b96b..e5318c7793a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3016,7 +3016,7 @@ same_page:
return i ? i : -EFAULT;
}
-void hugetlb_change_protection(struct vm_area_struct *vma,
+unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long address, unsigned long end, pgprot_t newprot)
{
struct mm_struct *mm = vma->vm_mm;
@@ -3024,6 +3024,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
pte_t *ptep;
pte_t pte;
struct hstate *h = hstate_vma(vma);
+ unsigned long pages = 0;
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
@@ -3034,12 +3035,15 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
- if (huge_pmd_unshare(mm, &address, ptep))
+ if (huge_pmd_unshare(mm, &address, ptep)) {
+ pages++;
continue;
+ }
if (!huge_pte_none(huge_ptep_get(ptep))) {
pte = huge_ptep_get_and_clear(mm, address, ptep);
pte = pte_mkhuge(pte_modify(pte, newprot));
set_huge_pte_at(mm, address, ptep, pte);
+ pages++;
}
}
spin_unlock(&mm->page_table_lock);
@@ -3051,6 +3055,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
*/
flush_tlb_range(vma, start, end);
mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
+
+ return pages << h->order;
}
int hugetlb_reserve_pages(struct inode *inode,
diff --git a/mm/internal.h b/mm/internal.h
index 52d1fa95719..d597f94cc20 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -217,15 +217,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
{
if (TestClearPageMlocked(page)) {
unsigned long flags;
+ int nr_pages = hpage_nr_pages(page);
local_irq_save(flags);
- __dec_zone_page_state(page, NR_MLOCK);
+ __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
SetPageMlocked(newpage);
- __inc_zone_page_state(newpage, NR_MLOCK);
+ __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages);
local_irq_restore(flags);
}
}
+extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern unsigned long vma_address(struct page *page,
struct vm_area_struct *vma);
diff --git a/mm/ksm.c b/mm/ksm.c
index 382d930a0bf..82dfb4b5432 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1624,7 +1624,7 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- anon_vma_lock(anon_vma);
+ anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
vma = vmac->vma;
@@ -1678,7 +1678,7 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- anon_vma_lock(anon_vma);
+ anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
vma = vmac->vma;
@@ -1731,7 +1731,7 @@ again:
struct anon_vma_chain *vmac;
struct vm_area_struct *vma;
- anon_vma_lock(anon_vma);
+ anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
0, ULONG_MAX) {
vma = vmac->vma;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6c055929c8c..bbfac5063ca 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3289,15 +3289,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
struct mem_cgroup **memcgp)
{
struct mem_cgroup *memcg = NULL;
+ unsigned int nr_pages = 1;
struct page_cgroup *pc;
enum charge_type ctype;
*memcgp = NULL;
- VM_BUG_ON(PageTransHuge(page));
if (mem_cgroup_disabled())
return;
+ if (PageTransHuge(page))
+ nr_pages <<= compound_order(page);
+
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
@@ -3359,7 +3362,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
* charged to the res_counter since we plan on replacing the
* old one and only one page is going to be left afterwards.
*/
- __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
+ __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
}
/* remove redundant charge if migration failed*/
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 108c52fa60f..c6e4dd3e1c0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
struct anon_vma *av;
pgoff_t pgoff;
- av = page_lock_anon_vma(page);
+ av = page_lock_anon_vma_read(page);
if (av == NULL) /* Not actually mapped anymore */
return;
@@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
}
}
read_unlock(&tasklist_lock);
- page_unlock_anon_vma(av);
+ page_unlock_anon_vma_read(av);
}
/*
@@ -1566,7 +1566,8 @@ int soft_offline_page(struct page *page, int flags)
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
- false, MIGRATE_SYNC);
+ false, MIGRATE_SYNC,
+ MR_MEMORY_FAILURE);
if (ret) {
putback_lru_pages(&pagelist);
pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory.c b/mm/memory.c
index db2e9e797a0..e6a3b933517 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -57,6 +57,7 @@
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
+#include <linux/migrate.h>
#include <asm/io.h>
#include <asm/pgalloc.h>
@@ -1503,6 +1504,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
goto out;
}
+ if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+ goto no_page_table;
if (pmd_trans_huge(*pmd)) {
if (flags & FOLL_SPLIT) {
split_huge_page_pmd(vma, address, pmd);
@@ -1532,6 +1535,8 @@ split_fallthrough:
pte = *ptep;
if (!pte_present(pte))
goto no_page;
+ if ((flags & FOLL_NUMA) && pte_numa(pte))
+ goto no_page;
if ((flags & FOLL_WRITE) && !pte_write(pte))
goto unlock;
@@ -1683,6 +1688,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
(VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
vm_flags &= (gup_flags & FOLL_FORCE) ?
(VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+
+ /*
+ * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault
+ * would be called on PROT_NONE ranges. We must never invoke
+ * handle_mm_fault on PROT_NONE ranges or the NUMA hinting
+ * page faults would unprotect the PROT_NONE ranges if
+ * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd
+ * bitflag. So to avoid that, don't set FOLL_NUMA if
+ * FOLL_FORCE is set.
+ */
+ if (!(gup_flags & FOLL_FORCE))
+ gup_flags |= FOLL_NUMA;
+
i = 0;
do {
@@ -3412,6 +3430,169 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, int current_nid)
+{
+ get_page(page);
+
+ count_vm_numa_event(NUMA_HINT_FAULTS);
+ if (current_nid == numa_node_id())
+ count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+
+ return mpol_misplaced(page, vma, addr);
+}
+
+int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
+{
+ struct page *page = NULL;
+ spinlock_t *ptl;
+ int current_nid = -1;
+ int target_nid;
+ bool migrated = false;
+
+ /*
+ * The "pte" at this point cannot be used safely without
+ * validation through pte_unmap_same(). It's of NUMA type but
+ * the pfn may be screwed if the read is non atomic.
+ *
+ * ptep_modify_prot_start is not called as this is clearing
+ * the _PAGE_NUMA bit and it is not really expected that there
+ * would be concurrent hardware modifications to the PTE.
+ */
+ ptl = pte_lockptr(mm, pmd);
+ spin_lock(ptl);
+ if (unlikely(!pte_same(*ptep, pte))) {
+ pte_unmap_unlock(ptep, ptl);
+ goto out;
+ }
+
+ pte = pte_mknonnuma(pte);
+ set_pte_at(mm, addr, ptep, pte);
+ update_mmu_cache(vma, addr, ptep);
+
+ page = vm_normal_page(vma, addr, pte);
+ if (!page) {
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+ }
+
+ current_nid = page_to_nid(page);
+ target_nid = numa_migrate_prep(page, vma, addr, current_nid);
+ pte_unmap_unlock(ptep, ptl);
+ if (target_nid == -1) {
+ /*
+ * Account for the fault against the current node if it not
+ * being replaced regardless of where the page is located.
+ */
+ current_nid = numa_node_id();
+ put_page(page);
+ goto out;
+ }
+
+ /* Migrate to the requested node */
+ migrated = migrate_misplaced_page(page, target_nid);
+ if (migrated)
+ current_nid = target_nid;
+
+out:
+ if (current_nid != -1)
+ task_numa_fault(current_nid, 1, migrated);
+ return 0;
+}
+
+/* NUMA hinting page fault entry point for regular pmds */
+#ifdef CONFIG_NUMA_BALANCING
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t pmd;
+ pte_t *pte, *orig_pte;
+ unsigned long _addr = addr & PMD_MASK;
+ unsigned long offset;
+ spinlock_t *ptl;
+ bool numa = false;
+ int local_nid = numa_node_id();
+
+ spin_lock(&mm->page_table_lock);
+ pmd = *pmdp;
+ if (pmd_numa(pmd)) {
+ set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
+ numa = true;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ if (!numa)
+ return 0;
+
+ /* we're in a page fault so some vma must be in the range */
+ BUG_ON(!vma);
+ BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
+ offset = max(_addr, vma->vm_start) & ~PMD_MASK;
+ VM_BUG_ON(offset >= PMD_SIZE);
+ orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
+ pte += offset >> PAGE_SHIFT;
+ for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
+ pte_t pteval = *pte;
+ struct page *page;
+ int curr_nid = local_nid;
+ int target_nid;
+ bool migrated;
+ if (!pte_present(pteval))
+ continue;
+ if (!pte_numa(pteval))
+ continue;
+ if (addr >= vma->vm_end) {
+ vma = find_vma(mm, addr);
+ /* there's a pte present so there must be a vma */
+ BUG_ON(!vma);
+ BUG_ON(addr < vma->vm_start);
+ }
+ if (pte_numa(pteval)) {
+ pteval = pte_mknonnuma(pteval);
+ set_pte_at(mm, addr, pte, pteval);
+ }
+ page = vm_normal_page(vma, addr, pteval);
+ if (unlikely(!page))
+ continue;
+ /* only check non-shared pages */
+ if (unlikely(page_mapcount(page) != 1))
+ continue;
+
+ /*
+ * Note that the NUMA fault is later accounted to either
+ * the node that is currently running or where the page is
+ * migrated to.
+ */
+ curr_nid = local_nid;
+ target_nid = numa_migrate_prep(page, vma, addr,
+ page_to_nid(page));
+ if (target_nid == -1) {
+ put_page(page);
+ continue;
+ }
+
+ /* Migrate to the requested node */
+ pte_unmap_unlock(pte, ptl);
+ migrated = migrate_misplaced_page(page, target_nid);
+ if (migrated)
+ curr_nid = target_nid;
+ task_numa_fault(curr_nid, 1, migrated);
+
+ pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+
+ return 0;
+}
+#else
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ BUG();
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
@@ -3450,6 +3631,9 @@ int handle_pte_fault(struct mm_struct *mm,
pte, pmd, flags, entry);
}
+ if (pte_numa(entry))
+ return do_numa_page(mm, vma, address, entry, pte, pmd);
+
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
@@ -3520,8 +3704,11 @@ retry:
if (pmd_trans_huge(orig_pmd)) {
unsigned int dirty = flags & FAULT_FLAG_WRITE;
- if (dirty && !pmd_write(orig_pmd) &&
- !pmd_trans_splitting(orig_pmd)) {
+ if (pmd_numa(orig_pmd))
+ return do_huge_pmd_numa_page(mm, vma, address,
+ orig_pmd, pmd);
+
+ if (dirty && !pmd_write(orig_pmd)) {
ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
orig_pmd);
/*
@@ -3536,16 +3723,21 @@ retry:
huge_pmd_set_accessed(mm, vma, address, pmd,
orig_pmd, dirty);
}
+
return 0;
}
}
+ if (pmd_numa(*pmd))
+ return do_pmd_numa_page(mm, vma, address, pmd);
+
/*
* Use __pte_alloc instead of pte_alloc_map, because we can't
* run pte_offset_map on the pmd, if an huge pmd could
* materialize from under us from a different thread.
*/
- if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
+ if (unlikely(pmd_none(*pmd)) &&
+ unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
/* if an huge pmd materialized from under us just retry later */
if (unlikely(pmd_trans_huge(*pmd)))
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 518baa896e8..962e353aa86 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1055,7 +1055,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
* migrate_pages returns # of failed pages.
*/
ret = migrate_pages(&source, alloc_migrate_target, 0,
- true, MIGRATE_SYNC);
+ true, MIGRATE_SYNC,
+ MR_MEMORY_HOTPLUG);
if (ret)
putback_lru_pages(&source);
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index aaf54566cb6..d1b315e9862 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -90,6 +90,7 @@
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
+#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
@@ -117,6 +118,26 @@ static struct mempolicy default_policy = {
.flags = MPOL_F_LOCAL,
};
+static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+
+static struct mempolicy *get_task_policy(struct task_struct *p)
+{
+ struct mempolicy *pol = p->mempolicy;
+ int node;
+
+ if (!pol) {
+ node = numa_node_id();
+ if (node != -1)
+ pol = &preferred_node_policy[node];
+
+ /* preferred_node_policy is not initialised early in boot */
+ if (!pol->mode)
+ pol = NULL;
+ }
+
+ return pol;
+}
+
static const struct mempolicy_operations {
int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
/*
@@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
if (mode == MPOL_DEFAULT) {
if (nodes && !nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
- return NULL; /* simply delete any existing policy */
+ return NULL;
}
VM_BUG_ON(!nodes);
@@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
(flags & MPOL_F_RELATIVE_NODES)))
return ERR_PTR(-EINVAL);
}
+ } else if (mode == MPOL_LOCAL) {
+ if (!nodes_empty(*nodes))
+ return ERR_PTR(-EINVAL);
+ mode = MPOL_PREFERRED;
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
return 0;
}
+#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+/*
+ * This is used to mark a range of virtual addresses to be inaccessible.
+ * These are later cleared by a NUMA hinting fault. Depending on these
+ * faults, pages may be migrated for better NUMA placement.
+ *
+ * This is assuming that NUMA faults are handled using PROT_NONE. If
+ * an architecture makes a different choice, it will need further
+ * changes to the core.
+ */
+unsigned long change_prot_numa(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ int nr_updated;
+ BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
+
+ nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
+ if (nr_updated)
+ count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
+
+ return nr_updated;
+}
+#else
+static unsigned long change_prot_numa(struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ return 0;
+}
+#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
+
/*
* Check if all pages in a range are on a set of nodes.
* If pagelist != NULL then isolate pages from the LRU and
@@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
return ERR_PTR(-EFAULT);
prev = NULL;
for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+ unsigned long endvma = vma->vm_end;
+
+ if (endvma > end)
+ endvma = end;
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+
if (!(flags & MPOL_MF_DISCONTIG_OK)) {
if (!vma->vm_next && vma->vm_end < end)
return ERR_PTR(-EFAULT);
if (prev && prev->vm_end < vma->vm_start)
return ERR_PTR(-EFAULT);
}
- if (!is_vm_hugetlb_page(vma) &&
- ((flags & MPOL_MF_STRICT) ||
+
+ if (is_vm_hugetlb_page(vma))
+ goto next;
+
+ if (flags & MPOL_MF_LAZY) {
+ change_prot_numa(vma, start, endvma);
+ goto next;
+ }
+
+ if ((flags & MPOL_MF_STRICT) ||
((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
- vma_migratable(vma)))) {
- unsigned long endvma = vma->vm_end;
+ vma_migratable(vma))) {
- if (endvma > end)
- endvma = end;
- if (vma->vm_start > start)
- start = vma->vm_start;
err = check_pgd_range(vma, start, endvma, nodes,
flags, private);
if (err) {
@@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
break;
}
}
+next:
prev = vma;
}
return first;
@@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_node_page, dest,
- false, MIGRATE_SYNC);
+ false, MIGRATE_SYNC,
+ MR_SYSCALL);
if (err)
putback_lru_pages(&pagelist);
}
@@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len,
int err;
LIST_HEAD(pagelist);
- if (flags & ~(unsigned long)(MPOL_MF_STRICT |
- MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ if (flags & ~(unsigned long)MPOL_MF_VALID)
return -EINVAL;
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM;
@@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len,
if (IS_ERR(new))
return PTR_ERR(new);
+ if (flags & MPOL_MF_LAZY)
+ new->flags |= MPOL_F_MOF;
+
/*
* If we are using the default policy then operation
* on discontinuous address spaces is okay after all
@@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len,
vma = check_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
- err = PTR_ERR(vma);
- if (!IS_ERR(vma)) {
- int nr_failed = 0;
-
+ err = PTR_ERR(vma); /* maybe ... */
+ if (!IS_ERR(vma))
err = mbind_range(mm, start, end, new);
+ if (!err) {
+ int nr_failed = 0;
+
if (!list_empty(&pagelist)) {
+ WARN_ON_ONCE(flags & MPOL_MF_LAZY);
nr_failed = migrate_pages(&pagelist, new_vma_page,
(unsigned long)vma,
- false, MIGRATE_SYNC);
+ false, MIGRATE_SYNC,
+ MR_MEMPOLICY_MBIND);
if (nr_failed)
putback_lru_pages(&pagelist);
}
- if (!err && nr_failed && (flags & MPOL_MF_STRICT))
+ if (nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
} else
putback_lru_pages(&pagelist);
@@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
struct mempolicy *get_vma_policy(struct task_struct *task,
struct vm_area_struct *vma, unsigned long addr)
{
- struct mempolicy *pol = task->mempolicy;
+ struct mempolicy *pol = get_task_policy(task);
if (vma) {
if (vma->vm_ops && vma->vm_ops->get_policy) {
@@ -1956,7 +2028,7 @@ retry_cpuset:
*/
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
- struct mempolicy *pol = current->mempolicy;
+ struct mempolicy *pol = get_task_policy(current);
struct page *page;
unsigned int cpuset_mems_cookie;
@@ -2140,6 +2212,115 @@ static void sp_free(struct sp_node *n)
kmem_cache_free(sn_cache, n);
}
+/**
+ * mpol_misplaced - check whether current page node is valid in policy
+ *
+ * @page - page to be checked
+ * @vma - vm area where page mapped
+ * @addr - virtual address where page mapped
+ *
+ * Lookup current policy node id for vma,addr and "compare to" page's
+ * node id.
+ *
+ * Returns:
+ * -1 - not misplaced, page is in the right node
+ * node - node id where the page should be
+ *
+ * Policy determination "mimics" alloc_page_vma().
+ * Called from fault path where we know the vma and faulting address.
+ */
+int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
+{
+ struct mempolicy *pol;
+ struct zone *zone;
+ int curnid = page_to_nid(page);
+ unsigned long pgoff;
+ int polnid = -1;
+ int ret = -1;
+
+ BUG_ON(!vma);
+
+ pol = get_vma_policy(current, vma, addr);
+ if (!(pol->flags & MPOL_F_MOF))
+ goto out;
+
+ switch (pol->mode) {
+ case MPOL_INTERLEAVE:
+ BUG_ON(addr >= vma->vm_end);
+ BUG_ON(addr < vma->vm_start);
+
+ pgoff = vma->vm_pgoff;
+ pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
+ polnid = offset_il_node(pol, vma, pgoff);
+ break;
+
+ case MPOL_PREFERRED:
+ if (pol->flags & MPOL_F_LOCAL)
+ polnid = numa_node_id();
+ else
+ polnid = pol->v.preferred_node;
+ break;
+
+ case MPOL_BIND:
+ /*
+ * allows binding to multiple nodes.
+ * use current page if in policy nodemask,
+ * else select nearest allowed node, if any.
+ * If no allowed nodes, use current [!misplaced].
+ */
+ if (node_isset(curnid, pol->v.nodes))
+ goto out;
+ (void)first_zones_zonelist(
+ node_zonelist(numa_node_id(), GFP_HIGHUSER),
+ gfp_zone(GFP_HIGHUSER),
+ &pol->v.nodes, &zone);
+ polnid = zone->node;
+ break;
+
+ default:
+ BUG();
+ }
+
+ /* Migrate the page towards the node whose CPU is referencing it */
+ if (pol->flags & MPOL_F_MORON) {
+ int last_nid;
+
+ polnid = numa_node_id();
+
+ /*
+ * Multi-stage node selection is used in conjunction
+ * with a periodic migration fault to build a temporal
+ * task<->page relation. By using a two-stage filter we
+ * remove short/unlikely relations.
+ *
+ * Using P(p) ~ n_p / n_t as per frequentist
+ * probability, we can equate a task's usage of a
+ * particular page (n_p) per total usage of this
+ * page (n_t) (in a given time-span) to a probability.
+ *
+ * Our periodic faults will sample this probability and
+ * getting the same result twice in a row, given these
+ * samples are fully independent, is then given by
+ * P(n)^2, provided our sample period is sufficiently
+ * short compared to the usage pattern.
+ *
+ * This quadric squishes small probabilities, making
+ * it less likely we act on an unlikely task<->page
+ * relation.
+ */
+ last_nid = page_xchg_last_nid(page, polnid);
+ if (last_nid != polnid)
+ goto out;
+ }
+
+ if (curnid != polnid)
+ ret = polnid;
+out:
+ mpol_cond_put(pol);
+
+ return ret;
+}
+
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
pr_debug("deleting %lx-l%lx\n", n->start, n->end);
@@ -2305,6 +2486,50 @@ void mpol_free_shared_policy(struct shared_policy *p)
mutex_unlock(&p->mutex);
}
+#ifdef CONFIG_NUMA_BALANCING
+static bool __initdata numabalancing_override;
+
+static void __init check_numabalancing_enable(void)
+{
+ bool numabalancing_default = false;
+
+ if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
+ numabalancing_default = true;
+
+ if (nr_node_ids > 1 && !numabalancing_override) {
+ printk(KERN_INFO "Enabling automatic NUMA balancing. "
+ "Configure with numa_balancing= or sysctl");
+ set_numabalancing_state(numabalancing_default);
+ }
+}
+
+static int __init setup_numabalancing(char *str)
+{
+ int ret = 0;
+ if (!str)
+ goto out;
+ numabalancing_override = true;
+
+ if (!strcmp(str, "enable")) {
+ set_numabalancing_state(true);
+ ret = 1;
+ } else if (!strcmp(str, "disable")) {
+ set_numabalancing_state(false);
+ ret = 1;
+ }
+out:
+ if (!ret)
+ printk(KERN_WARNING "Unable to parse numa_balancing=\n");
+
+ return ret;
+}
+__setup("numa_balancing=", setup_numabalancing);
+#else
+static inline void __init check_numabalancing_enable(void)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
@@ -2320,6 +2545,15 @@ void __init numa_policy_init(void)
sizeof(struct sp_node),
0, SLAB_PANIC, NULL);
+ for_each_node(nid) {
+ preferred_node_policy[nid] = (struct mempolicy) {
+ .refcnt = ATOMIC_INIT(1),
+ .mode = MPOL_PREFERRED,
+ .flags = MPOL_F_MOF | MPOL_F_MORON,
+ .v = { .preferred_node = nid, },
+ };
+ }
+
/*
* Set interleaving policy for system init. Interleaving is only
* enabled across suitably sized nodes (default is >= 16MB), or
@@ -2346,6 +2580,8 @@ void __init numa_policy_init(void)
if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
printk("numa_policy_init: interleaving failed\n");
+
+ check_numabalancing_enable();
}
/* Reset policy of current process to default */
@@ -2362,14 +2598,13 @@ void numa_default_policy(void)
* "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
* Used only for mpol_parse_str() and mpol_to_str()
*/
-#define MPOL_LOCAL MPOL_MAX
static const char * const policy_modes[] =
{
[MPOL_DEFAULT] = "default",
[MPOL_PREFERRED] = "prefer",
[MPOL_BIND] = "bind",
[MPOL_INTERLEAVE] = "interleave",
- [MPOL_LOCAL] = "local"
+ [MPOL_LOCAL] = "local",
};
@@ -2415,12 +2650,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
if (flags)
*flags++ = '\0'; /* terminate mode string */
- for (mode = 0; mode <= MPOL_LOCAL; mode++) {
+ for (mode = 0; mode < MPOL_MAX; mode++) {
if (!strcmp(str, policy_modes[mode])) {
break;
}
}
- if (mode > MPOL_LOCAL)
+ if (mode >= MPOL_MAX)
goto out;
switch (mode) {
diff --git a/mm/migrate.c b/mm/migrate.c
index cae02711181..32efd8028bc 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -39,6 +39,9 @@
#include <asm/tlbflush.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/migrate.h>
+
#include "internal.h"
/*
@@ -293,7 +296,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page,
struct buffer_head *head, enum migrate_mode mode)
{
- int expected_count;
+ int expected_count = 0;
void **pslot;
if (!mapping) {
@@ -421,7 +424,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
*/
void migrate_page_copy(struct page *newpage, struct page *page)
{
- if (PageHuge(page))
+ if (PageHuge(page) || PageTransHuge(page))
copy_huge_page(newpage, page);
else
copy_highpage(newpage, page);
@@ -765,7 +768,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
*/
if (PageAnon(page)) {
/*
- * Only page_lock_anon_vma() understands the subtleties of
+ * Only page_lock_anon_vma_read() understands the subtleties of
* getting a hold on an anon_vma from outside one of its mms.
*/
anon_vma = page_get_anon_vma(page);
@@ -998,10 +1001,11 @@ out:
*/
int migrate_pages(struct list_head *from,
new_page_t get_new_page, unsigned long private, bool offlining,
- enum migrate_mode mode)
+ enum migrate_mode mode, int reason)
{
int retry = 1;
int nr_failed = 0;
+ int nr_succeeded = 0;
int pass = 0;
struct page *page;
struct page *page2;
@@ -1028,6 +1032,7 @@ int migrate_pages(struct list_head *from,
retry++;
break;
case MIGRATEPAGE_SUCCESS:
+ nr_succeeded++;
break;
default:
/* Permanent failure */
@@ -1038,6 +1043,12 @@ int migrate_pages(struct list_head *from,
}
rc = nr_failed + retry;
out:
+ if (nr_succeeded)
+ count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
+ if (nr_failed)
+ count_vm_events(PGMIGRATE_FAIL, nr_failed);
+ trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
+
if (!swapwrite)
current->flags &= ~PF_SWAPWRITE;
@@ -1176,7 +1187,8 @@ set_status:
err = 0;
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_page_node,
- (unsigned long)pm, 0, MIGRATE_SYNC);
+ (unsigned long)pm, 0, MIGRATE_SYNC,
+ MR_SYSCALL);
if (err)
putback_lru_pages(&pagelist);
}
@@ -1440,4 +1452,317 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
}
return err;
}
-#endif
+
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * Returns true if this is a safe migration target node for misplaced NUMA
+ * pages. Currently it only checks the watermarks which crude
+ */
+static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
+ int nr_migrate_pages)
+{
+ int z;
+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone->all_unreclaimable)
+ continue;
+
+ /* Avoid waking kswapd by allocating pages_to_migrate pages. */
+ if (!zone_watermark_ok(zone, 0,
+ high_wmark_pages(zone) +
+ nr_migrate_pages,
+ 0, 0))
+ continue;
+ return true;
+ }
+ return false;
+}
+
+static struct page *alloc_misplaced_dst_page(struct page *page,
+ unsigned long data,
+ int **result)
+{
+ int nid = (int) data;
+ struct page *newpage;
+
+ newpage = alloc_pages_exact_node(nid,
+ (GFP_HIGHUSER_MOVABLE | GFP_THISNODE |
+ __GFP_NOMEMALLOC | __GFP_NORETRY |
+ __GFP_NOWARN) &
+ ~GFP_IOFS, 0);
+ if (newpage)
+ page_xchg_last_nid(newpage, page_last_nid(page));
+
+ return newpage;
+}
+
+/*
+ * page migration rate limiting control.
+ * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
+ * window of time. Default here says do not migrate more than 1280M per second.
+ * If a node is rate-limited then PTE NUMA updates are also rate-limited. However
+ * as it is faults that reset the window, pte updates will happen unconditionally
+ * if there has not been a fault since @pteupdate_interval_millisecs after the
+ * throttle window closed.
+ */
+static unsigned int migrate_interval_millisecs __read_mostly = 100;
+static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
+static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
+
+/* Returns true if NUMA migration is currently rate limited */
+bool migrate_ratelimited(int node)
+{
+ pg_data_t *pgdat = NODE_DATA(node);
+
+ if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
+ msecs_to_jiffies(pteupdate_interval_millisecs)))
+ return false;
+
+ if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
+ return false;
+
+ return true;
+}
+
+/* Returns true if the node is migrate rate-limited after the update */
+bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages)
+{
+ bool rate_limited = false;
+
+ /*
+ * Rate-limit the amount of data that is being migrated to a node.
+ * Optimal placement is no good if the memory bus is saturated and
+ * all the time is being spent migrating!
+ */
+ spin_lock(&pgdat->numabalancing_migrate_lock);
+ if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
+ pgdat->numabalancing_migrate_nr_pages = 0;
+ pgdat->numabalancing_migrate_next_window = jiffies +
+ msecs_to_jiffies(migrate_interval_millisecs);
+ }
+ if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages)
+ rate_limited = true;
+ else
+ pgdat->numabalancing_migrate_nr_pages += nr_pages;
+ spin_unlock(&pgdat->numabalancing_migrate_lock);
+
+ return rate_limited;
+}
+
+int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
+{
+ int ret = 0;
+
+ /* Avoid migrating to a node that is nearly full */
+ if (migrate_balanced_pgdat(pgdat, 1)) {
+ int page_lru;
+
+ if (isolate_lru_page(page)) {
+ put_page(page);
+ return 0;
+ }
+
+ /* Page is isolated */
+ ret = 1;
+ page_lru = page_is_file_cache(page);
+ if (!PageTransHuge(page))
+ inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru);
+ else
+ mod_zone_page_state(page_zone(page),
+ NR_ISOLATED_ANON + page_lru,
+ HPAGE_PMD_NR);
+ }
+
+ /*
+ * Page is either isolated or there is not enough space on the target
+ * node. If isolated, then it has taken a reference count and the
+ * callers reference can be safely dropped without the page
+ * disappearing underneath us during migration. Otherwise the page is
+ * not to be migrated but the callers reference should still be
+ * dropped so it does not leak.
+ */
+ put_page(page);
+
+ return ret;
+}
+
+/*
+ * Attempt to migrate a misplaced page to the specified destination
+ * node. Caller is expected to have an elevated reference count on
+ * the page that will be dropped by this function before returning.
+ */
+int migrate_misplaced_page(struct page *page, int node)
+{
+ pg_data_t *pgdat = NODE_DATA(node);
+ int isolated = 0;
+ int nr_remaining;
+ LIST_HEAD(migratepages);
+
+ /*
+ * Don't migrate pages that are mapped in multiple processes.
+ * TODO: Handle false sharing detection instead of this hammer
+ */
+ if (page_mapcount(page) != 1) {
+ put_page(page);
+ goto out;
+ }
+
+ /*
+ * Rate-limit the amount of data that is being migrated to a node.
+ * Optimal placement is no good if the memory bus is saturated and
+ * all the time is being spent migrating!
+ */
+ if (numamigrate_update_ratelimit(pgdat, 1)) {
+ put_page(page);
+ goto out;
+ }
+
+ isolated = numamigrate_isolate_page(pgdat, page);
+ if (!isolated)
+ goto out;
+
+ list_add(&page->lru, &migratepages);
+ nr_remaining = migrate_pages(&migratepages,
+ alloc_misplaced_dst_page,
+ node, false, MIGRATE_ASYNC,
+ MR_NUMA_MISPLACED);
+ if (nr_remaining) {
+ putback_lru_pages(&migratepages);
+ isolated = 0;
+ } else
+ count_vm_numa_event(NUMA_PAGE_MIGRATE);
+ BUG_ON(!list_empty(&migratepages));
+out:
+ return isolated;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ pmd_t *pmd, pmd_t entry,
+ unsigned long address,
+ struct page *page, int node)
+{
+ unsigned long haddr = address & HPAGE_PMD_MASK;
+ pg_data_t *pgdat = NODE_DATA(node);
+ int isolated = 0;
+ struct page *new_page = NULL;
+ struct mem_cgroup *memcg = NULL;
+ int page_lru = page_is_file_cache(page);
+
+ /*
+ * Don't migrate pages that are mapped in multiple processes.
+ * TODO: Handle false sharing detection instead of this hammer
+ */
+ if (page_mapcount(page) != 1)
+ goto out_dropref;
+
+ /*
+ * Rate-limit the amount of data that is being migrated to a node.
+ * Optimal placement is no good if the memory bus is saturated and
+ * all the time is being spent migrating!
+ */
+ if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
+ goto out_dropref;
+
+ new_page = alloc_pages_node(node,
+ (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER);
+ if (!new_page) {
+ count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+ goto out_dropref;
+ }
+ page_xchg_last_nid(new_page, page_last_nid(page));
+
+ isolated = numamigrate_isolate_page(pgdat, page);
+ if (!isolated) {
+ count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+ put_page(new_page);
+ goto out_keep_locked;
+ }
+
+ /* Prepare a page as a migration target */
+ __set_page_locked(new_page);
+ SetPageSwapBacked(new_page);
+
+ /* anon mapping, we can simply copy page->mapping to the new page: */
+ new_page->mapping = page->mapping;
+ new_page->index = page->index;
+ migrate_page_copy(new_page, page);
+ WARN_ON(PageLRU(new_page));
+
+ /* Recheck the target PMD */
+ spin_lock(&mm->page_table_lock);
+ if (unlikely(!pmd_same(*pmd, entry))) {
+ spin_unlock(&mm->page_table_lock);
+
+ /* Reverse changes made by migrate_page_copy() */
+ if (TestClearPageActive(new_page))
+ SetPageActive(page);
+ if (TestClearPageUnevictable(new_page))
+ SetPageUnevictable(page);
+ mlock_migrate_page(page, new_page);
+
+ unlock_page(new_page);
+ put_page(new_page); /* Free it */
+
+ unlock_page(page);
+ putback_lru_page(page);
+
+ count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
+ goto out;
+ }
+
+ /*
+ * Traditional migration needs to prepare the memcg charge
+ * transaction early to prevent the old page from being
+ * uncharged when installing migration entries. Here we can
+ * save the potential rollback and start the charge transfer
+ * only when migration is already known to end successfully.
+ */
+ mem_cgroup_prepare_migration(page, new_page, &memcg);
+
+ entry = mk_pmd(new_page, vma->vm_page_prot);
+ entry = pmd_mknonnuma(entry);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ entry = pmd_mkhuge(entry);
+
+ page_add_new_anon_rmap(new_page, vma, haddr);
+
+ set_pmd_at(mm, haddr, pmd, entry);
+ update_mmu_cache_pmd(vma, address, entry);
+ page_remove_rmap(page);
+ /*
+ * Finish the charge transaction under the page table lock to
+ * prevent split_huge_page() from dividing up the charge
+ * before it's fully transferred to the new page.
+ */
+ mem_cgroup_end_migration(memcg, page, new_page, true);
+ spin_unlock(&mm->page_table_lock);
+
+ unlock_page(new_page);
+ unlock_page(page);
+ put_page(page); /* Drop the rmap reference */
+ put_page(page); /* Drop the LRU isolation reference */
+
+ count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
+ count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
+
+out:
+ mod_zone_page_state(page_zone(page),
+ NR_ISOLATED_ANON + page_lru,
+ -HPAGE_PMD_NR);
+ return isolated;
+
+out_dropref:
+ put_page(page);
+out_keep_locked:
+ return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#endif /* CONFIG_NUMA */
diff --git a/mm/mmap.c b/mm/mmap.c
index 2b7d9e78a56..f54b235f29a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -736,7 +736,7 @@ again: remove_next = 1 + (end > next->vm_end);
if (anon_vma) {
VM_BUG_ON(adjust_next && next->anon_vma &&
anon_vma != next->anon_vma);
- anon_vma_lock(anon_vma);
+ anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_pre_update_vma(vma);
if (adjust_next)
anon_vma_interval_tree_pre_update_vma(next);
@@ -2886,15 +2886,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
*/
- mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem);
+ down_write(&anon_vma->root->rwsem);
/*
* We can safely modify head.next after taking the
- * anon_vma->root->mutex. If some other vma in this mm shares
+ * anon_vma->root->rwsem. If some other vma in this mm shares
* the same anon_vma we won't take it again.
*
* No need of atomic instructions here, head.next
* can't change from under us thanks to the
- * anon_vma->root->mutex.
+ * anon_vma->root->rwsem.
*/
if (__test_and_set_bit(0, (unsigned long *)
&anon_vma->root->rb_root.rb_node))
@@ -2996,7 +2996,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
*
* No need of atomic instructions here, head.next
* can't change from under us until we release the
- * anon_vma->root->mutex.
+ * anon_vma->root->rwsem.
*/
if (!__test_and_clear_bit(0, (unsigned long *)
&anon_vma->root->rb_root.rb_node))
diff --git a/mm/mprotect.c b/mm/mprotect.c
index e8c3938db6f..3dca970367d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -35,12 +35,16 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
}
#endif
-static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
+static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+ int dirty_accountable, int prot_numa, bool *ret_all_same_node)
{
+ struct mm_struct *mm = vma->vm_mm;
pte_t *pte, oldpte;
spinlock_t *ptl;
+ unsigned long pages = 0;
+ bool all_same_node = true;
+ int last_nid = -1;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
@@ -48,17 +52,43 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
oldpte = *pte;
if (pte_present(oldpte)) {
pte_t ptent;
+ bool updated = false;
ptent = ptep_modify_prot_start(mm, addr, pte);
- ptent = pte_modify(ptent, newprot);
+ if (!prot_numa) {
+ ptent = pte_modify(ptent, newprot);
+ updated = true;
+ } else {
+ struct page *page;
+
+ page = vm_normal_page(vma, addr, oldpte);
+ if (page) {
+ int this_nid = page_to_nid(page);
+ if (last_nid == -1)
+ last_nid = this_nid;
+ if (last_nid != this_nid)
+ all_same_node = false;
+
+ /* only check non-shared pages */
+ if (!pte_numa(oldpte) &&
+ page_mapcount(page) == 1) {
+ ptent = pte_mknuma(ptent);
+ updated = true;
+ }
+ }
+ }
/*
* Avoid taking write faults for pages we know to be
* dirty.
*/
- if (dirty_accountable && pte_dirty(ptent))
+ if (dirty_accountable && pte_dirty(ptent)) {
ptent = pte_mkwrite(ptent);
+ updated = true;
+ }
+ if (updated)
+ pages++;
ptep_modify_prot_commit(mm, addr, pte, ptent);
} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
@@ -72,18 +102,40 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
set_pte_at(mm, addr, pte,
swp_entry_to_pte(entry));
}
+ pages++;
}
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
+
+ *ret_all_same_node = all_same_node;
+ return pages;
}
-static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+#ifdef CONFIG_NUMA_BALANCING
+static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmd)
+{
+ spin_lock(&mm->page_table_lock);
+ set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
+ spin_unlock(&mm->page_table_lock);
+}
+#else
+static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmd)
+{
+ BUG();
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+ int dirty_accountable, int prot_numa)
{
pmd_t *pmd;
unsigned long next;
+ unsigned long pages = 0;
+ bool all_same_node;
pmd = pmd_offset(pud, addr);
do {
@@ -91,42 +143,59 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
if (pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
split_huge_page_pmd(vma, addr, pmd);
- else if (change_huge_pmd(vma, pmd, addr, newprot))
+ else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) {
+ pages += HPAGE_PMD_NR;
continue;
+ }
/* fall through */
}
if (pmd_none_or_clear_bad(pmd))
continue;
- change_pte_range(vma->vm_mm, pmd, addr, next, newprot,
- dirty_accountable);
+ pages += change_pte_range(vma, pmd, addr, next, newprot,
+ dirty_accountable, prot_numa, &all_same_node);
+
+ /*
+ * If we are changing protections for NUMA hinting faults then
+ * set pmd_numa if the examined pages were all on the same
+ * node. This allows a regular PMD to be handled as one fault
+ * and effectively batches the taking of the PTL
+ */
+ if (prot_numa && all_same_node)
+ change_pmd_protnuma(vma->vm_mm, addr, pmd);
} while (pmd++, addr = next, addr != end);
+
+ return pages;
}
-static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+ int dirty_accountable, int prot_numa)
{
pud_t *pud;
unsigned long next;
+ unsigned long pages = 0;
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- change_pmd_range(vma, pud, addr, next, newprot,
- dirty_accountable);
+ pages += change_pmd_range(vma, pud, addr, next, newprot,
+ dirty_accountable, prot_numa);
} while (pud++, addr = next, addr != end);
+
+ return pages;
}
-static void change_protection(struct vm_area_struct *vma,
+static unsigned long change_protection_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable)
+ int dirty_accountable, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
unsigned long next;
unsigned long start = addr;
+ unsigned long pages = 0;
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
@@ -135,10 +204,32 @@ static void change_protection(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- change_pud_range(vma, pgd, addr, next, newprot,
- dirty_accountable);
+ pages += change_pud_range(vma, pgd, addr, next, newprot,
+ dirty_accountable, prot_numa);
} while (pgd++, addr = next, addr != end);
- flush_tlb_range(vma, start, end);
+
+ /* Only flush the TLB if we actually modified any entries: */
+ if (pages)
+ flush_tlb_range(vma, start, end);
+
+ return pages;
+}
+
+unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, pgprot_t newprot,
+ int dirty_accountable, int prot_numa)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long pages;
+
+ mmu_notifier_invalidate_range_start(mm, start, end);
+ if (is_vm_hugetlb_page(vma))
+ pages = hugetlb_change_protection(vma, start, end, newprot);
+ else
+ pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
+ mmu_notifier_invalidate_range_end(mm, start, end);
+
+ return pages;
}
int
@@ -213,12 +304,8 @@ success:
dirty_accountable = 1;
}
- mmu_notifier_invalidate_range_start(mm, start, end);
- if (is_vm_hugetlb_page(vma))
- hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
- else
- change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
- mmu_notifier_invalidate_range_end(mm, start, end);
+ change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0);
+
vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
vm_stat_account(mm, newflags, vma->vm_file, nrpages);
perf_event_mmap(vma);
diff --git a/mm/mremap.c b/mm/mremap.c
index eabb24da6c9..e1031e1f6a6 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
}
if (vma->anon_vma) {
anon_vma = vma->anon_vma;
- anon_vma_lock(anon_vma);
+ anon_vma_lock_write(anon_vma);
}
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 83637dfba11..d037c8bc151 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -611,6 +611,7 @@ static inline int free_pages_check(struct page *page)
bad_page(page);
return 1;
}
+ reset_page_last_nid(page);
if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
return 0;
@@ -3883,6 +3884,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
mminit_verify_page_links(page, zone, nid, pfn);
init_page_count(page);
reset_page_mapcount(page);
+ reset_page_last_nid(page);
SetPageReserved(page);
/*
* Mark the block movable so that blocks are reserved for
@@ -4526,6 +4528,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
int ret;
pgdat_resize_init(pgdat);
+#ifdef CONFIG_NUMA_BALANCING
+ spin_lock_init(&pgdat->numabalancing_migrate_lock);
+ pgdat->numabalancing_migrate_nr_pages = 0;
+ pgdat->numabalancing_migrate_next_window = jiffies;
+#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
pgdat_page_cgroup_init(pgdat);
@@ -5800,7 +5807,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
ret = migrate_pages(&cc->migratepages,
alloc_migrate_target,
- 0, false, MIGRATE_SYNC);
+ 0, false, MIGRATE_SYNC,
+ MR_CMA);
}
putback_movable_pages(&cc->migratepages);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index e642627da6b..0c8323fe6c8 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -12,8 +12,8 @@
#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
- * Only sets the access flags (dirty, accessed, and
- * writable). Furthermore, we know it always gets set to a "more
+ * Only sets the access flags (dirty, accessed), as well as write
+ * permission. Furthermore, we know it always gets set to a "more
* permissive" setting, which allows most architectures to optimize
* this. We return whether the PTE actually changed, which in turn
* instructs the caller to do things like update__mmu_cache. This
@@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
int changed = !pte_same(*ptep, entry);
if (changed) {
set_pte_at(vma->vm_mm, address, ptep, entry);
- flush_tlb_page(vma, address);
+ flush_tlb_fix_spurious_fault(vma, address);
}
return changed;
}
@@ -88,7 +88,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
{
pte_t pte;
pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
- flush_tlb_page(vma, address);
+ if (pte_accessible(pte))
+ flush_tlb_page(vma, address);
return pte;
}
#endif
diff --git a/mm/rmap.c b/mm/rmap.c
index face808a489..2c78f8cadc9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,7 +24,7 @@
* mm->mmap_sem
* page->flags PG_locked (lock_page)
* mapping->i_mmap_mutex
- * anon_vma->mutex
+ * anon_vma->rwsem
* mm->page_table_lock or pte_lock
* zone->lru_lock (in mark_page_accessed, isolate_lru_page)
* swap_lock (in swap_duplicate, swap_info_get)
@@ -37,7 +37,7 @@
* in arch-dependent flush_dcache_mmap_lock,
* within bdi.wb->list_lock in __sync_single_inode)
*
- * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
+ * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
* ->tasklist_lock
* pte map lock
*/
@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma)
VM_BUG_ON(atomic_read(&anon_vma->refcount));
/*
- * Synchronize against page_lock_anon_vma() such that
+ * Synchronize against page_lock_anon_vma_read() such that
* we can safely hold the lock without the anon_vma getting
* freed.
*
* Relies on the full mb implied by the atomic_dec_and_test() from
* put_anon_vma() against the acquire barrier implied by
- * mutex_trylock() from page_lock_anon_vma(). This orders:
+ * down_read_trylock() from page_lock_anon_vma_read(). This orders:
*
- * page_lock_anon_vma() VS put_anon_vma()
- * mutex_trylock() atomic_dec_and_test()
+ * page_lock_anon_vma_read() VS put_anon_vma()
+ * down_read_trylock() atomic_dec_and_test()
* LOCK MB
- * atomic_read() mutex_is_locked()
+ * atomic_read() rwsem_is_locked()
*
* LOCK should suffice since the actual taking of the lock must
* happen _before_ what follows.
*/
- if (mutex_is_locked(&anon_vma->root->mutex)) {
- anon_vma_lock(anon_vma);
+ if (rwsem_is_locked(&anon_vma->root->rwsem)) {
+ anon_vma_lock_write(anon_vma);
anon_vma_unlock(anon_vma);
}
@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
* allocate a new one.
*
* Anon-vma allocations are very subtle, because we may have
- * optimistically looked up an anon_vma in page_lock_anon_vma()
+ * optimistically looked up an anon_vma in page_lock_anon_vma_read()
* and that may actually touch the spinlock even in the newly
* allocated vma (it depends on RCU to make sure that the
* anon_vma isn't actually destroyed).
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
allocated = anon_vma;
}
- anon_vma_lock(anon_vma);
+ anon_vma_lock_write(anon_vma);
/* page_table_lock to protect against threads */
spin_lock(&mm->page_table_lock);
if (likely(!vma->anon_vma)) {
@@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
struct anon_vma *new_root = anon_vma->root;
if (new_root != root) {
if (WARN_ON_ONCE(root))
- mutex_unlock(&root->mutex);
+ up_write(&root->rwsem);
root = new_root;
- mutex_lock(&root->mutex);
+ down_write(&root->rwsem);
}
return root;
}
@@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct
static inline void unlock_anon_vma_root(struct anon_vma *root)
{
if (root)
- mutex_unlock(&root->mutex);
+ up_write(&root->rwsem);
}
/*
@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
get_anon_vma(anon_vma->root);
/* Mark this anon_vma as the one where our new (COWed) pages go. */
vma->anon_vma = anon_vma;
- anon_vma_lock(anon_vma);
+ anon_vma_lock_write(anon_vma);
anon_vma_chain_link(vma, avc, anon_vma);
anon_vma_unlock(anon_vma);
@@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
/*
* Iterate the list once more, it now only contains empty and unlinked
* anon_vmas, destroy them. Could not do before due to __put_anon_vma()
- * needing to acquire the anon_vma->root->mutex.
+ * needing to write-acquire the anon_vma->root->rwsem.
*/
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
@@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data)
{
struct anon_vma *anon_vma = data;
- mutex_init(&anon_vma->mutex);
+ init_rwsem(&anon_vma->rwsem);
atomic_set(&anon_vma->refcount, 0);
anon_vma->rb_root = RB_ROOT;
}
@@ -442,7 +442,7 @@ out:
* atomic op -- the trylock. If we fail the trylock, we fall back to getting a
* reference like with page_get_anon_vma() and then block on the mutex.
*/
-struct anon_vma *page_lock_anon_vma(struct page *page)
+struct anon_vma *page_lock_anon_vma_read(struct page *page)
{
struct anon_vma *anon_vma = NULL;
struct anon_vma *root_anon_vma;
@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
root_anon_vma = ACCESS_ONCE(anon_vma->root);
- if (mutex_trylock(&root_anon_vma->mutex)) {
+ if (down_read_trylock(&root_anon_vma->rwsem)) {
/*
* If the page is still mapped, then this anon_vma is still
* its anon_vma, and holding the mutex ensures that it will
* not go away, see anon_vma_free().
*/
if (!page_mapped(page)) {
- mutex_unlock(&root_anon_vma->mutex);
+ up_read(&root_anon_vma->rwsem);
anon_vma = NULL;
}
goto out;
@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
/* we pinned the anon_vma, its safe to sleep */
rcu_read_unlock();
- anon_vma_lock(anon_vma);
+ anon_vma_lock_read(anon_vma);
if (atomic_dec_and_test(&anon_vma->refcount)) {
/*
* Oops, we held the last refcount, release the lock
* and bail -- can't simply use put_anon_vma() because
- * we'll deadlock on the anon_vma_lock() recursion.
+ * we'll deadlock on the anon_vma_lock_write() recursion.
*/
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
__put_anon_vma(anon_vma);
anon_vma = NULL;
}
@@ -504,9 +504,9 @@ out:
return anon_vma;
}
-void page_unlock_anon_vma(struct anon_vma *anon_vma)
+void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
{
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
}
/*
@@ -744,7 +744,7 @@ static int page_referenced_anon(struct page *page,
struct anon_vma_chain *avc;
int referenced = 0;
- anon_vma = page_lock_anon_vma(page);
+ anon_vma = page_lock_anon_vma_read(page);
if (!anon_vma)
return referenced;
@@ -766,7 +766,7 @@ static int page_referenced_anon(struct page *page,
break;
}
- page_unlock_anon_vma(anon_vma);
+ page_unlock_anon_vma_read(anon_vma);
return referenced;
}
@@ -1315,7 +1315,7 @@ out_mlock:
/*
* We need mmap_sem locking, Otherwise VM_LOCKED check makes
* unstable result and race. Plus, We can't wait here because
- * we now hold anon_vma->mutex or mapping->i_mmap_mutex.
+ * we now hold anon_vma->rwsem or mapping->i_mmap_mutex.
* if trylock failed, the page remain in evictable lru and later
* vmscan could retry to move the page to unevictable lru if the
* page is actually mlocked.
@@ -1480,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
struct anon_vma_chain *avc;
int ret = SWAP_AGAIN;
- anon_vma = page_lock_anon_vma(page);
+ anon_vma = page_lock_anon_vma_read(page);
if (!anon_vma)
return ret;
@@ -1507,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
break;
}
- page_unlock_anon_vma(anon_vma);
+ page_unlock_anon_vma_read(anon_vma);
return ret;
}
@@ -1702,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
int ret = SWAP_AGAIN;
/*
- * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
+ * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
* because that depends on page_mapped(); but not all its usages
* are holding mmap_sem. Users without mmap_sem are required to
* take a reference count to prevent the anon_vma disappearing
@@ -1710,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
anon_vma = page_anon_vma(page);
if (!anon_vma)
return ret;
- anon_vma_lock(anon_vma);
+ anon_vma_lock_read(anon_vma);
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
@@ -1718,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
if (ret != SWAP_AGAIN)
break;
}
- anon_vma_unlock(anon_vma);
+ anon_vma_unlock_read(anon_vma);
return ret;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index df14808f0a3..9800306c819 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -774,10 +774,20 @@ const char * const vmstat_text[] = {
"pgrotated",
+#ifdef CONFIG_NUMA_BALANCING
+ "numa_pte_updates",
+ "numa_hint_faults",
+ "numa_hint_faults_local",
+ "numa_pages_migrated",
+#endif
+#ifdef CONFIG_MIGRATION
+ "pgmigrate_success",
+ "pgmigrate_fail",
+#endif
#ifdef CONFIG_COMPACTION
- "compact_blocks_moved",
- "compact_pages_moved",
- "compact_pagemigrate_failed",
+ "compact_migrate_scanned",
+ "compact_free_scanned",
+ "compact_isolated",
"compact_stall",
"compact_fail",
"compact_success",
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index 8aa4b111538..0a69d075779 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -259,20 +259,16 @@ static int __init init_dns_resolver(void)
if (!cred)
return -ENOMEM;
- keyring = key_alloc(&key_type_keyring, ".dns_resolver",
- GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
- (KEY_POS_ALL & ~KEY_POS_SETATTR) |
- KEY_USR_VIEW | KEY_USR_READ,
- KEY_ALLOC_NOT_IN_QUOTA);
+ keyring = keyring_alloc(".dns_resolver",
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred,
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW | KEY_USR_READ,
+ KEY_ALLOC_NOT_IN_QUOTA, NULL);
if (IS_ERR(keyring)) {
ret = PTR_ERR(keyring);
goto failed_put_cred;
}
- ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL);
- if (ret < 0)
- goto failed_put_key;
-
ret = register_key_type(&key_type_dns_resolver);
if (ret < 0)
goto failed_put_key;
@@ -304,3 +300,4 @@ static void __exit exit_dns_resolver(void)
module_init(init_dns_resolver)
module_exit(exit_dns_resolver)
MODULE_LICENSE("GPL");
+
diff --git a/security/keys/key.c b/security/keys/key.c
index a15c9da8f97..8fb7c7bd465 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -854,13 +854,13 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref,
/* if the client doesn't provide, decide on the permissions we want */
if (perm == KEY_PERM_UNDEF) {
perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR;
- perm |= KEY_USR_VIEW | KEY_USR_SEARCH | KEY_USR_LINK | KEY_USR_SETATTR;
+ perm |= KEY_USR_VIEW;
if (ktype->read)
- perm |= KEY_POS_READ | KEY_USR_READ;
+ perm |= KEY_POS_READ;
if (ktype == &key_type_keyring || ktype->update)
- perm |= KEY_USR_WRITE;
+ perm |= KEY_POS_WRITE;
}
/* allocate a new key */
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 5d34b4e827d..4b5c948eb41 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1132,12 +1132,12 @@ long keyctl_instantiate_key_iov(key_serial_t id,
ret = rw_copy_check_uvector(WRITE, _payload_iov, ioc,
ARRAY_SIZE(iovstack), iovstack, &iov);
if (ret < 0)
- return ret;
+ goto err;
if (ret == 0)
goto no_payload_free;
ret = keyctl_instantiate_key_common(id, iov, ioc, ret, ringid);
-
+err:
if (iov != iovstack)
kfree(iov);
return ret;
@@ -1495,7 +1495,8 @@ long keyctl_session_to_parent(void)
goto error_keyring;
newwork = &cred->rcu;
- cred->tgcred->session_keyring = key_ref_to_ptr(keyring_r);
+ cred->session_keyring = key_ref_to_ptr(keyring_r);
+ keyring_r = NULL;
init_task_work(newwork, key_change_session_keyring);
me = current;
@@ -1519,7 +1520,7 @@ long keyctl_session_to_parent(void)
mycred = current_cred();
pcred = __task_cred(parent);
if (mycred == pcred ||
- mycred->tgcred->session_keyring == pcred->tgcred->session_keyring) {
+ mycred->session_keyring == pcred->session_keyring) {
ret = 0;
goto unlock;
}
@@ -1535,9 +1536,9 @@ long keyctl_session_to_parent(void)
goto unlock;
/* the keyrings must have the same UID */
- if ((pcred->tgcred->session_keyring &&
- !uid_eq(pcred->tgcred->session_keyring->uid, mycred->euid)) ||
- !uid_eq(mycred->tgcred->session_keyring->uid, mycred->euid))
+ if ((pcred->session_keyring &&
+ !uid_eq(pcred->session_keyring->uid, mycred->euid)) ||
+ !uid_eq(mycred->session_keyring->uid, mycred->euid))
goto unlock;
/* cancel an already pending keyring replacement */
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index 6e42df15a24..6ece7f2e570 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -257,17 +257,14 @@ error:
* Allocate a keyring and link into the destination keyring.
*/
struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid,
- const struct cred *cred, unsigned long flags,
- struct key *dest)
+ const struct cred *cred, key_perm_t perm,
+ unsigned long flags, struct key *dest)
{
struct key *keyring;
int ret;
keyring = key_alloc(&key_type_keyring, description,
- uid, gid, cred,
- (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL,
- flags);
-
+ uid, gid, cred, perm, flags);
if (!IS_ERR(keyring)) {
ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL);
if (ret < 0) {
@@ -278,6 +275,7 @@ struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid,
return keyring;
}
+EXPORT_SYMBOL(keyring_alloc);
/**
* keyring_search_aux - Search a keyring tree for a key matching some criteria
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 86468f385fc..58dfe089094 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -45,10 +45,12 @@ int install_user_keyrings(void)
struct user_struct *user;
const struct cred *cred;
struct key *uid_keyring, *session_keyring;
+ key_perm_t user_keyring_perm;
char buf[20];
int ret;
uid_t uid;
+ user_keyring_perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL;
cred = current_cred();
user = cred->user;
uid = from_kuid(cred->user_ns, user->uid);
@@ -73,8 +75,8 @@ int install_user_keyrings(void)
uid_keyring = find_keyring_by_name(buf, true);
if (IS_ERR(uid_keyring)) {
uid_keyring = keyring_alloc(buf, user->uid, INVALID_GID,
- cred, KEY_ALLOC_IN_QUOTA,
- NULL);
+ cred, user_keyring_perm,
+ KEY_ALLOC_IN_QUOTA, NULL);
if (IS_ERR(uid_keyring)) {
ret = PTR_ERR(uid_keyring);
goto error;
@@ -89,7 +91,8 @@ int install_user_keyrings(void)
if (IS_ERR(session_keyring)) {
session_keyring =
keyring_alloc(buf, user->uid, INVALID_GID,
- cred, KEY_ALLOC_IN_QUOTA, NULL);
+ cred, user_keyring_perm,
+ KEY_ALLOC_IN_QUOTA, NULL);
if (IS_ERR(session_keyring)) {
ret = PTR_ERR(session_keyring);
goto error_release;
@@ -130,6 +133,7 @@ int install_thread_keyring_to_cred(struct cred *new)
struct key *keyring;
keyring = keyring_alloc("_tid", new->uid, new->gid, new,
+ KEY_POS_ALL | KEY_USR_VIEW,
KEY_ALLOC_QUOTA_OVERRUN, NULL);
if (IS_ERR(keyring))
return PTR_ERR(keyring);
@@ -170,27 +174,18 @@ static int install_thread_keyring(void)
int install_process_keyring_to_cred(struct cred *new)
{
struct key *keyring;
- int ret;
- if (new->tgcred->process_keyring)
+ if (new->process_keyring)
return -EEXIST;
- keyring = keyring_alloc("_pid", new->uid, new->gid,
- new, KEY_ALLOC_QUOTA_OVERRUN, NULL);
+ keyring = keyring_alloc("_pid", new->uid, new->gid, new,
+ KEY_POS_ALL | KEY_USR_VIEW,
+ KEY_ALLOC_QUOTA_OVERRUN, NULL);
if (IS_ERR(keyring))
return PTR_ERR(keyring);
- spin_lock_irq(&new->tgcred->lock);
- if (!new->tgcred->process_keyring) {
- new->tgcred->process_keyring = keyring;
- keyring = NULL;
- ret = 0;
- } else {
- ret = -EEXIST;
- }
- spin_unlock_irq(&new->tgcred->lock);
- key_put(keyring);
- return ret;
+ new->process_keyring = keyring;
+ return 0;
}
/*
@@ -231,11 +226,12 @@ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring)
/* create an empty session keyring */
if (!keyring) {
flags = KEY_ALLOC_QUOTA_OVERRUN;
- if (cred->tgcred->session_keyring)
+ if (cred->session_keyring)
flags = KEY_ALLOC_IN_QUOTA;
- keyring = keyring_alloc("_ses", cred->uid, cred->gid,
- cred, flags, NULL);
+ keyring = keyring_alloc("_ses", cred->uid, cred->gid, cred,
+ KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ,
+ flags, NULL);
if (IS_ERR(keyring))
return PTR_ERR(keyring);
} else {
@@ -243,17 +239,11 @@ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring)
}
/* install the keyring */
- spin_lock_irq(&cred->tgcred->lock);
- old = cred->tgcred->session_keyring;
- rcu_assign_pointer(cred->tgcred->session_keyring, keyring);
- spin_unlock_irq(&cred->tgcred->lock);
-
- /* we're using RCU on the pointer, but there's no point synchronising
- * on it if it didn't previously point to anything */
- if (old) {
- synchronize_rcu();
+ old = cred->session_keyring;
+ rcu_assign_pointer(cred->session_keyring, keyring);
+
+ if (old)
key_put(old);
- }
return 0;
}
@@ -368,9 +358,9 @@ key_ref_t search_my_process_keyrings(struct key_type *type,
}
/* search the process keyring second */
- if (cred->tgcred->process_keyring) {
+ if (cred->process_keyring) {
key_ref = keyring_search_aux(
- make_key_ref(cred->tgcred->process_keyring, 1),
+ make_key_ref(cred->process_keyring, 1),
cred, type, description, match, no_state_check);
if (!IS_ERR(key_ref))
goto found;
@@ -389,12 +379,10 @@ key_ref_t search_my_process_keyrings(struct key_type *type,
}
/* search the session keyring */
- if (cred->tgcred->session_keyring) {
+ if (cred->session_keyring) {
rcu_read_lock();
key_ref = keyring_search_aux(
- make_key_ref(rcu_dereference(
- cred->tgcred->session_keyring),
- 1),
+ make_key_ref(rcu_dereference(cred->session_keyring), 1),
cred, type, description, match, no_state_check);
rcu_read_unlock();
@@ -564,7 +552,7 @@ try_again:
break;
case KEY_SPEC_PROCESS_KEYRING:
- if (!cred->tgcred->process_keyring) {
+ if (!cred->process_keyring) {
if (!(lflags & KEY_LOOKUP_CREATE))
goto error;
@@ -576,13 +564,13 @@ try_again:
goto reget_creds;
}
- key = cred->tgcred->process_keyring;
+ key = cred->process_keyring;
atomic_inc(&key->usage);
key_ref = make_key_ref(key, 1);
break;
case KEY_SPEC_SESSION_KEYRING:
- if (!cred->tgcred->session_keyring) {
+ if (!cred->session_keyring) {
/* always install a session keyring upon access if one
* doesn't exist yet */
ret = install_user_keyrings();
@@ -597,7 +585,7 @@ try_again:
if (ret < 0)
goto error;
goto reget_creds;
- } else if (cred->tgcred->session_keyring ==
+ } else if (cred->session_keyring ==
cred->user->session_keyring &&
lflags & KEY_LOOKUP_CREATE) {
ret = join_session_keyring(NULL);
@@ -607,7 +595,7 @@ try_again:
}
rcu_read_lock();
- key = rcu_dereference(cred->tgcred->session_keyring);
+ key = rcu_dereference(cred->session_keyring);
atomic_inc(&key->usage);
rcu_read_unlock();
key_ref = make_key_ref(key, 1);
@@ -767,12 +755,6 @@ long join_session_keyring(const char *name)
struct key *keyring;
long ret, serial;
- /* only permit this if there's a single thread in the thread group -
- * this avoids us having to adjust the creds on all threads and risking
- * ENOMEM */
- if (!current_is_single_threaded())
- return -EMLINK;
-
new = prepare_creds();
if (!new)
return -ENOMEM;
@@ -784,7 +766,7 @@ long join_session_keyring(const char *name)
if (ret < 0)
goto error;
- serial = new->tgcred->session_keyring->serial;
+ serial = new->session_keyring->serial;
ret = commit_creds(new);
if (ret == 0)
ret = serial;
@@ -798,8 +780,10 @@ long join_session_keyring(const char *name)
keyring = find_keyring_by_name(name, false);
if (PTR_ERR(keyring) == -ENOKEY) {
/* not found - try and create a new one */
- keyring = keyring_alloc(name, old->uid, old->gid, old,
- KEY_ALLOC_IN_QUOTA, NULL);
+ keyring = keyring_alloc(
+ name, old->uid, old->gid, old,
+ KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_LINK,
+ KEY_ALLOC_IN_QUOTA, NULL);
if (IS_ERR(keyring)) {
ret = PTR_ERR(keyring);
goto error2;
@@ -807,6 +791,9 @@ long join_session_keyring(const char *name)
} else if (IS_ERR(keyring)) {
ret = PTR_ERR(keyring);
goto error2;
+ } else if (keyring == new->session_keyring) {
+ ret = 0;
+ goto error2;
}
/* we've got a keyring - now to install it */
@@ -863,8 +850,7 @@ void key_change_session_keyring(struct callback_head *twork)
new->jit_keyring = old->jit_keyring;
new->thread_keyring = key_get(old->thread_keyring);
- new->tgcred->tgid = old->tgcred->tgid;
- new->tgcred->process_keyring = key_get(old->tgcred->process_keyring);
+ new->process_keyring = key_get(old->process_keyring);
security_transfer_creds(new, old);
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 66e21184b55..4bd6bdb7419 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -126,6 +126,7 @@ static int call_sbin_request_key(struct key_construction *cons,
cred = get_current_cred();
keyring = keyring_alloc(desc, cred->fsuid, cred->fsgid, cred,
+ KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ,
KEY_ALLOC_QUOTA_OVERRUN, NULL);
put_cred(cred);
if (IS_ERR(keyring)) {
@@ -150,12 +151,12 @@ static int call_sbin_request_key(struct key_construction *cons,
cred->thread_keyring ? cred->thread_keyring->serial : 0);
prkey = 0;
- if (cred->tgcred->process_keyring)
- prkey = cred->tgcred->process_keyring->serial;
+ if (cred->process_keyring)
+ prkey = cred->process_keyring->serial;
sprintf(keyring_str[1], "%d", prkey);
rcu_read_lock();
- session = rcu_dereference(cred->tgcred->session_keyring);
+ session = rcu_dereference(cred->session_keyring);
if (!session)
session = cred->user->session_keyring;
sskey = session->serial;
@@ -297,14 +298,14 @@ static void construct_get_dest_keyring(struct key **_dest_keyring)
break;
case KEY_REQKEY_DEFL_PROCESS_KEYRING:
- dest_keyring = key_get(cred->tgcred->process_keyring);
+ dest_keyring = key_get(cred->process_keyring);
if (dest_keyring)
break;
case KEY_REQKEY_DEFL_SESSION_KEYRING:
rcu_read_lock();
dest_keyring = key_get(
- rcu_dereference(cred->tgcred->session_keyring));
+ rcu_dereference(cred->session_keyring));
rcu_read_unlock();
if (dest_keyring)
@@ -347,6 +348,7 @@ static int construct_alloc_key(struct key_type *type,
const struct cred *cred = current_cred();
unsigned long prealloc;
struct key *key;
+ key_perm_t perm;
key_ref_t key_ref;
int ret;
@@ -355,8 +357,15 @@ static int construct_alloc_key(struct key_type *type,
*_key = NULL;
mutex_lock(&user->cons_lock);
+ perm = KEY_POS_VIEW | KEY_POS_SEARCH | KEY_POS_LINK | KEY_POS_SETATTR;
+ perm |= KEY_USR_VIEW;
+ if (type->read)
+ perm |= KEY_POS_READ;
+ if (type == &key_type_keyring || type->update)
+ perm |= KEY_POS_WRITE;
+
key = key_alloc(type, description, cred->fsuid, cred->fsgid, cred,
- KEY_POS_ALL, flags);
+ perm, flags);
if (IS_ERR(key))
goto alloc_failed;
diff --git a/security/smack/Kconfig b/security/smack/Kconfig
index 603b0878434..e69de9c642b 100644
--- a/security/smack/Kconfig
+++ b/security/smack/Kconfig
@@ -1,6 +1,10 @@
config SECURITY_SMACK
bool "Simplified Mandatory Access Control Kernel Support"
- depends on NETLABEL && SECURITY_NETWORK
+ depends on NET
+ depends on INET
+ depends on SECURITY
+ select NETLABEL
+ select SECURITY_NETWORK
default n
help
This selects the Simplified Mandatory Access Control Kernel.
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c
index 99929a50093..76a5dca4640 100644
--- a/security/smack/smackfs.c
+++ b/security/smack/smackfs.c
@@ -2063,6 +2063,19 @@ static const struct file_operations smk_revoke_subj_ops = {
.llseek = generic_file_llseek,
};
+static struct kset *smackfs_kset;
+/**
+ * smk_init_sysfs - initialize /sys/fs/smackfs
+ *
+ */
+static int smk_init_sysfs(void)
+{
+ smackfs_kset = kset_create_and_add("smackfs", NULL, fs_kobj);
+ if (!smackfs_kset)
+ return -ENOMEM;
+ return 0;
+}
+
/**
* smk_fill_super - fill the /smackfs superblock
* @sb: the empty superblock
@@ -2183,6 +2196,10 @@ static int __init init_smk_fs(void)
if (!security_module_enable(&smack_ops))
return 0;
+ err = smk_init_sysfs();
+ if (err)
+ printk(KERN_ERR "smackfs: sysfs mountpoint problem.\n");
+
err = register_filesystem(&smk_fs_type);
if (!err) {
smackfs_mount = kern_mount(&smk_fs_type);
diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c
index b4c29848b49..2663145d119 100644
--- a/security/yama/yama_lsm.c
+++ b/security/yama/yama_lsm.c
@@ -17,6 +17,7 @@
#include <linux/ptrace.h>
#include <linux/prctl.h>
#include <linux/ratelimit.h>
+#include <linux/workqueue.h>
#define YAMA_SCOPE_DISABLED 0
#define YAMA_SCOPE_RELATIONAL 1
@@ -29,12 +30,37 @@ static int ptrace_scope = YAMA_SCOPE_RELATIONAL;
struct ptrace_relation {
struct task_struct *tracer;
struct task_struct *tracee;
+ bool invalid;
struct list_head node;
+ struct rcu_head rcu;
};
static LIST_HEAD(ptracer_relations);
static DEFINE_SPINLOCK(ptracer_relations_lock);
+static void yama_relation_cleanup(struct work_struct *work);
+static DECLARE_WORK(yama_relation_work, yama_relation_cleanup);
+
+/**
+ * yama_relation_cleanup - remove invalid entries from the relation list
+ *
+ */
+static void yama_relation_cleanup(struct work_struct *work)
+{
+ struct ptrace_relation *relation;
+
+ spin_lock(&ptracer_relations_lock);
+ rcu_read_lock();
+ list_for_each_entry_rcu(relation, &ptracer_relations, node) {
+ if (relation->invalid) {
+ list_del_rcu(&relation->node);
+ kfree_rcu(relation, rcu);
+ }
+ }
+ rcu_read_unlock();
+ spin_unlock(&ptracer_relations_lock);
+}
+
/**
* yama_ptracer_add - add/replace an exception for this tracer/tracee pair
* @tracer: the task_struct of the process doing the ptrace
@@ -48,32 +74,34 @@ static DEFINE_SPINLOCK(ptracer_relations_lock);
static int yama_ptracer_add(struct task_struct *tracer,
struct task_struct *tracee)
{
- int rc = 0;
- struct ptrace_relation *added;
- struct ptrace_relation *entry, *relation = NULL;
+ struct ptrace_relation *relation, *added;
added = kmalloc(sizeof(*added), GFP_KERNEL);
if (!added)
return -ENOMEM;
- spin_lock_bh(&ptracer_relations_lock);
- list_for_each_entry(entry, &ptracer_relations, node)
- if (entry->tracee == tracee) {
- relation = entry;
- break;
+ added->tracee = tracee;
+ added->tracer = tracer;
+ added->invalid = false;
+
+ spin_lock(&ptracer_relations_lock);
+ rcu_read_lock();
+ list_for_each_entry_rcu(relation, &ptracer_relations, node) {
+ if (relation->invalid)
+ continue;
+ if (relation->tracee == tracee) {
+ list_replace_rcu(&relation->node, &added->node);
+ kfree_rcu(relation, rcu);
+ goto out;
}
- if (!relation) {
- relation = added;
- relation->tracee = tracee;
- list_add(&relation->node, &ptracer_relations);
}
- relation->tracer = tracer;
- spin_unlock_bh(&ptracer_relations_lock);
- if (added != relation)
- kfree(added);
+ list_add_rcu(&added->node, &ptracer_relations);
- return rc;
+out:
+ rcu_read_unlock();
+ spin_unlock(&ptracer_relations_lock);
+ return 0;
}
/**
@@ -84,16 +112,23 @@ static int yama_ptracer_add(struct task_struct *tracer,
static void yama_ptracer_del(struct task_struct *tracer,
struct task_struct *tracee)
{
- struct ptrace_relation *relation, *safe;
+ struct ptrace_relation *relation;
+ bool marked = false;
- spin_lock_bh(&ptracer_relations_lock);
- list_for_each_entry_safe(relation, safe, &ptracer_relations, node)
+ rcu_read_lock();
+ list_for_each_entry_rcu(relation, &ptracer_relations, node) {
+ if (relation->invalid)
+ continue;
if (relation->tracee == tracee ||
(tracer && relation->tracer == tracer)) {
- list_del(&relation->node);
- kfree(relation);
+ relation->invalid = true;
+ marked = true;
}
- spin_unlock_bh(&ptracer_relations_lock);
+ }
+ rcu_read_unlock();
+
+ if (marked)
+ schedule_work(&yama_relation_work);
}
/**
@@ -217,21 +252,22 @@ static int ptracer_exception_found(struct task_struct *tracer,
struct task_struct *parent = NULL;
bool found = false;
- spin_lock_bh(&ptracer_relations_lock);
rcu_read_lock();
if (!thread_group_leader(tracee))
tracee = rcu_dereference(tracee->group_leader);
- list_for_each_entry(relation, &ptracer_relations, node)
+ list_for_each_entry_rcu(relation, &ptracer_relations, node) {
+ if (relation->invalid)
+ continue;
if (relation->tracee == tracee) {
parent = relation->tracer;
found = true;
break;
}
+ }
if (found && (parent == NULL || task_is_descendant(parent, tracer)))
rc = 1;
rcu_read_unlock();
- spin_unlock_bh(&ptracer_relations_lock);
return rc;
}